[net,2/4] vhost_net: rework on the lock ordering for busy polling
diff mbox series

Message ID 20181210094454.21144-3-jasowang@redhat.com
State Superseded
Headers show
Series
  • Fix various issue of vhost
Related show

Commit Message

Jason Wang Dec. 10, 2018, 9:44 a.m. UTC
When we try to do rx busy polling in tx path in commit 441abde4cd84
("net: vhost: add rx busy polling in tx path"), we lock rx vq mutex
after tx vq mutex is held. This may lead deadlock so we try to lock vq
one by one in commit 78139c94dc8c ("net: vhost: lock the vqs one by
one"). With this commit, we avoid the deadlock with the assumption
that handle_rx() and handle_tx() run in a same process. But this
commit remove the protection for IOTLB updating which requires the
mutex of each vq to be held.

To solve this issue, the first step is to have a exact same lock
ordering for vhost_net. This is done through:

- For handle_rx(), if busy polling is enabled, lock tx vq immediately.
- For handle_tx(), always lock rx vq before tx vq, and unlock it if
  busy polling is not enabled.
- Remove the tricky locking codes in busy polling.

With this, we can have a exact same lock ordering for vhost_net, this
allows us to safely revert commit 78139c94dc8c ("net: vhost: lock the
vqs one by one") in next patch.

The patch will add two more atomic operations on the tx path during
each round of handle_tx(). 1 byte TCP_RR does not notice such
overhead.

Fixes: commit 78139c94dc8c ("net: vhost: lock the vqs one by one")
Cc: Tonghao Zhang <xiangxia.m.yue@gmail.com>
Signed-off-by: Jason Wang <jasowang@redhat.com>
---
 drivers/vhost/net.c | 18 +++++++++++++++---
 1 file changed, 15 insertions(+), 3 deletions(-)

Comments

Michael S. Tsirkin Dec. 11, 2018, 1:34 a.m. UTC | #1
On Mon, Dec 10, 2018 at 05:44:52PM +0800, Jason Wang wrote:
> When we try to do rx busy polling in tx path in commit 441abde4cd84
> ("net: vhost: add rx busy polling in tx path"), we lock rx vq mutex
> after tx vq mutex is held. This may lead deadlock so we try to lock vq
> one by one in commit 78139c94dc8c ("net: vhost: lock the vqs one by
> one"). With this commit, we avoid the deadlock with the assumption
> that handle_rx() and handle_tx() run in a same process. But this
> commit remove the protection for IOTLB updating which requires the
> mutex of each vq to be held.
> 
> To solve this issue, the first step is to have a exact same lock
> ordering for vhost_net. This is done through:
> 
> - For handle_rx(), if busy polling is enabled, lock tx vq immediately.
> - For handle_tx(), always lock rx vq before tx vq, and unlock it if
>   busy polling is not enabled.
> - Remove the tricky locking codes in busy polling.
> 
> With this, we can have a exact same lock ordering for vhost_net, this
> allows us to safely revert commit 78139c94dc8c ("net: vhost: lock the
> vqs one by one") in next patch.
> 
> The patch will add two more atomic operations on the tx path during
> each round of handle_tx(). 1 byte TCP_RR does not notice such
> overhead.
> 
> Fixes: commit 78139c94dc8c ("net: vhost: lock the vqs one by one")
> Cc: Tonghao Zhang <xiangxia.m.yue@gmail.com>
> Signed-off-by: Jason Wang <jasowang@redhat.com>
> ---
>  drivers/vhost/net.c | 18 +++++++++++++++---
>  1 file changed, 15 insertions(+), 3 deletions(-)
> 
> diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
> index ab11b2bee273..5f272ab4d5b4 100644
> --- a/drivers/vhost/net.c
> +++ b/drivers/vhost/net.c
> @@ -513,7 +513,6 @@ static void vhost_net_busy_poll(struct vhost_net *net,
>  	struct socket *sock;
>  	struct vhost_virtqueue *vq = poll_rx ? tvq : rvq;
>  
> -	mutex_lock_nested(&vq->mutex, poll_rx ? VHOST_NET_VQ_TX: VHOST_NET_VQ_RX);
>  	vhost_disable_notify(&net->dev, vq);
>  	sock = rvq->private_data;
>  
> @@ -543,8 +542,6 @@ static void vhost_net_busy_poll(struct vhost_net *net,
>  		vhost_net_busy_poll_try_queue(net, vq);
>  	else if (!poll_rx) /* On tx here, sock has no rx data. */
>  		vhost_enable_notify(&net->dev, rvq);
> -
> -	mutex_unlock(&vq->mutex);
>  }
>  
>  static int vhost_net_tx_get_vq_desc(struct vhost_net *net,
> @@ -913,10 +910,16 @@ static void handle_tx_zerocopy(struct vhost_net *net, struct socket *sock)
>  static void handle_tx(struct vhost_net *net)
>  {
>  	struct vhost_net_virtqueue *nvq = &net->vqs[VHOST_NET_VQ_TX];
> +	struct vhost_net_virtqueue *nvq_rx = &net->vqs[VHOST_NET_VQ_RX];
>  	struct vhost_virtqueue *vq = &nvq->vq;
> +	struct vhost_virtqueue *vq_rx = &nvq_rx->vq;
>  	struct socket *sock;
>  
> +	mutex_lock_nested(&vq_rx->mutex, VHOST_NET_VQ_RX);
>  	mutex_lock_nested(&vq->mutex, VHOST_NET_VQ_TX);
> +	if (!vq->busyloop_timeout)
> +		mutex_unlock(&vq_rx->mutex);
> +
>  	sock = vq->private_data;
>  	if (!sock)
>  		goto out;
> @@ -933,6 +936,8 @@ static void handle_tx(struct vhost_net *net)
>  		handle_tx_copy(net, sock);
>  
>  out:
> +	if (vq->busyloop_timeout)
> +		mutex_unlock(&vq_rx->mutex);
>  	mutex_unlock(&vq->mutex);
>  }
>  


So rx mutex taken on tx path now.  And tx mutex is on rc path ...  This
is just messed up. Why can't tx polling drop rx lock before
getting the tx lock and vice versa?

Or if we really wanted to force everything to be locked at
all times, let's just use a single mutex.



> @@ -1060,7 +1065,9 @@ static int get_rx_bufs(struct vhost_virtqueue *vq,
>  static void handle_rx(struct vhost_net *net)
>  {
>  	struct vhost_net_virtqueue *nvq = &net->vqs[VHOST_NET_VQ_RX];
> +	struct vhost_net_virtqueue *nvq_tx = &net->vqs[VHOST_NET_VQ_TX];
>  	struct vhost_virtqueue *vq = &nvq->vq;
> +	struct vhost_virtqueue *vq_tx = &nvq_tx->vq;
>  	unsigned uninitialized_var(in), log;
>  	struct vhost_log *vq_log;
>  	struct msghdr msg = {
> @@ -1086,6 +1093,9 @@ static void handle_rx(struct vhost_net *net)
>  	int recv_pkts = 0;
>  
>  	mutex_lock_nested(&vq->mutex, VHOST_NET_VQ_RX);
> +	if (vq->busyloop_timeout)
> +		mutex_lock_nested(&vq_tx->mutex, VHOST_NET_VQ_TX);
> +
>  	sock = vq->private_data;
>  	if (!sock)
>  		goto out;
> @@ -1200,6 +1210,8 @@ static void handle_rx(struct vhost_net *net)
>  out:
>  	vhost_net_signal_used(nvq);
>  	mutex_unlock(&vq->mutex);
> +	if (vq->busyloop_timeout)
> +		mutex_unlock(&vq_tx->mutex);
>  }
>  
>  static void handle_tx_kick(struct vhost_work *work)
> -- 
> 2.17.1
Jason Wang Dec. 11, 2018, 3:06 a.m. UTC | #2
On 2018/12/11 上午9:34, Michael S. Tsirkin wrote:
> On Mon, Dec 10, 2018 at 05:44:52PM +0800, Jason Wang wrote:
>> When we try to do rx busy polling in tx path in commit 441abde4cd84
>> ("net: vhost: add rx busy polling in tx path"), we lock rx vq mutex
>> after tx vq mutex is held. This may lead deadlock so we try to lock vq
>> one by one in commit 78139c94dc8c ("net: vhost: lock the vqs one by
>> one"). With this commit, we avoid the deadlock with the assumption
>> that handle_rx() and handle_tx() run in a same process. But this
>> commit remove the protection for IOTLB updating which requires the
>> mutex of each vq to be held.
>>
>> To solve this issue, the first step is to have a exact same lock
>> ordering for vhost_net. This is done through:
>>
>> - For handle_rx(), if busy polling is enabled, lock tx vq immediately.
>> - For handle_tx(), always lock rx vq before tx vq, and unlock it if
>>    busy polling is not enabled.
>> - Remove the tricky locking codes in busy polling.
>>
>> With this, we can have a exact same lock ordering for vhost_net, this
>> allows us to safely revert commit 78139c94dc8c ("net: vhost: lock the
>> vqs one by one") in next patch.
>>
>> The patch will add two more atomic operations on the tx path during
>> each round of handle_tx(). 1 byte TCP_RR does not notice such
>> overhead.
>>
>> Fixes: commit 78139c94dc8c ("net: vhost: lock the vqs one by one")
>> Cc: Tonghao Zhang<xiangxia.m.yue@gmail.com>
>> Signed-off-by: Jason Wang<jasowang@redhat.com>
>> ---
>>   drivers/vhost/net.c | 18 +++++++++++++++---
>>   1 file changed, 15 insertions(+), 3 deletions(-)
>>
>> diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
>> index ab11b2bee273..5f272ab4d5b4 100644
>> --- a/drivers/vhost/net.c
>> +++ b/drivers/vhost/net.c
>> @@ -513,7 +513,6 @@ static void vhost_net_busy_poll(struct vhost_net *net,
>>   	struct socket *sock;
>>   	struct vhost_virtqueue *vq = poll_rx ? tvq : rvq;
>>   
>> -	mutex_lock_nested(&vq->mutex, poll_rx ? VHOST_NET_VQ_TX: VHOST_NET_VQ_RX);
>>   	vhost_disable_notify(&net->dev, vq);
>>   	sock = rvq->private_data;
>>   
>> @@ -543,8 +542,6 @@ static void vhost_net_busy_poll(struct vhost_net *net,
>>   		vhost_net_busy_poll_try_queue(net, vq);
>>   	else if (!poll_rx) /* On tx here, sock has no rx data. */
>>   		vhost_enable_notify(&net->dev, rvq);
>> -
>> -	mutex_unlock(&vq->mutex);
>>   }
>>   
>>   static int vhost_net_tx_get_vq_desc(struct vhost_net *net,
>> @@ -913,10 +910,16 @@ static void handle_tx_zerocopy(struct vhost_net *net, struct socket *sock)
>>   static void handle_tx(struct vhost_net *net)
>>   {
>>   	struct vhost_net_virtqueue *nvq = &net->vqs[VHOST_NET_VQ_TX];
>> +	struct vhost_net_virtqueue *nvq_rx = &net->vqs[VHOST_NET_VQ_RX];
>>   	struct vhost_virtqueue *vq = &nvq->vq;
>> +	struct vhost_virtqueue *vq_rx = &nvq_rx->vq;
>>   	struct socket *sock;
>>   
>> +	mutex_lock_nested(&vq_rx->mutex, VHOST_NET_VQ_RX);
>>   	mutex_lock_nested(&vq->mutex, VHOST_NET_VQ_TX);
>> +	if (!vq->busyloop_timeout)
>> +		mutex_unlock(&vq_rx->mutex);
>> +
>>   	sock = vq->private_data;
>>   	if (!sock)
>>   		goto out;
>> @@ -933,6 +936,8 @@ static void handle_tx(struct vhost_net *net)
>>   		handle_tx_copy(net, sock);
>>   
>>   out:
>> +	if (vq->busyloop_timeout)
>> +		mutex_unlock(&vq_rx->mutex);
>>   	mutex_unlock(&vq->mutex);
>>   }
>>   
> So rx mutex taken on tx path now.  And tx mutex is on rc path ...  This
> is just messed up. Why can't tx polling drop rx lock before
> getting the tx lock and vice versa?


Because we want to poll both tx and rx virtqueue at the same time 
(vhost_net_busy_poll()).

     while (vhost_can_busy_poll(endtime)) {
         if (vhost_has_work(&net->dev)) {
             *busyloop_intr = true;
             break;
         }

         if ((sock_has_rx_data(sock) &&
              !vhost_vq_avail_empty(&net->dev, rvq)) ||
             !vhost_vq_avail_empty(&net->dev, tvq))
             break;

         cpu_relax();

     }


And we disable kicks and notification for better performance.


>
> Or if we really wanted to force everything to be locked at
> all times, let's just use a single mutex.
>
>
>

We could, but it might requires more changes which could be done for 
-next I believe.


Thanks
Michael S. Tsirkin Dec. 11, 2018, 4:04 a.m. UTC | #3
On Tue, Dec 11, 2018 at 11:06:43AM +0800, Jason Wang wrote:
> 
> On 2018/12/11 上午9:34, Michael S. Tsirkin wrote:
> > On Mon, Dec 10, 2018 at 05:44:52PM +0800, Jason Wang wrote:
> > > When we try to do rx busy polling in tx path in commit 441abde4cd84
> > > ("net: vhost: add rx busy polling in tx path"), we lock rx vq mutex
> > > after tx vq mutex is held. This may lead deadlock so we try to lock vq
> > > one by one in commit 78139c94dc8c ("net: vhost: lock the vqs one by
> > > one"). With this commit, we avoid the deadlock with the assumption
> > > that handle_rx() and handle_tx() run in a same process. But this
> > > commit remove the protection for IOTLB updating which requires the
> > > mutex of each vq to be held.
> > > 
> > > To solve this issue, the first step is to have a exact same lock
> > > ordering for vhost_net. This is done through:
> > > 
> > > - For handle_rx(), if busy polling is enabled, lock tx vq immediately.
> > > - For handle_tx(), always lock rx vq before tx vq, and unlock it if
> > >    busy polling is not enabled.
> > > - Remove the tricky locking codes in busy polling.
> > > 
> > > With this, we can have a exact same lock ordering for vhost_net, this
> > > allows us to safely revert commit 78139c94dc8c ("net: vhost: lock the
> > > vqs one by one") in next patch.
> > > 
> > > The patch will add two more atomic operations on the tx path during
> > > each round of handle_tx(). 1 byte TCP_RR does not notice such
> > > overhead.
> > > 
> > > Fixes: commit 78139c94dc8c ("net: vhost: lock the vqs one by one")
> > > Cc: Tonghao Zhang<xiangxia.m.yue@gmail.com>
> > > Signed-off-by: Jason Wang<jasowang@redhat.com>
> > > ---
> > >   drivers/vhost/net.c | 18 +++++++++++++++---
> > >   1 file changed, 15 insertions(+), 3 deletions(-)
> > > 
> > > diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
> > > index ab11b2bee273..5f272ab4d5b4 100644
> > > --- a/drivers/vhost/net.c
> > > +++ b/drivers/vhost/net.c
> > > @@ -513,7 +513,6 @@ static void vhost_net_busy_poll(struct vhost_net *net,
> > >   	struct socket *sock;
> > >   	struct vhost_virtqueue *vq = poll_rx ? tvq : rvq;
> > > -	mutex_lock_nested(&vq->mutex, poll_rx ? VHOST_NET_VQ_TX: VHOST_NET_VQ_RX);
> > >   	vhost_disable_notify(&net->dev, vq);
> > >   	sock = rvq->private_data;
> > > @@ -543,8 +542,6 @@ static void vhost_net_busy_poll(struct vhost_net *net,
> > >   		vhost_net_busy_poll_try_queue(net, vq);
> > >   	else if (!poll_rx) /* On tx here, sock has no rx data. */
> > >   		vhost_enable_notify(&net->dev, rvq);
> > > -
> > > -	mutex_unlock(&vq->mutex);
> > >   }
> > >   static int vhost_net_tx_get_vq_desc(struct vhost_net *net,
> > > @@ -913,10 +910,16 @@ static void handle_tx_zerocopy(struct vhost_net *net, struct socket *sock)
> > >   static void handle_tx(struct vhost_net *net)
> > >   {
> > >   	struct vhost_net_virtqueue *nvq = &net->vqs[VHOST_NET_VQ_TX];
> > > +	struct vhost_net_virtqueue *nvq_rx = &net->vqs[VHOST_NET_VQ_RX];
> > >   	struct vhost_virtqueue *vq = &nvq->vq;
> > > +	struct vhost_virtqueue *vq_rx = &nvq_rx->vq;
> > >   	struct socket *sock;
> > > +	mutex_lock_nested(&vq_rx->mutex, VHOST_NET_VQ_RX);
> > >   	mutex_lock_nested(&vq->mutex, VHOST_NET_VQ_TX);
> > > +	if (!vq->busyloop_timeout)
> > > +		mutex_unlock(&vq_rx->mutex);
> > > +
> > >   	sock = vq->private_data;
> > >   	if (!sock)
> > >   		goto out;
> > > @@ -933,6 +936,8 @@ static void handle_tx(struct vhost_net *net)
> > >   		handle_tx_copy(net, sock);
> > >   out:
> > > +	if (vq->busyloop_timeout)
> > > +		mutex_unlock(&vq_rx->mutex);
> > >   	mutex_unlock(&vq->mutex);
> > >   }
> > So rx mutex taken on tx path now.  And tx mutex is on rc path ...  This
> > is just messed up. Why can't tx polling drop rx lock before
> > getting the tx lock and vice versa?
> 
> 
> Because we want to poll both tx and rx virtqueue at the same time
> (vhost_net_busy_poll()).
> 
>     while (vhost_can_busy_poll(endtime)) {
>         if (vhost_has_work(&net->dev)) {
>             *busyloop_intr = true;
>             break;
>         }
> 
>         if ((sock_has_rx_data(sock) &&
>              !vhost_vq_avail_empty(&net->dev, rvq)) ||
>             !vhost_vq_avail_empty(&net->dev, tvq))
>             break;
> 
>         cpu_relax();
> 
>     }
> 
> 
> And we disable kicks and notification for better performance.

Right but it's all slow path - it happens when queue is
otherwise empty. So this is what I am saying: let's drop the locks
we hold around this.


> 
> > 
> > Or if we really wanted to force everything to be locked at
> > all times, let's just use a single mutex.
> > 
> > 
> > 
> 
> We could, but it might requires more changes which could be done for -next I
> believe.
> 
> 
> Thanks

I'd rather we kept the fine grained locking. E.g. people are
looking at splitting the tx and rx threads. But if not possible
let's fix it cleanly with a coarse-grained one. A mess here will
just create more trouble later.
Jason Wang Dec. 12, 2018, 3:03 a.m. UTC | #4
On 2018/12/11 下午12:04, Michael S. Tsirkin wrote:
> On Tue, Dec 11, 2018 at 11:06:43AM +0800, Jason Wang wrote:
>> On 2018/12/11 上午9:34, Michael S. Tsirkin wrote:
>>> On Mon, Dec 10, 2018 at 05:44:52PM +0800, Jason Wang wrote:
>>>> When we try to do rx busy polling in tx path in commit 441abde4cd84
>>>> ("net: vhost: add rx busy polling in tx path"), we lock rx vq mutex
>>>> after tx vq mutex is held. This may lead deadlock so we try to lock vq
>>>> one by one in commit 78139c94dc8c ("net: vhost: lock the vqs one by
>>>> one"). With this commit, we avoid the deadlock with the assumption
>>>> that handle_rx() and handle_tx() run in a same process. But this
>>>> commit remove the protection for IOTLB updating which requires the
>>>> mutex of each vq to be held.
>>>>
>>>> To solve this issue, the first step is to have a exact same lock
>>>> ordering for vhost_net. This is done through:
>>>>
>>>> - For handle_rx(), if busy polling is enabled, lock tx vq immediately.
>>>> - For handle_tx(), always lock rx vq before tx vq, and unlock it if
>>>>     busy polling is not enabled.
>>>> - Remove the tricky locking codes in busy polling.
>>>>
>>>> With this, we can have a exact same lock ordering for vhost_net, this
>>>> allows us to safely revert commit 78139c94dc8c ("net: vhost: lock the
>>>> vqs one by one") in next patch.
>>>>
>>>> The patch will add two more atomic operations on the tx path during
>>>> each round of handle_tx(). 1 byte TCP_RR does not notice such
>>>> overhead.
>>>>
>>>> Fixes: commit 78139c94dc8c ("net: vhost: lock the vqs one by one")
>>>> Cc: Tonghao Zhang<xiangxia.m.yue@gmail.com>
>>>> Signed-off-by: Jason Wang<jasowang@redhat.com>
>>>> ---
>>>>    drivers/vhost/net.c | 18 +++++++++++++++---
>>>>    1 file changed, 15 insertions(+), 3 deletions(-)
>>>>
>>>> diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
>>>> index ab11b2bee273..5f272ab4d5b4 100644
>>>> --- a/drivers/vhost/net.c
>>>> +++ b/drivers/vhost/net.c
>>>> @@ -513,7 +513,6 @@ static void vhost_net_busy_poll(struct vhost_net *net,
>>>>    	struct socket *sock;
>>>>    	struct vhost_virtqueue *vq = poll_rx ? tvq : rvq;
>>>> -	mutex_lock_nested(&vq->mutex, poll_rx ? VHOST_NET_VQ_TX: VHOST_NET_VQ_RX);
>>>>    	vhost_disable_notify(&net->dev, vq);
>>>>    	sock = rvq->private_data;
>>>> @@ -543,8 +542,6 @@ static void vhost_net_busy_poll(struct vhost_net *net,
>>>>    		vhost_net_busy_poll_try_queue(net, vq);
>>>>    	else if (!poll_rx) /* On tx here, sock has no rx data. */
>>>>    		vhost_enable_notify(&net->dev, rvq);
>>>> -
>>>> -	mutex_unlock(&vq->mutex);
>>>>    }
>>>>    static int vhost_net_tx_get_vq_desc(struct vhost_net *net,
>>>> @@ -913,10 +910,16 @@ static void handle_tx_zerocopy(struct vhost_net *net, struct socket *sock)
>>>>    static void handle_tx(struct vhost_net *net)
>>>>    {
>>>>    	struct vhost_net_virtqueue *nvq = &net->vqs[VHOST_NET_VQ_TX];
>>>> +	struct vhost_net_virtqueue *nvq_rx = &net->vqs[VHOST_NET_VQ_RX];
>>>>    	struct vhost_virtqueue *vq = &nvq->vq;
>>>> +	struct vhost_virtqueue *vq_rx = &nvq_rx->vq;
>>>>    	struct socket *sock;
>>>> +	mutex_lock_nested(&vq_rx->mutex, VHOST_NET_VQ_RX);
>>>>    	mutex_lock_nested(&vq->mutex, VHOST_NET_VQ_TX);
>>>> +	if (!vq->busyloop_timeout)
>>>> +		mutex_unlock(&vq_rx->mutex);
>>>> +
>>>>    	sock = vq->private_data;
>>>>    	if (!sock)
>>>>    		goto out;
>>>> @@ -933,6 +936,8 @@ static void handle_tx(struct vhost_net *net)
>>>>    		handle_tx_copy(net, sock);
>>>>    out:
>>>> +	if (vq->busyloop_timeout)
>>>> +		mutex_unlock(&vq_rx->mutex);
>>>>    	mutex_unlock(&vq->mutex);
>>>>    }
>>> So rx mutex taken on tx path now.  And tx mutex is on rc path ...  This
>>> is just messed up. Why can't tx polling drop rx lock before
>>> getting the tx lock and vice versa?
>>
>> Because we want to poll both tx and rx virtqueue at the same time
>> (vhost_net_busy_poll()).
>>
>>      while (vhost_can_busy_poll(endtime)) {
>>          if (vhost_has_work(&net->dev)) {
>>              *busyloop_intr = true;
>>              break;
>>          }
>>
>>          if ((sock_has_rx_data(sock) &&
>>               !vhost_vq_avail_empty(&net->dev, rvq)) ||
>>              !vhost_vq_avail_empty(&net->dev, tvq))
>>              break;
>>
>>          cpu_relax();
>>
>>      }
>>
>>
>> And we disable kicks and notification for better performance.
> Right but it's all slow path - it happens when queue is
> otherwise empty. So this is what I am saying: let's drop the locks
> we hold around this.


Is this really safe? I looks to me it can race with SET_VRING_ADDR. And 
the codes did more:

- access sock object

- access device IOTLB

- enable and disable notification

None of above is safe without the protection of vq mutex.


>
>
>>> Or if we really wanted to force everything to be locked at
>>> all times, let's just use a single mutex.
>>>
>>>
>>>
>> We could, but it might requires more changes which could be done for -next I
>> believe.
>>
>>
>> Thanks
> I'd rather we kept the fine grained locking. E.g. people are
> looking at splitting the tx and rx threads. But if not possible
> let's fix it cleanly with a coarse-grained one. A mess here will
> just create more trouble later.
>

I believe we won't go back to coarse one. Looks like we can solve this 
by using mutex_trylock() for rxq during TX. And don't do polling for rxq 
is a IOTLB updating is pending.

Let me post V2.

Thanks
Michael S. Tsirkin Dec. 12, 2018, 3:40 a.m. UTC | #5
On Wed, Dec 12, 2018 at 11:03:57AM +0800, Jason Wang wrote:
> 
> On 2018/12/11 下午12:04, Michael S. Tsirkin wrote:
> > On Tue, Dec 11, 2018 at 11:06:43AM +0800, Jason Wang wrote:
> > > On 2018/12/11 上午9:34, Michael S. Tsirkin wrote:
> > > > On Mon, Dec 10, 2018 at 05:44:52PM +0800, Jason Wang wrote:
> > > > > When we try to do rx busy polling in tx path in commit 441abde4cd84
> > > > > ("net: vhost: add rx busy polling in tx path"), we lock rx vq mutex
> > > > > after tx vq mutex is held. This may lead deadlock so we try to lock vq
> > > > > one by one in commit 78139c94dc8c ("net: vhost: lock the vqs one by
> > > > > one"). With this commit, we avoid the deadlock with the assumption
> > > > > that handle_rx() and handle_tx() run in a same process. But this
> > > > > commit remove the protection for IOTLB updating which requires the
> > > > > mutex of each vq to be held.
> > > > > 
> > > > > To solve this issue, the first step is to have a exact same lock
> > > > > ordering for vhost_net. This is done through:
> > > > > 
> > > > > - For handle_rx(), if busy polling is enabled, lock tx vq immediately.
> > > > > - For handle_tx(), always lock rx vq before tx vq, and unlock it if
> > > > >     busy polling is not enabled.
> > > > > - Remove the tricky locking codes in busy polling.
> > > > > 
> > > > > With this, we can have a exact same lock ordering for vhost_net, this
> > > > > allows us to safely revert commit 78139c94dc8c ("net: vhost: lock the
> > > > > vqs one by one") in next patch.
> > > > > 
> > > > > The patch will add two more atomic operations on the tx path during
> > > > > each round of handle_tx(). 1 byte TCP_RR does not notice such
> > > > > overhead.
> > > > > 
> > > > > Fixes: commit 78139c94dc8c ("net: vhost: lock the vqs one by one")
> > > > > Cc: Tonghao Zhang<xiangxia.m.yue@gmail.com>
> > > > > Signed-off-by: Jason Wang<jasowang@redhat.com>
> > > > > ---
> > > > >    drivers/vhost/net.c | 18 +++++++++++++++---
> > > > >    1 file changed, 15 insertions(+), 3 deletions(-)
> > > > > 
> > > > > diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
> > > > > index ab11b2bee273..5f272ab4d5b4 100644
> > > > > --- a/drivers/vhost/net.c
> > > > > +++ b/drivers/vhost/net.c
> > > > > @@ -513,7 +513,6 @@ static void vhost_net_busy_poll(struct vhost_net *net,
> > > > >    	struct socket *sock;
> > > > >    	struct vhost_virtqueue *vq = poll_rx ? tvq : rvq;
> > > > > -	mutex_lock_nested(&vq->mutex, poll_rx ? VHOST_NET_VQ_TX: VHOST_NET_VQ_RX);
> > > > >    	vhost_disable_notify(&net->dev, vq);
> > > > >    	sock = rvq->private_data;
> > > > > @@ -543,8 +542,6 @@ static void vhost_net_busy_poll(struct vhost_net *net,
> > > > >    		vhost_net_busy_poll_try_queue(net, vq);
> > > > >    	else if (!poll_rx) /* On tx here, sock has no rx data. */
> > > > >    		vhost_enable_notify(&net->dev, rvq);
> > > > > -
> > > > > -	mutex_unlock(&vq->mutex);
> > > > >    }
> > > > >    static int vhost_net_tx_get_vq_desc(struct vhost_net *net,
> > > > > @@ -913,10 +910,16 @@ static void handle_tx_zerocopy(struct vhost_net *net, struct socket *sock)
> > > > >    static void handle_tx(struct vhost_net *net)
> > > > >    {
> > > > >    	struct vhost_net_virtqueue *nvq = &net->vqs[VHOST_NET_VQ_TX];
> > > > > +	struct vhost_net_virtqueue *nvq_rx = &net->vqs[VHOST_NET_VQ_RX];
> > > > >    	struct vhost_virtqueue *vq = &nvq->vq;
> > > > > +	struct vhost_virtqueue *vq_rx = &nvq_rx->vq;
> > > > >    	struct socket *sock;
> > > > > +	mutex_lock_nested(&vq_rx->mutex, VHOST_NET_VQ_RX);
> > > > >    	mutex_lock_nested(&vq->mutex, VHOST_NET_VQ_TX);
> > > > > +	if (!vq->busyloop_timeout)
> > > > > +		mutex_unlock(&vq_rx->mutex);
> > > > > +
> > > > >    	sock = vq->private_data;
> > > > >    	if (!sock)
> > > > >    		goto out;
> > > > > @@ -933,6 +936,8 @@ static void handle_tx(struct vhost_net *net)
> > > > >    		handle_tx_copy(net, sock);
> > > > >    out:
> > > > > +	if (vq->busyloop_timeout)
> > > > > +		mutex_unlock(&vq_rx->mutex);
> > > > >    	mutex_unlock(&vq->mutex);
> > > > >    }
> > > > So rx mutex taken on tx path now.  And tx mutex is on rc path ...  This
> > > > is just messed up. Why can't tx polling drop rx lock before
> > > > getting the tx lock and vice versa?
> > > 
> > > Because we want to poll both tx and rx virtqueue at the same time
> > > (vhost_net_busy_poll()).
> > > 
> > >      while (vhost_can_busy_poll(endtime)) {
> > >          if (vhost_has_work(&net->dev)) {
> > >              *busyloop_intr = true;
> > >              break;
> > >          }
> > > 
> > >          if ((sock_has_rx_data(sock) &&
> > >               !vhost_vq_avail_empty(&net->dev, rvq)) ||
> > >              !vhost_vq_avail_empty(&net->dev, tvq))
> > >              break;
> > > 
> > >          cpu_relax();
> > > 
> > >      }
> > > 
> > > 
> > > And we disable kicks and notification for better performance.
> > Right but it's all slow path - it happens when queue is
> > otherwise empty. So this is what I am saying: let's drop the locks
> > we hold around this.
> 
> 
> Is this really safe? I looks to me it can race with SET_VRING_ADDR. And the
> codes did more:
> 
> - access sock object
> 
> - access device IOTLB
> 
> - enable and disable notification
> 
> None of above is safe without the protection of vq mutex.


ys but take another lock. just not nested.


> 
> > 
> > 
> > > > Or if we really wanted to force everything to be locked at
> > > > all times, let's just use a single mutex.
> > > > 
> > > > 
> > > > 
> > > We could, but it might requires more changes which could be done for -next I
> > > believe.
> > > 
> > > 
> > > Thanks
> > I'd rather we kept the fine grained locking. E.g. people are
> > looking at splitting the tx and rx threads. But if not possible
> > let's fix it cleanly with a coarse-grained one. A mess here will
> > just create more trouble later.
> > 
> 
> I believe we won't go back to coarse one. Looks like we can solve this by
> using mutex_trylock() for rxq during TX. And don't do polling for rxq is a
> IOTLB updating is pending.
> 
> Let me post V2.
> 
> Thanks

Patch
diff mbox series

diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
index ab11b2bee273..5f272ab4d5b4 100644
--- a/drivers/vhost/net.c
+++ b/drivers/vhost/net.c
@@ -513,7 +513,6 @@  static void vhost_net_busy_poll(struct vhost_net *net,
 	struct socket *sock;
 	struct vhost_virtqueue *vq = poll_rx ? tvq : rvq;
 
-	mutex_lock_nested(&vq->mutex, poll_rx ? VHOST_NET_VQ_TX: VHOST_NET_VQ_RX);
 	vhost_disable_notify(&net->dev, vq);
 	sock = rvq->private_data;
 
@@ -543,8 +542,6 @@  static void vhost_net_busy_poll(struct vhost_net *net,
 		vhost_net_busy_poll_try_queue(net, vq);
 	else if (!poll_rx) /* On tx here, sock has no rx data. */
 		vhost_enable_notify(&net->dev, rvq);
-
-	mutex_unlock(&vq->mutex);
 }
 
 static int vhost_net_tx_get_vq_desc(struct vhost_net *net,
@@ -913,10 +910,16 @@  static void handle_tx_zerocopy(struct vhost_net *net, struct socket *sock)
 static void handle_tx(struct vhost_net *net)
 {
 	struct vhost_net_virtqueue *nvq = &net->vqs[VHOST_NET_VQ_TX];
+	struct vhost_net_virtqueue *nvq_rx = &net->vqs[VHOST_NET_VQ_RX];
 	struct vhost_virtqueue *vq = &nvq->vq;
+	struct vhost_virtqueue *vq_rx = &nvq_rx->vq;
 	struct socket *sock;
 
+	mutex_lock_nested(&vq_rx->mutex, VHOST_NET_VQ_RX);
 	mutex_lock_nested(&vq->mutex, VHOST_NET_VQ_TX);
+	if (!vq->busyloop_timeout)
+		mutex_unlock(&vq_rx->mutex);
+
 	sock = vq->private_data;
 	if (!sock)
 		goto out;
@@ -933,6 +936,8 @@  static void handle_tx(struct vhost_net *net)
 		handle_tx_copy(net, sock);
 
 out:
+	if (vq->busyloop_timeout)
+		mutex_unlock(&vq_rx->mutex);
 	mutex_unlock(&vq->mutex);
 }
 
@@ -1060,7 +1065,9 @@  static int get_rx_bufs(struct vhost_virtqueue *vq,
 static void handle_rx(struct vhost_net *net)
 {
 	struct vhost_net_virtqueue *nvq = &net->vqs[VHOST_NET_VQ_RX];
+	struct vhost_net_virtqueue *nvq_tx = &net->vqs[VHOST_NET_VQ_TX];
 	struct vhost_virtqueue *vq = &nvq->vq;
+	struct vhost_virtqueue *vq_tx = &nvq_tx->vq;
 	unsigned uninitialized_var(in), log;
 	struct vhost_log *vq_log;
 	struct msghdr msg = {
@@ -1086,6 +1093,9 @@  static void handle_rx(struct vhost_net *net)
 	int recv_pkts = 0;
 
 	mutex_lock_nested(&vq->mutex, VHOST_NET_VQ_RX);
+	if (vq->busyloop_timeout)
+		mutex_lock_nested(&vq_tx->mutex, VHOST_NET_VQ_TX);
+
 	sock = vq->private_data;
 	if (!sock)
 		goto out;
@@ -1200,6 +1210,8 @@  static void handle_rx(struct vhost_net *net)
 out:
 	vhost_net_signal_used(nvq);
 	mutex_unlock(&vq->mutex);
+	if (vq->busyloop_timeout)
+		mutex_unlock(&vq_tx->mutex);
 }
 
 static void handle_tx_kick(struct vhost_work *work)