* virtio-net: locking in tx napi
@ 2021-04-12 22:03 Michael S. Tsirkin
  2021-04-12 22:31 ` Michael S. Tsirkin
  0 siblings, 1 reply; 5+ messages in thread
From: Michael S. Tsirkin @ 2021-04-12 22:03 UTC (permalink / raw)
  To: Jason Wang; +Cc: Willem de Bruijn, virtualization

I was working on the spurious interrupt problem and
I noticed something weird.

static int virtnet_poll_tx(struct napi_struct *napi, int budget)
{       
        struct send_queue *sq = container_of(napi, struct send_queue, napi);
        struct virtnet_info *vi = sq->vq->vdev->priv;
        unsigned int index = vq2txq(sq->vq);
        struct netdev_queue *txq;

        if (unlikely(is_xdp_raw_buffer_queue(vi, index))) {
                /* We don't need to enable cb for XDP */
                napi_complete_done(napi, 0);
                return 0;
        }

        txq = netdev_get_tx_queue(vi->dev, index);
        __netif_tx_lock(txq, raw_smp_processor_id());
        free_old_xmit_skbs(sq, true);
        __netif_tx_unlock(txq);
        
        virtqueue_napi_complete(napi, sq->vq, 0);
        
        if (sq->vq->num_free >= 2 + MAX_SKB_FRAGS)
                netif_tx_wake_queue(txq);
        
        return 0;
}       

So virtqueue_napi_complete is called while txq is not locked;
conceivably start_xmit can run in parallel, right?

Now virtqueue_napi_complete

static void virtqueue_napi_complete(struct napi_struct *napi,
                                    struct virtqueue *vq, int processed)
{
        int opaque;

        opaque = virtqueue_enable_cb_prepare(vq);
        if (napi_complete_done(napi, processed)) {
                if (unlikely(virtqueue_poll(vq, opaque)))
                        virtqueue_napi_schedule(napi, vq);
        } else {
                virtqueue_disable_cb(vq);
        }
}


So it is calling virtqueue_enable_cb_prepare but tx queue
could be running and can process things in parallel ...
What gives? I suspect this corrupts the ring, and explains
why we are seeing weird hangs with vhost packed ring ...
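
For illustration, my reading of the window (the start_xmit call sites
are from memory, so take the exact names as approximate):

        /* CPU0: virtnet_poll_tx, txq lock already dropped */
        opaque = virtqueue_enable_cb_prepare(sq->vq); /* touches event suppression state */
        napi_complete_done(napi, 0);
        virtqueue_poll(sq->vq, opaque);               /* re-reads used index state */

        /* CPU1: start_xmit, holding the txq lock, racing with the above */
        free_old_xmit_skbs(sq, false);                /* virtqueue_get_buf advances last_used */
        virtqueue_add_outbuf(sq->vq, ...);            /* via xmit_skb */

Nothing serializes the two sides, so they can touch the driver-side
ring state concurrently.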

Jason?


-- 
MST

* Re: virtio-net: locking in tx napi
  2021-04-12 22:03 virtio-net: locking in tx napi Michael S. Tsirkin
@ 2021-04-12 22:31 ` Michael S. Tsirkin
  2021-04-13  2:29   ` Jason Wang
  0 siblings, 1 reply; 5+ messages in thread
From: Michael S. Tsirkin @ 2021-04-12 22:31 UTC (permalink / raw)
  To: Jason Wang; +Cc: Willem de Bruijn, virtualization

On Mon, Apr 12, 2021 at 06:03:50PM -0400, Michael S. Tsirkin wrote:
> I was working on the spurious interrupt problem and
> I noticed something weird.
> 
> static int virtnet_poll_tx(struct napi_struct *napi, int budget)
> {       
>         struct send_queue *sq = container_of(napi, struct send_queue, napi);
>         struct virtnet_info *vi = sq->vq->vdev->priv;
>         unsigned int index = vq2txq(sq->vq);
>         struct netdev_queue *txq;
> 
>         if (unlikely(is_xdp_raw_buffer_queue(vi, index))) {
>                 /* We don't need to enable cb for XDP */
>                 napi_complete_done(napi, 0);
>                 return 0;
>         }
> 
>         txq = netdev_get_tx_queue(vi->dev, index);
>         __netif_tx_lock(txq, raw_smp_processor_id());
>         free_old_xmit_skbs(sq, true);
>         __netif_tx_unlock(txq);
>         
>         virtqueue_napi_complete(napi, sq->vq, 0);
>         
>         if (sq->vq->num_free >= 2 + MAX_SKB_FRAGS)
>                 netif_tx_wake_queue(txq);
>         
>         return 0;
> }       
> 
> So virtqueue_napi_complete is called while txq is not locked;
> conceivably start_xmit can run in parallel, right?
> 
> Now virtqueue_napi_complete
> 
> static void virtqueue_napi_complete(struct napi_struct *napi,
>                                     struct virtqueue *vq, int processed)
> {
>         int opaque;
> 
>         opaque = virtqueue_enable_cb_prepare(vq);
>         if (napi_complete_done(napi, processed)) {
>                 if (unlikely(virtqueue_poll(vq, opaque)))
>                         virtqueue_napi_schedule(napi, vq);
>         } else {
>                 virtqueue_disable_cb(vq);
>         }
> }
> 
> 
> So it is calling virtqueue_enable_cb_prepare but tx queue
> could be running and can process things in parallel ...
> What gives? I suspect this corrupts the ring, and explains
> why we are seeing weird hangs with vhost packed ring ...
> 
> Jason?
> 
> 
> -- 
> MST

and wouldn't the following untested patch make sense then?


diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
index 82e520d2cb12..c23341b18eb5 100644
--- a/drivers/net/virtio_net.c
+++ b/drivers/net/virtio_net.c
@@ -1504,6 +1505,8 @@ static int virtnet_poll_tx(struct napi_struct *napi, int budget)
 	struct virtnet_info *vi = sq->vq->vdev->priv;
 	unsigned int index = vq2txq(sq->vq);
 	struct netdev_queue *txq;
+	int opaque;
+	bool done;
 
 	if (unlikely(is_xdp_raw_buffer_queue(vi, index))) {
 		/* We don't need to enable cb for XDP */
@@ -1514,9 +1517,26 @@ static int virtnet_poll_tx(struct napi_struct *napi, int budget)
 	txq = netdev_get_tx_queue(vi->dev, index);
 	__netif_tx_lock(txq, raw_smp_processor_id());
 	free_old_xmit_skbs(sq, true);
+
+	opaque = virtqueue_enable_cb_prepare(sq->vq);
+
+	done = napi_complete_done(napi, 0);
+
+	if (!done)
+		virtqueue_disable_cb(sq->vq);
+
 	__netif_tx_unlock(txq);
 
-	virtqueue_napi_complete(napi, sq->vq, 0);
+	if (done) {
+		if (unlikely(virtqueue_poll(sq->vq, opaque))) {
+			if (napi_schedule_prep(napi)) {
+				__netif_tx_lock(txq, raw_smp_processor_id());
+				virtqueue_disable_cb(sq->vq);
+				__netif_tx_unlock(txq);
+				__napi_schedule(napi);
+			}
+		}
+	}
 
 	if (sq->vq->num_free >= 2 + MAX_SKB_FRAGS)
 		netif_tx_wake_queue(txq);

* Re: virtio-net: locking in tx napi
  2021-04-12 22:31 ` Michael S. Tsirkin
@ 2021-04-13  2:29   ` Jason Wang
  2021-04-13  4:44     ` Michael S. Tsirkin
  0 siblings, 1 reply; 5+ messages in thread
From: Jason Wang @ 2021-04-13  2:29 UTC (permalink / raw)
  To: Michael S. Tsirkin; +Cc: Willem de Bruijn, virtualization


On 2021/4/13 6:31 AM, Michael S. Tsirkin wrote:
> On Mon, Apr 12, 2021 at 06:03:50PM -0400, Michael S. Tsirkin wrote:
>> I was working on the spurious interrupt problem and
>> I noticed something weird.
>>
>> static int virtnet_poll_tx(struct napi_struct *napi, int budget)
>> {
>>          struct send_queue *sq = container_of(napi, struct send_queue, napi);
>>          struct virtnet_info *vi = sq->vq->vdev->priv;
>>          unsigned int index = vq2txq(sq->vq);
>>          struct netdev_queue *txq;
>>
>>          if (unlikely(is_xdp_raw_buffer_queue(vi, index))) {
>>                  /* We don't need to enable cb for XDP */
>>                  napi_complete_done(napi, 0);
>>                  return 0;
>>          }
>>
>>          txq = netdev_get_tx_queue(vi->dev, index);
>>          __netif_tx_lock(txq, raw_smp_processor_id());
>>          free_old_xmit_skbs(sq, true);
>>          __netif_tx_unlock(txq);
>>          
>>          virtqueue_napi_complete(napi, sq->vq, 0);
>>          
>>          if (sq->vq->num_free >= 2 + MAX_SKB_FRAGS)
>>                  netif_tx_wake_queue(txq);
>>          
>>          return 0;
>> }
>>
>> So virtqueue_napi_complete is called while txq is not locked;
>> conceivably start_xmit can run in parallel, right?


Yes, I think so.


>>
>> Now virtqueue_napi_complete
>>
>> static void virtqueue_napi_complete(struct napi_struct *napi,
>>                                      struct virtqueue *vq, int processed)
>> {
>>          int opaque;
>>
>>          opaque = virtqueue_enable_cb_prepare(vq);
>>          if (napi_complete_done(napi, processed)) {
>>                  if (unlikely(virtqueue_poll(vq, opaque)))
>>                          virtqueue_napi_schedule(napi, vq);
>>          } else {
>>                  virtqueue_disable_cb(vq);
>>          }
>> }
>>
>>
>> So it is calling virtqueue_enable_cb_prepare but tx queue
>> could be running and can process things in parallel ...
>> What gives? I suspect this corrupts the ring, and explains
>> why we are seeing weird hangs with vhost packed ring ...
>>
>> Jason?


It might cause the interrupt to be disabled unexpectedly, I think.


>>
>>
>> -- 
>> MST
> and wouldn't the following untested patch make sense then?
>
>
> diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
> index 82e520d2cb12..c23341b18eb5 100644
> --- a/drivers/net/virtio_net.c
> +++ b/drivers/net/virtio_net.c
> @@ -1504,6 +1505,8 @@ static int virtnet_poll_tx(struct napi_struct *napi, int budget)
>   	struct virtnet_info *vi = sq->vq->vdev->priv;
>   	unsigned int index = vq2txq(sq->vq);
>   	struct netdev_queue *txq;
> +	int opaque;
> +	bool done;
>   
>   	if (unlikely(is_xdp_raw_buffer_queue(vi, index))) {
>   		/* We don't need to enable cb for XDP */
> @@ -1514,9 +1517,26 @@ static int virtnet_poll_tx(struct napi_struct *napi, int budget)
>   	txq = netdev_get_tx_queue(vi->dev, index);
>   	__netif_tx_lock(txq, raw_smp_processor_id());
>   	free_old_xmit_skbs(sq, true);
> +
> +	opaque = virtqueue_enable_cb_prepare(sq->vq);
> +
> +	done = napi_complete_done(napi, 0);
> +
> +	if (!done)
> +		virtqueue_disable_cb(sq->vq);
> +
>   	__netif_tx_unlock(txq);
>   
> -	virtqueue_napi_complete(napi, sq->vq, 0);
> +	if (done) {
> +		if (unlikely(virtqueue_poll(sq->vq, opaque))) {
> +			if (napi_schedule_prep(napi)) {
> +				__netif_tx_lock(txq, raw_smp_processor_id());
> +				virtqueue_disable_cb(sq->vq);
> +				__netif_tx_unlock(txq);
> +				__napi_schedule(napi);
> +			}
> +		}
> +	}
>   
>   	if (sq->vq->num_free >= 2 + MAX_SKB_FRAGS)
>   		netif_tx_wake_queue(txq);


I wonder: why not simply protect the whole of poll_tx with the tx lock in this case?
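
Something like this, perhaps (untested, just to show what I mean by
keeping the txq lock held across virtqueue_napi_complete):

	txq = netdev_get_tx_queue(vi->dev, index);
	__netif_tx_lock(txq, raw_smp_processor_id());
	free_old_xmit_skbs(sq, true);

	/* txq lock still held, so start_xmit cannot race with the
	 * callback re-enable / reschedule decision below.
	 */
	virtqueue_napi_complete(napi, sq->vq, 0);

	if (sq->vq->num_free >= 2 + MAX_SKB_FRAGS)
		netif_tx_wake_queue(txq);

	__netif_tx_unlock(txq);

	return 0;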

Thanks


* Re: virtio-net: locking in tx napi
  2021-04-13  2:29   ` Jason Wang
@ 2021-04-13  4:44     ` Michael S. Tsirkin
  2021-04-13 13:29       ` Willem de Bruijn
  0 siblings, 1 reply; 5+ messages in thread
From: Michael S. Tsirkin @ 2021-04-13  4:44 UTC (permalink / raw)
  To: Jason Wang; +Cc: Willem de Bruijn, virtualization

On Tue, Apr 13, 2021 at 10:29:03AM +0800, Jason Wang wrote:
> 
> On 2021/4/13 6:31 AM, Michael S. Tsirkin wrote:
> > On Mon, Apr 12, 2021 at 06:03:50PM -0400, Michael S. Tsirkin wrote:
> > > I was working on the spurious interrupt problem and
> > > I noticed something weird.
> > > 
> > > static int virtnet_poll_tx(struct napi_struct *napi, int budget)
> > > {
> > >          struct send_queue *sq = container_of(napi, struct send_queue, napi);
> > >          struct virtnet_info *vi = sq->vq->vdev->priv;
> > >          unsigned int index = vq2txq(sq->vq);
> > >          struct netdev_queue *txq;
> > > 
> > >          if (unlikely(is_xdp_raw_buffer_queue(vi, index))) {
> > >                  /* We don't need to enable cb for XDP */
> > >                  napi_complete_done(napi, 0);
> > >                  return 0;
> > >          }
> > > 
> > >          txq = netdev_get_tx_queue(vi->dev, index);
> > >          __netif_tx_lock(txq, raw_smp_processor_id());
> > >          free_old_xmit_skbs(sq, true);
> > >          __netif_tx_unlock(txq);
> > >          virtqueue_napi_complete(napi, sq->vq, 0);
> > >          if (sq->vq->num_free >= 2 + MAX_SKB_FRAGS)
> > >                  netif_tx_wake_queue(txq);
> > >          return 0;
> > > }
> > > 
> > > So virtqueue_napi_complete is called while txq is not locked;
> > > conceivably start_xmit can run in parallel, right?
> 
> 
> Yes, I think so.
> 
> 
> > > 
> > > Now virtqueue_napi_complete
> > > 
> > > static void virtqueue_napi_complete(struct napi_struct *napi,
> > >                                      struct virtqueue *vq, int processed)
> > > {
> > >          int opaque;
> > > 
> > >          opaque = virtqueue_enable_cb_prepare(vq);
> > >          if (napi_complete_done(napi, processed)) {
> > >                  if (unlikely(virtqueue_poll(vq, opaque)))
> > >                          virtqueue_napi_schedule(napi, vq);
> > >          } else {
> > >                  virtqueue_disable_cb(vq);
> > >          }
> > > }
> > > 
> > > 
> > > So it is calling virtqueue_enable_cb_prepare but tx queue
> > > could be running and can process things in parallel ...
> > > What gives? I suspect this corrupts the ring, and explains
> > > why we are seeing weird hangs with vhost packed ring ...
> > > 
> > > Jason?
> 
> 
> It might cause the interrupt to be disabled unexpectedly, I think.
> 
> 
> > > 
> > > 
> > > -- 
> > > MST
> > and wouldn't the following untested patch make sense then?
> > 
> > 
> > diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
> > index 82e520d2cb12..c23341b18eb5 100644
> > --- a/drivers/net/virtio_net.c
> > +++ b/drivers/net/virtio_net.c
> > @@ -1504,6 +1505,8 @@ static int virtnet_poll_tx(struct napi_struct *napi, int budget)
> >   	struct virtnet_info *vi = sq->vq->vdev->priv;
> >   	unsigned int index = vq2txq(sq->vq);
> >   	struct netdev_queue *txq;
> > +	int opaque;
> > +	bool done;
> >   	if (unlikely(is_xdp_raw_buffer_queue(vi, index))) {
> >   		/* We don't need to enable cb for XDP */
> > @@ -1514,9 +1517,26 @@ static int virtnet_poll_tx(struct napi_struct *napi, int budget)
> >   	txq = netdev_get_tx_queue(vi->dev, index);
> >   	__netif_tx_lock(txq, raw_smp_processor_id());
> >   	free_old_xmit_skbs(sq, true);
> > +
> > +	opaque = virtqueue_enable_cb_prepare(sq->vq);
> > +
> > +	done = napi_complete_done(napi, 0);
> > +
> > +	if (!done)
> > +		virtqueue_disable_cb(sq->vq);
> > +
> >   	__netif_tx_unlock(txq);
> > -	virtqueue_napi_complete(napi, sq->vq, 0);
> > +	if (done) {
> > +		if (unlikely(virtqueue_poll(sq->vq, opaque))) {
> > +			if (napi_schedule_prep(napi)) {
> > +				__netif_tx_lock(txq, raw_smp_processor_id());
> > +				virtqueue_disable_cb(sq->vq);
> > +				__netif_tx_unlock(txq);
> > +				__napi_schedule(napi);
> > +			}
> > +		}
> > +	}
> >   	if (sq->vq->num_free >= 2 + MAX_SKB_FRAGS)
> >   		netif_tx_wake_queue(txq);
> 
> 
> I wonder: why not simply protect the whole of poll_tx with the tx lock in this case?
> 
> Thanks
> 

Well, it takes __netif_tx_lock internally, does it not? Sounds like a
deadlock to me ...

-- 
MST

* Re: virtio-net: locking in tx napi
  2021-04-13  4:44     ` Michael S. Tsirkin
@ 2021-04-13 13:29       ` Willem de Bruijn
  0 siblings, 0 replies; 5+ messages in thread
From: Willem de Bruijn @ 2021-04-13 13:29 UTC (permalink / raw)
  To: Michael S. Tsirkin; +Cc: virtualization

On Tue, Apr 13, 2021 at 12:44 AM Michael S. Tsirkin <mst@redhat.com> wrote:
>
> On Tue, Apr 13, 2021 at 10:29:03AM +0800, Jason Wang wrote:
> >
> > On 2021/4/13 6:31 AM, Michael S. Tsirkin wrote:
> > > On Mon, Apr 12, 2021 at 06:03:50PM -0400, Michael S. Tsirkin wrote:
> > > > I was working on the spurious interrupt problem and
> > > > I noticed something weird.
> > > >
> > > > static int virtnet_poll_tx(struct napi_struct *napi, int budget)
> > > > {
> > > >          struct send_queue *sq = container_of(napi, struct send_queue, napi);
> > > >          struct virtnet_info *vi = sq->vq->vdev->priv;
> > > >          unsigned int index = vq2txq(sq->vq);
> > > >          struct netdev_queue *txq;
> > > >
> > > >          if (unlikely(is_xdp_raw_buffer_queue(vi, index))) {
> > > >                  /* We don't need to enable cb for XDP */
> > > >                  napi_complete_done(napi, 0);
> > > >                  return 0;
> > > >          }
> > > >
> > > >          txq = netdev_get_tx_queue(vi->dev, index);
> > > >          __netif_tx_lock(txq, raw_smp_processor_id());
> > > >          free_old_xmit_skbs(sq, true);
> > > >          __netif_tx_unlock(txq);
> > > >          virtqueue_napi_complete(napi, sq->vq, 0);
> > > >          if (sq->vq->num_free >= 2 + MAX_SKB_FRAGS)
> > > >                  netif_tx_wake_queue(txq);
> > > >          return 0;
> > > > }
> > > >
> > > > So virtqueue_napi_complete is called while txq is not locked;
> > > > conceivably start_xmit can run in parallel, right?
> >
> >
> > Yes, I think so.

Agreed.

I see I'm quite a few emails behind already. Will respond in more
detail to the latest patch series.