All of lore.kernel.org
 help / color / mirror / Atom feed
* [RFC PATCH] virtio_net: Extend virtio to use VF datapath when available
@ 2017-12-19  0:40 Sridhar Samudrala
  2017-12-19 15:47 ` Michael S. Tsirkin
                   ` (5 more replies)
  0 siblings, 6 replies; 44+ messages in thread
From: Sridhar Samudrala @ 2017-12-19  0:40 UTC (permalink / raw)
  To: mst, stephen, netdev, virtualization, alexander.duyck, sridhar.samudrala

This patch enables virtio to switch over to a VF datapath when a VF netdev
is present with the same MAC address.  It allows live migration of a VM
with a direct attached VF without the need to setup a bond/team between a
VF and virtio net device in the guest.

The hypervisor needs to unplug the VF device from the guest on the source
host and reset the MAC filter of the VF to initiate failover of datapath to
virtio before starting the migration. After the migration is completed, the
destination hypervisor sets the MAC filter on the VF and plugs it back to
the guest to switch over to VF datapath.

It is entirely based on netvsc implementation and it should be possible to
make this code generic and move it to a common location that can be shared
by netvsc and virtio.

Also, i think we should make this a negotiated feature that is off by
default via a new feature bit.

This patch is based on the discussion initiated by Jesse on this thread.
https://marc.info/?l=linux-virtualization&m=151189725224231&w=2

Signed-off-by: Sridhar Samudrala <sridhar.samudrala@intel.com>
Reviewed-by: Jesse Brandeburg <jesse.brandeburg@intel.com>
---
 drivers/net/virtio_net.c | 341 ++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 339 insertions(+), 2 deletions(-)

diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
index 559b215c0169..a34c717bb15b 100644
--- a/drivers/net/virtio_net.c
+++ b/drivers/net/virtio_net.c
@@ -31,6 +31,8 @@
 #include <linux/average.h>
 #include <linux/filter.h>
 #include <net/route.h>
+#include <linux/netdevice.h>
+#include <linux/netpoll.h>
 
 static int napi_weight = NAPI_POLL_WEIGHT;
 module_param(napi_weight, int, 0444);
@@ -56,6 +58,8 @@ module_param(napi_tx, bool, 0644);
  */
 DECLARE_EWMA(pkt_len, 0, 64)
 
+#define VF_TAKEOVER_INT	(HZ / 10)
+
 #define VIRTNET_DRIVER_VERSION "1.0.0"
 
 static const unsigned long guest_offloads[] = {
@@ -117,6 +121,15 @@ struct receive_queue {
 	char name[40];
 };
 
+struct virtnet_vf_pcpu_stats {
+	u64	rx_packets;
+	u64	rx_bytes;
+	u64	tx_packets;
+	u64	tx_bytes;
+	struct u64_stats_sync   syncp;
+	u32	tx_dropped;
+};
+
 struct virtnet_info {
 	struct virtio_device *vdev;
 	struct virtqueue *cvq;
@@ -179,6 +192,11 @@ struct virtnet_info {
 	u32 speed;
 
 	unsigned long guest_offloads;
+
+	/* State to manage the associated VF interface. */
+	struct net_device __rcu *vf_netdev;
+	struct virtnet_vf_pcpu_stats __percpu *vf_stats;
+	struct delayed_work vf_takeover;
 };
 
 struct padded_vnet_hdr {
@@ -1300,16 +1318,51 @@ static int xmit_skb(struct send_queue *sq, struct sk_buff *skb)
 	return virtqueue_add_outbuf(sq->vq, sq->sg, num_sg, skb, GFP_ATOMIC);
 }
 
+/* Send skb on the slave VF device. */
+static int virtnet_vf_xmit(struct net_device *dev, struct net_device *vf_netdev,
+			   struct sk_buff *skb)
+{
+	struct virtnet_info *vi = netdev_priv(dev);
+	unsigned int len = skb->len;
+	int rc;
+
+	skb->dev = vf_netdev;
+	skb->queue_mapping = qdisc_skb_cb(skb)->slave_dev_queue_mapping;
+
+	rc = dev_queue_xmit(skb);
+	if (likely(rc == NET_XMIT_SUCCESS || rc == NET_XMIT_CN)) {
+		struct virtnet_vf_pcpu_stats *pcpu_stats
+			= this_cpu_ptr(vi->vf_stats);
+
+		u64_stats_update_begin(&pcpu_stats->syncp);
+		pcpu_stats->tx_packets++;
+		pcpu_stats->tx_bytes += len;
+		u64_stats_update_end(&pcpu_stats->syncp);
+	} else {
+		this_cpu_inc(vi->vf_stats->tx_dropped);
+	}
+
+	return rc;
+}
+
 static netdev_tx_t start_xmit(struct sk_buff *skb, struct net_device *dev)
 {
 	struct virtnet_info *vi = netdev_priv(dev);
 	int qnum = skb_get_queue_mapping(skb);
 	struct send_queue *sq = &vi->sq[qnum];
+	struct net_device *vf_netdev;
 	int err;
 	struct netdev_queue *txq = netdev_get_tx_queue(dev, qnum);
 	bool kick = !skb->xmit_more;
 	bool use_napi = sq->napi.weight;
 
+	/* if VF is present and up then redirect packets
+	 * called with rcu_read_lock_bh
+	 */
+	vf_netdev = rcu_dereference_bh(vi->vf_netdev);
+	if (vf_netdev && netif_running(vf_netdev) && !netpoll_tx_running(dev))
+		return virtnet_vf_xmit(dev, vf_netdev, skb);
+
 	/* Free up any pending old buffers before queueing new ones. */
 	free_old_xmit_skbs(sq);
 
@@ -1456,10 +1509,41 @@ static int virtnet_set_mac_address(struct net_device *dev, void *p)
 	return ret;
 }
 
+static void virtnet_get_vf_stats(struct net_device *dev,
+				 struct virtnet_vf_pcpu_stats *tot)
+{
+	struct virtnet_info *vi = netdev_priv(dev);
+	int i;
+
+	memset(tot, 0, sizeof(*tot));
+
+	for_each_possible_cpu(i) {
+		const struct virtnet_vf_pcpu_stats *stats
+				= per_cpu_ptr(vi->vf_stats, i);
+		u64 rx_packets, rx_bytes, tx_packets, tx_bytes;
+		unsigned int start;
+
+		do {
+			start = u64_stats_fetch_begin_irq(&stats->syncp);
+			rx_packets = stats->rx_packets;
+			tx_packets = stats->tx_packets;
+			rx_bytes = stats->rx_bytes;
+			tx_bytes = stats->tx_bytes;
+		} while (u64_stats_fetch_retry_irq(&stats->syncp, start));
+
+		tot->rx_packets += rx_packets;
+		tot->tx_packets += tx_packets;
+		tot->rx_bytes   += rx_bytes;
+		tot->tx_bytes   += tx_bytes;
+		tot->tx_dropped += stats->tx_dropped;
+	}
+}
+
 static void virtnet_stats(struct net_device *dev,
 			  struct rtnl_link_stats64 *tot)
 {
 	struct virtnet_info *vi = netdev_priv(dev);
+	struct virtnet_vf_pcpu_stats vf_stats;
 	int cpu;
 	unsigned int start;
 
@@ -1490,6 +1574,13 @@ static void virtnet_stats(struct net_device *dev,
 	tot->rx_dropped = dev->stats.rx_dropped;
 	tot->rx_length_errors = dev->stats.rx_length_errors;
 	tot->rx_frame_errors = dev->stats.rx_frame_errors;
+
+	virtnet_get_vf_stats(dev, &vf_stats);
+	tot->rx_packets += vf_stats.rx_packets;
+	tot->tx_packets += vf_stats.tx_packets;
+	tot->rx_bytes += vf_stats.rx_bytes;
+	tot->tx_bytes += vf_stats.tx_bytes;
+	tot->tx_dropped += vf_stats.tx_dropped;
 }
 
 #ifdef CONFIG_NET_POLL_CONTROLLER
@@ -2508,6 +2599,47 @@ static int virtnet_validate(struct virtio_device *vdev)
 	return 0;
 }
 
+static void __virtnet_vf_setup(struct net_device *ndev,
+			       struct net_device *vf_netdev)
+{
+	int ret;
+
+	/* Align MTU of VF with master */
+	ret = dev_set_mtu(vf_netdev, ndev->mtu);
+	if (ret)
+		netdev_warn(vf_netdev,
+			    "unable to change mtu to %u\n", ndev->mtu);
+
+	if (netif_running(ndev)) {
+		ret = dev_open(vf_netdev);
+		if (ret)
+			netdev_warn(vf_netdev,
+				    "unable to open: %d\n", ret);
+	}
+}
+
+/* Setup VF as slave of the virtio device.
+ * Runs in workqueue to avoid recursion in netlink callbacks.
+ */
+static void virtnet_vf_setup(struct work_struct *w)
+{
+	struct virtnet_info *vi
+		= container_of(w, struct virtnet_info, vf_takeover.work);
+	struct net_device *ndev = vi->dev;
+	struct net_device *vf_netdev;
+
+	if (!rtnl_trylock()) {
+		schedule_delayed_work(&vi->vf_takeover, 0);
+		return;
+	}
+
+	vf_netdev = rtnl_dereference(vi->vf_netdev);
+	if (vf_netdev)
+		__virtnet_vf_setup(ndev, vf_netdev);
+
+	rtnl_unlock();
+}
+
 static int virtnet_probe(struct virtio_device *vdev)
 {
 	int i, err;
@@ -2600,6 +2732,11 @@ static int virtnet_probe(struct virtio_device *vdev)
 	}
 
 	INIT_WORK(&vi->config_work, virtnet_config_changed_work);
+	INIT_DELAYED_WORK(&vi->vf_takeover, virtnet_vf_setup);
+
+	vi->vf_stats = netdev_alloc_pcpu_stats(struct virtnet_vf_pcpu_stats);
+	if (!vi->vf_stats)
+		goto free_stats;
 
 	/* If we can receive ANY GSO packets, we must allocate large ones. */
 	if (virtio_has_feature(vdev, VIRTIO_NET_F_GUEST_TSO4) ||
@@ -2634,7 +2771,7 @@ static int virtnet_probe(struct virtio_device *vdev)
 			 */
 			dev_err(&vdev->dev, "device MTU appears to have changed "
 				"it is now %d < %d", mtu, dev->min_mtu);
-			goto free_stats;
+			goto free_vf_stats;
 		}
 
 		dev->mtu = mtu;
@@ -2658,7 +2795,7 @@ static int virtnet_probe(struct virtio_device *vdev)
 	/* Allocate/initialize the rx/tx queues, and invoke find_vqs */
 	err = init_vqs(vi);
 	if (err)
-		goto free_stats;
+		goto free_vf_stats;
 
 #ifdef CONFIG_SYSFS
 	if (vi->mergeable_rx_bufs)
@@ -2712,6 +2849,8 @@ static int virtnet_probe(struct virtio_device *vdev)
 	cancel_delayed_work_sync(&vi->refill);
 	free_receive_page_frags(vi);
 	virtnet_del_vqs(vi);
+free_vf_stats:
+	free_percpu(vi->vf_stats);
 free_stats:
 	free_percpu(vi->stats);
 free:
@@ -2733,19 +2872,178 @@ static void remove_vq_common(struct virtnet_info *vi)
 	virtnet_del_vqs(vi);
 }
 
+static struct net_device *get_virtio_bymac(const u8 *mac)
+{
+	struct net_device *dev;
+
+	ASSERT_RTNL();
+
+	for_each_netdev(&init_net, dev) {
+		if (dev->netdev_ops != &virtnet_netdev)
+			continue;       /* not a virtio_net device */
+
+		if (ether_addr_equal(mac, dev->perm_addr))
+			return dev;
+	}
+
+	return NULL;
+}
+
+static struct net_device *get_virtio_byref(struct net_device *vf_netdev)
+{
+	struct net_device *dev;
+
+	ASSERT_RTNL();
+
+	for_each_netdev(&init_net, dev) {
+		struct virtnet_info *vi;
+
+		if (dev->netdev_ops != &virtnet_netdev)
+			continue;	/* not a virtio_net device */
+
+		vi = netdev_priv(dev);
+		if (rtnl_dereference(vi->vf_netdev) == vf_netdev)
+			return dev;	/* a match */
+	}
+
+	return NULL;
+}
+
+/* Called when VF is injecting data into network stack.
+ * Change the associated network device from VF to virtio.
+ * note: already called with rcu_read_lock
+ */
+static rx_handler_result_t virtnet_vf_handle_frame(struct sk_buff **pskb)
+{
+	struct sk_buff *skb = *pskb;
+	struct net_device *ndev = rcu_dereference(skb->dev->rx_handler_data);
+	struct virtnet_info *vi = netdev_priv(ndev);
+	struct virtnet_vf_pcpu_stats *pcpu_stats =
+				this_cpu_ptr(vi->vf_stats);
+
+	skb->dev = ndev;
+
+	u64_stats_update_begin(&pcpu_stats->syncp);
+	pcpu_stats->rx_packets++;
+	pcpu_stats->rx_bytes += skb->len;
+	u64_stats_update_end(&pcpu_stats->syncp);
+
+	return RX_HANDLER_ANOTHER;
+}
+
+static int virtnet_vf_join(struct net_device *vf_netdev,
+			   struct net_device *ndev)
+{
+	struct virtnet_info *vi = netdev_priv(ndev);
+	int ret;
+
+	ret = netdev_rx_handler_register(vf_netdev,
+					 virtnet_vf_handle_frame, ndev);
+	if (ret != 0) {
+		netdev_err(vf_netdev,
+			   "can not register virtio VF receive handler (err = %d)\n",
+			   ret);
+		goto rx_handler_failed;
+	}
+
+	ret = netdev_upper_dev_link(vf_netdev, ndev, NULL);
+	if (ret != 0) {
+		netdev_err(vf_netdev,
+			   "can not set master device %s (err = %d)\n",
+			   ndev->name, ret);
+		goto upper_link_failed;
+	}
+
+	/* set slave flag before open to prevent IPv6 addrconf */
+	vf_netdev->flags |= IFF_SLAVE;
+
+	schedule_delayed_work(&vi->vf_takeover, VF_TAKEOVER_INT);
+
+	call_netdevice_notifiers(NETDEV_JOIN, vf_netdev);
+
+	netdev_info(vf_netdev, "joined to %s\n", ndev->name);
+	return 0;
+
+upper_link_failed:
+	netdev_rx_handler_unregister(vf_netdev);
+rx_handler_failed:
+	return ret;
+}
+
+static int virtnet_register_vf(struct net_device *vf_netdev)
+{
+	struct net_device *ndev;
+	struct virtnet_info *vi;
+
+	if (vf_netdev->addr_len != ETH_ALEN)
+		return NOTIFY_DONE;
+
+	/* We will use the MAC address to locate the virtio_net interface to
+	 * associate with the VF interface. If we don't find a matching
+	 * virtio interface, move on.
+	 */
+	ndev = get_virtio_bymac(vf_netdev->perm_addr);
+	if (!ndev)
+		return NOTIFY_DONE;
+
+	vi = netdev_priv(ndev);
+	if (rtnl_dereference(vi->vf_netdev))
+		return NOTIFY_DONE;
+
+	if (virtnet_vf_join(vf_netdev, ndev) != 0)
+		return NOTIFY_DONE;
+
+	netdev_info(ndev, "VF registering %s\n", vf_netdev->name);
+
+	dev_hold(vf_netdev);
+	rcu_assign_pointer(vi->vf_netdev, vf_netdev);
+
+	return NOTIFY_OK;
+}
+
+static int virtnet_unregister_vf(struct net_device *vf_netdev)
+{
+	struct net_device *ndev;
+	struct virtnet_info *vi;
+
+	ndev = get_virtio_byref(vf_netdev);
+	if (!ndev)
+		return NOTIFY_DONE;
+
+	vi = netdev_priv(ndev);
+	cancel_delayed_work_sync(&vi->vf_takeover);
+
+	netdev_info(ndev, "VF unregistering %s\n", vf_netdev->name);
+
+	netdev_rx_handler_unregister(vf_netdev);
+	netdev_upper_dev_unlink(vf_netdev, ndev);
+	RCU_INIT_POINTER(vi->vf_netdev, NULL);
+	dev_put(vf_netdev);
+
+	return NOTIFY_OK;
+}
+
 static void virtnet_remove(struct virtio_device *vdev)
 {
 	struct virtnet_info *vi = vdev->priv;
+	struct net_device *vf_netdev;
 
 	virtnet_cpu_notif_remove(vi);
 
 	/* Make sure no work handler is accessing the device. */
 	flush_work(&vi->config_work);
 
+	rtnl_lock();
+	vf_netdev = rtnl_dereference(vi->vf_netdev);
+	if (vf_netdev)
+		virtnet_unregister_vf(vf_netdev);
+	rtnl_unlock();
+
 	unregister_netdev(vi->dev);
 
 	remove_vq_common(vi);
 
+	free_percpu(vi->vf_stats);
 	free_percpu(vi->stats);
 	free_netdev(vi->dev);
 }
@@ -2823,6 +3121,42 @@ static struct virtio_driver virtio_net_driver = {
 #endif
 };
 
+static int virtio_netdev_event(struct notifier_block *this,
+			       unsigned long event, void *ptr)
+{
+	struct net_device *event_dev = netdev_notifier_info_to_dev(ptr);
+
+	/* Skip our own events */
+	if (event_dev->netdev_ops == &virtnet_netdev)
+		return NOTIFY_DONE;
+
+	/* Avoid non-Ethernet type devices */
+	if (event_dev->type != ARPHRD_ETHER)
+		return NOTIFY_DONE;
+
+	/* Avoid Vlan dev with same MAC registering as VF */
+	if (is_vlan_dev(event_dev))
+		return NOTIFY_DONE;
+
+	/* Avoid Bonding master dev with same MAC registering as VF */
+	if ((event_dev->priv_flags & IFF_BONDING) &&
+	    (event_dev->flags & IFF_MASTER))
+		return NOTIFY_DONE;
+
+	switch (event) {
+	case NETDEV_REGISTER:
+		return virtnet_register_vf(event_dev);
+	case NETDEV_UNREGISTER:
+		return virtnet_unregister_vf(event_dev);
+	default:
+		return NOTIFY_DONE;
+	}
+}
+
+static struct notifier_block virtio_netdev_notifier = {
+	.notifier_call = virtio_netdev_event,
+};
+
 static __init int virtio_net_driver_init(void)
 {
 	int ret;
@@ -2841,6 +3175,8 @@ static __init int virtio_net_driver_init(void)
         ret = register_virtio_driver(&virtio_net_driver);
 	if (ret)
 		goto err_virtio;
+
+	register_netdevice_notifier(&virtio_netdev_notifier);
 	return 0;
 err_virtio:
 	cpuhp_remove_multi_state(CPUHP_VIRT_NET_DEAD);
@@ -2853,6 +3189,7 @@ module_init(virtio_net_driver_init);
 
 static __exit void virtio_net_driver_exit(void)
 {
+	unregister_netdevice_notifier(&virtio_netdev_notifier);
 	unregister_virtio_driver(&virtio_net_driver);
 	cpuhp_remove_multi_state(CPUHP_VIRT_NET_DEAD);
 	cpuhp_remove_multi_state(virtionet_online);
-- 
2.14.3

^ permalink raw reply related	[flat|nested] 44+ messages in thread

* Re: [RFC PATCH] virtio_net: Extend virtio to use VF datapath when available
  2017-12-19  0:40 [RFC PATCH] virtio_net: Extend virtio to use VF datapath when available Sridhar Samudrala
  2017-12-19 15:47 ` Michael S. Tsirkin
@ 2017-12-19 15:47 ` Michael S. Tsirkin
  2017-12-19 17:41   ` Samudrala, Sridhar
  2017-12-19 17:41   ` Samudrala, Sridhar
  2017-12-20 22:33 ` Jakub Kicinski
                   ` (3 subsequent siblings)
  5 siblings, 2 replies; 44+ messages in thread
From: Michael S. Tsirkin @ 2017-12-19 15:47 UTC (permalink / raw)
  To: Sridhar Samudrala; +Cc: stephen, netdev, virtualization, alexander.duyck

I'll need to look at this more, in particular the feature
bit is missing here. For now one question:

On Mon, Dec 18, 2017 at 04:40:36PM -0800, Sridhar Samudrala wrote:
> @@ -56,6 +58,8 @@ module_param(napi_tx, bool, 0644);
>   */
>  DECLARE_EWMA(pkt_len, 0, 64)
>  
> +#define VF_TAKEOVER_INT	(HZ / 10)
> +
>  #define VIRTNET_DRIVER_VERSION "1.0.0"
>  
>  static const unsigned long guest_offloads[] = {

Why is this delay necessary? And why by 100ms?

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [RFC PATCH] virtio_net: Extend virtio to use VF datapath when available
  2017-12-19  0:40 [RFC PATCH] virtio_net: Extend virtio to use VF datapath when available Sridhar Samudrala
@ 2017-12-19 15:47 ` Michael S. Tsirkin
  2017-12-19 15:47 ` Michael S. Tsirkin
                   ` (4 subsequent siblings)
  5 siblings, 0 replies; 44+ messages in thread
From: Michael S. Tsirkin @ 2017-12-19 15:47 UTC (permalink / raw)
  To: Sridhar Samudrala; +Cc: netdev, alexander.duyck, virtualization

I'll need to look at this more, in particular the feature
bit is missing here. For now one question:

On Mon, Dec 18, 2017 at 04:40:36PM -0800, Sridhar Samudrala wrote:
> @@ -56,6 +58,8 @@ module_param(napi_tx, bool, 0644);
>   */
>  DECLARE_EWMA(pkt_len, 0, 64)
>  
> +#define VF_TAKEOVER_INT	(HZ / 10)
> +
>  #define VIRTNET_DRIVER_VERSION "1.0.0"
>  
>  static const unsigned long guest_offloads[] = {

Why is this delay necessary? And why by 100ms?

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [RFC PATCH] virtio_net: Extend virtio to use VF datapath when available
  2017-12-19 15:47 ` Michael S. Tsirkin
  2017-12-19 17:41   ` Samudrala, Sridhar
@ 2017-12-19 17:41   ` Samudrala, Sridhar
  2017-12-19 17:55     ` Stephen Hemminger
                       ` (3 more replies)
  1 sibling, 4 replies; 44+ messages in thread
From: Samudrala, Sridhar @ 2017-12-19 17:41 UTC (permalink / raw)
  To: Michael S. Tsirkin
  Cc: stephen, netdev, virtualization, alexander.duyck, Brandeburg, Jesse


On 12/19/2017 7:47 AM, Michael S. Tsirkin wrote:
> I'll need to look at this more, in particular the feature
> bit is missing here. For now one question:
>
> On Mon, Dec 18, 2017 at 04:40:36PM -0800, Sridhar Samudrala wrote:
>> @@ -56,6 +58,8 @@ module_param(napi_tx, bool, 0644);
>>    */
>>   DECLARE_EWMA(pkt_len, 0, 64)
>>   
>> +#define VF_TAKEOVER_INT	(HZ / 10)
>> +
>>   #define VIRTNET_DRIVER_VERSION "1.0.0"
>>   
>>   static const unsigned long guest_offloads[] = {
> Why is this delay necessary? And why by 100ms?

This is based on netvsc implementation and here is the commit that
added this delay.  Not sure if this needs to be 100ms.

commit 6123c66854c174e4982f98195100c1d990f9e5e6
Author: stephen hemminger <stephen@networkplumber.org>
Date:   Wed Aug 9 17:46:03 2017 -0700

     netvsc: delay setup of VF device

     When VF device is discovered, delay bring it automatically up in
     order to allow userspace to some simple changes (like renaming).

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [RFC PATCH] virtio_net: Extend virtio to use VF datapath when available
  2017-12-19 15:47 ` Michael S. Tsirkin
@ 2017-12-19 17:41   ` Samudrala, Sridhar
  2017-12-19 17:41   ` Samudrala, Sridhar
  1 sibling, 0 replies; 44+ messages in thread
From: Samudrala, Sridhar @ 2017-12-19 17:41 UTC (permalink / raw)
  To: Michael S. Tsirkin; +Cc: netdev, alexander.duyck, virtualization


On 12/19/2017 7:47 AM, Michael S. Tsirkin wrote:
> I'll need to look at this more, in particular the feature
> bit is missing here. For now one question:
>
> On Mon, Dec 18, 2017 at 04:40:36PM -0800, Sridhar Samudrala wrote:
>> @@ -56,6 +58,8 @@ module_param(napi_tx, bool, 0644);
>>    */
>>   DECLARE_EWMA(pkt_len, 0, 64)
>>   
>> +#define VF_TAKEOVER_INT	(HZ / 10)
>> +
>>   #define VIRTNET_DRIVER_VERSION "1.0.0"
>>   
>>   static const unsigned long guest_offloads[] = {
> Why is this delay necessary? And why by 100ms?

This is based on netvsc implementation and here is the commit that
added this delay.  Not sure if this needs to be 100ms.

commit 6123c66854c174e4982f98195100c1d990f9e5e6
Author: stephen hemminger <stephen@networkplumber.org>
Date:   Wed Aug 9 17:46:03 2017 -0700

     netvsc: delay setup of VF device

     When VF device is discovered, delay bring it automatically up in
     order to allow userspace to some simple changes (like renaming).



_______________________________________________
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [RFC PATCH] virtio_net: Extend virtio to use VF datapath when available
  2017-12-19 17:41   ` Samudrala, Sridhar
  2017-12-19 17:55     ` Stephen Hemminger
@ 2017-12-19 17:55     ` Stephen Hemminger
  2017-12-19 18:07       ` Michael S. Tsirkin
                         ` (3 more replies)
  2017-12-19 18:20     ` David Miller
  2017-12-19 18:20     ` David Miller
  3 siblings, 4 replies; 44+ messages in thread
From: Stephen Hemminger @ 2017-12-19 17:55 UTC (permalink / raw)
  To: Samudrala, Sridhar
  Cc: Michael S. Tsirkin, netdev, virtualization, alexander.duyck,
	Brandeburg, Jesse

On Tue, 19 Dec 2017 09:41:39 -0800
"Samudrala, Sridhar" <sridhar.samudrala@intel.com> wrote:

> On 12/19/2017 7:47 AM, Michael S. Tsirkin wrote:
> > I'll need to look at this more, in particular the feature
> > bit is missing here. For now one question:
> >
> > On Mon, Dec 18, 2017 at 04:40:36PM -0800, Sridhar Samudrala wrote:  
> >> @@ -56,6 +58,8 @@ module_param(napi_tx, bool, 0644);
> >>    */
> >>   DECLARE_EWMA(pkt_len, 0, 64)
> >>   
> >> +#define VF_TAKEOVER_INT	(HZ / 10)
> >> +
> >>   #define VIRTNET_DRIVER_VERSION "1.0.0"
> >>   
> >>   static const unsigned long guest_offloads[] = {  
> > Why is this delay necessary? And why by 100ms?  
> 
> This is based on netvsc implementation and here is the commit that
> added this delay.  Not sure if this needs to be 100ms.
> 
> commit 6123c66854c174e4982f98195100c1d990f9e5e6
> Author: stephen hemminger <stephen@networkplumber.org>
> Date:   Wed Aug 9 17:46:03 2017 -0700
> 
>      netvsc: delay setup of VF device
> 
>      When VF device is discovered, delay bring it automatically up in
>      order to allow userspace to some simple changes (like renaming).
> 
> 
> 

could be 10ms, just enough to let udev do its renaming

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [RFC PATCH] virtio_net: Extend virtio to use VF datapath when available
  2017-12-19 17:41   ` Samudrala, Sridhar
@ 2017-12-19 17:55     ` Stephen Hemminger
  2017-12-19 17:55     ` Stephen Hemminger
                       ` (2 subsequent siblings)
  3 siblings, 0 replies; 44+ messages in thread
From: Stephen Hemminger @ 2017-12-19 17:55 UTC (permalink / raw)
  To: Samudrala, Sridhar
  Cc: netdev, virtualization, alexander.duyck, Michael S. Tsirkin

On Tue, 19 Dec 2017 09:41:39 -0800
"Samudrala, Sridhar" <sridhar.samudrala@intel.com> wrote:

> On 12/19/2017 7:47 AM, Michael S. Tsirkin wrote:
> > I'll need to look at this more, in particular the feature
> > bit is missing here. For now one question:
> >
> > On Mon, Dec 18, 2017 at 04:40:36PM -0800, Sridhar Samudrala wrote:  
> >> @@ -56,6 +58,8 @@ module_param(napi_tx, bool, 0644);
> >>    */
> >>   DECLARE_EWMA(pkt_len, 0, 64)
> >>   
> >> +#define VF_TAKEOVER_INT	(HZ / 10)
> >> +
> >>   #define VIRTNET_DRIVER_VERSION "1.0.0"
> >>   
> >>   static const unsigned long guest_offloads[] = {  
> > Why is this delay necessary? And why by 100ms?  
> 
> This is based on netvsc implementation and here is the commit that
> added this delay.  Not sure if this needs to be 100ms.
> 
> commit 6123c66854c174e4982f98195100c1d990f9e5e6
> Author: stephen hemminger <stephen@networkplumber.org>
> Date:   Wed Aug 9 17:46:03 2017 -0700
> 
>      netvsc: delay setup of VF device
> 
>      When VF device is discovered, delay bring it automatically up in
>      order to allow userspace to some simple changes (like renaming).
> 
> 
> 

could be 10ms, just enough to let udev do its renaming
_______________________________________________
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [RFC PATCH] virtio_net: Extend virtio to use VF datapath when available
  2017-12-19 17:55     ` Stephen Hemminger
@ 2017-12-19 18:07       ` Michael S. Tsirkin
  2017-12-19 18:13         ` Stephen Hemminger
  2017-12-19 18:13         ` Stephen Hemminger
  2017-12-19 18:07       ` Michael S. Tsirkin
                         ` (2 subsequent siblings)
  3 siblings, 2 replies; 44+ messages in thread
From: Michael S. Tsirkin @ 2017-12-19 18:07 UTC (permalink / raw)
  To: Stephen Hemminger
  Cc: Samudrala, Sridhar, netdev, virtualization, alexander.duyck,
	Brandeburg, Jesse

On Tue, Dec 19, 2017 at 09:55:48AM -0800, Stephen Hemminger wrote:
> On Tue, 19 Dec 2017 09:41:39 -0800
> "Samudrala, Sridhar" <sridhar.samudrala@intel.com> wrote:
> 
> > On 12/19/2017 7:47 AM, Michael S. Tsirkin wrote:
> > > I'll need to look at this more, in particular the feature
> > > bit is missing here. For now one question:
> > >
> > > On Mon, Dec 18, 2017 at 04:40:36PM -0800, Sridhar Samudrala wrote:  
> > >> @@ -56,6 +58,8 @@ module_param(napi_tx, bool, 0644);
> > >>    */
> > >>   DECLARE_EWMA(pkt_len, 0, 64)
> > >>   
> > >> +#define VF_TAKEOVER_INT	(HZ / 10)
> > >> +
> > >>   #define VIRTNET_DRIVER_VERSION "1.0.0"
> > >>   
> > >>   static const unsigned long guest_offloads[] = {  
> > > Why is this delay necessary? And why by 100ms?  
> > 
> > This is based on netvsc implementation and here is the commit that
> > added this delay.  Not sure if this needs to be 100ms.
> > 
> > commit 6123c66854c174e4982f98195100c1d990f9e5e6
> > Author: stephen hemminger <stephen@networkplumber.org>
> > Date:   Wed Aug 9 17:46:03 2017 -0700
> > 
> >      netvsc: delay setup of VF device
> > 
> >      When VF device is discovered, delay bring it automatically up in
> >      order to allow userspace to some simple changes (like renaming).
> > 
> > 
> > 
> 
> could be 10ms, just enough to let udev do its renaming

Isn't there a way not to depend on udev completing its thing within a given timeframe?

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [RFC PATCH] virtio_net: Extend virtio to use VF datapath when available
  2017-12-19 17:55     ` Stephen Hemminger
  2017-12-19 18:07       ` Michael S. Tsirkin
@ 2017-12-19 18:07       ` Michael S. Tsirkin
  2017-12-19 18:21       ` David Miller
  2017-12-19 18:21       ` David Miller
  3 siblings, 0 replies; 44+ messages in thread
From: Michael S. Tsirkin @ 2017-12-19 18:07 UTC (permalink / raw)
  To: Stephen Hemminger
  Cc: Samudrala, Sridhar, virtualization, alexander.duyck, netdev

On Tue, Dec 19, 2017 at 09:55:48AM -0800, Stephen Hemminger wrote:
> On Tue, 19 Dec 2017 09:41:39 -0800
> "Samudrala, Sridhar" <sridhar.samudrala@intel.com> wrote:
> 
> > On 12/19/2017 7:47 AM, Michael S. Tsirkin wrote:
> > > I'll need to look at this more, in particular the feature
> > > bit is missing here. For now one question:
> > >
> > > On Mon, Dec 18, 2017 at 04:40:36PM -0800, Sridhar Samudrala wrote:  
> > >> @@ -56,6 +58,8 @@ module_param(napi_tx, bool, 0644);
> > >>    */
> > >>   DECLARE_EWMA(pkt_len, 0, 64)
> > >>   
> > >> +#define VF_TAKEOVER_INT	(HZ / 10)
> > >> +
> > >>   #define VIRTNET_DRIVER_VERSION "1.0.0"
> > >>   
> > >>   static const unsigned long guest_offloads[] = {  
> > > Why is this delay necessary? And why by 100ms?  
> > 
> > This is based on netvsc implementation and here is the commit that
> > added this delay.  Not sure if this needs to be 100ms.
> > 
> > commit 6123c66854c174e4982f98195100c1d990f9e5e6
> > Author: stephen hemminger <stephen@networkplumber.org>
> > Date:   Wed Aug 9 17:46:03 2017 -0700
> > 
> >      netvsc: delay setup of VF device
> > 
> >      When VF device is discovered, delay bring it automatically up in
> >      order to allow userspace to some simple changes (like renaming).
> > 
> > 
> > 
> 
> could be 10ms, just enough to let udev do its renaming

Isn't there a way not to depend on udev completing its thing within a given timeframe?

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [RFC PATCH] virtio_net: Extend virtio to use VF datapath when available
  2017-12-19 18:07       ` Michael S. Tsirkin
@ 2017-12-19 18:13         ` Stephen Hemminger
  2017-12-19 18:13         ` Stephen Hemminger
  1 sibling, 0 replies; 44+ messages in thread
From: Stephen Hemminger @ 2017-12-19 18:13 UTC (permalink / raw)
  To: Michael S. Tsirkin
  Cc: Samudrala, Sridhar, netdev, virtualization, alexander.duyck,
	Brandeburg, Jesse

On Tue, 19 Dec 2017 20:07:01 +0200
"Michael S. Tsirkin" <mst@redhat.com> wrote:

> On Tue, Dec 19, 2017 at 09:55:48AM -0800, Stephen Hemminger wrote:
> > On Tue, 19 Dec 2017 09:41:39 -0800
> > "Samudrala, Sridhar" <sridhar.samudrala@intel.com> wrote:
> >   
> > > On 12/19/2017 7:47 AM, Michael S. Tsirkin wrote:  
> > > > I'll need to look at this more, in particular the feature
> > > > bit is missing here. For now one question:
> > > >
> > > > On Mon, Dec 18, 2017 at 04:40:36PM -0800, Sridhar Samudrala wrote:    
> > > >> @@ -56,6 +58,8 @@ module_param(napi_tx, bool, 0644);
> > > >>    */
> > > >>   DECLARE_EWMA(pkt_len, 0, 64)
> > > >>   
> > > >> +#define VF_TAKEOVER_INT	(HZ / 10)
> > > >> +
> > > >>   #define VIRTNET_DRIVER_VERSION "1.0.0"
> > > >>   
> > > >>   static const unsigned long guest_offloads[] = {    
> > > > Why is this delay necessary? And why by 100ms?    
> > > 
> > > This is based on netvsc implementation and here is the commit that
> > > added this delay.  Not sure if this needs to be 100ms.
> > > 
> > > commit 6123c66854c174e4982f98195100c1d990f9e5e6
> > > Author: stephen hemminger <stephen@networkplumber.org>
> > > Date:   Wed Aug 9 17:46:03 2017 -0700
> > > 
> > >      netvsc: delay setup of VF device
> > > 
> > >      When VF device is discovered, delay bring it automatically up in
> > >      order to allow userspace to some simple changes (like renaming).
> > > 
> > > 
> > >   
> > 
> > could be 10ms, just enough to let udev do its renaming  
> 
> Isn't there a way not to depend on udev completing its thing within a given timeframe?

Not that I know. the path is quite indirect.

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [RFC PATCH] virtio_net: Extend virtio to use VF datapath when available
  2017-12-19 18:07       ` Michael S. Tsirkin
  2017-12-19 18:13         ` Stephen Hemminger
@ 2017-12-19 18:13         ` Stephen Hemminger
  1 sibling, 0 replies; 44+ messages in thread
From: Stephen Hemminger @ 2017-12-19 18:13 UTC (permalink / raw)
  To: Michael S. Tsirkin
  Cc: Samudrala, Sridhar, virtualization, alexander.duyck, netdev

On Tue, 19 Dec 2017 20:07:01 +0200
"Michael S. Tsirkin" <mst@redhat.com> wrote:

> On Tue, Dec 19, 2017 at 09:55:48AM -0800, Stephen Hemminger wrote:
> > On Tue, 19 Dec 2017 09:41:39 -0800
> > "Samudrala, Sridhar" <sridhar.samudrala@intel.com> wrote:
> >   
> > > On 12/19/2017 7:47 AM, Michael S. Tsirkin wrote:  
> > > > I'll need to look at this more, in particular the feature
> > > > bit is missing here. For now one question:
> > > >
> > > > On Mon, Dec 18, 2017 at 04:40:36PM -0800, Sridhar Samudrala wrote:    
> > > >> @@ -56,6 +58,8 @@ module_param(napi_tx, bool, 0644);
> > > >>    */
> > > >>   DECLARE_EWMA(pkt_len, 0, 64)
> > > >>   
> > > >> +#define VF_TAKEOVER_INT	(HZ / 10)
> > > >> +
> > > >>   #define VIRTNET_DRIVER_VERSION "1.0.0"
> > > >>   
> > > >>   static const unsigned long guest_offloads[] = {    
> > > > Why is this delay necessary? And why by 100ms?    
> > > 
> > > This is based on netvsc implementation and here is the commit that
> > > added this delay.  Not sure if this needs to be 100ms.
> > > 
> > > commit 6123c66854c174e4982f98195100c1d990f9e5e6
> > > Author: stephen hemminger <stephen@networkplumber.org>
> > > Date:   Wed Aug 9 17:46:03 2017 -0700
> > > 
> > >      netvsc: delay setup of VF device
> > > 
> > >      When VF device is discovered, delay bring it automatically up in
> > >      order to allow userspace to some simple changes (like renaming).
> > > 
> > > 
> > >   
> > 
> > could be 10ms, just enough to let udev do its renaming  
> 
> Isn't there a way not to depend on udev completing its thing within a given timeframe?

Not that I know. the path is quite indirect.
_______________________________________________
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [RFC PATCH] virtio_net: Extend virtio to use VF datapath when available
  2017-12-19 17:41   ` Samudrala, Sridhar
                       ` (2 preceding siblings ...)
  2017-12-19 18:20     ` David Miller
@ 2017-12-19 18:20     ` David Miller
  2017-12-20 10:51       ` Vitaly Kuznetsov
  2017-12-20 10:51       ` Vitaly Kuznetsov
  3 siblings, 2 replies; 44+ messages in thread
From: David Miller @ 2017-12-19 18:20 UTC (permalink / raw)
  To: sridhar.samudrala
  Cc: mst, stephen, netdev, virtualization, alexander.duyck, jesse.brandeburg

From: "Samudrala, Sridhar" <sridhar.samudrala@intel.com>
Date: Tue, 19 Dec 2017 09:41:39 -0800

> This is based on netvsc implementation and here is the commit that
> added this delay.  Not sure if this needs to be 100ms.
> 
> commit 6123c66854c174e4982f98195100c1d990f9e5e6
> Author: stephen hemminger <stephen@networkplumber.org>
> Date:   Wed Aug 9 17:46:03 2017 -0700
> 
>     netvsc: delay setup of VF device
> 
>     When VF device is discovered, delay bring it automatically up in
>     order to allow userspace to some simple changes (like renaming).

This is kind of bogus, I should have called this out when the patch
was posted.

Any delay is wrong, there needs to be tight synchronization if a
userspace operation must occur before proceeding.  If something
happens and userspace is delayed, this whole thing doesn't work.

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [RFC PATCH] virtio_net: Extend virtio to use VF datapath when available
  2017-12-19 17:41   ` Samudrala, Sridhar
  2017-12-19 17:55     ` Stephen Hemminger
  2017-12-19 17:55     ` Stephen Hemminger
@ 2017-12-19 18:20     ` David Miller
  2017-12-19 18:20     ` David Miller
  3 siblings, 0 replies; 44+ messages in thread
From: David Miller @ 2017-12-19 18:20 UTC (permalink / raw)
  To: sridhar.samudrala; +Cc: mst, netdev, alexander.duyck, virtualization

From: "Samudrala, Sridhar" <sridhar.samudrala@intel.com>
Date: Tue, 19 Dec 2017 09:41:39 -0800

> This is based on netvsc implementation and here is the commit that
> added this delay.  Not sure if this needs to be 100ms.
> 
> commit 6123c66854c174e4982f98195100c1d990f9e5e6
> Author: stephen hemminger <stephen@networkplumber.org>
> Date:   Wed Aug 9 17:46:03 2017 -0700
> 
>     netvsc: delay setup of VF device
> 
>     When VF device is discovered, delay bring it automatically up in
>     order to allow userspace to some simple changes (like renaming).

This is kind of bogus, I should have called this out when the patch
was posted.

Any delay is wrong, there needs to be tight synchronization if a
userspace operation must occur before proceeding.  If something
happens and userspace is delayed, this whole thing doesn't work.

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [RFC PATCH] virtio_net: Extend virtio to use VF datapath when available
  2017-12-19 17:55     ` Stephen Hemminger
                         ` (2 preceding siblings ...)
  2017-12-19 18:21       ` David Miller
@ 2017-12-19 18:21       ` David Miller
  2017-12-19 18:41         ` Stephen Hemminger
  2017-12-19 18:41         ` Stephen Hemminger
  3 siblings, 2 replies; 44+ messages in thread
From: David Miller @ 2017-12-19 18:21 UTC (permalink / raw)
  To: stephen
  Cc: sridhar.samudrala, mst, netdev, virtualization, alexander.duyck,
	jesse.brandeburg

From: Stephen Hemminger <stephen@networkplumber.org>
Date: Tue, 19 Dec 2017 09:55:48 -0800

> could be 10ms, just enough to let udev do its renaming

Please, move to some kind of notification or event based handling of
this problem.

No delay is safe, what if userspace gets swapped out or whatever
else might make userspace stall unexpectedly?

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [RFC PATCH] virtio_net: Extend virtio to use VF datapath when available
  2017-12-19 17:55     ` Stephen Hemminger
  2017-12-19 18:07       ` Michael S. Tsirkin
  2017-12-19 18:07       ` Michael S. Tsirkin
@ 2017-12-19 18:21       ` David Miller
  2017-12-19 18:21       ` David Miller
  3 siblings, 0 replies; 44+ messages in thread
From: David Miller @ 2017-12-19 18:21 UTC (permalink / raw)
  To: stephen; +Cc: mst, sridhar.samudrala, alexander.duyck, virtualization, netdev

From: Stephen Hemminger <stephen@networkplumber.org>
Date: Tue, 19 Dec 2017 09:55:48 -0800

> could be 10ms, just enough to let udev do its renaming

Please, move to some kind of notification or event based handling of
this problem.

No delay is safe, what if userspace gets swapped out or whatever
else might make userspace stall unexpectedly?

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [RFC PATCH] virtio_net: Extend virtio to use VF datapath when available
  2017-12-19 18:21       ` David Miller
  2017-12-19 18:41         ` Stephen Hemminger
@ 2017-12-19 18:41         ` Stephen Hemminger
  2017-12-19 19:42           ` Samudrala, Sridhar
                             ` (4 more replies)
  1 sibling, 5 replies; 44+ messages in thread
From: Stephen Hemminger @ 2017-12-19 18:41 UTC (permalink / raw)
  To: David Miller
  Cc: sridhar.samudrala, mst, netdev, virtualization, alexander.duyck,
	jesse.brandeburg

On Tue, 19 Dec 2017 13:21:17 -0500 (EST)
David Miller <davem@davemloft.net> wrote:

> From: Stephen Hemminger <stephen@networkplumber.org>
> Date: Tue, 19 Dec 2017 09:55:48 -0800
> 
> > could be 10ms, just enough to let udev do its renaming  
> 
> Please, move to some kind of notification or event based handling of
> this problem.
> 
> No delay is safe, what if userspace gets swapped out or whatever
> else might make userspace stall unexpectedly?
> 

The plan is to remove the delay and do the naming in the kernel.
This was suggested by Lennart since udev is only doing naming policy
because kernel names were not repeatable.

This makes the VF show up as "ethN_vf" on Hyper-V which is user friendly.

Patch is pending.

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [RFC PATCH] virtio_net: Extend virtio to use VF datapath when available
  2017-12-19 18:21       ` David Miller
@ 2017-12-19 18:41         ` Stephen Hemminger
  2017-12-19 18:41         ` Stephen Hemminger
  1 sibling, 0 replies; 44+ messages in thread
From: Stephen Hemminger @ 2017-12-19 18:41 UTC (permalink / raw)
  To: David Miller
  Cc: mst, sridhar.samudrala, alexander.duyck, virtualization, netdev

On Tue, 19 Dec 2017 13:21:17 -0500 (EST)
David Miller <davem@davemloft.net> wrote:

> From: Stephen Hemminger <stephen@networkplumber.org>
> Date: Tue, 19 Dec 2017 09:55:48 -0800
> 
> > could be 10ms, just enough to let udev do its renaming  
> 
> Please, move to some kind of notification or event based handling of
> this problem.
> 
> No delay is safe, what if userspace gets swapped out or whatever
> else might make userspace stall unexpectedly?
> 

The plan is to remove the delay and do the naming in the kernel.
This was suggested by Lennart since udev is only doing naming policy
because kernel names were not repeatable.

This makes the VF show up as "ethN_vf" on Hyper-V which is user friendly.

Patch is pending.

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [RFC PATCH] virtio_net: Extend virtio to use VF datapath when available
  2017-12-19 18:41         ` Stephen Hemminger
  2017-12-19 19:42           ` Samudrala, Sridhar
@ 2017-12-19 19:42           ` Samudrala, Sridhar
  2017-12-19 19:46             ` Stephen Hemminger
  2017-12-19 19:46             ` Stephen Hemminger
  2017-12-21  1:31           ` Siwei Liu
                             ` (2 subsequent siblings)
  4 siblings, 2 replies; 44+ messages in thread
From: Samudrala, Sridhar @ 2017-12-19 19:42 UTC (permalink / raw)
  To: Stephen Hemminger, David Miller
  Cc: mst, netdev, virtualization, alexander.duyck, jesse.brandeburg



On 12/19/2017 10:41 AM, Stephen Hemminger wrote:
> On Tue, 19 Dec 2017 13:21:17 -0500 (EST)
> David Miller <davem@davemloft.net> wrote:
>
>> From: Stephen Hemminger <stephen@networkplumber.org>
>> Date: Tue, 19 Dec 2017 09:55:48 -0800
>>
>>> could be 10ms, just enough to let udev do its renaming
>> Please, move to some kind of notification or event based handling of
>> this problem.
>>
>> No delay is safe, what if userspace gets swapped out or whatever
>> else might make userspace stall unexpectedly?
>>
> The plan is to remove the delay and do the naming in the kernel.
> This was suggested by Lennart since udev is only doing naming policy
> because kernel names were not repeatable.
>
> This makes the VF show up as "ethN_vf" on Hyper-V which is user friendly.
>
> Patch is pending.
Do we really need to delay the setup until the name is changed?
Can't we call dev_set_mtu() and dev_open() until dev_change_name() is done?

Thanks
Sridhar

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [RFC PATCH] virtio_net: Extend virtio to use VF datapath when available
  2017-12-19 18:41         ` Stephen Hemminger
@ 2017-12-19 19:42           ` Samudrala, Sridhar
  2017-12-19 19:42           ` Samudrala, Sridhar
                             ` (3 subsequent siblings)
  4 siblings, 0 replies; 44+ messages in thread
From: Samudrala, Sridhar @ 2017-12-19 19:42 UTC (permalink / raw)
  To: Stephen Hemminger, David Miller
  Cc: netdev, virtualization, alexander.duyck, mst



On 12/19/2017 10:41 AM, Stephen Hemminger wrote:
> On Tue, 19 Dec 2017 13:21:17 -0500 (EST)
> David Miller <davem@davemloft.net> wrote:
>
>> From: Stephen Hemminger <stephen@networkplumber.org>
>> Date: Tue, 19 Dec 2017 09:55:48 -0800
>>
>>> could be 10ms, just enough to let udev do its renaming
>> Please, move to some kind of notification or event based handling of
>> this problem.
>>
>> No delay is safe, what if userspace gets swapped out or whatever
>> else might make userspace stall unexpectedly?
>>
> The plan is to remove the delay and do the naming in the kernel.
> This was suggested by Lennart since udev is only doing naming policy
> because kernel names were not repeatable.
>
> This makes the VF show up as "ethN_vf" on Hyper-V which is user friendly.
>
> Patch is pending.
Do we really need to delay the setup until the name is changed?
Can't we call dev_set_mtu() and dev_open() until dev_change_name() is done?

Thanks
Sridhar

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [RFC PATCH] virtio_net: Extend virtio to use VF datapath when available
  2017-12-19 19:42           ` Samudrala, Sridhar
  2017-12-19 19:46             ` Stephen Hemminger
@ 2017-12-19 19:46             ` Stephen Hemminger
  2017-12-19 22:37               ` Samudrala, Sridhar
  2017-12-19 22:37               ` Samudrala, Sridhar
  1 sibling, 2 replies; 44+ messages in thread
From: Stephen Hemminger @ 2017-12-19 19:46 UTC (permalink / raw)
  To: Samudrala, Sridhar
  Cc: David Miller, mst, netdev, virtualization, alexander.duyck,
	jesse.brandeburg

On Tue, 19 Dec 2017 11:42:33 -0800
"Samudrala, Sridhar" <sridhar.samudrala@intel.com> wrote:

> On 12/19/2017 10:41 AM, Stephen Hemminger wrote:
> > On Tue, 19 Dec 2017 13:21:17 -0500 (EST)
> > David Miller <davem@davemloft.net> wrote:
> >  
> >> From: Stephen Hemminger <stephen@networkplumber.org>
> >> Date: Tue, 19 Dec 2017 09:55:48 -0800
> >>  
> >>> could be 10ms, just enough to let udev do its renaming  
> >> Please, move to some kind of notification or event based handling of
> >> this problem.
> >>
> >> No delay is safe, what if userspace gets swapped out or whatever
> >> else might make userspace stall unexpectedly?
> >>  
> > The plan is to remove the delay and do the naming in the kernel.
> > This was suggested by Lennart since udev is only doing naming policy
> > because kernel names were not repeatable.
> >
> > This makes the VF show up as "ethN_vf" on Hyper-V which is user friendly.
> >
> > Patch is pending.  
> Do we really need to delay the setup until the name is changed?
> Can't we call dev_set_mtu() and dev_open() until dev_change_name() is done?
> 
> Thanks
> Sridhar

You can call dev_set_mtu, but when dev_open is done the device name
can not be changed by userspace.

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [RFC PATCH] virtio_net: Extend virtio to use VF datapath when available
  2017-12-19 19:42           ` Samudrala, Sridhar
@ 2017-12-19 19:46             ` Stephen Hemminger
  2017-12-19 19:46             ` Stephen Hemminger
  1 sibling, 0 replies; 44+ messages in thread
From: Stephen Hemminger @ 2017-12-19 19:46 UTC (permalink / raw)
  To: Samudrala, Sridhar
  Cc: mst, netdev, alexander.duyck, virtualization, David Miller

On Tue, 19 Dec 2017 11:42:33 -0800
"Samudrala, Sridhar" <sridhar.samudrala@intel.com> wrote:

> On 12/19/2017 10:41 AM, Stephen Hemminger wrote:
> > On Tue, 19 Dec 2017 13:21:17 -0500 (EST)
> > David Miller <davem@davemloft.net> wrote:
> >  
> >> From: Stephen Hemminger <stephen@networkplumber.org>
> >> Date: Tue, 19 Dec 2017 09:55:48 -0800
> >>  
> >>> could be 10ms, just enough to let udev do its renaming  
> >> Please, move to some kind of notification or event based handling of
> >> this problem.
> >>
> >> No delay is safe, what if userspace gets swapped out or whatever
> >> else might make userspace stall unexpectedly?
> >>  
> > The plan is to remove the delay and do the naming in the kernel.
> > This was suggested by Lennart since udev is only doing naming policy
> > because kernel names were not repeatable.
> >
> > This makes the VF show up as "ethN_vf" on Hyper-V which is user friendly.
> >
> > Patch is pending.  
> Do we really need to delay the setup until the name is changed?
> Can't we call dev_set_mtu() and dev_open() until dev_change_name() is done?
> 
> Thanks
> Sridhar

You can call dev_set_mtu, but when dev_open is done the device name
can not be changed by userspace.

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [RFC PATCH] virtio_net: Extend virtio to use VF datapath when available
  2017-12-19 19:46             ` Stephen Hemminger
@ 2017-12-19 22:37               ` Samudrala, Sridhar
  2017-12-19 22:53                 ` Stephen Hemminger
  2017-12-19 22:53                 ` Stephen Hemminger
  2017-12-19 22:37               ` Samudrala, Sridhar
  1 sibling, 2 replies; 44+ messages in thread
From: Samudrala, Sridhar @ 2017-12-19 22:37 UTC (permalink / raw)
  To: Stephen Hemminger
  Cc: David Miller, mst, netdev, virtualization, alexander.duyck,
	jesse.brandeburg

On 12/19/2017 11:46 AM, Stephen Hemminger wrote:
> On Tue, 19 Dec 2017 11:42:33 -0800
> "Samudrala, Sridhar" <sridhar.samudrala@intel.com> wrote:
>
>> On 12/19/2017 10:41 AM, Stephen Hemminger wrote:
>>> On Tue, 19 Dec 2017 13:21:17 -0500 (EST)
>>> David Miller <davem@davemloft.net> wrote:
>>>   
>>>> From: Stephen Hemminger <stephen@networkplumber.org>
>>>> Date: Tue, 19 Dec 2017 09:55:48 -0800
>>>>   
>>>>> could be 10ms, just enough to let udev do its renaming
>>>> Please, move to some kind of notification or event based handling of
>>>> this problem.
>>>>
>>>> No delay is safe, what if userspace gets swapped out or whatever
>>>> else might make userspace stall unexpectedly?
>>>>   
>>> The plan is to remove the delay and do the naming in the kernel.
>>> This was suggested by Lennart since udev is only doing naming policy
>>> because kernel names were not repeatable.
>>>
>>> This makes the VF show up as "ethN_vf" on Hyper-V which is user friendly.
>>>
>>> Patch is pending.
>> Do we really need to delay the setup until the name is changed?
>> Can't we call dev_set_mtu() and dev_open() until dev_change_name() is done?
>>
>> Thanks
>> Sridhar
> You can call dev_set_mtu, but when dev_open is done the device name
> can not be changed by userspace.
I did a quick test to remove the delay and also the dev_open() call and 
i don't see
any issues with virtio taking over the VF datapath.
Only the netdev_info() messages may show old device name.

Any specific scenario where we need to explicitly call  VF's dev_open() 
in the VF setup process?
I tried i40evf driver loaded after virtio_net  AND  virtio_net loading 
after i40evf.

Thanks
Sridhar

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [RFC PATCH] virtio_net: Extend virtio to use VF datapath when available
  2017-12-19 19:46             ` Stephen Hemminger
  2017-12-19 22:37               ` Samudrala, Sridhar
@ 2017-12-19 22:37               ` Samudrala, Sridhar
  1 sibling, 0 replies; 44+ messages in thread
From: Samudrala, Sridhar @ 2017-12-19 22:37 UTC (permalink / raw)
  To: Stephen Hemminger
  Cc: mst, netdev, alexander.duyck, virtualization, David Miller

On 12/19/2017 11:46 AM, Stephen Hemminger wrote:
> On Tue, 19 Dec 2017 11:42:33 -0800
> "Samudrala, Sridhar" <sridhar.samudrala@intel.com> wrote:
>
>> On 12/19/2017 10:41 AM, Stephen Hemminger wrote:
>>> On Tue, 19 Dec 2017 13:21:17 -0500 (EST)
>>> David Miller <davem@davemloft.net> wrote:
>>>   
>>>> From: Stephen Hemminger <stephen@networkplumber.org>
>>>> Date: Tue, 19 Dec 2017 09:55:48 -0800
>>>>   
>>>>> could be 10ms, just enough to let udev do its renaming
>>>> Please, move to some kind of notification or event based handling of
>>>> this problem.
>>>>
>>>> No delay is safe, what if userspace gets swapped out or whatever
>>>> else might make userspace stall unexpectedly?
>>>>   
>>> The plan is to remove the delay and do the naming in the kernel.
>>> This was suggested by Lennart since udev is only doing naming policy
>>> because kernel names were not repeatable.
>>>
>>> This makes the VF show up as "ethN_vf" on Hyper-V which is user friendly.
>>>
>>> Patch is pending.
>> Do we really need to delay the setup until the name is changed?
>> Can't we call dev_set_mtu() and dev_open() until dev_change_name() is done?
>>
>> Thanks
>> Sridhar
> You can call dev_set_mtu, but when dev_open is done the device name
> can not be changed by userspace.
I did a quick test to remove the delay and also the dev_open() call and 
i don't see
any issues with virtio taking over the VF datapath.
Only the netdev_info() messages may show old device name.

Any specific scenario where we need to explicitly call  VF's dev_open() 
in the VF setup process?
I tried i40evf driver loaded after virtio_net  AND  virtio_net loading 
after i40evf.

Thanks
Sridhar
_______________________________________________
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [RFC PATCH] virtio_net: Extend virtio to use VF datapath when available
  2017-12-19 22:37               ` Samudrala, Sridhar
@ 2017-12-19 22:53                 ` Stephen Hemminger
  2017-12-20  0:26                   ` Samudrala, Sridhar
  2017-12-20  0:26                   ` Samudrala, Sridhar
  2017-12-19 22:53                 ` Stephen Hemminger
  1 sibling, 2 replies; 44+ messages in thread
From: Stephen Hemminger @ 2017-12-19 22:53 UTC (permalink / raw)
  To: Samudrala, Sridhar
  Cc: David Miller, mst, netdev, virtualization, alexander.duyck,
	jesse.brandeburg

On Tue, 19 Dec 2017 14:37:50 -0800
"Samudrala, Sridhar" <sridhar.samudrala@intel.com> wrote:

> On 12/19/2017 11:46 AM, Stephen Hemminger wrote:
> > On Tue, 19 Dec 2017 11:42:33 -0800
> > "Samudrala, Sridhar" <sridhar.samudrala@intel.com> wrote:
> >  
> >> On 12/19/2017 10:41 AM, Stephen Hemminger wrote:  
> >>> On Tue, 19 Dec 2017 13:21:17 -0500 (EST)
> >>> David Miller <davem@davemloft.net> wrote:
> >>>     
> >>>> From: Stephen Hemminger <stephen@networkplumber.org>
> >>>> Date: Tue, 19 Dec 2017 09:55:48 -0800
> >>>>     
> >>>>> could be 10ms, just enough to let udev do its renaming  
> >>>> Please, move to some kind of notification or event based handling of
> >>>> this problem.
> >>>>
> >>>> No delay is safe, what if userspace gets swapped out or whatever
> >>>> else might make userspace stall unexpectedly?
> >>>>     
> >>> The plan is to remove the delay and do the naming in the kernel.
> >>> This was suggested by Lennart since udev is only doing naming policy
> >>> because kernel names were not repeatable.
> >>>
> >>> This makes the VF show up as "ethN_vf" on Hyper-V which is user friendly.
> >>>
> >>> Patch is pending.  
> >> Do we really need to delay the setup until the name is changed?
> >> Can't we call dev_set_mtu() and dev_open() until dev_change_name() is done?
> >>
> >> Thanks
> >> Sridhar  
> > You can call dev_set_mtu, but when dev_open is done the device name
> > can not be changed by userspace.  
> I did a quick test to remove the delay and also the dev_open() call and 
> i don't see
> any issues with virtio taking over the VF datapath.
> Only the netdev_info() messages may show old device name.
> 
> Any specific scenario where we need to explicitly call  VF's dev_open() 
> in the VF setup process?
> I tried i40evf driver loaded after virtio_net  AND  virtio_net loading 
> after i40evf.
> 
> Thanks
> Sridhar

It happens with hotplug. It is possible on Hyper-V to hotplug SR-IOV on
and off while guest is running. If SR-IOV is disabled in host then the
VF device is removed (hotplug) and the inverse. If the master device is
up then the VF device should be brought up by the master device.

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [RFC PATCH] virtio_net: Extend virtio to use VF datapath when available
  2017-12-19 22:37               ` Samudrala, Sridhar
  2017-12-19 22:53                 ` Stephen Hemminger
@ 2017-12-19 22:53                 ` Stephen Hemminger
  1 sibling, 0 replies; 44+ messages in thread
From: Stephen Hemminger @ 2017-12-19 22:53 UTC (permalink / raw)
  To: Samudrala, Sridhar
  Cc: mst, netdev, alexander.duyck, virtualization, David Miller

On Tue, 19 Dec 2017 14:37:50 -0800
"Samudrala, Sridhar" <sridhar.samudrala@intel.com> wrote:

> On 12/19/2017 11:46 AM, Stephen Hemminger wrote:
> > On Tue, 19 Dec 2017 11:42:33 -0800
> > "Samudrala, Sridhar" <sridhar.samudrala@intel.com> wrote:
> >  
> >> On 12/19/2017 10:41 AM, Stephen Hemminger wrote:  
> >>> On Tue, 19 Dec 2017 13:21:17 -0500 (EST)
> >>> David Miller <davem@davemloft.net> wrote:
> >>>     
> >>>> From: Stephen Hemminger <stephen@networkplumber.org>
> >>>> Date: Tue, 19 Dec 2017 09:55:48 -0800
> >>>>     
> >>>>> could be 10ms, just enough to let udev do its renaming  
> >>>> Please, move to some kind of notification or event based handling of
> >>>> this problem.
> >>>>
> >>>> No delay is safe, what if userspace gets swapped out or whatever
> >>>> else might make userspace stall unexpectedly?
> >>>>     
> >>> The plan is to remove the delay and do the naming in the kernel.
> >>> This was suggested by Lennart since udev is only doing naming policy
> >>> because kernel names were not repeatable.
> >>>
> >>> This makes the VF show up as "ethN_vf" on Hyper-V which is user friendly.
> >>>
> >>> Patch is pending.  
> >> Do we really need to delay the setup until the name is changed?
> >> Can't we call dev_set_mtu() and dev_open() until dev_change_name() is done?
> >>
> >> Thanks
> >> Sridhar  
> > You can call dev_set_mtu, but when dev_open is done the device name
> > can not be changed by userspace.  
> I did a quick test to remove the delay and also the dev_open() call and 
> i don't see
> any issues with virtio taking over the VF datapath.
> Only the netdev_info() messages may show old device name.
> 
> Any specific scenario where we need to explicitly call  VF's dev_open() 
> in the VF setup process?
> I tried i40evf driver loaded after virtio_net  AND  virtio_net loading 
> after i40evf.
> 
> Thanks
> Sridhar

It happens with hotplug. It is possible on Hyper-V to hotplug SR-IOV on
and off while guest is running. If SR-IOV is disabled in host then the
VF device is removed (hotplug) and the inverse. If the master device is
up then the VF device should be brought up by the master device.
_______________________________________________
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [RFC PATCH] virtio_net: Extend virtio to use VF datapath when available
  2017-12-19 22:53                 ` Stephen Hemminger
  2017-12-20  0:26                   ` Samudrala, Sridhar
@ 2017-12-20  0:26                   ` Samudrala, Sridhar
  1 sibling, 0 replies; 44+ messages in thread
From: Samudrala, Sridhar @ 2017-12-20  0:26 UTC (permalink / raw)
  To: Stephen Hemminger
  Cc: David Miller, mst, netdev, virtualization, alexander.duyck,
	jesse.brandeburg

On 12/19/2017 2:53 PM, Stephen Hemminger wrote:
> On Tue, 19 Dec 2017 14:37:50 -0800
> "Samudrala, Sridhar" <sridhar.samudrala@intel.com> wrote:
>
>> On 12/19/2017 11:46 AM, Stephen Hemminger wrote:
>>> On Tue, 19 Dec 2017 11:42:33 -0800
>>> "Samudrala, Sridhar" <sridhar.samudrala@intel.com> wrote:
>>>   
>>>> On 12/19/2017 10:41 AM, Stephen Hemminger wrote:
>>>>> On Tue, 19 Dec 2017 13:21:17 -0500 (EST)
>>>>> David Miller <davem@davemloft.net> wrote:
>>>>>      
>>>>>> From: Stephen Hemminger <stephen@networkplumber.org>
>>>>>> Date: Tue, 19 Dec 2017 09:55:48 -0800
>>>>>>      
>>>>>>> could be 10ms, just enough to let udev do its renaming
>>>>>> Please, move to some kind of notification or event based handling of
>>>>>> this problem.
>>>>>>
>>>>>> No delay is safe, what if userspace gets swapped out or whatever
>>>>>> else might make userspace stall unexpectedly?
>>>>>>      
>>>>> The plan is to remove the delay and do the naming in the kernel.
>>>>> This was suggested by Lennart since udev is only doing naming policy
>>>>> because kernel names were not repeatable.
>>>>>
>>>>> This makes the VF show up as "ethN_vf" on Hyper-V which is user friendly.
>>>>>
>>>>> Patch is pending.
>>>> Do we really need to delay the setup until the name is changed?
>>>> Can't we call dev_set_mtu() and dev_open() until dev_change_name() is done?
>>>>
>>>> Thanks
>>>> Sridhar
>>> You can call dev_set_mtu, but when dev_open is done the device name
>>> can not be changed by userspace.
>> I did a quick test to remove the delay and also the dev_open() call and
>> i don't see
>> any issues with virtio taking over the VF datapath.
>> Only the netdev_info() messages may show old device name.
>>
>> Any specific scenario where we need to explicitly call  VF's dev_open()
>> in the VF setup process?
>> I tried i40evf driver loaded after virtio_net  AND  virtio_net loading
>> after i40evf.
>>
> It happens with hotplug. It is possible on Hyper-V to hotplug SR-IOV on
> and off while guest is running. If SR-IOV is disabled in host then the
> VF device is removed (hotplug) and the inverse. If the master device is
> up then the VF device should be brought up by the master device.

Even with KVM, we need to hot unplug SR-IOV device on the source host 
and  plug it
back on the destination host to enable live migration. It all works for 
me even without
the dev_open() of the lower device  from the VF setup routine. Will send 
out a v2 version
of this patch without the delayed VF setup after some more testing next 
week.

Thanks
Sridhar

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [RFC PATCH] virtio_net: Extend virtio to use VF datapath when available
  2017-12-19 22:53                 ` Stephen Hemminger
@ 2017-12-20  0:26                   ` Samudrala, Sridhar
  2017-12-20  0:26                   ` Samudrala, Sridhar
  1 sibling, 0 replies; 44+ messages in thread
From: Samudrala, Sridhar @ 2017-12-20  0:26 UTC (permalink / raw)
  To: Stephen Hemminger
  Cc: mst, netdev, alexander.duyck, virtualization, David Miller

On 12/19/2017 2:53 PM, Stephen Hemminger wrote:
> On Tue, 19 Dec 2017 14:37:50 -0800
> "Samudrala, Sridhar" <sridhar.samudrala@intel.com> wrote:
>
>> On 12/19/2017 11:46 AM, Stephen Hemminger wrote:
>>> On Tue, 19 Dec 2017 11:42:33 -0800
>>> "Samudrala, Sridhar" <sridhar.samudrala@intel.com> wrote:
>>>   
>>>> On 12/19/2017 10:41 AM, Stephen Hemminger wrote:
>>>>> On Tue, 19 Dec 2017 13:21:17 -0500 (EST)
>>>>> David Miller <davem@davemloft.net> wrote:
>>>>>      
>>>>>> From: Stephen Hemminger <stephen@networkplumber.org>
>>>>>> Date: Tue, 19 Dec 2017 09:55:48 -0800
>>>>>>      
>>>>>>> could be 10ms, just enough to let udev do its renaming
>>>>>> Please, move to some kind of notification or event based handling of
>>>>>> this problem.
>>>>>>
>>>>>> No delay is safe, what if userspace gets swapped out or whatever
>>>>>> else might make userspace stall unexpectedly?
>>>>>>      
>>>>> The plan is to remove the delay and do the naming in the kernel.
>>>>> This was suggested by Lennart since udev is only doing naming policy
>>>>> because kernel names were not repeatable.
>>>>>
>>>>> This makes the VF show up as "ethN_vf" on Hyper-V which is user friendly.
>>>>>
>>>>> Patch is pending.
>>>> Do we really need to delay the setup until the name is changed?
>>>> Can't we call dev_set_mtu() and dev_open() until dev_change_name() is done?
>>>>
>>>> Thanks
>>>> Sridhar
>>> You can call dev_set_mtu, but when dev_open is done the device name
>>> can not be changed by userspace.
>> I did a quick test to remove the delay and also the dev_open() call and
>> i don't see
>> any issues with virtio taking over the VF datapath.
>> Only the netdev_info() messages may show old device name.
>>
>> Any specific scenario where we need to explicitly call  VF's dev_open()
>> in the VF setup process?
>> I tried i40evf driver loaded after virtio_net  AND  virtio_net loading
>> after i40evf.
>>
> It happens with hotplug. It is possible on Hyper-V to hotplug SR-IOV on
> and off while guest is running. If SR-IOV is disabled in host then the
> VF device is removed (hotplug) and the inverse. If the master device is
> up then the VF device should be brought up by the master device.

Even with KVM, we need to hot unplug SR-IOV device on the source host 
and  plug it
back on the destination host to enable live migration. It all works for 
me even without
the dev_open() of the lower device  from the VF setup routine. Will send 
out a v2 version
of this patch without the delayed VF setup after some more testing next 
week.

Thanks
Sridhar

_______________________________________________
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [RFC PATCH] virtio_net: Extend virtio to use VF datapath when available
  2017-12-19 18:20     ` David Miller
  2017-12-20 10:51       ` Vitaly Kuznetsov
@ 2017-12-20 10:51       ` Vitaly Kuznetsov
  1 sibling, 0 replies; 44+ messages in thread
From: Vitaly Kuznetsov @ 2017-12-20 10:51 UTC (permalink / raw)
  To: David Miller
  Cc: sridhar.samudrala, mst, netdev, alexander.duyck, virtualization,
	Stephen Hemminger

David Miller <davem@davemloft.net> writes:

> From: "Samudrala, Sridhar" <sridhar.samudrala@intel.com>
> Date: Tue, 19 Dec 2017 09:41:39 -0800
>
>> This is based on netvsc implementation and here is the commit that
>> added this delay.  Not sure if this needs to be 100ms.
>> 
>> commit 6123c66854c174e4982f98195100c1d990f9e5e6
>> Author: stephen hemminger <stephen@networkplumber.org>
>> Date:   Wed Aug 9 17:46:03 2017 -0700
>> 
>>     netvsc: delay setup of VF device
>> 
>>     When VF device is discovered, delay bring it automatically up in
>>     order to allow userspace to some simple changes (like renaming).
>
> This is kind of bogus, I should have called this out when the patch
> was posted.
>
> Any delay is wrong, there needs to be tight synchronization if a
> userspace operation must occur before proceeding.  If something
> happens and userspace is delayed, this whole thing doesn't work.

Hyper-V's netvsc does exactly the same and when this was first discussed
I suggested to allow renaming of IFF_UP interfaces:
https://patchwork.kernel.org/patch/9890299/

but as far as I understand it was decided that it's too risky and not
worth it. Maybe we need to reconsider this, at least for slave
interfaces (as scripts are not supposed to operate on them).

-- 
  Vitaly

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [RFC PATCH] virtio_net: Extend virtio to use VF datapath when available
  2017-12-19 18:20     ` David Miller
@ 2017-12-20 10:51       ` Vitaly Kuznetsov
  2017-12-20 10:51       ` Vitaly Kuznetsov
  1 sibling, 0 replies; 44+ messages in thread
From: Vitaly Kuznetsov @ 2017-12-20 10:51 UTC (permalink / raw)
  To: David Miller
  Cc: Stephen Hemminger, mst, sridhar.samudrala, alexander.duyck,
	virtualization, netdev

David Miller <davem@davemloft.net> writes:

> From: "Samudrala, Sridhar" <sridhar.samudrala@intel.com>
> Date: Tue, 19 Dec 2017 09:41:39 -0800
>
>> This is based on netvsc implementation and here is the commit that
>> added this delay.  Not sure if this needs to be 100ms.
>> 
>> commit 6123c66854c174e4982f98195100c1d990f9e5e6
>> Author: stephen hemminger <stephen@networkplumber.org>
>> Date:   Wed Aug 9 17:46:03 2017 -0700
>> 
>>     netvsc: delay setup of VF device
>> 
>>     When VF device is discovered, delay bring it automatically up in
>>     order to allow userspace to some simple changes (like renaming).
>
> This is kind of bogus, I should have called this out when the patch
> was posted.
>
> Any delay is wrong, there needs to be tight synchronization if a
> userspace operation must occur before proceeding.  If something
> happens and userspace is delayed, this whole thing doesn't work.

Hyper-V's netvsc does exactly the same and when this was first discussed
I suggested to allow renaming of IFF_UP interfaces:
https://patchwork.kernel.org/patch/9890299/

but as far as I understand it was decided that it's too risky and not
worth it. Maybe we need to reconsider this, at least for slave
interfaces (as scripts are not supposed to operate on them).

-- 
  Vitaly
_______________________________________________
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [RFC PATCH] virtio_net: Extend virtio to use VF datapath when available
  2017-12-19  0:40 [RFC PATCH] virtio_net: Extend virtio to use VF datapath when available Sridhar Samudrala
  2017-12-19 15:47 ` Michael S. Tsirkin
  2017-12-19 15:47 ` Michael S. Tsirkin
@ 2017-12-20 22:33 ` Jakub Kicinski
  2017-12-21  0:15   ` Michael S. Tsirkin
  2017-12-21  0:15   ` Michael S. Tsirkin
  2017-12-20 22:33 ` Jakub Kicinski
                   ` (2 subsequent siblings)
  5 siblings, 2 replies; 44+ messages in thread
From: Jakub Kicinski @ 2017-12-20 22:33 UTC (permalink / raw)
  To: Sridhar Samudrala; +Cc: mst, stephen, netdev, virtualization, alexander.duyck

On Mon, 18 Dec 2017 16:40:36 -0800, Sridhar Samudrala wrote:
> +static int virtio_netdev_event(struct notifier_block *this,
> +			       unsigned long event, void *ptr)
> +{
> +	struct net_device *event_dev = netdev_notifier_info_to_dev(ptr);
> +
> +	/* Skip our own events */
> +	if (event_dev->netdev_ops == &virtnet_netdev)
> +		return NOTIFY_DONE;

I wonder how does this work WRT loop prevention.  What if I have two
virtio devices with the same MAC, what is preventing them from claiming
each other?  Is it only the check above (not sure what is "own" in
comment referring to)?

I'm worried the check above will not stop virtio from enslaving hyperv
interfaces and vice versa, potentially leading to a loop, no?  There is
also the fact that it would be preferable to share the code between
paravirt drivers, to avoid duplicated bugs.

My suggestion during the previous discussion was to create a paravirt
bond device, which will explicitly tell the OS which interfaces to
bond, regardless of which driver they're using.  Could be some form of
ACPI/FW driver too, I don't know enough about x86 FW to suggest
something fully fleshed :(

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [RFC PATCH] virtio_net: Extend virtio to use VF datapath when available
  2017-12-19  0:40 [RFC PATCH] virtio_net: Extend virtio to use VF datapath when available Sridhar Samudrala
                   ` (2 preceding siblings ...)
  2017-12-20 22:33 ` Jakub Kicinski
@ 2017-12-20 22:33 ` Jakub Kicinski
  2017-12-21  0:14 ` Michael S. Tsirkin
  2017-12-21  0:14 ` Michael S. Tsirkin
  5 siblings, 0 replies; 44+ messages in thread
From: Jakub Kicinski @ 2017-12-20 22:33 UTC (permalink / raw)
  To: Sridhar Samudrala; +Cc: netdev, virtualization, alexander.duyck, mst

On Mon, 18 Dec 2017 16:40:36 -0800, Sridhar Samudrala wrote:
> +static int virtio_netdev_event(struct notifier_block *this,
> +			       unsigned long event, void *ptr)
> +{
> +	struct net_device *event_dev = netdev_notifier_info_to_dev(ptr);
> +
> +	/* Skip our own events */
> +	if (event_dev->netdev_ops == &virtnet_netdev)
> +		return NOTIFY_DONE;

I wonder how does this work WRT loop prevention.  What if I have two
virtio devices with the same MAC, what is preventing them from claiming
each other?  Is it only the check above (not sure what is "own" in
comment referring to)?

I'm worried the check above will not stop virtio from enslaving hyperv
interfaces and vice versa, potentially leading to a loop, no?  There is
also the fact that it would be preferable to share the code between
paravirt drivers, to avoid duplicated bugs.

My suggestion during the previous discussion was to create a paravirt
bond device, which will explicitly tell the OS which interfaces to
bond, regardless of which driver they're using.  Could be some form of
ACPI/FW driver too, I don't know enough about x86 FW to suggest
something fully fleshed :(

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [RFC PATCH] virtio_net: Extend virtio to use VF datapath when available
  2017-12-19  0:40 [RFC PATCH] virtio_net: Extend virtio to use VF datapath when available Sridhar Samudrala
                   ` (3 preceding siblings ...)
  2017-12-20 22:33 ` Jakub Kicinski
@ 2017-12-21  0:14 ` Michael S. Tsirkin
  2017-12-21  0:14 ` Michael S. Tsirkin
  5 siblings, 0 replies; 44+ messages in thread
From: Michael S. Tsirkin @ 2017-12-21  0:14 UTC (permalink / raw)
  To: Sridhar Samudrala; +Cc: stephen, netdev, virtualization, alexander.duyck

On Mon, Dec 18, 2017 at 04:40:36PM -0800, Sridhar Samudrala wrote:
> This patch enables virtio to switch over to a VF datapath when a VF netdev
> is present with the same MAC address.

I prefer saying "a passthrough device" here. Does not have to be a VF at
all.

>  It allows live migration of a VM
> with a direct attached VF without the need to setup a bond/team between a
> VF and virtio net device in the guest.
> 
> The hypervisor needs to unplug the VF device from the guest on the source
> host and reset the MAC filter of the VF to initiate failover of datapath to
> virtio before starting the migration. After the migration is completed, the
> destination hypervisor sets the MAC filter on the VF and plugs it back to
> the guest to switch over to VF datapath.
> 
> It is entirely based on netvsc implementation and it should be possible to
> make this code generic and move it to a common location that can be shared
> by netvsc and virtio.
> 
> Also, i think we should make this a negotiated feature that is off by
> default via a new feature bit.

So please include this. A copy needs to go to virtio TC
to reserve the bit. Enabling this by default risks breaking
too many configurations.

> 
> This patch is based on the discussion initiated by Jesse on this thread.
> https://marc.info/?l=linux-virtualization&m=151189725224231&w=2
> 
> Signed-off-by: Sridhar Samudrala <sridhar.samudrala@intel.com>
> Reviewed-by: Jesse Brandeburg <jesse.brandeburg@intel.com>
> ---
>  drivers/net/virtio_net.c | 341 ++++++++++++++++++++++++++++++++++++++++++++++-
>  1 file changed, 339 insertions(+), 2 deletions(-)
> 
> diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
> index 559b215c0169..a34c717bb15b 100644
> --- a/drivers/net/virtio_net.c
> +++ b/drivers/net/virtio_net.c
> @@ -31,6 +31,8 @@
>  #include <linux/average.h>
>  #include <linux/filter.h>
>  #include <net/route.h>
> +#include <linux/netdevice.h>
> +#include <linux/netpoll.h>
>  
>  static int napi_weight = NAPI_POLL_WEIGHT;
>  module_param(napi_weight, int, 0444);
> @@ -56,6 +58,8 @@ module_param(napi_tx, bool, 0644);
>   */
>  DECLARE_EWMA(pkt_len, 0, 64)
>  
> +#define VF_TAKEOVER_INT	(HZ / 10)
> +
>  #define VIRTNET_DRIVER_VERSION "1.0.0"
>  
>  static const unsigned long guest_offloads[] = {
> @@ -117,6 +121,15 @@ struct receive_queue {
>  	char name[40];
>  };
>  
> +struct virtnet_vf_pcpu_stats {
> +	u64	rx_packets;
> +	u64	rx_bytes;
> +	u64	tx_packets;
> +	u64	tx_bytes;
> +	struct u64_stats_sync   syncp;
> +	u32	tx_dropped;
> +};
> +
>  struct virtnet_info {
>  	struct virtio_device *vdev;
>  	struct virtqueue *cvq;
> @@ -179,6 +192,11 @@ struct virtnet_info {
>  	u32 speed;
>  
>  	unsigned long guest_offloads;
> +
> +	/* State to manage the associated VF interface. */
> +	struct net_device __rcu *vf_netdev;
> +	struct virtnet_vf_pcpu_stats __percpu *vf_stats;
> +	struct delayed_work vf_takeover;
>  };
>  
>  struct padded_vnet_hdr {
> @@ -1300,16 +1318,51 @@ static int xmit_skb(struct send_queue *sq, struct sk_buff *skb)
>  	return virtqueue_add_outbuf(sq->vq, sq->sg, num_sg, skb, GFP_ATOMIC);
>  }
>  
> +/* Send skb on the slave VF device. */
> +static int virtnet_vf_xmit(struct net_device *dev, struct net_device *vf_netdev,
> +			   struct sk_buff *skb)
> +{
> +	struct virtnet_info *vi = netdev_priv(dev);
> +	unsigned int len = skb->len;
> +	int rc;
> +
> +	skb->dev = vf_netdev;
> +	skb->queue_mapping = qdisc_skb_cb(skb)->slave_dev_queue_mapping;
> +
> +	rc = dev_queue_xmit(skb);
> +	if (likely(rc == NET_XMIT_SUCCESS || rc == NET_XMIT_CN)) {
> +		struct virtnet_vf_pcpu_stats *pcpu_stats
> +			= this_cpu_ptr(vi->vf_stats);
> +
> +		u64_stats_update_begin(&pcpu_stats->syncp);
> +		pcpu_stats->tx_packets++;
> +		pcpu_stats->tx_bytes += len;
> +		u64_stats_update_end(&pcpu_stats->syncp);
> +	} else {
> +		this_cpu_inc(vi->vf_stats->tx_dropped);
> +	}
> +
> +	return rc;
> +}
> +
>  static netdev_tx_t start_xmit(struct sk_buff *skb, struct net_device *dev)
>  {
>  	struct virtnet_info *vi = netdev_priv(dev);
>  	int qnum = skb_get_queue_mapping(skb);
>  	struct send_queue *sq = &vi->sq[qnum];
> +	struct net_device *vf_netdev;
>  	int err;
>  	struct netdev_queue *txq = netdev_get_tx_queue(dev, qnum);
>  	bool kick = !skb->xmit_more;
>  	bool use_napi = sq->napi.weight;
>  
> +	/* if VF is present and up then redirect packets
> +	 * called with rcu_read_lock_bh
> +	 */
> +	vf_netdev = rcu_dereference_bh(vi->vf_netdev);
> +	if (vf_netdev && netif_running(vf_netdev) && !netpoll_tx_running(dev))
> +		return virtnet_vf_xmit(dev, vf_netdev, skb);
> +
>  	/* Free up any pending old buffers before queueing new ones. */
>  	free_old_xmit_skbs(sq);
>  
> @@ -1456,10 +1509,41 @@ static int virtnet_set_mac_address(struct net_device *dev, void *p)
>  	return ret;
>  }
>  
> +static void virtnet_get_vf_stats(struct net_device *dev,
> +				 struct virtnet_vf_pcpu_stats *tot)
> +{
> +	struct virtnet_info *vi = netdev_priv(dev);
> +	int i;
> +
> +	memset(tot, 0, sizeof(*tot));
> +
> +	for_each_possible_cpu(i) {
> +		const struct virtnet_vf_pcpu_stats *stats
> +				= per_cpu_ptr(vi->vf_stats, i);
> +		u64 rx_packets, rx_bytes, tx_packets, tx_bytes;
> +		unsigned int start;
> +
> +		do {
> +			start = u64_stats_fetch_begin_irq(&stats->syncp);
> +			rx_packets = stats->rx_packets;
> +			tx_packets = stats->tx_packets;
> +			rx_bytes = stats->rx_bytes;
> +			tx_bytes = stats->tx_bytes;
> +		} while (u64_stats_fetch_retry_irq(&stats->syncp, start));
> +
> +		tot->rx_packets += rx_packets;
> +		tot->tx_packets += tx_packets;
> +		tot->rx_bytes   += rx_bytes;
> +		tot->tx_bytes   += tx_bytes;
> +		tot->tx_dropped += stats->tx_dropped;
> +	}
> +}
> +
>  static void virtnet_stats(struct net_device *dev,
>  			  struct rtnl_link_stats64 *tot)
>  {
>  	struct virtnet_info *vi = netdev_priv(dev);
> +	struct virtnet_vf_pcpu_stats vf_stats;
>  	int cpu;
>  	unsigned int start;
>  
> @@ -1490,6 +1574,13 @@ static void virtnet_stats(struct net_device *dev,
>  	tot->rx_dropped = dev->stats.rx_dropped;
>  	tot->rx_length_errors = dev->stats.rx_length_errors;
>  	tot->rx_frame_errors = dev->stats.rx_frame_errors;
> +
> +	virtnet_get_vf_stats(dev, &vf_stats);
> +	tot->rx_packets += vf_stats.rx_packets;
> +	tot->tx_packets += vf_stats.tx_packets;
> +	tot->rx_bytes += vf_stats.rx_bytes;
> +	tot->tx_bytes += vf_stats.tx_bytes;
> +	tot->tx_dropped += vf_stats.tx_dropped;
>  }
>  
>  #ifdef CONFIG_NET_POLL_CONTROLLER
> @@ -2508,6 +2599,47 @@ static int virtnet_validate(struct virtio_device *vdev)
>  	return 0;
>  }
>  
> +static void __virtnet_vf_setup(struct net_device *ndev,
> +			       struct net_device *vf_netdev)
> +{
> +	int ret;
> +
> +	/* Align MTU of VF with master */
> +	ret = dev_set_mtu(vf_netdev, ndev->mtu);
> +	if (ret)
> +		netdev_warn(vf_netdev,
> +			    "unable to change mtu to %u\n", ndev->mtu);
> +
> +	if (netif_running(ndev)) {
> +		ret = dev_open(vf_netdev);
> +		if (ret)
> +			netdev_warn(vf_netdev,
> +				    "unable to open: %d\n", ret);
> +	}
> +}
> +
> +/* Setup VF as slave of the virtio device.
> + * Runs in workqueue to avoid recursion in netlink callbacks.
> + */
> +static void virtnet_vf_setup(struct work_struct *w)
> +{
> +	struct virtnet_info *vi
> +		= container_of(w, struct virtnet_info, vf_takeover.work);
> +	struct net_device *ndev = vi->dev;
> +	struct net_device *vf_netdev;
> +
> +	if (!rtnl_trylock()) {
> +		schedule_delayed_work(&vi->vf_takeover, 0);
> +		return;
> +	}
> +
> +	vf_netdev = rtnl_dereference(vi->vf_netdev);
> +	if (vf_netdev)
> +		__virtnet_vf_setup(ndev, vf_netdev);
> +
> +	rtnl_unlock();
> +}
> +
>  static int virtnet_probe(struct virtio_device *vdev)
>  {
>  	int i, err;
> @@ -2600,6 +2732,11 @@ static int virtnet_probe(struct virtio_device *vdev)
>  	}
>  
>  	INIT_WORK(&vi->config_work, virtnet_config_changed_work);
> +	INIT_DELAYED_WORK(&vi->vf_takeover, virtnet_vf_setup);
> +
> +	vi->vf_stats = netdev_alloc_pcpu_stats(struct virtnet_vf_pcpu_stats);
> +	if (!vi->vf_stats)
> +		goto free_stats;
>  
>  	/* If we can receive ANY GSO packets, we must allocate large ones. */
>  	if (virtio_has_feature(vdev, VIRTIO_NET_F_GUEST_TSO4) ||
> @@ -2634,7 +2771,7 @@ static int virtnet_probe(struct virtio_device *vdev)
>  			 */
>  			dev_err(&vdev->dev, "device MTU appears to have changed "
>  				"it is now %d < %d", mtu, dev->min_mtu);
> -			goto free_stats;
> +			goto free_vf_stats;
>  		}
>  
>  		dev->mtu = mtu;
> @@ -2658,7 +2795,7 @@ static int virtnet_probe(struct virtio_device *vdev)
>  	/* Allocate/initialize the rx/tx queues, and invoke find_vqs */
>  	err = init_vqs(vi);
>  	if (err)
> -		goto free_stats;
> +		goto free_vf_stats;
>  
>  #ifdef CONFIG_SYSFS
>  	if (vi->mergeable_rx_bufs)
> @@ -2712,6 +2849,8 @@ static int virtnet_probe(struct virtio_device *vdev)
>  	cancel_delayed_work_sync(&vi->refill);
>  	free_receive_page_frags(vi);
>  	virtnet_del_vqs(vi);
> +free_vf_stats:
> +	free_percpu(vi->vf_stats);
>  free_stats:
>  	free_percpu(vi->stats);
>  free:
> @@ -2733,19 +2872,178 @@ static void remove_vq_common(struct virtnet_info *vi)
>  	virtnet_del_vqs(vi);
>  }
>  
> +static struct net_device *get_virtio_bymac(const u8 *mac)
> +{
> +	struct net_device *dev;
> +
> +	ASSERT_RTNL();
> +
> +	for_each_netdev(&init_net, dev) {
> +		if (dev->netdev_ops != &virtnet_netdev)
> +			continue;       /* not a virtio_net device */
> +
> +		if (ether_addr_equal(mac, dev->perm_addr))
> +			return dev;
> +	}
> +
> +	return NULL;
> +}
> +
> +static struct net_device *get_virtio_byref(struct net_device *vf_netdev)
> +{
> +	struct net_device *dev;
> +
> +	ASSERT_RTNL();
> +
> +	for_each_netdev(&init_net, dev) {
> +		struct virtnet_info *vi;
> +
> +		if (dev->netdev_ops != &virtnet_netdev)
> +			continue;	/* not a virtio_net device */
> +
> +		vi = netdev_priv(dev);
> +		if (rtnl_dereference(vi->vf_netdev) == vf_netdev)
> +			return dev;	/* a match */
> +	}
> +
> +	return NULL;
> +}
> +
> +/* Called when VF is injecting data into network stack.
> + * Change the associated network device from VF to virtio.
> + * note: already called with rcu_read_lock
> + */
> +static rx_handler_result_t virtnet_vf_handle_frame(struct sk_buff **pskb)
> +{
> +	struct sk_buff *skb = *pskb;
> +	struct net_device *ndev = rcu_dereference(skb->dev->rx_handler_data);
> +	struct virtnet_info *vi = netdev_priv(ndev);
> +	struct virtnet_vf_pcpu_stats *pcpu_stats =
> +				this_cpu_ptr(vi->vf_stats);
> +
> +	skb->dev = ndev;
> +
> +	u64_stats_update_begin(&pcpu_stats->syncp);
> +	pcpu_stats->rx_packets++;
> +	pcpu_stats->rx_bytes += skb->len;
> +	u64_stats_update_end(&pcpu_stats->syncp);
> +
> +	return RX_HANDLER_ANOTHER;
> +}
> +
> +static int virtnet_vf_join(struct net_device *vf_netdev,
> +			   struct net_device *ndev)
> +{
> +	struct virtnet_info *vi = netdev_priv(ndev);
> +	int ret;
> +
> +	ret = netdev_rx_handler_register(vf_netdev,
> +					 virtnet_vf_handle_frame, ndev);
> +	if (ret != 0) {
> +		netdev_err(vf_netdev,
> +			   "can not register virtio VF receive handler (err = %d)\n",
> +			   ret);
> +		goto rx_handler_failed;
> +	}
> +
> +	ret = netdev_upper_dev_link(vf_netdev, ndev, NULL);
> +	if (ret != 0) {
> +		netdev_err(vf_netdev,
> +			   "can not set master device %s (err = %d)\n",
> +			   ndev->name, ret);
> +		goto upper_link_failed;
> +	}
> +
> +	/* set slave flag before open to prevent IPv6 addrconf */
> +	vf_netdev->flags |= IFF_SLAVE;
> +
> +	schedule_delayed_work(&vi->vf_takeover, VF_TAKEOVER_INT);
> +
> +	call_netdevice_notifiers(NETDEV_JOIN, vf_netdev);
> +
> +	netdev_info(vf_netdev, "joined to %s\n", ndev->name);
> +	return 0;
> +
> +upper_link_failed:
> +	netdev_rx_handler_unregister(vf_netdev);
> +rx_handler_failed:
> +	return ret;
> +}
> +
> +static int virtnet_register_vf(struct net_device *vf_netdev)
> +{
> +	struct net_device *ndev;
> +	struct virtnet_info *vi;
> +
> +	if (vf_netdev->addr_len != ETH_ALEN)
> +		return NOTIFY_DONE;
> +
> +	/* We will use the MAC address to locate the virtio_net interface to
> +	 * associate with the VF interface. If we don't find a matching
> +	 * virtio interface, move on.
> +	 */
> +	ndev = get_virtio_bymac(vf_netdev->perm_addr);
> +	if (!ndev)
> +		return NOTIFY_DONE;
> +
> +	vi = netdev_priv(ndev);
> +	if (rtnl_dereference(vi->vf_netdev))
> +		return NOTIFY_DONE;
> +
> +	if (virtnet_vf_join(vf_netdev, ndev) != 0)
> +		return NOTIFY_DONE;
> +
> +	netdev_info(ndev, "VF registering %s\n", vf_netdev->name);
> +
> +	dev_hold(vf_netdev);
> +	rcu_assign_pointer(vi->vf_netdev, vf_netdev);
> +
> +	return NOTIFY_OK;
> +}
> +
> +static int virtnet_unregister_vf(struct net_device *vf_netdev)
> +{
> +	struct net_device *ndev;
> +	struct virtnet_info *vi;
> +
> +	ndev = get_virtio_byref(vf_netdev);
> +	if (!ndev)
> +		return NOTIFY_DONE;
> +
> +	vi = netdev_priv(ndev);
> +	cancel_delayed_work_sync(&vi->vf_takeover);
> +
> +	netdev_info(ndev, "VF unregistering %s\n", vf_netdev->name);
> +
> +	netdev_rx_handler_unregister(vf_netdev);
> +	netdev_upper_dev_unlink(vf_netdev, ndev);
> +	RCU_INIT_POINTER(vi->vf_netdev, NULL);
> +	dev_put(vf_netdev);
> +
> +	return NOTIFY_OK;
> +}
> +
>  static void virtnet_remove(struct virtio_device *vdev)
>  {
>  	struct virtnet_info *vi = vdev->priv;
> +	struct net_device *vf_netdev;
>  
>  	virtnet_cpu_notif_remove(vi);
>  
>  	/* Make sure no work handler is accessing the device. */
>  	flush_work(&vi->config_work);
>  
> +	rtnl_lock();
> +	vf_netdev = rtnl_dereference(vi->vf_netdev);
> +	if (vf_netdev)
> +		virtnet_unregister_vf(vf_netdev);
> +	rtnl_unlock();
> +
>  	unregister_netdev(vi->dev);
>  
>  	remove_vq_common(vi);
>  
> +	free_percpu(vi->vf_stats);
>  	free_percpu(vi->stats);
>  	free_netdev(vi->dev);
>  }
> @@ -2823,6 +3121,42 @@ static struct virtio_driver virtio_net_driver = {
>  #endif
>  };
>  
> +static int virtio_netdev_event(struct notifier_block *this,
> +			       unsigned long event, void *ptr)
> +{
> +	struct net_device *event_dev = netdev_notifier_info_to_dev(ptr);
> +
> +	/* Skip our own events */
> +	if (event_dev->netdev_ops == &virtnet_netdev)
> +		return NOTIFY_DONE;
> +
> +	/* Avoid non-Ethernet type devices */
> +	if (event_dev->type != ARPHRD_ETHER)
> +		return NOTIFY_DONE;
> +
> +	/* Avoid Vlan dev with same MAC registering as VF */
> +	if (is_vlan_dev(event_dev))
> +		return NOTIFY_DONE;
> +
> +	/* Avoid Bonding master dev with same MAC registering as VF */
> +	if ((event_dev->priv_flags & IFF_BONDING) &&
> +	    (event_dev->flags & IFF_MASTER))
> +		return NOTIFY_DONE;
> +
> +	switch (event) {
> +	case NETDEV_REGISTER:
> +		return virtnet_register_vf(event_dev);
> +	case NETDEV_UNREGISTER:
> +		return virtnet_unregister_vf(event_dev);
> +	default:
> +		return NOTIFY_DONE;
> +	}
> +}
> +
> +static struct notifier_block virtio_netdev_notifier = {
> +	.notifier_call = virtio_netdev_event,
> +};
> +
>  static __init int virtio_net_driver_init(void)
>  {
>  	int ret;
> @@ -2841,6 +3175,8 @@ static __init int virtio_net_driver_init(void)
>          ret = register_virtio_driver(&virtio_net_driver);
>  	if (ret)
>  		goto err_virtio;
> +
> +	register_netdevice_notifier(&virtio_netdev_notifier);
>  	return 0;
>  err_virtio:
>  	cpuhp_remove_multi_state(CPUHP_VIRT_NET_DEAD);
> @@ -2853,6 +3189,7 @@ module_init(virtio_net_driver_init);
>  
>  static __exit void virtio_net_driver_exit(void)
>  {
> +	unregister_netdevice_notifier(&virtio_netdev_notifier);
>  	unregister_virtio_driver(&virtio_net_driver);
>  	cpuhp_remove_multi_state(CPUHP_VIRT_NET_DEAD);
>  	cpuhp_remove_multi_state(virtionet_online);
> -- 
> 2.14.3

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [RFC PATCH] virtio_net: Extend virtio to use VF datapath when available
  2017-12-19  0:40 [RFC PATCH] virtio_net: Extend virtio to use VF datapath when available Sridhar Samudrala
                   ` (4 preceding siblings ...)
  2017-12-21  0:14 ` Michael S. Tsirkin
@ 2017-12-21  0:14 ` Michael S. Tsirkin
  5 siblings, 0 replies; 44+ messages in thread
From: Michael S. Tsirkin @ 2017-12-21  0:14 UTC (permalink / raw)
  To: Sridhar Samudrala; +Cc: netdev, alexander.duyck, virtualization

On Mon, Dec 18, 2017 at 04:40:36PM -0800, Sridhar Samudrala wrote:
> This patch enables virtio to switch over to a VF datapath when a VF netdev
> is present with the same MAC address.

I prefer saying "a passthrough device" here. Does not have to be a VF at
all.

>  It allows live migration of a VM
> with a direct attached VF without the need to setup a bond/team between a
> VF and virtio net device in the guest.
> 
> The hypervisor needs to unplug the VF device from the guest on the source
> host and reset the MAC filter of the VF to initiate failover of datapath to
> virtio before starting the migration. After the migration is completed, the
> destination hypervisor sets the MAC filter on the VF and plugs it back to
> the guest to switch over to VF datapath.
> 
> It is entirely based on netvsc implementation and it should be possible to
> make this code generic and move it to a common location that can be shared
> by netvsc and virtio.
> 
> Also, i think we should make this a negotiated feature that is off by
> default via a new feature bit.

So please include this. A copy needs to go to virtio TC
to reserve the bit. Enabling this by default risks breaking
too many configurations.

> 
> This patch is based on the discussion initiated by Jesse on this thread.
> https://marc.info/?l=linux-virtualization&m=151189725224231&w=2
> 
> Signed-off-by: Sridhar Samudrala <sridhar.samudrala@intel.com>
> Reviewed-by: Jesse Brandeburg <jesse.brandeburg@intel.com>
> ---
>  drivers/net/virtio_net.c | 341 ++++++++++++++++++++++++++++++++++++++++++++++-
>  1 file changed, 339 insertions(+), 2 deletions(-)
> 
> diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
> index 559b215c0169..a34c717bb15b 100644
> --- a/drivers/net/virtio_net.c
> +++ b/drivers/net/virtio_net.c
> @@ -31,6 +31,8 @@
>  #include <linux/average.h>
>  #include <linux/filter.h>
>  #include <net/route.h>
> +#include <linux/netdevice.h>
> +#include <linux/netpoll.h>
>  
>  static int napi_weight = NAPI_POLL_WEIGHT;
>  module_param(napi_weight, int, 0444);
> @@ -56,6 +58,8 @@ module_param(napi_tx, bool, 0644);
>   */
>  DECLARE_EWMA(pkt_len, 0, 64)
>  
> +#define VF_TAKEOVER_INT	(HZ / 10)
> +
>  #define VIRTNET_DRIVER_VERSION "1.0.0"
>  
>  static const unsigned long guest_offloads[] = {
> @@ -117,6 +121,15 @@ struct receive_queue {
>  	char name[40];
>  };
>  
> +struct virtnet_vf_pcpu_stats {
> +	u64	rx_packets;
> +	u64	rx_bytes;
> +	u64	tx_packets;
> +	u64	tx_bytes;
> +	struct u64_stats_sync   syncp;
> +	u32	tx_dropped;
> +};
> +
>  struct virtnet_info {
>  	struct virtio_device *vdev;
>  	struct virtqueue *cvq;
> @@ -179,6 +192,11 @@ struct virtnet_info {
>  	u32 speed;
>  
>  	unsigned long guest_offloads;
> +
> +	/* State to manage the associated VF interface. */
> +	struct net_device __rcu *vf_netdev;
> +	struct virtnet_vf_pcpu_stats __percpu *vf_stats;
> +	struct delayed_work vf_takeover;
>  };
>  
>  struct padded_vnet_hdr {
> @@ -1300,16 +1318,51 @@ static int xmit_skb(struct send_queue *sq, struct sk_buff *skb)
>  	return virtqueue_add_outbuf(sq->vq, sq->sg, num_sg, skb, GFP_ATOMIC);
>  }
>  
> +/* Send skb on the slave VF device. */
> +static int virtnet_vf_xmit(struct net_device *dev, struct net_device *vf_netdev,
> +			   struct sk_buff *skb)
> +{
> +	struct virtnet_info *vi = netdev_priv(dev);
> +	unsigned int len = skb->len;
> +	int rc;
> +
> +	skb->dev = vf_netdev;
> +	skb->queue_mapping = qdisc_skb_cb(skb)->slave_dev_queue_mapping;
> +
> +	rc = dev_queue_xmit(skb);
> +	if (likely(rc == NET_XMIT_SUCCESS || rc == NET_XMIT_CN)) {
> +		struct virtnet_vf_pcpu_stats *pcpu_stats
> +			= this_cpu_ptr(vi->vf_stats);
> +
> +		u64_stats_update_begin(&pcpu_stats->syncp);
> +		pcpu_stats->tx_packets++;
> +		pcpu_stats->tx_bytes += len;
> +		u64_stats_update_end(&pcpu_stats->syncp);
> +	} else {
> +		this_cpu_inc(vi->vf_stats->tx_dropped);
> +	}
> +
> +	return rc;
> +}
> +
>  static netdev_tx_t start_xmit(struct sk_buff *skb, struct net_device *dev)
>  {
>  	struct virtnet_info *vi = netdev_priv(dev);
>  	int qnum = skb_get_queue_mapping(skb);
>  	struct send_queue *sq = &vi->sq[qnum];
> +	struct net_device *vf_netdev;
>  	int err;
>  	struct netdev_queue *txq = netdev_get_tx_queue(dev, qnum);
>  	bool kick = !skb->xmit_more;
>  	bool use_napi = sq->napi.weight;
>  
> +	/* if VF is present and up then redirect packets
> +	 * called with rcu_read_lock_bh
> +	 */
> +	vf_netdev = rcu_dereference_bh(vi->vf_netdev);
> +	if (vf_netdev && netif_running(vf_netdev) && !netpoll_tx_running(dev))
> +		return virtnet_vf_xmit(dev, vf_netdev, skb);
> +
>  	/* Free up any pending old buffers before queueing new ones. */
>  	free_old_xmit_skbs(sq);
>  
> @@ -1456,10 +1509,41 @@ static int virtnet_set_mac_address(struct net_device *dev, void *p)
>  	return ret;
>  }
>  
> +static void virtnet_get_vf_stats(struct net_device *dev,
> +				 struct virtnet_vf_pcpu_stats *tot)
> +{
> +	struct virtnet_info *vi = netdev_priv(dev);
> +	int i;
> +
> +	memset(tot, 0, sizeof(*tot));
> +
> +	for_each_possible_cpu(i) {
> +		const struct virtnet_vf_pcpu_stats *stats
> +				= per_cpu_ptr(vi->vf_stats, i);
> +		u64 rx_packets, rx_bytes, tx_packets, tx_bytes;
> +		unsigned int start;
> +
> +		do {
> +			start = u64_stats_fetch_begin_irq(&stats->syncp);
> +			rx_packets = stats->rx_packets;
> +			tx_packets = stats->tx_packets;
> +			rx_bytes = stats->rx_bytes;
> +			tx_bytes = stats->tx_bytes;
> +		} while (u64_stats_fetch_retry_irq(&stats->syncp, start));
> +
> +		tot->rx_packets += rx_packets;
> +		tot->tx_packets += tx_packets;
> +		tot->rx_bytes   += rx_bytes;
> +		tot->tx_bytes   += tx_bytes;
> +		tot->tx_dropped += stats->tx_dropped;
> +	}
> +}
> +
>  static void virtnet_stats(struct net_device *dev,
>  			  struct rtnl_link_stats64 *tot)
>  {
>  	struct virtnet_info *vi = netdev_priv(dev);
> +	struct virtnet_vf_pcpu_stats vf_stats;
>  	int cpu;
>  	unsigned int start;
>  
> @@ -1490,6 +1574,13 @@ static void virtnet_stats(struct net_device *dev,
>  	tot->rx_dropped = dev->stats.rx_dropped;
>  	tot->rx_length_errors = dev->stats.rx_length_errors;
>  	tot->rx_frame_errors = dev->stats.rx_frame_errors;
> +
> +	virtnet_get_vf_stats(dev, &vf_stats);
> +	tot->rx_packets += vf_stats.rx_packets;
> +	tot->tx_packets += vf_stats.tx_packets;
> +	tot->rx_bytes += vf_stats.rx_bytes;
> +	tot->tx_bytes += vf_stats.tx_bytes;
> +	tot->tx_dropped += vf_stats.tx_dropped;
>  }
>  
>  #ifdef CONFIG_NET_POLL_CONTROLLER
> @@ -2508,6 +2599,47 @@ static int virtnet_validate(struct virtio_device *vdev)
>  	return 0;
>  }
>  
> +static void __virtnet_vf_setup(struct net_device *ndev,
> +			       struct net_device *vf_netdev)
> +{
> +	int ret;
> +
> +	/* Align MTU of VF with master */
> +	ret = dev_set_mtu(vf_netdev, ndev->mtu);
> +	if (ret)
> +		netdev_warn(vf_netdev,
> +			    "unable to change mtu to %u\n", ndev->mtu);
> +
> +	if (netif_running(ndev)) {
> +		ret = dev_open(vf_netdev);
> +		if (ret)
> +			netdev_warn(vf_netdev,
> +				    "unable to open: %d\n", ret);
> +	}
> +}
> +
> +/* Setup VF as slave of the virtio device.
> + * Runs in workqueue to avoid recursion in netlink callbacks.
> + */
> +static void virtnet_vf_setup(struct work_struct *w)
> +{
> +	struct virtnet_info *vi
> +		= container_of(w, struct virtnet_info, vf_takeover.work);
> +	struct net_device *ndev = vi->dev;
> +	struct net_device *vf_netdev;
> +
> +	if (!rtnl_trylock()) {
> +		schedule_delayed_work(&vi->vf_takeover, 0);
> +		return;
> +	}
> +
> +	vf_netdev = rtnl_dereference(vi->vf_netdev);
> +	if (vf_netdev)
> +		__virtnet_vf_setup(ndev, vf_netdev);
> +
> +	rtnl_unlock();
> +}
> +
>  static int virtnet_probe(struct virtio_device *vdev)
>  {
>  	int i, err;
> @@ -2600,6 +2732,11 @@ static int virtnet_probe(struct virtio_device *vdev)
>  	}
>  
>  	INIT_WORK(&vi->config_work, virtnet_config_changed_work);
> +	INIT_DELAYED_WORK(&vi->vf_takeover, virtnet_vf_setup);
> +
> +	vi->vf_stats = netdev_alloc_pcpu_stats(struct virtnet_vf_pcpu_stats);
> +	if (!vi->vf_stats)
> +		goto free_stats;
>  
>  	/* If we can receive ANY GSO packets, we must allocate large ones. */
>  	if (virtio_has_feature(vdev, VIRTIO_NET_F_GUEST_TSO4) ||
> @@ -2634,7 +2771,7 @@ static int virtnet_probe(struct virtio_device *vdev)
>  			 */
>  			dev_err(&vdev->dev, "device MTU appears to have changed "
>  				"it is now %d < %d", mtu, dev->min_mtu);
> -			goto free_stats;
> +			goto free_vf_stats;
>  		}
>  
>  		dev->mtu = mtu;
> @@ -2658,7 +2795,7 @@ static int virtnet_probe(struct virtio_device *vdev)
>  	/* Allocate/initialize the rx/tx queues, and invoke find_vqs */
>  	err = init_vqs(vi);
>  	if (err)
> -		goto free_stats;
> +		goto free_vf_stats;
>  
>  #ifdef CONFIG_SYSFS
>  	if (vi->mergeable_rx_bufs)
> @@ -2712,6 +2849,8 @@ static int virtnet_probe(struct virtio_device *vdev)
>  	cancel_delayed_work_sync(&vi->refill);
>  	free_receive_page_frags(vi);
>  	virtnet_del_vqs(vi);
> +free_vf_stats:
> +	free_percpu(vi->vf_stats);
>  free_stats:
>  	free_percpu(vi->stats);
>  free:
> @@ -2733,19 +2872,178 @@ static void remove_vq_common(struct virtnet_info *vi)
>  	virtnet_del_vqs(vi);
>  }
>  
> +static struct net_device *get_virtio_bymac(const u8 *mac)
> +{
> +	struct net_device *dev;
> +
> +	ASSERT_RTNL();
> +
> +	for_each_netdev(&init_net, dev) {
> +		if (dev->netdev_ops != &virtnet_netdev)
> +			continue;       /* not a virtio_net device */
> +
> +		if (ether_addr_equal(mac, dev->perm_addr))
> +			return dev;
> +	}
> +
> +	return NULL;
> +}
> +
> +static struct net_device *get_virtio_byref(struct net_device *vf_netdev)
> +{
> +	struct net_device *dev;
> +
> +	ASSERT_RTNL();
> +
> +	for_each_netdev(&init_net, dev) {
> +		struct virtnet_info *vi;
> +
> +		if (dev->netdev_ops != &virtnet_netdev)
> +			continue;	/* not a virtio_net device */
> +
> +		vi = netdev_priv(dev);
> +		if (rtnl_dereference(vi->vf_netdev) == vf_netdev)
> +			return dev;	/* a match */
> +	}
> +
> +	return NULL;
> +}
> +
> +/* Called when VF is injecting data into network stack.
> + * Change the associated network device from VF to virtio.
> + * note: already called with rcu_read_lock
> + */
> +static rx_handler_result_t virtnet_vf_handle_frame(struct sk_buff **pskb)
> +{
> +	struct sk_buff *skb = *pskb;
> +	struct net_device *ndev = rcu_dereference(skb->dev->rx_handler_data);
> +	struct virtnet_info *vi = netdev_priv(ndev);
> +	struct virtnet_vf_pcpu_stats *pcpu_stats =
> +				this_cpu_ptr(vi->vf_stats);
> +
> +	skb->dev = ndev;
> +
> +	u64_stats_update_begin(&pcpu_stats->syncp);
> +	pcpu_stats->rx_packets++;
> +	pcpu_stats->rx_bytes += skb->len;
> +	u64_stats_update_end(&pcpu_stats->syncp);
> +
> +	return RX_HANDLER_ANOTHER;
> +}
> +
> +static int virtnet_vf_join(struct net_device *vf_netdev,
> +			   struct net_device *ndev)
> +{
> +	struct virtnet_info *vi = netdev_priv(ndev);
> +	int ret;
> +
> +	ret = netdev_rx_handler_register(vf_netdev,
> +					 virtnet_vf_handle_frame, ndev);
> +	if (ret != 0) {
> +		netdev_err(vf_netdev,
> +			   "can not register virtio VF receive handler (err = %d)\n",
> +			   ret);
> +		goto rx_handler_failed;
> +	}
> +
> +	ret = netdev_upper_dev_link(vf_netdev, ndev, NULL);
> +	if (ret != 0) {
> +		netdev_err(vf_netdev,
> +			   "can not set master device %s (err = %d)\n",
> +			   ndev->name, ret);
> +		goto upper_link_failed;
> +	}
> +
> +	/* set slave flag before open to prevent IPv6 addrconf */
> +	vf_netdev->flags |= IFF_SLAVE;
> +
> +	schedule_delayed_work(&vi->vf_takeover, VF_TAKEOVER_INT);
> +
> +	call_netdevice_notifiers(NETDEV_JOIN, vf_netdev);
> +
> +	netdev_info(vf_netdev, "joined to %s\n", ndev->name);
> +	return 0;
> +
> +upper_link_failed:
> +	netdev_rx_handler_unregister(vf_netdev);
> +rx_handler_failed:
> +	return ret;
> +}
> +
> +static int virtnet_register_vf(struct net_device *vf_netdev)
> +{
> +	struct net_device *ndev;
> +	struct virtnet_info *vi;
> +
> +	if (vf_netdev->addr_len != ETH_ALEN)
> +		return NOTIFY_DONE;
> +
> +	/* We will use the MAC address to locate the virtio_net interface to
> +	 * associate with the VF interface. If we don't find a matching
> +	 * virtio interface, move on.
> +	 */
> +	ndev = get_virtio_bymac(vf_netdev->perm_addr);
> +	if (!ndev)
> +		return NOTIFY_DONE;
> +
> +	vi = netdev_priv(ndev);
> +	if (rtnl_dereference(vi->vf_netdev))
> +		return NOTIFY_DONE;
> +
> +	if (virtnet_vf_join(vf_netdev, ndev) != 0)
> +		return NOTIFY_DONE;
> +
> +	netdev_info(ndev, "VF registering %s\n", vf_netdev->name);
> +
> +	dev_hold(vf_netdev);
> +	rcu_assign_pointer(vi->vf_netdev, vf_netdev);
> +
> +	return NOTIFY_OK;
> +}
> +
> +static int virtnet_unregister_vf(struct net_device *vf_netdev)
> +{
> +	struct net_device *ndev;
> +	struct virtnet_info *vi;
> +
> +	ndev = get_virtio_byref(vf_netdev);
> +	if (!ndev)
> +		return NOTIFY_DONE;
> +
> +	vi = netdev_priv(ndev);
> +	cancel_delayed_work_sync(&vi->vf_takeover);
> +
> +	netdev_info(ndev, "VF unregistering %s\n", vf_netdev->name);
> +
> +	netdev_rx_handler_unregister(vf_netdev);
> +	netdev_upper_dev_unlink(vf_netdev, ndev);
> +	RCU_INIT_POINTER(vi->vf_netdev, NULL);
> +	dev_put(vf_netdev);
> +
> +	return NOTIFY_OK;
> +}
> +
>  static void virtnet_remove(struct virtio_device *vdev)
>  {
>  	struct virtnet_info *vi = vdev->priv;
> +	struct net_device *vf_netdev;
>  
>  	virtnet_cpu_notif_remove(vi);
>  
>  	/* Make sure no work handler is accessing the device. */
>  	flush_work(&vi->config_work);
>  
> +	rtnl_lock();
> +	vf_netdev = rtnl_dereference(vi->vf_netdev);
> +	if (vf_netdev)
> +		virtnet_unregister_vf(vf_netdev);
> +	rtnl_unlock();
> +
>  	unregister_netdev(vi->dev);
>  
>  	remove_vq_common(vi);
>  
> +	free_percpu(vi->vf_stats);
>  	free_percpu(vi->stats);
>  	free_netdev(vi->dev);
>  }
> @@ -2823,6 +3121,42 @@ static struct virtio_driver virtio_net_driver = {
>  #endif
>  };
>  
> +static int virtio_netdev_event(struct notifier_block *this,
> +			       unsigned long event, void *ptr)
> +{
> +	struct net_device *event_dev = netdev_notifier_info_to_dev(ptr);
> +
> +	/* Skip our own events */
> +	if (event_dev->netdev_ops == &virtnet_netdev)
> +		return NOTIFY_DONE;
> +
> +	/* Avoid non-Ethernet type devices */
> +	if (event_dev->type != ARPHRD_ETHER)
> +		return NOTIFY_DONE;
> +
> +	/* Avoid Vlan dev with same MAC registering as VF */
> +	if (is_vlan_dev(event_dev))
> +		return NOTIFY_DONE;
> +
> +	/* Avoid Bonding master dev with same MAC registering as VF */
> +	if ((event_dev->priv_flags & IFF_BONDING) &&
> +	    (event_dev->flags & IFF_MASTER))
> +		return NOTIFY_DONE;
> +
> +	switch (event) {
> +	case NETDEV_REGISTER:
> +		return virtnet_register_vf(event_dev);
> +	case NETDEV_UNREGISTER:
> +		return virtnet_unregister_vf(event_dev);
> +	default:
> +		return NOTIFY_DONE;
> +	}
> +}
> +
> +static struct notifier_block virtio_netdev_notifier = {
> +	.notifier_call = virtio_netdev_event,
> +};
> +
>  static __init int virtio_net_driver_init(void)
>  {
>  	int ret;
> @@ -2841,6 +3175,8 @@ static __init int virtio_net_driver_init(void)
>          ret = register_virtio_driver(&virtio_net_driver);
>  	if (ret)
>  		goto err_virtio;
> +
> +	register_netdevice_notifier(&virtio_netdev_notifier);
>  	return 0;
>  err_virtio:
>  	cpuhp_remove_multi_state(CPUHP_VIRT_NET_DEAD);
> @@ -2853,6 +3189,7 @@ module_init(virtio_net_driver_init);
>  
>  static __exit void virtio_net_driver_exit(void)
>  {
> +	unregister_netdevice_notifier(&virtio_netdev_notifier);
>  	unregister_virtio_driver(&virtio_net_driver);
>  	cpuhp_remove_multi_state(CPUHP_VIRT_NET_DEAD);
>  	cpuhp_remove_multi_state(virtionet_online);
> -- 
> 2.14.3

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [RFC PATCH] virtio_net: Extend virtio to use VF datapath when available
  2017-12-20 22:33 ` Jakub Kicinski
  2017-12-21  0:15   ` Michael S. Tsirkin
@ 2017-12-21  0:15   ` Michael S. Tsirkin
  2017-12-21  0:29     ` Jakub Kicinski
  2017-12-21  0:29     ` Jakub Kicinski
  1 sibling, 2 replies; 44+ messages in thread
From: Michael S. Tsirkin @ 2017-12-21  0:15 UTC (permalink / raw)
  To: Jakub Kicinski
  Cc: Sridhar Samudrala, stephen, netdev, virtualization, alexander.duyck

On Wed, Dec 20, 2017 at 02:33:34PM -0800, Jakub Kicinski wrote:
> On Mon, 18 Dec 2017 16:40:36 -0800, Sridhar Samudrala wrote:
> > +static int virtio_netdev_event(struct notifier_block *this,
> > +			       unsigned long event, void *ptr)
> > +{
> > +	struct net_device *event_dev = netdev_notifier_info_to_dev(ptr);
> > +
> > +	/* Skip our own events */
> > +	if (event_dev->netdev_ops == &virtnet_netdev)
> > +		return NOTIFY_DONE;
> 
> I wonder how does this work WRT loop prevention.  What if I have two
> virtio devices with the same MAC, what is preventing them from claiming
> each other?  Is it only the check above (not sure what is "own" in
> comment referring to)?

I expect we will add a feature bit (VIRTIO_NET_F_MASTER) and it will
be host's responsibility not to add more than 1 such device.

> I'm worried the check above will not stop virtio from enslaving hyperv
> interfaces and vice versa, potentially leading to a loop, no?  There is
> also the fact that it would be preferable to share the code between
> paravirt drivers, to avoid duplicated bugs.
> 
> My suggestion during the previous discussion was to create a paravirt
> bond device, which will explicitly tell the OS which interfaces to
> bond, regardless of which driver they're using.  Could be some form of
> ACPI/FW driver too, I don't know enough about x86 FW to suggest
> something fully fleshed :(

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [RFC PATCH] virtio_net: Extend virtio to use VF datapath when available
  2017-12-20 22:33 ` Jakub Kicinski
@ 2017-12-21  0:15   ` Michael S. Tsirkin
  2017-12-21  0:15   ` Michael S. Tsirkin
  1 sibling, 0 replies; 44+ messages in thread
From: Michael S. Tsirkin @ 2017-12-21  0:15 UTC (permalink / raw)
  To: Jakub Kicinski; +Cc: Sridhar Samudrala, virtualization, alexander.duyck, netdev

On Wed, Dec 20, 2017 at 02:33:34PM -0800, Jakub Kicinski wrote:
> On Mon, 18 Dec 2017 16:40:36 -0800, Sridhar Samudrala wrote:
> > +static int virtio_netdev_event(struct notifier_block *this,
> > +			       unsigned long event, void *ptr)
> > +{
> > +	struct net_device *event_dev = netdev_notifier_info_to_dev(ptr);
> > +
> > +	/* Skip our own events */
> > +	if (event_dev->netdev_ops == &virtnet_netdev)
> > +		return NOTIFY_DONE;
> 
> I wonder how does this work WRT loop prevention.  What if I have two
> virtio devices with the same MAC, what is preventing them from claiming
> each other?  Is it only the check above (not sure what is "own" in
> comment referring to)?

I expect we will add a feature bit (VIRTIO_NET_F_MASTER) and it will
be host's responsibility not to add more than 1 such device.

> I'm worried the check above will not stop virtio from enslaving hyperv
> interfaces and vice versa, potentially leading to a loop, no?  There is
> also the fact that it would be preferable to share the code between
> paravirt drivers, to avoid duplicated bugs.
> 
> My suggestion during the previous discussion was to create a paravirt
> bond device, which will explicitly tell the OS which interfaces to
> bond, regardless of which driver they're using.  Could be some form of
> ACPI/FW driver too, I don't know enough about x86 FW to suggest
> something fully fleshed :(

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [RFC PATCH] virtio_net: Extend virtio to use VF datapath when available
  2017-12-21  0:15   ` Michael S. Tsirkin
@ 2017-12-21  0:29     ` Jakub Kicinski
  2017-12-21  0:29     ` Jakub Kicinski
  1 sibling, 0 replies; 44+ messages in thread
From: Jakub Kicinski @ 2017-12-21  0:29 UTC (permalink / raw)
  To: Michael S. Tsirkin
  Cc: Sridhar Samudrala, stephen, netdev, virtualization, alexander.duyck

On Thu, 21 Dec 2017 02:15:31 +0200, Michael S. Tsirkin wrote:
> On Wed, Dec 20, 2017 at 02:33:34PM -0800, Jakub Kicinski wrote:
> > On Mon, 18 Dec 2017 16:40:36 -0800, Sridhar Samudrala wrote:  
> > > +static int virtio_netdev_event(struct notifier_block *this,
> > > +			       unsigned long event, void *ptr)
> > > +{
> > > +	struct net_device *event_dev = netdev_notifier_info_to_dev(ptr);
> > > +
> > > +	/* Skip our own events */
> > > +	if (event_dev->netdev_ops == &virtnet_netdev)
> > > +		return NOTIFY_DONE;  
> > 
> > I wonder how does this work WRT loop prevention.  What if I have two
> > virtio devices with the same MAC, what is preventing them from claiming
> > each other?  Is it only the check above (not sure what is "own" in
> > comment referring to)?  
> 
> I expect we will add a feature bit (VIRTIO_NET_F_MASTER) and it will
> be host's responsibility not to add more than 1 such device.

Do you mean a virtio_net feature bit?  That won't stop the loop with
a hyperv device.  Unless we want every paravirt device to have such
bit and enforce there can be only one magic bond enslavement active in
the system.

Making the bonding information separate from the slaves seems so much
cleaner...  No limitation on device types, or how many bonds user
chooses to create.  No MAC matching...

> > I'm worried the check above will not stop virtio from enslaving hyperv
> > interfaces and vice versa, potentially leading to a loop, no?  There is
> > also the fact that it would be preferable to share the code between
> > paravirt drivers, to avoid duplicated bugs.
> > 
> > My suggestion during the previous discussion was to create a paravirt
> > bond device, which will explicitly tell the OS which interfaces to
> > bond, regardless of which driver they're using.  Could be some form of
> > ACPI/FW driver too, I don't know enough about x86 FW to suggest
> > something fully fleshed :(  

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [RFC PATCH] virtio_net: Extend virtio to use VF datapath when available
  2017-12-21  0:15   ` Michael S. Tsirkin
  2017-12-21  0:29     ` Jakub Kicinski
@ 2017-12-21  0:29     ` Jakub Kicinski
  1 sibling, 0 replies; 44+ messages in thread
From: Jakub Kicinski @ 2017-12-21  0:29 UTC (permalink / raw)
  To: Michael S. Tsirkin
  Cc: Sridhar Samudrala, virtualization, alexander.duyck, netdev

On Thu, 21 Dec 2017 02:15:31 +0200, Michael S. Tsirkin wrote:
> On Wed, Dec 20, 2017 at 02:33:34PM -0800, Jakub Kicinski wrote:
> > On Mon, 18 Dec 2017 16:40:36 -0800, Sridhar Samudrala wrote:  
> > > +static int virtio_netdev_event(struct notifier_block *this,
> > > +			       unsigned long event, void *ptr)
> > > +{
> > > +	struct net_device *event_dev = netdev_notifier_info_to_dev(ptr);
> > > +
> > > +	/* Skip our own events */
> > > +	if (event_dev->netdev_ops == &virtnet_netdev)
> > > +		return NOTIFY_DONE;  
> > 
> > I wonder how does this work WRT loop prevention.  What if I have two
> > virtio devices with the same MAC, what is preventing them from claiming
> > each other?  Is it only the check above (not sure what is "own" in
> > comment referring to)?  
> 
> I expect we will add a feature bit (VIRTIO_NET_F_MASTER) and it will
> be host's responsibility not to add more than 1 such device.

Do you mean a virtio_net feature bit?  That won't stop the loop with
a hyperv device.  Unless we want every paravirt device to have such
bit and enforce there can be only one magic bond enslavement active in
the system.

Making the bonding information separate from the slaves seems so much
cleaner...  No limitation on device types, or how many bonds user
chooses to create.  No MAC matching...

> > I'm worried the check above will not stop virtio from enslaving hyperv
> > interfaces and vice versa, potentially leading to a loop, no?  There is
> > also the fact that it would be preferable to share the code between
> > paravirt drivers, to avoid duplicated bugs.
> > 
> > My suggestion during the previous discussion was to create a paravirt
> > bond device, which will explicitly tell the OS which interfaces to
> > bond, regardless of which driver they're using.  Could be some form of
> > ACPI/FW driver too, I don't know enough about x86 FW to suggest
> > something fully fleshed :(  

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [RFC PATCH] virtio_net: Extend virtio to use VF datapath when available
  2017-12-19 18:41         ` Stephen Hemminger
  2017-12-19 19:42           ` Samudrala, Sridhar
  2017-12-19 19:42           ` Samudrala, Sridhar
@ 2017-12-21  1:31           ` Siwei Liu
  2017-12-21  2:16           ` Siwei Liu
  2017-12-21  2:16           ` Siwei Liu
  4 siblings, 0 replies; 44+ messages in thread
From: Siwei Liu @ 2017-12-21  1:31 UTC (permalink / raw)
  To: Stephen Hemminger
  Cc: mst, sridhar.samudrala, alexander.duyck, virtualization, netdev,
	David Miller


[-- Attachment #1.1: Type: text/plain, Size: 1504 bytes --]

On Tue, Dec 19, 2017 at 10:41 AM, Stephen Hemminger <
stephen@networkplumber.org> wrote:

> On Tue, 19 Dec 2017 13:21:17 -0500 (EST)
> David Miller <davem@davemloft.net> wrote:
>
> > From: Stephen Hemminger <stephen@networkplumber.org>
> > Date: Tue, 19 Dec 2017 09:55:48 -0800
> >
> > > could be 10ms, just enough to let udev do its renaming
> >
> > Please, move to some kind of notification or event based handling of
> > this problem.
> >
> > No delay is safe, what if userspace gets swapped out or whatever
> > else might make userspace stall unexpectedly?
> >
>
> The plan is to remove the delay and do the naming in the kernel.
> This was suggested by Lennart since udev is only doing naming policy
> because kernel names were not repeatable.
>
> This makes the VF show up as "ethN_vf" on Hyper-V which is user friendly.
>
> Patch is pending.
>

While it's good to show VF with specific naming to indicate enslavement, I
wonder wouldn't it be better to hide this netdev at all from the user
space? IMHO this extra device is useless when being enslaved and we may
delegate controls (e.g. ethtool) over to the para-virtual device instead?
That way it's possible to eliminate the possibility of additional udev
setup or modification?

I'm not sure if this  is consistent with Windows guest or not, but I don't
find it _Linux_ user friendly that ethtool doesn't work on the composite
interface any more, and I have to end up with finding out the correct
enslaved VF I must operate on.

Regards,
-Siwei

[-- Attachment #1.2: Type: text/html, Size: 2322 bytes --]

[-- Attachment #2: Type: text/plain, Size: 183 bytes --]

_______________________________________________
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [RFC PATCH] virtio_net: Extend virtio to use VF datapath when available
  2017-12-19 18:41         ` Stephen Hemminger
                             ` (3 preceding siblings ...)
  2017-12-21  2:16           ` Siwei Liu
@ 2017-12-21  2:16           ` Siwei Liu
  2017-12-21  4:52             ` Jakub Kicinski
  4 siblings, 1 reply; 44+ messages in thread
From: Siwei Liu @ 2017-12-21  2:16 UTC (permalink / raw)
  To: Stephen Hemminger
  Cc: David Miller, sridhar.samudrala, mst, netdev, virtualization,
	alexander.duyck, jesse.brandeburg

On Tue, Dec 19, 2017 at 10:41 AM, Stephen Hemminger
<stephen@networkplumber.org> wrote:
> On Tue, 19 Dec 2017 13:21:17 -0500 (EST)
> David Miller <davem@davemloft.net> wrote:
>
>> From: Stephen Hemminger <stephen@networkplumber.org>
>> Date: Tue, 19 Dec 2017 09:55:48 -0800
>>
>> > could be 10ms, just enough to let udev do its renaming
>>
>> Please, move to some kind of notification or event based handling of
>> this problem.
>>
>> No delay is safe, what if userspace gets swapped out or whatever
>> else might make userspace stall unexpectedly?
>>
>
> The plan is to remove the delay and do the naming in the kernel.
> This was suggested by Lennart since udev is only doing naming policy
> because kernel names were not repeatable.
>
> This makes the VF show up as "ethN_vf" on Hyper-V which is user friendly.
>
> Patch is pending.

While it's good to show VF with specific naming to indicate
enslavement, I wonder wouldn't it be better to hide this netdev at all
from the user space? IMHO this extra device is useless when being
enslaved and we may delegate controls (e.g. ethtool) over to the
para-virtual device instead? That way it's possible to eliminate the
possibility of additional udev setup or modification?

I'm not sure if this  is consistent with Windows guest or not, but I
don't find it _Linux_ user friendly that ethtool doesn't work on the
composite interface any more, and I have to end up with finding out
the correct enslaved VF I must operate on.

Regards,
-Siwei

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [RFC PATCH] virtio_net: Extend virtio to use VF datapath when available
  2017-12-19 18:41         ` Stephen Hemminger
                             ` (2 preceding siblings ...)
  2017-12-21  1:31           ` Siwei Liu
@ 2017-12-21  2:16           ` Siwei Liu
  2017-12-21  2:16           ` Siwei Liu
  4 siblings, 0 replies; 44+ messages in thread
From: Siwei Liu @ 2017-12-21  2:16 UTC (permalink / raw)
  To: Stephen Hemminger
  Cc: mst, sridhar.samudrala, alexander.duyck, virtualization, netdev,
	David Miller

On Tue, Dec 19, 2017 at 10:41 AM, Stephen Hemminger
<stephen@networkplumber.org> wrote:
> On Tue, 19 Dec 2017 13:21:17 -0500 (EST)
> David Miller <davem@davemloft.net> wrote:
>
>> From: Stephen Hemminger <stephen@networkplumber.org>
>> Date: Tue, 19 Dec 2017 09:55:48 -0800
>>
>> > could be 10ms, just enough to let udev do its renaming
>>
>> Please, move to some kind of notification or event based handling of
>> this problem.
>>
>> No delay is safe, what if userspace gets swapped out or whatever
>> else might make userspace stall unexpectedly?
>>
>
> The plan is to remove the delay and do the naming in the kernel.
> This was suggested by Lennart since udev is only doing naming policy
> because kernel names were not repeatable.
>
> This makes the VF show up as "ethN_vf" on Hyper-V which is user friendly.
>
> Patch is pending.

While it's good to show VF with specific naming to indicate
enslavement, I wonder wouldn't it be better to hide this netdev at all
from the user space? IMHO this extra device is useless when being
enslaved and we may delegate controls (e.g. ethtool) over to the
para-virtual device instead? That way it's possible to eliminate the
possibility of additional udev setup or modification?

I'm not sure if this  is consistent with Windows guest or not, but I
don't find it _Linux_ user friendly that ethtool doesn't work on the
composite interface any more, and I have to end up with finding out
the correct enslaved VF I must operate on.

Regards,
-Siwei

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [RFC PATCH] virtio_net: Extend virtio to use VF datapath when available
  2017-12-21  2:16           ` Siwei Liu
@ 2017-12-21  4:52             ` Jakub Kicinski
  2017-12-22  8:42               ` Siwei Liu
  2017-12-22  8:42               ` Siwei Liu
  0 siblings, 2 replies; 44+ messages in thread
From: Jakub Kicinski @ 2017-12-21  4:52 UTC (permalink / raw)
  To: Siwei Liu
  Cc: mst, sridhar.samudrala, alexander.duyck, virtualization, netdev,
	David Miller

On Wed, 20 Dec 2017 18:16:30 -0800, Siwei Liu wrote:
> > The plan is to remove the delay and do the naming in the kernel.
> > This was suggested by Lennart since udev is only doing naming policy
> > because kernel names were not repeatable.
> >
> > This makes the VF show up as "ethN_vf" on Hyper-V which is user friendly.
> >
> > Patch is pending.  
> 
> While it's good to show VF with specific naming to indicate
> enslavement, I wonder wouldn't it be better to hide this netdev at all
> from the user space? IMHO this extra device is useless when being
> enslaved and we may delegate controls (e.g. ethtool) over to the
> para-virtual device instead? That way it's possible to eliminate the
> possibility of additional udev setup or modification?
> 
> I'm not sure if this  is consistent with Windows guest or not, but I
> don't find it _Linux_ user friendly that ethtool doesn't work on the
> composite interface any more, and I have to end up with finding out
> the correct enslaved VF I must operate on.

Hiding "low level" netdevs comes up from time to time, and is more
widely applicable than just to VF bonds.  We should find a generic
solution to that problem.

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [RFC PATCH] virtio_net: Extend virtio to use VF datapath when available
  2017-12-21  4:52             ` Jakub Kicinski
@ 2017-12-22  8:42               ` Siwei Liu
  2017-12-22  8:42               ` Siwei Liu
  1 sibling, 0 replies; 44+ messages in thread
From: Siwei Liu @ 2017-12-22  8:42 UTC (permalink / raw)
  To: Jakub Kicinski
  Cc: Stephen Hemminger, David Miller, sridhar.samudrala, mst, netdev,
	virtualization, Alexander Duyck, jesse.brandeburg

On Wed, Dec 20, 2017 at 8:52 PM, Jakub Kicinski <kubakici@wp.pl> wrote:
> On Wed, 20 Dec 2017 18:16:30 -0800, Siwei Liu wrote:
>> > The plan is to remove the delay and do the naming in the kernel.
>> > This was suggested by Lennart since udev is only doing naming policy
>> > because kernel names were not repeatable.
>> >
>> > This makes the VF show up as "ethN_vf" on Hyper-V which is user friendly.
>> >
>> > Patch is pending.
>>
>> While it's good to show VF with specific naming to indicate
>> enslavement, I wonder wouldn't it be better to hide this netdev at all
>> from the user space? IMHO this extra device is useless when being
>> enslaved and we may delegate controls (e.g. ethtool) over to the
>> para-virtual device instead? That way it's possible to eliminate the
>> possibility of additional udev setup or modification?
>>
>> I'm not sure if this  is consistent with Windows guest or not, but I
>> don't find it _Linux_ user friendly that ethtool doesn't work on the
>> composite interface any more, and I have to end up with finding out
>> the correct enslaved VF I must operate on.
>
> Hiding "low level" netdevs comes up from time to time, and is more
> widely applicable than just to VF bonds.  We should find a generic
> solution to that problem.

Wholeheartedly agreed.

Be it a generic virtual bond or virtio-net specific one, there should
be some common code between netvsc and virtio for this type of work.
For avoiding duplicated bugs, consistent (Linux) user experience,
future code refactoring/management, and whatever...

One thing worth to note is that, unlike the Hyper-V netvsc backend
there's currently no equivalent (fine-grained) Linux ndo_* driver
interface that is able to move around MAC address/VLAN filters at
run-time specifically. The OID_RECEIVE_FILTER_MOVE_FILTER request I
mean. That translates to one substantial difference in VF
plumbing/unplumbing sequence: you cannot move the MAC address around
to paravirt device until VF is fully unplugged out of the guest OS. I
don't know what backend changes to be proposed for virtio-net as
helper, but the common code needs to work with both flavors of data
path switching backends and do its job correctly.

Regards,
-Siwei

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [RFC PATCH] virtio_net: Extend virtio to use VF datapath when available
  2017-12-21  4:52             ` Jakub Kicinski
  2017-12-22  8:42               ` Siwei Liu
@ 2017-12-22  8:42               ` Siwei Liu
  1 sibling, 0 replies; 44+ messages in thread
From: Siwei Liu @ 2017-12-22  8:42 UTC (permalink / raw)
  To: Jakub Kicinski
  Cc: mst, sridhar.samudrala, Alexander Duyck, virtualization, netdev,
	David Miller

On Wed, Dec 20, 2017 at 8:52 PM, Jakub Kicinski <kubakici@wp.pl> wrote:
> On Wed, 20 Dec 2017 18:16:30 -0800, Siwei Liu wrote:
>> > The plan is to remove the delay and do the naming in the kernel.
>> > This was suggested by Lennart since udev is only doing naming policy
>> > because kernel names were not repeatable.
>> >
>> > This makes the VF show up as "ethN_vf" on Hyper-V which is user friendly.
>> >
>> > Patch is pending.
>>
>> While it's good to show VF with specific naming to indicate
>> enslavement, I wonder wouldn't it be better to hide this netdev at all
>> from the user space? IMHO this extra device is useless when being
>> enslaved and we may delegate controls (e.g. ethtool) over to the
>> para-virtual device instead? That way it's possible to eliminate the
>> possibility of additional udev setup or modification?
>>
>> I'm not sure if this  is consistent with Windows guest or not, but I
>> don't find it _Linux_ user friendly that ethtool doesn't work on the
>> composite interface any more, and I have to end up with finding out
>> the correct enslaved VF I must operate on.
>
> Hiding "low level" netdevs comes up from time to time, and is more
> widely applicable than just to VF bonds.  We should find a generic
> solution to that problem.

Wholeheartedly agreed.

Be it a generic virtual bond or virtio-net specific one, there should
be some common code between netvsc and virtio for this type of work.
For avoiding duplicated bugs, consistent (Linux) user experience,
future code refactoring/management, and whatever...

One thing worth to note is that, unlike the Hyper-V netvsc backend
there's currently no equivalent (fine-grained) Linux ndo_* driver
interface that is able to move around MAC address/VLAN filters at
run-time specifically. The OID_RECEIVE_FILTER_MOVE_FILTER request I
mean. That translates to one substantial difference in VF
plumbing/unplumbing sequence: you cannot move the MAC address around
to paravirt device until VF is fully unplugged out of the guest OS. I
don't know what backend changes to be proposed for virtio-net as
helper, but the common code needs to work with both flavors of data
path switching backends and do its job correctly.

Regards,
-Siwei

^ permalink raw reply	[flat|nested] 44+ messages in thread

* [RFC PATCH] virtio_net: Extend virtio to use VF datapath when available
@ 2017-12-19  0:40 Sridhar Samudrala
  0 siblings, 0 replies; 44+ messages in thread
From: Sridhar Samudrala @ 2017-12-19  0:40 UTC (permalink / raw)
  To: mst, stephen, netdev, virtualization, alexander.duyck, sridhar.samudrala

This patch enables virtio to switch over to a VF datapath when a VF netdev
is present with the same MAC address.  It allows live migration of a VM
with a direct attached VF without the need to setup a bond/team between a
VF and virtio net device in the guest.

The hypervisor needs to unplug the VF device from the guest on the source
host and reset the MAC filter of the VF to initiate failover of datapath to
virtio before starting the migration. After the migration is completed, the
destination hypervisor sets the MAC filter on the VF and plugs it back to
the guest to switch over to VF datapath.

It is entirely based on netvsc implementation and it should be possible to
make this code generic and move it to a common location that can be shared
by netvsc and virtio.

Also, i think we should make this a negotiated feature that is off by
default via a new feature bit.

This patch is based on the discussion initiated by Jesse on this thread.
https://marc.info/?l=linux-virtualization&m=151189725224231&w=2

Signed-off-by: Sridhar Samudrala <sridhar.samudrala@intel.com>
Reviewed-by: Jesse Brandeburg <jesse.brandeburg@intel.com>
---
 drivers/net/virtio_net.c | 341 ++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 339 insertions(+), 2 deletions(-)

diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
index 559b215c0169..a34c717bb15b 100644
--- a/drivers/net/virtio_net.c
+++ b/drivers/net/virtio_net.c
@@ -31,6 +31,8 @@
 #include <linux/average.h>
 #include <linux/filter.h>
 #include <net/route.h>
+#include <linux/netdevice.h>
+#include <linux/netpoll.h>
 
 static int napi_weight = NAPI_POLL_WEIGHT;
 module_param(napi_weight, int, 0444);
@@ -56,6 +58,8 @@ module_param(napi_tx, bool, 0644);
  */
 DECLARE_EWMA(pkt_len, 0, 64)
 
+#define VF_TAKEOVER_INT	(HZ / 10)
+
 #define VIRTNET_DRIVER_VERSION "1.0.0"
 
 static const unsigned long guest_offloads[] = {
@@ -117,6 +121,15 @@ struct receive_queue {
 	char name[40];
 };
 
+struct virtnet_vf_pcpu_stats {
+	u64	rx_packets;
+	u64	rx_bytes;
+	u64	tx_packets;
+	u64	tx_bytes;
+	struct u64_stats_sync   syncp;
+	u32	tx_dropped;
+};
+
 struct virtnet_info {
 	struct virtio_device *vdev;
 	struct virtqueue *cvq;
@@ -179,6 +192,11 @@ struct virtnet_info {
 	u32 speed;
 
 	unsigned long guest_offloads;
+
+	/* State to manage the associated VF interface. */
+	struct net_device __rcu *vf_netdev;
+	struct virtnet_vf_pcpu_stats __percpu *vf_stats;
+	struct delayed_work vf_takeover;
 };
 
 struct padded_vnet_hdr {
@@ -1300,16 +1318,51 @@ static int xmit_skb(struct send_queue *sq, struct sk_buff *skb)
 	return virtqueue_add_outbuf(sq->vq, sq->sg, num_sg, skb, GFP_ATOMIC);
 }
 
+/* Send skb on the slave VF device. */
+static int virtnet_vf_xmit(struct net_device *dev, struct net_device *vf_netdev,
+			   struct sk_buff *skb)
+{
+	struct virtnet_info *vi = netdev_priv(dev);
+	unsigned int len = skb->len;
+	int rc;
+
+	skb->dev = vf_netdev;
+	skb->queue_mapping = qdisc_skb_cb(skb)->slave_dev_queue_mapping;
+
+	rc = dev_queue_xmit(skb);
+	if (likely(rc == NET_XMIT_SUCCESS || rc == NET_XMIT_CN)) {
+		struct virtnet_vf_pcpu_stats *pcpu_stats
+			= this_cpu_ptr(vi->vf_stats);
+
+		u64_stats_update_begin(&pcpu_stats->syncp);
+		pcpu_stats->tx_packets++;
+		pcpu_stats->tx_bytes += len;
+		u64_stats_update_end(&pcpu_stats->syncp);
+	} else {
+		this_cpu_inc(vi->vf_stats->tx_dropped);
+	}
+
+	return rc;
+}
+
 static netdev_tx_t start_xmit(struct sk_buff *skb, struct net_device *dev)
 {
 	struct virtnet_info *vi = netdev_priv(dev);
 	int qnum = skb_get_queue_mapping(skb);
 	struct send_queue *sq = &vi->sq[qnum];
+	struct net_device *vf_netdev;
 	int err;
 	struct netdev_queue *txq = netdev_get_tx_queue(dev, qnum);
 	bool kick = !skb->xmit_more;
 	bool use_napi = sq->napi.weight;
 
+	/* if VF is present and up then redirect packets
+	 * called with rcu_read_lock_bh
+	 */
+	vf_netdev = rcu_dereference_bh(vi->vf_netdev);
+	if (vf_netdev && netif_running(vf_netdev) && !netpoll_tx_running(dev))
+		return virtnet_vf_xmit(dev, vf_netdev, skb);
+
 	/* Free up any pending old buffers before queueing new ones. */
 	free_old_xmit_skbs(sq);
 
@@ -1456,10 +1509,41 @@ static int virtnet_set_mac_address(struct net_device *dev, void *p)
 	return ret;
 }
 
+static void virtnet_get_vf_stats(struct net_device *dev,
+				 struct virtnet_vf_pcpu_stats *tot)
+{
+	struct virtnet_info *vi = netdev_priv(dev);
+	int i;
+
+	memset(tot, 0, sizeof(*tot));
+
+	for_each_possible_cpu(i) {
+		const struct virtnet_vf_pcpu_stats *stats
+				= per_cpu_ptr(vi->vf_stats, i);
+		u64 rx_packets, rx_bytes, tx_packets, tx_bytes;
+		unsigned int start;
+
+		do {
+			start = u64_stats_fetch_begin_irq(&stats->syncp);
+			rx_packets = stats->rx_packets;
+			tx_packets = stats->tx_packets;
+			rx_bytes = stats->rx_bytes;
+			tx_bytes = stats->tx_bytes;
+		} while (u64_stats_fetch_retry_irq(&stats->syncp, start));
+
+		tot->rx_packets += rx_packets;
+		tot->tx_packets += tx_packets;
+		tot->rx_bytes   += rx_bytes;
+		tot->tx_bytes   += tx_bytes;
+		tot->tx_dropped += stats->tx_dropped;
+	}
+}
+
 static void virtnet_stats(struct net_device *dev,
 			  struct rtnl_link_stats64 *tot)
 {
 	struct virtnet_info *vi = netdev_priv(dev);
+	struct virtnet_vf_pcpu_stats vf_stats;
 	int cpu;
 	unsigned int start;
 
@@ -1490,6 +1574,13 @@ static void virtnet_stats(struct net_device *dev,
 	tot->rx_dropped = dev->stats.rx_dropped;
 	tot->rx_length_errors = dev->stats.rx_length_errors;
 	tot->rx_frame_errors = dev->stats.rx_frame_errors;
+
+	virtnet_get_vf_stats(dev, &vf_stats);
+	tot->rx_packets += vf_stats.rx_packets;
+	tot->tx_packets += vf_stats.tx_packets;
+	tot->rx_bytes += vf_stats.rx_bytes;
+	tot->tx_bytes += vf_stats.tx_bytes;
+	tot->tx_dropped += vf_stats.tx_dropped;
 }
 
 #ifdef CONFIG_NET_POLL_CONTROLLER
@@ -2508,6 +2599,47 @@ static int virtnet_validate(struct virtio_device *vdev)
 	return 0;
 }
 
+static void __virtnet_vf_setup(struct net_device *ndev,
+			       struct net_device *vf_netdev)
+{
+	int ret;
+
+	/* Align MTU of VF with master */
+	ret = dev_set_mtu(vf_netdev, ndev->mtu);
+	if (ret)
+		netdev_warn(vf_netdev,
+			    "unable to change mtu to %u\n", ndev->mtu);
+
+	if (netif_running(ndev)) {
+		ret = dev_open(vf_netdev);
+		if (ret)
+			netdev_warn(vf_netdev,
+				    "unable to open: %d\n", ret);
+	}
+}
+
+/* Setup VF as slave of the virtio device.
+ * Runs in workqueue to avoid recursion in netlink callbacks.
+ */
+static void virtnet_vf_setup(struct work_struct *w)
+{
+	struct virtnet_info *vi
+		= container_of(w, struct virtnet_info, vf_takeover.work);
+	struct net_device *ndev = vi->dev;
+	struct net_device *vf_netdev;
+
+	if (!rtnl_trylock()) {
+		schedule_delayed_work(&vi->vf_takeover, 0);
+		return;
+	}
+
+	vf_netdev = rtnl_dereference(vi->vf_netdev);
+	if (vf_netdev)
+		__virtnet_vf_setup(ndev, vf_netdev);
+
+	rtnl_unlock();
+}
+
 static int virtnet_probe(struct virtio_device *vdev)
 {
 	int i, err;
@@ -2600,6 +2732,11 @@ static int virtnet_probe(struct virtio_device *vdev)
 	}
 
 	INIT_WORK(&vi->config_work, virtnet_config_changed_work);
+	INIT_DELAYED_WORK(&vi->vf_takeover, virtnet_vf_setup);
+
+	vi->vf_stats = netdev_alloc_pcpu_stats(struct virtnet_vf_pcpu_stats);
+	if (!vi->vf_stats)
+		goto free_stats;
 
 	/* If we can receive ANY GSO packets, we must allocate large ones. */
 	if (virtio_has_feature(vdev, VIRTIO_NET_F_GUEST_TSO4) ||
@@ -2634,7 +2771,7 @@ static int virtnet_probe(struct virtio_device *vdev)
 			 */
 			dev_err(&vdev->dev, "device MTU appears to have changed "
 				"it is now %d < %d", mtu, dev->min_mtu);
-			goto free_stats;
+			goto free_vf_stats;
 		}
 
 		dev->mtu = mtu;
@@ -2658,7 +2795,7 @@ static int virtnet_probe(struct virtio_device *vdev)
 	/* Allocate/initialize the rx/tx queues, and invoke find_vqs */
 	err = init_vqs(vi);
 	if (err)
-		goto free_stats;
+		goto free_vf_stats;
 
 #ifdef CONFIG_SYSFS
 	if (vi->mergeable_rx_bufs)
@@ -2712,6 +2849,8 @@ static int virtnet_probe(struct virtio_device *vdev)
 	cancel_delayed_work_sync(&vi->refill);
 	free_receive_page_frags(vi);
 	virtnet_del_vqs(vi);
+free_vf_stats:
+	free_percpu(vi->vf_stats);
 free_stats:
 	free_percpu(vi->stats);
 free:
@@ -2733,19 +2872,178 @@ static void remove_vq_common(struct virtnet_info *vi)
 	virtnet_del_vqs(vi);
 }
 
+static struct net_device *get_virtio_bymac(const u8 *mac)
+{
+	struct net_device *dev;
+
+	ASSERT_RTNL();
+
+	for_each_netdev(&init_net, dev) {
+		if (dev->netdev_ops != &virtnet_netdev)
+			continue;       /* not a virtio_net device */
+
+		if (ether_addr_equal(mac, dev->perm_addr))
+			return dev;
+	}
+
+	return NULL;
+}
+
+static struct net_device *get_virtio_byref(struct net_device *vf_netdev)
+{
+	struct net_device *dev;
+
+	ASSERT_RTNL();
+
+	for_each_netdev(&init_net, dev) {
+		struct virtnet_info *vi;
+
+		if (dev->netdev_ops != &virtnet_netdev)
+			continue;	/* not a virtio_net device */
+
+		vi = netdev_priv(dev);
+		if (rtnl_dereference(vi->vf_netdev) == vf_netdev)
+			return dev;	/* a match */
+	}
+
+	return NULL;
+}
+
+/* Called when VF is injecting data into network stack.
+ * Change the associated network device from VF to virtio.
+ * note: already called with rcu_read_lock
+ */
+static rx_handler_result_t virtnet_vf_handle_frame(struct sk_buff **pskb)
+{
+	struct sk_buff *skb = *pskb;
+	struct net_device *ndev = rcu_dereference(skb->dev->rx_handler_data);
+	struct virtnet_info *vi = netdev_priv(ndev);
+	struct virtnet_vf_pcpu_stats *pcpu_stats =
+				this_cpu_ptr(vi->vf_stats);
+
+	skb->dev = ndev;
+
+	u64_stats_update_begin(&pcpu_stats->syncp);
+	pcpu_stats->rx_packets++;
+	pcpu_stats->rx_bytes += skb->len;
+	u64_stats_update_end(&pcpu_stats->syncp);
+
+	return RX_HANDLER_ANOTHER;
+}
+
+static int virtnet_vf_join(struct net_device *vf_netdev,
+			   struct net_device *ndev)
+{
+	struct virtnet_info *vi = netdev_priv(ndev);
+	int ret;
+
+	ret = netdev_rx_handler_register(vf_netdev,
+					 virtnet_vf_handle_frame, ndev);
+	if (ret != 0) {
+		netdev_err(vf_netdev,
+			   "can not register virtio VF receive handler (err = %d)\n",
+			   ret);
+		goto rx_handler_failed;
+	}
+
+	ret = netdev_upper_dev_link(vf_netdev, ndev, NULL);
+	if (ret != 0) {
+		netdev_err(vf_netdev,
+			   "can not set master device %s (err = %d)\n",
+			   ndev->name, ret);
+		goto upper_link_failed;
+	}
+
+	/* set slave flag before open to prevent IPv6 addrconf */
+	vf_netdev->flags |= IFF_SLAVE;
+
+	schedule_delayed_work(&vi->vf_takeover, VF_TAKEOVER_INT);
+
+	call_netdevice_notifiers(NETDEV_JOIN, vf_netdev);
+
+	netdev_info(vf_netdev, "joined to %s\n", ndev->name);
+	return 0;
+
+upper_link_failed:
+	netdev_rx_handler_unregister(vf_netdev);
+rx_handler_failed:
+	return ret;
+}
+
+static int virtnet_register_vf(struct net_device *vf_netdev)
+{
+	struct net_device *ndev;
+	struct virtnet_info *vi;
+
+	if (vf_netdev->addr_len != ETH_ALEN)
+		return NOTIFY_DONE;
+
+	/* We will use the MAC address to locate the virtio_net interface to
+	 * associate with the VF interface. If we don't find a matching
+	 * virtio interface, move on.
+	 */
+	ndev = get_virtio_bymac(vf_netdev->perm_addr);
+	if (!ndev)
+		return NOTIFY_DONE;
+
+	vi = netdev_priv(ndev);
+	if (rtnl_dereference(vi->vf_netdev))
+		return NOTIFY_DONE;
+
+	if (virtnet_vf_join(vf_netdev, ndev) != 0)
+		return NOTIFY_DONE;
+
+	netdev_info(ndev, "VF registering %s\n", vf_netdev->name);
+
+	dev_hold(vf_netdev);
+	rcu_assign_pointer(vi->vf_netdev, vf_netdev);
+
+	return NOTIFY_OK;
+}
+
+static int virtnet_unregister_vf(struct net_device *vf_netdev)
+{
+	struct net_device *ndev;
+	struct virtnet_info *vi;
+
+	ndev = get_virtio_byref(vf_netdev);
+	if (!ndev)
+		return NOTIFY_DONE;
+
+	vi = netdev_priv(ndev);
+	cancel_delayed_work_sync(&vi->vf_takeover);
+
+	netdev_info(ndev, "VF unregistering %s\n", vf_netdev->name);
+
+	netdev_rx_handler_unregister(vf_netdev);
+	netdev_upper_dev_unlink(vf_netdev, ndev);
+	RCU_INIT_POINTER(vi->vf_netdev, NULL);
+	dev_put(vf_netdev);
+
+	return NOTIFY_OK;
+}
+
 static void virtnet_remove(struct virtio_device *vdev)
 {
 	struct virtnet_info *vi = vdev->priv;
+	struct net_device *vf_netdev;
 
 	virtnet_cpu_notif_remove(vi);
 
 	/* Make sure no work handler is accessing the device. */
 	flush_work(&vi->config_work);
 
+	rtnl_lock();
+	vf_netdev = rtnl_dereference(vi->vf_netdev);
+	if (vf_netdev)
+		virtnet_unregister_vf(vf_netdev);
+	rtnl_unlock();
+
 	unregister_netdev(vi->dev);
 
 	remove_vq_common(vi);
 
+	free_percpu(vi->vf_stats);
 	free_percpu(vi->stats);
 	free_netdev(vi->dev);
 }
@@ -2823,6 +3121,42 @@ static struct virtio_driver virtio_net_driver = {
 #endif
 };
 
+static int virtio_netdev_event(struct notifier_block *this,
+			       unsigned long event, void *ptr)
+{
+	struct net_device *event_dev = netdev_notifier_info_to_dev(ptr);
+
+	/* Skip our own events */
+	if (event_dev->netdev_ops == &virtnet_netdev)
+		return NOTIFY_DONE;
+
+	/* Avoid non-Ethernet type devices */
+	if (event_dev->type != ARPHRD_ETHER)
+		return NOTIFY_DONE;
+
+	/* Avoid Vlan dev with same MAC registering as VF */
+	if (is_vlan_dev(event_dev))
+		return NOTIFY_DONE;
+
+	/* Avoid Bonding master dev with same MAC registering as VF */
+	if ((event_dev->priv_flags & IFF_BONDING) &&
+	    (event_dev->flags & IFF_MASTER))
+		return NOTIFY_DONE;
+
+	switch (event) {
+	case NETDEV_REGISTER:
+		return virtnet_register_vf(event_dev);
+	case NETDEV_UNREGISTER:
+		return virtnet_unregister_vf(event_dev);
+	default:
+		return NOTIFY_DONE;
+	}
+}
+
+static struct notifier_block virtio_netdev_notifier = {
+	.notifier_call = virtio_netdev_event,
+};
+
 static __init int virtio_net_driver_init(void)
 {
 	int ret;
@@ -2841,6 +3175,8 @@ static __init int virtio_net_driver_init(void)
         ret = register_virtio_driver(&virtio_net_driver);
 	if (ret)
 		goto err_virtio;
+
+	register_netdevice_notifier(&virtio_netdev_notifier);
 	return 0;
 err_virtio:
 	cpuhp_remove_multi_state(CPUHP_VIRT_NET_DEAD);
@@ -2853,6 +3189,7 @@ module_init(virtio_net_driver_init);
 
 static __exit void virtio_net_driver_exit(void)
 {
+	unregister_netdevice_notifier(&virtio_netdev_notifier);
 	unregister_virtio_driver(&virtio_net_driver);
 	cpuhp_remove_multi_state(CPUHP_VIRT_NET_DEAD);
 	cpuhp_remove_multi_state(virtionet_online);
-- 
2.14.3

^ permalink raw reply related	[flat|nested] 44+ messages in thread

end of thread, other threads:[~2017-12-22  8:42 UTC | newest]

Thread overview: 44+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2017-12-19  0:40 [RFC PATCH] virtio_net: Extend virtio to use VF datapath when available Sridhar Samudrala
2017-12-19 15:47 ` Michael S. Tsirkin
2017-12-19 15:47 ` Michael S. Tsirkin
2017-12-19 17:41   ` Samudrala, Sridhar
2017-12-19 17:41   ` Samudrala, Sridhar
2017-12-19 17:55     ` Stephen Hemminger
2017-12-19 17:55     ` Stephen Hemminger
2017-12-19 18:07       ` Michael S. Tsirkin
2017-12-19 18:13         ` Stephen Hemminger
2017-12-19 18:13         ` Stephen Hemminger
2017-12-19 18:07       ` Michael S. Tsirkin
2017-12-19 18:21       ` David Miller
2017-12-19 18:21       ` David Miller
2017-12-19 18:41         ` Stephen Hemminger
2017-12-19 18:41         ` Stephen Hemminger
2017-12-19 19:42           ` Samudrala, Sridhar
2017-12-19 19:42           ` Samudrala, Sridhar
2017-12-19 19:46             ` Stephen Hemminger
2017-12-19 19:46             ` Stephen Hemminger
2017-12-19 22:37               ` Samudrala, Sridhar
2017-12-19 22:53                 ` Stephen Hemminger
2017-12-20  0:26                   ` Samudrala, Sridhar
2017-12-20  0:26                   ` Samudrala, Sridhar
2017-12-19 22:53                 ` Stephen Hemminger
2017-12-19 22:37               ` Samudrala, Sridhar
2017-12-21  1:31           ` Siwei Liu
2017-12-21  2:16           ` Siwei Liu
2017-12-21  2:16           ` Siwei Liu
2017-12-21  4:52             ` Jakub Kicinski
2017-12-22  8:42               ` Siwei Liu
2017-12-22  8:42               ` Siwei Liu
2017-12-19 18:20     ` David Miller
2017-12-19 18:20     ` David Miller
2017-12-20 10:51       ` Vitaly Kuznetsov
2017-12-20 10:51       ` Vitaly Kuznetsov
2017-12-20 22:33 ` Jakub Kicinski
2017-12-21  0:15   ` Michael S. Tsirkin
2017-12-21  0:15   ` Michael S. Tsirkin
2017-12-21  0:29     ` Jakub Kicinski
2017-12-21  0:29     ` Jakub Kicinski
2017-12-20 22:33 ` Jakub Kicinski
2017-12-21  0:14 ` Michael S. Tsirkin
2017-12-21  0:14 ` Michael S. Tsirkin
  -- strict thread matches above, loose matches on Subject: below --
2017-12-19  0:40 Sridhar Samudrala

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.