All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH 1/2] net/mlx5: support device removal event
@ 2017-08-13 12:25 Matan Azrad
  2017-08-13 12:25 ` [PATCH 2/2] net/mlx5: fix probe failure report Matan Azrad
  2017-08-23  9:40 ` [PATCH 1/2] net/mlx5: support device removal event Nélio Laranjeiro
  0 siblings, 2 replies; 19+ messages in thread
From: Matan Azrad @ 2017-08-13 12:25 UTC (permalink / raw)
  To: Adrien Mazarguil, Nelio Laranjeiro; +Cc: dev

Extend the LSC event handling to support the device removal as well.
The Verbs library may send several related events, which are
different from LSC event.

The mlx5 event handling has been made capable of receiving and
signaling several event types at once.

This support includes the following:
1. Removal event detection according to the user configuration.
2. Calling all registered mlx5 removal callbacks.
3. Capabilities extension to include removal interrupt handling.

Signed-off-by: Matan Azrad <matan@mellanox.com>
---
 drivers/net/mlx5/mlx5.c        |   2 +-
 drivers/net/mlx5/mlx5_ethdev.c | 100 +++++++++++++++++++++++++++--------------
 2 files changed, 68 insertions(+), 34 deletions(-)

Hi,
This patch is based on top of Nelio's latest mlx5 cleanup patches.

diff --git a/drivers/net/mlx5/mlx5.c b/drivers/net/mlx5/mlx5.c
index bd66a7c..1a3d7f1 100644
--- a/drivers/net/mlx5/mlx5.c
+++ b/drivers/net/mlx5/mlx5.c
@@ -865,7 +865,7 @@ static struct rte_pci_driver mlx5_driver = {
 	},
 	.id_table = mlx5_pci_id_map,
 	.probe = mlx5_pci_probe,
-	.drv_flags = RTE_PCI_DRV_INTR_LSC,
+	.drv_flags = RTE_PCI_DRV_INTR_LSC | RTE_PCI_DRV_INTR_RMV,
 };
 
 /**
diff --git a/drivers/net/mlx5/mlx5_ethdev.c b/drivers/net/mlx5/mlx5_ethdev.c
index 57f6237..404d8f4 100644
--- a/drivers/net/mlx5/mlx5_ethdev.c
+++ b/drivers/net/mlx5/mlx5_ethdev.c
@@ -1112,47 +1112,75 @@ mlx5_ibv_device_to_pci_addr(const struct ibv_device *device,
 }
 
 /**
- * Link status handler.
+ * Update the link status.
+ * Set alarm if the device link status is inconsistent.
  *
  * @param priv
  *   Pointer to private structure.
- * @param dev
- *   Pointer to the rte_eth_dev structure.
  *
  * @return
- *   Nonzero if the callback process can be called immediately.
+ *   Zero if alarm is not set and the link status is consistent.
  */
 static int
-priv_dev_link_status_handler(struct priv *priv, struct rte_eth_dev *dev)
+priv_link_status_alarm_update(struct priv *priv)
+{
+	struct rte_eth_link *link = &priv->dev->data->dev_link;
+
+	mlx5_link_update(priv->dev, 0);
+	if (((link->link_speed == 0) && link->link_status) ||
+		((link->link_speed != 0) && !link->link_status)) {
+		if (!priv->pending_alarm) {
+			/* Inconsistent status, check again later. */
+			priv->pending_alarm = 1;
+			rte_eal_alarm_set(MLX5_ALARM_TIMEOUT_US,
+				mlx5_dev_link_status_handler,
+				priv->dev);
+		}
+		return 1;
+	} else if (unlikely(priv->pending_alarm)) {
+		/* In case of link interrupt while link alarm was setting. */
+		priv->pending_alarm = 0;
+		rte_eal_alarm_cancel(mlx5_dev_link_status_handler, priv->dev);
+	}
+	return 0;
+}
+
+/**
+ * Device status handler.
+ *
+ * @param priv
+ *   Pointer to private structure.
+ * @param events
+ *   Pointer to event flags holder.
+ *
+ * @return
+ *   Events bitmap of callback process which can be called immediately.
+ */
+static uint32_t
+priv_dev_status_handler(struct priv *priv)
 {
 	struct ibv_async_event event;
-	struct rte_eth_link *link = &dev->data->dev_link;
-	int ret = 0;
+	uint32_t ret = 0;
 
 	/* Read all message and acknowledge them. */
 	for (;;) {
 		if (ibv_get_async_event(priv->ctx, &event))
 			break;
-
-		if (event.event_type != IBV_EVENT_PORT_ACTIVE &&
-		    event.event_type != IBV_EVENT_PORT_ERR)
+		if ((event.event_type == IBV_EVENT_PORT_ACTIVE ||
+			event.event_type == IBV_EVENT_PORT_ERR) &&
+			(priv->dev->data->dev_conf.intr_conf.lsc == 1))
+			ret |= (1 << RTE_ETH_EVENT_INTR_LSC);
+		else if (event.event_type == IBV_EVENT_DEVICE_FATAL &&
+			priv->dev->data->dev_conf.intr_conf.rmv == 1)
+			ret |= (1 << RTE_ETH_EVENT_INTR_RMV);
+		else
 			DEBUG("event type %d on port %d not handled",
-			      event.event_type, event.element.port_num);
+				event.event_type, event.element.port_num);
 		ibv_ack_async_event(&event);
 	}
-	mlx5_link_update(dev, 0);
-	if (((link->link_speed == 0) && link->link_status) ||
-	    ((link->link_speed != 0) && !link->link_status)) {
-		if (!priv->pending_alarm) {
-			/* Inconsistent status, check again later. */
-			priv->pending_alarm = 1;
-			rte_eal_alarm_set(MLX5_ALARM_TIMEOUT_US,
-					  mlx5_dev_link_status_handler,
-					  dev);
-		}
-	} else {
-		ret = 1;
-	}
+	if (ret & (1 << RTE_ETH_EVENT_INTR_LSC))
+		if (priv_link_status_alarm_update(priv))
+			ret &= ~(1 << RTE_ETH_EVENT_INTR_LSC);
 	return ret;
 }
 
@@ -1172,11 +1200,11 @@ mlx5_dev_link_status_handler(void *arg)
 	priv_lock(priv);
 	assert(priv->pending_alarm == 1);
 	priv->pending_alarm = 0;
-	ret = priv_dev_link_status_handler(priv, dev);
+	ret = priv_link_status_alarm_update(priv);
 	priv_unlock(priv);
-	if (ret)
+	if (!ret)
 		_rte_eth_dev_callback_process(dev, RTE_ETH_EVENT_INTR_LSC, NULL,
-					      NULL);
+			NULL);
 }
 
 /**
@@ -1192,14 +1220,17 @@ mlx5_dev_interrupt_handler(void *cb_arg)
 {
 	struct rte_eth_dev *dev = cb_arg;
 	struct priv *priv = dev->data->dev_private;
-	int ret;
+	uint32_t events;
 
 	priv_lock(priv);
-	ret = priv_dev_link_status_handler(priv, dev);
+	events = priv_dev_status_handler(priv);
 	priv_unlock(priv);
-	if (ret)
+	if (events & (1 << RTE_ETH_EVENT_INTR_LSC))
 		_rte_eth_dev_callback_process(dev, RTE_ETH_EVENT_INTR_LSC, NULL,
-					      NULL);
+			NULL);
+	if (events & (1 << RTE_ETH_EVENT_INTR_RMV))
+		_rte_eth_dev_callback_process(dev, RTE_ETH_EVENT_INTR_RMV, NULL,
+			NULL);
 }
 
 /**
@@ -1213,7 +1244,8 @@ mlx5_dev_interrupt_handler(void *cb_arg)
 void
 priv_dev_interrupt_handler_uninstall(struct priv *priv, struct rte_eth_dev *dev)
 {
-	if (!dev->data->dev_conf.intr_conf.lsc)
+	if (!dev->data->dev_conf.intr_conf.lsc &&
+		!dev->data->dev_conf.intr_conf.rmv)
 		return;
 	rte_intr_callback_unregister(&priv->intr_handle,
 				     mlx5_dev_interrupt_handler,
@@ -1238,7 +1270,8 @@ priv_dev_interrupt_handler_install(struct priv *priv, struct rte_eth_dev *dev)
 {
 	int rc, flags;
 
-	if (!dev->data->dev_conf.intr_conf.lsc)
+	if (!dev->data->dev_conf.intr_conf.lsc &&
+		!dev->data->dev_conf.intr_conf.rmv)
 		return;
 	assert(priv->ctx->async_fd > 0);
 	flags = fcntl(priv->ctx->async_fd, F_GETFL);
@@ -1246,6 +1279,7 @@ priv_dev_interrupt_handler_install(struct priv *priv, struct rte_eth_dev *dev)
 	if (rc < 0) {
 		INFO("failed to change file descriptor async event queue");
 		dev->data->dev_conf.intr_conf.lsc = 0;
+		dev->data->dev_conf.intr_conf.rmv = 0;
 	} else {
 		priv->intr_handle.fd = priv->ctx->async_fd;
 		priv->intr_handle.type = RTE_INTR_HANDLE_EXT;
-- 
2.7.4

^ permalink raw reply related	[flat|nested] 19+ messages in thread

* [PATCH 2/2] net/mlx5: fix probe failure report
  2017-08-13 12:25 [PATCH 1/2] net/mlx5: support device removal event Matan Azrad
@ 2017-08-13 12:25 ` Matan Azrad
  2017-08-23  9:44   ` Nélio Laranjeiro
  2017-08-23  9:40 ` [PATCH 1/2] net/mlx5: support device removal event Nélio Laranjeiro
  1 sibling, 1 reply; 19+ messages in thread
From: Matan Azrad @ 2017-08-13 12:25 UTC (permalink / raw)
  To: Adrien Mazarguil, Nelio Laranjeiro; +Cc: dev, stable

The faulty code doesn't return an error when the probe function
fails because the device MAC address cannot be retrieved.
As a result, the probe function may return success even if the
ETH dev is not allocated.

Hence, the probe caller, for example the failsafe PMD, fails when it
tries to get the ETH dev after the device was plugged out while mlx5
was probing it.

The fix reports an error to the probe caller when priv_get_mac fails
and in all other failure paths which were missing it.

This prevents the unexpected situation of a missing ETH device
after the device was reported as probed successfully.

This bug was already present in the original code taken from mlx4.

Fixes: 771fa900b73a ("mlx5: introduce new driver for Mellanox ConnectX-4 adapters")
Fixes: 1371f4df16bc ("mlx5: check port is configured as ethernet device")

Signed-off-by: Matan Azrad <matan@mellanox.com>
Cc: stable@dpdk.org
---
 drivers/net/mlx5/mlx5.c | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/drivers/net/mlx5/mlx5.c b/drivers/net/mlx5/mlx5.c
index 1a3d7f1..99a2fb3 100644
--- a/drivers/net/mlx5/mlx5.c
+++ b/drivers/net/mlx5/mlx5.c
@@ -579,8 +579,10 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv, struct rte_pci_device *pci_dev)
 		DEBUG("using port %u (%08" PRIx32 ")", port, test);
 
 		ctx = ibv_open_device(ibv_dev);
-		if (ctx == NULL)
+		if (ctx == NULL) {
+			err = ENODEV;
 			goto port_error;
+		}
 
 		/* Check port status. */
 		err = ibv_query_port(ctx, port, &port_attr);
@@ -592,6 +594,7 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv, struct rte_pci_device *pci_dev)
 		if (port_attr.link_layer != IBV_LINK_LAYER_ETHERNET) {
 			ERROR("port %d is not configured in Ethernet mode",
 			      port);
+			err = EINVAL;
 			goto port_error;
 		}
 
@@ -640,6 +643,7 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv, struct rte_pci_device *pci_dev)
 		mlx5_args_assign(priv, &args);
 		if (ibv_exp_query_device(ctx, &exp_device_attr)) {
 			ERROR("ibv_exp_query_device() failed");
+			err = ENODEV;
 			goto port_error;
 		}
 
@@ -728,6 +732,7 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv, struct rte_pci_device *pci_dev)
 		if (priv_get_mac(priv, &mac.addr_bytes)) {
 			ERROR("cannot get MAC address, is mlx5_en loaded?"
 			      " (errno: %s)", strerror(errno));
+			err = ENODEV;
 			goto port_error;
 		}
 		INFO("port %u MAC address is %02x:%02x:%02x:%02x:%02x:%02x",
-- 
2.7.4

^ permalink raw reply related	[flat|nested] 19+ messages in thread

* Re: [PATCH 1/2] net/mlx5: support device removal event
  2017-08-13 12:25 [PATCH 1/2] net/mlx5: support device removal event Matan Azrad
  2017-08-13 12:25 ` [PATCH 2/2] net/mlx5: fix probe failure report Matan Azrad
@ 2017-08-23  9:40 ` Nélio Laranjeiro
  2017-08-23 19:44   ` Matan Azrad
  1 sibling, 1 reply; 19+ messages in thread
From: Nélio Laranjeiro @ 2017-08-23  9:40 UTC (permalink / raw)
  To: Matan Azrad; +Cc: Adrien Mazarguil, dev

Hi Matan,

On Sun, Aug 13, 2017 at 03:25:11PM +0300, Matan Azrad wrote:
> Extend the LSC event handling to support the device removal as well.
> The Verbs library may send several related events, which are
> different from LSC event.
> 
> The mlx5 event handling has been made capable of receiving and
> signaling several event types at once.
> 
> This support includes next:
> 1. Removal event detection according to the user configuration.
> 2. Calling to all registered mlx5 removal callbacks.
> 3. Capabilities extension to include removal interrupt handling.
> 
> Signed-off-by: Matan Azrad <matan@mellanox.com>
> ---
>  drivers/net/mlx5/mlx5.c        |   2 +-
>  drivers/net/mlx5/mlx5_ethdev.c | 100 +++++++++++++++++++++++++++--------------
>  2 files changed, 68 insertions(+), 34 deletions(-)
> 
> Hi 
> This patch based on top of last Nelio mlx5 cleanup patches.
> 
> diff --git a/drivers/net/mlx5/mlx5.c b/drivers/net/mlx5/mlx5.c
> index bd66a7c..1a3d7f1 100644
> --- a/drivers/net/mlx5/mlx5.c
> +++ b/drivers/net/mlx5/mlx5.c
> @@ -865,7 +865,7 @@ static struct rte_pci_driver mlx5_driver = {
>  	},
>  	.id_table = mlx5_pci_id_map,
>  	.probe = mlx5_pci_probe,
> -	.drv_flags = RTE_PCI_DRV_INTR_LSC,
> +	.drv_flags = RTE_PCI_DRV_INTR_LSC | RTE_PCI_DRV_INTR_RMV,
>  };
>  
>  /**
> diff --git a/drivers/net/mlx5/mlx5_ethdev.c b/drivers/net/mlx5/mlx5_ethdev.c
> index 57f6237..404d8f4 100644
> --- a/drivers/net/mlx5/mlx5_ethdev.c
> +++ b/drivers/net/mlx5/mlx5_ethdev.c
> @@ -1112,47 +1112,75 @@ mlx5_ibv_device_to_pci_addr(const struct ibv_device *device,
>  }
>  
>  /**
> - * Link status handler.
> + * Update the link status.
> + * Set alarm if the device link status is inconsistent.

Adding such a comment should also explain the issue this alarm is solving,
i.e. why the link is inconsistent and why the alarm helps to fix the issue.

>   *
>   * @param priv
>   *   Pointer to private structure.
> - * @param dev
> - *   Pointer to the rte_eth_dev structure.
>   *
>   * @return
> - *   Nonzero if the callback process can be called immediately.
> + *   Zero if alarm is not set and the link status is consistent.
>   */
>  static int
> -priv_dev_link_status_handler(struct priv *priv, struct rte_eth_dev *dev)
> +priv_link_status_alarm_update(struct priv *priv)

The old name is more accurate; the fact that we need to program an alarm is a
workaround to get the correct status from ethtool.  If it were possible to
avoid it, this alarm would not exist.

> +{
> +	struct rte_eth_link *link = &priv->dev->data->dev_link;
> +
> +	mlx5_link_update(priv->dev, 0);
> +	if (((link->link_speed == 0) && link->link_status) ||
> +		((link->link_speed != 0) && !link->link_status)) {
> +		if (!priv->pending_alarm) {
> +			/* Inconsistent status, check again later. */
> +			priv->pending_alarm = 1;
> +			rte_eal_alarm_set(MLX5_ALARM_TIMEOUT_US,
> +				mlx5_dev_link_status_handler,
> +				priv->dev);
> +		}
> +		return 1;
> +	} else if (unlikely(priv->pending_alarm)) {
> +		/* In case of link interrupt while link alarm was setting. */
> +		priv->pending_alarm = 0;
> +		rte_eal_alarm_cancel(mlx5_dev_link_status_handler, priv->dev);
> +	}
> +	return 0;
> +}
> +
>[...]
>  
> @@ -1172,11 +1200,11 @@ mlx5_dev_link_status_handler(void *arg)
>  	priv_lock(priv);
>  	assert(priv->pending_alarm == 1);
>  	priv->pending_alarm = 0;
> -	ret = priv_dev_link_status_handler(priv, dev);
> +	ret = priv_link_status_alarm_update(priv);

It is not clear: this calls an alarm_update without getting the link status.
The function name is "link_status_handler", so why does the behavior not
reflect the function name?

It is too confusing to be integrated as is. We had several bugs in this part of
the code; keep it clear by keeping the old function names.

Thanks,

-- 
Nélio Laranjeiro
6WIND

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH 2/2] net/mlx5: fix probe failure report
  2017-08-13 12:25 ` [PATCH 2/2] net/mlx5: fix probe failure report Matan Azrad
@ 2017-08-23  9:44   ` Nélio Laranjeiro
  2017-09-01 10:40     ` [dpdk-stable] " Ferruh Yigit
  0 siblings, 1 reply; 19+ messages in thread
From: Nélio Laranjeiro @ 2017-08-23  9:44 UTC (permalink / raw)
  To: Matan Azrad; +Cc: Adrien Mazarguil, dev, stable

On Sun, Aug 13, 2017 at 03:25:12PM +0300, Matan Azrad wrote:
> The corrupted code doesn't return error when probe function
> fails due to error in device mac address getting.
> By this way, the probe function may return success even if the
> ETH dev is not allocated.
> 
> Hence, the probe caller, for example failsafe PMD, fails when it
> tries to get ETH dev after the device was plugged out while mlx5
> was probing it.
> 
> The fix adds error report to the probe caller when priv_get_mac fails
> and in all other failure options which are missing it.
> 
> By this way, it prevents the unexpected behavior to miss ETH device
> after the device was probed successfully.
> 
> This bug was already present in the original code taken from mlx4.
> 
> Fixes: 771fa900b73a ("mlx5: introduce new driver for Mellanox ConnectX-4 adapters")
> Fixes: 1371f4df16bc ("mlx5: check port is configured as ethernet device")
> 
> Signed-off-by: Matan Azrad <matan@mellanox.com>
> Cc: stable@dpdk.org

Acked-by: Nelio Laranjeiro <nelio.laranjeiro@6wind.com>

-- 
Nélio Laranjeiro
6WIND

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH 1/2] net/mlx5: support device removal event
  2017-08-23  9:40 ` [PATCH 1/2] net/mlx5: support device removal event Nélio Laranjeiro
@ 2017-08-23 19:44   ` Matan Azrad
  2017-08-24  7:38     ` Nélio Laranjeiro
  0 siblings, 1 reply; 19+ messages in thread
From: Matan Azrad @ 2017-08-23 19:44 UTC (permalink / raw)
  To: Nélio Laranjeiro; +Cc: Adrien Mazarguil, dev

Hi Nelio

> -----Original Message-----
> From: Nélio Laranjeiro [mailto:nelio.laranjeiro@6wind.com]
> Sent: Wednesday, August 23, 2017 12:41 PM
> To: Matan Azrad <matan@mellanox.com>
> Cc: Adrien Mazarguil <adrien.mazarguil@6wind.com>; dev@dpdk.org
> Subject: Re: [PATCH 1/2] net/mlx5: support device removal event
> 
> Hi Matan,
> 
> On Sun, Aug 13, 2017 at 03:25:11PM +0300, Matan Azrad wrote:
> > Extend the LSC event handling to support the device removal as well.
> > The Verbs library may send several related events, which are different
> > from LSC event.
> >
> > The mlx5 event handling has been made capable of receiving and
> > signaling several event types at once.
> >
> > This support includes next:
> > 1. Removal event detection according to the user configuration.
> > 2. Calling to all registered mlx5 removal callbacks.
> > 3. Capabilities extension to include removal interrupt handling.
> >
> > Signed-off-by: Matan Azrad <matan@mellanox.com>
> > ---
> >  drivers/net/mlx5/mlx5.c        |   2 +-
> >  drivers/net/mlx5/mlx5_ethdev.c | 100
> > +++++++++++++++++++++++++++--------------
> >  2 files changed, 68 insertions(+), 34 deletions(-)
> >
> > Hi
> > This patch based on top of last Nelio mlx5 cleanup patches.
> >
> > diff --git a/drivers/net/mlx5/mlx5.c b/drivers/net/mlx5/mlx5.c index
> > bd66a7c..1a3d7f1 100644
> > --- a/drivers/net/mlx5/mlx5.c
> > +++ b/drivers/net/mlx5/mlx5.c
> > @@ -865,7 +865,7 @@ static struct rte_pci_driver mlx5_driver = {
> >  	},
> >  	.id_table = mlx5_pci_id_map,
> >  	.probe = mlx5_pci_probe,
> > -	.drv_flags = RTE_PCI_DRV_INTR_LSC,
> > +	.drv_flags = RTE_PCI_DRV_INTR_LSC | RTE_PCI_DRV_INTR_RMV,
> >  };
> >
> >  /**
> > diff --git a/drivers/net/mlx5/mlx5_ethdev.c
> > b/drivers/net/mlx5/mlx5_ethdev.c index 57f6237..404d8f4 100644
> > --- a/drivers/net/mlx5/mlx5_ethdev.c
> > +++ b/drivers/net/mlx5/mlx5_ethdev.c
> > @@ -1112,47 +1112,75 @@ mlx5_ibv_device_to_pci_addr(const struct
> > ibv_device *device,  }
> >
> >  /**
> > - * Link status handler.
> > + * Update the link status.
> > + * Set alarm if the device link status is inconsistent.
> 
> Adding such comment should also comment about the issue this alarm is
> solving i.e. why the link is inconsistent and why the alarm help to fix the
> issue.
> 
I didn't see any comments about that in the old code, hence I didn't write it.
I think you are right and this could be added (even before this patch).

> >   *
> >   * @param priv
> >   *   Pointer to private structure.
> > - * @param dev
> > - *   Pointer to the rte_eth_dev structure.
> >   *
> >   * @return
> > - *   Nonzero if the callback process can be called immediately.
> > + *   Zero if alarm is not set and the link status is consistent.
> >   */
> >  static int
> > -priv_dev_link_status_handler(struct priv *priv, struct rte_eth_dev
> > *dev)
> > +priv_link_status_alarm_update(struct priv *priv)
> 	
> The old name is more accurate, the fact we need to program an alarm is a
> work around to get the correct status from ethtool.  If it was possible to avoid
> it, this alarm would not exists.
> 
Probably because of the git +/- format of this specific patch you got confused here.
Actually, priv_link_status_alarm_update is a new function and doesn't replace the priv_dev_link_status_handler function.

The new name is priv_dev_status_handler, since
it is now not just a link handler but also a removal handler
(and maybe a handler for more interrupt types in the future).


> > +{
> > +	struct rte_eth_link *link = &priv->dev->data->dev_link;
> > +
> > +	mlx5_link_update(priv->dev, 0);
> > +	if (((link->link_speed == 0) && link->link_status) ||
> > +		((link->link_speed != 0) && !link->link_status)) {
> > +		if (!priv->pending_alarm) {
> > +			/* Inconsistent status, check again later. */
> > +			priv->pending_alarm = 1;
> > +			rte_eal_alarm_set(MLX5_ALARM_TIMEOUT_US,
> > +				mlx5_dev_link_status_handler,
> > +				priv->dev);
> > +		}
> > +		return 1;
> > +	} else if (unlikely(priv->pending_alarm)) {
> > +		/* In case of link interrupt while link alarm was setting. */
> > +		priv->pending_alarm = 0;
> > +		rte_eal_alarm_cancel(mlx5_dev_link_status_handler, priv-
> >dev);
> > +	}
> > +	return 0;
> > +}
> > +
> >[...]
> >
> > @@ -1172,11 +1200,11 @@ mlx5_dev_link_status_handler(void *arg)
> >  	priv_lock(priv);
> >  	assert(priv->pending_alarm == 1);
> >  	priv->pending_alarm = 0;
> > -	ret = priv_dev_link_status_handler(priv, dev);
> > +	ret = priv_link_status_alarm_update(priv);
> 
> It is not clear, this calls an alarm_update without getting the link status, the
> function name is "link_status_handler" why does the behavior does not
> reflect the function name?
> 
> It is too confusing to be integrated as is, we had several bugs in this part of
> the code, keep it clear, by keeping the old functions name.
> 
Just to explain what was changed in the link functions:

priv_dev_link_status_handler was renamed
to priv_dev_status_handler, as I already explained.

Some of the priv_dev_status_handler code was moved to
a new function named priv_link_status_alarm_update.

This function updates the link status and sets/cancels the
link inconsistency alarm if needed.
So, it updates both the link status and the alarm setting.
I am open to other name suggestions :)

I did this because I think the alarm handler (mlx5_dev_link_status_handler)
shouldn't call priv_dev_status_handler to try to update
the link again, since:
1. We can't know who is calling (the interrupt or the alarm), and the logic
differs accordingly:
In case of an interrupt, we must update the link only when the interrupt type is LSC.
In case of an alarm, we should always call the link update.
2. It doesn't need to read new events from Verbs (it is not a new interrupt).
Therefore, the alarm handler just calls the new function.

So, the new function is called either by priv_dev_status_handler
in case of an LSC interrupt, or by mlx5_dev_link_status_handler for
another chance to get a consistent link status.

> Thanks,
> 
> --
> Nélio Laranjeiro
> 6WIND

Regards
Matan Azrad

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH 1/2] net/mlx5: support device removal event
  2017-08-23 19:44   ` Matan Azrad
@ 2017-08-24  7:38     ` Nélio Laranjeiro
  2017-08-24 14:33       ` Matan Azrad
  0 siblings, 1 reply; 19+ messages in thread
From: Nélio Laranjeiro @ 2017-08-24  7:38 UTC (permalink / raw)
  To: Matan Azrad; +Cc: Adrien Mazarguil, dev

On Wed, Aug 23, 2017 at 07:44:45PM +0000, Matan Azrad wrote:
> Hi Nelio
> 
> > -----Original Message-----
> > From: Nélio Laranjeiro [mailto:nelio.laranjeiro@6wind.com]
> > Sent: Wednesday, August 23, 2017 12:41 PM
> > To: Matan Azrad <matan@mellanox.com>
> > Cc: Adrien Mazarguil <adrien.mazarguil@6wind.com>; dev@dpdk.org
> > Subject: Re: [PATCH 1/2] net/mlx5: support device removal event
> > 
> > Hi Matan,
> > 
> > On Sun, Aug 13, 2017 at 03:25:11PM +0300, Matan Azrad wrote:
> > > Extend the LSC event handling to support the device removal as well.
> > > The Verbs library may send several related events, which are different
> > > from LSC event.
> > >
> > > The mlx5 event handling has been made capable of receiving and
> > > signaling several event types at once.
> > >
> > > This support includes next:
> > > 1. Removal event detection according to the user configuration.
> > > 2. Calling to all registered mlx5 removal callbacks.
> > > 3. Capabilities extension to include removal interrupt handling.
> > >
> > > Signed-off-by: Matan Azrad <matan@mellanox.com>
> > > ---
> > >  drivers/net/mlx5/mlx5.c        |   2 +-
> > >  drivers/net/mlx5/mlx5_ethdev.c | 100
> > > +++++++++++++++++++++++++++--------------
> > >  2 files changed, 68 insertions(+), 34 deletions(-)
> > >
> > > Hi
> > > This patch based on top of last Nelio mlx5 cleanup patches.
> > >
> > > diff --git a/drivers/net/mlx5/mlx5.c b/drivers/net/mlx5/mlx5.c index
> > > bd66a7c..1a3d7f1 100644
> > > --- a/drivers/net/mlx5/mlx5.c
> > > +++ b/drivers/net/mlx5/mlx5.c
> > > @@ -865,7 +865,7 @@ static struct rte_pci_driver mlx5_driver = {
> > >  	},
> > >  	.id_table = mlx5_pci_id_map,
> > >  	.probe = mlx5_pci_probe,
> > > -	.drv_flags = RTE_PCI_DRV_INTR_LSC,
> > > +	.drv_flags = RTE_PCI_DRV_INTR_LSC | RTE_PCI_DRV_INTR_RMV,
> > >  };
> > >
> > >  /**
> > > diff --git a/drivers/net/mlx5/mlx5_ethdev.c
> > > b/drivers/net/mlx5/mlx5_ethdev.c index 57f6237..404d8f4 100644
> > > --- a/drivers/net/mlx5/mlx5_ethdev.c
> > > +++ b/drivers/net/mlx5/mlx5_ethdev.c
> > > @@ -1112,47 +1112,75 @@ mlx5_ibv_device_to_pci_addr(const struct
> > > ibv_device *device,  }
> > >
> > >  /**
> > > - * Link status handler.
> > > + * Update the link status.
> > > + * Set alarm if the device link status is inconsistent.
> > 
> > Adding such comment should also comment about the issue this alarm is
> > solving i.e. why the link is inconsistent and why the alarm help to fix the
> > issue.
> > 
> I didn't see any comments about that in the old code , Hence I didn't write it.

That is normal, as the alarm is a workaround specifically necessary for the
Mellanox PMD.  Now that you explicitly announce that this function programs an
alarm, the question is: why is it necessary?

> I think you right and this could be added.(even before this patch).

No, in the current code, it updates the link; if the link is inconsistent, it
tries to get a correct link ASAP.  There is no need to state that this function
will program an alarm; it is an internal implementation detail.

> > >   *
> > >   * @param priv
> > >   *   Pointer to private structure.
> > > - * @param dev
> > > - *   Pointer to the rte_eth_dev structure.
> > >   *
> > >   * @return
> > > - *   Nonzero if the callback process can be called immediately.
> > > + *   Zero if alarm is not set and the link status is consistent.
> > >   */
> > >  static int
> > > -priv_dev_link_status_handler(struct priv *priv, struct rte_eth_dev
> > > *dev)
> > > +priv_link_status_alarm_update(struct priv *priv)
> > 	
> > The old name is more accurate, the fact we need to program an alarm is a
> > work around to get the correct status from ethtool.  If it was possible to avoid
> > it, this alarm would not exists.
> > 
> Probably because of the git +- format and this specific patch you got confuse here.

No I applied your patch and read your code.  You did not understand my
comment.

>[...]

When I read:

>  void
>  mlx5_dev_link_status_handler(void *arg)
>  {
>         struct rte_eth_dev *dev = arg;
>         struct priv *priv = dev->data->dev_private;
>         int ret;
> 
>         priv_lock(priv);
>         assert(priv->pending_alarm == 1);
>         priv->pending_alarm = 0;
> -       ret = priv_dev_link_status_handler(priv, dev);
> +       ret = priv_link_status_alarm_update(priv);
>         priv_unlock(priv);
> -       if (ret)
> +       if (!ret)
>                 _rte_eth_dev_callback_process(dev, RTE_ETH_EVENT_INTR_LSC, NULL,
> -                                             NULL);
> +                       NULL);
>  }

I am expecting to find something related to a link update, but what I see is an
alarm update.  I don't expect to update an alarm but a link.  The names and
actions are inconsistent, i.e. mlx5_dev_link_status_handler() should handle a
link, not an alarm.

I understand there is a need to add more function levels, but
priv_link_status_alarm_update() should be renamed to something like
priv_link_status_update().

Regards,

-- 
Nélio Laranjeiro
6WIND

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH 1/2] net/mlx5: support device removal event
  2017-08-24  7:38     ` Nélio Laranjeiro
@ 2017-08-24 14:33       ` Matan Azrad
  2017-08-25  8:29         ` Nélio Laranjeiro
  0 siblings, 1 reply; 19+ messages in thread
From: Matan Azrad @ 2017-08-24 14:33 UTC (permalink / raw)
  To: Nélio Laranjeiro; +Cc: Adrien Mazarguil, dev

Hi Nelio

> -----Original Message-----
> From: Nélio Laranjeiro [mailto:nelio.laranjeiro@6wind.com]
> Sent: Thursday, August 24, 2017 10:38 AM
> To: Matan Azrad <matan@mellanox.com>
> Cc: Adrien Mazarguil <adrien.mazarguil@6wind.com>; dev@dpdk.org
> Subject: Re: [PATCH 1/2] net/mlx5: support device removal event
> 
> On Wed, Aug 23, 2017 at 07:44:45PM +0000, Matan Azrad wrote:
> > Hi Nelio
> >
> > > -----Original Message-----
> > > From: Nélio Laranjeiro [mailto:nelio.laranjeiro@6wind.com]
> > > Sent: Wednesday, August 23, 2017 12:41 PM
> > > To: Matan Azrad <matan@mellanox.com>
> > > Cc: Adrien Mazarguil <adrien.mazarguil@6wind.com>; dev@dpdk.org
> > > Subject: Re: [PATCH 1/2] net/mlx5: support device removal event
> > >
> > > Hi Matan,
> > >
> > > On Sun, Aug 13, 2017 at 03:25:11PM +0300, Matan Azrad wrote:
> > > > Extend the LSC event handling to support the device removal as well.
> > > > The Verbs library may send several related events, which are
> > > > different from LSC event.
> > > >
> > > > The mlx5 event handling has been made capable of receiving and
> > > > signaling several event types at once.
> > > >
> > > > This support includes next:
> > > > 1. Removal event detection according to the user configuration.
> > > > 2. Calling to all registered mlx5 removal callbacks.
> > > > 3. Capabilities extension to include removal interrupt handling.
> > > >
> > > > Signed-off-by: Matan Azrad <matan@mellanox.com>
> > > > ---
> > > >  drivers/net/mlx5/mlx5.c        |   2 +-
> > > >  drivers/net/mlx5/mlx5_ethdev.c | 100
> > > > +++++++++++++++++++++++++++--------------
> > > >  2 files changed, 68 insertions(+), 34 deletions(-)
> > > >
> > > > Hi
> > > > This patch based on top of last Nelio mlx5 cleanup patches.
> > > >
> > > > diff --git a/drivers/net/mlx5/mlx5.c b/drivers/net/mlx5/mlx5.c
> > > > index
> > > > bd66a7c..1a3d7f1 100644
> > > > --- a/drivers/net/mlx5/mlx5.c
> > > > +++ b/drivers/net/mlx5/mlx5.c
> > > > @@ -865,7 +865,7 @@ static struct rte_pci_driver mlx5_driver = {
> > > >  	},
> > > >  	.id_table = mlx5_pci_id_map,
> > > >  	.probe = mlx5_pci_probe,
> > > > -	.drv_flags = RTE_PCI_DRV_INTR_LSC,
> > > > +	.drv_flags = RTE_PCI_DRV_INTR_LSC | RTE_PCI_DRV_INTR_RMV,
> > > >  };
> > > >
> > > >  /**
> > > > diff --git a/drivers/net/mlx5/mlx5_ethdev.c
> > > > b/drivers/net/mlx5/mlx5_ethdev.c index 57f6237..404d8f4 100644
> > > > --- a/drivers/net/mlx5/mlx5_ethdev.c
> > > > +++ b/drivers/net/mlx5/mlx5_ethdev.c
> > > > @@ -1112,47 +1112,75 @@ mlx5_ibv_device_to_pci_addr(const struct
> > > > ibv_device *device,  }
> > > >
> > > >  /**
> > > > - * Link status handler.
> > > > + * Update the link status.
> > > > + * Set alarm if the device link status is inconsistent.
> > >
> > > Adding such comment should also comment about the issue this alarm
> > > is solving i.e. why the link is inconsistent and why the alarm help
> > > to fix the issue.
> > >
> > I didn't see any comments about that in the old code , Hence I didn't write
> it.
> 
> Normal as the alarm is a work around specifically necessary to Mellanox PMD.
> Now you explicitly announce that this function program an alarm, the
> question is why is it necessary?
> 

> > I think you right and this could be added.(even before this patch).
> 
> No, in the current code, it update the link, if it inconsistent it tries to have a
> link correct ASAP.  There is no need to inform this function will program an
> alarm, it is internal cooking.
> 
> > > >   *
> > > >   * @param priv
> > > >   *   Pointer to private structure.
> > > > - * @param dev
> > > > - *   Pointer to the rte_eth_dev structure.
> > > >   *
> > > >   * @return
> > > > - *   Nonzero if the callback process can be called immediately.
> > > > + *   Zero if alarm is not set and the link status is consistent.
> > > >   */
> > > >  static int
> > > > -priv_dev_link_status_handler(struct priv *priv, struct
> > > > rte_eth_dev
> > > > *dev)
> > > > +priv_link_status_alarm_update(struct priv *priv)
> > >
> > > The old name is more accurate, the fact we need to program an alarm
> > > is a work around to get the correct status from ethtool.  If it was
> > > possible to avoid it, this alarm would not exists.
> > >
> > Probably because of the git +- format and this specific patch you got
> confuse here.
> 
> No I applied your patch and read your code.  You did not understand my
> comment.
>
I thought so because you said "old name" in relation to a new function name :)
 
> >[...]
> 
> When I read:
> 
> >  void
> >  mlx5_dev_link_status_handler(void *arg)  {
> >         struct rte_eth_dev *dev = arg;
> >         struct priv *priv = dev->data->dev_private;
> >         int ret;
> >
> >         priv_lock(priv);
> >         assert(priv->pending_alarm == 1);
> >         priv->pending_alarm = 0;
> > -       ret = priv_dev_link_status_handler(priv, dev);
> > +       ret = priv_link_status_alarm_update(priv);
> >         priv_unlock(priv);
> > -       if (ret)
> > +       if (!ret)
> >                 _rte_eth_dev_callback_process(dev, RTE_ETH_EVENT_INTR_LSC,
> NULL,
> > -                                             NULL);
> > +                       NULL);
> >  }
> 
> I am expecting to find something related to a link update, what I see is an
> alarm update.  I don't expect to update an alarm but a link.  The names and
> action are inconsistent i.e. mlx5_dev_link_status_handler() should handle a
> link not an alarm.
> 
> I understand there is a need to add more function levels, but the
> priv_link_status_alarm_update() should be renamed to something like
> priv_link_status_update().

OK, I think I understand you.

Because the alarm is a workaround, you don't think it should be mentioned
in the function description or the function name
(also, the function's subject should be the link status and not the alarm).
I can agree with you about that,
and I will create a v2 with your suggestion - priv_link_status_update.

The return value description can keep the semantics of the old code:
Zero if the callback process can be called immediately.

Do you agree?

Maybe we can say something about the alarm and the reason for the inconsistency
in this function's description or in an internal comment, for future code review.
If you want that, please suggest a comment.

Thank you.
> 
> Regards,
> 
> --
> Nélio Laranjeiro
> 6WIND

Regards
Matan Azrad

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH 1/2] net/mlx5: support device removal event
  2017-08-24 14:33       ` Matan Azrad
@ 2017-08-25  8:29         ` Nélio Laranjeiro
  2017-08-29  8:30           ` [PATCH v2] " Matan Azrad
  0 siblings, 1 reply; 19+ messages in thread
From: Nélio Laranjeiro @ 2017-08-25  8:29 UTC (permalink / raw)
  To: Matan Azrad; +Cc: Adrien Mazarguil, dev

On Thu, Aug 24, 2017 at 02:33:43PM +0000, Matan Azrad wrote:
> Hi Nelio
>[...] 
> > 
> > I am expecting to find something related to a link update, what I see is an
> > alarm update.  I don't expect to update an alarm but a link.  The names and
> > action are inconsistent i.e. mlx5_dev_link_status_handler() should handle a
> > link not an alarm.
> > 
> > I understand there is a need to add more function levels, but the
> > priv_link_status_alarm_update() should be renamed to something like
> > priv_link_status_update().
> 
> OK, I think I understand you.
> 
> Because the alarm is a workaround you don't think it should be mentioned
> in function description or function name.
> (also the function subject should be the link status and not the alarm)
> I can agree with you about it.
> And I will create v2 with your suggestion - priv_link_status_update.

Thanks,

> The return value description can stay as in old code semantic:
> Zero if the callback process can be called immediately.
> 
> Are you agree?

Yes.

> Maybe we can tell something about the alarm and inconsistent reason
> In this function description or internal comment for future code review.
> If you want it, please suggest comment.

Yes, the comment can be added internally.

Thanks,

-- 
Nélio Laranjeiro
6WIND

^ permalink raw reply	[flat|nested] 19+ messages in thread

* [PATCH v2] net/mlx5: support device removal event
  2017-08-25  8:29         ` Nélio Laranjeiro
@ 2017-08-29  8:30           ` Matan Azrad
  2017-09-04 12:49             ` Nélio Laranjeiro
  0 siblings, 1 reply; 19+ messages in thread
From: Matan Azrad @ 2017-08-29  8:30 UTC (permalink / raw)
  To: Nelio Laranjeiro; +Cc: dev, Adrien Mazarguil

Extend the LSC event handling to support the device removal as well.
The Verbs library may send several related events, which are
different from LSC event.

The mlx5 event handling has been made capable of receiving and
signaling several event types at once.

This support includes next:
1. Removal event detection according to the user configuration.
2. Calling to all registered mlx5 removal callbacks.
3. Capabilities extension to include removal interrupt handling.

Signed-off-by: Matan Azrad <matan@mellanox.com>
---
 drivers/net/mlx5/mlx5.c        |   2 +-
 drivers/net/mlx5/mlx5_ethdev.c | 108 ++++++++++++++++++++++++++++-------------
 2 files changed, 76 insertions(+), 34 deletions(-)

diff --git a/drivers/net/mlx5/mlx5.c b/drivers/net/mlx5/mlx5.c
index bd66a7c..1a3d7f1 100644
--- a/drivers/net/mlx5/mlx5.c
+++ b/drivers/net/mlx5/mlx5.c
@@ -865,7 +865,7 @@ static struct rte_pci_driver mlx5_driver = {
 	},
 	.id_table = mlx5_pci_id_map,
 	.probe = mlx5_pci_probe,
-	.drv_flags = RTE_PCI_DRV_INTR_LSC,
+	.drv_flags = RTE_PCI_DRV_INTR_LSC | RTE_PCI_DRV_INTR_RMV,
 };
 
 /**
diff --git a/drivers/net/mlx5/mlx5_ethdev.c b/drivers/net/mlx5/mlx5_ethdev.c
index 57f6237..2cf7726 100644
--- a/drivers/net/mlx5/mlx5_ethdev.c
+++ b/drivers/net/mlx5/mlx5_ethdev.c
@@ -1112,47 +1112,83 @@ mlx5_ibv_device_to_pci_addr(const struct ibv_device *device,
 }
 
 /**
- * Link status handler.
+ * Update the link status.
  *
  * @param priv
  *   Pointer to private structure.
- * @param dev
- *   Pointer to the rte_eth_dev structure.
  *
  * @return
- *   Nonzero if the callback process can be called immediately.
+ *   Zero if the callback process can be called immediately.
  */
 static int
-priv_dev_link_status_handler(struct priv *priv, struct rte_eth_dev *dev)
+priv_link_status_update(struct priv *priv)
+{
+	struct rte_eth_link *link = &priv->dev->data->dev_link;
+
+	mlx5_link_update(priv->dev, 0);
+	if (((link->link_speed == 0) && link->link_status) ||
+		((link->link_speed != 0) && !link->link_status)) {
+		/*
+		 * Inconsistent status.
+		 * The link status is read from Ethtool through an IOCTL,
+		 * but as the PMD is working in polling mode it gets the port
+		 * event before the Kernel driver had time to process it.
+		 * PMD then request the link from the kernel but the event is
+		 * still not processed (due to more urgent interrupts) and
+		 * finally the PMD may get an inconsistent link.
+		 * Setting alarm for later checking.
+		 */
+		if (!priv->pending_alarm) {
+			priv->pending_alarm = 1;
+			rte_eal_alarm_set(MLX5_ALARM_TIMEOUT_US,
+				mlx5_dev_link_status_handler,
+				priv->dev);
+		}
+		return 1;
+	} else if (unlikely(priv->pending_alarm)) {
+		/* In case of link interrupt while link alarm was setting. */
+		priv->pending_alarm = 0;
+		rte_eal_alarm_cancel(mlx5_dev_link_status_handler, priv->dev);
+	}
+	return 0;
+}
+
+/**
+ * Device status handler.
+ *
+ * @param priv
+ *   Pointer to private structure.
+ * @param events
+ *   Pointer to event flags holder.
+ *
+ * @return
+ *   Events bitmap of callback process which can be called immediately.
+ */
+static uint32_t
+priv_dev_status_handler(struct priv *priv)
 {
 	struct ibv_async_event event;
-	struct rte_eth_link *link = &dev->data->dev_link;
-	int ret = 0;
+	uint32_t ret = 0;
 
 	/* Read all message and acknowledge them. */
 	for (;;) {
 		if (ibv_get_async_event(priv->ctx, &event))
 			break;
-
-		if (event.event_type != IBV_EVENT_PORT_ACTIVE &&
-		    event.event_type != IBV_EVENT_PORT_ERR)
+		if ((event.event_type == IBV_EVENT_PORT_ACTIVE ||
+			event.event_type == IBV_EVENT_PORT_ERR) &&
+			(priv->dev->data->dev_conf.intr_conf.lsc == 1))
+			ret |= (1 << RTE_ETH_EVENT_INTR_LSC);
+		else if (event.event_type == IBV_EVENT_DEVICE_FATAL &&
+			priv->dev->data->dev_conf.intr_conf.rmv == 1)
+			ret |= (1 << RTE_ETH_EVENT_INTR_RMV);
+		else
 			DEBUG("event type %d on port %d not handled",
-			      event.event_type, event.element.port_num);
+				event.event_type, event.element.port_num);
 		ibv_ack_async_event(&event);
 	}
-	mlx5_link_update(dev, 0);
-	if (((link->link_speed == 0) && link->link_status) ||
-	    ((link->link_speed != 0) && !link->link_status)) {
-		if (!priv->pending_alarm) {
-			/* Inconsistent status, check again later. */
-			priv->pending_alarm = 1;
-			rte_eal_alarm_set(MLX5_ALARM_TIMEOUT_US,
-					  mlx5_dev_link_status_handler,
-					  dev);
-		}
-	} else {
-		ret = 1;
-	}
+	if (ret & (1 << RTE_ETH_EVENT_INTR_LSC))
+		if (priv_link_status_update(priv))
+			ret &= ~(1 << RTE_ETH_EVENT_INTR_LSC);
 	return ret;
 }
 
@@ -1172,11 +1208,11 @@ mlx5_dev_link_status_handler(void *arg)
 	priv_lock(priv);
 	assert(priv->pending_alarm == 1);
 	priv->pending_alarm = 0;
-	ret = priv_dev_link_status_handler(priv, dev);
+	ret = priv_link_status_update(priv);
 	priv_unlock(priv);
-	if (ret)
+	if (!ret)
 		_rte_eth_dev_callback_process(dev, RTE_ETH_EVENT_INTR_LSC, NULL,
-					      NULL);
+			NULL);
 }
 
 /**
@@ -1192,14 +1228,17 @@ mlx5_dev_interrupt_handler(void *cb_arg)
 {
 	struct rte_eth_dev *dev = cb_arg;
 	struct priv *priv = dev->data->dev_private;
-	int ret;
+	uint32_t events;
 
 	priv_lock(priv);
-	ret = priv_dev_link_status_handler(priv, dev);
+	events = priv_dev_status_handler(priv);
 	priv_unlock(priv);
-	if (ret)
+	if (events & (1 << RTE_ETH_EVENT_INTR_LSC))
 		_rte_eth_dev_callback_process(dev, RTE_ETH_EVENT_INTR_LSC, NULL,
-					      NULL);
+			NULL);
+	if (events & (1 << RTE_ETH_EVENT_INTR_RMV))
+		_rte_eth_dev_callback_process(dev, RTE_ETH_EVENT_INTR_RMV, NULL,
+			NULL);
 }
 
 /**
@@ -1213,7 +1252,8 @@ mlx5_dev_interrupt_handler(void *cb_arg)
 void
 priv_dev_interrupt_handler_uninstall(struct priv *priv, struct rte_eth_dev *dev)
 {
-	if (!dev->data->dev_conf.intr_conf.lsc)
+	if (!dev->data->dev_conf.intr_conf.lsc &&
+		!dev->data->dev_conf.intr_conf.rmv)
 		return;
 	rte_intr_callback_unregister(&priv->intr_handle,
 				     mlx5_dev_interrupt_handler,
@@ -1238,7 +1278,8 @@ priv_dev_interrupt_handler_install(struct priv *priv, struct rte_eth_dev *dev)
 {
 	int rc, flags;
 
-	if (!dev->data->dev_conf.intr_conf.lsc)
+	if (!dev->data->dev_conf.intr_conf.lsc &&
+		!dev->data->dev_conf.intr_conf.rmv)
 		return;
 	assert(priv->ctx->async_fd > 0);
 	flags = fcntl(priv->ctx->async_fd, F_GETFL);
@@ -1246,6 +1287,7 @@ priv_dev_interrupt_handler_install(struct priv *priv, struct rte_eth_dev *dev)
 	if (rc < 0) {
 		INFO("failed to change file descriptor async event queue");
 		dev->data->dev_conf.intr_conf.lsc = 0;
+		dev->data->dev_conf.intr_conf.rmv = 0;
 	} else {
 		priv->intr_handle.fd = priv->ctx->async_fd;
 		priv->intr_handle.type = RTE_INTR_HANDLE_EXT;
-- 
2.7.4

^ permalink raw reply related	[flat|nested] 19+ messages in thread

* Re: [dpdk-stable] [PATCH 2/2] net/mlx5: fix probe failure report
  2017-08-23  9:44   ` Nélio Laranjeiro
@ 2017-09-01 10:40     ` Ferruh Yigit
  0 siblings, 0 replies; 19+ messages in thread
From: Ferruh Yigit @ 2017-09-01 10:40 UTC (permalink / raw)
  To: Nélio Laranjeiro, Matan Azrad; +Cc: Adrien Mazarguil, dev, stable

On 8/23/2017 10:44 AM, Nélio Laranjeiro wrote:
> On Sun, Aug 13, 2017 at 03:25:12PM +0300, Matan Azrad wrote:
>> The corrupted code doesn't return error when probe function
>> fails due to error in device mac address getting.
>> By this way, the probe function may return success even if the
>> ETH dev is not allocated.
>>
>> Hence, the probe caller, for example failsafe PMD, fails when it
>> tries to get ETH dev after the device was plugged out while mlx5
>> was probing it.
>>
>> The fix adds error report to the probe caller when priv_get_mac fails
>> and in all other failure options which are missing it.
>>
>> By this way, it prevents the unexpected behavior to miss ETH device
>> after the device was probed successfully.
>>
>> This bug was already present in the original code taken from mlx4.
>>
>> Fixes: 771fa900b73a ("mlx5: introduce new driver for Mellanox ConnectX-4 adapters")
>> Fixes: 1371f4df16bc ("mlx5: check port is configured as ethernet device")
>>
>> Signed-off-by: Matan Azrad <matan@mellanox.com>
>> Cc: stable@dpdk.org
> 
> Acked-by: Nelio Laranjeiro <nelio.laranjeiro@6wind.com>

Applied to dpdk-next-net/master, thanks.

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH v2] net/mlx5: support device removal event
  2017-08-29  8:30           ` [PATCH v2] " Matan Azrad
@ 2017-09-04 12:49             ` Nélio Laranjeiro
  2017-09-04 13:55               ` [PATCH v3] " Matan Azrad
  0 siblings, 1 reply; 19+ messages in thread
From: Nélio Laranjeiro @ 2017-09-04 12:49 UTC (permalink / raw)
  To: Matan Azrad; +Cc: dev, Adrien Mazarguil

Hi Matan,

Please see comments below,

On Tue, Aug 29, 2017 at 11:30:44AM +0300, Matan Azrad wrote:
> Extend the LSC event handling to support the device removal as well.
> The Verbs library may send several related events, which are
> different from LSC event.
> 
> The mlx5 event handling has been made capable of receiving and
> signaling several event types at once.
> 
> This support includes next:
> 1. Removal event detection according to the user configuration.
> 2. Calling to all registered mlx5 removal callbacks.
> 3. Capabilities extension to include removal interrupt handling.
> 
> Signed-off-by: Matan Azrad <matan@mellanox.com>
> ---
>  drivers/net/mlx5/mlx5.c        |   2 +-
>  drivers/net/mlx5/mlx5_ethdev.c | 108 ++++++++++++++++++++++++++++-------------
>  2 files changed, 76 insertions(+), 34 deletions(-)
> 
> diff --git a/drivers/net/mlx5/mlx5.c b/drivers/net/mlx5/mlx5.c
> index bd66a7c..1a3d7f1 100644
> --- a/drivers/net/mlx5/mlx5.c
> +++ b/drivers/net/mlx5/mlx5.c
> @@ -865,7 +865,7 @@ static struct rte_pci_driver mlx5_driver = {
>  	},
>  	.id_table = mlx5_pci_id_map,
>  	.probe = mlx5_pci_probe,
> -	.drv_flags = RTE_PCI_DRV_INTR_LSC,
> +	.drv_flags = RTE_PCI_DRV_INTR_LSC | RTE_PCI_DRV_INTR_RMV,
>  };
>  
>  /**
> diff --git a/drivers/net/mlx5/mlx5_ethdev.c b/drivers/net/mlx5/mlx5_ethdev.c
> index 57f6237..2cf7726 100644
> --- a/drivers/net/mlx5/mlx5_ethdev.c
> +++ b/drivers/net/mlx5/mlx5_ethdev.c
> @@ -1112,47 +1112,83 @@ mlx5_ibv_device_to_pci_addr(const struct ibv_device *device,
>  }
>  
>  /**
> - * Link status handler.
> + * Update the link status.
>   *
>   * @param priv
>   *   Pointer to private structure.
> - * @param dev
> - *   Pointer to the rte_eth_dev structure.
>   *
>   * @return
> - *   Nonzero if the callback process can be called immediately.
> + *   Zero if the callback process can be called immediately.
>   */
>  static int
> -priv_dev_link_status_handler(struct priv *priv, struct rte_eth_dev *dev)
> +priv_link_status_update(struct priv *priv)
> +{
> +	struct rte_eth_link *link = &priv->dev->data->dev_link;
> +
> +	mlx5_link_update(priv->dev, 0);
> +	if (((link->link_speed == 0) && link->link_status) ||
> +		((link->link_speed != 0) && !link->link_status)) {
> +		/*
> +		 * Inconsistent status.
> +		 * The link status is read from Ethtool through an IOCTL,
> +		 * but as the PMD is working in polling mode it gets the port
> +		 * event before the Kernel driver had time to process it.
> +		 * PMD then request the link from the kernel but the event is
> +		 * still not processed (due to more urgent interrupts) and
> +		 * finally the PMD may get an inconsistent link.
> +		 * Setting alarm for later checking.
> +		 */

This comment is not totally correct: it is not the PMD which handles the LSC
event, but the application.  In testpmd the LSC events are polled in a loop,
but there is no guarantee that another application will do the same.
The remaining part is correct.

> +		if (!priv->pending_alarm) {
> +			priv->pending_alarm = 1;
> +			rte_eal_alarm_set(MLX5_ALARM_TIMEOUT_US,
> +				mlx5_dev_link_status_handler,
> +				priv->dev);

Indentation.

> +		}
> +		return 1;
> +	} else if (unlikely(priv->pending_alarm)) {
> +		/* In case of link interrupt while link alarm was setting. */
> +		priv->pending_alarm = 0;
> +		rte_eal_alarm_cancel(mlx5_dev_link_status_handler, priv->dev);
> +	}
> +	return 0;
> +}
> +
> +/**
> + * Device status handler.
> + *
> + * @param priv
> + *   Pointer to private structure.
> + * @param events
> + *   Pointer to event flags holder.
> + *
> + * @return
> + *   Events bitmap of callback process which can be called immediately.
> + */
> +static uint32_t
> +priv_dev_status_handler(struct priv *priv)
>  {
>  	struct ibv_async_event event;
> -	struct rte_eth_link *link = &dev->data->dev_link;
> -	int ret = 0;
> +	uint32_t ret = 0;
>  
>  	/* Read all message and acknowledge them. */
>  	for (;;) {
>  		if (ibv_get_async_event(priv->ctx, &event))
>  			break;
> -
> -		if (event.event_type != IBV_EVENT_PORT_ACTIVE &&
> -		    event.event_type != IBV_EVENT_PORT_ERR)
> +		if ((event.event_type == IBV_EVENT_PORT_ACTIVE ||
> +			event.event_type == IBV_EVENT_PORT_ERR) &&
> +			(priv->dev->data->dev_conf.intr_conf.lsc == 1))
> +			ret |= (1 << RTE_ETH_EVENT_INTR_LSC);
> +		else if (event.event_type == IBV_EVENT_DEVICE_FATAL &&
> +			priv->dev->data->dev_conf.intr_conf.rmv == 1)
> +			ret |= (1 << RTE_ETH_EVENT_INTR_RMV);
> +		else
>  			DEBUG("event type %d on port %d not handled",
> -			      event.event_type, event.element.port_num);
> +				event.event_type, event.element.port_num);

Why is this line modified?  The indentation was correct and nothing else has
been modified.

>  		ibv_ack_async_event(&event);
>  	}
> -	mlx5_link_update(dev, 0);
> -	if (((link->link_speed == 0) && link->link_status) ||
> -	    ((link->link_speed != 0) && !link->link_status)) {
> -		if (!priv->pending_alarm) {
> -			/* Inconsistent status, check again later. */
> -			priv->pending_alarm = 1;
> -			rte_eal_alarm_set(MLX5_ALARM_TIMEOUT_US,
> -					  mlx5_dev_link_status_handler,
> -					  dev);
> -		}
> -	} else {
> -		ret = 1;
> -	}
> +	if (ret & (1 << RTE_ETH_EVENT_INTR_LSC))
> +		if (priv_link_status_update(priv))
> +			ret &= ~(1 << RTE_ETH_EVENT_INTR_LSC);
>  	return ret;
>  }
>  
> @@ -1172,11 +1208,11 @@ mlx5_dev_link_status_handler(void *arg)
>  	priv_lock(priv);
>  	assert(priv->pending_alarm == 1);
>  	priv->pending_alarm = 0;
> -	ret = priv_dev_link_status_handler(priv, dev);
> +	ret = priv_link_status_update(priv);
>  	priv_unlock(priv);
> -	if (ret)
> +	if (!ret)
>  		_rte_eth_dev_callback_process(dev, RTE_ETH_EVENT_INTR_LSC, NULL,
> -					      NULL);
> +			NULL);

Same as above, why is this line modified?

>  }
>  
>  /**
> @@ -1192,14 +1228,17 @@ mlx5_dev_interrupt_handler(void *cb_arg)
>  {
>  	struct rte_eth_dev *dev = cb_arg;
>  	struct priv *priv = dev->data->dev_private;
> -	int ret;
> +	uint32_t events;
>  
>  	priv_lock(priv);
> -	ret = priv_dev_link_status_handler(priv, dev);
> +	events = priv_dev_status_handler(priv);
>  	priv_unlock(priv);
> -	if (ret)
> +	if (events & (1 << RTE_ETH_EVENT_INTR_LSC))
>  		_rte_eth_dev_callback_process(dev, RTE_ETH_EVENT_INTR_LSC, NULL,
> -					      NULL);
> +			NULL);

Same question here,

> +	if (events & (1 << RTE_ETH_EVENT_INTR_RMV))
> +		_rte_eth_dev_callback_process(dev, RTE_ETH_EVENT_INTR_RMV, NULL,
> +			NULL);

and here.

Thanks,

-- 
Nélio Laranjeiro
6WIND

^ permalink raw reply	[flat|nested] 19+ messages in thread

* [PATCH v3] net/mlx5: support device removal event
  2017-09-04 12:49             ` Nélio Laranjeiro
@ 2017-09-04 13:55               ` Matan Azrad
  2017-09-04 15:33                 ` Adrien Mazarguil
  0 siblings, 1 reply; 19+ messages in thread
From: Matan Azrad @ 2017-09-04 13:55 UTC (permalink / raw)
  To: Nelio Laranjeiro; +Cc: dev

Extend the LSC event handling to support the device removal as well.
The Verbs library may send several related events, which are
different from LSC event.

The mlx5 event handling has been made capable of receiving and
signaling several event types at once.

This support includes next:
1. Removal event detection according to the user configuration.
2. Calling to all registered mlx5 removal callbacks.
3. Capabilities extension to include removal interrupt handling.

Signed-off-by: Matan Azrad <matan@mellanox.com>
---
 drivers/net/mlx5/mlx5.c        |   2 +-
 drivers/net/mlx5/mlx5_ethdev.c | 103 +++++++++++++++++++++++++++++------------
 2 files changed, 74 insertions(+), 31 deletions(-)

Changes:
V2:
Rename the link status update function.
Add a comment about the inconsistent link workaround.

V3:
Fix indentations.
Make the inconsistent link comment more accurate.


diff --git a/drivers/net/mlx5/mlx5.c b/drivers/net/mlx5/mlx5.c
index bd66a7c..1a3d7f1 100644
--- a/drivers/net/mlx5/mlx5.c
+++ b/drivers/net/mlx5/mlx5.c
@@ -865,7 +865,7 @@ static struct rte_pci_driver mlx5_driver = {
 	},
 	.id_table = mlx5_pci_id_map,
 	.probe = mlx5_pci_probe,
-	.drv_flags = RTE_PCI_DRV_INTR_LSC,
+	.drv_flags = RTE_PCI_DRV_INTR_LSC | RTE_PCI_DRV_INTR_RMV,
 };
 
 /**
diff --git a/drivers/net/mlx5/mlx5_ethdev.c b/drivers/net/mlx5/mlx5_ethdev.c
index 57f6237..cdbd723 100644
--- a/drivers/net/mlx5/mlx5_ethdev.c
+++ b/drivers/net/mlx5/mlx5_ethdev.c
@@ -1112,47 +1112,84 @@ mlx5_ibv_device_to_pci_addr(const struct ibv_device *device,
 }
 
 /**
- * Link status handler.
+ * Update the link status.
  *
  * @param priv
  *   Pointer to private structure.
- * @param dev
- *   Pointer to the rte_eth_dev structure.
  *
  * @return
- *   Nonzero if the callback process can be called immediately.
+ *   Zero if the callback process can be called immediately.
  */
 static int
-priv_dev_link_status_handler(struct priv *priv, struct rte_eth_dev *dev)
+priv_link_status_update(struct priv *priv)
+{
+	struct rte_eth_link *link = &priv->dev->data->dev_link;
+
+	mlx5_link_update(priv->dev, 0);
+	if (((link->link_speed == 0) && link->link_status) ||
+		((link->link_speed != 0) && !link->link_status)) {
+		/*
+		 * Inconsistent status.
+		 * The link status is read from Ethtool through an IOCTL,
+		 * but as the application may work in polling mode it
+		 * may get the port event before the Kernel driver had
+		 * time to process it. PMD then request the link from
+		 * the kernel but the event is still not processed (due
+		 * to more urgent interrupts) and finally the PMD may
+		 * get an inconsistent link.
+		 * Setting alarm for later checking.
+		 */
+		if (!priv->pending_alarm) {
+			priv->pending_alarm = 1;
+			rte_eal_alarm_set(MLX5_ALARM_TIMEOUT_US,
+					  mlx5_dev_link_status_handler,
+					  priv->dev);
+		}
+		return 1;
+	} else if (unlikely(priv->pending_alarm)) {
+		/* In case of link interrupt while link alarm was setting. */
+		priv->pending_alarm = 0;
+		rte_eal_alarm_cancel(mlx5_dev_link_status_handler, priv->dev);
+	}
+	return 0;
+}
+
+/**
+ * Device status handler.
+ *
+ * @param priv
+ *   Pointer to private structure.
+ * @param events
+ *   Pointer to event flags holder.
+ *
+ * @return
+ *   Events bitmap of callback process which can be called immediately.
+ */
+static uint32_t
+priv_dev_status_handler(struct priv *priv)
 {
 	struct ibv_async_event event;
-	struct rte_eth_link *link = &dev->data->dev_link;
-	int ret = 0;
+	uint32_t ret = 0;
 
 	/* Read all message and acknowledge them. */
 	for (;;) {
 		if (ibv_get_async_event(priv->ctx, &event))
 			break;
-
-		if (event.event_type != IBV_EVENT_PORT_ACTIVE &&
-		    event.event_type != IBV_EVENT_PORT_ERR)
+		if ((event.event_type == IBV_EVENT_PORT_ACTIVE ||
+			event.event_type == IBV_EVENT_PORT_ERR) &&
+			(priv->dev->data->dev_conf.intr_conf.lsc == 1))
+			ret |= (1 << RTE_ETH_EVENT_INTR_LSC);
+		else if (event.event_type == IBV_EVENT_DEVICE_FATAL &&
+			priv->dev->data->dev_conf.intr_conf.rmv == 1)
+			ret |= (1 << RTE_ETH_EVENT_INTR_RMV);
+		else
 			DEBUG("event type %d on port %d not handled",
 			      event.event_type, event.element.port_num);
 		ibv_ack_async_event(&event);
 	}
-	mlx5_link_update(dev, 0);
-	if (((link->link_speed == 0) && link->link_status) ||
-	    ((link->link_speed != 0) && !link->link_status)) {
-		if (!priv->pending_alarm) {
-			/* Inconsistent status, check again later. */
-			priv->pending_alarm = 1;
-			rte_eal_alarm_set(MLX5_ALARM_TIMEOUT_US,
-					  mlx5_dev_link_status_handler,
-					  dev);
-		}
-	} else {
-		ret = 1;
-	}
+	if (ret & (1 << RTE_ETH_EVENT_INTR_LSC))
+		if (priv_link_status_update(priv))
+			ret &= ~(1 << RTE_ETH_EVENT_INTR_LSC);
 	return ret;
 }
 
@@ -1172,9 +1209,9 @@ mlx5_dev_link_status_handler(void *arg)
 	priv_lock(priv);
 	assert(priv->pending_alarm == 1);
 	priv->pending_alarm = 0;
-	ret = priv_dev_link_status_handler(priv, dev);
+	ret = priv_link_status_update(priv);
 	priv_unlock(priv);
-	if (ret)
+	if (!ret)
 		_rte_eth_dev_callback_process(dev, RTE_ETH_EVENT_INTR_LSC, NULL,
 					      NULL);
 }
@@ -1192,14 +1229,17 @@ mlx5_dev_interrupt_handler(void *cb_arg)
 {
 	struct rte_eth_dev *dev = cb_arg;
 	struct priv *priv = dev->data->dev_private;
-	int ret;
+	uint32_t events;
 
 	priv_lock(priv);
-	ret = priv_dev_link_status_handler(priv, dev);
+	events = priv_dev_status_handler(priv);
 	priv_unlock(priv);
-	if (ret)
+	if (events & (1 << RTE_ETH_EVENT_INTR_LSC))
 		_rte_eth_dev_callback_process(dev, RTE_ETH_EVENT_INTR_LSC, NULL,
 					      NULL);
+	if (events & (1 << RTE_ETH_EVENT_INTR_RMV))
+		_rte_eth_dev_callback_process(dev, RTE_ETH_EVENT_INTR_RMV, NULL,
+					      NULL);
 }
 
 /**
@@ -1213,7 +1253,8 @@ mlx5_dev_interrupt_handler(void *cb_arg)
 void
 priv_dev_interrupt_handler_uninstall(struct priv *priv, struct rte_eth_dev *dev)
 {
-	if (!dev->data->dev_conf.intr_conf.lsc)
+	if (!dev->data->dev_conf.intr_conf.lsc &&
+		!dev->data->dev_conf.intr_conf.rmv)
 		return;
 	rte_intr_callback_unregister(&priv->intr_handle,
 				     mlx5_dev_interrupt_handler,
@@ -1238,7 +1279,8 @@ priv_dev_interrupt_handler_install(struct priv *priv, struct rte_eth_dev *dev)
 {
 	int rc, flags;
 
-	if (!dev->data->dev_conf.intr_conf.lsc)
+	if (!dev->data->dev_conf.intr_conf.lsc &&
+		!dev->data->dev_conf.intr_conf.rmv)
 		return;
 	assert(priv->ctx->async_fd > 0);
 	flags = fcntl(priv->ctx->async_fd, F_GETFL);
@@ -1246,6 +1288,7 @@ priv_dev_interrupt_handler_install(struct priv *priv, struct rte_eth_dev *dev)
 	if (rc < 0) {
 		INFO("failed to change file descriptor async event queue");
 		dev->data->dev_conf.intr_conf.lsc = 0;
+		dev->data->dev_conf.intr_conf.rmv = 0;
 	} else {
 		priv->intr_handle.fd = priv->ctx->async_fd;
 		priv->intr_handle.type = RTE_INTR_HANDLE_EXT;
-- 
2.7.4

^ permalink raw reply related	[flat|nested] 19+ messages in thread

* Re: [PATCH v3] net/mlx5: support device removal event
  2017-09-04 13:55               ` [PATCH v3] " Matan Azrad
@ 2017-09-04 15:33                 ` Adrien Mazarguil
  2017-09-04 17:52                   ` Matan Azrad
  0 siblings, 1 reply; 19+ messages in thread
From: Adrien Mazarguil @ 2017-09-04 15:33 UTC (permalink / raw)
  To: Matan Azrad; +Cc: Nelio Laranjeiro, dev

Hi Matan,

One comment I have is, while this patch adds support for RMV, it also
silently addresses a bug (see large comment you added to
priv_link_status_update()).

This should be split in two commits, with the fix part coming first and CC
stable@dpdk.org, and a second commit adding RMV support proper.

More below.

On Mon, Sep 04, 2017 at 04:55:53PM +0300, Matan Azrad wrote:
> Extend the LSC event handling to support the device removal as well.
> The Verbs library may send several related events, which are
> different from LSC event.
> 
> The mlx5 event handling has been made capable of receiving and
> signaling several event types at once.
> 
> This support includes next:
> 1. Removal event detection according to the user configuration.
> 2. Calling to all registered mlx5 removal callbacks.
> 3. Capabilities extension to include removal interrupt handling.
> 
> Signed-off-by: Matan Azrad <matan@mellanox.com>
> ---
>  drivers/net/mlx5/mlx5.c        |   2 +-
>  drivers/net/mlx5/mlx5_ethdev.c | 103 +++++++++++++++++++++++++++++------------
>  2 files changed, 74 insertions(+), 31 deletions(-)
> 
> Changes:
> V2:
> Replace link status update function name.
> add inconsistent link workaround comment.
> 
> V3:
> Fix indentations.
> Accurate inconsistent link comment.
> 
> 
> diff --git a/drivers/net/mlx5/mlx5.c b/drivers/net/mlx5/mlx5.c
> index bd66a7c..1a3d7f1 100644
> --- a/drivers/net/mlx5/mlx5.c
> +++ b/drivers/net/mlx5/mlx5.c
> @@ -865,7 +865,7 @@ static struct rte_pci_driver mlx5_driver = {
>  	},
>  	.id_table = mlx5_pci_id_map,
>  	.probe = mlx5_pci_probe,
> -	.drv_flags = RTE_PCI_DRV_INTR_LSC,
> +	.drv_flags = RTE_PCI_DRV_INTR_LSC | RTE_PCI_DRV_INTR_RMV,
>  };
>  
>  /**
> diff --git a/drivers/net/mlx5/mlx5_ethdev.c b/drivers/net/mlx5/mlx5_ethdev.c
> index 57f6237..cdbd723 100644
> --- a/drivers/net/mlx5/mlx5_ethdev.c
> +++ b/drivers/net/mlx5/mlx5_ethdev.c
> @@ -1112,47 +1112,84 @@ mlx5_ibv_device_to_pci_addr(const struct ibv_device *device,
>  }
>  
>  /**
> - * Link status handler.
> + * Update the link status.
>   *
>   * @param priv
>   *   Pointer to private structure.
> - * @param dev
> - *   Pointer to the rte_eth_dev structure.
>   *
>   * @return
> - *   Nonzero if the callback process can be called immediately.
> + *   Zero if the callback process can be called immediately.
>   */
>  static int
> -priv_dev_link_status_handler(struct priv *priv, struct rte_eth_dev *dev)
> +priv_link_status_update(struct priv *priv)
> +{
> +	struct rte_eth_link *link = &priv->dev->data->dev_link;
> +
> +	mlx5_link_update(priv->dev, 0);
> +	if (((link->link_speed == 0) && link->link_status) ||
> +		((link->link_speed != 0) && !link->link_status)) {
> +		/*
> +		 * Inconsistent status.
> +		 * The link status is read from Ethtool through an IOCTL,
> +		 * but as the application may work in polling mode it
> +		 * may get the port event before the Kernel driver had
> +		 * time to process it. PMD then request the link from
> +		 * the kernel but the event is still not processed (due
> +		 * to more urgent interrupts) and finally the PMD may
> +		 * get an inconsistent link.
> +		 * Setting alarm for later checking.
> +		 */

While adding a comment is nice, there's too much info in there. From the PMD
standpoint, what happens is the interrupt occurs much before the kernel
netdevice exposes the new status, so it needs to be checked later. Can you
sum it up in fewer words?

> +		if (!priv->pending_alarm) {
> +			priv->pending_alarm = 1;
> +			rte_eal_alarm_set(MLX5_ALARM_TIMEOUT_US,
> +					  mlx5_dev_link_status_handler,
> +					  priv->dev);
> +		}
> +		return 1;
> +	} else if (unlikely(priv->pending_alarm)) {
> +		/* In case of link interrupt while link alarm was setting. */
> +		priv->pending_alarm = 0;
> +		rte_eal_alarm_cancel(mlx5_dev_link_status_handler, priv->dev);
> +	}
> +	return 0;
> +}
> +
> +/**
> + * Device status handler.
> + *
> + * @param priv
> + *   Pointer to private structure.
> + * @param events
> + *   Pointer to event flags holder.
> + *
> + * @return
> + *   Events bitmap of callback process which can be called immediately.
> + */
> +static uint32_t
> +priv_dev_status_handler(struct priv *priv)
>  {
>  	struct ibv_async_event event;
> -	struct rte_eth_link *link = &dev->data->dev_link;
> -	int ret = 0;
> +	uint32_t ret = 0;
>  
>  	/* Read all message and acknowledge them. */
>  	for (;;) {
>  		if (ibv_get_async_event(priv->ctx, &event))
>  			break;
> -
> -		if (event.event_type != IBV_EVENT_PORT_ACTIVE &&
> -		    event.event_type != IBV_EVENT_PORT_ERR)
> +		if ((event.event_type == IBV_EVENT_PORT_ACTIVE ||
> +			event.event_type == IBV_EVENT_PORT_ERR) &&
> +			(priv->dev->data->dev_conf.intr_conf.lsc == 1))
> +			ret |= (1 << RTE_ETH_EVENT_INTR_LSC);
> +		else if (event.event_type == IBV_EVENT_DEVICE_FATAL &&
> +			priv->dev->data->dev_conf.intr_conf.rmv == 1)
> +			ret |= (1 << RTE_ETH_EVENT_INTR_RMV);
> +		else
>  			DEBUG("event type %d on port %d not handled",
>  			      event.event_type, event.element.port_num);

What you also need to mention in the commit log of the fix is that splitting
priv_dev_status_handler() and priv_link_status_update() addresses another
bug here: this loop consumed *all* events, even during alarms. An alarm
occurring for a LSC event could eat a RMV event that the application would
never receive. This also affects mlx4, for which I intend to submit a fix
soon.

>  		ibv_ack_async_event(&event);
>  	}
> -	mlx5_link_update(dev, 0);
> -	if (((link->link_speed == 0) && link->link_status) ||
> -	    ((link->link_speed != 0) && !link->link_status)) {
> -		if (!priv->pending_alarm) {
> -			/* Inconsistent status, check again later. */
> -			priv->pending_alarm = 1;
> -			rte_eal_alarm_set(MLX5_ALARM_TIMEOUT_US,
> -					  mlx5_dev_link_status_handler,
> -					  dev);
> -		}
> -	} else {
> -		ret = 1;
> -	}
> +	if (ret & (1 << RTE_ETH_EVENT_INTR_LSC))
> +		if (priv_link_status_update(priv))
> +			ret &= ~(1 << RTE_ETH_EVENT_INTR_LSC);
>  	return ret;
>  }
>  
> @@ -1172,9 +1209,9 @@ mlx5_dev_link_status_handler(void *arg)
>  	priv_lock(priv);
>  	assert(priv->pending_alarm == 1);
>  	priv->pending_alarm = 0;
> -	ret = priv_dev_link_status_handler(priv, dev);
> +	ret = priv_link_status_update(priv);
>  	priv_unlock(priv);
> -	if (ret)
> +	if (!ret)
>  		_rte_eth_dev_callback_process(dev, RTE_ETH_EVENT_INTR_LSC, NULL,
>  					      NULL);
>  }
> @@ -1192,14 +1229,17 @@ mlx5_dev_interrupt_handler(void *cb_arg)
>  {
>  	struct rte_eth_dev *dev = cb_arg;
>  	struct priv *priv = dev->data->dev_private;
> -	int ret;
> +	uint32_t events;
>  
>  	priv_lock(priv);
> -	ret = priv_dev_link_status_handler(priv, dev);
> +	events = priv_dev_status_handler(priv);
>  	priv_unlock(priv);
> -	if (ret)
> +	if (events & (1 << RTE_ETH_EVENT_INTR_LSC))
>  		_rte_eth_dev_callback_process(dev, RTE_ETH_EVENT_INTR_LSC, NULL,
>  					      NULL);
> +	if (events & (1 << RTE_ETH_EVENT_INTR_RMV))
> +		_rte_eth_dev_callback_process(dev, RTE_ETH_EVENT_INTR_RMV, NULL,
> +					      NULL);
>  }
>  
>  /**
> @@ -1213,7 +1253,8 @@ mlx5_dev_interrupt_handler(void *cb_arg)
>  void
>  priv_dev_interrupt_handler_uninstall(struct priv *priv, struct rte_eth_dev *dev)
>  {
> -	if (!dev->data->dev_conf.intr_conf.lsc)
> +	if (!dev->data->dev_conf.intr_conf.lsc &&
> +		!dev->data->dev_conf.intr_conf.rmv)
>  		return;
>  	rte_intr_callback_unregister(&priv->intr_handle,
>  				     mlx5_dev_interrupt_handler,
> @@ -1238,7 +1279,8 @@ priv_dev_interrupt_handler_install(struct priv *priv, struct rte_eth_dev *dev)
>  {
>  	int rc, flags;
>  
> -	if (!dev->data->dev_conf.intr_conf.lsc)
> +	if (!dev->data->dev_conf.intr_conf.lsc &&
> +		!dev->data->dev_conf.intr_conf.rmv)
>  		return;
>  	assert(priv->ctx->async_fd > 0);
>  	flags = fcntl(priv->ctx->async_fd, F_GETFL);
> @@ -1246,6 +1288,7 @@ priv_dev_interrupt_handler_install(struct priv *priv, struct rte_eth_dev *dev)
>  	if (rc < 0) {
>  		INFO("failed to change file descriptor async event queue");
>  		dev->data->dev_conf.intr_conf.lsc = 0;
> +		dev->data->dev_conf.intr_conf.rmv = 0;
>  	} else {
>  		priv->intr_handle.fd = priv->ctx->async_fd;
>  		priv->intr_handle.type = RTE_INTR_HANDLE_EXT;
> -- 
> 2.7.4
> 

-- 
Adrien Mazarguil
6WIND

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH v3] net/mlx5: support device removal event
  2017-09-04 15:33                 ` Adrien Mazarguil
@ 2017-09-04 17:52                   ` Matan Azrad
  2017-09-05  9:28                     ` Adrien Mazarguil
  0 siblings, 1 reply; 19+ messages in thread
From: Matan Azrad @ 2017-09-04 17:52 UTC (permalink / raw)
  To: Adrien Mazarguil; +Cc: Nélio Laranjeiro, dev

Hi Adrien,

> -----Original Message-----
> From: Adrien Mazarguil [mailto:adrien.mazarguil@6wind.com]
> Sent: Monday, September 4, 2017 6:33 PM
> To: Matan Azrad <matan@mellanox.com>
> Cc: Nélio Laranjeiro <nelio.laranjeiro@6wind.com>; dev@dpdk.org
> Subject: Re: [dpdk-dev] [PATCH v3] net/mlx5: support device removal event
> 
> Hi Matan,
> 
> One comment I have is, while this patch adds support for RMV, it also silently
> addresses a bug (see large comment you added to
> priv_link_status_update()).
> 
> This should be split in two commits, with the fix part coming first and CC
> stable@dpdk.org, and a second commit adding RMV support proper.
> 

Actually, the mlx4 bug did not appear in the previous mlx5 code,
probably because the RMV interrupt was not implemented in mlx5 before this patch.
The big comment just explains the inconsistent link issue and was added
here since Nelio and I think the new function, priv_link_status_update(),
justifies this comment for future review.

> More below.
> 
> On Mon, Sep 04, 2017 at 04:55:53PM +0300, Matan Azrad wrote:
> > Extend the LSC event handling to support the device removal as well.
> > The Verbs library may send several related events, which are different
> > from LSC event.
> >
> > The mlx5 event handling has been made capable of receiving and
> > signaling several event types at once.
> >
> > This support includes next:
> > 1. Removal event detection according to the user configuration.
> > 2. Calling to all registered mlx5 removal callbacks.
> > 3. Capabilities extension to include removal interrupt handling.
> >
> > Signed-off-by: Matan Azrad <matan@mellanox.com>
> > ---
> >  drivers/net/mlx5/mlx5.c        |   2 +-
> >  drivers/net/mlx5/mlx5_ethdev.c | 103
> > +++++++++++++++++++++++++++++------------
> >  2 files changed, 74 insertions(+), 31 deletions(-)
> >
> > Changes:
> > V2:
> > Replace link status update function name.
> > add inconsistent link workaround comment.
> >
> > V3:
> > Fix indentations.
> > Accurate inconsistent link comment.
> >
> >
> > diff --git a/drivers/net/mlx5/mlx5.c b/drivers/net/mlx5/mlx5.c index
> > bd66a7c..1a3d7f1 100644
> > --- a/drivers/net/mlx5/mlx5.c
> > +++ b/drivers/net/mlx5/mlx5.c
> > @@ -865,7 +865,7 @@ static struct rte_pci_driver mlx5_driver = {
> >  	},
> >  	.id_table = mlx5_pci_id_map,
> >  	.probe = mlx5_pci_probe,
> > -	.drv_flags = RTE_PCI_DRV_INTR_LSC,
> > +	.drv_flags = RTE_PCI_DRV_INTR_LSC | RTE_PCI_DRV_INTR_RMV,
> >  };
> >
> >  /**
> > diff --git a/drivers/net/mlx5/mlx5_ethdev.c
> > b/drivers/net/mlx5/mlx5_ethdev.c index 57f6237..cdbd723 100644
> > --- a/drivers/net/mlx5/mlx5_ethdev.c
> > +++ b/drivers/net/mlx5/mlx5_ethdev.c
> > @@ -1112,47 +1112,84 @@ mlx5_ibv_device_to_pci_addr(const struct
> > ibv_device *device,  }
> >
> >  /**
> > - * Link status handler.
> > + * Update the link status.
> >   *
> >   * @param priv
> >   *   Pointer to private structure.
> > - * @param dev
> > - *   Pointer to the rte_eth_dev structure.
> >   *
> >   * @return
> > - *   Nonzero if the callback process can be called immediately.
> > + *   Zero if the callback process can be called immediately.
> >   */
> >  static int
> > -priv_dev_link_status_handler(struct priv *priv, struct rte_eth_dev
> > *dev)
> > +priv_link_status_update(struct priv *priv) {
> > +	struct rte_eth_link *link = &priv->dev->data->dev_link;
> > +
> > +	mlx5_link_update(priv->dev, 0);
> > +	if (((link->link_speed == 0) && link->link_status) ||
> > +		((link->link_speed != 0) && !link->link_status)) {
> > +		/*
> > +		 * Inconsistent status.
> > +		 * The link status is read from Ethtool through an IOCTL,
> > +		 * but as the application may work in polling mode it
> > +		 * may get the port event before the Kernel driver had
> > +		 * time to process it. PMD then request the link from
> > +		 * the kernel but the event is still not processed (due
> > +		 * to more urgent interrupts) and finally the PMD may
> > +		 * get an inconsistent link.
> > +		 * Setting alarm for later checking.
> > +		 */
> 
> While adding a comment is nice, there's too much info in there. From the
> PMD standpoint, what happens is the interrupt occurs much before the
> kernel netdevice exposes the new status, so it needs to be checked later.
> Can you sum it up in fewer words?
> 

Yes, sure :)

> > +		if (!priv->pending_alarm) {
> > +			priv->pending_alarm = 1;
> > +			rte_eal_alarm_set(MLX5_ALARM_TIMEOUT_US,
> > +					  mlx5_dev_link_status_handler,
> > +					  priv->dev);
> > +		}
> > +		return 1;
> > +	} else if (unlikely(priv->pending_alarm)) {
> > +		/* In case of link interrupt while link alarm was setting. */
> > +		priv->pending_alarm = 0;
> > +		rte_eal_alarm_cancel(mlx5_dev_link_status_handler, priv-
> >dev);
> > +	}
> > +	return 0;
> > +}
> > +
> > +/**
> > + * Device status handler.
> > + *
> > + * @param priv
> > + *   Pointer to private structure.
> > + * @param events
> > + *   Pointer to event flags holder.
> > + *
> > + * @return
> > + *   Events bitmap of callback process which can be called immediately.
> > + */
> > +static uint32_t
> > +priv_dev_status_handler(struct priv *priv)
> >  {
> >  	struct ibv_async_event event;
> > -	struct rte_eth_link *link = &dev->data->dev_link;
> > -	int ret = 0;
> > +	uint32_t ret = 0;
> >
> >  	/* Read all message and acknowledge them. */
> >  	for (;;) {
> >  		if (ibv_get_async_event(priv->ctx, &event))
> >  			break;
> > -
> > -		if (event.event_type != IBV_EVENT_PORT_ACTIVE &&
> > -		    event.event_type != IBV_EVENT_PORT_ERR)
> > +		if ((event.event_type == IBV_EVENT_PORT_ACTIVE ||
> > +			event.event_type == IBV_EVENT_PORT_ERR) &&
> > +			(priv->dev->data->dev_conf.intr_conf.lsc == 1))
> > +			ret |= (1 << RTE_ETH_EVENT_INTR_LSC);
> > +		else if (event.event_type == IBV_EVENT_DEVICE_FATAL &&
> > +			priv->dev->data->dev_conf.intr_conf.rmv == 1)
> > +			ret |= (1 << RTE_ETH_EVENT_INTR_RMV);
> > +		else
> >  			DEBUG("event type %d on port %d not handled",
> >  			      event.event_type, event.element.port_num);
> 
> What you also need to mention in the commit log of the fix is that splitting
> priv_dev_status_handler() and priv_link_status_update() addresses another
> bug here: this loop consumed *all* events, even during alarms. An alarm
> occurring for a LSC event could eat a RMV event that the application would
> never receive. This also affects mlx4, for which I intend to submit a fix soon.
> 

I think this issue is also only an mlx4 bug.
Since only the LSC event was supported in the previous mlx5 code,
none of these problems existed there.

> >  		ibv_ack_async_event(&event);
> >  	}
> > -	mlx5_link_update(dev, 0);
> > -	if (((link->link_speed == 0) && link->link_status) ||
> > -	    ((link->link_speed != 0) && !link->link_status)) {
> > -		if (!priv->pending_alarm) {
> > -			/* Inconsistent status, check again later. */
> > -			priv->pending_alarm = 1;
> > -			rte_eal_alarm_set(MLX5_ALARM_TIMEOUT_US,
> > -					  mlx5_dev_link_status_handler,
> > -					  dev);
> > -		}
> > -	} else {
> > -		ret = 1;
> > -	}
> > +	if (ret & (1 << RTE_ETH_EVENT_INTR_LSC))
> > +		if (priv_link_status_update(priv))
> > +			ret &= ~(1 << RTE_ETH_EVENT_INTR_LSC);
> >  	return ret;
> >  }
> >
> > @@ -1172,9 +1209,9 @@ mlx5_dev_link_status_handler(void *arg)
> >  	priv_lock(priv);
> >  	assert(priv->pending_alarm == 1);
> >  	priv->pending_alarm = 0;
> > -	ret = priv_dev_link_status_handler(priv, dev);
> > +	ret = priv_link_status_update(priv);
> >  	priv_unlock(priv);
> > -	if (ret)
> > +	if (!ret)
> >  		_rte_eth_dev_callback_process(dev,
> RTE_ETH_EVENT_INTR_LSC, NULL,
> >  					      NULL);
> >  }
> > @@ -1192,14 +1229,17 @@ mlx5_dev_interrupt_handler(void *cb_arg)  {
> >  	struct rte_eth_dev *dev = cb_arg;
> >  	struct priv *priv = dev->data->dev_private;
> > -	int ret;
> > +	uint32_t events;
> >
> >  	priv_lock(priv);
> > -	ret = priv_dev_link_status_handler(priv, dev);
> > +	events = priv_dev_status_handler(priv);
> >  	priv_unlock(priv);
> > -	if (ret)
> > +	if (events & (1 << RTE_ETH_EVENT_INTR_LSC))
> >  		_rte_eth_dev_callback_process(dev,
> RTE_ETH_EVENT_INTR_LSC, NULL,
> >  					      NULL);
> > +	if (events & (1 << RTE_ETH_EVENT_INTR_RMV))
> > +		_rte_eth_dev_callback_process(dev,
> RTE_ETH_EVENT_INTR_RMV, NULL,
> > +					      NULL);
> >  }
> >
> >  /**
> > @@ -1213,7 +1253,8 @@ mlx5_dev_interrupt_handler(void *cb_arg)  void
> > priv_dev_interrupt_handler_uninstall(struct priv *priv, struct
> > rte_eth_dev *dev)  {
> > -	if (!dev->data->dev_conf.intr_conf.lsc)
> > +	if (!dev->data->dev_conf.intr_conf.lsc &&
> > +		!dev->data->dev_conf.intr_conf.rmv)
> >  		return;
> >  	rte_intr_callback_unregister(&priv->intr_handle,
> >  				     mlx5_dev_interrupt_handler,
> > @@ -1238,7 +1279,8 @@ priv_dev_interrupt_handler_install(struct priv
> > *priv, struct rte_eth_dev *dev)  {
> >  	int rc, flags;
> >
> > -	if (!dev->data->dev_conf.intr_conf.lsc)
> > +	if (!dev->data->dev_conf.intr_conf.lsc &&
> > +		!dev->data->dev_conf.intr_conf.rmv)
> >  		return;
> >  	assert(priv->ctx->async_fd > 0);
> >  	flags = fcntl(priv->ctx->async_fd, F_GETFL); @@ -1246,6 +1288,7 @@
> > priv_dev_interrupt_handler_install(struct priv *priv, struct rte_eth_dev
> *dev)
> >  	if (rc < 0) {
> >  		INFO("failed to change file descriptor async event queue");
> >  		dev->data->dev_conf.intr_conf.lsc = 0;
> > +		dev->data->dev_conf.intr_conf.rmv = 0;
> >  	} else {
> >  		priv->intr_handle.fd = priv->ctx->async_fd;
> >  		priv->intr_handle.type = RTE_INTR_HANDLE_EXT;
> > --
> > 2.7.4
> >
> 
> --
> Adrien Mazarguil
> 6WIND

Thanks,
Matan Azrad

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH v3] net/mlx5: support device removal event
  2017-09-04 17:52                   ` Matan Azrad
@ 2017-09-05  9:28                     ` Adrien Mazarguil
  2017-09-05 10:38                       ` Matan Azrad
  0 siblings, 1 reply; 19+ messages in thread
From: Adrien Mazarguil @ 2017-09-05  9:28 UTC (permalink / raw)
  To: Matan Azrad; +Cc: Nélio Laranjeiro, dev

Hi Matan,

On Mon, Sep 04, 2017 at 05:52:55PM +0000, Matan Azrad wrote:
> Hi Adrien,
> 
> > -----Original Message-----
> > From: Adrien Mazarguil [mailto:adrien.mazarguil@6wind.com]
> > Sent: Monday, September 4, 2017 6:33 PM
> > To: Matan Azrad <matan@mellanox.com>
> > Cc: Nélio Laranjeiro <nelio.laranjeiro@6wind.com>; dev@dpdk.org
> > Subject: Re: [dpdk-dev] [PATCH v3] net/mlx5: support device removal event
> > 
> > Hi Matan,
> > 
> > One comment I have is, while this patch adds support for RMV, it also silently
> > addresses a bug (see large comment you added to
> > priv_link_status_update()).
> > 
> > This should be split in two commits, with the fix part coming first and CC
> > stable@dpdk.org, and a second commit adding RMV support proper.
> > 
> 
> Actually, the mlx4 bug was not appeared in the mlx5 previous code,
> Probably because the RMV interrupt was not implemented in mlx5 before this patch.

Good point, no RMV could occur before it is implemented, however a dedicated
commit for the fix itself (i.e. alarm callback not supposed to end up
calling ibv_get_async_event()) might better explain the logic behind these
changes. What I mean is, if there was no problem, you wouldn't need to make
priv_link_status_update() a separate function, right?

> The big comment just explains the link inconsistent issue and was added
> here since Nelio and I think the new function, priv_link_status_update(),
> justifies this comment for future review.  

I understand, this could also have been part of the commit log of the
dedicated commit.

Thanks.

-- 
Adrien Mazarguil
6WIND

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH v3] net/mlx5: support device removal event
  2017-09-05  9:28                     ` Adrien Mazarguil
@ 2017-09-05 10:38                       ` Matan Azrad
  2017-09-05 12:01                         ` Adrien Mazarguil
  0 siblings, 1 reply; 19+ messages in thread
From: Matan Azrad @ 2017-09-05 10:38 UTC (permalink / raw)
  To: Adrien Mazarguil; +Cc: Nélio Laranjeiro, dev

Hi Adrien

> -----Original Message-----
> From: Adrien Mazarguil [mailto:adrien.mazarguil@6wind.com]
> Sent: Tuesday, September 5, 2017 12:28 PM
> To: Matan Azrad <matan@mellanox.com>
> Cc: Nélio Laranjeiro <nelio.laranjeiro@6wind.com>; dev@dpdk.org
> Subject: Re: [dpdk-dev] [PATCH v3] net/mlx5: support device removal event
> 
> Hi Matan,
> 
> On Mon, Sep 04, 2017 at 05:52:55PM +0000, Matan Azrad wrote:
> > Hi Adrien,
> >
> > > -----Original Message-----
> > > From: Adrien Mazarguil [mailto:adrien.mazarguil@6wind.com]
> > > Sent: Monday, September 4, 2017 6:33 PM
> > > To: Matan Azrad <matan@mellanox.com>
> > > Cc: Nélio Laranjeiro <nelio.laranjeiro@6wind.com>; dev@dpdk.org
> > > Subject: Re: [dpdk-dev] [PATCH v3] net/mlx5: support device removal
> > > event
> > >
> > > Hi Matan,
> > >
> > > One comment I have is, while this patch adds support for RMV, it
> > > also silently addresses a bug (see large comment you added to
> > > priv_link_status_update()).
> > >
> > > This should be split in two commits, with the fix part coming first
> > > and CC stable@dpdk.org, and a second commit adding RMV support
> proper.
> > >
> >
> > Actually, the mlx4 bug was not appeared in the mlx5 previous code,
> > Probably because the RMV interrupt was not implemented in mlx5 before
> this patch.
> 
> Good point, no RMV could occur before it is implemented, however a
> dedicated commit for the fix itself (i.e. alarm callback not supposed to end up
> calling ibv_get_async_event()) might better explain the logic behind these
> changes. What I mean is, if there was no problem, you wouldn't need to
> make
> priv_link_status_update() a separate function, right?
> 

The separation was done mainly because of the new interrupt implementation;
otherwise, there would have been a bug here.
The unnecessary call to ibv_get_async_event() from the alarm was harmless in
the previous code.
I get your point about explaining the logic behind these changes, and I can add
it to this patch's commit log to make it clearer, something like:
The link update operation was separated from the interrupt callback
to avoid RMV interrupt disregard and unnecessary event acknowledgment
caused by the inconsistent link status alarm callback.

> > The big comment just explains the link inconsistent issue and was
> > added here since Nelio and I think the new function,
> > priv_link_status_update(), justifies this comment for future review.
> 
> I understand, this could also have been part of the commit log of the
> dedicated commit.
> 
Are you sure we need to describe the reason for the code comment in the commit log?

> Thanks.
> 
> --
> Adrien Mazarguil
> 6WIND

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH v3] net/mlx5: support device removal event
  2017-09-05 10:38                       ` Matan Azrad
@ 2017-09-05 12:01                         ` Adrien Mazarguil
  2017-09-05 13:36                           ` Matan Azrad
  0 siblings, 1 reply; 19+ messages in thread
From: Adrien Mazarguil @ 2017-09-05 12:01 UTC (permalink / raw)
  To: Matan Azrad; +Cc: Nélio Laranjeiro, dev

Hi Matan,

On Tue, Sep 05, 2017 at 10:38:21AM +0000, Matan Azrad wrote:
> Hi Adrien
> 
> > -----Original Message-----
> > From: Adrien Mazarguil [mailto:adrien.mazarguil@6wind.com]
> > Sent: Tuesday, September 5, 2017 12:28 PM
> > To: Matan Azrad <matan@mellanox.com>
> > Cc: Nélio Laranjeiro <nelio.laranjeiro@6wind.com>; dev@dpdk.org
> > Subject: Re: [dpdk-dev] [PATCH v3] net/mlx5: support device removal event
> > 
> > Hi Matan,
> > 
> > On Mon, Sep 04, 2017 at 05:52:55PM +0000, Matan Azrad wrote:
> > > Hi Adrien,
> > >
> > > > -----Original Message-----
> > > > From: Adrien Mazarguil [mailto:adrien.mazarguil@6wind.com]
> > > > Sent: Monday, September 4, 2017 6:33 PM
> > > > To: Matan Azrad <matan@mellanox.com>
> > > > Cc: Nélio Laranjeiro <nelio.laranjeiro@6wind.com>; dev@dpdk.org
> > > > Subject: Re: [dpdk-dev] [PATCH v3] net/mlx5: support device removal
> > > > event
> > > >
> > > > Hi Matan,
> > > >
> > > > One comment I have is, while this patch adds support for RMV, it
> > > > also silently addresses a bug (see large comment you added to
> > > > priv_link_status_update()).
> > > >
> > > > This should be split in two commits, with the fix part coming first
> > > > and CC stable@dpdk.org, and a second commit adding RMV support
> > proper.
> > > >
> > >
> > > Actually, the mlx4 bug was not appeared in the mlx5 previous code,
> > > Probably because the RMV interrupt was not implemented in mlx5 before
> > this patch.
> > 
> > Good point, no RMV could occur before it is implemented, however a
> > dedicated commit for the fix itself (i.e. alarm callback not supposed to end up
> > calling ibv_get_async_event()) might better explain the logic behind these
> > changes. What I mean is, if there was no problem, you wouldn't need to
> > make
> > priv_link_status_update() a separate function, right?
> > 
> 
> The separation was done mainly because of the new interrupt implementation,
> else, there was bug here.
> The unnecessary  alarm ibv_get_async_event calling was harmless in
> the previous code.
> I gets your point for the logic explanation behind these changes and I can add it in this
> patch commit log to be clearer, something like:
> The link update operation was separated from the interrupt callback
> to avoid RMV interrupt disregard and unnecessary event acknowledgment
> caused by the inconsistent link status alarm callback.

Yes, it's better to explain why you did this in the commit log, but see
below.

> > > The big comment just explains the link inconsistent issue and was
> > > added here since Nelio and I think the new function,
> > > priv_link_status_update(), justifies this comment for future review.
> > 
> > I understand, this could also have been part of the commit log of the
> > dedicated commit.
> > 
> Are you sure we need to describe the code comment reason in the commit log?

It's a change you did to address a possible bug otherwise so we have to,
however remember that a commit should, as much as possible, do exactly one
thing. If you need to explain that you did this in order to do that, "this"
and "that" can often be identified as two separate commits. Doing so makes
it much easier for reviewers to understand the reasoning behind changes and
leads to quicker reviews (makes instant-acks even possible).

I'd still like a separate commit if you don't mind.

-- 
Adrien Mazarguil
6WIND

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH v3] net/mlx5: support device removal event
  2017-09-05 12:01                         ` Adrien Mazarguil
@ 2017-09-05 13:36                           ` Matan Azrad
  2017-09-06  7:12                             ` Adrien Mazarguil
  0 siblings, 1 reply; 19+ messages in thread
From: Matan Azrad @ 2017-09-05 13:36 UTC (permalink / raw)
  To: Adrien Mazarguil; +Cc: Nélio Laranjeiro, dev

Hi Adrien

> -----Original Message-----
> From: Adrien Mazarguil [mailto:adrien.mazarguil@6wind.com]
> Sent: Tuesday, September 5, 2017 3:02 PM
> To: Matan Azrad <matan@mellanox.com>
> Cc: Nélio Laranjeiro <nelio.laranjeiro@6wind.com>; dev@dpdk.org
> Subject: Re: [dpdk-dev] [PATCH v3] net/mlx5: support device removal event
> 
> Hi Matan,
> 
> On Tue, Sep 05, 2017 at 10:38:21AM +0000, Matan Azrad wrote:
> > Hi Adrien
> >
> > > -----Original Message-----
> > > From: Adrien Mazarguil [mailto:adrien.mazarguil@6wind.com]
> > > Sent: Tuesday, September 5, 2017 12:28 PM
> > > To: Matan Azrad <matan@mellanox.com>
> > > Cc: Nélio Laranjeiro <nelio.laranjeiro@6wind.com>; dev@dpdk.org
> > > Subject: Re: [dpdk-dev] [PATCH v3] net/mlx5: support device removal
> > > event
> > >
> > > Hi Matan,
> > >
> > > On Mon, Sep 04, 2017 at 05:52:55PM +0000, Matan Azrad wrote:
> > > > Hi Adrien,
> > > >
> > > > > -----Original Message-----
> > > > > From: Adrien Mazarguil [mailto:adrien.mazarguil@6wind.com]
> > > > > Sent: Monday, September 4, 2017 6:33 PM
> > > > > To: Matan Azrad <matan@mellanox.com>
> > > > > Cc: Nélio Laranjeiro <nelio.laranjeiro@6wind.com>; dev@dpdk.org
> > > > > Subject: Re: [dpdk-dev] [PATCH v3] net/mlx5: support device
> > > > > removal event
> > > > >
> > > > > Hi Matan,
> > > > >
> > > > > One comment I have is, while this patch adds support for RMV, it
> > > > > also silently addresses a bug (see large comment you added to
> > > > > priv_link_status_update()).
> > > > >
> > > > > This should be split in two commits, with the fix part coming
> > > > > first and CC stable@dpdk.org, and a second commit adding RMV
> > > > > support
> > > proper.
> > > > >
> > > >
> > > > Actually, the mlx4 bug was not appeared in the mlx5 previous code,
> > > > Probably because the RMV interrupt was not implemented in mlx5
> > > > before
> > > this patch.
> > >
> > > Good point, no RMV could occur before it is implemented, however a
> > > dedicated commit for the fix itself (i.e. alarm callback not
> > > supposed to end up calling ibv_get_async_event()) might better
> > > explain the logic behind these changes. What I mean is, if there was
> > > no problem, you wouldn't need to make
> > > priv_link_status_update() a separate function, right?
> > >
> >
> > The separation was done mainly because of the new interrupt
> > implementation, else, there was bug here.
> > The unnecessary  alarm ibv_get_async_event calling was harmless in the
> > previous code.
> > I gets your point for the logic explanation behind these changes and I
> > can add it in this patch commit log to be clearer, something like:
> > The link update operation was separated from the interrupt callback to
> > avoid RMV interrupt disregard and unnecessary event acknowledgment
> > caused by the inconsistent link status alarm callback.
> 
> Yes, it's better to explain why you did this in the commit log, but see below.
> 
> > > > The big comment just explains the link inconsistent issue and was
> > > > added here since Nelio and I think the new function,
> > > > priv_link_status_update(), justifies this comment for future review.
> > >
> > > I understand, this could also have been part of the commit log of
> > > the dedicated commit.
> > >
> > Are you sure we need to describe the code comment reason in the commit
> log?
> 
> It's a change you did to address a possible bug otherwise so we have to,
> however remember that a commit should, as much as possible, do exactly
> one thing. If you need to explain that you did this in order to do that, "this"
> and "that" can often be identified as two separate commits. Doing so makes
> it much easier for reviewers to understand the reasoning behind changes
> and leads to quicker reviews (makes instant-acks even possible).
> 
> It'd still like a separate commit if you don't mind.

Sorry, but I think it is an infinite order.
I have just added the RMV interrupt, and I did a lot of things in this patch for it.
I don't think I need to separate each thing done for this support.
I prefer to keep it in one patch if you don't mind.
 
> 
> --
> Adrien Mazarguil
> 6WIND

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH v3] net/mlx5: support device removal event
  2017-09-05 13:36                           ` Matan Azrad
@ 2017-09-06  7:12                             ` Adrien Mazarguil
  0 siblings, 0 replies; 19+ messages in thread
From: Adrien Mazarguil @ 2017-09-06  7:12 UTC (permalink / raw)
  To: Matan Azrad; +Cc: Nélio Laranjeiro, dev

Hi Matan,

On Tue, Sep 05, 2017 at 01:36:13PM +0000, Matan Azrad wrote:
> Hi Adrien
> 
> > -----Original Message-----
> > From: Adrien Mazarguil [mailto:adrien.mazarguil@6wind.com]
> > Sent: Tuesday, September 5, 2017 3:02 PM
> > To: Matan Azrad <matan@mellanox.com>
> > Cc: Nélio Laranjeiro <nelio.laranjeiro@6wind.com>; dev@dpdk.org
> > Subject: Re: [dpdk-dev] [PATCH v3] net/mlx5: support device removal event
> > 
> > Hi Matan,
> > 
> > On Tue, Sep 05, 2017 at 10:38:21AM +0000, Matan Azrad wrote:
> > > Hi Adrien
> > >
> > > > -----Original Message-----
> > > > From: Adrien Mazarguil [mailto:adrien.mazarguil@6wind.com]
> > > > Sent: Tuesday, September 5, 2017 12:28 PM
> > > > To: Matan Azrad <matan@mellanox.com>
> > > > Cc: Nélio Laranjeiro <nelio.laranjeiro@6wind.com>; dev@dpdk.org
> > > > Subject: Re: [dpdk-dev] [PATCH v3] net/mlx5: support device removal
> > > > event
> > > >
> > > > Hi Matan,
> > > >
> > > > On Mon, Sep 04, 2017 at 05:52:55PM +0000, Matan Azrad wrote:
> > > > > Hi Adrien,
> > > > >
> > > > > > -----Original Message-----
> > > > > > From: Adrien Mazarguil [mailto:adrien.mazarguil@6wind.com]
> > > > > > Sent: Monday, September 4, 2017 6:33 PM
> > > > > > To: Matan Azrad <matan@mellanox.com>
> > > > > > Cc: Nélio Laranjeiro <nelio.laranjeiro@6wind.com>; dev@dpdk.org
> > > > > > Subject: Re: [dpdk-dev] [PATCH v3] net/mlx5: support device
> > > > > > removal event
> > > > > >
> > > > > > Hi Matan,
> > > > > >
> > > > > > One comment I have is, while this patch adds support for RMV, it
> > > > > > also silently addresses a bug (see large comment you added to
> > > > > > priv_link_status_update()).
> > > > > >
> > > > > > This should be split in two commits, with the fix part coming
> > > > > > first and CC stable@dpdk.org, and a second commit adding RMV
> > > > > > support
> > > > proper.
> > > > > >
> > > > >
> > > > > Actually, the mlx4 bug was not appeared in the mlx5 previous code,
> > > > > Probably because the RMV interrupt was not implemented in mlx5
> > > > > before
> > > > this patch.
> > > >
> > > > Good point, no RMV could occur before it is implemented, however a
> > > > dedicated commit for the fix itself (i.e. alarm callback not
> > > > supposed to end up calling ibv_get_async_event()) might better
> > > > explain the logic behind these changes. What I mean is, if there was
> > > > no problem, you wouldn't need to make
> > > > priv_link_status_update() a separate function, right?
> > > >
> > >
> > > The separation was done mainly because of the new interrupt
> > > implementation, else, there was bug here.
> > > The unnecessary  alarm ibv_get_async_event calling was harmless in the
> > > previous code.
> > > I gets your point for the logic explanation behind these changes and I
> > > can add it in this patch commit log to be clearer, something like:
> > > The link update operation was separated from the interrupt callback to
> > > avoid RMV interrupt disregard and unnecessary event acknowledgment
> > > caused by the inconsistent link status alarm callback.
> > 
> > Yes, it's better to explain why you did this in the commit log, but see below.
> > 
> > > > > The big comment just explains the link inconsistent issue and was
> > > > > added here since Nelio and I think the new function,
> > > > > priv_link_status_update(), justifies this comment for future review.
> > > >
> > > > I understand, this could also have been part of the commit log of
> > > > the dedicated commit.
> > > >
> > > Are you sure we need to describe the code comment reason in the commit
> > log?
> > 
> > It's a change you did to address a possible bug otherwise so we have to,
> > however remember that a commit should, as much as possible, do exactly
> > one thing. If you need to explain that you did this in order to do that, "this"
> > and "that" can often be identified as two separate commits. Doing so makes
> > it much easier for reviewers to understand the reasoning behind changes
> > and leads to quicker reviews (makes instant-acks even possible).
> > 
> > It'd still like a separate commit if you don't mind.
> 
> Sorry, but I think it is an infinite order.
> I have just added RMV interrupt, I did a lot of things in this patch for it.
> I think  I don't need to separate each thing done for this support.
> I prefer to stay it in one patch if you don't mind. 

I understand that's a lot of work, so let's cut the talk. Since I'm the one
requesting for patches to be split, I'll offer to re-spin yours and submit
the result as v4, is that OK?

-- 
Adrien Mazarguil
6WIND

^ permalink raw reply	[flat|nested] 19+ messages in thread

end of thread, other threads:[~2017-09-06  7:13 UTC | newest]

Thread overview: 19+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2017-08-13 12:25 [PATCH 1/2] net/mlx5: support device removal event Matan Azrad
2017-08-13 12:25 ` [PATCH 2/2] net/mlx5: fix probe failure report Matan Azrad
2017-08-23  9:44   ` Nélio Laranjeiro
2017-09-01 10:40     ` [dpdk-stable] " Ferruh Yigit
2017-08-23  9:40 ` [PATCH 1/2] net/mlx5: support device removal event Nélio Laranjeiro
2017-08-23 19:44   ` Matan Azrad
2017-08-24  7:38     ` Nélio Laranjeiro
2017-08-24 14:33       ` Matan Azrad
2017-08-25  8:29         ` Nélio Laranjeiro
2017-08-29  8:30           ` [PATCH v2] " Matan Azrad
2017-09-04 12:49             ` Nélio Laranjeiro
2017-09-04 13:55               ` [PATCH v3] " Matan Azrad
2017-09-04 15:33                 ` Adrien Mazarguil
2017-09-04 17:52                   ` Matan Azrad
2017-09-05  9:28                     ` Adrien Mazarguil
2017-09-05 10:38                       ` Matan Azrad
2017-09-05 12:01                         ` Adrien Mazarguil
2017-09-05 13:36                           ` Matan Azrad
2017-09-06  7:12                             ` Adrien Mazarguil

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.