All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH net-next 1/2] update ENA driver to version 1.5.0
@ 2017-12-28 21:31 netanel
  2017-12-28 21:31 ` [PATCH net-next 1/2] net: ena: add detection and recovery mechanism for handling missed/misrouted MSI-X netanel
                   ` (2 more replies)
  0 siblings, 3 replies; 4+ messages in thread
From: netanel @ 2017-12-28 21:31 UTC (permalink / raw)
  To: davem, netdev
  Cc: Netanel Belgazal, dwmw, zorik, matua, saeedb, msw, aliguori,
	nafea, evgenys, gtzalik

From: Netanel Belgazal <netanel@amazon.com>

This patchset contains two changes:
* Add a robust mechanism for detection of stuck Rx/Tx rings due to
  missed or misrouted MSI-X
* Increase the driver version to 1.5.0

Netanel Belgazal (2):
  net: ena: add detection and recovery mechanism for handling
    missed/misrouted MSI-X
  net: ena: increase ena driver version to 1.5.0

 drivers/net/ethernet/amazon/ena/ena_eth_com.c   | 12 +++++
 drivers/net/ethernet/amazon/ena/ena_eth_com.h   |  2 +
 drivers/net/ethernet/amazon/ena/ena_netdev.c    | 68 ++++++++++++++++++++++---
 drivers/net/ethernet/amazon/ena/ena_netdev.h    |  6 ++-
 drivers/net/ethernet/amazon/ena/ena_regs_defs.h |  2 +
 5 files changed, 82 insertions(+), 8 deletions(-)

-- 
2.7.3.AMZN

^ permalink raw reply	[flat|nested] 4+ messages in thread

* [PATCH net-next 1/2] net: ena: add detection and recovery mechanism for handling missed/misrouted MSI-X
  2017-12-28 21:31 [PATCH net-next 1/2] update ENA driver to version 1.5.0 netanel
@ 2017-12-28 21:31 ` netanel
  2017-12-28 21:31 ` [PATCH net-next 2/2] net: ena: increase ena driver version to 1.5.0 netanel
  2018-01-02 19:35 ` [PATCH net-next 1/2] update ENA driver to version 1.5.0 David Miller
  2 siblings, 0 replies; 4+ messages in thread
From: netanel @ 2017-12-28 21:31 UTC (permalink / raw)
  To: davem, netdev
  Cc: Netanel Belgazal, dwmw, zorik, matua, saeedb, msw, aliguori,
	nafea, evgenys, gtzalik

From: Netanel Belgazal <netanel@amazon.com>

A mechanism for detection of stuck Rx/Tx rings due to missed or
misrouted interrupts.
Check if there are unhandled completion descriptors before the first
MSI-X interrupt arrived.
The check is per queue and per interrupt vector.
Once such condition is detected, driver and device reset is scheduled.

Signed-off-by: Netanel Belgazal <netanel@amazon.com>
---
 drivers/net/ethernet/amazon/ena/ena_eth_com.c   | 12 +++++
 drivers/net/ethernet/amazon/ena/ena_eth_com.h   |  2 +
 drivers/net/ethernet/amazon/ena/ena_netdev.c    | 68 ++++++++++++++++++++++---
 drivers/net/ethernet/amazon/ena/ena_netdev.h    |  4 ++
 drivers/net/ethernet/amazon/ena/ena_regs_defs.h |  2 +
 5 files changed, 81 insertions(+), 7 deletions(-)

diff --git a/drivers/net/ethernet/amazon/ena/ena_eth_com.c b/drivers/net/ethernet/amazon/ena/ena_eth_com.c
index b11e573ad57a..04ed2b57ff20 100644
--- a/drivers/net/ethernet/amazon/ena/ena_eth_com.c
+++ b/drivers/net/ethernet/amazon/ena/ena_eth_com.c
@@ -504,3 +504,15 @@ int ena_com_tx_comp_req_id_get(struct ena_com_io_cq *io_cq, u16 *req_id)
 
 	return 0;
 }
+
+bool ena_com_cq_empty(struct ena_com_io_cq *io_cq)
+{
+	struct ena_eth_io_rx_cdesc_base *cdesc;
+
+	cdesc = ena_com_get_next_rx_cdesc(io_cq);
+	if (cdesc)
+		return false;
+	else
+		return true;
+}
+
diff --git a/drivers/net/ethernet/amazon/ena/ena_eth_com.h b/drivers/net/ethernet/amazon/ena/ena_eth_com.h
index bb53c3a4f8e9..2f7657227cfe 100644
--- a/drivers/net/ethernet/amazon/ena/ena_eth_com.h
+++ b/drivers/net/ethernet/amazon/ena/ena_eth_com.h
@@ -88,6 +88,8 @@ int ena_com_add_single_rx_desc(struct ena_com_io_sq *io_sq,
 
 int ena_com_tx_comp_req_id_get(struct ena_com_io_cq *io_cq, u16 *req_id);
 
+bool ena_com_cq_empty(struct ena_com_io_cq *io_cq);
+
 static inline void ena_com_unmask_intr(struct ena_com_io_cq *io_cq,
 				       struct ena_eth_io_intr_reg *intr_reg)
 {
diff --git a/drivers/net/ethernet/amazon/ena/ena_netdev.c b/drivers/net/ethernet/amazon/ena/ena_netdev.c
index 97c5a89a9cf7..a6f283232cb7 100644
--- a/drivers/net/ethernet/amazon/ena/ena_netdev.c
+++ b/drivers/net/ethernet/amazon/ena/ena_netdev.c
@@ -158,6 +158,8 @@ static void ena_init_io_rings_common(struct ena_adapter *adapter,
 	ring->per_napi_packets = 0;
 	ring->per_napi_bytes = 0;
 	ring->cpu = 0;
+	ring->first_interrupt = false;
+	ring->no_interrupt_event_cnt = 0;
 	u64_stats_init(&ring->syncp);
 }
 
@@ -1274,6 +1276,9 @@ static irqreturn_t ena_intr_msix_io(int irq, void *data)
 {
 	struct ena_napi *ena_napi = data;
 
+	ena_napi->tx_ring->first_interrupt = true;
+	ena_napi->rx_ring->first_interrupt = true;
+
 	napi_schedule_irqoff(&ena_napi->napi);
 
 	return IRQ_HANDLED;
@@ -2648,8 +2653,32 @@ static void ena_fw_reset_device(struct work_struct *work)
 	rtnl_unlock();
 }
 
-static int check_missing_comp_in_queue(struct ena_adapter *adapter,
-				       struct ena_ring *tx_ring)
+static int check_for_rx_interrupt_queue(struct ena_adapter *adapter,
+					struct ena_ring *rx_ring)
+{
+	if (likely(rx_ring->first_interrupt))
+		return 0;
+
+	if (ena_com_cq_empty(rx_ring->ena_com_io_cq))
+		return 0;
+
+	rx_ring->no_interrupt_event_cnt++;
+
+	if (rx_ring->no_interrupt_event_cnt == ENA_MAX_NO_INTERRUPT_ITERATIONS) {
+		netif_err(adapter, rx_err, adapter->netdev,
+			  "Potential MSIX issue on Rx side Queue = %d. Reset the device\n",
+			  rx_ring->qid);
+		adapter->reset_reason = ENA_REGS_RESET_MISS_INTERRUPT;
+		smp_mb__before_atomic();
+		set_bit(ENA_FLAG_TRIGGER_RESET, &adapter->flags);
+		return -EIO;
+	}
+
+	return 0;
+}
+
+static int check_missing_comp_in_tx_queue(struct ena_adapter *adapter,
+					  struct ena_ring *tx_ring)
 {
 	struct ena_tx_buffer *tx_buf;
 	unsigned long last_jiffies;
@@ -2659,8 +2688,27 @@ static int check_missing_comp_in_queue(struct ena_adapter *adapter,
 	for (i = 0; i < tx_ring->ring_size; i++) {
 		tx_buf = &tx_ring->tx_buffer_info[i];
 		last_jiffies = tx_buf->last_jiffies;
-		if (unlikely(last_jiffies &&
-			     time_is_before_jiffies(last_jiffies + adapter->missing_tx_completion_to))) {
+
+		if (last_jiffies == 0)
+			/* no pending Tx at this location */
+			continue;
+
+		if (unlikely(!tx_ring->first_interrupt && time_is_before_jiffies(last_jiffies +
+			     2 * adapter->missing_tx_completion_to))) {
+			/* If after graceful period interrupt is still not
+			 * received, we schedule a reset
+			 */
+			netif_err(adapter, tx_err, adapter->netdev,
+				  "Potential MSIX issue on Tx side Queue = %d. Reset the device\n",
+				  tx_ring->qid);
+			adapter->reset_reason = ENA_REGS_RESET_MISS_INTERRUPT;
+			smp_mb__before_atomic();
+			set_bit(ENA_FLAG_TRIGGER_RESET, &adapter->flags);
+			return -EIO;
+		}
+
+		if (unlikely(time_is_before_jiffies(last_jiffies +
+				adapter->missing_tx_completion_to))) {
 			if (!tx_buf->print_once)
 				netif_notice(adapter, tx_err, adapter->netdev,
 					     "Found a Tx that wasn't completed on time, qid %d, index %d.\n",
@@ -2689,9 +2737,10 @@ static int check_missing_comp_in_queue(struct ena_adapter *adapter,
 	return rc;
 }
 
-static void check_for_missing_tx_completions(struct ena_adapter *adapter)
+static void check_for_missing_completions(struct ena_adapter *adapter)
 {
 	struct ena_ring *tx_ring;
+	struct ena_ring *rx_ring;
 	int i, budget, rc;
 
 	/* Make sure the driver doesn't turn the device in other process */
@@ -2710,8 +2759,13 @@ static void check_for_missing_tx_completions(struct ena_adapter *adapter)
 
 	for (i = adapter->last_monitored_tx_qid; i < adapter->num_queues; i++) {
 		tx_ring = &adapter->tx_ring[i];
+		rx_ring = &adapter->rx_ring[i];
+
+		rc = check_missing_comp_in_tx_queue(adapter, tx_ring);
+		if (unlikely(rc))
+			return;
 
-		rc = check_missing_comp_in_queue(adapter, tx_ring);
+		rc = check_for_rx_interrupt_queue(adapter, rx_ring);
 		if (unlikely(rc))
 			return;
 
@@ -2870,7 +2924,7 @@ static void ena_timer_service(struct timer_list *t)
 
 	check_for_admin_com_state(adapter);
 
-	check_for_missing_tx_completions(adapter);
+	check_for_missing_completions(adapter);
 
 	check_for_empty_rx_ring(adapter);
 
diff --git a/drivers/net/ethernet/amazon/ena/ena_netdev.h b/drivers/net/ethernet/amazon/ena/ena_netdev.h
index 3bbc003871de..734ff2e84494 100644
--- a/drivers/net/ethernet/amazon/ena/ena_netdev.h
+++ b/drivers/net/ethernet/amazon/ena/ena_netdev.h
@@ -122,6 +122,7 @@
  * We wait for 6 sec just to be on the safe side.
  */
 #define ENA_DEVICE_KALIVE_TIMEOUT	(6 * HZ)
+#define ENA_MAX_NO_INTERRUPT_ITERATIONS 3
 
 #define ENA_MMIO_DISABLE_REG_READ	BIT(0)
 
@@ -236,6 +237,9 @@ struct ena_ring {
 	/* The maximum header length the device can handle */
 	u8 tx_max_header_size;
 
+	bool first_interrupt;
+	u16 no_interrupt_event_cnt;
+
 	/* cpu for TPH */
 	int cpu;
 	 /* number of tx/rx_buffer_info's entries */
diff --git a/drivers/net/ethernet/amazon/ena/ena_regs_defs.h b/drivers/net/ethernet/amazon/ena/ena_regs_defs.h
index 9aec43c5bba8..48ca97fbe7bc 100644
--- a/drivers/net/ethernet/amazon/ena/ena_regs_defs.h
+++ b/drivers/net/ethernet/amazon/ena/ena_regs_defs.h
@@ -60,6 +60,8 @@ enum ena_regs_reset_reason_types {
 	ENA_REGS_RESET_USER_TRIGGER		= 12,
 
 	ENA_REGS_RESET_GENERIC			= 13,
+
+	ENA_REGS_RESET_MISS_INTERRUPT		= 14,
 };
 
 /* ena_registers offsets */
-- 
2.7.3.AMZN

^ permalink raw reply related	[flat|nested] 4+ messages in thread

* [PATCH net-next 2/2] net: ena: increase ena driver version to 1.5.0
  2017-12-28 21:31 [PATCH net-next 1/2] update ENA driver to version 1.5.0 netanel
  2017-12-28 21:31 ` [PATCH net-next 1/2] net: ena: add detection and recovery mechanism for handling missed/misrouted MSI-X netanel
@ 2017-12-28 21:31 ` netanel
  2018-01-02 19:35 ` [PATCH net-next 1/2] update ENA driver to version 1.5.0 David Miller
  2 siblings, 0 replies; 4+ messages in thread
From: netanel @ 2017-12-28 21:31 UTC (permalink / raw)
  To: davem, netdev
  Cc: Netanel Belgazal, dwmw, zorik, matua, saeedb, msw, aliguori,
	nafea, evgenys, gtzalik

From: Netanel Belgazal <netanel@amazon.com>

Signed-off-by: Netanel Belgazal <netanel@amazon.com>
---
 drivers/net/ethernet/amazon/ena/ena_netdev.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/amazon/ena/ena_netdev.h b/drivers/net/ethernet/amazon/ena/ena_netdev.h
index 734ff2e84494..f1972b5ab650 100644
--- a/drivers/net/ethernet/amazon/ena/ena_netdev.h
+++ b/drivers/net/ethernet/amazon/ena/ena_netdev.h
@@ -44,7 +44,7 @@
 #include "ena_eth_com.h"
 
 #define DRV_MODULE_VER_MAJOR	1
-#define DRV_MODULE_VER_MINOR	3
+#define DRV_MODULE_VER_MINOR	5
 #define DRV_MODULE_VER_SUBMINOR 0
 
 #define DRV_MODULE_NAME		"ena"
-- 
2.7.3.AMZN

^ permalink raw reply related	[flat|nested] 4+ messages in thread

* Re: [PATCH net-next 1/2] update ENA driver to version 1.5.0
  2017-12-28 21:31 [PATCH net-next 1/2] update ENA driver to version 1.5.0 netanel
  2017-12-28 21:31 ` [PATCH net-next 1/2] net: ena: add detection and recovery mechanism for handling missed/misrouted MSI-X netanel
  2017-12-28 21:31 ` [PATCH net-next 2/2] net: ena: increase ena driver version to 1.5.0 netanel
@ 2018-01-02 19:35 ` David Miller
  2 siblings, 0 replies; 4+ messages in thread
From: David Miller @ 2018-01-02 19:35 UTC (permalink / raw)
  To: netanel
  Cc: netdev, dwmw, zorik, matua, saeedb, msw, aliguori, nafea,
	evgenys, gtzalik

From: <netanel@amazon.com>
Date: Thu, 28 Dec 2017 21:31:29 +0000

> From: Netanel Belgazal <netanel@amazon.com>
> 
> This patchset contains two changes:
> * Add a robust mechanism for detection of stuck Rx/Tx rings due to
>   missed or misrouted MSI-X
> * Increase the driver version to 1.5.0

I think you meant "[PATCH net-next 0/2]" for this email :-)

Series applied, thanks.

^ permalink raw reply	[flat|nested] 4+ messages in thread

end of thread, other threads:[~2018-01-02 19:35 UTC | newest]

Thread overview: 4+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2017-12-28 21:31 [PATCH net-next 1/2] update ENA driver to version 1.5.0 netanel
2017-12-28 21:31 ` [PATCH net-next 1/2] net: ena: add detection and recovery mechanism for handling missed/misrouted MSI-X netanel
2017-12-28 21:31 ` [PATCH net-next 2/2] net: ena: increase ena driver version to 1.5.0 netanel
2018-01-02 19:35 ` [PATCH net-next 1/2] update ENA driver to version 1.5.0 David Miller

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.