All of lore.kernel.org
 help / color / mirror / Atom feed
From: Oded Gabbay <oded.gabbay@gmail.com>
To: linux-kernel@vger.kernel.org, netdev@vger.kernel.org
Cc: SW_Drivers@habana.ai, gregkh@linuxfoundation.org,
	davem@davemloft.net, kuba@kernel.org, andrew@lunn.ch,
	f.fainelli@gmail.com, Omer Shpigelman <oshpigelman@habana.ai>
Subject: [PATCH v3 11/14] habanalabs/gaudi: add QP error handling
Date: Tue, 15 Sep 2020 20:10:19 +0300	[thread overview]
Message-ID: <20200915171022.10561-12-oded.gabbay@gmail.com> (raw)
In-Reply-To: <20200915171022.10561-1-oded.gabbay@gmail.com>

From: Omer Shpigelman <oshpigelman@habana.ai>

Add Queue Pair (QP) error notification to the user, e.g. for a security
violation, too many retransmissions, an invalid QP, etc.

Whenever a QP causes an error, the firmware sends an event to the driver,
which pushes the error as an error entry to the Completion Queue (if one
exists).

Signed-off-by: Omer Shpigelman <oshpigelman@habana.ai>
Reviewed-by: Oded Gabbay <oded.gabbay@gmail.com>
Signed-off-by: Oded Gabbay <oded.gabbay@gmail.com>
---
 drivers/misc/habanalabs/gaudi/gaudi.c     | 13 ++++
 drivers/misc/habanalabs/gaudi/gaudiP.h    |  1 +
 drivers/misc/habanalabs/gaudi/gaudi_nic.c | 95 +++++++++++++++++++++++
 3 files changed, 109 insertions(+)

diff --git a/drivers/misc/habanalabs/gaudi/gaudi.c b/drivers/misc/habanalabs/gaudi/gaudi.c
index 4602e4780651..71c9e2d18032 100644
--- a/drivers/misc/habanalabs/gaudi/gaudi.c
+++ b/drivers/misc/habanalabs/gaudi/gaudi.c
@@ -6660,6 +6660,19 @@ static void gaudi_handle_eqe(struct hl_device *hdev,
 		hl_fw_unmask_irq(hdev, event_type);
 		break;
 
+	case GAUDI_EVENT_NIC0_QP0:
+	case GAUDI_EVENT_NIC0_QP1:
+	case GAUDI_EVENT_NIC1_QP0:
+	case GAUDI_EVENT_NIC1_QP1:
+	case GAUDI_EVENT_NIC2_QP0:
+	case GAUDI_EVENT_NIC2_QP1:
+	case GAUDI_EVENT_NIC3_QP0:
+	case GAUDI_EVENT_NIC3_QP1:
+	case GAUDI_EVENT_NIC4_QP0:
+	case GAUDI_EVENT_NIC4_QP1:
+		gaudi_nic_handle_qp_err(hdev, event_type);
+		break;
+
 	case GAUDI_EVENT_PSOC_GPIO_U16_0:
 		cause = le64_to_cpu(eq_entry->data[0]) & 0xFF;
 		dev_err(hdev->dev,
diff --git a/drivers/misc/habanalabs/gaudi/gaudiP.h b/drivers/misc/habanalabs/gaudi/gaudiP.h
index 3158d5d68c1d..7d7439da88bc 100644
--- a/drivers/misc/habanalabs/gaudi/gaudiP.h
+++ b/drivers/misc/habanalabs/gaudi/gaudiP.h
@@ -576,5 +576,6 @@ netdev_tx_t gaudi_nic_handle_tx_pkt(struct gaudi_nic_device *gaudi_nic,
 					struct sk_buff *skb);
 int gaudi_nic_sw_init(struct hl_device *hdev);
 void gaudi_nic_sw_fini(struct hl_device *hdev);
+void gaudi_nic_handle_qp_err(struct hl_device *hdev, u16 event_type);
 
 #endif /* GAUDIP_H_ */
diff --git a/drivers/misc/habanalabs/gaudi/gaudi_nic.c b/drivers/misc/habanalabs/gaudi/gaudi_nic.c
index 37f25247f751..49e94e9c786a 100644
--- a/drivers/misc/habanalabs/gaudi/gaudi_nic.c
+++ b/drivers/misc/habanalabs/gaudi/gaudi_nic.c
@@ -3988,3 +3988,98 @@ int gaudi_nic_cq_mmap(struct hl_device *hdev, struct vm_area_struct *vma)
 
 	return rc;
 }
+
+static char *get_syndrome_text(u32 syndrome) /* map a QP error syndrome code to a printable description */
+{
+	char *str; /* only ever assigned string literals below; callers must not modify or free it */
+
+	switch (syndrome) {
+	case 0x05:
+		str = "Rx got invalid QP";
+		break;
+	case 0x06:
+		str = "Rx transport service mismatch";
+		break;
+	case 0x09:
+		str = "Rx Rkey check failed";
+		break;
+	case 0x40:
+		str = "timer retry exceeded";
+		break;
+	case 0x41:
+		str = "NACK retry exceeded";
+		break;
+	case 0x42:
+		str = "doorbell on invalid QP";
+		break;
+	case 0x43:
+		str = "doorbell security check failed";
+		break;
+	case 0x44:
+		str = "Tx got invalid QP";
+		break;
+	case 0x45:
+		str = "responder got ACK/NACK on invalid QP";
+		break;
+	case 0x46:
+		str = "responder try to send ACK/NACK on invalid QP";
+		break;
+	default: /* any syndrome value without an entry above */
+		str = "unknown syndrome";
+		break;
+	}
+
+	return str; /* never NULL: every path through the switch assigns str */
+}
+
+void gaudi_nic_handle_qp_err(struct hl_device *hdev, u16 event_type) /* walk a NIC's QP error entries from ci to pi: log each one and push it to the CQ if enabled */
+{
+	struct gaudi_device *gaudi = hdev->asic_specific;
+	struct gaudi_nic_device *gaudi_nic;
+	struct qp_err *qp_err_arr;
+	struct hl_nic_cqe cqe_sw; /* NOTE(review): not zero-initialized; only the fields assigned below are written before the copy - confirm the struct has no others */
+	u32 pi, ci;
+
+	gaudi_nic = &gaudi->nic_devices[event_type - GAUDI_EVENT_NIC0_QP0]; /* NOTE(review): assumes the NICx_QPy event IDs are contiguous and map 1:1 onto nic_devices[] - confirm */
+	qp_err_arr = gaudi_nic->qp_err_mem_cpu;
+
+	mutex_lock(&gaudi->nic_qp_err_lock); /* held until the end of the function */
+
+	if (!gaudi->nic_cq_enable) /* only logs; the entries are still consumed by the loop below */
+		dev_err_ratelimited(hdev->dev,
+			"received NIC %d QP error event %d but no CQ to push it\n",
+			gaudi_nic->port, event_type);
+
+	pi = NIC_RREG32(mmNIC0_QPC0_ERR_FIFO_PRODUCER_INDEX); /* producer index, read from the device register */
+	ci = gaudi_nic->qp_err_ci; /* consumer index, as last saved by this function */
+
+	cqe_sw.is_err = true; /* is_err and port are the same for every entry pushed below */
+	cqe_sw.port = gaudi_nic->port;
+
+	while (ci < pi) { /* NOTE(review): ci wraps via the mask below but is compared with '<' - confirm behavior when pi has wrapped below ci */
+		cqe_sw.type = QP_ERR_IS_REQ(qp_err_arr[ci]) ?
+				HL_NIC_CQE_TYPE_REQ : HL_NIC_CQE_TYPE_RES;
+		cqe_sw.qp_number = QP_ERR_QP_NUM(qp_err_arr[ci]);
+		cqe_sw.qp_err.syndrome = QP_ERR_ERR_NUM(qp_err_arr[ci]);
+
+		ci = (ci + 1) & (QP_ERR_BUF_LEN - 1); /* advance and wrap; the mask assumes QP_ERR_BUF_LEN is a power of two - TODO confirm */
+
+		dev_err_ratelimited(hdev->dev,
+			"NIC QP error port: %d, type: %d, qpn: %d, syndrome: %s (0x%x)\n",
+			cqe_sw.port, cqe_sw.type, cqe_sw.qp_number,
+			get_syndrome_text(cqe_sw.qp_err.syndrome),
+			cqe_sw.qp_err.syndrome);
+
+		if (gaudi->nic_cq_enable) /* without a CQ the entry is logged above and then dropped */
+			copy_cqe_to_main_queue(hdev, &cqe_sw);
+	}
+
+	gaudi_nic->qp_err_ci = ci; /* save the new consumer index for the next event */
+	NIC_WREG32(mmNIC0_QPC0_ERR_FIFO_CONSUMER_INDEX, ci); /* and write it back to the device register */
+
+	/* signal the completion queue that there are available CQEs (done whenever the CQ is enabled, even if the loop above pushed none) */
+	if (gaudi->nic_cq_enable)
+		complete(&gaudi->nic_cq_comp);
+
+	mutex_unlock(&gaudi->nic_qp_err_lock);
+}
-- 
2.17.1


  parent reply	other threads:[~2020-09-15 17:36 UTC|newest]

Thread overview: 84+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2020-09-15 17:10 [PATCH v3 00/14] Adding GAUDI NIC code to habanalabs driver Oded Gabbay
2020-09-15 17:10 ` [PATCH v3 01/14] habanalabs/gaudi: add NIC H/W and registers definitions Oded Gabbay
2020-09-15 17:10 ` [PATCH v3 02/14] habanalabs/gaudi: add NIC firmware-related definitions Oded Gabbay
2020-09-15 17:10 ` [PATCH v3 03/14] habanalabs/gaudi: add NIC security configuration Oded Gabbay
2020-09-15 17:10 ` [PATCH v3 04/14] habanalabs/gaudi: add support for NIC QMANs Oded Gabbay
2020-09-15 17:10 ` [PATCH v3 05/14] habanalabs/gaudi: add NIC Ethernet support Oded Gabbay
2020-09-15 17:10 ` [PATCH v3 06/14] habanalabs/gaudi: add NIC PHY code Oded Gabbay
2020-09-15 17:10 ` [PATCH v3 07/14] habanalabs/gaudi: allow user to get MAC addresses in INFO IOCTL Oded Gabbay
2020-09-15 17:10 ` [PATCH v3 08/14] habanalabs/gaudi: add a new IOCTL for NIC control operations Oded Gabbay
2020-09-15 17:10 ` [PATCH v3 09/14] habanalabs/gaudi: add CQ " Oded Gabbay
2020-09-15 17:10 ` [PATCH v3 10/14] habanalabs/gaudi: add WQ " Oded Gabbay
2020-09-15 17:10 ` Oded Gabbay [this message]
2020-09-15 17:10 ` [PATCH v3 12/14] habanalabs/gaudi: Add ethtool support using coresight Oded Gabbay
2020-09-15 17:10 ` [PATCH v3 13/14] habanalabs/gaudi: support DCB protocol Oded Gabbay
2020-09-15 17:10 ` [PATCH v3 14/14] habanalabs/gaudi: add NIC init/fini calls from common code Oded Gabbay
2020-09-15 20:35 ` [PATCH v3 00/14] Adding GAUDI NIC code to habanalabs driver Jakub Kicinski
2020-09-15 20:46   ` Oded Gabbay
2020-09-15 21:04     ` Jakub Kicinski
2020-09-15 21:20       ` Oded Gabbay
2020-09-15 21:37         ` Andrew Lunn
2020-09-15 21:43           ` Oded Gabbay
2020-09-15 22:35             ` David Miller
2020-09-15 22:36           ` David Miller
2020-09-15 22:34         ` David Miller
2020-09-16  4:26           ` Oded Gabbay
2020-09-17 17:18     ` Jason Gunthorpe
2020-09-18 11:36       ` Gal Pressman
2020-09-18 11:52         ` Leon Romanovsky
2020-09-18 11:56           ` Oded Gabbay
2020-09-18 12:03             ` Leon Romanovsky
2020-09-18 12:07               ` Oded Gabbay
2020-09-18 12:19                 ` Leon Romanovsky
2020-09-18 12:31                   ` Oded Gabbay
2020-09-18 13:09                     ` Leon Romanovsky
2020-09-19  6:40                   ` Greg Kroah-Hartman
2020-09-19  8:20                     ` Leon Romanovsky
2020-09-19  8:30                       ` Greg Kroah-Hartman
2020-09-19  8:58                         ` Leon Romanovsky
2020-09-19 16:43                         ` Oded Gabbay
2020-09-19 17:27                           ` Greg Kroah-Hartman
2020-09-19 19:22                             ` Jason Gunthorpe
2020-09-20  8:47                               ` Greg Kroah-Hartman
2020-09-20 19:05                                 ` Oded Gabbay
2020-09-21 10:39                                   ` Leon Romanovsky
2020-09-21 11:52                                 ` Jason Gunthorpe
2020-09-21 21:20                                   ` Jakub Kicinski
2020-09-22 11:49                                     ` Jason Gunthorpe
2020-09-19 18:49                           ` Andrew Lunn
2020-09-18 11:56         ` Jason Gunthorpe
2020-09-18 11:59           ` Oded Gabbay
2020-09-18 12:16             ` Jason Gunthorpe
2020-09-18 12:34               ` Oded Gabbay
2020-09-18 12:50                 ` Jason Gunthorpe
2020-09-18 13:02                   ` Oded Gabbay
2020-09-18 13:26                     ` Jason Gunthorpe
2020-09-18 13:49                       ` Oded Gabbay
2020-09-18 13:59                         ` Jason Gunthorpe
2020-09-18 14:12                           ` Oded Gabbay
2020-09-18 14:19                             ` Jason Gunthorpe
2020-09-18 14:45                               ` Oded Gabbay
2020-09-18 15:07                                 ` Jason Gunthorpe
2020-09-18 15:15                                   ` Oded Gabbay
2020-09-18 15:28                                     ` Jason Gunthorpe
2020-09-21 11:22                                       ` Gal Pressman
2020-09-21 11:49                                         ` Leon Romanovsky
2020-09-22 11:41                                         ` Jason Gunthorpe
2020-09-22 12:46                                           ` Gal Pressman
2020-09-22 16:14                                             ` Jason Gunthorpe
2020-09-22 16:30                                               ` Gal Pressman
2020-09-22 16:52                                                 ` Jason Gunthorpe
2020-09-18 12:10         ` Oded Gabbay
2020-09-15 20:42 ` David Miller
2020-09-15 20:49   ` Oded Gabbay
2020-09-16  6:26     ` Greg Kroah-Hartman
2020-09-16  6:36       ` Oded Gabbay
2020-09-16  7:42         ` Greg Kroah-Hartman
2020-09-16  8:02           ` Oded Gabbay
2020-09-16  8:22             ` Greg Kroah-Hartman
2020-09-16  8:47               ` Oded Gabbay
2020-09-16 12:00                 ` Greg Kroah-Hartman
2020-09-20 16:45                   ` Daniel Vetter
2020-09-16 23:04               ` Williams, Dan J
2020-09-18 12:00 ` Jason Gunthorpe
2020-09-18 12:01   ` Oded Gabbay

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20200915171022.10561-12-oded.gabbay@gmail.com \
    --to=oded.gabbay@gmail.com \
    --cc=SW_Drivers@habana.ai \
    --cc=andrew@lunn.ch \
    --cc=davem@davemloft.net \
    --cc=f.fainelli@gmail.com \
    --cc=gregkh@linuxfoundation.org \
    --cc=kuba@kernel.org \
    --cc=linux-kernel@vger.kernel.org \
    --cc=netdev@vger.kernel.org \
    --cc=oshpigelman@habana.ai \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link.

Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.