All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH net-next 0/4] Mellanox driver update, Oct 13 2015
@ 2015-10-13 15:44 Or Gerlitz
  2015-10-13 15:44 ` [PATCH net-next 1/4] net/mlx5_core: Fix internal error detection conditions Or Gerlitz
                   ` (3 more replies)
  0 siblings, 4 replies; 9+ messages in thread
From: Or Gerlitz @ 2015-10-13 15:44 UTC (permalink / raw)
  To: David S. Miller; +Cc: netdev, Eli Cohen, Amir Vadai, Or Gerlitz

Hi Dave,

This series contains two more patches from Eli, patch from Majd
to support PCI error handlers and a fix from Jack to mlx4 VFs
when probed without a provisioned mac address.

The patch set applied on top of net-next commit bbb300e "Merge branch 'bridge-vlan'"

Or.

Eli Cohen (2):
  net/mlx5_core: Fix internal error detection conditions
  net/mlx5_core: Wait for FW readiness on startup

Jack Morgenstein (1):
  net/mlx4_core: Replace VF zero mac with random mac in mlx4_core

Majd Dibbiny (1):
  net/mlx5_core: Add pci error handlers to mlx5_core driver

 drivers/net/ethernet/mellanox/mlx4/en_netdev.c     |  23 ++-
 drivers/net/ethernet/mellanox/mlx4/fw.c            |  16 ++
 drivers/net/ethernet/mellanox/mlx4/main.c          |   2 +
 drivers/net/ethernet/mellanox/mlx4/mlx4.h          |   2 +
 drivers/net/ethernet/mellanox/mlx5/core/cmd.c      | 170 +++++++++++++++++
 drivers/net/ethernet/mellanox/mlx5/core/health.c   | 123 +++++++++++-
 drivers/net/ethernet/mellanox/mlx5/core/main.c     | 212 ++++++++++++++++++++-
 .../net/ethernet/mellanox/mlx5/core/mlx5_core.h    |   4 +
 .../net/ethernet/mellanox/mlx5/core/pagealloc.c    |  13 +-
 include/linux/mlx4/device.h                        |   1 +
 include/linux/mlx5/device.h                        |   3 +-
 include/linux/mlx5/driver.h                        |  28 +++
 12 files changed, 565 insertions(+), 32 deletions(-)

-- 
2.3.7

^ permalink raw reply	[flat|nested] 9+ messages in thread

* [PATCH net-next 1/4] net/mlx5_core: Fix internal error detection conditions
  2015-10-13 15:44 [PATCH net-next 0/4] Mellanox driver update, Oct 13 2015 Or Gerlitz
@ 2015-10-13 15:44 ` Or Gerlitz
  2015-10-14  2:46   ` David Miller
  2015-10-13 15:44 ` [PATCH net-next 2/4] net/mlx5_core: Add pci error handlers to mlx5_core driver Or Gerlitz
                   ` (2 subsequent siblings)
  3 siblings, 1 reply; 9+ messages in thread
From: Or Gerlitz @ 2015-10-13 15:44 UTC (permalink / raw)
  To: David S. Miller; +Cc: netdev, Eli Cohen, Amir Vadai, Or Gerlitz

From: Eli Cohen <eli@mellanox.com>

The detection of a fatal condition has been updated to take into account
the state reported by the device or by detecting an all ones read of the
firmware version which indicates that the device is not accessible.

Signed-off-by: Eli Cohen <eli@mellanox.com>
Signed-off-by: Or Gerlitz <ogerlitz@mellanox.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/health.c | 51 ++++++++++++++++++++----
 include/linux/mlx5/driver.h                      |  1 +
 2 files changed, 45 insertions(+), 7 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/health.c b/drivers/net/ethernet/mellanox/mlx5/core/health.c
index 9b81e1c..b007e83 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/health.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/health.c
@@ -57,6 +57,31 @@ enum {
 	MLX5_HEALTH_SYNDR_HIGH_TEMP		= 0x10
 };
 
+enum {
+	MLX5_NIC_IFC_FULL		= 0,
+	MLX5_NIC_IFC_DISABLED		= 1,
+	MLX5_NIC_IFC_NO_DRAM_NIC	= 2
+};
+
+static u8 get_nic_interface(struct mlx5_core_dev *dev)
+{
+	return (ioread32be(&dev->iseg->cmdq_addr_l_sz) >> 8) & 3;
+}
+
+static int in_fatal(struct mlx5_core_dev *dev)
+{
+	struct mlx5_core_health *health = &dev->priv.health;
+	struct health_buffer __iomem *h = health->health;
+
+	if (get_nic_interface(dev) == MLX5_NIC_IFC_DISABLED)
+		return 1;
+
+	if (ioread32be(&h->fw_ver) == 0xffffffff)
+		return 1;
+
+	return 0;
+}
+
 static void health_care(struct work_struct *work)
 {
 	struct mlx5_core_health *health;
@@ -136,11 +161,21 @@ static void print_health_info(struct mlx5_core_dev *dev)
 	dev_err(&dev->pdev->dev, "ext_synd 0x%04x\n", ioread16be(&h->ext_synd));
 }
 
+static unsigned long get_next_poll_jiffies(void)
+{
+	unsigned long next;
+
+	get_random_bytes(&next, sizeof(next));
+	next %= HZ;
+	next += jiffies + MLX5_HEALTH_POLL_INTERVAL;
+
+	return next;
+}
+
 static void poll_health(unsigned long data)
 {
 	struct mlx5_core_dev *dev = (struct mlx5_core_dev *)data;
 	struct mlx5_core_health *health = &dev->priv.health;
-	unsigned long next;
 	u32 count;
 
 	count = ioread32be(health->health_counter);
@@ -151,14 +186,16 @@ static void poll_health(unsigned long data)
 
 	health->prev = count;
 	if (health->miss_counter == MAX_MISSES) {
-		mlx5_core_err(dev, "device's health compromised\n");
+		dev_err(&dev->pdev->dev, "device's health compromised - reached miss count\n");
 		print_health_info(dev);
-		queue_work(health->wq, &health->work);
 	} else {
-		get_random_bytes(&next, sizeof(next));
-		next %= HZ;
-		next += jiffies + MLX5_HEALTH_POLL_INTERVAL;
-		mod_timer(&health->timer, next);
+		mod_timer(&health->timer, get_next_poll_jiffies());
+	}
+
+	if (in_fatal(dev) && !health->sick) {
+		health->sick = 1;
+		print_health_info(dev);
+		queue_work(health->wq, &health->work);
 	}
 }
 
diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h
index 41a3287..cd469b9 100644
--- a/include/linux/mlx5/driver.h
+++ b/include/linux/mlx5/driver.h
@@ -393,6 +393,7 @@ struct mlx5_core_health {
 	struct timer_list		timer;
 	u32				prev;
 	int				miss_counter;
+	int				sick;
 	struct workqueue_struct	       *wq;
 	struct work_struct		work;
 };
-- 
2.3.7

^ permalink raw reply related	[flat|nested] 9+ messages in thread

* [PATCH net-next 2/4] net/mlx5_core: Add pci error handlers to mlx5_core driver
  2015-10-13 15:44 [PATCH net-next 0/4] Mellanox driver update, Oct 13 2015 Or Gerlitz
  2015-10-13 15:44 ` [PATCH net-next 1/4] net/mlx5_core: Fix internal error detection conditions Or Gerlitz
@ 2015-10-13 15:44 ` Or Gerlitz
  2015-10-13 17:04   ` kbuild test robot
  2015-10-13 15:44 ` [PATCH net-next 3/4] net/mlx5_core: Wait for FW readiness on startup Or Gerlitz
  2015-10-13 15:44 ` [PATCH net-next 4/4] net/mlx4_core: Replace VF zero mac with random mac in mlx4_core Or Gerlitz
  3 siblings, 1 reply; 9+ messages in thread
From: Or Gerlitz @ 2015-10-13 15:44 UTC (permalink / raw)
  To: David S. Miller; +Cc: netdev, Eli Cohen, Amir Vadai, Majd Dibbiny, Or Gerlitz

From: Majd Dibbiny <majd@mellanox.com>

This patch implement the pci_error_handlers for mlx5_core which allow the
driver to recover from PCI error.

Once an error is detected in the PCI, the mlx5_pci_err_detected is called
and it:
1) Marks the device to be in 'Internal Error' state.
2) Dispatches an event to the mlx5_ib to flush all the outstanding cqes
with error.
3) Returns all the on going commands with error.
4) Unloads the driver.

Afterwards, the FW is reset and mlx5_pci_slot_reset is called and it
enables the device and restore it's pci state.

If the later succeeds, mlx5_pci_resume is called, and it loads the SW
stack.

Signed-off-by: Majd Dibbiny <majd@mellanox.com>
Signed-off-by: Eli Cohen <eli@mellanox.com>
Signed-off-by: Or Gerlitz <ogerlitz@mellanox.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/cmd.c      | 170 +++++++++++++++++++
 drivers/net/ethernet/mellanox/mlx5/core/health.c   |  72 ++++++++
 drivers/net/ethernet/mellanox/mlx5/core/main.c     | 185 ++++++++++++++++++++-
 .../net/ethernet/mellanox/mlx5/core/mlx5_core.h    |   4 +
 .../net/ethernet/mellanox/mlx5/core/pagealloc.c    |  13 +-
 include/linux/mlx5/driver.h                        |  22 +++
 6 files changed, 454 insertions(+), 12 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/cmd.c b/drivers/net/ethernet/mellanox/mlx5/core/cmd.c
index c3e54b7..9c0446b 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/cmd.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/cmd.c
@@ -256,8 +256,154 @@ static void dump_buf(void *buf, int size, int data_only, int offset)
 
 enum {
 	MLX5_DRIVER_STATUS_ABORTED = 0xfe,
+	MLX5_DRIVER_SYND = 0xbadd00de,
 };
 
+static int mlx5_internal_err_ret_value(struct mlx5_core_dev *dev, u16 op,
+				       u32 *synd, u8 *status)
+{
+	*synd = 0;
+	*status = 0;
+
+	switch (op) {
+	case MLX5_CMD_OP_TEARDOWN_HCA:
+	case MLX5_CMD_OP_DISABLE_HCA:
+	case MLX5_CMD_OP_MANAGE_PAGES:
+	case MLX5_CMD_OP_DESTROY_MKEY:
+	case MLX5_CMD_OP_DESTROY_EQ:
+	case MLX5_CMD_OP_DESTROY_CQ:
+	case MLX5_CMD_OP_DESTROY_QP:
+	case MLX5_CMD_OP_DESTROY_PSV:
+	case MLX5_CMD_OP_DESTROY_SRQ:
+	case MLX5_CMD_OP_DESTROY_XRC_SRQ:
+	case MLX5_CMD_OP_DESTROY_DCT:
+	case MLX5_CMD_OP_DEALLOC_Q_COUNTER:
+	case MLX5_CMD_OP_DEALLOC_PD:
+	case MLX5_CMD_OP_DEALLOC_UAR:
+	case MLX5_CMD_OP_DETTACH_FROM_MCG:
+	case MLX5_CMD_OP_DEALLOC_XRCD:
+	case MLX5_CMD_OP_DEALLOC_TRANSPORT_DOMAIN:
+	case MLX5_CMD_OP_DELETE_VXLAN_UDP_DPORT:
+	case MLX5_CMD_OP_DELETE_L2_TABLE_ENTRY:
+	case MLX5_CMD_OP_DESTROY_TIR:
+	case MLX5_CMD_OP_DESTROY_SQ:
+	case MLX5_CMD_OP_DESTROY_RQ:
+	case MLX5_CMD_OP_DESTROY_RMP:
+	case MLX5_CMD_OP_DESTROY_TIS:
+	case MLX5_CMD_OP_DESTROY_RQT:
+	case MLX5_CMD_OP_DESTROY_FLOW_TABLE:
+	case MLX5_CMD_OP_DESTROY_FLOW_GROUP:
+	case MLX5_CMD_OP_DELETE_FLOW_TABLE_ENTRY:
+		return MLX5_CMD_STAT_OK;
+
+	case MLX5_CMD_OP_QUERY_HCA_CAP:
+	case MLX5_CMD_OP_QUERY_ADAPTER:
+	case MLX5_CMD_OP_INIT_HCA:
+	case MLX5_CMD_OP_ENABLE_HCA:
+	case MLX5_CMD_OP_QUERY_PAGES:
+	case MLX5_CMD_OP_SET_HCA_CAP:
+	case MLX5_CMD_OP_QUERY_ISSI:
+	case MLX5_CMD_OP_SET_ISSI:
+	case MLX5_CMD_OP_CREATE_MKEY:
+	case MLX5_CMD_OP_QUERY_MKEY:
+	case MLX5_CMD_OP_QUERY_SPECIAL_CONTEXTS:
+	case MLX5_CMD_OP_PAGE_FAULT_RESUME:
+	case MLX5_CMD_OP_CREATE_EQ:
+	case MLX5_CMD_OP_QUERY_EQ:
+	case MLX5_CMD_OP_GEN_EQE:
+	case MLX5_CMD_OP_CREATE_CQ:
+	case MLX5_CMD_OP_QUERY_CQ:
+	case MLX5_CMD_OP_MODIFY_CQ:
+	case MLX5_CMD_OP_CREATE_QP:
+	case MLX5_CMD_OP_RST2INIT_QP:
+	case MLX5_CMD_OP_INIT2RTR_QP:
+	case MLX5_CMD_OP_RTR2RTS_QP:
+	case MLX5_CMD_OP_RTS2RTS_QP:
+	case MLX5_CMD_OP_SQERR2RTS_QP:
+	case MLX5_CMD_OP_2ERR_QP:
+	case MLX5_CMD_OP_2RST_QP:
+	case MLX5_CMD_OP_QUERY_QP:
+	case MLX5_CMD_OP_SQD_RTS_QP:
+	case MLX5_CMD_OP_INIT2INIT_QP:
+	case MLX5_CMD_OP_CREATE_PSV:
+	case MLX5_CMD_OP_CREATE_SRQ:
+	case MLX5_CMD_OP_QUERY_SRQ:
+	case MLX5_CMD_OP_ARM_RQ:
+	case MLX5_CMD_OP_CREATE_XRC_SRQ:
+	case MLX5_CMD_OP_QUERY_XRC_SRQ:
+	case MLX5_CMD_OP_ARM_XRC_SRQ:
+	case MLX5_CMD_OP_CREATE_DCT:
+	case MLX5_CMD_OP_DRAIN_DCT:
+	case MLX5_CMD_OP_QUERY_DCT:
+	case MLX5_CMD_OP_ARM_DCT_FOR_KEY_VIOLATION:
+	case MLX5_CMD_OP_QUERY_VPORT_STATE:
+	case MLX5_CMD_OP_MODIFY_VPORT_STATE:
+	case MLX5_CMD_OP_QUERY_ESW_VPORT_CONTEXT:
+	case MLX5_CMD_OP_MODIFY_ESW_VPORT_CONTEXT:
+	case MLX5_CMD_OP_QUERY_NIC_VPORT_CONTEXT:
+	case MLX5_CMD_OP_MODIFY_NIC_VPORT_CONTEXT:
+	case MLX5_CMD_OP_QUERY_ROCE_ADDRESS:
+	case MLX5_CMD_OP_SET_ROCE_ADDRESS:
+	case MLX5_CMD_OP_QUERY_HCA_VPORT_CONTEXT:
+	case MLX5_CMD_OP_MODIFY_HCA_VPORT_CONTEXT:
+	case MLX5_CMD_OP_QUERY_HCA_VPORT_GID:
+	case MLX5_CMD_OP_QUERY_HCA_VPORT_PKEY:
+	case MLX5_CMD_OP_QUERY_VPORT_COUNTER:
+	case MLX5_CMD_OP_ALLOC_Q_COUNTER:
+	case MLX5_CMD_OP_QUERY_Q_COUNTER:
+	case MLX5_CMD_OP_ALLOC_PD:
+	case MLX5_CMD_OP_ALLOC_UAR:
+	case MLX5_CMD_OP_CONFIG_INT_MODERATION:
+	case MLX5_CMD_OP_ACCESS_REG:
+	case MLX5_CMD_OP_ATTACH_TO_MCG:
+	case MLX5_CMD_OP_GET_DROPPED_PACKET_LOG:
+	case MLX5_CMD_OP_MAD_IFC:
+	case MLX5_CMD_OP_QUERY_MAD_DEMUX:
+	case MLX5_CMD_OP_SET_MAD_DEMUX:
+	case MLX5_CMD_OP_NOP:
+	case MLX5_CMD_OP_ALLOC_XRCD:
+	case MLX5_CMD_OP_ALLOC_TRANSPORT_DOMAIN:
+	case MLX5_CMD_OP_QUERY_CONG_STATUS:
+	case MLX5_CMD_OP_MODIFY_CONG_STATUS:
+	case MLX5_CMD_OP_QUERY_CONG_PARAMS:
+	case MLX5_CMD_OP_MODIFY_CONG_PARAMS:
+	case MLX5_CMD_OP_QUERY_CONG_STATISTICS:
+	case MLX5_CMD_OP_ADD_VXLAN_UDP_DPORT:
+	case MLX5_CMD_OP_SET_L2_TABLE_ENTRY:
+	case MLX5_CMD_OP_QUERY_L2_TABLE_ENTRY:
+	case MLX5_CMD_OP_CREATE_TIR:
+	case MLX5_CMD_OP_MODIFY_TIR:
+	case MLX5_CMD_OP_QUERY_TIR:
+	case MLX5_CMD_OP_CREATE_SQ:
+	case MLX5_CMD_OP_MODIFY_SQ:
+	case MLX5_CMD_OP_QUERY_SQ:
+	case MLX5_CMD_OP_CREATE_RQ:
+	case MLX5_CMD_OP_MODIFY_RQ:
+	case MLX5_CMD_OP_QUERY_RQ:
+	case MLX5_CMD_OP_CREATE_RMP:
+	case MLX5_CMD_OP_MODIFY_RMP:
+	case MLX5_CMD_OP_QUERY_RMP:
+	case MLX5_CMD_OP_CREATE_TIS:
+	case MLX5_CMD_OP_MODIFY_TIS:
+	case MLX5_CMD_OP_QUERY_TIS:
+	case MLX5_CMD_OP_CREATE_RQT:
+	case MLX5_CMD_OP_MODIFY_RQT:
+	case MLX5_CMD_OP_QUERY_RQT:
+	case MLX5_CMD_OP_CREATE_FLOW_TABLE:
+	case MLX5_CMD_OP_QUERY_FLOW_TABLE:
+	case MLX5_CMD_OP_CREATE_FLOW_GROUP:
+	case MLX5_CMD_OP_QUERY_FLOW_GROUP:
+	case MLX5_CMD_OP_SET_FLOW_TABLE_ENTRY:
+	case MLX5_CMD_OP_QUERY_FLOW_TABLE_ENTRY:
+		*status = MLX5_DRIVER_STATUS_ABORTED;
+		*synd = MLX5_DRIVER_SYND;
+		return -EIO;
+	default:
+		mlx5_core_err(dev, "Unknown FW command (%d)\n", op);
+		return -EINVAL;
+	}
+}
+
 const char *mlx5_command_str(int command)
 {
 	switch (command) {
@@ -592,6 +738,16 @@ static int wait_func(struct mlx5_core_dev *dev, struct mlx5_cmd_work_ent *ent)
 	return err;
 }
 
+static __be32 *get_synd_ptr(struct mlx5_outbox_hdr *out)
+{
+	return &out->syndrome;
+}
+
+static u8 *get_status_ptr(struct mlx5_outbox_hdr *out)
+{
+	return &out->status;
+}
+
 /*  Notes:
  *    1. Callback functions may not sleep
  *    2. page queue commands do not support asynchrous completion
@@ -1200,6 +1356,11 @@ static struct mlx5_cmd_msg *alloc_msg(struct mlx5_core_dev *dev, int in_size,
 	return msg;
 }
 
+static u16 opcode_from_in(struct mlx5_inbox_hdr *in)
+{
+	return be16_to_cpu(in->opcode);
+}
+
 static int is_manage_pages(struct mlx5_inbox_hdr *in)
 {
 	return be16_to_cpu(in->opcode) == MLX5_CMD_OP_MANAGE_PAGES;
@@ -1214,6 +1375,15 @@ static int cmd_exec(struct mlx5_core_dev *dev, void *in, int in_size, void *out,
 	gfp_t gfp;
 	int err;
 	u8 status = 0;
+	u32 drv_synd;
+
+	if (pci_channel_offline(dev->pdev) ||
+	    dev->state == MLX5_DEVICE_STATE_INTERNAL_ERROR) {
+		err = mlx5_internal_err_ret_value(dev, opcode_from_in(in), &drv_synd, &status);
+		*get_synd_ptr(out) = drv_synd;
+		*get_status_ptr(out) = status;
+		return err;
+	}
 
 	pages_queue = is_manage_pages(in);
 	gfp = callback ? GFP_ATOMIC : GFP_KERNEL;
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/health.c b/drivers/net/ethernet/mellanox/mlx5/core/health.c
index b007e83..4089c8f 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/health.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/health.c
@@ -34,6 +34,7 @@
 #include <linux/module.h>
 #include <linux/random.h>
 #include <linux/vmalloc.h>
+#include <linux/hardirq.h>
 #include <linux/mlx5/driver.h>
 #include <linux/mlx5/cmd.h>
 #include "mlx5_core.h"
@@ -68,6 +69,29 @@ static u8 get_nic_interface(struct mlx5_core_dev *dev)
 	return (ioread32be(&dev->iseg->cmdq_addr_l_sz) >> 8) & 3;
 }
 
+static void trigger_cmd_completions(struct mlx5_core_dev *dev)
+{
+	unsigned long flags;
+	u64 vector;
+
+	/* wait for pending handlers to complete */
+	synchronize_irq(dev->priv.msix_arr[MLX5_EQ_VEC_CMD].vector);
+	spin_lock_irqsave(&dev->cmd.alloc_lock, flags);
+	vector = ~dev->cmd.bitmask & ((1ul << (1 << dev->cmd.log_sz)) - 1);
+	if (!vector)
+		goto no_trig;
+
+	vector |= MLX5_TRIGGERED_CMD_COMP;
+	spin_unlock_irqrestore(&dev->cmd.alloc_lock, flags);
+
+	mlx5_core_dbg(dev, "vector 0x%llx\n", vector);
+	mlx5_cmd_comp_handler(dev, vector);
+	return;
+
+no_trig:
+	spin_unlock_irqrestore(&dev->cmd.alloc_lock, flags);
+}
+
 static int in_fatal(struct mlx5_core_dev *dev)
 {
 	struct mlx5_core_health *health = &dev->priv.health;
@@ -82,6 +106,43 @@ static int in_fatal(struct mlx5_core_dev *dev)
 	return 0;
 }
 
+void mlx5_enter_error_state(struct mlx5_core_dev *dev)
+{
+	if (dev->state == MLX5_DEVICE_STATE_INTERNAL_ERROR)
+		return;
+
+	mlx5_core_err(dev, "start\n");
+	if (pci_channel_offline(dev->pdev) || in_fatal(dev))
+		dev->state = MLX5_DEVICE_STATE_INTERNAL_ERROR;
+
+	mlx5_core_event(dev, MLX5_DEV_EVENT_SYS_ERROR, 0);
+	mlx5_core_err(dev, "end\n");
+}
+
+static void mlx5_handle_bad_state(struct mlx5_core_dev *dev)
+{
+	u8 nic_interface = get_nic_interface(dev);
+
+	switch (nic_interface) {
+	case MLX5_NIC_IFC_FULL:
+		mlx5_core_warn(dev, "Expected to see disabled NIC but it is full driver\n");
+		break;
+
+	case MLX5_NIC_IFC_DISABLED:
+		mlx5_core_warn(dev, "starting teardown\n");
+		break;
+
+	case MLX5_NIC_IFC_NO_DRAM_NIC:
+		mlx5_core_warn(dev, "Expected to see disabled NIC but it is no dram nic\n");
+		break;
+	default:
+		mlx5_core_warn(dev, "Expected to see disabled NIC but it is has invalid value %d\n",
+			       nic_interface);
+	}
+
+	mlx5_disable_device(dev);
+}
+
 static void health_care(struct work_struct *work)
 {
 	struct mlx5_core_health *health;
@@ -92,6 +153,7 @@ static void health_care(struct work_struct *work)
 	priv = container_of(health, struct mlx5_priv, health);
 	dev = container_of(priv, struct mlx5_core_dev, priv);
 	mlx5_core_warn(dev, "handling bad device here\n");
+	mlx5_handle_bad_state(dev);
 }
 
 static const char *hsynd_str(u8 synd)
@@ -147,6 +209,10 @@ static void print_health_info(struct mlx5_core_dev *dev)
 	u32 fw;
 	int i;
 
+	/* If the syndrom is 0, the device is OK and no need to print buffer */
+	if (!ioread8(&h->synd))
+		return;
+
 	for (i = 0; i < ARRAY_SIZE(h->assert_var); i++)
 		dev_err(&dev->pdev->dev, "assert_var[%d] 0x%08x\n", i, ioread32be(h->assert_var + i));
 
@@ -178,6 +244,12 @@ static void poll_health(unsigned long data)
 	struct mlx5_core_health *health = &dev->priv.health;
 	u32 count;
 
+	if (dev->state == MLX5_DEVICE_STATE_INTERNAL_ERROR) {
+		trigger_cmd_completions(dev);
+		mod_timer(&health->timer, get_next_poll_jiffies());
+		return;
+	}
+
 	count = ioread32be(health->health_counter);
 	if (count == health->prev)
 		++health->miss_counter;
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/main.c b/drivers/net/ethernet/mellanox/mlx5/core/main.c
index b6edc58..368c2ea 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/main.c
@@ -45,6 +45,7 @@
 #include <linux/mlx5/srq.h>
 #include <linux/debugfs.h>
 #include <linux/kmod.h>
+#include <linux/delay.h>
 #include <linux/mlx5/mlx5_ifc.h>
 #include "mlx5_core.h"
 
@@ -181,6 +182,34 @@ static int set_dma_caps(struct pci_dev *pdev)
 	return err;
 }
 
+static int mlx5_pci_enable_device(struct mlx5_core_dev *dev)
+{
+	struct pci_dev *pdev = dev->pdev;
+	int err = 0;
+
+	mutex_lock(&dev->pci_status_mutex);
+	if (dev->pci_status == MLX5_PCI_STATUS_DISABLED) {
+		err = pci_enable_device(pdev);
+		if (!err)
+			dev->pci_status = MLX5_PCI_STATUS_ENABLED;
+	}
+	mutex_unlock(&dev->pci_status_mutex);
+
+	return err;
+}
+
+static void mlx5_pci_disable_device(struct mlx5_core_dev *dev)
+{
+	struct pci_dev *pdev = dev->pdev;
+
+	mutex_lock(&dev->pci_status_mutex);
+	if (dev->pci_status == MLX5_PCI_STATUS_ENABLED) {
+		pci_disable_device(pdev);
+		dev->pci_status = MLX5_PCI_STATUS_DISABLED;
+	}
+	mutex_unlock(&dev->pci_status_mutex);
+}
+
 static int request_bar(struct pci_dev *pdev)
 {
 	int err = 0;
@@ -807,7 +836,7 @@ static int mlx5_pci_init(struct mlx5_core_dev *dev, struct mlx5_priv *priv)
 	if (!priv->dbg_root)
 		return -ENOMEM;
 
-	err = pci_enable_device(pdev);
+	err = mlx5_pci_enable_device(dev);
 	if (err) {
 		dev_err(&pdev->dev, "Cannot enable PCI device, aborting\n");
 		goto err_dbg;
@@ -841,7 +870,7 @@ err_clr_master:
 	pci_clear_master(dev->pdev);
 	release_bar(dev->pdev);
 err_disable:
-	pci_disable_device(dev->pdev);
+	mlx5_pci_disable_device(dev);
 
 err_dbg:
 	debugfs_remove(priv->dbg_root);
@@ -853,7 +882,7 @@ static void mlx5_pci_close(struct mlx5_core_dev *dev, struct mlx5_priv *priv)
 	iounmap(dev->iseg);
 	pci_clear_master(dev->pdev);
 	release_bar(dev->pdev);
-	pci_disable_device(dev->pdev);
+	mlx5_pci_disable_device(dev);
 	debugfs_remove(priv->dbg_root);
 }
 
@@ -863,13 +892,25 @@ static int mlx5_load_one(struct mlx5_core_dev *dev, struct mlx5_priv *priv)
 	struct pci_dev *pdev = dev->pdev;
 	int err;
 
+	mutex_lock(&dev->intf_state_mutex);
+	if (dev->interface_state == MLX5_INTERFACE_STATE_UP) {
+		dev_warn(&dev->pdev->dev, "%s: interface is up, NOP\n",
+			 __func__);
+		goto out;
+	}
+
 	dev_info(&pdev->dev, "firmware version: %d.%d.%d\n", fw_rev_maj(dev),
 		 fw_rev_min(dev), fw_rev_sub(dev));
 
+	/* on load removing any previous indication of internal error, device is
+	 * up
+	 */
+	dev->state = MLX5_DEVICE_STATE_UP;
+
 	err = mlx5_cmd_init(dev);
 	if (err) {
 		dev_err(&pdev->dev, "Failed initializing command interface, aborting\n");
-		return err;
+		goto out_err;
 	}
 
 	mlx5_pagealloc_init(dev);
@@ -994,6 +1035,10 @@ static int mlx5_load_one(struct mlx5_core_dev *dev, struct mlx5_priv *priv)
 	if (err)
 		pr_info("failed request module on %s\n", MLX5_IB_MOD);
 
+	dev->interface_state = MLX5_INTERFACE_STATE_UP;
+out:
+	mutex_unlock(&dev->intf_state_mutex);
+
 	return 0;
 
 err_reg_dev:
@@ -1024,7 +1069,7 @@ err_stop_poll:
 	mlx5_stop_health_poll(dev);
 	if (mlx5_cmd_teardown_hca(dev)) {
 		dev_err(&dev->pdev->dev, "tear_down_hca failed, skip cleanup\n");
-		return err;
+		goto out_err;
 	}
 
 err_pagealloc_stop:
@@ -1040,13 +1085,23 @@ err_pagealloc_cleanup:
 	mlx5_pagealloc_cleanup(dev);
 	mlx5_cmd_cleanup(dev);
 
+out_err:
+	dev->state = MLX5_DEVICE_STATE_INTERNAL_ERROR;
+	mutex_unlock(&dev->intf_state_mutex);
+
 	return err;
 }
 
 static int mlx5_unload_one(struct mlx5_core_dev *dev, struct mlx5_priv *priv)
 {
-	int err;
+	int err = 0;
 
+	mutex_lock(&dev->intf_state_mutex);
+	if (dev->interface_state == MLX5_INTERFACE_STATE_DOWN) {
+		dev_warn(&dev->pdev->dev, "%s: interface is down, NOP\n",
+			 __func__);
+		goto out;
+	}
 	mlx5_unregister_device(dev);
 	mlx5_cleanup_mr_table(dev);
 	mlx5_cleanup_srq_table(dev);
@@ -1072,10 +1127,12 @@ static int mlx5_unload_one(struct mlx5_core_dev *dev, struct mlx5_priv *priv)
 	mlx5_cmd_cleanup(dev);
 
 out:
+	dev->interface_state = MLX5_INTERFACE_STATE_DOWN;
+	mutex_unlock(&dev->intf_state_mutex);
 	return err;
 }
 
-static void mlx5_core_event(struct mlx5_core_dev *dev, enum mlx5_dev_event event,
+void mlx5_core_event(struct mlx5_core_dev *dev, enum mlx5_dev_event event,
 		     unsigned long param)
 {
 	struct mlx5_priv *priv = &dev->priv;
@@ -1125,6 +1182,8 @@ static int init_one(struct pci_dev *pdev,
 
 	INIT_LIST_HEAD(&priv->ctx_list);
 	spin_lock_init(&priv->ctx_lock);
+	mutex_init(&dev->pci_status_mutex);
+	mutex_init(&dev->intf_state_mutex);
 	err = mlx5_pci_init(dev, priv);
 	if (err) {
 		dev_err(&pdev->dev, "mlx5_pci_init failed with error code %d\n", err);
@@ -1172,6 +1231,115 @@ static void remove_one(struct pci_dev *pdev)
 	kfree(dev);
 }
 
+static pci_ers_result_t mlx5_pci_err_detected(struct pci_dev *pdev,
+					      pci_channel_state_t state)
+{
+	struct mlx5_core_dev *dev = pci_get_drvdata(pdev);
+	struct mlx5_priv *priv = &dev->priv;
+
+	dev_info(&pdev->dev, "%s was called\n", __func__);
+	mlx5_enter_error_state(dev);
+	mlx5_unload_one(dev, priv);
+	mlx5_pci_disable_device(dev);
+	return state == pci_channel_io_perm_failure ?
+		PCI_ERS_RESULT_DISCONNECT : PCI_ERS_RESULT_NEED_RESET;
+}
+
+static pci_ers_result_t mlx5_pci_slot_reset(struct pci_dev *pdev)
+{
+	struct mlx5_core_dev *dev = pci_get_drvdata(pdev);
+	int err = 0;
+
+	dev_info(&pdev->dev, "%s was called\n", __func__);
+
+	err = mlx5_pci_enable_device(dev);
+	if (err) {
+		dev_err(&pdev->dev, "%s: mlx5_pci_enable_device failed with error code: %d\n"
+			, __func__, err);
+		return PCI_ERS_RESULT_DISCONNECT;
+	}
+	pci_set_master(pdev);
+	pci_set_power_state(pdev, PCI_D0);
+	pci_restore_state(pdev);
+
+	return err ? PCI_ERS_RESULT_DISCONNECT : PCI_ERS_RESULT_RECOVERED;
+}
+
+void mlx5_disable_device(struct mlx5_core_dev *dev)
+{
+	mlx5_pci_err_detected(dev->pdev, 0);
+}
+
+/* wait for the device to show vital signs. For now we check
+ * that we can read the device ID and that the health buffer
+ * shows a non zero value which is different than 0xffffffff
+ */
+static void wait_vital(struct pci_dev *pdev)
+{
+	struct mlx5_core_dev *dev = pci_get_drvdata(pdev);
+	struct mlx5_core_health *health = &dev->priv.health;
+	const int niter = 100;
+	u32 count;
+	u16 did;
+	int i;
+
+	/* let's be on the safe side */
+	msleep(1000);
+	for (i = 0; i < niter; i++) {
+		if (pci_read_config_word(pdev, 2, &did)) {
+			dev_warn(&pdev->dev, "failed reading config word\n");
+			break;
+		}
+		if (did == pdev->device) {
+			dev_info(&pdev->dev, "device ID correctly read after %d iterations\n", i);
+			break;
+		}
+		msleep(50);
+	}
+	if (i == niter)
+		dev_warn(&pdev->dev, "%s-%d: could not read device ID\n", __func__, __LINE__);
+
+	for (i = 0; i < niter; i++) {
+		count = ioread32be(health->health_counter);
+		if (count && count != 0xffffffff) {
+			dev_info(&pdev->dev, "Counter value 0x%x after %d iterations\n", count, i);
+			break;
+		}
+		msleep(50);
+	}
+
+	if (i == niter)
+		dev_warn(&pdev->dev, "%s-%d: could not read device ID\n", __func__, __LINE__);
+
+	/* let's be on the safe side */
+	msleep(1000);
+}
+
+static void mlx5_pci_resume(struct pci_dev *pdev)
+{
+	struct mlx5_core_dev *dev = pci_get_drvdata(pdev);
+	struct mlx5_priv *priv = &dev->priv;
+	int err;
+
+	dev_info(&pdev->dev, "%s was called\n", __func__);
+
+	pci_save_state(pdev);
+	wait_vital(pdev);
+
+	err = mlx5_load_one(dev, priv);
+	if (err)
+		dev_err(&pdev->dev, "%s: mlx5_load_one failed with error code: %d\n"
+			, __func__, err);
+	else
+		dev_info(&pdev->dev, "%s: device recovered\n", __func__);
+}
+
+static const struct pci_error_handlers mlx5_err_handler = {
+	.error_detected = mlx5_pci_err_detected,
+	.slot_reset	= mlx5_pci_slot_reset,
+	.resume		= mlx5_pci_resume
+};
+
 static const struct pci_device_id mlx5_core_pci_table[] = {
 	{ PCI_VDEVICE(MELLANOX, 0x1011) }, /* Connect-IB */
 	{ PCI_VDEVICE(MELLANOX, 0x1012) }, /* Connect-IB VF */
@@ -1188,7 +1356,8 @@ static struct pci_driver mlx5_core_driver = {
 	.name           = DRIVER_NAME,
 	.id_table       = mlx5_core_pci_table,
 	.probe          = init_one,
-	.remove         = remove_one
+	.remove         = remove_one,
+	.err_handler	= &mlx5_err_handler
 };
 
 static int __init init(void)
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h b/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h
index 30c0be7..cee5b7a 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h
@@ -86,6 +86,10 @@ int mlx5_query_hca_caps(struct mlx5_core_dev *dev);
 int mlx5_query_board_id(struct mlx5_core_dev *dev);
 int mlx5_cmd_init_hca(struct mlx5_core_dev *dev);
 int mlx5_cmd_teardown_hca(struct mlx5_core_dev *dev);
+void mlx5_core_event(struct mlx5_core_dev *dev, enum mlx5_dev_event event,
+		     unsigned long param);
+void mlx5_enter_error_state(struct mlx5_core_dev *dev);
+void mlx5_disable_device(struct mlx5_core_dev *dev);
 
 void mlx5e_init(void);
 void mlx5e_cleanup(void);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/pagealloc.c b/drivers/net/ethernet/mellanox/mlx5/core/pagealloc.c
index 76432a5..1cda5d2 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/pagealloc.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/pagealloc.c
@@ -493,15 +493,20 @@ int mlx5_reclaim_startup_pages(struct mlx5_core_dev *dev)
 	struct fw_page *fwp;
 	struct rb_node *p;
 	int nclaimed = 0;
-	int err;
+	int err = 0;
 
 	do {
 		p = rb_first(&dev->priv.page_root);
 		if (p) {
 			fwp = rb_entry(p, struct fw_page, rb_node);
-			err = reclaim_pages(dev, fwp->func_id,
-					    optimal_reclaimed_pages(),
-					    &nclaimed);
+			if (dev->state == MLX5_DEVICE_STATE_INTERNAL_ERROR) {
+				free_4k(dev, fwp->addr);
+				nclaimed = 1;
+			} else {
+				err = reclaim_pages(dev, fwp->func_id,
+						    optimal_reclaimed_pages(),
+						    &nclaimed);
+			}
 			if (err) {
 				mlx5_core_warn(dev, "failed reclaiming pages (%d)\n",
 					       err);
diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h
index cd469b9..9d77b8d 100644
--- a/include/linux/mlx5/driver.h
+++ b/include/linux/mlx5/driver.h
@@ -487,8 +487,26 @@ struct mlx5_priv {
 	spinlock_t              ctx_lock;
 };
 
+enum mlx5_device_state {
+	MLX5_DEVICE_STATE_UP,
+	MLX5_DEVICE_STATE_INTERNAL_ERROR,
+};
+
+enum mlx5_interface_state {
+	MLX5_INTERFACE_STATE_DOWN,
+	MLX5_INTERFACE_STATE_UP,
+};
+
+enum mlx5_pci_status {
+	MLX5_PCI_STATUS_DISABLED,
+	MLX5_PCI_STATUS_ENABLED,
+};
+
 struct mlx5_core_dev {
 	struct pci_dev	       *pdev;
+	/* sync pci state */
+	struct mutex		pci_status_mutex;
+	enum mlx5_pci_status	pci_status;
 	u8			rev_id;
 	char			board_id[MLX5_BOARD_ID_LEN];
 	struct mlx5_cmd		cmd;
@@ -497,6 +515,10 @@ struct mlx5_core_dev {
 	u32 hca_caps_max[MLX5_CAP_NUM][MLX5_UN_SZ_DW(hca_cap_union)];
 	phys_addr_t		iseg_base;
 	struct mlx5_init_seg __iomem *iseg;
+	enum mlx5_device_state	state;
+	/* sync interface state */
+	struct mutex		intf_state_mutex;
+	enum mlx5_interface_state interface_state;
 	void			(*event) (struct mlx5_core_dev *dev,
 					  enum mlx5_dev_event event,
 					  unsigned long param);
-- 
2.3.7

^ permalink raw reply related	[flat|nested] 9+ messages in thread

* [PATCH net-next 3/4] net/mlx5_core: Wait for FW readiness on startup
  2015-10-13 15:44 [PATCH net-next 0/4] Mellanox driver update, Oct 13 2015 Or Gerlitz
  2015-10-13 15:44 ` [PATCH net-next 1/4] net/mlx5_core: Fix internal error detection conditions Or Gerlitz
  2015-10-13 15:44 ` [PATCH net-next 2/4] net/mlx5_core: Add pci error handlers to mlx5_core driver Or Gerlitz
@ 2015-10-13 15:44 ` Or Gerlitz
  2015-10-13 15:44 ` [PATCH net-next 4/4] net/mlx4_core: Replace VF zero mac with random mac in mlx4_core Or Gerlitz
  3 siblings, 0 replies; 9+ messages in thread
From: Or Gerlitz @ 2015-10-13 15:44 UTC (permalink / raw)
  To: David S. Miller; +Cc: netdev, Eli Cohen, Amir Vadai, Or Gerlitz

From: Eli Cohen <eli@mellanox.com>

On device initialization, wait till firmware indicates that that it is done
with initialization before proceeding to initialize the device.

Also update initialization segment layout to match driver/firmware
interface definitions.

Signed-off-by: Eli Cohen <eli@mellanox.com>
Signed-off-by: Or Gerlitz <ogerlitz@mellanox.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/main.c | 27 ++++++++++++++++++++++++++
 include/linux/mlx5/device.h                    |  3 ++-
 include/linux/mlx5/driver.h                    |  5 +++++
 3 files changed, 34 insertions(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/main.c b/drivers/net/ethernet/mellanox/mlx5/core/main.c
index 368c2ea..c55bb37 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/main.c
@@ -39,6 +39,7 @@
 #include <linux/slab.h>
 #include <linux/io-mapping.h>
 #include <linux/interrupt.h>
+#include <linux/delay.h>
 #include <linux/mlx5/driver.h>
 #include <linux/mlx5/cq.h>
 #include <linux/mlx5/qp.h>
@@ -152,6 +153,25 @@ static struct mlx5_profile profile[] = {
 	},
 };
 
+#define FW_INIT_TIMEOUT_MILI	2000
+#define FW_INIT_WAIT_MS		2
+
+static int wait_fw_init(struct mlx5_core_dev *dev, u32 max_wait_mili)
+{
+	unsigned long end = jiffies + msecs_to_jiffies(max_wait_mili);
+	int err = 0;
+
+	while (fw_initializing(dev)) {
+		if (time_after(jiffies, end)) {
+			err = -EBUSY;
+			break;
+		}
+		msleep(FW_INIT_WAIT_MS);
+	}
+
+	return err;
+}
+
 static int set_dma_caps(struct pci_dev *pdev)
 {
 	int err;
@@ -913,6 +933,13 @@ static int mlx5_load_one(struct mlx5_core_dev *dev, struct mlx5_priv *priv)
 		goto out_err;
 	}
 
+	err = wait_fw_init(dev, FW_INIT_TIMEOUT_MILI);
+	if (err) {
+		dev_err(&dev->pdev->dev, "Firmware over %d MS in initializing state, aborting\n",
+			FW_INIT_TIMEOUT_MILI);
+		goto out_err;
+	}
+
 	mlx5_pagealloc_init(dev);
 
 	err = mlx5_core_enable_hca(dev);
diff --git a/include/linux/mlx5/device.h b/include/linux/mlx5/device.h
index 2a0b956..0b473cb 100644
--- a/include/linux/mlx5/device.h
+++ b/include/linux/mlx5/device.h
@@ -439,7 +439,8 @@ struct mlx5_init_seg {
 	__be32			cmdq_addr_h;
 	__be32			cmdq_addr_l_sz;
 	__be32			cmd_dbell;
-	__be32			rsvd1[121];
+	__be32			rsvd1[120];
+	__be32			initializing;
 	struct health_buffer	health;
 	__be32			rsvd2[884];
 	__be32			health_counter;
diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h
index 9d77b8d..c2ea5aa 100644
--- a/include/linux/mlx5/driver.h
+++ b/include/linux/mlx5/driver.h
@@ -826,6 +826,11 @@ void mlx5_core_put_rsc(struct mlx5_core_rsc_common *common);
 int mlx5_query_odp_caps(struct mlx5_core_dev *dev,
 			struct mlx5_odp_caps *odp_caps);
 
+static inline int fw_initializing(struct mlx5_core_dev *dev)
+{
+	return ioread32be(&dev->iseg->initializing) >> 31;
+}
+
 static inline u32 mlx5_mkey_to_idx(u32 mkey)
 {
 	return mkey >> 8;
-- 
2.3.7

^ permalink raw reply related	[flat|nested] 9+ messages in thread

* [PATCH net-next 4/4] net/mlx4_core: Replace VF zero mac with random mac in mlx4_core
  2015-10-13 15:44 [PATCH net-next 0/4] Mellanox driver update, Oct 13 2015 Or Gerlitz
                   ` (2 preceding siblings ...)
  2015-10-13 15:44 ` [PATCH net-next 3/4] net/mlx5_core: Wait for FW readiness on startup Or Gerlitz
@ 2015-10-13 15:44 ` Or Gerlitz
  3 siblings, 0 replies; 9+ messages in thread
From: Or Gerlitz @ 2015-10-13 15:44 UTC (permalink / raw)
  To: David S. Miller
  Cc: netdev, Eli Cohen, Amir Vadai, Jack Morgenstein, Or Gerlitz

From: Jack Morgenstein <jackm@dev.mellanox.co.il>

By design, when no default MAC addresses are set in the Hypervisor for VFs,
the VFs are passed zero-macs. When such a MAC is received by the VF, it
generates a random MAC address and registers that MAC address
with the Hypervisor.

This random mac generation is currently done in the mlx4_en module.
There is a problem, though, if the mlx4_ib module is loaded by a VF before
the mlx4_en module. In this case, for RoCE, mlx4_ib will see the un-replaced
zero-mac and register that zero-mac as part of QP1 initialization.

Having a zero-mac in the port's MAC table creates problems for a
Baseboard Management Console. The BMC occasionally sends packets with a
zero-mac destination MAC. If there is a zero-mac present in the port's
MAC table, the FW will send such BMC packets to the host driver rather than
to the wire, and BMC will stop working.

To address this problem, we move the replacement of zero-mac addresses
with random-mac addresses to procedure mlx4_slave_cap(), which is part of the
driver startup for VFs, and is before activation of mlx4_ib and mlx4_en.
As a result, zero-mac addresses will never be registered in the port MAC table
by the driver.

In addition, when mlx4_en does initialize the net device, it needs to set
the NET_ADDR_RANDOM flag in the netdev structure if the address was
randomly generated. This is done so that udev on the VM does not create
a new device name after each VF probe (VM boot and such). To accomplish this,
we add a per-port flag in mlx4_dev which gets set whenever mlx4_core replaces
a zero-mac with a randomly-generated mac. This flag is examined when mlx4_en
initializes the net-device.

Fix was suggested by Matan Barak <matanb@mellanox.com>

Signed-off-by: Jack Morgenstein <jackm@dev.mellanox.co.il>
Signed-off-by: Or Gerlitz <ogerlitz@mellanox.com>
---
 drivers/net/ethernet/mellanox/mlx4/en_netdev.c | 23 +++++++++++------------
 drivers/net/ethernet/mellanox/mlx4/fw.c        | 16 ++++++++++++++++
 drivers/net/ethernet/mellanox/mlx4/main.c      |  2 ++
 drivers/net/ethernet/mellanox/mlx4/mlx4.h      |  2 ++
 include/linux/mlx4/device.h                    |  1 +
 5 files changed, 32 insertions(+), 12 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx4/en_netdev.c b/drivers/net/ethernet/mellanox/mlx4/en_netdev.c
index 597d892..886e1bc 100644
--- a/drivers/net/ethernet/mellanox/mlx4/en_netdev.c
+++ b/drivers/net/ethernet/mellanox/mlx4/en_netdev.c
@@ -2816,7 +2816,6 @@ int mlx4_en_init_netdev(struct mlx4_en_dev *mdev, int port,
 	struct mlx4_en_priv *priv;
 	int i;
 	int err;
-	u64 mac_u64;
 
 	dev = alloc_etherdev_mqs(sizeof(struct mlx4_en_priv),
 				 MAX_TX_RINGS, MAX_RX_RINGS);
@@ -2908,17 +2907,17 @@ int mlx4_en_init_netdev(struct mlx4_en_dev *mdev, int port,
 	dev->addr_len = ETH_ALEN;
 	mlx4_en_u64_to_mac(dev->dev_addr, mdev->dev->caps.def_mac[priv->port]);
 	if (!is_valid_ether_addr(dev->dev_addr)) {
-		if (mlx4_is_slave(priv->mdev->dev)) {
-			eth_hw_addr_random(dev);
-			en_warn(priv, "Assigned random MAC address %pM\n", dev->dev_addr);
-			mac_u64 = mlx4_mac_to_u64(dev->dev_addr);
-			mdev->dev->caps.def_mac[priv->port] = mac_u64;
-		} else {
-			en_err(priv, "Port: %d, invalid mac burned: %pM, quiting\n",
-			       priv->port, dev->dev_addr);
-			err = -EINVAL;
-			goto out;
-		}
+		en_err(priv, "Port: %d, invalid mac burned: %pM, quiting\n",
+		       priv->port, dev->dev_addr);
+		err = -EINVAL;
+		goto out;
+	} else if (mlx4_is_slave(priv->mdev->dev) &&
+		   (priv->mdev->dev->port_random_macs & 1 << priv->port)) {
+		/* Random MAC was assigned in mlx4_slave_cap
+		 * in mlx4_core module
+		 */
+		dev->addr_assign_type |= NET_ADDR_RANDOM;
+		en_warn(priv, "Assigned random MAC address %pM\n", dev->dev_addr);
 	}
 
 	memcpy(priv->current_mac, dev->dev_addr, sizeof(priv->current_mac));
diff --git a/drivers/net/ethernet/mellanox/mlx4/fw.c b/drivers/net/ethernet/mellanox/mlx4/fw.c
index e8ec1de..f13a4d7 100644
--- a/drivers/net/ethernet/mellanox/mlx4/fw.c
+++ b/drivers/net/ethernet/mellanox/mlx4/fw.c
@@ -2840,3 +2840,19 @@ int set_phv_bit(struct mlx4_dev *dev, u8 port, int new_val)
 	return -EOPNOTSUPP;
 }
 EXPORT_SYMBOL(set_phv_bit);
+
+void mlx4_replace_zero_macs(struct mlx4_dev *dev)
+{
+	int i;
+	u8 mac_addr[ETH_ALEN];
+
+	dev->port_random_macs = 0;
+	for (i = 1; i <= dev->caps.num_ports; ++i)
+		if (!dev->caps.def_mac[i] &&
+		    dev->caps.port_type[i] == MLX4_PORT_TYPE_ETH) {
+			eth_random_addr(mac_addr);
+			dev->port_random_macs |= 1 << i;
+			dev->caps.def_mac[i] = mlx4_mac_to_u64(mac_addr);
+		}
+}
+EXPORT_SYMBOL_GPL(mlx4_replace_zero_macs);
diff --git a/drivers/net/ethernet/mellanox/mlx4/main.c b/drivers/net/ethernet/mellanox/mlx4/main.c
index 006757f..bcbdfab 100644
--- a/drivers/net/ethernet/mellanox/mlx4/main.c
+++ b/drivers/net/ethernet/mellanox/mlx4/main.c
@@ -863,6 +863,8 @@ static int mlx4_slave_cap(struct mlx4_dev *dev)
 		return -ENODEV;
 	}
 
+	mlx4_replace_zero_macs(dev);
+
 	dev->caps.qp0_qkey = kcalloc(dev->caps.num_ports, sizeof(u32), GFP_KERNEL);
 	dev->caps.qp0_tunnel = kcalloc(dev->caps.num_ports, sizeof (u32), GFP_KERNEL);
 	dev->caps.qp0_proxy = kcalloc(dev->caps.num_ports, sizeof (u32), GFP_KERNEL);
diff --git a/drivers/net/ethernet/mellanox/mlx4/mlx4.h b/drivers/net/ethernet/mellanox/mlx4/mlx4.h
index 232b2b5..e1cf903 100644
--- a/drivers/net/ethernet/mellanox/mlx4/mlx4.h
+++ b/drivers/net/ethernet/mellanox/mlx4/mlx4.h
@@ -1378,6 +1378,8 @@ void mlx4_vf_immed_vlan_work_handler(struct work_struct *_work);
 
 void mlx4_init_quotas(struct mlx4_dev *dev);
 
+/* for VFs, replace zero MACs with randomly-generated MACs at driver start */
+void mlx4_replace_zero_macs(struct mlx4_dev *dev);
 int mlx4_get_slave_num_gids(struct mlx4_dev *dev, int slave, int port);
 /* Returns the VF index of slave */
 int mlx4_get_vf_indx(struct mlx4_dev *dev, int slave);
diff --git a/include/linux/mlx4/device.h b/include/linux/mlx4/device.h
index baad4cb..5a8677b 100644
--- a/include/linux/mlx4/device.h
+++ b/include/linux/mlx4/device.h
@@ -833,6 +833,7 @@ struct mlx4_dev {
 	struct mlx4_quotas	quotas;
 	struct radix_tree_root	qp_table_tree;
 	u8			rev_id;
+	u8			port_random_macs;
 	char			board_id[MLX4_BOARD_ID_LEN];
 	int			numa_node;
 	int			oper_log_mgm_entry_size;
-- 
2.3.7

^ permalink raw reply related	[flat|nested] 9+ messages in thread

* Re: [PATCH net-next 2/4] net/mlx5_core: Add pci error handlers to mlx5_core driver
  2015-10-13 15:44 ` [PATCH net-next 2/4] net/mlx5_core: Add pci error handlers to mlx5_core driver Or Gerlitz
@ 2015-10-13 17:04   ` kbuild test robot
  2015-10-14  4:34     ` Or Gerlitz
  0 siblings, 1 reply; 9+ messages in thread
From: kbuild test robot @ 2015-10-13 17:04 UTC (permalink / raw)
  To: Or Gerlitz
  Cc: kbuild-all, David S. Miller, netdev, Eli Cohen, Amir Vadai,
	Majd Dibbiny, Or Gerlitz

Hi Majd,

[auto build test WARNING on net-next/master -- if it's inappropriate base, please suggest rules for selecting the more suitable base]

url:    https://github.com/0day-ci/linux/commits/Or-Gerlitz/net-mlx5_core-Fix-internal-error-detection-conditions/20151013-234855
reproduce:
        # apt-get install sparse
        make ARCH=x86_64 allmodconfig
        make C=1 CF=-D__CHECK_ENDIAN__


sparse warnings: (new ones prefixed by >>)

>> drivers/net/ethernet/mellanox/mlx5/core/cmd.c:1383:36: sparse: incorrect type in assignment (different base types)
   drivers/net/ethernet/mellanox/mlx5/core/cmd.c:1383:36:    expected restricted __be32 [usertype] <noident>
   drivers/net/ethernet/mellanox/mlx5/core/cmd.c:1383:36:    got unsigned int [unsigned] [addressable] [usertype] drv_synd

vim +1383 drivers/net/ethernet/mellanox/mlx5/core/cmd.c

  1367	}
  1368	
  1369	static int cmd_exec(struct mlx5_core_dev *dev, void *in, int in_size, void *out,
  1370			    int out_size, mlx5_cmd_cbk_t callback, void *context)
  1371	{
  1372		struct mlx5_cmd_msg *inb;
  1373		struct mlx5_cmd_msg *outb;
  1374		int pages_queue;
  1375		gfp_t gfp;
  1376		int err;
  1377		u8 status = 0;
  1378		u32 drv_synd;
  1379	
  1380		if (pci_channel_offline(dev->pdev) ||
  1381		    dev->state == MLX5_DEVICE_STATE_INTERNAL_ERROR) {
  1382			err = mlx5_internal_err_ret_value(dev, opcode_from_in(in), &drv_synd, &status);
> 1383			*get_synd_ptr(out) = drv_synd;
  1384			*get_status_ptr(out) = status;
  1385			return err;
  1386		}
  1387	
  1388		pages_queue = is_manage_pages(in);
  1389		gfp = callback ? GFP_ATOMIC : GFP_KERNEL;
  1390	
  1391		inb = alloc_msg(dev, in_size, gfp);

---
0-DAY kernel test infrastructure                Open Source Technology Center
https://lists.01.org/pipermail/kbuild-all                   Intel Corporation

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH net-next 1/4] net/mlx5_core: Fix internal error detection conditions
  2015-10-13 15:44 ` [PATCH net-next 1/4] net/mlx5_core: Fix internal error detection conditions Or Gerlitz
@ 2015-10-14  2:46   ` David Miller
  2015-10-14 14:47     ` Or Gerlitz
  0 siblings, 1 reply; 9+ messages in thread
From: David Miller @ 2015-10-14  2:46 UTC (permalink / raw)
  To: ogerlitz; +Cc: netdev, eli, amirv

From: Or Gerlitz <ogerlitz@mellanox.com>
Date: Tue, 13 Oct 2015 18:44:06 +0300

> +	if (in_fatal(dev) && !health->sick) {
> +		health->sick = 1;
> +		print_health_info(dev);
> +		queue_work(health->wq, &health->work);
>  	}
>  }
>  
> diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h
> index 41a3287..cd469b9 100644
> --- a/include/linux/mlx5/driver.h
> +++ b/include/linux/mlx5/driver.h
> @@ -393,6 +393,7 @@ struct mlx5_core_health {
>  	struct timer_list		timer;
>  	u32				prev;
>  	int				miss_counter;
> +	int				sick;
>  	struct workqueue_struct	       *wq;
>  	struct work_struct		work;
>  };
> -- 
> 2.3.7
> 

Please use "bool" and true/false for boolean states.

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH net-next 2/4] net/mlx5_core: Add pci error handlers to mlx5_core driver
  2015-10-13 17:04   ` kbuild test robot
@ 2015-10-14  4:34     ` Or Gerlitz
  0 siblings, 0 replies; 9+ messages in thread
From: Or Gerlitz @ 2015-10-14  4:34 UTC (permalink / raw)
  To: Majd Dibbiny; +Cc: netdev, Eli Cohen, Amir Vadai



On 10/13/2015 8:04 PM, kbuild test robot wrote:
> Hi Majd,
>
> [auto build test WARNING on net-next/master -- if it's inappropriate base, please suggest rules for selecting the more suitable base]
>
> url:    https://github.com/0day-ci/linux/commits/Or-Gerlitz/net-mlx5_core-Fix-internal-error-detection-conditions/20151013-234855
> reproduce:
>          # apt-get install sparse
>          make ARCH=x86_64 allmodconfig
>          make C=1 CF=-D__CHECK_ENDIAN__
>
>
> sparse warnings: (new ones prefixed by >>)

Hi Majd,

Please address and push fix to Gerrit, IM me when this is ready.

Or.


>
>>> drivers/net/ethernet/mellanox/mlx5/core/cmd.c:1383:36: sparse: incorrect type in assignment (different base types)
>     drivers/net/ethernet/mellanox/mlx5/core/cmd.c:1383:36:    expected restricted __be32 [usertype] <noident>
>     drivers/net/ethernet/mellanox/mlx5/core/cmd.c:1383:36:    got unsigned int [unsigned] [addressable] [usertype] drv_synd
>
> vim +1383 drivers/net/ethernet/mellanox/mlx5/core/cmd.c
>
>    1367	}
>    1368	
>    1369	static int cmd_exec(struct mlx5_core_dev *dev, void *in, int in_size, void *out,
>    1370			    int out_size, mlx5_cmd_cbk_t callback, void *context)
>    1371	{
>    1372		struct mlx5_cmd_msg *inb;
>    1373		struct mlx5_cmd_msg *outb;
>    1374		int pages_queue;
>    1375		gfp_t gfp;
>    1376		int err;
>    1377		u8 status = 0;
>    1378		u32 drv_synd;
>    1379	
>    1380		if (pci_channel_offline(dev->pdev) ||
>    1381		    dev->state == MLX5_DEVICE_STATE_INTERNAL_ERROR) {
>    1382			err = mlx5_internal_err_ret_value(dev, opcode_from_in(in), &drv_synd, &status);
>> 1383			*get_synd_ptr(out) = drv_synd;
>    1384			*get_status_ptr(out) = status;
>    1385			return err;
>    1386		}
>    1387	
>

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH net-next 1/4] net/mlx5_core: Fix internal error detection conditions
  2015-10-14  2:46   ` David Miller
@ 2015-10-14 14:47     ` Or Gerlitz
  0 siblings, 0 replies; 9+ messages in thread
From: Or Gerlitz @ 2015-10-14 14:47 UTC (permalink / raw)
  To: David Miller; +Cc: Or Gerlitz, Linux Netdev List, Eli Cohen, Amir Vadai

On Wed, Oct 14, 2015 at 5:46 AM, David Miller <davem@davemloft.net> wrote:
> From: Or Gerlitz <ogerlitz@mellanox.com>
> Date: Tue, 13 Oct 2015 18:44:06 +0300
>
>> +     if (in_fatal(dev) && !health->sick) {
>> +             health->sick = 1;
>> +             print_health_info(dev);
>> +             queue_work(health->wq, &health->work);
>>       }
>>  }
>>
>> diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h
>> index 41a3287..cd469b9 100644
>> --- a/include/linux/mlx5/driver.h
>> +++ b/include/linux/mlx5/driver.h
>> @@ -393,6 +393,7 @@ struct mlx5_core_health {
>>       struct timer_list               timer;
>>       u32                             prev;
>>       int                             miss_counter;
>> +     int                             sick;
>>       struct workqueue_struct        *wq;
>>       struct work_struct              work;
>>  };

> Please use "bool" and true/false for boolean states.

sure, will fix (just sent that..) in V1

^ permalink raw reply	[flat|nested] 9+ messages in thread

end of thread, other threads:[~2015-10-14 14:47 UTC | newest]

Thread overview: 9+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2015-10-13 15:44 [PATCH net-next 0/4] Mellanox driver update, Oct 13 2015 Or Gerlitz
2015-10-13 15:44 ` [PATCH net-next 1/4] net/mlx5_core: Fix internal error detection conditions Or Gerlitz
2015-10-14  2:46   ` David Miller
2015-10-14 14:47     ` Or Gerlitz
2015-10-13 15:44 ` [PATCH net-next 2/4] net/mlx5_core: Add pci error handlers to mlx5_core driver Or Gerlitz
2015-10-13 17:04   ` kbuild test robot
2015-10-14  4:34     ` Or Gerlitz
2015-10-13 15:44 ` [PATCH net-next 3/4] net/mlx5_core: Wait for FW readiness on startup Or Gerlitz
2015-10-13 15:44 ` [PATCH net-next 4/4] net/mlx4_core: Replace VF zero mac with random mac in mlx4_core Or Gerlitz

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.