From: Saeed Mahameed <saeed@kernel.org>
To: Saeed Mahameed <saeedm@nvidia.com>, Leon Romanovsky <leonro@nvidia.com>
Cc: netdev@vger.kernel.org, linux-rdma@vger.kernel.org,
	Mark Bloch <mbloch@nvidia.com>, Mark Zhang <markzhang@nvidia.com>
Subject: [PATCH mlx5-next 05/14] RDMA/mlx5: Add shared FDB support
Date: Tue,  3 Aug 2021 16:19:50 -0700
Message-ID: <20210803231959.26513-6-saeed@kernel.org>
In-Reply-To: <20210803231959.26513-1-saeed@kernel.org>

From: Mark Bloch <mbloch@nvidia.com>

Shared FDB allows creating a single RDMA device that holds representors
from both eswitches. As shared FDB is only active when both uplink
representors are enslaved, there is a single RDMA port that represents
both uplinks.

The number of ports is the total number of vports on both eswitches
minus one, as only a single IB port is needed for both uplinks.
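
For clarity, a minimal sketch of the resulting port count (this mirrors
the num_ports hunk in mlx5_ib_vport_rep_load() in the diff below and
uses only helpers from this series; it is not additional driver code):

	/* Sketch only: port count when shared FDB is active. Both
	 * eswitches contribute their vports, minus one because the
	 * two uplink representors share a single IB port.
	 */
	u32 num_ports = mlx5_eswitch_get_total_vports(dev);

	if (mlx5_lag_is_shared_fdb(dev) && mlx5_lag_is_master(dev)) {
		struct mlx5_core_dev *peer_dev = mlx5_lag_get_peer_mdev(dev);

		num_ports += mlx5_eswitch_get_total_vports(peer_dev) - 1;
	}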

Signed-off-by: Mark Bloch <mbloch@nvidia.com>
Reviewed-by: Mark Zhang <markzhang@nvidia.com>
Signed-off-by: Saeed Mahameed <saeedm@nvidia.com>
---
 drivers/infiniband/hw/mlx5/ib_rep.c | 75 ++++++++++++++++++++++++++---
 drivers/infiniband/hw/mlx5/main.c   | 44 ++++++++++-------
 2 files changed, 95 insertions(+), 24 deletions(-)

diff --git a/drivers/infiniband/hw/mlx5/ib_rep.c b/drivers/infiniband/hw/mlx5/ib_rep.c
index bf5a6e4d1c03..52821485371a 100644
--- a/drivers/infiniband/hw/mlx5/ib_rep.c
+++ b/drivers/infiniband/hw/mlx5/ib_rep.c
@@ -8,13 +8,15 @@
 #include "srq.h"
 
 static int
-mlx5_ib_set_vport_rep(struct mlx5_core_dev *dev, struct mlx5_eswitch_rep *rep)
+mlx5_ib_set_vport_rep(struct mlx5_core_dev *dev,
+		      struct mlx5_eswitch_rep *rep,
+		      int vport_index)
 {
 	struct mlx5_ib_dev *ibdev;
-	int vport_index;
 
 	ibdev = mlx5_eswitch_uplink_get_proto_dev(dev->priv.eswitch, REP_IB);
-	vport_index = rep->vport_index;
+	if (!ibdev)
+		return -EINVAL;
 
 	ibdev->port[vport_index].rep = rep;
 	rep->rep_data[REP_IB].priv = ibdev;
@@ -26,19 +28,39 @@ mlx5_ib_set_vport_rep(struct mlx5_core_dev *dev, struct mlx5_eswitch_rep *rep)
 	return 0;
 }
 
+static void mlx5_ib_register_peer_vport_reps(struct mlx5_core_dev *mdev);
+
 static int
 mlx5_ib_vport_rep_load(struct mlx5_core_dev *dev, struct mlx5_eswitch_rep *rep)
 {
 	u32 num_ports = mlx5_eswitch_get_total_vports(dev);
 	const struct mlx5_ib_profile *profile;
+	struct mlx5_core_dev *peer_dev;
 	struct mlx5_ib_dev *ibdev;
+	u32 peer_num_ports;
 	int vport_index;
 	int ret;
 
+	vport_index = rep->vport_index;
+
+	if (mlx5_lag_is_shared_fdb(dev)) {
+		peer_dev = mlx5_lag_get_peer_mdev(dev);
+		peer_num_ports = mlx5_eswitch_get_total_vports(peer_dev);
+		if (mlx5_lag_is_master(dev)) {
+			/* Only 1 ib port is the representor for both uplinks */
+			num_ports += peer_num_ports - 1;
+		} else {
+			if (rep->vport == MLX5_VPORT_UPLINK)
+				return 0;
+			vport_index += peer_num_ports;
+			dev = peer_dev;
+		}
+	}
+
 	if (rep->vport == MLX5_VPORT_UPLINK)
 		profile = &raw_eth_profile;
 	else
-		return mlx5_ib_set_vport_rep(dev, rep);
+		return mlx5_ib_set_vport_rep(dev, rep, vport_index);
 
 	ibdev = ib_alloc_device(mlx5_ib_dev, ib_dev);
 	if (!ibdev)
@@ -64,6 +86,8 @@ mlx5_ib_vport_rep_load(struct mlx5_core_dev *dev, struct mlx5_eswitch_rep *rep)
 		goto fail_add;
 
 	rep->rep_data[REP_IB].priv = ibdev;
+	if (mlx5_lag_is_shared_fdb(dev))
+		mlx5_ib_register_peer_vport_reps(dev);
 
 	return 0;
 
@@ -82,18 +106,45 @@ static void *mlx5_ib_rep_to_dev(struct mlx5_eswitch_rep *rep)
 static void
 mlx5_ib_vport_rep_unload(struct mlx5_eswitch_rep *rep)
 {
+	struct mlx5_core_dev *mdev = mlx5_eswitch_get_core_dev(rep->esw);
 	struct mlx5_ib_dev *dev = mlx5_ib_rep_to_dev(rep);
+	int vport_index = rep->vport_index;
 	struct mlx5_ib_port *port;
 
-	port = &dev->port[rep->vport_index];
+	if (WARN_ON(!mdev))
+		return;
+
+	if (mlx5_lag_is_shared_fdb(mdev) &&
+	    !mlx5_lag_is_master(mdev)) {
+		struct mlx5_core_dev *peer_mdev;
+
+		if (rep->vport == MLX5_VPORT_UPLINK)
+			return;
+		peer_mdev = mlx5_lag_get_peer_mdev(mdev);
+		vport_index += mlx5_eswitch_get_total_vports(peer_mdev);
+	}
+
+	if (!dev)
+		return;
+
+	port = &dev->port[vport_index];
 	write_lock(&port->roce.netdev_lock);
 	port->roce.netdev = NULL;
 	write_unlock(&port->roce.netdev_lock);
 	rep->rep_data[REP_IB].priv = NULL;
 	port->rep = NULL;
 
-	if (rep->vport == MLX5_VPORT_UPLINK)
+	if (rep->vport == MLX5_VPORT_UPLINK) {
+		struct mlx5_core_dev *peer_mdev;
+		struct mlx5_eswitch *esw;
+
+		if (mlx5_lag_is_shared_fdb(mdev)) {
+			peer_mdev = mlx5_lag_get_peer_mdev(mdev);
+			esw = peer_mdev->priv.eswitch;
+			mlx5_eswitch_unregister_vport_reps(esw, REP_IB);
+		}
 		__mlx5_ib_remove(dev, dev->profile, MLX5_IB_STAGE_MAX);
+	}
 }
 
 static const struct mlx5_eswitch_rep_ops rep_ops = {
@@ -102,6 +153,18 @@ static const struct mlx5_eswitch_rep_ops rep_ops = {
 	.get_proto_dev = mlx5_ib_rep_to_dev,
 };
 
+static void mlx5_ib_register_peer_vport_reps(struct mlx5_core_dev *mdev)
+{
+	struct mlx5_core_dev *peer_mdev = mlx5_lag_get_peer_mdev(mdev);
+	struct mlx5_eswitch *esw;
+
+	if (!peer_mdev)
+		return;
+
+	esw = peer_mdev->priv.eswitch;
+	mlx5_eswitch_register_vport_reps(esw, &rep_ops, REP_IB);
+}
+
 struct net_device *mlx5_ib_get_rep_netdev(struct mlx5_eswitch *esw,
 					  u16 vport_num)
 {
diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c
index 094c976b1eed..ae05e143401c 100644
--- a/drivers/infiniband/hw/mlx5/main.c
+++ b/drivers/infiniband/hw/mlx5/main.c
@@ -126,6 +126,7 @@ static int get_port_state(struct ib_device *ibdev,
 
 static struct mlx5_roce *mlx5_get_rep_roce(struct mlx5_ib_dev *dev,
 					   struct net_device *ndev,
+					   struct net_device *upper,
 					   u32 *port_num)
 {
 	struct net_device *rep_ndev;
@@ -137,6 +138,14 @@ static struct mlx5_roce *mlx5_get_rep_roce(struct mlx5_ib_dev *dev,
 		if (!port->rep)
 			continue;
 
+		if (upper == ndev && port->rep->vport == MLX5_VPORT_UPLINK) {
+			*port_num = i + 1;
+			return &port->roce;
+		}
+
+		if (upper && port->rep->vport == MLX5_VPORT_UPLINK)
+			continue;
+
 		read_lock(&port->roce.netdev_lock);
 		rep_ndev = mlx5_ib_get_rep_netdev(port->rep->esw,
 						  port->rep->vport);
@@ -196,11 +205,12 @@ static int mlx5_netdev_event(struct notifier_block *this,
 		}
 
 		if (ibdev->is_rep)
-			roce = mlx5_get_rep_roce(ibdev, ndev, &port_num);
+			roce = mlx5_get_rep_roce(ibdev, ndev, upper, &port_num);
 		if (!roce)
 			return NOTIFY_DONE;
-		if ((upper == ndev || (!upper && ndev == roce->netdev))
-		    && ibdev->ib_active) {
+		if ((upper == ndev ||
+		     ((!upper || ibdev->is_rep) && ndev == roce->netdev)) &&
+		    ibdev->ib_active) {
 			struct ib_event ibev = { };
 			enum ib_port_state port_state;
 
@@ -3012,7 +3022,7 @@ static int mlx5_eth_lag_init(struct mlx5_ib_dev *dev)
 	struct mlx5_flow_table *ft;
 	int err;
 
-	if (!ns || !mlx5_lag_is_roce(mdev))
+	if (!ns || !mlx5_lag_is_active(mdev))
 		return 0;
 
 	err = mlx5_cmd_create_vport_lag(mdev);
@@ -3074,9 +3084,11 @@ static int mlx5_enable_eth(struct mlx5_ib_dev *dev)
 {
 	int err;
 
-	err = mlx5_nic_vport_enable_roce(dev->mdev);
-	if (err)
-		return err;
+	if (!dev->is_rep && dev->profile != &raw_eth_profile) {
+		err = mlx5_nic_vport_enable_roce(dev->mdev);
+		if (err)
+			return err;
+	}
 
 	err = mlx5_eth_lag_init(dev);
 	if (err)
@@ -3085,7 +3097,8 @@ static int mlx5_enable_eth(struct mlx5_ib_dev *dev)
 	return 0;
 
 err_disable_roce:
-	mlx5_nic_vport_disable_roce(dev->mdev);
+	if (!dev->is_rep && dev->profile != &raw_eth_profile)
+		mlx5_nic_vport_disable_roce(dev->mdev);
 
 	return err;
 }
@@ -3093,7 +3106,8 @@ static int mlx5_enable_eth(struct mlx5_ib_dev *dev)
 static void mlx5_disable_eth(struct mlx5_ib_dev *dev)
 {
 	mlx5_eth_lag_cleanup(dev);
-	mlx5_nic_vport_disable_roce(dev->mdev);
+	if (!dev->is_rep && dev->profile != &raw_eth_profile)
+		mlx5_nic_vport_disable_roce(dev->mdev);
 }
 
 static int mlx5_ib_rn_get_params(struct ib_device *device, u32 port_num,
@@ -3950,12 +3964,7 @@ static int mlx5_ib_roce_init(struct mlx5_ib_dev *dev)
 
 		/* Register only for native ports */
 		err = mlx5_add_netdev_notifier(dev, port_num);
-		if (err || dev->is_rep || !mlx5_is_roce_init_enabled(mdev))
-			/*
-			 * We don't enable ETH interface for
-			 * 1. IB representors
-			 * 2. User disabled ROCE through devlink interface
-			 */
+		if (err)
 			return err;
 
 		err = mlx5_enable_eth(dev);
@@ -3980,8 +3989,7 @@ static void mlx5_ib_roce_cleanup(struct mlx5_ib_dev *dev)
 	ll = mlx5_port_type_cap_to_rdma_ll(port_type_cap);
 
 	if (ll == IB_LINK_LAYER_ETHERNET) {
-		if (!dev->is_rep)
-			mlx5_disable_eth(dev);
+		mlx5_disable_eth(dev);
 
 		port_num = mlx5_core_native_port_num(dev->mdev) - 1;
 		mlx5_remove_netdev_notifier(dev, port_num);
@@ -4037,7 +4045,7 @@ static int mlx5_ib_stage_ib_reg_init(struct mlx5_ib_dev *dev)
 {
 	const char *name;
 
-	if (!mlx5_lag_is_roce(dev->mdev))
+	if (!mlx5_lag_is_active(dev->mdev))
 		name = "mlx5_%d";
 	else
 		name = "mlx5_bond_%d";
-- 
2.31.1



Thread overview: 16+ messages
2021-08-03 23:19 [PATCH mlx5-next 00/14] mlx5 single FDB for lag Saeed Mahameed
2021-08-03 23:19 ` [PATCH mlx5-next 01/14] net/mlx5: Return mdev from eswitch Saeed Mahameed
2021-08-03 23:19 ` [PATCH mlx5-next 02/14] net/mlx5: Lag, add initial logic for shared FDB Saeed Mahameed
2021-08-03 23:19 ` [PATCH mlx5-next 03/14] RDMA/mlx5: Fill port info based on the relevant eswitch Saeed Mahameed
2021-08-03 23:19 ` [PATCH mlx5-next 04/14] {net, RDMA}/mlx5: Extend send to vport rules Saeed Mahameed
2021-08-03 23:19 ` Saeed Mahameed [this message]
2021-08-03 23:19 ` [PATCH mlx5-next 06/14] net/mlx5: E-Switch, set flow source for send to uplink rule Saeed Mahameed
2021-08-03 23:19 ` [PATCH mlx5-next 07/14] net/mlx5e: Add an option to create a shared mapping Saeed Mahameed
2021-08-03 23:19 ` [PATCH mlx5-next 08/14] net/mlx5e: Use shared mappings for restoring from metadata Saeed Mahameed
2021-08-03 23:19 ` [PATCH mlx5-next 09/14] net/mlx5: E-Switch, Add event callback for representors Saeed Mahameed
2021-08-03 23:19 ` [PATCH mlx5-next 10/14] net/mlx5: Add send to vport rules on paired device Saeed Mahameed
2021-08-03 23:19 ` [PATCH mlx5-next 11/14] net/mlx5: Lag, properly lock eswitch if needed Saeed Mahameed
2021-08-03 23:19 ` [PATCH mlx5-next 12/14] net/mlx5: Lag, move lag destruction to a workqueue Saeed Mahameed
2021-08-03 23:19 ` [PATCH mlx5-next 13/14] net/mlx5: E-Switch, add logic to enable shared FDB Saeed Mahameed
2021-08-03 23:19 ` [PATCH mlx5-next 14/14] net/mlx5: Lag, Create shared FDB when in switchdev mode Saeed Mahameed
2021-08-05 21:02 ` [PATCH mlx5-next 00/14] mlx5 single FDB for lag Saeed Mahameed
