* pull-request: mlx5-next 2023-01-24 V2
From: Saeed Mahameed @ 2023-01-26 23:08 UTC (permalink / raw)
  To: David S. Miller, Jakub Kicinski, Paolo Abeni, Eric Dumazet,
	Jason Gunthorpe
  Cc: Saeed Mahameed, linux-rdma, Leon Romanovsky, netdev, Saeed Mahameed

Hi, 

This pulls the mlx5-next branch, which includes changes from [1]:

1) From Jiri: fix a deadlock in mlx5_ib's netdev notifier unregister
   (see the tracking sketch below).
2) From Mark and Patrisious: add IPsec RoCEv2 support (see the steering
   sketch below).

[1] https://lore.kernel.org/netdev/20230105041756.677120-1-saeed@kernel.org/
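
For context, item 1 boils down to registering the netdev notifier
against one tracked netdev instead of globally, so the unregister stays
scoped to that device. A rough sketch of the pattern, condensed from the
mlx5_netdev_notifier_register()/_unregister() helpers in the diff below
(the helper names here are illustrative):

	static void roce_netdev_track(struct mlx5_roce *roce,
				      struct net_device *netdev)
	{
		if (roce->tracking_netdev)	/* already tracking one */
			return;
		roce->tracking_netdev = netdev;
		roce->nb.notifier_call = mlx5_netdev_event;
		/* Per-netdev registration; 'nn' lets the core re-home
		 * the notifier if the netdev changes net namespace.
		 */
		WARN_ON(register_netdevice_notifier_dev_net(netdev,
							    &roce->nb,
							    &roce->nn));
	}

	static void roce_netdev_untrack(struct mlx5_roce *roce)
	{
		if (!roce->tracking_netdev)
			return;
		unregister_netdevice_notifier_dev_net(roce->tracking_netdev,
						      &roce->nb, &roce->nn);
		roce->tracking_netdev = NULL;
	}

The scoped unregister is what avoids the deadlock that the global
unregister_netdevice_notifier() path could hit.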

Please pull into net-next and rdma-next.
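
As background for item 2, the ingress side keys RoCE v2 packets off
their well-known UDP destination port (4791) and forwards hits into a
flow table in the new RDMA RX IPsec domain, with misses falling back to
the default TTC destination. A condensed sketch of the match setup,
mirroring ipsec_fs_roce_setup_udp_dport() from the new
lib/ipsec_fs_roce.c (the wrapper name is illustrative):

	/* Match outer-header UDP with dport ROCE_V2_UDP_DPORT (4791) */
	static void match_rocev2_udp(struct mlx5_flow_spec *spec)
	{
		spec->match_criteria_enable |= MLX5_MATCH_OUTER_HEADERS;
		MLX5_SET_TO_ONES(fte_match_param, spec->match_criteria,
				 outer_headers.ip_protocol);
		MLX5_SET(fte_match_param, spec->match_value,
			 outer_headers.ip_protocol, IPPROTO_UDP);
		MLX5_SET_TO_ONES(fte_match_param, spec->match_criteria,
				 outer_headers.udp_dport);
		MLX5_SET(fte_match_param, spec->match_value,
			 outer_headers.udp_dport, ROCE_V2_UDP_DPORT);
	}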

v1->v2:
 - fix the PR command to use the branch name rather than the commit SHA

Thanks,
Saeed.

The following changes since commit b7bfaa761d760e72a969d116517eaa12e404c262:

  Linux 6.2-rc3 (2023-01-08 11:49:43 -0600)

are available in the Git repository at:

  git://git.kernel.org/pub/scm/linux/kernel/git/mellanox/linux.git mlx5-next

for you to fetch changes up to c4d508fbe54af3119e01672299514bfc83dfd59f:

  net/mlx5: Configure IPsec steering for egress RoCEv2 traffic (2023-01-18 00:12:58 -0800)

----------------------------------------------------------------
Jiri Pirko (3):
      net/mlx5e: Fix trap event handling
      net/mlx5e: Propagate an internal event in case uplink netdev changes
      RDMA/mlx5: Track netdev to avoid deadlock during netdev notifier unregister

Mark Zhang (4):
      net/mlx5: Implement new destination type TABLE_TYPE
      net/mlx5: Add IPSec priorities in RDMA namespaces
      net/mlx5: Configure IPsec steering for ingress RoCEv2 traffic
      net/mlx5: Configure IPsec steering for egress RoCEv2 traffic

Patrisious Haddad (2):
      net/mlx5: Introduce CQE error syndrome
      net/mlx5: Introduce new destination type TABLE_TYPE

 drivers/infiniband/hw/mlx5/main.c                  |  78 +++--
 drivers/infiniband/hw/mlx5/mlx5_ib.h               |   3 +
 drivers/net/ethernet/mellanox/mlx5/core/Makefile   |   2 +-
 drivers/net/ethernet/mellanox/mlx5/core/devlink.c  |   9 +-
 drivers/net/ethernet/mellanox/mlx5/core/devlink.h  |   5 +
 .../mellanox/mlx5/core/diag/fs_tracepoint.c        |   4 +
 drivers/net/ethernet/mellanox/mlx5/core/en/fs.h    |   1 +
 .../ethernet/mellanox/mlx5/core/en_accel/ipsec.h   |   1 +
 .../mellanox/mlx5/core/en_accel/ipsec_fs.c         |  59 +++-
 drivers/net/ethernet/mellanox/mlx5/core/en_main.c  |  15 +-
 drivers/net/ethernet/mellanox/mlx5/core/events.c   |   2 +
 drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.c   |   6 +
 drivers/net/ethernet/mellanox/mlx5/core/fs_core.c  |  44 ++-
 .../mellanox/mlx5/core/lib/ipsec_fs_roce.c         | 372 +++++++++++++++++++++
 .../mellanox/mlx5/core/lib/ipsec_fs_roce.h         |  20 ++
 drivers/net/ethernet/mellanox/mlx5/core/lib/mlx5.h |   5 -
 drivers/net/ethernet/mellanox/mlx5/core/main.c     |  20 ++
 include/linux/mlx5/device.h                        |   1 +
 include/linux/mlx5/driver.h                        |   5 +
 include/linux/mlx5/fs.h                            |   3 +
 include/linux/mlx5/mlx5_ifc.h                      |  59 +++-
 21 files changed, 656 insertions(+), 58 deletions(-)
 create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/lib/ipsec_fs_roce.c
 create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/lib/ipsec_fs_roce.h

diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c
index c669ef6e47e7..dc32e4518a28 100644
--- a/drivers/infiniband/hw/mlx5/main.c
+++ b/drivers/infiniband/hw/mlx5/main.c
@@ -3012,26 +3012,63 @@ static void mlx5_eth_lag_cleanup(struct mlx5_ib_dev *dev)
 	}
 }
 
-static int mlx5_add_netdev_notifier(struct mlx5_ib_dev *dev, u32 port_num)
+static void mlx5_netdev_notifier_register(struct mlx5_roce *roce,
+					  struct net_device *netdev)
 {
 	int err;
 
-	dev->port[port_num].roce.nb.notifier_call = mlx5_netdev_event;
-	err = register_netdevice_notifier(&dev->port[port_num].roce.nb);
-	if (err) {
-		dev->port[port_num].roce.nb.notifier_call = NULL;
-		return err;
-	}
+	if (roce->tracking_netdev)
+		return;
+	roce->tracking_netdev = netdev;
+	roce->nb.notifier_call = mlx5_netdev_event;
+	err = register_netdevice_notifier_dev_net(netdev, &roce->nb, &roce->nn);
+	WARN_ON(err);
+}
 
-	return 0;
+static void mlx5_netdev_notifier_unregister(struct mlx5_roce *roce)
+{
+	if (!roce->tracking_netdev)
+		return;
+	unregister_netdevice_notifier_dev_net(roce->tracking_netdev, &roce->nb,
+					      &roce->nn);
+	roce->tracking_netdev = NULL;
 }
 
-static void mlx5_remove_netdev_notifier(struct mlx5_ib_dev *dev, u32 port_num)
+static int mlx5e_mdev_notifier_event(struct notifier_block *nb,
+				     unsigned long event, void *data)
 {
-	if (dev->port[port_num].roce.nb.notifier_call) {
-		unregister_netdevice_notifier(&dev->port[port_num].roce.nb);
-		dev->port[port_num].roce.nb.notifier_call = NULL;
+	struct mlx5_roce *roce = container_of(nb, struct mlx5_roce, mdev_nb);
+	struct net_device *netdev = data;
+
+	switch (event) {
+	case MLX5_DRIVER_EVENT_UPLINK_NETDEV:
+		if (netdev)
+			mlx5_netdev_notifier_register(roce, netdev);
+		else
+			mlx5_netdev_notifier_unregister(roce);
+		break;
+	default:
+		return NOTIFY_DONE;
 	}
+
+	return NOTIFY_OK;
+}
+
+static void mlx5_mdev_netdev_track(struct mlx5_ib_dev *dev, u32 port_num)
+{
+	struct mlx5_roce *roce = &dev->port[port_num].roce;
+
+	roce->mdev_nb.notifier_call = mlx5e_mdev_notifier_event;
+	mlx5_blocking_notifier_register(dev->mdev, &roce->mdev_nb);
+	mlx5_core_uplink_netdev_event_replay(dev->mdev);
+}
+
+static void mlx5_mdev_netdev_untrack(struct mlx5_ib_dev *dev, u32 port_num)
+{
+	struct mlx5_roce *roce = &dev->port[port_num].roce;
+
+	mlx5_blocking_notifier_unregister(dev->mdev, &roce->mdev_nb);
+	mlx5_netdev_notifier_unregister(roce);
 }
 
 static int mlx5_enable_eth(struct mlx5_ib_dev *dev)
@@ -3138,7 +3175,7 @@ static void mlx5_ib_unbind_slave_port(struct mlx5_ib_dev *ibdev,
 	if (mpi->mdev_events.notifier_call)
 		mlx5_notifier_unregister(mpi->mdev, &mpi->mdev_events);
 	mpi->mdev_events.notifier_call = NULL;
-	mlx5_remove_netdev_notifier(ibdev, port_num);
+	mlx5_mdev_netdev_untrack(ibdev, port_num);
 	spin_lock(&port->mp.mpi_lock);
 
 	comps = mpi->mdev_refcnt;
@@ -3196,12 +3233,7 @@ static bool mlx5_ib_bind_slave_port(struct mlx5_ib_dev *ibdev,
 	if (err)
 		goto unbind;
 
-	err = mlx5_add_netdev_notifier(ibdev, port_num);
-	if (err) {
-		mlx5_ib_err(ibdev, "failed adding netdev notifier for port %u\n",
-			    port_num + 1);
-		goto unbind;
-	}
+	mlx5_mdev_netdev_track(ibdev, port_num);
 
 	mpi->mdev_events.notifier_call = mlx5_ib_event_slave_port;
 	mlx5_notifier_register(mpi->mdev, &mpi->mdev_events);
@@ -3909,9 +3941,7 @@ static int mlx5_ib_roce_init(struct mlx5_ib_dev *dev)
 		port_num = mlx5_core_native_port_num(dev->mdev) - 1;
 
 		/* Register only for native ports */
-		err = mlx5_add_netdev_notifier(dev, port_num);
-		if (err)
-			return err;
+		mlx5_mdev_netdev_track(dev, port_num);
 
 		err = mlx5_enable_eth(dev);
 		if (err)
@@ -3920,7 +3950,7 @@ static int mlx5_ib_roce_init(struct mlx5_ib_dev *dev)
 
 	return 0;
 cleanup:
-	mlx5_remove_netdev_notifier(dev, port_num);
+	mlx5_mdev_netdev_untrack(dev, port_num);
 	return err;
 }
 
@@ -3938,7 +3968,7 @@ static void mlx5_ib_roce_cleanup(struct mlx5_ib_dev *dev)
 		mlx5_disable_eth(dev);
 
 		port_num = mlx5_core_native_port_num(dev->mdev) - 1;
-		mlx5_remove_netdev_notifier(dev, port_num);
+		mlx5_mdev_netdev_untrack(dev, port_num);
 	}
 }
 
diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h b/drivers/infiniband/hw/mlx5/mlx5_ib.h
index 8b91babdd4c0..7394e7f36ba7 100644
--- a/drivers/infiniband/hw/mlx5/mlx5_ib.h
+++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h
@@ -832,6 +832,9 @@ struct mlx5_roce {
 	rwlock_t		netdev_lock;
 	struct net_device	*netdev;
 	struct notifier_block	nb;
+	struct netdev_net_notifier nn;
+	struct notifier_block	mdev_nb;
+	struct net_device	*tracking_netdev;
 	atomic_t		tx_port_affinity;
 	enum ib_port_state last_port_state;
 	struct mlx5_ib_dev	*dev;
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/Makefile b/drivers/net/ethernet/mellanox/mlx5/core/Makefile
index cd4a1ab0ea78..8415a44fb965 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/Makefile
+++ b/drivers/net/ethernet/mellanox/mlx5/core/Makefile
@@ -97,7 +97,7 @@ mlx5_core-$(CONFIG_MLX5_EN_MACSEC) += en_accel/macsec.o en_accel/macsec_fs.o \
 
 mlx5_core-$(CONFIG_MLX5_EN_IPSEC) += en_accel/ipsec.o en_accel/ipsec_rxtx.o \
 				     en_accel/ipsec_stats.o en_accel/ipsec_fs.o \
-				     en_accel/ipsec_offload.o
+				     en_accel/ipsec_offload.o lib/ipsec_fs_roce.o
 
 mlx5_core-$(CONFIG_MLX5_EN_TLS) += en_accel/ktls_stats.o \
 				   en_accel/fs_tcp.o en_accel/ktls.o en_accel/ktls_txrx.o \
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/devlink.c b/drivers/net/ethernet/mellanox/mlx5/core/devlink.c
index 5bd83c0275f8..f641ff9bb3bb 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/devlink.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/devlink.c
@@ -263,6 +263,7 @@ static int mlx5_devlink_trap_action_set(struct devlink *devlink,
 					struct netlink_ext_ack *extack)
 {
 	struct mlx5_core_dev *dev = devlink_priv(devlink);
+	struct mlx5_devlink_trap_event_ctx trap_event_ctx;
 	enum devlink_trap_action action_orig;
 	struct mlx5_devlink_trap *dl_trap;
 	int err = 0;
@@ -289,10 +290,14 @@ static int mlx5_devlink_trap_action_set(struct devlink *devlink,
 
 	action_orig = dl_trap->trap.action;
 	dl_trap->trap.action = action;
+	trap_event_ctx.trap = &dl_trap->trap;
+	trap_event_ctx.err = 0;
 	err = mlx5_blocking_notifier_call_chain(dev, MLX5_DRIVER_EVENT_TYPE_TRAP,
-						&dl_trap->trap);
-	if (err)
+						&trap_event_ctx);
+	if (err == NOTIFY_BAD) {
 		dl_trap->trap.action = action_orig;
+		err = trap_event_ctx.err;
+	}
 out:
 	return err;
 }
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/devlink.h b/drivers/net/ethernet/mellanox/mlx5/core/devlink.h
index fd033df24856..b84cb70eb3ae 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/devlink.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/devlink.h
@@ -24,6 +24,11 @@ struct mlx5_devlink_trap {
 	struct list_head list;
 };
 
+struct mlx5_devlink_trap_event_ctx {
+	struct mlx5_trap_ctx *trap;
+	int err;
+};
+
 struct mlx5_core_dev;
 void mlx5_devlink_trap_report(struct mlx5_core_dev *dev, int trap_id, struct sk_buff *skb,
 			      struct devlink_port *dl_port);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/diag/fs_tracepoint.c b/drivers/net/ethernet/mellanox/mlx5/core/diag/fs_tracepoint.c
index 2732128e7a6e..6d73127b7217 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/diag/fs_tracepoint.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/diag/fs_tracepoint.c
@@ -275,6 +275,10 @@ const char *parse_fs_dst(struct trace_seq *p,
 				 fs_dest_range_field_to_str(dst->range.field),
 				 dst->range.min, dst->range.max);
 		break;
+	case MLX5_FLOW_DESTINATION_TYPE_TABLE_TYPE:
+		trace_seq_printf(p, "flow_table_type=%u id:%u\n", dst->ft->type,
+				 dst->ft->id);
+		break;
 	case MLX5_FLOW_DESTINATION_TYPE_NONE:
 		trace_seq_printf(p, "none\n");
 		break;
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/fs.h b/drivers/net/ethernet/mellanox/mlx5/core/en/fs.h
index 379c6dc9a3be..d2149f0138d8 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en/fs.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en/fs.h
@@ -87,6 +87,7 @@ enum {
 	MLX5E_ACCEL_FS_POL_FT_LEVEL = MLX5E_INNER_TTC_FT_LEVEL + 1,
 	MLX5E_ACCEL_FS_ESP_FT_LEVEL,
 	MLX5E_ACCEL_FS_ESP_FT_ERR_LEVEL,
+	MLX5E_ACCEL_FS_ESP_FT_ROCE_LEVEL,
 #endif
 };
 
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec.h b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec.h
index a92e19c4c499..a72261ce7598 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec.h
@@ -141,6 +141,7 @@ struct mlx5e_ipsec {
 	struct mlx5e_ipsec_tx *tx;
 	struct mlx5e_ipsec_aso *aso;
 	struct notifier_block nb;
+	struct mlx5_ipsec_fs *roce_ipsec;
 };
 
 struct mlx5e_ipsec_esn_state {
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec_fs.c b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec_fs.c
index 9f19f4b59a70..4de528687536 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec_fs.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec_fs.c
@@ -6,6 +6,7 @@
 #include "en/fs.h"
 #include "ipsec.h"
 #include "fs_core.h"
+#include "lib/ipsec_fs_roce.h"
 
 #define NUM_IPSEC_FTE BIT(15)
 
@@ -166,7 +167,8 @@ static int ipsec_miss_create(struct mlx5_core_dev *mdev,
 	return err;
 }
 
-static void rx_destroy(struct mlx5_core_dev *mdev, struct mlx5e_ipsec_rx *rx)
+static void rx_destroy(struct mlx5_core_dev *mdev, struct mlx5e_ipsec_rx *rx, u32 family,
+		       struct mlx5_ipsec_fs *roce_ipsec)
 {
 	mlx5_del_flow_rules(rx->pol.rule);
 	mlx5_destroy_flow_group(rx->pol.group);
@@ -179,6 +181,8 @@ static void rx_destroy(struct mlx5_core_dev *mdev, struct mlx5e_ipsec_rx *rx)
 	mlx5_del_flow_rules(rx->status.rule);
 	mlx5_modify_header_dealloc(mdev, rx->status.modify_hdr);
 	mlx5_destroy_flow_table(rx->ft.status);
+
+	mlx5_ipsec_fs_roce_rx_destroy(roce_ipsec, family);
 }
 
 static int rx_create(struct mlx5_core_dev *mdev, struct mlx5e_ipsec *ipsec,
@@ -186,18 +190,35 @@ static int rx_create(struct mlx5_core_dev *mdev, struct mlx5e_ipsec *ipsec,
 {
 	struct mlx5_flow_namespace *ns = mlx5e_fs_get_ns(ipsec->fs, false);
 	struct mlx5_ttc_table *ttc = mlx5e_fs_get_ttc(ipsec->fs, false);
+	struct mlx5_flow_destination default_dest;
 	struct mlx5_flow_destination dest[2];
 	struct mlx5_flow_table *ft;
 	int err;
 
+	default_dest = mlx5_ttc_get_default_dest(ttc, family2tt(family));
+	err = mlx5_ipsec_fs_roce_rx_create(ipsec->roce_ipsec, ns, &default_dest, family,
+					   MLX5E_ACCEL_FS_ESP_FT_ROCE_LEVEL, MLX5E_NIC_PRIO,
+					   ipsec->mdev);
+	if (err)
+		return err;
+
 	ft = ipsec_ft_create(ns, MLX5E_ACCEL_FS_ESP_FT_ERR_LEVEL,
 			     MLX5E_NIC_PRIO, 1);
-	if (IS_ERR(ft))
-		return PTR_ERR(ft);
+	if (IS_ERR(ft)) {
+		err = PTR_ERR(ft);
+		goto err_fs_ft_status;
+	}
 
 	rx->ft.status = ft;
 
-	dest[0] = mlx5_ttc_get_default_dest(ttc, family2tt(family));
+	ft = mlx5_ipsec_fs_roce_ft_get(ipsec->roce_ipsec, family);
+	if (ft) {
+		dest[0].type = MLX5_FLOW_DESTINATION_TYPE_FLOW_TABLE;
+		dest[0].ft = ft;
+	} else {
+		dest[0] = default_dest;
+	}
+
 	dest[1].type = MLX5_FLOW_DESTINATION_TYPE_COUNTER;
 	dest[1].counter_id = mlx5_fc_id(rx->fc->cnt);
 	err = ipsec_status_rule(mdev, rx, dest);
@@ -245,6 +266,8 @@ static int rx_create(struct mlx5_core_dev *mdev, struct mlx5e_ipsec *ipsec,
 	mlx5_modify_header_dealloc(mdev, rx->status.modify_hdr);
 err_add:
 	mlx5_destroy_flow_table(rx->ft.status);
+err_fs_ft_status:
+	mlx5_ipsec_fs_roce_rx_destroy(ipsec->roce_ipsec, family);
 	return err;
 }
 
@@ -304,7 +327,7 @@ static void rx_ft_put(struct mlx5_core_dev *mdev, struct mlx5e_ipsec *ipsec,
 	mlx5_ttc_fwd_default_dest(ttc, family2tt(family));
 
 	/* remove FT */
-	rx_destroy(mdev, rx);
+	rx_destroy(mdev, rx, family, ipsec->roce_ipsec);
 
 out:
 	mutex_unlock(&rx->ft.mutex);
@@ -343,6 +366,14 @@ static int tx_create(struct mlx5_core_dev *mdev, struct mlx5e_ipsec_tx *tx)
 	return err;
 }
 
+static void tx_destroy(struct mlx5e_ipsec_tx *tx)
+{
+	mlx5_del_flow_rules(tx->pol.rule);
+	mlx5_destroy_flow_group(tx->pol.group);
+	mlx5_destroy_flow_table(tx->ft.pol);
+	mlx5_destroy_flow_table(tx->ft.sa);
+}
+
 static struct mlx5e_ipsec_tx *tx_ft_get(struct mlx5_core_dev *mdev,
 					struct mlx5e_ipsec *ipsec)
 {
@@ -356,6 +387,13 @@ static struct mlx5e_ipsec_tx *tx_ft_get(struct mlx5_core_dev *mdev,
 	err = tx_create(mdev, tx);
 	if (err)
 		goto out;
+
+	err = mlx5_ipsec_fs_roce_tx_create(ipsec->roce_ipsec, tx->ft.pol, ipsec->mdev);
+	if (err) {
+		tx_destroy(tx);
+		goto out;
+	}
+
 skip:
 	tx->ft.refcnt++;
 out:
@@ -374,10 +412,9 @@ static void tx_ft_put(struct mlx5e_ipsec *ipsec)
 	if (tx->ft.refcnt)
 		goto out;
 
-	mlx5_del_flow_rules(tx->pol.rule);
-	mlx5_destroy_flow_group(tx->pol.group);
-	mlx5_destroy_flow_table(tx->ft.pol);
-	mlx5_destroy_flow_table(tx->ft.sa);
+	mlx5_ipsec_fs_roce_tx_destroy(ipsec->roce_ipsec);
+
+	tx_destroy(tx);
 out:
 	mutex_unlock(&tx->ft.mutex);
 }
@@ -1008,6 +1045,8 @@ void mlx5e_accel_ipsec_fs_cleanup(struct mlx5e_ipsec *ipsec)
 	if (!ipsec->tx)
 		return;
 
+	mlx5_ipsec_fs_roce_cleanup(ipsec->roce_ipsec);
+
 	ipsec_fs_destroy_counters(ipsec);
 	mutex_destroy(&ipsec->tx->ft.mutex);
 	WARN_ON(ipsec->tx->ft.refcnt);
@@ -1053,6 +1092,8 @@ int mlx5e_accel_ipsec_fs_init(struct mlx5e_ipsec *ipsec)
 	mutex_init(&ipsec->rx_ipv6->ft.mutex);
 	ipsec->tx->ns = ns;
 
+	ipsec->roce_ipsec = mlx5_ipsec_fs_roce_init(ipsec->mdev);
+
 	return 0;
 
 err_counters:
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
index cff5f2e29e1e..85b51039d2a6 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
@@ -179,17 +179,21 @@ static void mlx5e_disable_async_events(struct mlx5e_priv *priv)
 static int blocking_event(struct notifier_block *nb, unsigned long event, void *data)
 {
 	struct mlx5e_priv *priv = container_of(nb, struct mlx5e_priv, blocking_events_nb);
+	struct mlx5_devlink_trap_event_ctx *trap_event_ctx = data;
 	int err;
 
 	switch (event) {
 	case MLX5_DRIVER_EVENT_TYPE_TRAP:
-		err = mlx5e_handle_trap_event(priv, data);
+		err = mlx5e_handle_trap_event(priv, trap_event_ctx->trap);
+		if (err) {
+			trap_event_ctx->err = err;
+			return NOTIFY_BAD;
+		}
 		break;
 	default:
-		netdev_warn(priv->netdev, "Sync event: Unknown event %ld\n", event);
-		err = -EINVAL;
+		return NOTIFY_DONE;
 	}
-	return err;
+	return NOTIFY_OK;
 }
 
 static void mlx5e_enable_blocking_events(struct mlx5e_priv *priv)
@@ -5957,7 +5961,7 @@ static int mlx5e_probe(struct auxiliary_device *adev,
 	}
 
 	mlx5e_dcbnl_init_app(priv);
-	mlx5_uplink_netdev_set(mdev, netdev);
+	mlx5_core_uplink_netdev_set(mdev, netdev);
 	mlx5e_params_print_info(mdev, &priv->channels.params);
 	return 0;
 
@@ -5977,6 +5981,7 @@ static void mlx5e_remove(struct auxiliary_device *adev)
 	struct mlx5e_priv *priv = auxiliary_get_drvdata(adev);
 	pm_message_t state = {};
 
+	mlx5_core_uplink_netdev_set(priv->mdev, NULL);
 	mlx5e_dcbnl_delete_app(priv);
 	unregister_netdev(priv->netdev);
 	mlx5e_suspend(adev, state);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/events.c b/drivers/net/ethernet/mellanox/mlx5/core/events.c
index 9459e56ee90a..718cf09c28ce 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/events.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/events.c
@@ -424,6 +424,7 @@ int mlx5_blocking_notifier_register(struct mlx5_core_dev *dev, struct notifier_b
 
 	return blocking_notifier_chain_register(&events->sw_nh, nb);
 }
+EXPORT_SYMBOL(mlx5_blocking_notifier_register);
 
 int mlx5_blocking_notifier_unregister(struct mlx5_core_dev *dev, struct notifier_block *nb)
 {
@@ -431,6 +432,7 @@ int mlx5_blocking_notifier_unregister(struct mlx5_core_dev *dev, struct notifier
 
 	return blocking_notifier_chain_unregister(&events->sw_nh, nb);
 }
+EXPORT_SYMBOL(mlx5_blocking_notifier_unregister);
 
 int mlx5_blocking_notifier_call_chain(struct mlx5_core_dev *dev, unsigned int event,
 				      void *data)
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.c b/drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.c
index 32d4c967469c..a3a9cc6f15ca 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.c
@@ -653,6 +653,12 @@ static int mlx5_cmd_set_fte(struct mlx5_core_dev *dev,
 				id = dst->dest_attr.sampler_id;
 				ifc_type = MLX5_IFC_FLOW_DESTINATION_TYPE_FLOW_SAMPLER;
 				break;
+			case MLX5_FLOW_DESTINATION_TYPE_TABLE_TYPE:
+				MLX5_SET(dest_format_struct, in_dests,
+					 destination_table_type, dst->dest_attr.ft->type);
+				id = dst->dest_attr.ft->id;
+				ifc_type = MLX5_IFC_FLOW_DESTINATION_TYPE_TABLE_TYPE;
+				break;
 			default:
 				id = dst->dest_attr.tir_num;
 				ifc_type = MLX5_IFC_FLOW_DESTINATION_TYPE_TIR;
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c
index 5a85d8c1e797..cb28cdb59c17 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c
@@ -111,8 +111,10 @@
 #define ETHTOOL_PRIO_NUM_LEVELS 1
 #define ETHTOOL_NUM_PRIOS 11
 #define ETHTOOL_MIN_LEVEL (KERNEL_MIN_LEVEL + ETHTOOL_NUM_PRIOS)
-/* Promiscuous, Vlan, mac, ttc, inner ttc, {UDP/ANY/aRFS/accel/{esp, esp_err}}, IPsec policy */
-#define KERNEL_NIC_PRIO_NUM_LEVELS 8
+/* Promiscuous, Vlan, mac, ttc, inner ttc, {UDP/ANY/aRFS/accel/{esp, esp_err}}, IPsec policy,
+ * IPsec RoCE policy
+ */
+#define KERNEL_NIC_PRIO_NUM_LEVELS 9
 #define KERNEL_NIC_NUM_PRIOS 1
 /* One more level for tc */
 #define KERNEL_MIN_LEVEL (KERNEL_NIC_PRIO_NUM_LEVELS + 1)
@@ -219,19 +221,30 @@ static struct init_tree_node egress_root_fs = {
 };
 
 enum {
+	RDMA_RX_IPSEC_PRIO,
 	RDMA_RX_COUNTERS_PRIO,
 	RDMA_RX_BYPASS_PRIO,
 	RDMA_RX_KERNEL_PRIO,
 };
 
+#define RDMA_RX_IPSEC_NUM_PRIOS 1
+#define RDMA_RX_IPSEC_NUM_LEVELS 2
+#define RDMA_RX_IPSEC_MIN_LEVEL  (RDMA_RX_IPSEC_NUM_LEVELS)
+
 #define RDMA_RX_BYPASS_MIN_LEVEL MLX5_BY_PASS_NUM_REGULAR_PRIOS
 #define RDMA_RX_KERNEL_MIN_LEVEL (RDMA_RX_BYPASS_MIN_LEVEL + 1)
 #define RDMA_RX_COUNTERS_MIN_LEVEL (RDMA_RX_KERNEL_MIN_LEVEL + 2)
 
 static struct init_tree_node rdma_rx_root_fs = {
 	.type = FS_TYPE_NAMESPACE,
-	.ar_size = 3,
+	.ar_size = 4,
 	.children = (struct init_tree_node[]) {
+		[RDMA_RX_IPSEC_PRIO] =
+		ADD_PRIO(0, RDMA_RX_IPSEC_MIN_LEVEL, 0,
+			 FS_CHAINING_CAPS,
+			 ADD_NS(MLX5_FLOW_TABLE_MISS_ACTION_DEF,
+				ADD_MULTIPLE_PRIO(RDMA_RX_IPSEC_NUM_PRIOS,
+						  RDMA_RX_IPSEC_NUM_LEVELS))),
 		[RDMA_RX_COUNTERS_PRIO] =
 		ADD_PRIO(0, RDMA_RX_COUNTERS_MIN_LEVEL, 0,
 			 FS_CHAINING_CAPS,
@@ -254,15 +267,20 @@ static struct init_tree_node rdma_rx_root_fs = {
 
 enum {
 	RDMA_TX_COUNTERS_PRIO,
+	RDMA_TX_IPSEC_PRIO,
 	RDMA_TX_BYPASS_PRIO,
 };
 
 #define RDMA_TX_BYPASS_MIN_LEVEL MLX5_BY_PASS_NUM_PRIOS
 #define RDMA_TX_COUNTERS_MIN_LEVEL (RDMA_TX_BYPASS_MIN_LEVEL + 1)
 
+#define RDMA_TX_IPSEC_NUM_PRIOS 1
+#define RDMA_TX_IPSEC_PRIO_NUM_LEVELS 1
+#define RDMA_TX_IPSEC_MIN_LEVEL  (RDMA_TX_COUNTERS_MIN_LEVEL + RDMA_TX_IPSEC_NUM_PRIOS)
+
 static struct init_tree_node rdma_tx_root_fs = {
 	.type = FS_TYPE_NAMESPACE,
-	.ar_size = 2,
+	.ar_size = 3,
 	.children = (struct init_tree_node[]) {
 		[RDMA_TX_COUNTERS_PRIO] =
 		ADD_PRIO(0, RDMA_TX_COUNTERS_MIN_LEVEL, 0,
@@ -270,6 +288,13 @@ static struct init_tree_node rdma_tx_root_fs = {
 			 ADD_NS(MLX5_FLOW_TABLE_MISS_ACTION_DEF,
 				ADD_MULTIPLE_PRIO(MLX5_RDMA_TX_NUM_COUNTERS_PRIOS,
 						  RDMA_TX_COUNTERS_PRIO_NUM_LEVELS))),
+		[RDMA_TX_IPSEC_PRIO] =
+		ADD_PRIO(0, RDMA_TX_IPSEC_MIN_LEVEL, 0,
+			 FS_CHAINING_CAPS,
+			 ADD_NS(MLX5_FLOW_TABLE_MISS_ACTION_DEF,
+				ADD_MULTIPLE_PRIO(RDMA_TX_IPSEC_NUM_PRIOS,
+						  RDMA_TX_IPSEC_PRIO_NUM_LEVELS))),
+
 		[RDMA_TX_BYPASS_PRIO] =
 		ADD_PRIO(0, RDMA_TX_BYPASS_MIN_LEVEL, 0,
 			 FS_CHAINING_CAPS_RDMA_TX,
@@ -449,7 +474,8 @@ static bool is_fwd_dest_type(enum mlx5_flow_destination_type type)
 		type == MLX5_FLOW_DESTINATION_TYPE_VPORT ||
 		type == MLX5_FLOW_DESTINATION_TYPE_FLOW_SAMPLER ||
 		type == MLX5_FLOW_DESTINATION_TYPE_TIR ||
-		type == MLX5_FLOW_DESTINATION_TYPE_RANGE;
+		type == MLX5_FLOW_DESTINATION_TYPE_RANGE ||
+		type == MLX5_FLOW_DESTINATION_TYPE_TABLE_TYPE;
 }
 
 static bool check_valid_spec(const struct mlx5_flow_spec *spec)
@@ -2367,6 +2393,14 @@ struct mlx5_flow_namespace *mlx5_get_flow_namespace(struct mlx5_core_dev *dev,
 		root_ns = steering->rdma_tx_root_ns;
 		prio = RDMA_TX_COUNTERS_PRIO;
 		break;
+	case MLX5_FLOW_NAMESPACE_RDMA_RX_IPSEC:
+		root_ns = steering->rdma_rx_root_ns;
+		prio = RDMA_RX_IPSEC_PRIO;
+		break;
+	case MLX5_FLOW_NAMESPACE_RDMA_TX_IPSEC:
+		root_ns = steering->rdma_tx_root_ns;
+		prio = RDMA_TX_IPSEC_PRIO;
+		break;
 	default: /* Must be NIC RX */
 		WARN_ON(!is_nic_rx_ns(type));
 		root_ns = steering->root_ns;
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lib/ipsec_fs_roce.c b/drivers/net/ethernet/mellanox/mlx5/core/lib/ipsec_fs_roce.c
new file mode 100644
index 000000000000..2711892fd5cb
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlx5/core/lib/ipsec_fs_roce.c
@@ -0,0 +1,372 @@
+// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
+/* Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. */
+
+#include "fs_core.h"
+#include "lib/ipsec_fs_roce.h"
+#include "mlx5_core.h"
+
+struct mlx5_ipsec_miss {
+	struct mlx5_flow_group *group;
+	struct mlx5_flow_handle *rule;
+};
+
+struct mlx5_ipsec_rx_roce {
+	struct mlx5_flow_group *g;
+	struct mlx5_flow_table *ft;
+	struct mlx5_flow_handle *rule;
+	struct mlx5_ipsec_miss roce_miss;
+
+	struct mlx5_flow_table *ft_rdma;
+	struct mlx5_flow_namespace *ns_rdma;
+};
+
+struct mlx5_ipsec_tx_roce {
+	struct mlx5_flow_group *g;
+	struct mlx5_flow_table *ft;
+	struct mlx5_flow_handle *rule;
+	struct mlx5_flow_namespace *ns;
+};
+
+struct mlx5_ipsec_fs {
+	struct mlx5_ipsec_rx_roce ipv4_rx;
+	struct mlx5_ipsec_rx_roce ipv6_rx;
+	struct mlx5_ipsec_tx_roce tx;
+};
+
+static void ipsec_fs_roce_setup_udp_dport(struct mlx5_flow_spec *spec, u16 dport)
+{
+	spec->match_criteria_enable |= MLX5_MATCH_OUTER_HEADERS;
+	MLX5_SET_TO_ONES(fte_match_param, spec->match_criteria, outer_headers.ip_protocol);
+	MLX5_SET(fte_match_param, spec->match_value, outer_headers.ip_protocol, IPPROTO_UDP);
+	MLX5_SET_TO_ONES(fte_match_param, spec->match_criteria, outer_headers.udp_dport);
+	MLX5_SET(fte_match_param, spec->match_value, outer_headers.udp_dport, dport);
+}
+
+static int ipsec_fs_roce_rx_rule_setup(struct mlx5_flow_destination *default_dst,
+				       struct mlx5_ipsec_rx_roce *roce, struct mlx5_core_dev *mdev)
+{
+	struct mlx5_flow_destination dst = {};
+	MLX5_DECLARE_FLOW_ACT(flow_act);
+	struct mlx5_flow_handle *rule;
+	struct mlx5_flow_spec *spec;
+	int err = 0;
+
+	spec = kvzalloc(sizeof(*spec), GFP_KERNEL);
+	if (!spec)
+		return -ENOMEM;
+
+	ipsec_fs_roce_setup_udp_dport(spec, ROCE_V2_UDP_DPORT);
+
+	flow_act.action = MLX5_FLOW_CONTEXT_ACTION_FWD_DEST;
+	dst.type = MLX5_FLOW_DESTINATION_TYPE_TABLE_TYPE;
+	dst.ft = roce->ft_rdma;
+	rule = mlx5_add_flow_rules(roce->ft, spec, &flow_act, &dst, 1);
+	if (IS_ERR(rule)) {
+		err = PTR_ERR(rule);
+		mlx5_core_err(mdev, "Fail to add RX roce ipsec rule err=%d\n",
+			      err);
+		goto fail_add_rule;
+	}
+
+	roce->rule = rule;
+
+	memset(spec, 0, sizeof(*spec));
+	rule = mlx5_add_flow_rules(roce->ft, spec, &flow_act, default_dst, 1);
+	if (IS_ERR(rule)) {
+		err = PTR_ERR(rule);
+		mlx5_core_err(mdev, "Fail to add RX roce ipsec miss rule err=%d\n",
+			      err);
+		goto fail_add_default_rule;
+	}
+
+	roce->roce_miss.rule = rule;
+
+	kvfree(spec);
+	return 0;
+
+fail_add_default_rule:
+	mlx5_del_flow_rules(roce->rule);
+fail_add_rule:
+	kvfree(spec);
+	return err;
+}
+
+static int ipsec_fs_roce_tx_rule_setup(struct mlx5_core_dev *mdev, struct mlx5_ipsec_tx_roce *roce,
+				       struct mlx5_flow_table *pol_ft)
+{
+	struct mlx5_flow_destination dst = {};
+	MLX5_DECLARE_FLOW_ACT(flow_act);
+	struct mlx5_flow_handle *rule;
+	int err = 0;
+
+	flow_act.action = MLX5_FLOW_CONTEXT_ACTION_FWD_DEST;
+	dst.type = MLX5_FLOW_DESTINATION_TYPE_TABLE_TYPE;
+	dst.ft = pol_ft;
+	rule = mlx5_add_flow_rules(roce->ft, NULL, &flow_act, &dst,
+				   1);
+	if (IS_ERR(rule)) {
+		err = PTR_ERR(rule);
+		mlx5_core_err(mdev, "Fail to add TX roce ipsec rule err=%d\n",
+			      err);
+		goto out;
+	}
+	roce->rule = rule;
+
+out:
+	return err;
+}
+
+void mlx5_ipsec_fs_roce_tx_destroy(struct mlx5_ipsec_fs *ipsec_roce)
+{
+	struct mlx5_ipsec_tx_roce *tx_roce;
+
+	if (!ipsec_roce)
+		return;
+
+	tx_roce = &ipsec_roce->tx;
+
+	mlx5_del_flow_rules(tx_roce->rule);
+	mlx5_destroy_flow_group(tx_roce->g);
+	mlx5_destroy_flow_table(tx_roce->ft);
+}
+
+#define MLX5_TX_ROCE_GROUP_SIZE BIT(0)
+
+int mlx5_ipsec_fs_roce_tx_create(struct mlx5_ipsec_fs *ipsec_roce, struct mlx5_flow_table *pol_ft,
+				 struct mlx5_core_dev *mdev)
+{
+	struct mlx5_flow_table_attr ft_attr = {};
+	struct mlx5_ipsec_tx_roce *roce;
+	struct mlx5_flow_table *ft;
+	struct mlx5_flow_group *g;
+	int ix = 0;
+	int err;
+	u32 *in;
+
+	if (!ipsec_roce)
+		return 0;
+
+	roce = &ipsec_roce->tx;
+
+	in = kvzalloc(MLX5_ST_SZ_BYTES(create_flow_group_in), GFP_KERNEL);
+	if (!in)
+		return -ENOMEM;
+
+	ft_attr.max_fte = 1;
+	ft = mlx5_create_flow_table(roce->ns, &ft_attr);
+	if (IS_ERR(ft)) {
+		err = PTR_ERR(ft);
+		mlx5_core_err(mdev, "Fail to create ipsec tx roce ft err=%d\n", err);
+		return err;
+	}
+
+	roce->ft = ft;
+
+	MLX5_SET_CFG(in, start_flow_index, ix);
+	ix += MLX5_TX_ROCE_GROUP_SIZE;
+	MLX5_SET_CFG(in, end_flow_index, ix - 1);
+	g = mlx5_create_flow_group(ft, in);
+	if (IS_ERR(g)) {
+		err = PTR_ERR(g);
+		mlx5_core_err(mdev, "Fail to create ipsec tx roce group err=%d\n", err);
+		goto fail;
+	}
+	roce->g = g;
+
+	err = ipsec_fs_roce_tx_rule_setup(mdev, roce, pol_ft);
+	if (err) {
+		mlx5_core_err(mdev, "Fail to create ipsec tx roce rules err=%d\n", err);
+		goto rule_fail;
+	}
+
+	return 0;
+
+rule_fail:
+	mlx5_destroy_flow_group(roce->g);
+fail:
+	mlx5_destroy_flow_table(ft);
+	return err;
+}
+
+struct mlx5_flow_table *mlx5_ipsec_fs_roce_ft_get(struct mlx5_ipsec_fs *ipsec_roce, u32 family)
+{
+	struct mlx5_ipsec_rx_roce *rx_roce;
+
+	if (!ipsec_roce)
+		return NULL;
+
+	rx_roce = (family == AF_INET) ? &ipsec_roce->ipv4_rx :
+					&ipsec_roce->ipv6_rx;
+
+	return rx_roce->ft;
+}
+
+void mlx5_ipsec_fs_roce_rx_destroy(struct mlx5_ipsec_fs *ipsec_roce, u32 family)
+{
+	struct mlx5_ipsec_rx_roce *rx_roce;
+
+	if (!ipsec_roce)
+		return;
+
+	rx_roce = (family == AF_INET) ? &ipsec_roce->ipv4_rx :
+					&ipsec_roce->ipv6_rx;
+
+	mlx5_del_flow_rules(rx_roce->roce_miss.rule);
+	mlx5_del_flow_rules(rx_roce->rule);
+	mlx5_destroy_flow_table(rx_roce->ft_rdma);
+	mlx5_destroy_flow_group(rx_roce->roce_miss.group);
+	mlx5_destroy_flow_group(rx_roce->g);
+	mlx5_destroy_flow_table(rx_roce->ft);
+}
+
+#define MLX5_RX_ROCE_GROUP_SIZE BIT(0)
+
+int mlx5_ipsec_fs_roce_rx_create(struct mlx5_ipsec_fs *ipsec_roce, struct mlx5_flow_namespace *ns,
+				 struct mlx5_flow_destination *default_dst, u32 family, u32 level,
+				 u32 prio, struct mlx5_core_dev *mdev)
+{
+	struct mlx5_flow_table_attr ft_attr = {};
+	struct mlx5_ipsec_rx_roce *roce;
+	struct mlx5_flow_table *ft;
+	struct mlx5_flow_group *g;
+	void *outer_headers_c;
+	int ix = 0;
+	u32 *in;
+	int err;
+	u8 *mc;
+
+	if (!ipsec_roce)
+		return 0;
+
+	roce = (family == AF_INET) ? &ipsec_roce->ipv4_rx :
+				     &ipsec_roce->ipv6_rx;
+
+	ft_attr.max_fte = 2;
+	ft_attr.level = level;
+	ft_attr.prio = prio;
+	ft = mlx5_create_flow_table(ns, &ft_attr);
+	if (IS_ERR(ft)) {
+		err = PTR_ERR(ft);
+		mlx5_core_err(mdev, "Fail to create ipsec rx roce ft at nic err=%d\n", err);
+		return err;
+	}
+
+	roce->ft = ft;
+
+	in = kvzalloc(MLX5_ST_SZ_BYTES(create_flow_group_in), GFP_KERNEL);
+	if (!in) {
+		err = -ENOMEM;
+		goto fail_nomem;
+	}
+
+	mc = MLX5_ADDR_OF(create_flow_group_in, in, match_criteria);
+	outer_headers_c = MLX5_ADDR_OF(fte_match_param, mc, outer_headers);
+	MLX5_SET_TO_ONES(fte_match_set_lyr_2_4, outer_headers_c, ip_protocol);
+	MLX5_SET_TO_ONES(fte_match_set_lyr_2_4, outer_headers_c, udp_dport);
+
+	MLX5_SET_CFG(in, match_criteria_enable, MLX5_MATCH_OUTER_HEADERS);
+	MLX5_SET_CFG(in, start_flow_index, ix);
+	ix += MLX5_RX_ROCE_GROUP_SIZE;
+	MLX5_SET_CFG(in, end_flow_index, ix - 1);
+	g = mlx5_create_flow_group(ft, in);
+	if (IS_ERR(g)) {
+		err = PTR_ERR(g);
+		mlx5_core_err(mdev, "Fail to create ipsec rx roce group at nic err=%d\n", err);
+		goto fail_group;
+	}
+	roce->g = g;
+
+	memset(in, 0, MLX5_ST_SZ_BYTES(create_flow_group_in));
+	MLX5_SET_CFG(in, start_flow_index, ix);
+	ix += MLX5_RX_ROCE_GROUP_SIZE;
+	MLX5_SET_CFG(in, end_flow_index, ix - 1);
+	g = mlx5_create_flow_group(ft, in);
+	if (IS_ERR(g)) {
+		err = PTR_ERR(g);
+		mlx5_core_err(mdev, "Fail to create ipsec rx roce miss group at nic err=%d\n", err);
+		goto fail_mgroup;
+	}
+	roce->roce_miss.group = g;
+
+	memset(&ft_attr, 0, sizeof(ft_attr));
+	if (family == AF_INET)
+		ft_attr.level = 1;
+	ft = mlx5_create_flow_table(roce->ns_rdma, &ft_attr);
+	if (IS_ERR(ft)) {
+		err = PTR_ERR(ft);
+		mlx5_core_err(mdev, "Fail to create ipsec rx roce ft at rdma err=%d\n", err);
+		goto fail_rdma_table;
+	}
+
+	roce->ft_rdma = ft;
+
+	err = ipsec_fs_roce_rx_rule_setup(default_dst, roce, mdev);
+	if (err) {
+		mlx5_core_err(mdev, "Fail to create ipsec rx roce rules err=%d\n", err);
+		goto fail_setup_rule;
+	}
+
+	kvfree(in);
+	return 0;
+
+fail_setup_rule:
+	mlx5_destroy_flow_table(roce->ft_rdma);
+fail_rdma_table:
+	mlx5_destroy_flow_group(roce->roce_miss.group);
+fail_mgroup:
+	mlx5_destroy_flow_group(roce->g);
+fail_group:
+	kvfree(in);
+fail_nomem:
+	mlx5_destroy_flow_table(roce->ft);
+	return err;
+}
+
+void mlx5_ipsec_fs_roce_cleanup(struct mlx5_ipsec_fs *ipsec_roce)
+{
+	kfree(ipsec_roce);
+}
+
+#define NIC_RDMA_BOTH_DIRS_CAPS (MLX5_FT_NIC_RX_2_NIC_RX_RDMA | MLX5_FT_NIC_TX_RDMA_2_NIC_TX)
+
+struct mlx5_ipsec_fs *mlx5_ipsec_fs_roce_init(struct mlx5_core_dev *mdev)
+{
+	struct mlx5_ipsec_fs *roce_ipsec;
+	struct mlx5_flow_namespace *ns;
+
+	if (!mlx5_get_roce_state(mdev))
+		return NULL;
+
+	if ((MLX5_CAP_GEN_2(mdev, flow_table_type_2_type) &
+	     NIC_RDMA_BOTH_DIRS_CAPS) != NIC_RDMA_BOTH_DIRS_CAPS) {
+		mlx5_core_dbg(mdev, "Failed to init roce ipsec flow steering, capabilities not supported\n");
+		return NULL;
+	}
+
+	ns = mlx5_get_flow_namespace(mdev, MLX5_FLOW_NAMESPACE_RDMA_RX_IPSEC);
+	if (!ns) {
+		mlx5_core_err(mdev, "Failed to get roce rx ns\n");
+		return NULL;
+	}
+
+	roce_ipsec = kzalloc(sizeof(*roce_ipsec), GFP_KERNEL);
+	if (!roce_ipsec)
+		return NULL;
+
+	roce_ipsec->ipv4_rx.ns_rdma = ns;
+	roce_ipsec->ipv6_rx.ns_rdma = ns;
+
+	ns = mlx5_get_flow_namespace(mdev, MLX5_FLOW_NAMESPACE_RDMA_TX_IPSEC);
+	if (!ns) {
+		mlx5_core_err(mdev, "Failed to get roce tx ns\n");
+		goto err_tx;
+	}
+
+	roce_ipsec->tx.ns = ns;
+
+	return roce_ipsec;
+
+err_tx:
+	kfree(roce_ipsec);
+	return NULL;
+}
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lib/ipsec_fs_roce.h b/drivers/net/ethernet/mellanox/mlx5/core/lib/ipsec_fs_roce.h
new file mode 100644
index 000000000000..4b69d4e34234
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlx5/core/lib/ipsec_fs_roce.h
@@ -0,0 +1,20 @@
+/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */
+/* Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. */
+
+#ifndef __MLX5_LIB_IPSEC_H__
+#define __MLX5_LIB_IPSEC_H__
+
+struct mlx5_ipsec_fs;
+
+struct mlx5_flow_table *mlx5_ipsec_fs_roce_ft_get(struct mlx5_ipsec_fs *ipsec_roce, u32 family);
+void mlx5_ipsec_fs_roce_rx_destroy(struct mlx5_ipsec_fs *ipsec_roce, u32 family);
+int mlx5_ipsec_fs_roce_rx_create(struct mlx5_ipsec_fs *ipsec_roce, struct mlx5_flow_namespace *ns,
+				 struct mlx5_flow_destination *default_dst, u32 family, u32 level,
+				 u32 prio, struct mlx5_core_dev *mdev);
+void mlx5_ipsec_fs_roce_tx_destroy(struct mlx5_ipsec_fs *ipsec_roce);
+int mlx5_ipsec_fs_roce_tx_create(struct mlx5_ipsec_fs *ipsec_roce, struct mlx5_flow_table *pol_ft,
+				 struct mlx5_core_dev *mdev);
+void mlx5_ipsec_fs_roce_cleanup(struct mlx5_ipsec_fs *ipsec_roce);
+struct mlx5_ipsec_fs *mlx5_ipsec_fs_roce_init(struct mlx5_core_dev *mdev);
+
+#endif /* __MLX5_LIB_IPSEC_H__ */
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lib/mlx5.h b/drivers/net/ethernet/mellanox/mlx5/core/lib/mlx5.h
index 032adb21ad4b..bfd3a1121ed8 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/lib/mlx5.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/lib/mlx5.h
@@ -96,11 +96,6 @@ static inline struct net *mlx5_core_net(struct mlx5_core_dev *dev)
 	return devlink_net(priv_to_devlink(dev));
 }
 
-static inline void mlx5_uplink_netdev_set(struct mlx5_core_dev *mdev, struct net_device *netdev)
-{
-	mdev->mlx5e_res.uplink_netdev = netdev;
-}
-
 static inline struct net_device *mlx5_uplink_netdev_get(struct mlx5_core_dev *mdev)
 {
 	return mdev->mlx5e_res.uplink_netdev;
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/main.c b/drivers/net/ethernet/mellanox/mlx5/core/main.c
index df134f6d32dc..72716d1f8b37 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/main.c
@@ -336,6 +336,24 @@ static u16 to_fw_pkey_sz(struct mlx5_core_dev *dev, u32 size)
 	}
 }
 
+void mlx5_core_uplink_netdev_set(struct mlx5_core_dev *dev, struct net_device *netdev)
+{
+	mutex_lock(&dev->mlx5e_res.uplink_netdev_lock);
+	dev->mlx5e_res.uplink_netdev = netdev;
+	mlx5_blocking_notifier_call_chain(dev, MLX5_DRIVER_EVENT_UPLINK_NETDEV,
+					  netdev);
+	mutex_unlock(&dev->mlx5e_res.uplink_netdev_lock);
+}
+
+void mlx5_core_uplink_netdev_event_replay(struct mlx5_core_dev *dev)
+{
+	mutex_lock(&dev->mlx5e_res.uplink_netdev_lock);
+	mlx5_blocking_notifier_call_chain(dev, MLX5_DRIVER_EVENT_UPLINK_NETDEV,
+					  dev->mlx5e_res.uplink_netdev);
+	mutex_unlock(&dev->mlx5e_res.uplink_netdev_lock);
+}
+EXPORT_SYMBOL(mlx5_core_uplink_netdev_event_replay);
+
 static int mlx5_core_get_caps_mode(struct mlx5_core_dev *dev,
 				   enum mlx5_cap_type cap_type,
 				   enum mlx5_cap_mode cap_mode)
@@ -1608,6 +1626,7 @@ int mlx5_mdev_init(struct mlx5_core_dev *dev, int profile_idx)
 	lockdep_register_key(&dev->lock_key);
 	mutex_init(&dev->intf_state_mutex);
 	lockdep_set_class(&dev->intf_state_mutex, &dev->lock_key);
+	mutex_init(&dev->mlx5e_res.uplink_netdev_lock);
 
 	mutex_init(&priv->bfregs.reg_head.lock);
 	mutex_init(&priv->bfregs.wc_head.lock);
@@ -1696,6 +1715,7 @@ void mlx5_mdev_uninit(struct mlx5_core_dev *dev)
 	mutex_destroy(&priv->alloc_mutex);
 	mutex_destroy(&priv->bfregs.wc_head.lock);
 	mutex_destroy(&priv->bfregs.reg_head.lock);
+	mutex_destroy(&dev->mlx5e_res.uplink_netdev_lock);
 	mutex_destroy(&dev->intf_state_mutex);
 	lockdep_unregister_key(&dev->lock_key);
 }
diff --git a/include/linux/mlx5/device.h b/include/linux/mlx5/device.h
index 29d4b201c7b2..f2b271169daf 100644
--- a/include/linux/mlx5/device.h
+++ b/include/linux/mlx5/device.h
@@ -362,6 +362,7 @@ enum mlx5_event {
 
 enum mlx5_driver_event {
 	MLX5_DRIVER_EVENT_TYPE_TRAP = 0,
+	MLX5_DRIVER_EVENT_UPLINK_NETDEV,
 };
 
 enum {
diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h
index d476255c9a3f..cc48aa308269 100644
--- a/include/linux/mlx5/driver.h
+++ b/include/linux/mlx5/driver.h
@@ -49,6 +49,7 @@
 #include <linux/notifier.h>
 #include <linux/refcount.h>
 #include <linux/auxiliary_bus.h>
+#include <linux/mutex.h>
 
 #include <linux/mlx5/device.h>
 #include <linux/mlx5/doorbell.h>
@@ -674,6 +675,7 @@ struct mlx5e_resources {
 	} hw_objs;
 	struct devlink_port dl_port;
 	struct net_device *uplink_netdev;
+	struct mutex uplink_netdev_lock;
 };
 
 enum mlx5_sw_icm_type {
@@ -1011,6 +1013,9 @@ int mlx5_cmd_exec_polling(struct mlx5_core_dev *dev, void *in, int in_size,
 			  void *out, int out_size);
 bool mlx5_cmd_is_down(struct mlx5_core_dev *dev);
 
+void mlx5_core_uplink_netdev_set(struct mlx5_core_dev *mdev, struct net_device *netdev);
+void mlx5_core_uplink_netdev_event_replay(struct mlx5_core_dev *mdev);
+
 int mlx5_core_get_caps(struct mlx5_core_dev *dev, enum mlx5_cap_type cap_type);
 void mlx5_health_cleanup(struct mlx5_core_dev *dev);
 int mlx5_health_init(struct mlx5_core_dev *dev);
diff --git a/include/linux/mlx5/fs.h b/include/linux/mlx5/fs.h
index ba6958b49a8e..d72a09a3798c 100644
--- a/include/linux/mlx5/fs.h
+++ b/include/linux/mlx5/fs.h
@@ -51,6 +51,7 @@ enum mlx5_flow_destination_type {
 	MLX5_FLOW_DESTINATION_TYPE_COUNTER,
 	MLX5_FLOW_DESTINATION_TYPE_FLOW_TABLE_NUM,
 	MLX5_FLOW_DESTINATION_TYPE_RANGE,
+	MLX5_FLOW_DESTINATION_TYPE_TABLE_TYPE,
 };
 
 enum {
@@ -102,6 +103,8 @@ enum mlx5_flow_namespace_type {
 	MLX5_FLOW_NAMESPACE_PORT_SEL,
 	MLX5_FLOW_NAMESPACE_RDMA_RX_COUNTERS,
 	MLX5_FLOW_NAMESPACE_RDMA_TX_COUNTERS,
+	MLX5_FLOW_NAMESPACE_RDMA_RX_IPSEC,
+	MLX5_FLOW_NAMESPACE_RDMA_TX_IPSEC,
 };
 
 enum {
diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h
index a9ee7bc59c90..c3d3a2eef7d4 100644
--- a/include/linux/mlx5/mlx5_ifc.h
+++ b/include/linux/mlx5/mlx5_ifc.h
@@ -315,6 +315,11 @@ enum {
 	MLX5_CMD_OP_GENERAL_END = 0xd00,
 };
 
+enum {
+	MLX5_FT_NIC_RX_2_NIC_RX_RDMA = BIT(0),
+	MLX5_FT_NIC_TX_RDMA_2_NIC_TX = BIT(1),
+};
+
 struct mlx5_ifc_flow_table_fields_supported_bits {
 	u8         outer_dmac[0x1];
 	u8         outer_smac[0x1];
@@ -1496,7 +1501,9 @@ struct mlx5_ifc_cmd_hca_cap_bits {
 	u8         null_mkey[0x1];
 	u8         log_max_klm_list_size[0x6];
 
-	u8         reserved_at_120[0xa];
+	u8         reserved_at_120[0x2];
+	u8	   qpc_extension[0x1];
+	u8	   reserved_at_123[0x7];
 	u8         log_max_ra_req_dc[0x6];
 	u8         reserved_at_130[0x2];
 	u8         eth_wqe_too_small[0x1];
@@ -1662,7 +1669,9 @@ struct mlx5_ifc_cmd_hca_cap_bits {
 
 	u8         log_bf_reg_size[0x5];
 
-	u8         reserved_at_270[0x6];
+	u8         reserved_at_270[0x3];
+	u8	   qp_error_syndrome[0x1];
+	u8	   reserved_at_274[0x2];
 	u8         lag_dct[0x2];
 	u8         lag_tx_port_affinity[0x1];
 	u8         lag_native_fdb_selection[0x1];
@@ -1899,7 +1908,8 @@ struct mlx5_ifc_cmd_hca_cap_2_bits {
 
 	u8	   reserved_at_e0[0xc0];
 
-	u8	   reserved_at_1a0[0xb];
+	u8	   flow_table_type_2_type[0x8];
+	u8	   reserved_at_1a8[0x3];
 	u8	   log_min_mkey_entity_size[0x5];
 	u8	   reserved_at_1b0[0x10];
 
@@ -1923,6 +1933,7 @@ enum mlx5_ifc_flow_destination_type {
 	MLX5_IFC_FLOW_DESTINATION_TYPE_TIR          = 0x2,
 	MLX5_IFC_FLOW_DESTINATION_TYPE_FLOW_SAMPLER = 0x6,
 	MLX5_IFC_FLOW_DESTINATION_TYPE_UPLINK       = 0x8,
+	MLX5_IFC_FLOW_DESTINATION_TYPE_TABLE_TYPE   = 0xA,
 };
 
 enum mlx5_flow_table_miss_action {
@@ -1937,7 +1948,8 @@ struct mlx5_ifc_dest_format_struct_bits {
 
 	u8         destination_eswitch_owner_vhca_id_valid[0x1];
 	u8         packet_reformat[0x1];
-	u8         reserved_at_22[0xe];
+	u8         reserved_at_22[0x6];
+	u8         destination_table_type[0x8];
 	u8         destination_eswitch_owner_vhca_id[0x10];
 };
 
@@ -5342,6 +5354,37 @@ struct mlx5_ifc_query_rmp_in_bits {
 	u8         reserved_at_60[0x20];
 };
 
+struct mlx5_ifc_cqe_error_syndrome_bits {
+	u8         hw_error_syndrome[0x8];
+	u8         hw_syndrome_type[0x4];
+	u8         reserved_at_c[0x4];
+	u8         vendor_error_syndrome[0x8];
+	u8         syndrome[0x8];
+};
+
+struct mlx5_ifc_qp_context_extension_bits {
+	u8         reserved_at_0[0x60];
+
+	struct mlx5_ifc_cqe_error_syndrome_bits error_syndrome;
+
+	u8         reserved_at_80[0x580];
+};
+
+struct mlx5_ifc_qpc_extension_and_pas_list_in_bits {
+	struct mlx5_ifc_qp_context_extension_bits qpc_data_extension;
+
+	u8         pas[0][0x40];
+};
+
+struct mlx5_ifc_qp_pas_list_in_bits {
+	struct mlx5_ifc_cmd_pas_bits pas[0];
+};
+
+union mlx5_ifc_qp_pas_or_qpc_ext_and_pas_bits {
+	struct mlx5_ifc_qp_pas_list_in_bits qp_pas_list;
+	struct mlx5_ifc_qpc_extension_and_pas_list_in_bits qpc_ext_and_pas_list;
+};
+
 struct mlx5_ifc_query_qp_out_bits {
 	u8         status[0x8];
 	u8         reserved_at_8[0x18];
@@ -5358,7 +5401,7 @@ struct mlx5_ifc_query_qp_out_bits {
 
 	u8         reserved_at_800[0x80];
 
-	u8         pas[][0x40];
+	union mlx5_ifc_qp_pas_or_qpc_ext_and_pas_bits qp_pas_or_qpc_ext_and_pas;
 };
 
 struct mlx5_ifc_query_qp_in_bits {
@@ -5368,7 +5411,8 @@ struct mlx5_ifc_query_qp_in_bits {
 	u8         reserved_at_20[0x10];
 	u8         op_mod[0x10];
 
-	u8         reserved_at_40[0x8];
+	u8         qpc_ext[0x1];
+	u8         reserved_at_41[0x7];
 	u8         qpn[0x18];
 
 	u8         reserved_at_60[0x20];
@@ -8571,7 +8615,8 @@ struct mlx5_ifc_create_qp_in_bits {
 	u8         reserved_at_20[0x10];
 	u8         op_mod[0x10];
 
-	u8         reserved_at_40[0x8];
+	u8         qpc_ext[0x1];
+	u8         reserved_at_41[0x7];
 	u8         input_qpn[0x18];
 
 	u8         reserved_at_60[0x20];

* Re: pull-request: mlx5-next 2023-01-24 V2
From: Leon Romanovsky @ 2023-02-02  7:46 UTC (permalink / raw)
  To: Saeed Mahameed, David S. Miller, Jakub Kicinski, Paolo Abeni,
	Eric Dumazet
  Cc: Jason Gunthorpe, Saeed Mahameed, linux-rdma, netdev

On Thu, Jan 26, 2023 at 03:08:15PM -0800, Saeed Mahameed wrote:
> Hi, 
> 
> This pulls the mlx5-next branch, which includes changes from [1]:
> 
> 1) From Jiri: fix a deadlock in mlx5_ib's netdev notifier unregister.
> 2) From Mark and Patrisious: add IPsec RoCEv2 support.
> 
> [1] https://lore.kernel.org/netdev/20230105041756.677120-1-saeed@kernel.org/
> 
> Please pull into net-next and rdma-next.


Hi, 

I don't see it in net-next yet, can you please pull it?

There are outstanding RDMA patches which depend on this shared branch.
https://lore.kernel.org/all/cover.1673960981.git.leon@kernel.org

Thanks

* Re: pull-request: mlx5-next 2023-01-24 V2
From: Jakub Kicinski @ 2023-02-02 17:13 UTC (permalink / raw)
  To: Leon Romanovsky
  Cc: Saeed Mahameed, David S. Miller, Paolo Abeni, Eric Dumazet,
	Jason Gunthorpe, Saeed Mahameed, linux-rdma, netdev

On Thu, 2 Feb 2023 09:46:11 +0200 Leon Romanovsky wrote:
> I don't see it in net-next yet, can you please pull it?
> 
> There are outstanding RDMA patches which depend on this shared branch.
> https://lore.kernel.org/all/cover.1673960981.git.leon@kernel.org

FWIW I'm not nacking this but I'm not putting my name on the merge,
either. You need to convince one of the other netdev maintainers to
pull.

* Re: pull-request: mlx5-next 2023-01-24 V2
From: Jason Gunthorpe @ 2023-02-02 17:14 UTC (permalink / raw)
  To: Jakub Kicinski
  Cc: Leon Romanovsky, Saeed Mahameed, David S. Miller, Paolo Abeni,
	Eric Dumazet, Saeed Mahameed, linux-rdma, netdev

On Thu, Feb 02, 2023 at 09:13:12AM -0800, Jakub Kicinski wrote:
> On Thu, 2 Feb 2023 09:46:11 +0200 Leon Romanovsky wrote:
> > I don't see it in net-next yet, can you please pull it?
> > 
> > There are outstanding RDMA patches which depend on this shared branch.
> > https://lore.kernel.org/all/cover.1673960981.git.leon@kernel.org
> 
> FWIW I'm not nacking this but I'm not putting my name on the merge,
> either. You need to convince one of the other netdev maintainers to
> pull.

What is the issue with this PR?

It looks all driver internal to me?

Jason

* Re: pull-request: mlx5-next 2023-01-24 V2
From: Jakub Kicinski @ 2023-02-02 17:25 UTC (permalink / raw)
  To: Jason Gunthorpe
  Cc: Leon Romanovsky, Saeed Mahameed, David S. Miller, Paolo Abeni,
	Eric Dumazet, Saeed Mahameed, linux-rdma, netdev

On Thu, 2 Feb 2023 13:14:25 -0400 Jason Gunthorpe wrote:
> On Thu, Feb 02, 2023 at 09:13:12AM -0800, Jakub Kicinski wrote:
> > On Thu, 2 Feb 2023 09:46:11 +0200 Leon Romanovsky wrote:  
> > > I don't see it in net-next yet, can you please pull it?
> > > 
> > > There are outstanding RDMA patches which depend on this shared branch.
> > > https://lore.kernel.org/all/cover.1673960981.git.leon@kernel.org  
> > 
> > FWIW I'm not nacking this but I'm not putting my name on the merge,
> > either. You need to convince one of the other netdev maintainers to
> > pull.  
> 
> What is the issue with this PR?

You don't remember me trying to convince you to keep the RoCE stuff
away from our open source IPsec implementation?

> It looks all driver internal to me?

Typical in a proprietary world, like RDMA, isn't it?


I'm just letting you know why I'm not merging it. I'm not the only one
with the keys, find someone else to convince, please.

* Re: pull-request: mlx5-next 2023-01-24 V2
From: Jason Gunthorpe @ 2023-02-02 17:44 UTC (permalink / raw)
  To: Jakub Kicinski
  Cc: Leon Romanovsky, Saeed Mahameed, David S. Miller, Paolo Abeni,
	Eric Dumazet, Saeed Mahameed, linux-rdma, netdev

On Thu, Feb 02, 2023 at 09:25:07AM -0800, Jakub Kicinski wrote:
> On Thu, 2 Feb 2023 13:14:25 -0400 Jason Gunthorpe wrote:
> > On Thu, Feb 02, 2023 at 09:13:12AM -0800, Jakub Kicinski wrote:
> > > On Thu, 2 Feb 2023 09:46:11 +0200 Leon Romanovsky wrote:  
> > > > I don't see it in net-next yet, can you please pull it?
> > > > 
> > > > There are outstanding RDMA patches which depend on this shared branch.
> > > > https://lore.kernel.org/all/cover.1673960981.git.leon@kernel.org  
> > > 
> > > FWIW I'm not nacking this but I'm not putting my name on the merge,
> > > either. You need to convince one of the other netdev maintainers to
> > > pull.  
> > 
> > What is the issue with this PR?
> 
> You don't remember me trying to convince you to keep the RoCE stuff
> away from our open source IPsec implementation?

Huh? What does this:

https://lore.kernel.org/all/cover.1673960981.git.leon@kernel.org/

Have to do with IPsec?

Jason

* Re: pull-request: mlx5-next 2023-01-24 V2
From: Jakub Kicinski @ 2023-02-02 17:54 UTC (permalink / raw)
  To: Jason Gunthorpe
  Cc: Leon Romanovsky, Saeed Mahameed, David S. Miller, Paolo Abeni,
	Eric Dumazet, Saeed Mahameed, linux-rdma, netdev

On Thu, 2 Feb 2023 13:44:05 -0400 Jason Gunthorpe wrote:
> > You don't remember me trying to convince you to keep the RoCE stuff
> > away from our open source IPsec implementation?  
> 
> Huh? What does this:
> 
> https://lore.kernel.org/all/cover.1673960981.git.leon@kernel.org/
> 
> Have to do with IPsec?

Dunno. But I don't know what it has to do with the PR we're commenting
on either..

* Re: pull-request: mlx5-next 2023-01-24 V2
From: Leon Romanovsky @ 2023-02-02 18:03 UTC (permalink / raw)
  To: Jakub Kicinski
  Cc: Jason Gunthorpe, Saeed Mahameed, David S. Miller, Paolo Abeni,
	Eric Dumazet, Saeed Mahameed, linux-rdma, netdev

On Thu, Feb 02, 2023 at 09:54:53AM -0800, Jakub Kicinski wrote:
> On Thu, 2 Feb 2023 13:44:05 -0400 Jason Gunthorpe wrote:
> > > You don't remember me trying to convince you to keep the RoCE stuff
> > > away from our open source IPsec implementation?  
> > 
> > Huh? What does this:
> > 
> > https://lore.kernel.org/all/cover.1673960981.git.leon@kernel.org/
> > 
> > Have to do with IPsec?
> 
> Dunno. But I don't know what it has to do with the PR we're commenting
> on either..

It does, because I need the shared branch to hold the net/mlx5 patches
from that "special keys" series, and I patiently waited for any response.

First, I didn't see any comment about not pulling Saeed's PR.
Second, I didn't see any not-pulling comments on the other IPsec patches
which Saeed posted prior to issuing his PR.
Third, the IPsec patches are pure mlx5_core changes. This is where flow
steering lives.

➜  kernel git:(rdma-next) git diff --stat bb2e8913dc40..ml/mlx5-next
 drivers/net/ethernet/mellanox/mlx5/core/Makefile             |   2 +-
 drivers/net/ethernet/mellanox/mlx5/core/diag/fs_tracepoint.c |   4 +
 drivers/net/ethernet/mellanox/mlx5/core/en/fs.h              |   1 +
 drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec.h     |   1 +
 drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec_fs.c  |  59 ++++++--
 drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.c             |   6 +
 drivers/net/ethernet/mellanox/mlx5/core/fs_core.c            |  44 +++++-
 drivers/net/ethernet/mellanox/mlx5/core/lib/ipsec_fs_roce.c  | 372 +++++++++++++++++++++++++++++++++++++++++++++++++
 drivers/net/ethernet/mellanox/mlx5/core/lib/ipsec_fs_roce.h  |  20 +++
 include/linux/mlx5/fs.h                                      |   3 +
 10 files changed, 497 insertions(+), 15 deletions(-)

Thanks

* Re: pull-request: mlx5-next 2023-01-24 V2
From: Leon Romanovsky @ 2023-02-02 18:07 UTC (permalink / raw)
  To: Jason Gunthorpe
  Cc: Jakub Kicinski, Saeed Mahameed, David S. Miller, Paolo Abeni,
	Eric Dumazet, Saeed Mahameed, linux-rdma, netdev

On Thu, Feb 02, 2023 at 01:14:25PM -0400, Jason Gunthorpe wrote:
> On Thu, Feb 02, 2023 at 09:13:12AM -0800, Jakub Kicinski wrote:
> > On Thu, 2 Feb 2023 09:46:11 +0200 Leon Romanovsky wrote:
> > > I don't see it in net-next yet, can you please pull it?
> > > 
> > > There are outstanding RDMA patches which depend on this shared branch.
> > > https://lore.kernel.org/all/cover.1673960981.git.leon@kernel.org
> > 
> > FWIW I'm not nacking this but I'm not putting my name on the merge,
> > either. You need to convince one of the other netdev maintainers to
> > pull.
> 
> What is the issue with this PR?

The PR that is stuck is Saeed's. I waited for netdev to pull it before
adding new code and pulling it into the RDMA repo.

> 
> It looks all driver internal to me?

Yes, it is.

> 
> Jason

* Re: pull-request: mlx5-next 2023-01-24 V2
From: Saeed Mahameed @ 2023-02-02 18:15 UTC (permalink / raw)
  To: Leon Romanovsky
  Cc: Jakub Kicinski, Jason Gunthorpe, David S. Miller, Paolo Abeni,
	Eric Dumazet, Saeed Mahameed, linux-rdma, netdev

On 02 Feb 20:03, Leon Romanovsky wrote:
>On Thu, Feb 02, 2023 at 09:54:53AM -0800, Jakub Kicinski wrote:
>> On Thu, 2 Feb 2023 13:44:05 -0400 Jason Gunthorpe wrote:
>> > > You don't remember me trying to convince you to keep the RoCE stuff
>> > > away from our open source IPsec implementation?
>> >
>> > Huh? What does this:
>> >
>> > https://lore.kernel.org/all/cover.1673960981.git.leon@kernel.org/
>> >
>> > Have to do with IPsec?
>>
>> Dunno. But I don't know what it has to do with the PR we're commenting
>> on either..
>
>It has to do, because I need shared branch to put net/mlx5 patches from
>that "special keys" series and I patiently waited for any response.
>

Hi Jakub, in a nutshell, my PR adds the steering rules needed for IPsec
RoCE purely in mlx5; the IPsec tables are shared between netdev and rdma.
It's a reality that mlx5_core is serving both netdev and rdma. It's not
about who has the keys for approving, it's the fact that mlx5_core is
not just a netdev driver. This is true for all vendors who serve both
worlds, not just mlx5.



* Re: pull-request: mlx5-next 2023-01-24 V2
From: Jakub Kicinski @ 2023-02-02 18:30 UTC (permalink / raw)
  To: Saeed Mahameed
  Cc: Leon Romanovsky, Jason Gunthorpe, David S. Miller, Paolo Abeni,
	Eric Dumazet, Saeed Mahameed, linux-rdma, netdev

On Thu, 2 Feb 2023 10:15:57 -0800 Saeed Mahameed wrote:
> It's a reality that mlx5_core is serving both netdev and rdma. It's not
> about who has the keys for approving, it's the fact that mlx5_core is
> not just a netdev driver

Nah, nah, nah, don't play with me. You put in "full IPsec offload" 
with little netdev use, then start pushing RDMA IPsec patches.
Don't make it sound like netdev and rdma are separate entities which
just share the HW when you're using APIs of one to configure the other.
If RDMA invented its own API for IPsec without touching xfrm, we would
not be having this conversation. That'd be fine by me.

You used our APIs to make your proprietary thing easier to integrate and
configure - now you have to find someone who will pull the PR and still
sleep at night. Not me.


* Re: pull-request: mlx5-next 2023-01-24 V2
  2023-02-02 18:30                 ` Jakub Kicinski
@ 2023-02-03 20:05                   ` Saeed Mahameed
  2023-02-03 21:14                     ` Jakub Kicinski
  0 siblings, 1 reply; 33+ messages in thread
From: Saeed Mahameed @ 2023-02-03 20:05 UTC (permalink / raw)
  To: Jakub Kicinski
  Cc: Leon Romanovsky, Jason Gunthorpe, David S. Miller, Paolo Abeni,
	Eric Dumazet, Saeed Mahameed, linux-rdma, netdev

On 02 Feb 10:30, Jakub Kicinski wrote:
>On Thu, 2 Feb 2023 10:15:57 -0800 Saeed Mahameed wrote:
>> It's a reality that mlx5_core serves both netdev and rdma. It's not
>> about who has the keys for approving; it's the fact that mlx5_core is
>> not just a netdev driver
>
>Nah, nah, nah, don't play with me. You put in "full IPsec offload"
>with little netdev use, then start pushing RDMA IPsec patches.
>Don't make it sound like netdev and rdma are separate entities which
>just share the HW when you're using APIs of one to configure the other.
>If RDMA invented its own API for IPsec without touching xfrm, we would
>not be having this conversation. That'd be fine by me.
>
>You used our APIs to make your proprietary thing easier to integrate and
>configure - now you have to find someone who will pull the PR and still
>sleep at night. Not me.

I don't agree, RDMA isn't proprietary, and I wish not to go into this
political discussion, as this series isn't the right place for that.

To summarize, mlx5_core does RoCE traffic processing and directs it to
the mlx5_ib driver (a standard rdma stack); in this series we add RoCE ipsec
traffic processing as we do for all other RoCE traffic.

   net/mlx5: Implement new destination type TABLE_TYPE
   net/mlx5: Add IPSec priorities in RDMA namespaces
   net/mlx5: Configure IPsec steering for ingress RoCEv2 traffic
   net/mlx5: Configure IPsec steering for egress RoCEv2 traffic

The last two patches are literally just adding the steering rules
corresponding to ingress and egress RoCE traffic in mlx5_core steering
tables. 
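
If you want to see what that looks like in practice, below is a minimal
sketch against the in-tree mlx5_core flow steering API. The function and
macro names are the real API; the RoCEv2 port match and the forward-to-table
destination are illustrative only - the actual tables, priorities and
namespaces the series uses live in its new ipsec_fs_roce.c.

#include <linux/mlx5/driver.h>
#include <linux/mlx5/fs.h>

/* Sketch only: steer RoCEv2 packets (UDP dport 4791) from one flow
 * table to the next. Table choices here are illustrative, not what
 * the series actually installs.
 */
static struct mlx5_flow_handle *
roce_fwd_rule_sketch(struct mlx5_flow_table *ft,
                     struct mlx5_flow_table *next_ft)
{
        struct mlx5_flow_destination dst = {};
        struct mlx5_flow_act flow_act = {};
        struct mlx5_flow_handle *rule;
        struct mlx5_flow_spec *spec;

        spec = kvzalloc(sizeof(*spec), GFP_KERNEL);
        if (!spec)
                return ERR_PTR(-ENOMEM);

        /* Match only the RoCEv2 UDP destination port */
        spec->match_criteria_enable = MLX5_MATCH_OUTER_HEADERS;
        MLX5_SET_TO_ONES(fte_match_param, spec->match_criteria,
                         outer_headers.udp_dport);
        MLX5_SET(fte_match_param, spec->match_value,
                 outer_headers.udp_dport, ROCE_V2_UDP_DPORT);

        /* Forward anything that matches to the next table */
        flow_act.action = MLX5_FLOW_CONTEXT_ACTION_FWD_DEST;
        dst.type = MLX5_FLOW_DESTINATION_TYPE_FLOW_TABLE;
        dst.ft = next_ft;

        rule = mlx5_add_flow_rules(ft, spec, &flow_act, &dst, 1);
        kvfree(spec);
        return rule;
}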



* Re: pull-request: mlx5-next 2023-01-24 V2
  2023-01-26 23:08 pull-request: mlx5-next 2023-01-24 V2 Saeed Mahameed
  2023-02-02  7:46 ` Leon Romanovsky
@ 2023-02-03 20:14 ` Saeed Mahameed
  1 sibling, 0 replies; 33+ messages in thread
From: Saeed Mahameed @ 2023-02-03 20:14 UTC (permalink / raw)
  To: David S. Miller, Jakub Kicinski, Paolo Abeni, Eric Dumazet,
	Jason Gunthorpe
  Cc: Saeed Mahameed, linux-rdma, Leon Romanovsky, netdev

On 26 Jan 15:08, Saeed Mahameed wrote:
>Hi,
>
>This pulls mlx5-next branch which includes changes from [1]:
>
>1) From Jiri: fix a deadlock in mlx5_ib's netdev notifier unregister.
>2) From Mark and Patrisious: add IPsec RoCEv2 support.
>
>[1] https://lore.kernel.org/netdev/20230105041756.677120-1-saeed@kernel.org/
>
>Please pull into net-next and rdma-next.
>

[...]

>
>The following changes since commit b7bfaa761d760e72a969d116517eaa12e404c262:
>
>  Linux 6.2-rc3 (2023-01-08 11:49:43 -0600)
>
>are available in the Git repository at:
>
>  git://git.kernel.org/pub/scm/linux/kernel/git/mellanox/linux.git mlx5-next
>

Dave, Paolo, any chance you could pull this one?

The PR is already marked as accepted in patchwork but we don't see it in
net-next. Jason was planning to pull this into rdma-next, but since we got
a small conflict with net-next, we would like to make sure it's handled
first.

The conflict is very trivial; just take the two conflicting lines below:

diff --cc include/linux/mlx5/driver.h
index cd529e051b4d,cc48aa308269..000000000000
--- a/include/linux/mlx5/driver.h
+++ b/include/linux/mlx5/driver.h
@@@ -674,7 -675,7 +675,11 @@@ struct mlx5e_resources 
         } hw_objs;
         struct devlink_port dl_port;
         struct net_device *uplink_netdev;
++<<<<<<< HEAD
  +      struct mlx5_crypto_dek_priv *dek_priv;
++=======
+       struct mutex uplink_netdev_lock;
++>>>>>>> c4d508fbe54af3119e01672299514bfc83dfd59f
   };
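
In other words, the resolution keeps both new members; assuming the member
order doesn't matter here, the struct ends up roughly as:

struct mlx5e_resources {
        struct mlx5e_hw_objs {
                /* ... unchanged ... */
        } hw_objs;
        struct devlink_port dl_port;
        struct net_device *uplink_netdev;
        struct mlx5_crypto_dek_priv *dek_priv;
        struct mutex uplink_netdev_lock;
};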
   


* Re: pull-request: mlx5-next 2023-01-24 V2
  2023-02-03 20:05                   ` Saeed Mahameed
@ 2023-02-03 21:14                     ` Jakub Kicinski
  2023-02-04  0:18                       ` Jason Gunthorpe
  2023-02-04  0:47                       ` Saeed Mahameed
  0 siblings, 2 replies; 33+ messages in thread
From: Jakub Kicinski @ 2023-02-03 21:14 UTC (permalink / raw)
  To: Saeed Mahameed
  Cc: Leon Romanovsky, Jason Gunthorpe, David S. Miller, Paolo Abeni,
	Eric Dumazet, Saeed Mahameed, linux-rdma, netdev

I believe Paolo is planning to look next week. No idea why the patch
got marked as Accepted 🤷️

On Fri, 3 Feb 2023 12:05:56 -0800 Saeed Mahameed wrote:
> I don't agree, RDMA isn't proprietary, and I wish not to go into this
> political discussion, as this series isn't the right place for that.

I don't think it's a political discussion. Or at least not in the sense 
of hidden agendas because our agendas aren't hidden. I'm a maintainer
of an open source networking stack, you're working for a vendor who
wants to sell their own networking stack.

Perhaps you'd like to believe, and importantly have your customers
believe that it's the same networking stack. It is not; the crucial
transport part of your stack is completely closed.

I don't think we can expect Linus to take a hard stand on this, but
do not expect us to lend you our APIs and help you sell your product.

Saying that RDMA/RoCE is not proprietary because there is a "standard"
is like saying that Windows is an open source operating system because
it supports POSIX.

My objectives for netdev are:
 - give users vendor independence
 - give developers the ability to innovate

I have not seen an RDMA implementation which could deliver on either.
Merging this code is contrary to my objectives for the project.

> To summarize, mlx5_core does RoCE traffic processing and directs it to
> the mlx5_ib driver (a standard rdma stack); in this series we add RoCE ipsec
> traffic processing as we do for all other RoCE traffic.

I already said it. If you wanted to configure IPsec for RoCE you should
have added an API in the RDMA subsystem.


* Re: pull-request: mlx5-next 2023-01-24 V2
  2023-02-03 21:14                     ` Jakub Kicinski
@ 2023-02-04  0:18                       ` Jason Gunthorpe
  2023-02-04  1:45                         ` Jakub Kicinski
  2023-02-04  0:47                       ` Saeed Mahameed
  1 sibling, 1 reply; 33+ messages in thread
From: Jason Gunthorpe @ 2023-02-04  0:18 UTC (permalink / raw)
  To: Jakub Kicinski
  Cc: Saeed Mahameed, Leon Romanovsky, David S. Miller, Paolo Abeni,
	Eric Dumazet, Saeed Mahameed, linux-rdma, netdev

On Fri, Feb 03, 2023 at 01:14:56PM -0800, Jakub Kicinski wrote:
> I believe Paolo is planning to look next week. No idea why the patch
> got marked as Accepted 🤷️
> 
> On Fri, 3 Feb 2023 12:05:56 -0800 Saeed Mahameed wrote:
> > I don't agree, RDMA isn't proprietary, and I wish not to go into this
> > political discussion, as this series isn't the right place for that.
> 
> I don't think it's a political discussion. Or at least not in the sense 
> of hidden agendas because our agendas aren't hidden. I'm a maintainer
> of an open source networking stack, you're working for a vendor who
> wants to sell their own networking stack.

Wow, come down to earth a bit here, jeeze.

You are the maintainer of an open source subcomponent in Linux

I am the maintainer of an open source subcomponent in Linux

Gosh, they have some technological differences, but hey so does netdev
vs NVMe too - are you also upset that NVMe is less pure than netdev
because all the "crucial" flash management is proprietary?  Or suggest
that we should rip out all the AWS, GCP and HyperV drivers because the
hypervisor that creates them is closed source?

Heck, we both have quite interesting employers that bring their own
biases and echo chambers.

Dave drew his line for netdev long ago, and I really respect that
choice and his convictions. But don't act like it is "better" or
somehow "more Linusy" than every other subsystem in the kernel.

> I don't think we can expect Linus to take a hard stand on this, but
> do not expect us to lend you our APIs and help you sell your product.

I think Linus has taken a stand. He is working on *Linux* not GNU
Hurd. The difference is Linux welcomes all HW and all devices. Bring
your open source kernel code and open source user space and you are
welcome here.

Sure the community has lots of different opinions, and there is a
definite group that leans in direction of wanting more open-ness
outside the kernel too, but overall Linus has kept consistent and has
not refused participation of HW on stricter ideological grounds.

"You are welcome here" is exactly why Linux dominates the industry and
GNU Hurd is a footnote.

"help you sell your product" when talking about a fellow open source
subsystem is an insulting line that has no business on these mailing
lists.

> Saying that RDMA/RoCE is not proprietary because there is a "standard"
> is like saying that Windows is an open source operating system because
> it supports POSIX.

That is a very creative definition of proprietary.

If you said "open source software to operate standards based fixed
function HW engines" you'd have a lot more accuracy and credibility,
but it doesn't sound as scary when you say it like that, does it?

RDMA is a lot more open than an NVMe drive, for instance.

> My objectives for netdev are:
>  - give users vendor independence
>  - give developers the ability to innovate
> 
> I have not seen an RDMA implementation which could deliver on either.
> Merging this code is contrary to my objectives for the project.

The things we do in other parts of the kernel in no way degrade these
activities for netdev. RDMA mirroring the netdev configurations is
irrelevant to the continued technical development of netdev, or its
ability to innovate.

We've never once said "you can't do that" to netdev because of
something RDMA is doing. I've been strict about that, rdma is on the
side of netdev and does not shackle netdev.

You've made it very clear you don't like the RDMA technology, but you
have no right to try and use your position as a kernel maintainer to
try and kill it by refusing PRs to shared driver code.

Let's try to all get along.

> > To summarize, mlx5_core does RoCE traffic processing and directs it to
> > the mlx5_ib driver (a standard rdma stack); in this series we add RoCE ipsec
> > traffic processing as we do for all other RoCE traffic.
> 
> I already said it. If you wanted to configure IPsec for RoCE you should
> have added an API in the RDMA subsystem.

Did that years ago.

https://github.com/linux-rdma/rdma-core/blob/master/providers/mlx5/man/mlx5dv_flow_action_esp.3.md
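
For reference, a rough user-space sketch of that API; the function and
struct names are from rdma-core, but the SA attribute setup is elided since
SPI/keymat/replay parameters are deployment specific:

#include <infiniband/verbs.h>
#include <infiniband/mlx5dv.h>

/* Sketch: create an ESP flow action on a mlx5 device context. The
 * returned ibv_flow_action is later attached to an ibv_flow via an
 * IBV_FLOW_SPEC_ACTION_HANDLE spec.
 */
static struct ibv_flow_action *esp_action_sketch(struct ibv_context *ctx)
{
        struct ibv_flow_action_esp_attr esp = {};
        struct mlx5dv_flow_action_esp mlx5_attr = {};

        /* ... fill esp with the SA's SPI, keymat and replay config ... */

        return mlx5dv_create_flow_action_esp(ctx, &esp, &mlx5_attr);
}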

HW accelerated IPSEC has been in RDMA and DPDK for a long time now;
the mlx5 team is trying to catch netdev up because NVIDIA has
customers interested in using netdev with ipsec and would like to get
the best performance from their HW.

We always try to do a complete job and ensure that RDMA's use of the
shared IP/port and netdev use of the shared IP/port are as consistent
as we can get - and now that it is technically trivial for mlx5 to run
the RDMA IP traffic inside the HW that matches the netdev flows we
will do that too.

It is really paranoid to think we somehow did all the netdev
enablement just to get something in RDMA. Sorry, there is no
incredible irreplaceable value there. The netdev stuff was a lot of
difficult work and was very much done to run traffic originating in
netdev.

Real customers have mixed workloads, and I think that's great. You
should try looking outside the bubble of your peculiar hyperscaler
employer someday and see what the rest of the industry is doing. There
is a reason every high speed NIC has a RDMA offering now, a reason
every major cloud has some kind of RDMA based networking offering and
a reason I've been merging a couple new RDMA drivers every year.

None of that activity takes away from netdev - it is not a zero sum
game. Even more importantly, for Linux, my multivendor open source
community is every bit as legitimate as yours.

I appreciate your political leanings, and your deep concern for
netdev. But I have no idea why you care what RDMA does, and reject
this absurd notion that the IP address, or APIs inside our shared
Linux kernel are somehow "yours" alone to decide how and when they are
used.

Jason


* Re: pull-request: mlx5-next 2023-01-24 V2
  2023-02-03 21:14                     ` Jakub Kicinski
  2023-02-04  0:18                       ` Jason Gunthorpe
@ 2023-02-04  0:47                       ` Saeed Mahameed
  2023-02-04  1:57                         ` Jakub Kicinski
  1 sibling, 1 reply; 33+ messages in thread
From: Saeed Mahameed @ 2023-02-04  0:47 UTC (permalink / raw)
  To: Jakub Kicinski
  Cc: Leon Romanovsky, Jason Gunthorpe, David S. Miller, Paolo Abeni,
	Eric Dumazet, Saeed Mahameed, linux-rdma, netdev

On 03 Feb 13:14, Jakub Kicinski wrote:
>I believe Paolo is planning to look next week. No idea why the patch
>got marked as Accepted 🤷️
>
>On Fri, 3 Feb 2023 12:05:56 -0800 Saeed Mahameed wrote:
>> I don't agree, RDMA isn't proprietary, and I wish not to go into this
>> political discussion, as this series isn't the right place for that.
>
>I don't think it's a political discussion. Or at least not in the sense
>of hidden agendas because our agendas aren't hidden. I'm a maintainer
>of an open source networking stack, you're working for a vendor who
>wants to sell their own networking stack.
>

We don't own any networking stack. Yes, we do work on multiple open source
fronts and projects, but how is that related to this patchset?
For the sake of this patchset, this is purely mlx5 device management, and
yes, for RoCE traffic; RoCE is an RDMA spec and standard and an open source
mainstream kernel stack.

Now if you have issues with how they manage the RDMA stack, I'm 100% sure it
has nothing to do with mlx5_core, and such a political discussion should be
taken elsewhere.

>Perhaps you'd like to believe, and importantly have your customers
>believe that it's the same networking stack. It is not; the crucial

I personally don't believe it's the same networking stack.
  
>transport part of your stack is completely closed.
>

RDMA/RoCE is an open standard. Also, the ConnectX spec for both ethernet
and rdma and the driver implementation are completely open.
Yes, the standard, openly defined transport stack is implemented in HW;
hence RDMA.

>I don't think we can expect Linus to take a hard stand on this, but
>do not expect us to lend you our APIs and help you sell your product.
>
>Saying that RDMA/RoCE is not proprietary because there is a "standard"
>is like saying that Windows is an open source operating system because
>it supports POSIX.
>

Apples and oranges, really :) .. 

Sorry, but I have to disagree. The difference here is that the spec
is open and the stack is in mainstream Linux; there are at least
10 active vendors currently contributing to rdma with open source drivers
and open source user space, and there is a pure software RoCE
implementation for the paranoid who don't trust hw vendors. Oh, and it uses
netdev APIs; should that also be forbidden?

What you're really saying here is that no vendor is allowed to do any
offload or acceleration: not XDP, not even tunnel or vlan offload,
and devices should be a mere pipe.





* Re: pull-request: mlx5-next 2023-01-24 V2
  2023-02-04  0:18                       ` Jason Gunthorpe
@ 2023-02-04  1:45                         ` Jakub Kicinski
  2023-02-06 14:58                           ` Jason Gunthorpe
  0 siblings, 1 reply; 33+ messages in thread
From: Jakub Kicinski @ 2023-02-04  1:45 UTC (permalink / raw)
  To: Jason Gunthorpe
  Cc: Saeed Mahameed, Leon Romanovsky, David S. Miller, Paolo Abeni,
	Eric Dumazet, Saeed Mahameed, linux-rdma, netdev

On Fri, 3 Feb 2023 20:18:50 -0400 Jason Gunthorpe wrote:
> Wow, come down to earth a bit here, jeeze.
> 
> You are the maintainer of an open source subcomponent in Linux
> 
> I am the maintainer of an open source subcomponent in Linux
> 
> Gosh, they have some technological differences, but hey so does netdev
> vs NVMe too - are you also upset that NVMe is less pure than netdev
> because all the "crucial" flash management is proprietary?  Or suggest
> that we should rip out all the AWS, GCP and HyperV drivers because the
> hypervisor that creates them is closed source?

Perfectly irrelevant comparisons :/ How many times do I have to say
that all I'm asking is that you stay away from us and our APIs?

> Heck, we both have quite interesting employers that bring their own
> biases and echo chambers.

My employer has no influence on my opinions and is completely
irrelevant here :/ I hope the same is true for you.

> Dave drew his line for netdev long ago, and I really respect that
> choice and his convictions. But don't act like it is "better" or
> somehow "more Linusy" than every other subsystem in the kernel.

We do have more restrictions for HW access than most subsystems.
Whether that's better or worse depends on one's heuristics.

> > I don't think we can expect Linus to take a hard stand on this, but
> > do not expect us to lend you our APIs and help you sell your product.  
> 
> I think Linus has taken a stand. He is working on *Linux* not GNU
> Hurd. The difference is Linux welcomes all HW and all devices. Bring
> your open source kernel code and open source user space and you are
> welcome here.
> 
> Sure the community has lots of different opinions, and there is a
> definite group that leans in direction of wanting more open-ness
> outside the kernel too, but overall Linus has kept consistent and has
> not refused participation of HW on stricter ideological grounds.

I think that's accurate. The only dissent I'd like to register is over the use
of "HW" when the devices I'm concerned with run piles and piles of FW.
To avoid misunderstanding, I prefer the term "device".

> "You are welcome here" is exactly why Linux dominates the industry and
> GNU Hurd is a footnote.
> 
> "help you sell your product" when talking about a fellow open source
> subsystem is an insulting line that has no business on these mailing
> lists.

Well, perhaps I should have s/you/vendors/. I'm not saying that either
of you is sales motivated. At the same time I advise more introspection.

> > Saying that RDMA/RoCE is not proprietary because there is a "standard"
> > is like saying that Windows is an open source operating system because
> > it supports POSIX.  
> 
> That is a very creative definition of proprietary.
> 
> If you said "open source software to operate standards based fixed
> function HW engines" you'd have a lot more accuracy and credibility,
> but it doesn't sound as scary when you say it like that, does it?

Here you go again with the HW :)

Maybe to you it's all the same because you're not interested in network
protocols and networking in general? Apologies if that's a
misrepresentation, I don't really know you. I'm trying to understand
how can you possibly not see the difference, tho.

> RDMA is a lot more open than an NVMe drive, for instance.

Certainly. Still, I don't see the relevance of the openness of NVMe 
to me as a network engineer.

> > My objectives for netdev are:
> >  - give users vendor independence
> >  - give developers the ability to innovate
> > 
> > I have not seen an RDMA implementation which could deliver on either.
> > Merging this code is contrary to my objectives for the project.  
> 
> The things we do in other parts of the kernel in no way degrade these
> activities for netdev. RDMA mirroring the netdev configurations is
> irrelevant to the continued technical development of netdev, or its
> ability to innovate.
> 
> We've never once said "you can't do that" to netdev because of
> something RDMA is doing. I've been strict about that, rdma is on the
> side of netdev and does not shackle netdev.

There were multiple cases when I was trying to refactor some code,
ran into RDMA using it in odd ways and had to stop :/

> You've made it very clear you don't like the RDMA technology, but you
> have no right to try and use your position as a kernel maintainer to
> try and kill it by refusing PRs to shared driver code.

For the n-th time, not my intention. RDMA may be more open than NVMe.
Do your thing. Just do it with your own APIs.

> > > To summarize, mlx5_core does RoCE traffic processing and directs it to
> > > the mlx5_ib driver (a standard rdma stack); in this series we add RoCE ipsec
> > > traffic processing as we do for all other RoCE traffic.
> > 
> > I already said it. If you wanted to configure IPsec for RoCE you should
> > have added an API in the RDMA subsystem.  
> 
> Did that years ago.
> 
> https://github.com/linux-rdma/rdma-core/blob/master/providers/mlx5/man/mlx5dv_flow_action_esp.3.md
> 
> HW accelerated IPSEC has been in RDMA and DPDK for a long time now;
> the mlx5 team is trying to catch netdev up because NVIDIA has
> customers interested in using netdev with ipsec and would like to get
> the best performance from their HW.
> 
> We always try to do a complete job and ensure that RDMA's use of the
> shared IP/port and netdev use of the shared IP/port are as consistent
> as we can get - and now that it is technically trivial for mlx5 to run
> the RDMA IP traffic inside the HW that matches the netdev flows we
> will do that too.

see above

> It is really paranoid to think we somehow did all the netdev
> enablement just to get something in RDMA. Sorry, there is no
> incredible irreplaceable value there. The netdev stuff was a lot of
> difficult work and was very much done to run traffic originating in
> netdev.
> 
> Real customers have mixed workloads, and I think that's great. You
> should try looking outside the bubble of your peculiar hyperscaler
> employer someday and see what the rest of the industry is doing.

Frankly implying that my horizon is somehow limited to my employer
is insulting. Please stop. I've only worked at Meta since the start of
the pandemic, and was a netdev maintainer before that.

> There is a reason every high speed NIC has a RDMA offering now,

Not every, but it's certainly a valuable feature from the commercial
perspective.

> a reason every major cloud has some kind of RDMA based networking
> offering and a reason I've been merging a couple new RDMA drivers
> every year.

Yes, that is a longer and indeed interesting conversation for another
time. I heard a bit about Amazon's and Google's RDMA, both of which are
designed in house, so no loss of ability to innovate.

But please don't try to imply that TCP can't scale to high speeds.

> None of that activity takes away from netdev - it is not a zero sum
> game.

It's not a game at all.

> Even more importantly, for Linux, my multivendor open source
> community is every bit as legitimate as yours.

I don't know your community, can't comment.

> I appreciate your political leanings,

You're just baiting me now :)

> and your deep concern for netdev. But I have no idea why you care
> what RDMA does, and reject this absurd notion that the IP address,
> or APIs inside our shared Linux kernel are somehow "yours" alone to
> decide how and when they are used.

I do believe they belong to the netdev community. I don't know where
you'd draw the line otherwise.


* Re: pull-request: mlx5-next 2023-01-24 V2
  2023-02-04  0:47                       ` Saeed Mahameed
@ 2023-02-04  1:57                         ` Jakub Kicinski
  2023-02-05 10:26                           ` Leon Romanovsky
  0 siblings, 1 reply; 33+ messages in thread
From: Jakub Kicinski @ 2023-02-04  1:57 UTC (permalink / raw)
  To: Saeed Mahameed
  Cc: Leon Romanovsky, Jason Gunthorpe, David S. Miller, Paolo Abeni,
	Eric Dumazet, Saeed Mahameed, linux-rdma, netdev

On Fri, 3 Feb 2023 16:47:26 -0800 Saeed Mahameed wrote:
> On 03 Feb 13:14, Jakub Kicinski wrote:
> >I believe Paolo is planning to look next week. No idea why the patch
> >got marked as Accepted 🤷️
> >
> >On Fri, 3 Feb 2023 12:05:56 -0800 Saeed Mahameed wrote:  
> >> I don't agree, RDMA isn't proprietary, and I wish not to go into this
> >> political discussion, as this series isn't the right place for that.  
> >
> >I don't think it's a political discussion. Or at least not in the sense
> >of hidden agendas because our agendas aren't hidden. I'm a maintainer
> >of an open source networking stack, you're working for a vendor who
> >wants to sell their own networking stack.
> 
> We don't own any networking stack. Yes, we do work on multiple open source
> fronts and projects, but how is that related to this patchset?
> For the sake of this patchset, this is purely mlx5 device management, and
> yes, for RoCE traffic; RoCE is an RDMA spec and standard and an open source
> mainstream kernel stack.

My memory is that Leon proposed IPsec offload, I said "you're doing
this for RDMA", he said "no we will also need this for TC redirect",
I said "if you implement TC redirect that's a legit use of netdev APIs".

And now RDMA integration is coming, and no TC in sight.

I think it's reasonable for me to feel misled.

> >I don't think we can expect Linus to take a hard stand on this, but
> >do not expect us to lend you our APIs and help you sell your product.
> >
> >Saying that RDMA/RoCE is not proprietary because there is a "standard"
> >is like saying that Windows is an open source operating system because
> >it supports POSIX.
> 
> Apples and oranges, really :) .. 
> 
> Sorry, but I have to disagree. The difference here is that the spec
> is open and the stack is in mainstream Linux; there are at least
> 10 active vendors currently contributing to rdma with open source drivers
> and open source user space, and there is a pure software RoCE
> implementation for the paranoid who don't trust hw vendors. Oh, and it uses
> netdev APIs; should that also be forbidden?

I don't want to be having theoretical discussions.
In theory there could exist a fully open RoCE implementation which
inter-operates with all other implementations perfectly. Agreed.

> What you're really saying here is that no vendor is allowed to do any
> offload or acceleration ..

IDK where you got that from, and it's obviously counterfactual.
If I was nacking all offloads, I'd have nacked the "full" IPsec
offload and we wouldn't be having this conversation at all.


* Re: pull-request: mlx5-next 2023-01-24 V2
  2023-02-04  1:57                         ` Jakub Kicinski
@ 2023-02-05 10:26                           ` Leon Romanovsky
  0 siblings, 0 replies; 33+ messages in thread
From: Leon Romanovsky @ 2023-02-05 10:26 UTC (permalink / raw)
  To: Jakub Kicinski
  Cc: Saeed Mahameed, Jason Gunthorpe, David S. Miller, Paolo Abeni,
	Eric Dumazet, Saeed Mahameed, linux-rdma, netdev

On Fri, Feb 03, 2023 at 05:57:39PM -0800, Jakub Kicinski wrote:
> On Fri, 3 Feb 2023 16:47:26 -0800 Saeed Mahameed wrote:
> > On 03 Feb 13:14, Jakub Kicinski wrote:
> > >I believe Paolo is planning to look next week. No idea why the patch
> > >got marked as Accepted 🤷️

<...>

> My memory is that Leon proposed IPsec offload, I said "you're doing
> this for RDMA", he said "no we will also need this for TC redirect",
> I said "if you implement TC redirect that's a legit use of netdev APIs".
> 
> And now RDMA integration is coming, and no TC in sight.
> 
> I think it's reasonable for me to feel misled.

And I think that it is reasonable to assume that the company doesn't need
to stop its execution just because Leon is going through a very challenging
time.

Like I said, first I need to fix a HW limitation, and it is much harder
than it sounds.

https://git.kernel.org/pub/scm/linux/kernel/git/leon/linux-rdma.git/commit/?h=xfrm-latest&id=67bff1d4a6e30010b6fd88ddc3ed70e9da75c95a

Thanks


* Re: pull-request: mlx5-next 2023-01-24 V2
  2023-02-04  1:45                         ` Jakub Kicinski
@ 2023-02-06 14:58                           ` Jason Gunthorpe
  2023-02-07  0:38                             ` Jakub Kicinski
  0 siblings, 1 reply; 33+ messages in thread
From: Jason Gunthorpe @ 2023-02-06 14:58 UTC (permalink / raw)
  To: Jakub Kicinski
  Cc: Saeed Mahameed, Leon Romanovsky, David S. Miller, Paolo Abeni,
	Eric Dumazet, Saeed Mahameed, linux-rdma, netdev

On Fri, Feb 03, 2023 at 05:45:31PM -0800, Jakub Kicinski wrote:
> On Fri, 3 Feb 2023 20:18:50 -0400 Jason Gunthorpe wrote:
> > Wow, come down to earth a bit here, jeeze.
> > 
> > You are the maintainer of an open source subcomponent in Linux
> > 
> > I am the maintainer of an open source subcomponent in Linux
> > 
> > Gosh, they have some technological differences, but hey so does netdev
> > vs NVMe too - are you also upset that NVMe is less pure than netdev
> > because all the "crucial" flash management is proprietary?  Or suggest
> > that we should rip out all the AWS, GCP and HyperV drivers because the
> > hypervisor that creates them is closed source?
> 
> Perfectly irrelevant comparisons :/ How many times do I have to say
> that all I'm asking is that you stay away from us and our APIs?

What I'm reacting to is your remarks that came across as trying to
say that the particular netdev subsystem approach to open-ness was
in fact the same as the larger Linux values on open source and
community.

netdev is clearly more restrictive, so is DRM, and that's fine. But it
should stay in netdev and not be exported to the rest of the
kernel. Eg don't lock away APIs for what are really shared resources.

> > Heck, we both have quite interesting employers that bring their own
> > biases and echo chambers.
> 
> My employer has no influence on my opinions and is completely
> irrelevant here :/ I hope the same is true for you.

Well, I sit in an echo-chamber that is different than yours. I'm
doubtful it doesn't have at least some effect on all of us to hear the
same themes over and over.

What you posted about your goals for netdev is pretty consistent with
the typical approach from a hyperscaler purchasing department: Make it
all the same. Grind the competing vendors on price.

I'd say here things are more like "let's innovate!" "let's
differentiate!" "customers pay a premium for uniqueness"

Which side of the purchasing table is better for the resilience and
vibrancy of our community? I don't know. I prefer not to decide; I
think there is room for both to advance their interests. I don't view
one as taking away from the other in terms of open source.

> I think that's accurate. The only dissent I'd like to register is over the use
> of "HW" when the devices I'm concerned with run piles and piles of FW.
> To avoid misunderstanding, I prefer the term "device".

I use the term "HW" because Linux doesn't care what is under that HW
interface. Like I said, the AWS, GCP, HyperV stuff is often all SW
pretending to be HW. Nobody really knows what is hiding under the
register interface of a PCI device.

Even the purest most simple NIC is ultimately connected to a switch
which usually runs loads of proprietary software, so people can make
all kinds of ideological arguments about openness and freeness in the
space.

I would say, what the Linux community primarily concerns itself with
is the openness of the drivers and in-kernel code and the openness of
the userspace that consumes it. We've even walked back from demanding
an openness of the HW programming specification over the years.

Personally I feel the openness of the userspace is much more important
to the vibrancy of the community than openness of the HW/FW/SW thing
the device driver talks to. I don't like what I see as a dangerous
trend of large cloud operators pushing things into the kernel where
the gold standard userspace is kept as some internal proprietary
application.

At least here in this thread the IPSEC work is being built with and
tested against fully open source strong/openswan. So, I'm pretty
happy on ideological grounds.

> > That is a very creative definition of proprietary.
> > 
> > If you said "open source software to operate standards based fixed
> > function HW engines" you'd have a lot more accuracy and credibility,
> > but it doesn't sound as scary when you say it like that, does it?
> 
> Here you go again with the HW :)

In the early 2000's when this debate was had and Dave set the course
it really was almost pure HW in some of the devices. IIRC a few of the
TCP Offload vendors were doing TCP offload in SW cores, but that
wasn't universal. Certainly the first true RDMA devices (back in the
1990's!) were more HW than SW.

Even today the mlx devices are largely fixed function HW engines with
a bunch of software to configure them and babysit them when they get
grouchy.

This is why I don't like the HW/FW distinction as something relevant
to Linux - a TOE built in nearly pure HW RTL or a TOE that is all SW
are both equally unfree and proprietary. The SW/FW is just more vexing
because it is easier to imagine it as something that could be freed,
while ASIC gates are more accepted as unrealistic.

> Maybe to you it's all the same because you're not interested in network
> protocols and networking in general? Apologies if that's a
> misrepresentation, I don't really know you. I'm trying to understand
> how can you possibly not see the difference, tho.

I'm interested in the Linux software - and maintaining the open source
ecosystem. I've spent almost my whole career in this kind of space.

So I feel much closer to what I see as Linus's perspective: Bring your
open drivers, bring your open userspace, everyone is welcome.

In most cases I don't feel threatened by HW that absorbed SW
functions. I like NVMe as an example because NVMe sucked in,
basically, the entire MTD subsystem and a filesystem into drive FW and
made it all proprietary. But the MTD stuff still exists in Linux, if
you want to use it. We, as a community, haven't lost anything - we
just got out-competed by a better proprietary solution. Can't win them
all.

Port your essential argument over to the storage world - what would
you say if the MTD developers insisted that proprietary NVMe shouldn't
be allowed to use "their" block APIs in Linux?

Or the MD/DM developers said no RAID controller drivers were allowed
to use "their" block stack?

I think as an overall community we would lose more than we gain.

So, why in your mind is networking so different from storage?

> > We've never once said "you can't do that" to netdev because of
> > something RDMA is doing. I've been strict about that, rdma is on the
> > side of netdev and does not shackle netdev.
> 
> There were multiple cases when I was trying to refactor some code,
> ran into RDMA using it in odd ways and had to stop :/

Yes, that is true, but the same can be said about drivers using code
in odd ways and so on. Heck Alistair just hit some wonky netdev code
while working on MM cgroup stuff. I think this is normal and expected.

My threshold is more that if we do the hard work we can overcome
it. I never want to see netdev say "even with hard work we can't do
it because RDMA".  Just as I'd be unhappy for netdev to say MM can't
do the refactor they want (and I guess we will see what becomes of
Alistair's series because he has problems with skbuff that are not
obviously solvable)

What I mean, is we've never said something like - netdev can't
implement VXLAN in netdev because RDMA devices can't HW offload
that. That's obviously ridiculous. I've always thought that the
discussion around the TOE issue way back then was more around concepts
similar to stable-api-nonsense.rst (ie don't tie our SW API to HW
choices) than it was to ideological openness.

> > You've made it very clear you don't like the RDMA technology, but you
> > have no right to try and use your position as a kernel maintainer to
> > try and kill it by refusing PRs to shared driver code.
> 
> For the n-th time, not my intention. RDMA may be more open than NVMe.
> Do your thing. Just do it with your own APIs.

The standards being implemented broadly require the use of the APIs -
particularly the shared IP address.

Try to take them away and it is effectively killing the whole thing.

The shared IP comes along with a lot of baggage, including things like
IPSEC, VLAN, MACSEC, tc, routing, etc, etc. You can't really use just
the IP without the whole kit.

We've tried to keep RDMA implementations away from the TCP/UDP stack
(at Dave's request long ago) but even that is kind of a losing battle
because the standards bodies have said to use TCP and UDP headers as
well.

If you accept my philosophy "All are welcome" then how can I square
that with your demand to reject entire legitimate standards from
Linux?

Jason


* Re: pull-request: mlx5-next 2023-01-24 V2
  2023-02-06 14:58                           ` Jason Gunthorpe
@ 2023-02-07  0:38                             ` Jakub Kicinski
  2023-02-07 19:52                               ` Jason Gunthorpe
  0 siblings, 1 reply; 33+ messages in thread
From: Jakub Kicinski @ 2023-02-07  0:38 UTC (permalink / raw)
  To: Jason Gunthorpe
  Cc: Saeed Mahameed, Leon Romanovsky, David S. Miller, Paolo Abeni,
	Eric Dumazet, Saeed Mahameed, linux-rdma, netdev

On Mon, 6 Feb 2023 10:58:56 -0400 Jason Gunthorpe wrote:
> On Fri, Feb 03, 2023 at 05:45:31PM -0800, Jakub Kicinski wrote:
> > Perfectly irrelevant comparisons :/ How many times do I have to say
> > that all I'm asking is that you stay away from us and our APIs?  
> 
> What I'm reacting to is your remarks that came across as trying to
> say that the particular netdev subsystem approach to open-ness was
> in fact the same as the larger Linux values on open source and
> community.
>
> netdev is clearly more restrictive, so is DRM, and that's fine. But it
> should stay in netdev and not be exported to the rest of the
> kernel. Eg don't lock away APIs for what are really shared resources.

I think you're misrepresenting. The DRM example is pertinent.
The DRM disagreement as I recall it was whether Dave gets to nack
random drivers in misc/ which are implementing GPU-like functionality
but do _not_ use DRM APIs.

Whether one subsystem can use another subsystem's API over a maintainer's
NACK has a pretty obvious answer.

"Don't touch my APIs" separation is the simplest and most effective
solution to the problem of hosting code with different standards.

IMO netdev should not stand in the way of scale-out fabrics (IB etc.)
or IPUs, even tho they don't meet our standards of openness.
Good fences make good neighbors so I'd like to build a fence and avoid
having to discuss this over and over.

> > > Heck, we both have quite interesting employers that bring their own
> > > biases and echo chambers.
> > 
> > My employer has no influence on my opinions and is completely
> > irrelevant here :/ I hope the same is true for you.  
> 
> Well, I sit in an echo-chamber that is different than yours. I'm
> doubtful it doesn't have at least some effect on all of us to hear the
> same themes over and over.
> 
> What you posted about your goals for netdev is pretty consistent with
> the typical approach from a hyperscaler purchasing department: Make it
> all the same. Grind the competing vendors on price.

Hyperscalers perhaps drive harder bargains, but the volume is so high
I'd imagine it's much easier for a hyperscaler to spin up a team of
people to support a new vendor.

Everyone is familiar with the term "vendor lock-in". The principles
I listed are hardly hyperscaler driven.

> I'd say here things are more like "let's innovate!" "let's
> differentiate!" "customers pay a premium for uniqueness"

Which favors complex and hard-to-copy offloads, over
iterating on incremental common sense improvements.

> Which side of the purchasing table is better for the resilience and
> vibrancy of our community? I don't know. I prefer not to decide; I
> think there is room for both to advance their interests. I don't view
> one as taking away from the other in terms of open source.

The distinction between large users vs vendors is moderately meaningful.
Both will occasionally try to benefit themselves to the detriment of
the community. One has to take the developments case by case.

FWIW the "sides of the purchasing table" phrasing brings to mind
industry forums rather than open source communities... Whether Linux
is turning into an industry forum, and what Joreen would have to say
about that*.. discussion for another time.

(*  https://en.wikipedia.org/wiki/The_Tyranny_of_Structurelessness )

> > I think that's accurate. The only dissent I'd like to register is over the use
> > of "HW" when the devices I'm concerned with run piles and piles of FW.
> > To avoid misunderstanding, I prefer the term "device".
> 
> I use the term "HW" because Linux doesn't care what is under that HW
> interface. Like I said, the AWS, GCP, HyperV stuff is often all SW
> pretending to be HW. Nobody really knows what is hiding under the
> register interface of a PCI device.

I understand, but it can be very misleading in the context of discussions
about open source.

> Even the purest most simple NIC is ultimately connected to a switch
> which usually runs loads of proprietary software, so people can make
> all kinds of ideological arguments about openness and freeness in the
> space.
> 
> I would say, what the Linux community primarily concerns itself with
> is the openness of the drivers and in-kernel code and the openness of
> the userspace that consumes it. We've even walked back from demanding
> an openness of the HW programming specification over the years.
> 
> Personally I feel the openness of the userspace is much more important
> to the vibrancy of the community than openness of the HW/FW/SW thing
> the device driver talks to.

Hard to comment in abstract terms, but for me as a networking guy - 
I can fix bugs and experiment with TCP/IP. Take the patches that come
out of Google, or Cloudflare, or anyone else and use them.
An experience very different from that of folks who work on RDMA networks.

> I don't like what I see as a dangerous
> trend of large cloud operators pushing things into the kernel where
> the gold standard userspace is kept as some internal proprietary
> application.

Curious what you mean here.

> At least here in this thread the IPSEC work is being built with and
> tested against fully open source strong/openswan. So, I'm pretty
> happy on ideological grounds.
> 
> > > That is a very creative definition of proprietary.
> > > 
> > > If you said "open source software to operate standards based fixed
> > > function HW engines" you'd have a lot more accuracy and credibility,
> > > but it doesn't sound as scary when you say it like that, does it?  
> > 
> > Here you go again with the HW :)  
> 
> In the early 2000's when this debate was had and Dave set the course
> it really was almost pure HW in some of the devices. IIRC a few of the
> TCP Offload vendors were doing TCP offload in SW cores, but that
> wasn't universal. Certainly the first true RDMA devices (back in the
> 1990's!) were more HW than SW.
> 
> Even today the mlx devices are largely fixed function HW engines with
> a bunch of software to configure them and babysit them when they get
> grouchy.
> 
> This is why I don't like the HW/FW distinction as something relevant
> to Linux - a TOE built in nearly pure HW RTL or a TOE that is all SW
> are both equally unfree and proprietary. The SW/FW is just more vexing
> because it is easier to imagine it as something that could be freed,
> while ASIC gates are more accepted as unrealistic.

Agreed, and that's why saying "device" without specifying HW/FW/SW 
at all should be an acceptable middle ground. Not misleading or triggering.

> > Maybe to you it's all the same because you're not interested in network
> > protocols and networking in general? Apologies if that's a
> > misrepresentation, I don't really know you. I'm trying to understand
> > how can you possibly not see the difference, tho.  
> 
> I'm interested in the Linux software - and maintaining the open source
> ecosystem. I've spent almost my whole career in this kind of space.
> 
> So I feel much closer to what I see as Linus's perspective: Bring your
> open drivers, bring your open userspace, everyone is welcome.

(*as long as they are on a side of the purchasing table) ?

> In most cases I don't feel threatened by HW that absorbed SW
> functions. I like NVMe as an example because NVMe sucked in,
> basically, the entire MTD subsystem and a filesystem into drive FW and
> made it all proprietary. But the MTD stuff still exists in Linux, if
> you want to use it. We, as a community, haven't lost anything - we
> just got out-competed by a better proprietary solution. Can't win them
> all.
> 
> Port your essential argument over to the storage world - what would
> you say if the MTD developers insisted that proprietary NVMe shouldn't
> be allowed to use "their" block APIs in Linux?
> 
> Or the MD/DM developers said no RAID controller drivers were allowed
> to use "their" block stack?
> 
> I think as an overall community we would lose more than we gain.
> 
> So, why in your mind is networking so different from storage?

Networking is about connecting devices. It requires standards,
interoperability and backward compatibility.

I'm not an expert on storage but my understanding is that the
standardization of the internals is limited and seen as unnecessary.
So there is no real potential for open source implementations of
disk FW. Movement of data from point (a) to point (b) is not interesting
either so NVMe is perfectly fine. Developers innovate in filesystems 
instead.

In networking we have strong standards so you can (and do) write
open source software all the way down to the PHYs (serdes is where
things get quite tricky). At the same time movement of data from point
a to point b is _the_ problem so we need the ability to innovate in
the transport space.

Now we have strayed quite far from the initial problem under discussion,
but you can't say "networking is just like storage" and not expect
a tirade from a networking guy :-D 

> > > We've never once said "you can't do that" to netdev because of
> > > something RDMA is doing. I've been strict about that, rdma is on the
> > > side of netdev and does not shackle netdev.  
> > 
> > There were multiple cases when I was trying to refactor some code,
> > ran into RDMA using it in odd ways and had to stop :/
> 
> Yes, that is true, but the same can be said about drivers using code
> in odd ways and so on. Heck Alistair just hit some wonky netdev code
> while working on MM cgroup stuff. I think this is normal and expected.
> 
> My threshold is more that if we do the hard work we can overcome
> it. I never want to see netdev say "even with hard work we can't do
> it because RDMA".  Just as I'd be unhappy for netdev to say MM can't
> do the refactor they want (and I guess we will see what becomes of
> Alistair's series because he has problems with skbuff that are not
> obviously solvable)

Core kernel is not a good comparison. The example in DRM vs misc/
would be more fitting.

> What I mean, is we've never said something like - netdev can't
> implement VXLAN in netdev because RDMA devices can't HW offload
> that. That's obviously ridiculous. I've always thought that the
> discussion around the TOE issue way back then was more around concepts
> similar to stable-api-nonsense.rst (ie don't tie our SW API to HW
> choices) than it was to ideological openness.
> 
> > > You've made it very clear you don't like the RDMA technology, but you
> > > have no right to try and use your position as a kernel maintainer to
> > > try and kill it by refusing PRs to shared driver code.  
> > 
> > For the n-th time, not my intention. RDMA may be more open than NVMe.
> > Do your thing. Just do it with your own APIs.  
> 
> The standards being implemented broadly require the use of the APIs -
> particularly the shared IP address.

No point talking about IP addresses, that ship has sailed.
I bet the size of both communities was also orders of magnitude
smaller back then. Different conditions, different outcomes.

> Try to take them away and it is effectively killing the whole thing.
> 
> The shared IP comes along with a lot of baggage, including things like
> IPSEC, VLAN, MACSEC, tc, routing, etc, etc. You can't really use just
> the IP without the whole kit.
> 
> We've tried to keep RDMA implementations away from the TCP/UDP stack
> (at Dave's request long ago) but even that is kind of a losing battle
> because the standards bodies have said to use TCP and UDP headers as
> well.
> 
> If you accept my philosophy "All are welcome" then how can I square
> that with your demand to reject entire legitimate standards from
> Linux?

We don't support black-box transport offloads in netdev. I thought that
it'd come across but maybe I should spell it out - just because you
are welcome in Linux does not mean RDMA devices are welcome in netdev.

As much as we got distracted by our ideological differences over the
course of this thread - the issue is that I believe we had an agreement
which was not upheld.

I thought we compromised that, to make the full offload sensible in the
netdev world, nVidia would implement forwarding to xfrm tunnels using
tc rules. You want to add a feature in netdev, it needs to be usable 
in a non-trivial way in netdev. Seems fair.

The simplest way forward would be to commit to when mlx5 will support
redirects to xfrm tunnel via tc...


* Re: pull-request: mlx5-next 2023-01-24 V2
  2023-02-07  0:38                             ` Jakub Kicinski
@ 2023-02-07 19:52                               ` Jason Gunthorpe
  2023-02-07 22:03                                 ` Jakub Kicinski
  0 siblings, 1 reply; 33+ messages in thread
From: Jason Gunthorpe @ 2023-02-07 19:52 UTC (permalink / raw)
  To: Jakub Kicinski
  Cc: Saeed Mahameed, Leon Romanovsky, David S. Miller, Paolo Abeni,
	Eric Dumazet, Saeed Mahameed, linux-rdma, netdev

On Mon, Feb 06, 2023 at 04:38:41PM -0800, Jakub Kicinski wrote:
> On Mon, 6 Feb 2023 10:58:56 -0400 Jason Gunthorpe wrote:
> > On Fri, Feb 03, 2023 at 05:45:31PM -0800, Jakub Kicinski wrote:
> > > Perfectly irrelevant comparisons :/ How many times do I have to say
> > > that all I'm asking is that you stay away from us and our APIs?  
> > 
> > What I'm reacting to is your remarks that came across as trying to
> > say that the particular netdev subsystem approach to open-ness was
> > in fact the same as the larger Linux values on open source and
> > community.
> >
> > netdev is clearly more restrictive, so is DRM, and that's fine. But it
> > should stay in netdev and not be exported to the rest of the
> > kernel. Eg don't lock away APIs for what are really shared resources.
> 
> I think you're misrepresenting. The DRM example is pertinent.
> The DRM disagreement as I recall it was whether Dave gets to nack
> random drivers in misc/ which are implementing GPU-like functionality
> but do _not_ use DRM APIs.

That isn't what I was thinking about.

The DRM specialness is they are very demanding about having an open
user space. More so than most places in the kernel.

The misc/ argument was about drivers trying to avoid the strict DRM
open user space requirement. In the end Greg agreed that open
userspace was something he wanted for misc too.

DRM tried to use DMABUF as some kind of API wedge, but it didn't
really work out too well.

In the end the fight was ideological around what is open enough to be
inside Linux because the GPU devices were skirting around something of
a grey area in the project's philosophy on how much open user space is
actually required.

> Whether one subsystem can use another subsystem's API over a maintainer's
> NACK has a pretty obvious answer.

I would say not; I've never actually seen this, aside from netdev vs
rdma. If the APIs are being used wrong, sure, but not for ideological
reasons.

> Good fences make good neighbors so I'd like to build a fence and avoid
> having to discuss this over and over.

I also would like to not discuss this :)

> Everyone is familiar with the term "vendor lock-in". The principles
> I listed are hardly hyperscaler driven.

The hyperscalers brought it to a whole new level. Previously we'd see
industry consortiums try to hammer out some consolidation; now we
quite often see hyperscalers make their own private purchasing
standards and have vendors use them. I have mixed feelings about
the ecosystem value of private label standardization, especially if
the standard itself is kept secret.

Then of course we see the private standards get quietly implemented in
Linux.

An open source kernel implementation of a private standard for HW that
only one company can purchase that is only usable with a proprietary
userspace. Not exactly what I'd like to see.

> > I'd say here things are more like "let's innovate!" "let's
> > differentiate!" "customers pay a premium for uniqueness"
> 
> Which favors complex and hard-to-copy offloads, over
> iterating on incremental common sense improvements.

I wouldn't use such a broad brush, but sure, sometimes that is a
direction. More often, complexity is due to a lack of better ideas; nobody
actually wants it to be complex - that just makes it more expensive to
build and more likely to fail.

> FWIW the "sides of the purchasing table" phrasing brings to mind
> industry forums rather than open source communities... Whether Linux
> is turning into an industry forum, and what Joreen would have to say
> about that*.. discussion for another time.

Well, Linux is an industry forum for sure, and how much power it
projects varies. DRM's principled stand has undoubtedly had a large
impact, for instance.

> > I don't like what I see as a dangerous
> > trend of large cloud operators pushing things into the kernel where
> > the gold standard userspace is kept as some internal proprietary
> > application.
> 
> Curious what you mean here.

Ah, I stumble across stuff from time to time - KVM and related has
some interesting things. Especially with this new confidential compute
stuff. AMD just tried to get something into their mainline iommu
driver to support their out of tree kernel, for instance.

People try to bend the rules all the time.

> > I'm interested in the Linux software - and maintaining the open source
> > ecosystem. I've spent almost my whole career in this kind of space.
> > 
> > So I feel much closer to what I see as Linus's perspective: Bring your
> > open drivers, bring your open userspace, everyone is welcome.
> 
> (*as long as they are on a side of the purchasing table) ?

Naw, "hobbyists" are welcome of course, but I get the feeling that is
getting rarer.

> > Port your essential argument over to the storage world - what would
> > you say if the MTD developers insisted that proprietary NVMe shouldn't
> > be allowed to use "their" block APIs in Linux?
> > 
> > Or the MD/DM developers said no RAID controller drivers were allowed
> > to use "their" block stack?
> > 
> > I think as an overall community we would lose more than we gain.
> > 
> > So, why in your mind is networking so different from storage?
> 
> Networking is about connecting devices. It requires standards,
> interoperability and backward compatibility.
> 
> I'm not an expert on storage but my understanding is that the
> standardization of the internals is limited and seen as unnecessary.
> So there is no real potential for open source implementations of
> disk FW. Movement of data from point (a) to point (b) is not interesting
> either so NVMe is perfectly fine. Developers innovate in filesystems 
> instead.
>
> In networking we have strong standards so you can (and do) write
> open source software all the way down to the PHYs (serdes is where
> things get quite tricky). At the same time movement of data from point
> a to point b is _the_ problem so we need the ability to innovate in
> the transport space.
> 
> Now we have strayed quite far from the initial problem under discussion,
> but you can't say "networking is just like storage" and not expect
> a tirade from a networking guy :-D 

Heh, well, I don't agree with your characterization - from an open
source perspective I wouldn't call any FW "uninteresting", and the
storage device SW internals are super interesting/complicated and full
of incredible innovation.

Even PHYs, at slow speeds, are mostly closed FW running in proprietary
DSPs. netdev has drawn a line: it wants to innovate at the packet level,
but everything underneath is still basically closed/proprietary.

I think that is great for netdev, but moving the line one OSI level
higher doesn't suddenly create an open source problem either, IMHO.

> > > > You've made it very clear you don't like the RDMA technology, but you
> > > > have no right to try and use your position as a kernel maintainer to
> > > > try and kill it by refusing PRs to shared driver code.  
> > > 
> > > For the n-th time, not my intention. RDMA may be more open than NVMe.
> > > Do your thing. Just do it with your own APIs.  
> > 
> > The standards being implemented broadly require the use of the APIs -
> > particularly the shared IP address.
> 
> No point talking about IP addresses, that ship has sailed.
> I bet the size of both communities was also orders of magnitude
> smaller back then. Different conditions different outcomes.

So, like I said, IP comes with baggage. Where do you draw the line?
What facets of the IP are we allowed to mirror and what are not? How
are you making this seemingly arbitrary decision?

The ipsec patches here have almost 0 impact on netdev because it is a
tiny steering engine configuration. I'd have more sympathy for the
argument if it was consuming a huge API surface to do this.

> We don't support black-box transport offloads in netdev. I thought that
> it'd come across but maybe I should spell it out - just because you
> are welcome in Linux does not mean RDMA devices are welcome in netdev.

Which is why they are not in netdev :) Nobody doubts this.

> As much as we got distracted by our ideological differences over the
> course of this thread - the issue is that I believe we had an agreement
> which was not upheld.
>
> I thought we compromised that to make the full offload sensible in
> netdev world nVidia would implement forwarding to xfrm tunnels using 
> tc rules. You want to add a feature in netdev, it needs to be usable
> in a non-trivial way in netdev. Seems fair.

Yes, and it is on Leon's work list. Notice Leon didn't do these RDMA
IPSEC patches. This is a huge journey for us; there are lots of parts
and several people working on it.

I understood the agreement was that we would do it, not that it would
be done as the very next thing. Steffen also asked for stuff and Leon
is working on that too.

> The simplest way forward would be to commit to when mlx5 will support
> redirects to xfrm tunnel via tc...

He needs to fix the bugs he created and found first :)

As far as I'm concerned TC will stay on his list until it is done.

Jason


* Re: pull-request: mlx5-next 2023-01-24 V2
  2023-02-07 19:52                               ` Jason Gunthorpe
@ 2023-02-07 22:03                                 ` Jakub Kicinski
  2023-02-08  9:17                                   ` Leon Romanovsky
  2023-02-08 16:13                                   ` Jason Gunthorpe
  0 siblings, 2 replies; 33+ messages in thread
From: Jakub Kicinski @ 2023-02-07 22:03 UTC (permalink / raw)
  To: Jason Gunthorpe
  Cc: Saeed Mahameed, Leon Romanovsky, David S. Miller, Paolo Abeni,
	Eric Dumazet, Saeed Mahameed, linux-rdma, netdev

On Tue, 7 Feb 2023 15:52:59 -0400 Jason Gunthorpe wrote:
> On Mon, Feb 06, 2023 at 04:38:41PM -0800, Jakub Kicinski wrote:
> > On Mon, 6 Feb 2023 10:58:56 -0400 Jason Gunthorpe wrote:  
> > > What I'm reacting to is your remarks that came across as trying to
> > > say that the particular netdev subsystem approach to open-ness was
> > > in fact the same as the larger Linux values on open source and
> > > community.
> > >
> > > netdev is clearly more restrictive, so is DRM, and that's fine. But it
> > > should stay in netdev and not be exported to the rest of the
> > > kernel. Eg don't lock away APIs for what are really shared resources.  
> > 
> > I think you're misrepresenting. The DRM example is pertinent.
> > The DRM disagreement as I recall it was whether Dave gets to nack
> > random drivers in misc/ which are implementing GPU-like functionality
> > but do _not_ use DRM APIs.  
> 
> That isn't what I was thinking about.
> 
> The DRM specialness is they are very demanding about having an open
> user space. More so than most places in the kernel.
> 
> The misc/ argument was about drivers trying to avoid the strict DRM
> open user space requirement. In the end Greg agreed that open
> userspace was something he wanted for misc too.
> 
> DRM tried to use DMABUF as some kind of API wedge, but it didn't
> really work out too well.

DMABUF was what I remember from the Maintainer Summit.
I don't follow the DRM tho, so can't tell if it worked or not :(

> In the end the fight was ideological around what is open enough to be
> inside Linux because the GPU devices were skirting around something of
> a grey area in the project's philosophy on how much open user space is
> actually required.

Right, I see that as very similar to our situation.

> > Whether one subsystem can use another subsystem's API over maintainer's
> > NACK has a pretty obvious answer.  
> 
> I would say not, I've never seen this actually aside from netdev vs
> rdma. If the APIs are being used wrong, sure, but not for ideological
> reasons.
> 
> > Good fences make good neighbors so I'd like to build a fence and
> > avoid having to discuss this over and over.  
> 
> I also would like to not discuss this :)

Well, then... Suggest a delineation or a way forward if you don't like
mine. The circular conversation + RDMA gets its way has to end sooner
or later.

> > Everyone is familiar with the term "vendor lock-in". The principles
> > I listed are hardly hyperscaler driven.  
> 
> The hyperscalers brought it to a whole new level. Previously we'd see
> industry consortia try to hammer out some consolidation; now we
> quite often see hyperscalers make their own private purchasing
> standards and have vendors use them. I have mixed feelings about
> the ecosystem value of private label standardization, especially if
> the standard itself is kept secret.
> 
> Then of course we see the private standards get quietly implemented in
> Linux.
> 
> An open source kernel implementation of a private standard for HW that
> only one company can purchase that is only usable with a proprietary
> userspace. Not exactly what I'd like to see.

You switched your argument 180 degrees.

First you said:

  What you posted about your goals for netdev is pretty consistent with
  the typical approach from a hyperscaler purchasing department: Make it
  all the same. Grind the competing vendors on price.

So "Make it all the same". Now you're saying hyperscalers have their
own standards.

Don't get me wrong, large customers get to ask for custom solutions.
In networking this is a well-known anti-pattern; ask anyone who has
ever worked on telco solutions or routing protocols.

But I'm struggling to find a coherent argument in what you're saying.

> > > I'd say here things are more like "lets innovate!" "lets
> > > differentiate!" "customers pay a premium for uniquess"  
> > 
> > Which favors complex and hard-to-copy offloads, over
> > iterating on incremental common sense improvements.  
> 
> I wouldn't use such a broad brush, but sure, sometimes that is a
> direction. More often complexity is due to a lack of better ideas; nobody
> actually wants it to be complex, that just makes it more expensive to
> build and more likely to fail.

It'd be unprofessional for me to share details in this forum,
unfortunately.

> > FWIW the "sides of the purchasing table" phrasing brings to mind
> > industry forums rather than open source communities... Whether Linux
> > is turning into an industry forum, and what Joreen would have to say
> > about that*.. discussion for another time.  
> 
> Well, Linux is an industry forum for sure, and it varies how much power
> it projects. DRM's principled stand has undoubtedly had a large
> impact, for instance.

And so obviously did netdev.

> > > I don't like what I see as a dangerous
> > > trend of large cloud operators pushing things into the kernel
> > > where the gold standard userspace is kept as some internal
> > > proprietary application.  
> > 
> > Curious what you mean here.  
> 
> Ah, I stumble across stuff from time to time - KVM and related has
> some interesting things. Especially with this new confidential compute
> stuff. AMD just tried to get something into their mainline iommu
> driver to support their out of tree kernel, for instance.
> 
> People try to bend the rules all the time.

AMD is a vendor, tho, you said "trend of large cloud operators pushing
things into the kernel". I was curious to hear the hyperscaler example
'cause I'd like to be vigilant.

> > > I'm interested in the Linux software - and maintaining the open
> > > source ecosystem. I've spent almost my whole career in this kind
> > > of space.
> > > 
> > > So I feel much closer to what I see as Linus's perspective: Bring
> > > your open drivers, bring your open userspace, everyone is
> > > welcome.  
> > 
> > (*as long as they are on a side of the purchasing table) ?  
> 
> Naw, "hobbyists" are welcome of course, but I get the feeling that is
> getting rarer.

An influx of talented newcomers is more important to me personally
than keeping vendors happy.

> > > Port your essential argument over to the storage world - what
> > > would you say if the MTD developers insisted that proprietary
> > > NVMe shouldn't be allowed to use "their" block APIs in Linux?
> > > 
> > > Or the MD/DM developers said no RAID controller drivers were
> > > allowed to use "their" block stack?
> > > 
> > > I think as an overall community we would lose more than we gain.
> > > 
> > > So, why in your mind is networking so different from storage?  
> > 
> > Networking is about connecting devices. It requires standards,
> > interoperability and backward compatibility.
> > 
> > I'm not an expert on storage but my understanding is that the
> > standardization of the internals is limited and seen as unnecessary.
> > So there is no real potential for open source implementations of
> > disk FW. Movement of data from point (a) to point (b) is not
> > interesting either so NVMe is perfectly fine. Developers innovate
> > in filesystems instead.
> >
> > In networking we have strong standards so you can (and do) write
> > open source software all the way down to the PHYs (serdes is where
> > things get quite tricky). At the same time movement of data from
> > point a to point b is _the_ problem so we need the ability to
> > innovate in the transport space.
> > 
> > Now we have strayed quite far from the initial problem under
> > discussion, but you can't say "networking is just like storage" and
> > not expect a tirade from a networking guy :-D   
> 
> Heh, well, I don't agree with your characterization - from an open
> source perspective I wouldn't call any FW "uninteresting", and the
> storage device SW internals are super interesting/complicated and full
> of incredible innovation.

That's not what I said. I said movement of data, i.e. the device
interface (NVMe), is not interesting.

FW would be interesting. But AFAIU the FW is not open because there was
no "insertion point" for the community to start hacking. I am very glad
that you think all FW is interesting. In my previous job we were able
to open source the FW the device was running, including the
ahead-of-its-time BPF offload: https://github.com/Netronome/nic-firmware
Should I be looking forward to open source FW coming out of nVidia? :)

> Even PHYs, at slow speeds, are mostly closed FW running in proprietary
> DSPs. netdev has drawn a line: it wants to innovate at the packet level,
> but everything underneath is still basically closed/proprietary.

Yes, that is what I said: serdes and down.

> I think that is great for netdev, but moving the line one OSI level
> higher doesn't suddenly create an open source problem either, IMHO.

Open source problem? Mis-worded perhaps?

> > > The standards being implemented broadly require the use of the
> > > APIs - particularly the shared IP address.  
> > 
> > No point talking about IP addresses, that ship has sailed.
> > I bet the size of both communities was also orders of magnitude
> > smaller back then. Different conditions different outcomes.  
> 
> So, like I said, IP comes with baggage. Where do you draw the line?
> What facets of the IP are we allowed to mirror and what are not? How
> are you making this seemingly arbitrary decision?

I have some heuristics I use, but I don't really want to be in the
defensive position forever. You suggest something, please.

> The ipsec patches here have almost 0 impact on netdev because it is a
> tiny steering engine configuration. I'd have more sympathy for the
> argument if it was consuming a huge API surface to do this.

The existence of the full IPsec offload in its entirety is questionable.
We let the earlier patches in trusting that you'll deliver the
forwarding support. We're calling "stop" here because when the patches
from this PR were posted to the list we learned for the first time
that the forwarding is perhaps less real than expected.

> > We don't support black-box transport offloads in netdev. I thought
> > that it'd come across but maybe I should spell it out - just
> > because you are welcome in Linux does not mean RDMA devices are
> > welcome in netdev.  
> 
> Which is why they are not in netdev :) Nobody doubts this.
> 
> > As much as we got distracted by our ideological differences over the
> > course of this thread - the issue is that I believe we had an
> > agreement which was not upheld.
> >
> > I thought we compromised that to make the full offload sensible in
> > netdev world nVidia would implement forwarding to xfrm tunnels
> > using tc rules. You want to add a feature in netdev, it needs to be
> > usable in a non-trivial way in netdev. Seems fair.  
> 
> Yes, and it is on Leon's work list. Notice Leon didn't do these RDMA
> IPSEC patches. This is a huge journey for us; there are lots of parts
> and several people working on it.
> 
> I understood the agreement was that we would do it, not that it would
> be done as the very next thing. Steffen also asked for stuff and Leon
> is working on that too.
> 
> > The simplest way forward would be to commit to when mlx5 will
> > support redirects to xfrm tunnel via tc...  
> 
> He needs to fix the bugs he created and found first :)
> 
> As far as I'm concerned TC will stay on his list until it is done.

This is what I get for trusting a vendor :/

If you can't make a commitment my strong recommendation is for this code
to not be accepted upstream until TC patches emerge.
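
To make the ask concrete - a rough sketch of the kind of configuration
the compromise was about, with made-up addresses, key and interface
names, assuming iproute2 with xfrm packet offload support (a sketch,
not a recipe):

  # an xfrm interface to use as a redirect target
  ip link add xfrm0 type xfrm dev eth0 if_id 42

  # SA + policy in full (packet) offload mode, bound to if_id 42
  ip xfrm state add src 192.0.2.1 dst 192.0.2.2 proto esp spi 0x1000 \
      reqid 1 mode tunnel \
      aead 'rfc4106(gcm(aes))' 0x6162636465666768696a6b6c6d6e6f7071727374 128 \
      offload packet dev eth0 dir out if_id 42
  ip xfrm policy add src 10.0.0.0/24 dst 10.1.0.0/24 dir out if_id 42 \
      tmpl src 192.0.2.1 dst 192.0.2.2 proto esp reqid 1 mode tunnel \
      offload packet dev eth0

  # redirect traffic from a (hypothetical) VF representor into the tunnel
  tc filter add dev vf0_rep ingress protocol ip flower dst_ip 10.1.0.0/24 \
      action mirred egress redirect dev xfrm0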


* Re: pull-request: mlx5-next 2023-01-24 V2
  2023-02-07 22:03                                 ` Jakub Kicinski
@ 2023-02-08  9:17                                   ` Leon Romanovsky
  2023-02-08 16:13                                   ` Jason Gunthorpe
  1 sibling, 0 replies; 33+ messages in thread
From: Leon Romanovsky @ 2023-02-08  9:17 UTC (permalink / raw)
  To: Jakub Kicinski
  Cc: Jason Gunthorpe, Saeed Mahameed, David S. Miller, Paolo Abeni,
	Eric Dumazet, Saeed Mahameed, linux-rdma, netdev

On Tue, Feb 07, 2023 at 02:03:30PM -0800, Jakub Kicinski wrote:
> On Tue, 7 Feb 2023 15:52:59 -0400 Jason Gunthorpe wrote:
> > On Mon, Feb 06, 2023 at 04:38:41PM -0800, Jakub Kicinski wrote:
> > > On Mon, 6 Feb 2023 10:58:56 -0400 Jason Gunthorpe wrote:  

<...>

> > > The simplest way forward would be to commit to when mlx5 will
> > > support redirects to xfrm tunnel via tc...  
> > 
> > He needs to fix the bugs he created and found first :)
> > 
> > As far as I'm concerned TC will stay on his list until it is done.
> 
> This is what I get for trusting a vendor :/

I'm speechless. I was very clear about what my implementation roadmap is.

Are you saying that your want-it-now TC attitude is more important than
making the HW work (bugs found when integrating with libreswan) and
Steffen's request to implement tunnel mode?

> 
> If you can't make a commitment

You got this commitment from two people already, but unfortunately
you refuse to listen.

Thanks


* Re: pull-request: mlx5-next 2023-01-24 V2
  2023-02-07 22:03                                 ` Jakub Kicinski
  2023-02-08  9:17                                   ` Leon Romanovsky
@ 2023-02-08 16:13                                   ` Jason Gunthorpe
  2023-02-08 23:19                                     ` Jakub Kicinski
  1 sibling, 1 reply; 33+ messages in thread
From: Jason Gunthorpe @ 2023-02-08 16:13 UTC (permalink / raw)
  To: Jakub Kicinski
  Cc: Saeed Mahameed, Leon Romanovsky, David S. Miller, Paolo Abeni,
	Eric Dumazet, Saeed Mahameed, linux-rdma, netdev

On Tue, Feb 07, 2023 at 02:03:30PM -0800, Jakub Kicinski wrote:
> > In the end the fight was ideological around what is open enough to be
> > inside Linux because the GPU devices were skirting around something of
> > a grey area in the project's philosophy on how much open user space is
> > actually required.
> 
> Right, I see that as very similar to our situation.

Er, not at all. As I've explained many times now, RDMA is well aligned
with mainstream Linux ideology on code openness.

> > > Good fences make good neighbors so I'd like to build a fence and
> > > avoid having to discuss this over and over.  
> > 
> > I also would like to not discuss this :)
> 
> Well, then... Suggest a delineation or a way forward if you don't like
> mine. The circular conversation + RDMA gets its way has to end sooner
> or later.

I can't accept yours because it means RDMA stops existing. So we must
continue with what has been done for the last 15 years - RDMA
(selectively) mirrors the IP and everything running at or below the IP
header level.

> > An open source kernel implementation of a private standard for HW that
> > only one company can purchase that is only usable with a proprietary
> > userspace. Not exactly what I'd like to see.
> 
> You switched your argument 180 degrees.
> 
> First you said:
> 
>   What you posted about your goals for netdev is pretty consistent with
>   the typical approach from a hyperscaler purchasing department: Make it
>   all the same. Grind the competing vendors on price.
> 
> So "Make it all the same". Now you're saying hyperscalers have their
> own standards.

What do you mean? "make it all the same" can be done with private or
open standards?

> > Ah, I stumble across stuff from time to time - KVM and related has
> > some interesting things. Especially with this new confidential compute
> > stuff. AMD just tried to get something into their mainline iommu
> > driver to support their out of tree kernel, for instance.
> > 
> > People try to bend the rules all the time.
> 
> AMD is a vendor, tho, you said "trend of large cloud operators pushing
> things into the kernel". I was curious to hear the hyperscaler example
> 'cause I'd like to be vigilant.

I'm looking at it from the perspective of who owns, operates and
monetizes the proprietary closed-source kernel fork. It is not AMD.

AMD/Intel/ARM provided open patches to a hyperscaler(s) for their CC
solutions that haven't been merged yet. The hyperscaler is the one
that forked Linux into closed source, integrated them and is operating
the closed solution.

That the vendor pushes little parts of the hyperscaler solution to the
kernel & ecosystem in a trickle doesn't make the sad state of affairs
exclusively the vendor's fault, even if their name is on the patches,
IMHO.

> > The ipsec patches here have almost 0 impact on netdev because it is a
> tiny steering engine configuration. I'd have more sympathy for the
> > argument if it was consuming a huge API surface to do this.
> 
> The existence of the full IPsec offload in its entirety is questionable.
> We let the earlier patches in trusting that you'll deliver the
> forwarding support. We're calling "stop" here because when the patches
> from this PR were posted to the list we learned for the first time
> that the forwarding is perhaps less real than expected.

IPsec offload works fine within netdev for non-switch use cases. I
would think that alone is enough to be OK for netdev.

I have no idea how you jump to the conclusion that, because the RDMA
team posted their patches, it says anything about the work Leon and
the netdev team will deliver in the future.

> > > The simplest way forward would be to commit to when mlx5 will
> > > support redirects to xfrm tunnel via tc...  
> > 
> > He needs to fix the bugs he created and found first :)
> > 
> > As far as I'm concerned TC will stay on his list until it is done.
> 
> This is what I get for trusting a vendor :/
> 
> If you can't make a commitment my strong recommendation is for this code
> to not be accepted upstream until TC patches emerge.

This is the strongest commitment I am allowed to make in public.

I honestly have no idea why you are so fixated on TC, or what it has
to do with RDMA.

Hasn't our netdev team done enough work on TC stuff to earn some
faith that we do actually care about TC as part of our portfolio?

Jason


* Re: pull-request: mlx5-next 2023-01-24 V2
  2023-02-08 16:13                                   ` Jason Gunthorpe
@ 2023-02-08 23:19                                     ` Jakub Kicinski
  2023-02-09  0:27                                       ` Jason Gunthorpe
  2023-02-09  0:36                                       ` Saeed Mahameed
  0 siblings, 2 replies; 33+ messages in thread
From: Jakub Kicinski @ 2023-02-08 23:19 UTC (permalink / raw)
  To: Jason Gunthorpe
  Cc: Saeed Mahameed, Leon Romanovsky, David S. Miller, Paolo Abeni,
	Eric Dumazet, Saeed Mahameed, linux-rdma, netdev

On Wed, 8 Feb 2023 12:13:00 -0400 Jason Gunthorpe wrote:
> On Tue, Feb 07, 2023 at 02:03:30PM -0800, Jakub Kicinski wrote:
> > > I also would like to not discuss this :)  
> > 
> > Well, then... Suggest a delineation or a way forward if you don't like
> > mine. The circular conversation + RDMA gets its way has to end sooner
> > or later.  
> 
> I can't accept yours because it means RDMA stops existing. So we must
> continue with what has been done for the last 15 years - RDMA
> (selectively) mirrors the IP and everything running at or below the IP
> header level.

Re-implement bits you need for configuration, not stop existing.

> > > An open source kernel implementation of a private standard for HW that
> > > only one company can purchase that is only usable with a proprietary
> > > userspace. Not exactly what I'd like to see.  
> > 
> > You switched your argument 180 degrees.
> > 
> > First you said:
> > 
> >   What you posted about your goals for netdev is pretty consistent with
> >   the typical approach from a hyperscaler purchasing department: Make it
> >   all the same. Grind the competing vendors on price.
> > 
> > So "Make it all the same". Now you're saying hyperscalers have their
> > own standards.  
> 
> What do you mean? "make it all the same" can be done with private or
> open standards?

Oh. If it's someone's private spec, it's probably irrelevant to the
open source community?

> > > Ah, I stumble across stuff from time to time - KVM and related has
> > > some interesting things. Especially with this new confidential compute
> > > stuff. AMD just tried to get something into their mainline iommu
> > > driver to support their out of tree kernel, for instance.
> > > 
> > > People try to bend the rules all the time.  
> > 
> > AMD is a vendor, tho, you said "trend of large cloud operators pushing
> > things into the kernel". I was curious to hear the hyperscaler example
> > 'cause I'd like to be vigilant.  
> 
> I'm looking at it from the perspective of who owns, operates and
> monetizes the proprietary closed-source kernel fork. It is not AMD.
> 
> AMD/Intel/ARM provided open patches to a hyperscaler(s) for their CC
> solutions that haven't been merged yet. The hyperscaler is the one
> that forked Linux into closed source, integrated them and is operating
> the closed solution.
> 
> That the vendor pushes little parts of the hyperscaler solution to the
> kernel & ecosystem in a trickle doesn't make the sad state of affairs
> exclusively the vendor's fault, even if their name is on the patches,
> IMHO.

Sad situation. Not my employer and not in netdev, I hope.
I may have forgotten already what brought us down this rabbit hole...

> > > The ipsec patches here have almost 0 impact on netdev because it is a
> > tiny steering engine configuration. I'd have more sympathy for the
> > > argument if it was consuming a huge API surface to do this.  
> > 
> > The existence of the full IPsec offload in its entirety is questionable.
> > We let the earlier patches in trusting that you'll deliver the
> > forwarding support. We're calling "stop" here because when the patches
> > from this PR were posted to the list we learned for the first time
> > that the forwarding is perhaps less real than expected.  
> 
> IPsec offload works fine within netdev for non-switch use cases. I
> would think that alone is enough to be OK for netdev.
> 
> I have no idea how you jump to the conclusion that, because the RDMA
> team posted their patches, it says anything about the work Leon and
> the netdev team will deliver in the future.

We shouldn't renege on what was agreed earlier.

> > > He needs to fix the bugs he created and found first :)
> > > 
> > > As far as I'm concerned TC will stay on his list until it is done.  
> > 
> > This is what I get for trusting a vendor :/
> > 
> > If you can't make a commitment my strong recommendation is for this code
> > to not be accepted upstream until TC patches emerge.  
> 
> This is the strongest commitment I am allowed to make in public.

As priorities shift it may never happen.

> I honestly have no idea why you are so fixated on TC, or what it has
> to do with RDMA.

It's a strong justification for having full xfrm offload.
You can't forward without full offload.
Anything else could theoretically be improved on the SW side.
The VF switching offload was the winning argument in the past
discussion.

> Hasn't our netdev team done enough work on TC stuff to earn some
> faith that we do actually care about TC as part of our portfolio?

Shouldn't have brought it up in the past discussion then :|
Being asked to implement something tangential to your goals for 
the community to accept your code is hardly unheard of.


* Re: pull-request: mlx5-next 2023-01-24 V2
  2023-02-08 23:19                                     ` Jakub Kicinski
@ 2023-02-09  0:27                                       ` Jason Gunthorpe
  2023-02-09  0:48                                         ` Jakub Kicinski
  2023-02-09  0:36                                       ` Saeed Mahameed
  1 sibling, 1 reply; 33+ messages in thread
From: Jason Gunthorpe @ 2023-02-09  0:27 UTC (permalink / raw)
  To: Jakub Kicinski
  Cc: Saeed Mahameed, Leon Romanovsky, David S. Miller, Paolo Abeni,
	Eric Dumazet, Saeed Mahameed, linux-rdma, netdev

On Wed, Feb 08, 2023 at 03:19:22PM -0800, Jakub Kicinski wrote:
> On Wed, 8 Feb 2023 12:13:00 -0400 Jason Gunthorpe wrote:
> > On Tue, Feb 07, 2023 at 02:03:30PM -0800, Jakub Kicinski wrote:
> > > > I also would like to not discuss this :)  
> > > 
> > > Well, then... Suggest a delineation or a way forward if you don't like
> > > mine. The circular conversation + RDMA gets its way has to end sooner
> > > or later.  
> > 
> > I can't accept yours because it means RDMA stops existing. So we must
> > continue with what has been done for the last 15 years - RDMA
> > (selectively) mirrors the IP and everything running at or below the IP
> > header level.
> 
> Re-implement bits you need for configuration, not stop existing.

This is completely technically infeasible. They share IP addresses; we
cannot have two stacks running IPSEC on top of the same IP address
without co-ordinating. Almost every part is like that to some degree.

And even if we somehow did keep things 100% separated, with separated
IPs - Linus isn't going to let me copy and paste the huge swaths of
core netdev code required to do IP stuff (arp, nd, routing, icmp,
bonding, etc) into RDMA for a reason like this.

So, it really is a complete death blow to demand to keep these things
separated.

Let alone what would happen if we applied the same logic to all the
places sharing the IP with HW - remember iscsi? FCoE?

> > > So "Make it all the same". Now you're saying hyperscalers have their
> > > own standards.  
> > 
> > What do you mean? "make it all the same" can be done with private or
> > open standards?
> 
> Oh. If it's someone's private spec, it's probably irrelevant to the
> open source community?

No, it's what I said I dislike. Private specs, private HW, private
userspace, proprietary kernel forks, but people still try to get
incomplete pieces of stuff into the mainline kernel.

> Sad situation. Not my employer and not in netdev, I hope.

AFAIK your employer and mine have done a good job together on joint
projects over the years and have managed to end up with open source
user spaces for almost everything substantive in the kernel.

> > I have no idea how you jump to the conclusion that, because the RDMA
> > team posted their patches, it says anything about the work Leon and
> > the netdev team will deliver in the future.
> 
> We shouldn't renege on what was agreed earlier.

Who reneged? We always said we'd do it and we are still saying we plan
to do it.

> > Hasn't our netdev team done enough work on TC stuff to earn some
> > faith that we do actually care about TC as part of our portfolio?
> 
> Shouldn't have brought it up in the past discussion then :|
> Being asked to implement something tangential to your goals for 
> the community to accept your code is hardly unheard of.

We agreed to implement. I'm asking for patience since we have a good
historical track record.

Jason


* Re: pull-request: mlx5-next 2023-01-24 V2
  2023-02-08 23:19                                     ` Jakub Kicinski
  2023-02-09  0:27                                       ` Jason Gunthorpe
@ 2023-02-09  0:36                                       ` Saeed Mahameed
  2023-02-09  0:52                                         ` Jakub Kicinski
  1 sibling, 1 reply; 33+ messages in thread
From: Saeed Mahameed @ 2023-02-09  0:36 UTC (permalink / raw)
  To: Jakub Kicinski
  Cc: Jason Gunthorpe, Leon Romanovsky, David S. Miller, Paolo Abeni,
	Eric Dumazet, Saeed Mahameed, linux-rdma, netdev

On 08 Feb 15:19, Jakub Kicinski wrote:
>On Wed, 8 Feb 2023 12:13:00 -0400 Jason Gunthorpe wrote:
>> On Tue, Feb 07, 2023 at 02:03:30PM -0800, Jakub Kicinski wrote:
>> > > I also would like to not discuss this :)
>> >
>> > Well, then... Suggest a delineation or a way forward if you don't like
>> > mine. The circular conversation + RDMA gets its way has to end sooner
>> > or later.
>>
>> I can't accept yours because it means RDMA stops existing. So we must
>> continue with what has been done for the last 15 years - RDMA
>> (selectively) mirrors the IP and everything running at or below the IP
>> header level.
>
>Re-implement bits you need for configuration, not stop existing.
>

Why? We will end up with the same code as in this PULL plus some
redundant rdma API; please see the explanation below.

>> > > An open source kernel implementation of a private standard for HW that
>> > > only one company can purchase that is only usable with a proprietary
>> > > userspace. Not exactly what I'd like to see.
>> >
>> > You switched your argument 180 degrees.
>> >
>> > First you said:
>> >
>> >   What you posted about your goals for netdev is pretty consistent with
>> >   the typical approach from a hyperscaler purchasing department: Make it
>> >   all the same. Grind the competing vendors on price.
>> >
>> > So "Make it all the same". Now you're saying hyperscalers have their
>> > own standards.
>>
>> What do you mean? "make it all the same" can be done with private or
>> open standards?
>
>Oh. If it's someone's private spec, it's probably irrelevant to the
>open source community?
>
>> > > Ah, I stumble across stuff from time to time - KVM and related has
>> > > some interesting things. Especially with this new confidential compute
>> > > stuff. AMD just tried to get something into their mainline iommu
>> > > driver to support their out of tree kernel, for instance.
>> > >
>> > > People try to bend the rules all the time.
>> >
>> > AMD is a vendor, tho, you said "trend of large cloud operators pushing
>> > things into the kernel". I was curious to hear the hyperscaler example
>> > 'cause I'd like to be vigilant.
>>
>> I'm looking at it from the perspective of who owns, operates and
>> monetizes the proprietary closed-source kernel fork. It is not AMD.
>>
>> AMD/Intel/ARM provided open patches to a hyperscaler(s) for their CC
>> solutions that haven't been merged yet. The hyperscaler is the one
>> that forked Linux into closed source, integrated them and is operating
>> the closed solution.
>>
>> That the vendor pushes little parts of the hyperscaler solution to the
>> kernel & ecosystem in a trickle doesn't make the sad state of affairs
>> exclusively the vendor's fault, even if their name is on the patches,
>> IMHO.
>
>Sad situation. Not my employer and not in netdev, I hope.
>I may have forgotten already what brought us down this rabbit hole...
>
>> > > The ipsec patches here have almost 0 impact on netdev because it is a
>> > tiny steering engine configuration. I'd have more sympathy for the
>> > > argument if it was consuming a huge API surface to do this.
>> >
>> > The existence of the full IPsec offload in its entirety is questionable.
>> > We let the earlier patches in trusting that you'll deliver the
>> > forwarding support. We're calling "stop" here because when the patches
>> > from this PR were posted to the list we learned for the first time
>> > that the forwarding is perhaps less real than expected.
>>
>> IPsec offload works fine within netdev for non-switch use cases. I
>> would think that alone is enough to be OK for netdev.
>>
>> I have no idea how you jump to the conclusion that, because the RDMA
>> team posted their patches, it says anything about the work Leon and
>> the netdev team will deliver in the future.
>
>We shouldn't renege on what was agreed earlier.
>
>> > > He needs to fix the bugs he created and found first :)
>> > >
>> > > As far as I'm concerned TC will stay on his list until it is done.
>> >
>> > This is what I get for trusting a vendor :/
>> >
>> > If you can't make a commitment my strong recommendation is for this code
>> > to not be accepted upstream until TC patches emerge.
>>
>> This is the strongest commitment I am allowed to make in public.
>
>As priorities shift it may never happen.
>
>> I honestly have no idea why you are so fixated on TC, or what it has
>> to do with RDMA.
>
>It's a strong justification for having full xfrm offload.
>You can't forward without full offload.

This pull has nothing to do with "full" xfrm offload. For RoCE to
exist it has to rely on netdev attributes such as IP, vlan, mac,
etc. In this series we do the same for ipsec: we set up the steering
pipeline with the proper attributes for RoCE to function.
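
For illustration, this is the sharing I mean - RoCEv2 GID entries are
derived directly from the netdev's IP/vlan/mac state, so one
configuration drives both stacks (hypothetical device names, just a
sketch):

  # adding an address on the netdev ...
  ip addr add 192.0.2.1/24 dev eth0
  # ... is reflected on the RDMA device bound to that netdev
  rdma link show
  grep -r . /sys/class/infiniband/mlx5_0/ports/1/gids/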

I don't see how it would be reasonable for the rdma user to set up
these attributes twice, once via the netdev API and once via rdma
APIs; that would be torture for the user, just because rdma bits are
not allowed in netdev. It's exactly that: some rdma/roce bits of
purely mlx5_core logic, which have to be in mlx5_core due to the
sharing of hardware resources between rdma and netdev.

>Anything else could theoretically be improved on the SW side.
>The VF switching offload was the winning argument in the past
>discussion.
>
>> Hasn't our netdev team done enough work on TC stuff to earn some
>> faith that we do actually care about TC as part of our portfolio?
>
>Shouldn't have brought it up in the past discussion then :|
>Being asked to implement something tangential to your goals for
>the community to accept your code is hardly unheard of.


* Re: pull-request: mlx5-next 2023-01-24 V2
  2023-02-09  0:27                                       ` Jason Gunthorpe
@ 2023-02-09  0:48                                         ` Jakub Kicinski
  2023-02-09  0:59                                           ` Jason Gunthorpe
  0 siblings, 1 reply; 33+ messages in thread
From: Jakub Kicinski @ 2023-02-09  0:48 UTC (permalink / raw)
  To: Jason Gunthorpe
  Cc: Saeed Mahameed, Leon Romanovsky, David S. Miller, Paolo Abeni,
	Eric Dumazet, Saeed Mahameed, linux-rdma, netdev

On Wed, 8 Feb 2023 20:27:17 -0400 Jason Gunthorpe wrote:
> On Wed, Feb 08, 2023 at 03:19:22PM -0800, Jakub Kicinski wrote:
> > On Wed, 8 Feb 2023 12:13:00 -0400 Jason Gunthorpe wrote:  
> > > I can't accept yours because it means RDMA stops existing. So we must
> > > continue with what has been done for the last 15 years - RDMA
> > > (selectively) mirrors the IP and everything running at or below the IP
> > > header level.  
> > 
> > Re-implement bits you need for configuration, not stop existing.  
> 
> This is completely technically infeasible. They share IP addresses; we
> cannot have two stacks running IPSEC on top of the same IP address
> without co-ordinating. Almost every part is like that to some degree.
> 
> And even if we somehow did keep things 100% separated, with separated
> IPs - Linus isn't going to let me copy and paste the huge swaths of
> core netdev code required to do IP stuff (arp, nd, routing, icmp,
> bonding, etc) into RDMA for a reason like this.
> 
> So, it really is a complete death blow to demand to keep these things
> separated.
> 
> Let alone what would happen if we applied the same logic to all the
> places sharing the IP with HW - remember iscsi? FCoE?

Who said IP configuration.

> > > What do you mean? "make it all the same" can be done with private or
> > > open standards?  
> > 
> > Oh. If it's someone's private spec, it's probably irrelevant to the
> > open source community?
> 
> No, it's what I said I dislike. Private specs, private HW, private
> userspace, proprietary kernel forks, but people still try to get
> incomplete pieces of stuff into the mainline kernel.
> 
> > Sad situation. Not my employer and not in netdev, I hope.  
> 
> AFAIK your employer and mine have done a good job together on joint
> projects over the years and have managed to end up with open source
> user spaces for almost everything substantive in the kernel.

Great. Let's make a note of that so there are no more accusations
that my objectives for netdev are somehow driven by evil hyperscalers.

> > > I have no idea how you jump to the conclusion that, because the RDMA
> > > team posted their patches, it says anything about the work Leon and
> > > the netdev team will deliver in the future.
> > 
> > We shouldn't renege on what was agreed earlier.
> 
> Who reneged? We always said we'd do it and we are still saying we plan
> to do it.
> 
> > > Hasn't our netdev team done enough work on TC stuff to earn some
> > > faith that we do actually care about TC as part of our portfolio?  
> > 
> > Shouldn't have brought it up in the past discussion then :|
> > Being asked to implement something tangential to your goals for 
> > the community to accept your code is hardly unheard of.  
> 
> We agreed to implement. I'm asking for patience since we have a good
> historical track record.

If you can't make a strong commitment, what's the point in time at
which, if I were angry that the tc redirect had still not been posted,
you'd consider it understandable?
Perhaps that's sufficiently non-legally-binding? :)


* Re: pull-request: mlx5-next 2023-01-24 V2
  2023-02-09  0:36                                       ` Saeed Mahameed
@ 2023-02-09  0:52                                         ` Jakub Kicinski
  0 siblings, 0 replies; 33+ messages in thread
From: Jakub Kicinski @ 2023-02-09  0:52 UTC (permalink / raw)
  To: Saeed Mahameed
  Cc: Jason Gunthorpe, Leon Romanovsky, David S. Miller, Paolo Abeni,
	Eric Dumazet, Saeed Mahameed, linux-rdma, netdev

On Wed, 8 Feb 2023 16:36:18 -0800 Saeed Mahameed wrote:
>>> I honestly have no idea why you are so fixated on TC, or what it has
>>> to do with RDMA.  
>>
>> It's a strong justification for having full xfrm offload.
>> You can't forward without full offload.  
> 
> This pull has nothing to do with "full" xfrm offload. For RoCE to
> exist it has to rely on netdev attributes such as IP, vlan, mac,
> etc. In this series we do the same for ipsec: we set up the steering
> pipeline with the proper attributes for RoCE to function.

I think I already admitted that the exact patches in the PR are of
secondary importance.

> I don't see how it would be reasonable for the rdma user to set up
> these attributes twice, once via the netdev API and once via rdma
> APIs; that would be torture for the user, just because rdma bits are
> not allowed in netdev. It's exactly that: some rdma/roce bits of
> purely mlx5_core logic, which have to be in mlx5_core due to the
> sharing of hardware resources between rdma and netdev.

That's very understandable because, for you as the upstream maintainer
of mlx5, both sides of the equation (netdev and rdma) are "your users".
Whether we need to be concerned about their comfort is much less
obvious to netdev maintainers.


* Re: pull-request: mlx5-next 2023-01-24 V2
  2023-02-09  0:48                                         ` Jakub Kicinski
@ 2023-02-09  0:59                                           ` Jason Gunthorpe
  2023-02-09  1:16                                             ` Jakub Kicinski
  0 siblings, 1 reply; 33+ messages in thread
From: Jason Gunthorpe @ 2023-02-09  0:59 UTC (permalink / raw)
  To: Jakub Kicinski
  Cc: Saeed Mahameed, Leon Romanovsky, David S. Miller, Paolo Abeni,
	Eric Dumazet, Saeed Mahameed, linux-rdma, netdev

On Wed, Feb 08, 2023 at 04:48:07PM -0800, Jakub Kicinski wrote:
> On Wed, 8 Feb 2023 20:27:17 -0400 Jason Gunthorpe wrote:
> > On Wed, Feb 08, 2023 at 03:19:22PM -0800, Jakub Kicinski wrote:
> > > On Wed, 8 Feb 2023 12:13:00 -0400 Jason Gunthorpe wrote:  
> > > > I can't accept yours because it means RDMA stops existing. So we must
> > > > continue with what has been done for the last 15 years - RDMA
> > > > (selectively) mirrors the IP and everything running at or below the IP
> > > > header level.  
> > > 
> > > Re-implement bits you need for configuration, not stop existing.  
> > 
> > This is completely technically infeasible. They share IP addresses; we
> > cannot have two stacks running IPSEC on top of the same IP address
> > without co-ordinating. Almost every part is like that to some degree.
> > 
> > And even if we somehow did keep things 100% separated, with separated
> > IPs - Linus isn't going to let me copy and paste the huge swaths of
> > core netdev code required to do IP stuff (arp, nd, routing, icmp,
> > bonding, etc) into RDMA for a reason like this.
> > 
> > So, it really is a complete death blow to demand to keep these things
> > separated.
> > 
> > Let alone what would happen if we applied the same logic to all the
> > places sharing the IP with HW - remember iscsi? FCoE?
> 
> Who said IP configuration.

Please explain to me your vision of how we could do IPSEC in rdma and
continue to use an IP address owned by netdev while netdev is also
running IPSEC on the same IP address for netdev traffic.

I can't see how it is even technically possible.

Tell me how the NIC knows, on a packet by packet basis, if the IPSEC
or IKE packet should be delivered to netdev or to RDMA.

Jason


* Re: pull-request: mlx5-next 2023-01-24 V2
  2023-02-09  0:59                                           ` Jason Gunthorpe
@ 2023-02-09  1:16                                             ` Jakub Kicinski
  2023-02-10 17:15                                               ` Jason Gunthorpe
  0 siblings, 1 reply; 33+ messages in thread
From: Jakub Kicinski @ 2023-02-09  1:16 UTC (permalink / raw)
  To: Jason Gunthorpe
  Cc: Saeed Mahameed, Leon Romanovsky, David S. Miller, Paolo Abeni,
	Eric Dumazet, Saeed Mahameed, linux-rdma, netdev

On Wed, 8 Feb 2023 20:59:59 -0400 Jason Gunthorpe wrote:
> > Who said IP configuration.  
> 
> Please explain to me your vision of how we could do IPSEC in rdma and
> continue to use an IP address owned by netdev while netdev is also
> running IPSEC on the same IP address for netdev traffic.

I'm no expert on IPsec but AFAIK it doesn't treat the entire endpoint
as a single unit.

> I can't see how it is even technically possible.
> 
> Tell me how the NIC knows, on a packet by packet basis, if the IPSEC
> or IKE packet should be delivered to netdev or to RDMA.

Just a forwarding problem. Whether the NIC matches on a UDP port or on
ESP+SPI programmed via some random API is a detail.
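
To illustrate (a made-up sketch, not a design): IKE and NAT-T ride on
well-known UDP ports and raw ESP is IP protocol 50, all of which
ordinary steering rules can classify on; the RDMA side of such a split
would be a device-internal steering rule rather than anything tc can
express today:

  # keep IKE and NAT-T traffic on the host stack side
  tc filter add dev eth0 ingress protocol ip flower ip_proto udp \
      dst_port 500 action pass
  tc filter add dev eth0 ingress protocol ip flower ip_proto udp \
      dst_port 4500 action pass
  # raw ESP (IP proto 50) would then be dispatched by SPI in the
  # device's own steering tables; there is no upstream tc syntax for it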


Could you please go back to answering the question of how we deliver
on the compromise that was established to merge the full xfrm offload?

There's only so much time I can spend circling the subject.


* Re: pull-request: mlx5-next 2023-01-24 V2
  2023-02-09  1:16                                             ` Jakub Kicinski
@ 2023-02-10 17:15                                               ` Jason Gunthorpe
  0 siblings, 0 replies; 33+ messages in thread
From: Jason Gunthorpe @ 2023-02-10 17:15 UTC (permalink / raw)
  To: Jakub Kicinski
  Cc: Saeed Mahameed, Leon Romanovsky, David S. Miller, Paolo Abeni,
	Eric Dumazet, Saeed Mahameed, linux-rdma, netdev

On Wed, Feb 08, 2023 at 05:16:46PM -0800, Jakub Kicinski wrote:
> On Wed, 8 Feb 2023 20:59:59 -0400 Jason Gunthorpe wrote:
> > > Who said IP configuration.  
> > 
> > Please explain to me your vision of how we could do IPSEC in rdma and
> > continue to use an IP address owned by netdev while netdev is also
> > running IPSEC on the same IP address for netdev traffic.
> 
> I'm no expert on IPsec but AFAIK it doesn't treat the entire endpoint
> as a single unit.

It does: the SA numbers (the SPIs carried in the ESP header) have to
be globally allocated.
 
> Could you please go back to answering the question of how we deliver
> on the compromise that was established to merge the full xfrm offload?

I've said repeatedly it is in our plans, we have people working on it,
and I'm not allowed to commit to specific dates in public.

Jason


end of thread

Thread overview: 33+ messages
2023-01-26 23:08 pull-request: mlx5-next 2023-01-24 V2 Saeed Mahameed
2023-02-02  7:46 ` Leon Romanovsky
2023-02-02 17:13   ` Jakub Kicinski
2023-02-02 17:14     ` Jason Gunthorpe
2023-02-02 17:25       ` Jakub Kicinski
2023-02-02 17:44         ` Jason Gunthorpe
2023-02-02 17:54           ` Jakub Kicinski
2023-02-02 18:03             ` Leon Romanovsky
2023-02-02 18:15               ` Saeed Mahameed
2023-02-02 18:30                 ` Jakub Kicinski
2023-02-03 20:05                   ` Saeed Mahameed
2023-02-03 21:14                     ` Jakub Kicinski
2023-02-04  0:18                       ` Jason Gunthorpe
2023-02-04  1:45                         ` Jakub Kicinski
2023-02-06 14:58                           ` Jason Gunthorpe
2023-02-07  0:38                             ` Jakub Kicinski
2023-02-07 19:52                               ` Jason Gunthorpe
2023-02-07 22:03                                 ` Jakub Kicinski
2023-02-08  9:17                                   ` Leon Romanovsky
2023-02-08 16:13                                   ` Jason Gunthorpe
2023-02-08 23:19                                     ` Jakub Kicinski
2023-02-09  0:27                                       ` Jason Gunthorpe
2023-02-09  0:48                                         ` Jakub Kicinski
2023-02-09  0:59                                           ` Jason Gunthorpe
2023-02-09  1:16                                             ` Jakub Kicinski
2023-02-10 17:15                                               ` Jason Gunthorpe
2023-02-09  0:36                                       ` Saeed Mahameed
2023-02-09  0:52                                         ` Jakub Kicinski
2023-02-04  0:47                       ` Saeed Mahameed
2023-02-04  1:57                         ` Jakub Kicinski
2023-02-05 10:26                           ` Leon Romanovsky
2023-02-02 18:07       ` Leon Romanovsky
2023-02-03 20:14 ` Saeed Mahameed
