* [PATCH net-next 01/16] net/mlx5: E-Switch, Add operational mode to the SRIOV e-Switch
2016-06-27 16:07 [PATCH net-next 00/16] Mellanox 100G SRIOV E-Switch offload and VF representors Saeed Mahameed
@ 2016-06-27 16:07 ` Saeed Mahameed
2016-06-27 16:07 ` [PATCH net-next 02/16] net/mlx5: E-Switch, Add support for the sriov offloads mode Saeed Mahameed
` (14 subsequent siblings)
15 siblings, 0 replies; 47+ messages in thread
From: Saeed Mahameed @ 2016-06-27 16:07 UTC (permalink / raw)
To: David S. Miller
Cc: netdev, Or Gerlitz, Hadar Hen-Zion, Jiri Pirko, Andy Gospodarek,
Jesse Brandeburg, John Fastabend, Saeed Mahameed
From: Or Gerlitz <ogerlitz@mellanox.com>
Define three modes for the SRIOV e-switch operation, none (SRIOV_NONE,
none of the VF vports are enabled), legacy (SRIOV_LEGACY, the current mode)
and sriov offloads (SRIOV_OFFLOADS). Currently, when in SRIOV, only the
legacy mode is supported, where steering rules are of the form:
destination mac --> VF vport
This patch does not change any functionality.
Signed-off-by: Or Gerlitz <ogerlitz@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
drivers/net/ethernet/mellanox/mlx5/core/eswitch.c | 51 +++++++++++++----------
drivers/net/ethernet/mellanox/mlx5/core/eswitch.h | 19 +++++++--
drivers/net/ethernet/mellanox/mlx5/core/sriov.c | 5 ++-
3 files changed, 46 insertions(+), 29 deletions(-)
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c
index aebbd6c..8068dde 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c
@@ -428,7 +428,7 @@ esw_fdb_set_vport_promisc_rule(struct mlx5_eswitch *esw, u32 vport)
return __esw_fdb_set_vport_rule(esw, vport, true, mac_c, mac_v);
}
-static int esw_create_fdb_table(struct mlx5_eswitch *esw, int nvports)
+static int esw_create_legacy_fdb_table(struct mlx5_eswitch *esw, int nvports)
{
int inlen = MLX5_ST_SZ_BYTES(create_flow_group_in);
struct mlx5_core_dev *dev = esw->dev;
@@ -479,7 +479,7 @@ static int esw_create_fdb_table(struct mlx5_eswitch *esw, int nvports)
esw_warn(dev, "Failed to create flow group err(%d)\n", err);
goto out;
}
- esw->fdb_table.addr_grp = g;
+ esw->fdb_table.legacy.addr_grp = g;
/* Allmulti group : One rule that forwards any mcast traffic */
MLX5_SET(create_flow_group_in, flow_group_in, match_criteria_enable,
@@ -494,7 +494,7 @@ static int esw_create_fdb_table(struct mlx5_eswitch *esw, int nvports)
esw_warn(dev, "Failed to create allmulti flow group err(%d)\n", err);
goto out;
}
- esw->fdb_table.allmulti_grp = g;
+ esw->fdb_table.legacy.allmulti_grp = g;
/* Promiscuous group :
* One rule that forward all unmatched traffic from previous groups
@@ -511,17 +511,17 @@ static int esw_create_fdb_table(struct mlx5_eswitch *esw, int nvports)
esw_warn(dev, "Failed to create promisc flow group err(%d)\n", err);
goto out;
}
- esw->fdb_table.promisc_grp = g;
+ esw->fdb_table.legacy.promisc_grp = g;
out:
if (err) {
- if (!IS_ERR_OR_NULL(esw->fdb_table.allmulti_grp)) {
- mlx5_destroy_flow_group(esw->fdb_table.allmulti_grp);
- esw->fdb_table.allmulti_grp = NULL;
+ if (!IS_ERR_OR_NULL(esw->fdb_table.legacy.allmulti_grp)) {
+ mlx5_destroy_flow_group(esw->fdb_table.legacy.allmulti_grp);
+ esw->fdb_table.legacy.allmulti_grp = NULL;
}
- if (!IS_ERR_OR_NULL(esw->fdb_table.addr_grp)) {
- mlx5_destroy_flow_group(esw->fdb_table.addr_grp);
- esw->fdb_table.addr_grp = NULL;
+ if (!IS_ERR_OR_NULL(esw->fdb_table.legacy.addr_grp)) {
+ mlx5_destroy_flow_group(esw->fdb_table.legacy.addr_grp);
+ esw->fdb_table.legacy.addr_grp = NULL;
}
if (!IS_ERR_OR_NULL(esw->fdb_table.fdb)) {
mlx5_destroy_flow_table(esw->fdb_table.fdb);
@@ -533,20 +533,20 @@ out:
return err;
}
-static void esw_destroy_fdb_table(struct mlx5_eswitch *esw)
+static void esw_destroy_legacy_fdb_table(struct mlx5_eswitch *esw)
{
if (!esw->fdb_table.fdb)
return;
esw_debug(esw->dev, "Destroy FDB Table\n");
- mlx5_destroy_flow_group(esw->fdb_table.promisc_grp);
- mlx5_destroy_flow_group(esw->fdb_table.allmulti_grp);
- mlx5_destroy_flow_group(esw->fdb_table.addr_grp);
+ mlx5_destroy_flow_group(esw->fdb_table.legacy.promisc_grp);
+ mlx5_destroy_flow_group(esw->fdb_table.legacy.allmulti_grp);
+ mlx5_destroy_flow_group(esw->fdb_table.legacy.addr_grp);
mlx5_destroy_flow_table(esw->fdb_table.fdb);
esw->fdb_table.fdb = NULL;
- esw->fdb_table.addr_grp = NULL;
- esw->fdb_table.allmulti_grp = NULL;
- esw->fdb_table.promisc_grp = NULL;
+ esw->fdb_table.legacy.addr_grp = NULL;
+ esw->fdb_table.legacy.allmulti_grp = NULL;
+ esw->fdb_table.legacy.promisc_grp = NULL;
}
/* E-Switch vport UC/MC lists management */
@@ -1540,7 +1540,7 @@ static void esw_disable_vport(struct mlx5_eswitch *esw, int vport_num)
}
/* Public E-Switch API */
-int mlx5_eswitch_enable_sriov(struct mlx5_eswitch *esw, int nvfs)
+int mlx5_eswitch_enable_sriov(struct mlx5_eswitch *esw, int nvfs, int mode)
{
int err;
int i;
@@ -1561,11 +1561,14 @@ int mlx5_eswitch_enable_sriov(struct mlx5_eswitch *esw, int nvfs)
if (!MLX5_CAP_ESW_EGRESS_ACL(esw->dev, ft_support))
esw_warn(esw->dev, "E-Switch engress ACL is not supported by FW\n");
- esw_info(esw->dev, "E-Switch enable SRIOV: nvfs(%d)\n", nvfs);
+ esw_info(esw->dev, "E-Switch enable SRIOV: nvfs(%d) mode (%d)\n", nvfs, mode);
+ if (mode != SRIOV_LEGACY)
+ return -EINVAL;
+ esw->mode = mode;
esw_disable_vport(esw, 0);
- err = esw_create_fdb_table(esw, nvfs + 1);
+ err = esw_create_legacy_fdb_table(esw, nvfs + 1);
if (err)
goto abort;
@@ -1590,8 +1593,8 @@ void mlx5_eswitch_disable_sriov(struct mlx5_eswitch *esw)
MLX5_CAP_GEN(esw->dev, port_type) != MLX5_CAP_PORT_TYPE_ETH)
return;
- esw_info(esw->dev, "disable SRIOV: active vports(%d)\n",
- esw->enabled_vports);
+ esw_info(esw->dev, "disable SRIOV: active vports(%d) mode(%d)\n",
+ esw->enabled_vports, esw->mode);
mc_promisc = esw->mc_promisc;
@@ -1601,8 +1604,9 @@ void mlx5_eswitch_disable_sriov(struct mlx5_eswitch *esw)
if (mc_promisc && mc_promisc->uplink_rule)
mlx5_del_flow_rule(mc_promisc->uplink_rule);
- esw_destroy_fdb_table(esw);
+ esw_destroy_legacy_fdb_table(esw);
+ esw->mode = SRIOV_NONE;
/* VPORT 0 (PF) must be enabled back with non-sriov configuration */
esw_enable_vport(esw, 0, UC_ADDR_CHANGE);
}
@@ -1673,6 +1677,7 @@ int mlx5_eswitch_init(struct mlx5_core_dev *dev)
esw->total_vports = total_vports;
esw->enabled_vports = 0;
+ esw->mode = SRIOV_NONE;
dev->priv.eswitch = esw;
esw_enable_vport(esw, 0, UC_ADDR_CHANGE);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h
index fd68002..544fbfe 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h
@@ -134,9 +134,19 @@ struct mlx5_l2_table {
struct mlx5_eswitch_fdb {
void *fdb;
- struct mlx5_flow_group *addr_grp;
- struct mlx5_flow_group *allmulti_grp;
- struct mlx5_flow_group *promisc_grp;
+ union {
+ struct legacy_fdb {
+ struct mlx5_flow_group *addr_grp;
+ struct mlx5_flow_group *allmulti_grp;
+ struct mlx5_flow_group *promisc_grp;
+ } legacy;
+ };
+};
+
+enum {
+ SRIOV_NONE,
+ SRIOV_LEGACY,
+ SRIOV_OFFLOADS
};
struct mlx5_eswitch {
@@ -153,13 +163,14 @@ struct mlx5_eswitch {
*/
struct mutex state_lock;
struct esw_mc_addr *mc_promisc;
+ int mode;
};
/* E-Switch API */
int mlx5_eswitch_init(struct mlx5_core_dev *dev);
void mlx5_eswitch_cleanup(struct mlx5_eswitch *esw);
void mlx5_eswitch_vport_event(struct mlx5_eswitch *esw, struct mlx5_eqe *eqe);
-int mlx5_eswitch_enable_sriov(struct mlx5_eswitch *esw, int nvfs);
+int mlx5_eswitch_enable_sriov(struct mlx5_eswitch *esw, int nvfs, int mode);
void mlx5_eswitch_disable_sriov(struct mlx5_eswitch *esw);
int mlx5_eswitch_set_vport_mac(struct mlx5_eswitch *esw,
int vport, u8 mac[ETH_ALEN]);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/sriov.c b/drivers/net/ethernet/mellanox/mlx5/core/sriov.c
index d6a3f41..b380a6b 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/sriov.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/sriov.c
@@ -167,7 +167,7 @@ int mlx5_core_sriov_configure(struct pci_dev *pdev, int num_vfs)
mlx5_core_init_vfs(dev, num_vfs);
#ifdef CONFIG_MLX5_CORE_EN
- mlx5_eswitch_enable_sriov(dev->priv.eswitch, num_vfs);
+ mlx5_eswitch_enable_sriov(dev->priv.eswitch, num_vfs, SRIOV_LEGACY);
#endif
return num_vfs;
@@ -209,7 +209,8 @@ int mlx5_sriov_init(struct mlx5_core_dev *dev)
mlx5_core_init_vfs(dev, cur_vfs);
#ifdef CONFIG_MLX5_CORE_EN
if (cur_vfs)
- mlx5_eswitch_enable_sriov(dev->priv.eswitch, cur_vfs);
+ mlx5_eswitch_enable_sriov(dev->priv.eswitch, cur_vfs,
+ SRIOV_LEGACY);
#endif
enable_vfs(dev, cur_vfs);
--
2.8.0
^ permalink raw reply related [flat|nested] 47+ messages in thread
* [PATCH net-next 02/16] net/mlx5: E-Switch, Add support for the sriov offloads mode
2016-06-27 16:07 [PATCH net-next 00/16] Mellanox 100G SRIOV E-Switch offload and VF representors Saeed Mahameed
2016-06-27 16:07 ` [PATCH net-next 01/16] net/mlx5: E-Switch, Add operational mode to the SRIOV e-Switch Saeed Mahameed
@ 2016-06-27 16:07 ` Saeed Mahameed
2016-06-27 16:07 ` [PATCH net-next 03/16] net/mlx5: E-Switch, Add miss rule for " Saeed Mahameed
` (13 subsequent siblings)
15 siblings, 0 replies; 47+ messages in thread
From: Saeed Mahameed @ 2016-06-27 16:07 UTC (permalink / raw)
To: David S. Miller
Cc: netdev, Or Gerlitz, Hadar Hen-Zion, Jiri Pirko, Andy Gospodarek,
Jesse Brandeburg, John Fastabend, Saeed Mahameed
From: Or Gerlitz <ogerlitz@mellanox.com>
Unlike the legacy mode, here, forwarding rules are not learned by the
driver per events on macs set by VFs/VMs into their vports, but rather
should be programmed by higher-level SW entities.
That said, in the offloads mode (SRIOV_OFFLOADS), two flow
groups are created by the driver for management (slow path) purposes:
The first group will be used for sending packets over e-switch vports
from the host OS where the e-switch management code runs, to be
received by VFs.
The second group will be used by a miss rule which forwards packets toward
the e-switch manager. Further logic will trap these packets such that
the receiving net-device as seen by the networking stack is the representor
of the vport that sent the packet over the e-switch data-path.
Signed-off-by: Or Gerlitz <ogerlitz@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
drivers/net/ethernet/mellanox/mlx5/core/Makefile | 2 +-
drivers/net/ethernet/mellanox/mlx5/core/eswitch.c | 35 +++---
drivers/net/ethernet/mellanox/mlx5/core/eswitch.h | 16 +++
.../ethernet/mellanox/mlx5/core/eswitch_offloads.c | 135 +++++++++++++++++++++
4 files changed, 168 insertions(+), 20 deletions(-)
create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/Makefile b/drivers/net/ethernet/mellanox/mlx5/core/Makefile
index c4f450f..96f1826 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/Makefile
+++ b/drivers/net/ethernet/mellanox/mlx5/core/Makefile
@@ -5,7 +5,7 @@ mlx5_core-y := main.o cmd.o debugfs.o fw.o eq.o uar.o pagealloc.o \
mad.o transobj.o vport.o sriov.o fs_cmd.o fs_core.o \
fs_counters.o rl.o
-mlx5_core-$(CONFIG_MLX5_CORE_EN) += wq.o eswitch.o \
+mlx5_core-$(CONFIG_MLX5_CORE_EN) += wq.o eswitch.o eswitch_offloads.o \
en_main.o en_fs.o en_ethtool.o en_tx.o en_rx.o \
en_rx_am.o en_txrx.o en_clock.o vxlan.o en_tc.o \
en_arfs.o
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c
index 8068dde..1fc4cfd 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c
@@ -40,17 +40,6 @@
#define UPLINK_VPORT 0xFFFF
-#define MLX5_DEBUG_ESWITCH_MASK BIT(3)
-
-#define esw_info(dev, format, ...) \
- pr_info("(%s): E-Switch: " format, (dev)->priv.name, ##__VA_ARGS__)
-
-#define esw_warn(dev, format, ...) \
- pr_warn("(%s): E-Switch: " format, (dev)->priv.name, ##__VA_ARGS__)
-
-#define esw_debug(dev, format, ...) \
- mlx5_core_dbg_mask(dev, MLX5_DEBUG_ESWITCH_MASK, format, ##__VA_ARGS__)
-
enum {
MLX5_ACTION_NONE = 0,
MLX5_ACTION_ADD = 1,
@@ -92,6 +81,9 @@ enum {
MC_ADDR_CHANGE | \
PROMISC_CHANGE)
+int esw_create_offloads_fdb_table(struct mlx5_eswitch *esw, int nvports);
+void esw_destroy_offloads_fdb_table(struct mlx5_eswitch *esw);
+
static int arm_vport_context_events_cmd(struct mlx5_core_dev *dev, u16 vport,
u32 events_mask)
{
@@ -578,7 +570,8 @@ static int esw_add_uc_addr(struct mlx5_eswitch *esw, struct vport_addr *vaddr)
if (err)
goto abort;
- if (esw->fdb_table.fdb) /* SRIOV is enabled: Forward UC MAC to vport */
+ /* SRIOV is enabled: Forward UC MAC to vport */
+ if (esw->fdb_table.fdb && esw->mode == SRIOV_LEGACY)
vaddr->flow_rule = esw_fdb_set_vport_rule(esw, mac, vport);
esw_debug(esw->dev, "\tADDED UC MAC: vport[%d] %pM index:%d fr(%p)\n",
@@ -1543,7 +1536,7 @@ static void esw_disable_vport(struct mlx5_eswitch *esw, int vport_num)
int mlx5_eswitch_enable_sriov(struct mlx5_eswitch *esw, int nvfs, int mode)
{
int err;
- int i;
+ int i, enabled_events;
if (!esw || !MLX5_CAP_GEN(esw->dev, vport_group_manager) ||
MLX5_CAP_GEN(esw->dev, port_type) != MLX5_CAP_PORT_TYPE_ETH)
@@ -1562,18 +1555,19 @@ int mlx5_eswitch_enable_sriov(struct mlx5_eswitch *esw, int nvfs, int mode)
esw_warn(esw->dev, "E-Switch engress ACL is not supported by FW\n");
esw_info(esw->dev, "E-Switch enable SRIOV: nvfs(%d) mode (%d)\n", nvfs, mode);
- if (mode != SRIOV_LEGACY)
- return -EINVAL;
-
esw->mode = mode;
esw_disable_vport(esw, 0);
- err = esw_create_legacy_fdb_table(esw, nvfs + 1);
+ if (mode == SRIOV_LEGACY)
+ err = esw_create_legacy_fdb_table(esw, nvfs + 1);
+ else
+ err = esw_create_offloads_fdb_table(esw, nvfs + 1);
if (err)
goto abort;
+ enabled_events = (mode == SRIOV_LEGACY) ? SRIOV_VPORT_EVENTS : UC_ADDR_CHANGE;
for (i = 0; i <= nvfs; i++)
- esw_enable_vport(esw, i, SRIOV_VPORT_EVENTS);
+ esw_enable_vport(esw, i, enabled_events);
esw_info(esw->dev, "SRIOV enabled: active vports(%d)\n",
esw->enabled_vports);
@@ -1604,7 +1598,10 @@ void mlx5_eswitch_disable_sriov(struct mlx5_eswitch *esw)
if (mc_promisc && mc_promisc->uplink_rule)
mlx5_del_flow_rule(mc_promisc->uplink_rule);
- esw_destroy_legacy_fdb_table(esw);
+ if (esw->mode == SRIOV_LEGACY)
+ esw_destroy_legacy_fdb_table(esw);
+ else
+ esw_destroy_offloads_fdb_table(esw);
esw->mode = SRIOV_NONE;
/* VPORT 0 (PF) must be enabled back with non-sriov configuration */
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h
index 544fbfe..2360180 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h
@@ -140,6 +140,11 @@ struct mlx5_eswitch_fdb {
struct mlx5_flow_group *allmulti_grp;
struct mlx5_flow_group *promisc_grp;
} legacy;
+
+ struct offloads_fdb {
+ struct mlx5_flow_group *send_to_vport_grp;
+ struct mlx5_flow_group *miss_grp;
+ } offloads;
};
};
@@ -188,4 +193,15 @@ int mlx5_eswitch_get_vport_stats(struct mlx5_eswitch *esw,
int vport,
struct ifla_vf_stats *vf_stats);
+#define MLX5_DEBUG_ESWITCH_MASK BIT(3)
+
+#define esw_info(dev, format, ...) \
+ pr_info("(%s): E-Switch: " format, (dev)->priv.name, ##__VA_ARGS__)
+
+#define esw_warn(dev, format, ...) \
+ pr_warn("(%s): E-Switch: " format, (dev)->priv.name, ##__VA_ARGS__)
+
+#define esw_debug(dev, format, ...) \
+ mlx5_core_dbg_mask(dev, MLX5_DEBUG_ESWITCH_MASK, format, ##__VA_ARGS__)
+
#endif /* __MLX5_ESWITCH_H__ */
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
new file mode 100644
index 0000000..c6b28df
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
@@ -0,0 +1,135 @@
+/*
+ * Copyright (c) 2016, Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/etherdevice.h>
+#include <linux/mlx5/driver.h>
+#include <linux/mlx5/mlx5_ifc.h>
+#include <linux/mlx5/vport.h>
+#include <linux/mlx5/fs.h>
+#include "mlx5_core.h"
+#include "eswitch.h"
+
+#define MAX_PF_SQ 256
+
+int esw_create_offloads_fdb_table(struct mlx5_eswitch *esw, int nvports)
+{
+ int inlen = MLX5_ST_SZ_BYTES(create_flow_group_in);
+ struct mlx5_core_dev *dev = esw->dev;
+ struct mlx5_flow_namespace *root_ns;
+ struct mlx5_flow_table *fdb = NULL;
+ struct mlx5_flow_group *g;
+ u32 *flow_group_in;
+ void *match_criteria;
+ int table_size, ix, err = 0;
+
+ flow_group_in = mlx5_vzalloc(inlen);
+ if (!flow_group_in)
+ return -ENOMEM;
+
+ root_ns = mlx5_get_flow_namespace(dev, MLX5_FLOW_NAMESPACE_FDB);
+ if (!root_ns) {
+ esw_warn(dev, "Failed to get FDB flow namespace\n");
+ goto ns_err;
+ }
+
+ esw_debug(dev, "Create offloads FDB table, log_max_size(%d)\n",
+ MLX5_CAP_ESW_FLOWTABLE_FDB(dev, log_max_ft_size));
+
+ table_size = nvports + MAX_PF_SQ + 1;
+ fdb = mlx5_create_flow_table(root_ns, 0, table_size, 0);
+ if (IS_ERR(fdb)) {
+ err = PTR_ERR(fdb);
+ esw_warn(dev, "Failed to create FDB Table err %d\n", err);
+ goto fdb_err;
+ }
+ esw->fdb_table.fdb = fdb;
+
+ /* create send-to-vport group */
+ memset(flow_group_in, 0, inlen);
+ MLX5_SET(create_flow_group_in, flow_group_in, match_criteria_enable,
+ MLX5_MATCH_MISC_PARAMETERS);
+
+ match_criteria = MLX5_ADDR_OF(create_flow_group_in, flow_group_in, match_criteria);
+
+ MLX5_SET_TO_ONES(fte_match_param, match_criteria, misc_parameters.source_sqn);
+ MLX5_SET_TO_ONES(fte_match_param, match_criteria, misc_parameters.source_port);
+
+ ix = nvports + MAX_PF_SQ;
+ MLX5_SET(create_flow_group_in, flow_group_in, start_flow_index, 0);
+ MLX5_SET(create_flow_group_in, flow_group_in, end_flow_index, ix - 1);
+
+ g = mlx5_create_flow_group(fdb, flow_group_in);
+ if (IS_ERR(g)) {
+ err = PTR_ERR(g);
+ esw_warn(dev, "Failed to create send-to-vport flow group err(%d)\n", err);
+ goto send_vport_err;
+ }
+ esw->fdb_table.offloads.send_to_vport_grp = g;
+
+ /* create miss group */
+ memset(flow_group_in, 0, inlen);
+ MLX5_SET(create_flow_group_in, flow_group_in, match_criteria_enable, 0);
+
+ MLX5_SET(create_flow_group_in, flow_group_in, start_flow_index, ix);
+ MLX5_SET(create_flow_group_in, flow_group_in, end_flow_index, ix + 1);
+
+ g = mlx5_create_flow_group(fdb, flow_group_in);
+ if (IS_ERR(g)) {
+ err = PTR_ERR(g);
+ esw_warn(dev, "Failed to create miss flow group err(%d)\n", err);
+ goto miss_err;
+ }
+ esw->fdb_table.offloads.miss_grp = g;
+
+ return 0;
+
+miss_err:
+ mlx5_destroy_flow_group(esw->fdb_table.offloads.send_to_vport_grp);
+send_vport_err:
+ mlx5_destroy_flow_table(fdb);
+fdb_err:
+ns_err:
+ kvfree(flow_group_in);
+ return err;
+}
+
+void esw_destroy_offloads_fdb_table(struct mlx5_eswitch *esw)
+{
+ if (!esw->fdb_table.fdb)
+ return;
+
+ esw_debug(esw->dev, "Destroy offloads FDB Table\n");
+ mlx5_destroy_flow_group(esw->fdb_table.offloads.send_to_vport_grp);
+ mlx5_destroy_flow_group(esw->fdb_table.offloads.miss_grp);
+
+ mlx5_destroy_flow_table(esw->fdb_table.fdb);
+}
--
2.8.0
^ permalink raw reply related [flat|nested] 47+ messages in thread
* [PATCH net-next 03/16] net/mlx5: E-Switch, Add miss rule for offloads mode
2016-06-27 16:07 [PATCH net-next 00/16] Mellanox 100G SRIOV E-Switch offload and VF representors Saeed Mahameed
2016-06-27 16:07 ` [PATCH net-next 01/16] net/mlx5: E-Switch, Add operational mode to the SRIOV e-Switch Saeed Mahameed
2016-06-27 16:07 ` [PATCH net-next 02/16] net/mlx5: E-Switch, Add support for the sriov offloads mode Saeed Mahameed
@ 2016-06-27 16:07 ` Saeed Mahameed
2016-06-27 16:53 ` Sergei Shtylyov
2016-06-27 16:07 ` [PATCH net-next 04/16] net/mlx5: E-Switch, Add API to create send-to-vport rules Saeed Mahameed
` (12 subsequent siblings)
15 siblings, 1 reply; 47+ messages in thread
From: Saeed Mahameed @ 2016-06-27 16:07 UTC (permalink / raw)
To: David S. Miller
Cc: netdev, Or Gerlitz, Hadar Hen-Zion, Jiri Pirko, Andy Gospodarek,
Jesse Brandeburg, John Fastabend, Saeed Mahameed
From: Or Gerlitz <ogerlitz@mellanox.com>
In the sriov offloads mode, packets that are not matched by any other
rule should be sent towards the e-switch manager for further processing.
Add such a "miss" rule which matches ANY packet as the last rule in the
e-switch FDB and programs the HW to send the packet to vport 0 where
the e-switch manager runs.
Signed-off-by: Or Gerlitz <ogerlitz@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
drivers/net/ethernet/mellanox/mlx5/core/eswitch.h | 1 +
.../ethernet/mellanox/mlx5/core/eswitch_offloads.c | 42 ++++++++++++++++++++++
2 files changed, 43 insertions(+)
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h
index 2360180..8eed33f 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h
@@ -144,6 +144,7 @@ struct mlx5_eswitch_fdb {
struct offloads_fdb {
struct mlx5_flow_group *send_to_vport_grp;
struct mlx5_flow_group *miss_grp;
+ struct mlx5_flow_rule *miss_rule;
} offloads;
};
};
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
index c6b28df..9310017 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
@@ -38,6 +38,41 @@
#include "mlx5_core.h"
#include "eswitch.h"
+static int esw_add_fdb_miss_rule(struct mlx5_eswitch *esw)
+{
+ struct mlx5_flow_destination dest;
+ struct mlx5_flow_rule *flow_rule = NULL;
+ int match_header = 0;
+ u32 *match_v, *match_c;
+ int err = 0;
+
+ match_v = kzalloc(MLX5_ST_SZ_BYTES(fte_match_param), GFP_KERNEL);
+ match_c = kzalloc(MLX5_ST_SZ_BYTES(fte_match_param), GFP_KERNEL);
+ if (!match_v || !match_c) {
+ esw_warn(esw->dev, "FDB: Failed to alloc match parameters\n");
+ err = -ENOMEM;
+ goto out;
+ }
+
+ dest.type = MLX5_FLOW_DESTINATION_TYPE_VPORT;
+ dest.vport_num = 0;
+
+ flow_rule = mlx5_add_flow_rule(esw->fdb_table.fdb, match_header, match_c,
+ match_v, MLX5_FLOW_CONTEXT_ACTION_FWD_DEST,
+ 0, &dest);
+ if (IS_ERR(flow_rule)) {
+ err = PTR_ERR(flow_rule);
+ esw_warn(esw->dev, "FDB: Failed to add miss flow rule err %d\n", err);
+ goto out;
+ }
+
+ esw->fdb_table.offloads.miss_rule = flow_rule;
+out:
+ kfree(match_v);
+ kfree(match_c);
+ return err;
+}
+
#define MAX_PF_SQ 256
int esw_create_offloads_fdb_table(struct mlx5_eswitch *esw, int nvports)
@@ -110,8 +145,14 @@ int esw_create_offloads_fdb_table(struct mlx5_eswitch *esw, int nvports)
}
esw->fdb_table.offloads.miss_grp = g;
+ err = esw_add_fdb_miss_rule(esw);
+ if (err)
+ goto miss_rule_err;
+
return 0;
+miss_rule_err:
+ mlx5_destroy_flow_group(esw->fdb_table.offloads.miss_grp);
miss_err:
mlx5_destroy_flow_group(esw->fdb_table.offloads.send_to_vport_grp);
send_vport_err:
@@ -128,6 +169,7 @@ void esw_destroy_offloads_fdb_table(struct mlx5_eswitch *esw)
return;
esw_debug(esw->dev, "Destroy offloads FDB Table\n");
+ mlx5_del_flow_rule(esw->fdb_table.offloads.miss_rule);
mlx5_destroy_flow_group(esw->fdb_table.offloads.send_to_vport_grp);
mlx5_destroy_flow_group(esw->fdb_table.offloads.miss_grp);
--
2.8.0
^ permalink raw reply related [flat|nested] 47+ messages in thread
* Re: [PATCH net-next 03/16] net/mlx5: E-Switch, Add miss rule for offloads mode
2016-06-27 16:07 ` [PATCH net-next 03/16] net/mlx5: E-Switch, Add miss rule for " Saeed Mahameed
@ 2016-06-27 16:53 ` Sergei Shtylyov
2016-06-27 20:40 ` Or Gerlitz
0 siblings, 1 reply; 47+ messages in thread
From: Sergei Shtylyov @ 2016-06-27 16:53 UTC (permalink / raw)
To: Saeed Mahameed, David S. Miller
Cc: netdev, Or Gerlitz, Hadar Hen-Zion, Jiri Pirko, Andy Gospodarek,
Jesse Brandeburg, John Fastabend
Hello.
On 06/27/2016 07:07 PM, Saeed Mahameed wrote:
> From: Or Gerlitz <ogerlitz@mellanox.com>
>
> In the sriov offloads mode, packets that are not matched by any other
> rule should be sent towards the e-switch manager for further processing.
>
> Add such "miss" rule which matches ANY packet as the last rule in the
> e-switch FDB and programs the HW to send the packet to vport 0 where
> the e-switch manager runs.
>
> Signed-off-by: Or Gerlitz <ogerlitz@mellanox.com>
> Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
[...]
> diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
> index c6b28df..9310017 100644
> --- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
> +++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
> @@ -38,6 +38,41 @@
> #include "mlx5_core.h"
> #include "eswitch.h"
>
> +static int esw_add_fdb_miss_rule(struct mlx5_eswitch *esw)
> +{
> + struct mlx5_flow_destination dest;
> + struct mlx5_flow_rule *flow_rule = NULL;
> + int match_header = 0;
This variable doesn't appear necessary...
> + u32 *match_v, *match_c;
> + int err = 0;
> +
> + match_v = kzalloc(MLX5_ST_SZ_BYTES(fte_match_param), GFP_KERNEL);
> + match_c = kzalloc(MLX5_ST_SZ_BYTES(fte_match_param), GFP_KERNEL);
> + if (!match_v || !match_c) {
> + esw_warn(esw->dev, "FDB: Failed to alloc match parameters\n");
> + err = -ENOMEM;
> + goto out;
> + }
> +
> + dest.type = MLX5_FLOW_DESTINATION_TYPE_VPORT;
> + dest.vport_num = 0;
> +
> + flow_rule = mlx5_add_flow_rule(esw->fdb_table.fdb, match_header, match_c,
Why not just pass 0 instead of 'match_header'?
> + match_v, MLX5_FLOW_CONTEXT_ACTION_FWD_DEST,
> + 0, &dest);
> + if (IS_ERR(flow_rule)) {
> + err = PTR_ERR(flow_rule);
> + esw_warn(esw->dev, "FDB: Failed to add miss flow rule err %d\n", err);
> + goto out;
> + }
> +
> + esw->fdb_table.offloads.miss_rule = flow_rule;
> +out:
> + kfree(match_v);
> + kfree(match_c);
> + return err;
> +}
> +
> #define MAX_PF_SQ 256
>
> int esw_create_offloads_fdb_table(struct mlx5_eswitch *esw, int nvports)
[...]
MBR, Sergei
^ permalink raw reply [flat|nested] 47+ messages in thread
* Re: [PATCH net-next 03/16] net/mlx5: E-Switch, Add miss rule for offloads mode
2016-06-27 16:53 ` Sergei Shtylyov
@ 2016-06-27 20:40 ` Or Gerlitz
0 siblings, 0 replies; 47+ messages in thread
From: Or Gerlitz @ 2016-06-27 20:40 UTC (permalink / raw)
To: Sergei Shtylyov
Cc: Saeed Mahameed, David S. Miller, Linux Netdev List, Or Gerlitz,
Hadar Hen-Zion, Jiri Pirko, Andy Gospodarek, Jesse Brandeburg,
John Fastabend
On Mon, Jun 27, 2016 at 7:53 PM, Sergei Shtylyov
<sergei.shtylyov@cogentembedded.com> wrote:
>> +static int esw_add_fdb_miss_rule(struct mlx5_eswitch *esw)
>> +{
>> + struct mlx5_flow_destination dest;
>> + struct mlx5_flow_rule *flow_rule = NULL;
>> + int match_header = 0;
>
>
> This variable doesn't appear necessary...
yep
>> + dest.type = MLX5_FLOW_DESTINATION_TYPE_VPORT;
>> + dest.vport_num = 0;
>> +
>> + flow_rule = mlx5_add_flow_rule(esw->fdb_table.fdb, match_header,
>> match_c,
>
>
> Why not just pass 0 instead of 'match_header'?
Correct, will fix that.
^ permalink raw reply [flat|nested] 47+ messages in thread
* [PATCH net-next 04/16] net/mlx5: E-Switch, Add API to create send-to-vport rules
2016-06-27 16:07 [PATCH net-next 00/16] Mellanox 100G SRIOV E-Switch offload and VF representors Saeed Mahameed
` (2 preceding siblings ...)
2016-06-27 16:07 ` [PATCH net-next 03/16] net/mlx5: E-Switch, Add miss rule for " Saeed Mahameed
@ 2016-06-27 16:07 ` Saeed Mahameed
2016-06-27 16:07 ` [PATCH net-next 05/16] net/mlx5: Introduce offloads steering namespace Saeed Mahameed
` (11 subsequent siblings)
15 siblings, 0 replies; 47+ messages in thread
From: Saeed Mahameed @ 2016-06-27 16:07 UTC (permalink / raw)
To: David S. Miller
Cc: netdev, Or Gerlitz, Hadar Hen-Zion, Jiri Pirko, Andy Gospodarek,
Jesse Brandeburg, John Fastabend, Saeed Mahameed
From: Or Gerlitz <ogerlitz@mellanox.com>
Add the API to create send-to-vport e-switch rules of the form
packet meta-data :: send-queue-number == $SQN and source-vport == 0 --> $VPORT
These rules are to be used for a send-to-vport logic which conceptually bypasses
the "normal" steering rules currently present at the e-switch datapath.
Such a rule should apply only for packets that originate in the e-switch manager
vport (0) and are sent for a given SQN which is used by a given VF representor
device, and hence the matching logic.
Signed-off-by: Or Gerlitz <ogerlitz@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
drivers/net/ethernet/mellanox/mlx5/core/eswitch.h | 3 +-
.../ethernet/mellanox/mlx5/core/eswitch_offloads.c | 39 ++++++++++++++++++++++
2 files changed, 41 insertions(+), 1 deletion(-)
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h
index 8eed33f..b7fabd1 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h
@@ -193,6 +193,8 @@ int mlx5_eswitch_get_vport_config(struct mlx5_eswitch *esw,
int mlx5_eswitch_get_vport_stats(struct mlx5_eswitch *esw,
int vport,
struct ifla_vf_stats *vf_stats);
+struct mlx5_flow_rule *
+mlx5_eswitch_add_send_to_vport_rule(struct mlx5_eswitch *esw, int vport, u32 sqn);
#define MLX5_DEBUG_ESWITCH_MASK BIT(3)
@@ -204,5 +206,4 @@ int mlx5_eswitch_get_vport_stats(struct mlx5_eswitch *esw,
#define esw_debug(dev, format, ...) \
mlx5_core_dbg_mask(dev, MLX5_DEBUG_ESWITCH_MASK, format, ##__VA_ARGS__)
-
#endif /* __MLX5_ESWITCH_H__ */
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
index 9310017..a8be43d 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
@@ -38,6 +38,45 @@
#include "mlx5_core.h"
#include "eswitch.h"
+struct mlx5_flow_rule *
+mlx5_eswitch_add_send_to_vport_rule(struct mlx5_eswitch *esw, int vport, u32 sqn)
+{
+ struct mlx5_flow_destination dest;
+ struct mlx5_flow_rule *flow_rule;
+ int match_header = MLX5_MATCH_MISC_PARAMETERS;
+ u32 *match_v, *match_c;
+ void *misc;
+
+ match_v = kzalloc(MLX5_ST_SZ_BYTES(fte_match_param), GFP_KERNEL);
+ match_c = kzalloc(MLX5_ST_SZ_BYTES(fte_match_param), GFP_KERNEL);
+ if (!match_v || !match_c) {
+ esw_warn(esw->dev, "FDB: Failed to alloc match parameters\n");
+ flow_rule = ERR_PTR(-ENOMEM);
+ goto out;
+ }
+
+ misc = MLX5_ADDR_OF(fte_match_param, match_v, misc_parameters);
+ MLX5_SET(fte_match_set_misc, misc, source_sqn, sqn);
+ MLX5_SET(fte_match_set_misc, misc, source_port, 0x0); /* source vport is 0 */
+
+ misc = MLX5_ADDR_OF(fte_match_param, match_c, misc_parameters);
+ MLX5_SET_TO_ONES(fte_match_set_misc, misc, source_sqn);
+ MLX5_SET_TO_ONES(fte_match_set_misc, misc, source_port);
+
+ dest.type = MLX5_FLOW_DESTINATION_TYPE_VPORT;
+ dest.vport_num = vport;
+
+ flow_rule = mlx5_add_flow_rule(esw->fdb_table.fdb, match_header, match_c,
+ match_v, MLX5_FLOW_CONTEXT_ACTION_FWD_DEST,
+ 0, &dest);
+ if (IS_ERR(flow_rule))
+ esw_warn(esw->dev, "FDB: Failed to add send to vport rule err %ld\n", PTR_ERR(flow_rule));
+out:
+ kfree(match_v);
+ kfree(match_c);
+ return flow_rule;
+}
+
static int esw_add_fdb_miss_rule(struct mlx5_eswitch *esw)
{
struct mlx5_flow_destination dest;
--
2.8.0
^ permalink raw reply related [flat|nested] 47+ messages in thread
* [PATCH net-next 05/16] net/mlx5: Introduce offloads steering namespace
2016-06-27 16:07 [PATCH net-next 00/16] Mellanox 100G SRIOV E-Switch offload and VF representors Saeed Mahameed
` (3 preceding siblings ...)
2016-06-27 16:07 ` [PATCH net-next 04/16] net/mlx5: E-Switch, Add API to create send-to-vport rules Saeed Mahameed
@ 2016-06-27 16:07 ` Saeed Mahameed
2016-06-27 16:07 ` [PATCH net-next 06/16] net/mlx5: E-Switch, Add offloads table Saeed Mahameed
` (10 subsequent siblings)
15 siblings, 0 replies; 47+ messages in thread
From: Saeed Mahameed @ 2016-06-27 16:07 UTC (permalink / raw)
To: David S. Miller
Cc: netdev, Or Gerlitz, Hadar Hen-Zion, Jiri Pirko, Andy Gospodarek,
Jesse Brandeburg, John Fastabend, Amir Vadai, Saeed Mahameed
From: Or Gerlitz <ogerlitz@mellanox.com>
Add a new namespace (MLX5_FLOW_NAMESPACE_OFFLOADS) to be populated
with flow steering rules that deal with rules that have to
be executed before the EN NIC steering rules are matched.
The namespace is located after the bypass name-space and before the
kernel name-space. Therefore, it precedes the HW processing done for
rules set for the kernel NIC name-space.
Under SRIOV, it would allow us to match on e-switch missed packets
and forward them to the relevant VF representor TIR.
Signed-off-by: Or Gerlitz <ogerlitz@mellanox.com>
Signed-off-by: Amir Vadai <amir@vadai.me>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
drivers/net/ethernet/mellanox/mlx5/core/fs_core.c | 11 ++++++++++-
include/linux/mlx5/fs.h | 1 +
2 files changed, 11 insertions(+), 1 deletion(-)
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c
index e912a3d..b040110 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c
@@ -83,6 +83,11 @@
#define ANCHOR_NUM_LEVELS 1
#define ANCHOR_NUM_PRIOS 1
#define ANCHOR_MIN_LEVEL (BY_PASS_MIN_LEVEL + 1)
+
+#define OFFLOADS_MAX_FT 1
+#define OFFLOADS_NUM_PRIOS 1
+#define OFFLOADS_MIN_LEVEL (ANCHOR_MIN_LEVEL + 1)
+
struct node_caps {
size_t arr_sz;
long *caps;
@@ -98,7 +103,7 @@ static struct init_tree_node {
int num_levels;
} root_fs = {
.type = FS_TYPE_NAMESPACE,
- .ar_size = 4,
+ .ar_size = 5,
.children = (struct init_tree_node[]) {
ADD_PRIO(0, BY_PASS_MIN_LEVEL, 0,
FS_REQUIRED_CAPS(FS_CAP(flow_table_properties_nic_receive.flow_modify_en),
@@ -107,6 +112,9 @@ static struct init_tree_node {
FS_CAP(flow_table_properties_nic_receive.flow_table_modify)),
ADD_NS(ADD_MULTIPLE_PRIO(MLX5_BY_PASS_NUM_PRIOS,
BY_PASS_PRIO_NUM_LEVELS))),
+ ADD_PRIO(0, OFFLOADS_MIN_LEVEL, 0, {},
+ ADD_NS(ADD_MULTIPLE_PRIO(OFFLOADS_NUM_PRIOS, OFFLOADS_MAX_FT))),
+
ADD_PRIO(0, KERNEL_MIN_LEVEL, 0, {},
ADD_NS(ADD_MULTIPLE_PRIO(1, 1),
ADD_MULTIPLE_PRIO(KERNEL_NIC_NUM_PRIOS,
@@ -1369,6 +1377,7 @@ struct mlx5_flow_namespace *mlx5_get_flow_namespace(struct mlx5_core_dev *dev,
switch (type) {
case MLX5_FLOW_NAMESPACE_BYPASS:
+ case MLX5_FLOW_NAMESPACE_OFFLOADS:
case MLX5_FLOW_NAMESPACE_KERNEL:
case MLX5_FLOW_NAMESPACE_LEFTOVERS:
case MLX5_FLOW_NAMESPACE_ANCHOR:
diff --git a/include/linux/mlx5/fs.h b/include/linux/mlx5/fs.h
index 4b7a107..6ad1119 100644
--- a/include/linux/mlx5/fs.h
+++ b/include/linux/mlx5/fs.h
@@ -54,6 +54,7 @@ static inline void build_leftovers_ft_param(int *priority,
enum mlx5_flow_namespace_type {
MLX5_FLOW_NAMESPACE_BYPASS,
+ MLX5_FLOW_NAMESPACE_OFFLOADS,
MLX5_FLOW_NAMESPACE_KERNEL,
MLX5_FLOW_NAMESPACE_LEFTOVERS,
MLX5_FLOW_NAMESPACE_ANCHOR,
--
2.8.0
^ permalink raw reply related [flat|nested] 47+ messages in thread
* [PATCH net-next 06/16] net/mlx5: E-Switch, Add offloads table
2016-06-27 16:07 [PATCH net-next 00/16] Mellanox 100G SRIOV E-Switch offload and VF representors Saeed Mahameed
` (4 preceding siblings ...)
2016-06-27 16:07 ` [PATCH net-next 05/16] net/mlx5: Introduce offloads steering namespace Saeed Mahameed
@ 2016-06-27 16:07 ` Saeed Mahameed
2016-06-27 16:07 ` [PATCH net-next 07/16] net/mlx5: E-Switch, Add API to create vport rx rules Saeed Mahameed
` (9 subsequent siblings)
15 siblings, 0 replies; 47+ messages in thread
From: Saeed Mahameed @ 2016-06-27 16:07 UTC (permalink / raw)
To: David S. Miller
Cc: netdev, Or Gerlitz, Hadar Hen-Zion, Jiri Pirko, Andy Gospodarek,
Jesse Brandeburg, John Fastabend, Saeed Mahameed
From: Or Gerlitz <ogerlitz@mellanox.com>
Belongs to the NIC offloads name-space, and to be used as part of the
SRIOV offloads logic to steer packets that hit the e-switch miss rule
to the TIR of the relevant VF representor.
Signed-off-by: Or Gerlitz <ogerlitz@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
drivers/net/ethernet/mellanox/mlx5/core/eswitch.h | 5 ++++
.../ethernet/mellanox/mlx5/core/eswitch_offloads.c | 31 ++++++++++++++++++++++
2 files changed, 36 insertions(+)
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h
index b7fabd1..32db37a 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h
@@ -155,6 +155,10 @@ enum {
SRIOV_OFFLOADS
};
+struct mlx5_esw_offload {
+ struct mlx5_flow_table *ft_offloads;
+};
+
struct mlx5_eswitch {
struct mlx5_core_dev *dev;
struct mlx5_l2_table l2_table;
@@ -169,6 +173,7 @@ struct mlx5_eswitch {
*/
struct mutex state_lock;
struct esw_mc_addr *mc_promisc;
+ struct mlx5_esw_offload offloads;
int mode;
};
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
index a8be43d..3ca926b 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
@@ -214,3 +214,34 @@ void esw_destroy_offloads_fdb_table(struct mlx5_eswitch *esw)
mlx5_destroy_flow_table(esw->fdb_table.fdb);
}
+
+static int esw_create_offloads_table(struct mlx5_eswitch *esw)
+{
+ struct mlx5_flow_namespace *ns;
+ struct mlx5_flow_table *ft_offloads;
+ struct mlx5_core_dev *dev = esw->dev;
+ int err = 0;
+
+ ns = mlx5_get_flow_namespace(dev, MLX5_FLOW_NAMESPACE_OFFLOADS);
+ if (!ns) {
+ esw_warn(esw->dev, "Failed to get offloads flow namespace\n");
+ return -ENOMEM;
+ }
+
+ ft_offloads = mlx5_create_flow_table(ns, 0, dev->priv.sriov.num_vfs + 2, 0);
+ if (IS_ERR(ft_offloads)) {
+ err = PTR_ERR(ft_offloads);
+ esw_warn(esw->dev, "Failed to create offloads table, err %d\n", err);
+ return err;
+ }
+
+ esw->offloads.ft_offloads = ft_offloads;
+ return 0;
+}
+
+static void esw_destroy_offloads_table(struct mlx5_eswitch *esw)
+{
+ struct mlx5_esw_offload *offloads = &esw->offloads;
+
+ mlx5_destroy_flow_table(offloads->ft_offloads);
+}
--
2.8.0
^ permalink raw reply related [flat|nested] 47+ messages in thread
* [PATCH net-next 07/16] net/mlx5: E-Switch, Add API to create vport rx rules
2016-06-27 16:07 [PATCH net-next 00/16] Mellanox 100G SRIOV E-Switch offload and VF representors Saeed Mahameed
` (5 preceding siblings ...)
2016-06-27 16:07 ` [PATCH net-next 06/16] net/mlx5: E-Switch, Add offloads table Saeed Mahameed
@ 2016-06-27 16:07 ` Saeed Mahameed
2016-06-27 16:07 ` [PATCH net-next 08/16] net/devlink: Add E-Switch mode control Saeed Mahameed
` (8 subsequent siblings)
15 siblings, 0 replies; 47+ messages in thread
From: Saeed Mahameed @ 2016-06-27 16:07 UTC (permalink / raw)
To: David S. Miller
Cc: netdev, Or Gerlitz, Hadar Hen-Zion, Jiri Pirko, Andy Gospodarek,
Jesse Brandeburg, John Fastabend, Saeed Mahameed
From: Or Gerlitz <ogerlitz@mellanox.com>
Add the API to create vport rx rules of the form
packet meta-data :: vport == $VPORT --> $TIR
where the TIR is opened by this VF representor.
This logic will be used for packets that didn't match any rule in the
e-switch datapath and should be received into the host OS through the
netdevice that represents the VF they were sent from.
Signed-off-by: Or Gerlitz <ogerlitz@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
drivers/net/ethernet/mellanox/mlx5/core/eswitch.h | 4 +
.../ethernet/mellanox/mlx5/core/eswitch_offloads.c | 85 ++++++++++++++++++++++
2 files changed, 89 insertions(+)
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h
index 32db37a..cf959f7 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h
@@ -157,6 +157,7 @@ enum {
struct mlx5_esw_offload {
struct mlx5_flow_table *ft_offloads;
+ struct mlx5_flow_group *vport_rx_group;
};
struct mlx5_eswitch {
@@ -201,6 +202,9 @@ int mlx5_eswitch_get_vport_stats(struct mlx5_eswitch *esw,
struct mlx5_flow_rule *
mlx5_eswitch_add_send_to_vport_rule(struct mlx5_eswitch *esw, int vport, u32 sqn);
+struct mlx5_flow_rule *
+mlx5_eswitch_create_vport_rx_rule(struct mlx5_eswitch *esw, int vport, u32 tirn);
+
#define MLX5_DEBUG_ESWITCH_MASK BIT(3)
#define esw_info(dev, format, ...) \
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
index 3ca926b..67ff1e8 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
@@ -245,3 +245,88 @@ static void esw_destroy_offloads_table(struct mlx5_eswitch *esw)
mlx5_destroy_flow_table(offloads->ft_offloads);
}
+
+static int esw_create_vport_rx_group(struct mlx5_eswitch *esw)
+{
+ int inlen = MLX5_ST_SZ_BYTES(create_flow_group_in);
+ struct mlx5_flow_group *g;
+ struct mlx5_priv *priv = &esw->dev->priv;
+ u32 *flow_group_in;
+ void *match_criteria, *misc;
+ int err = 0;
+ int nvports = priv->sriov.num_vfs + 2;
+
+ flow_group_in = mlx5_vzalloc(inlen);
+ if (!flow_group_in)
+ return -ENOMEM;
+
+ /* create vport rx group: flow indices 0..nvports-1, matching on source_port only */
+ memset(flow_group_in, 0, inlen);
+ MLX5_SET(create_flow_group_in, flow_group_in, match_criteria_enable,
+ MLX5_MATCH_MISC_PARAMETERS);
+
+ match_criteria = MLX5_ADDR_OF(create_flow_group_in, flow_group_in, match_criteria);
+ misc = MLX5_ADDR_OF(fte_match_param, match_criteria, misc_parameters);
+ MLX5_SET_TO_ONES(fte_match_set_misc, misc, source_port);
+
+ MLX5_SET(create_flow_group_in, flow_group_in, start_flow_index, 0);
+ MLX5_SET(create_flow_group_in, flow_group_in, end_flow_index, nvports - 1);
+
+ g = mlx5_create_flow_group(esw->offloads.ft_offloads, flow_group_in);
+
+ if (IS_ERR(g)) {
+ err = PTR_ERR(g);
+ mlx5_core_warn(esw->dev, "Failed to create vport rx group err %d\n", err);
+ goto out;
+ }
+
+ esw->offloads.vport_rx_group = g; /* kept for esw_destroy_vport_rx_group() */
+out:
+ kvfree(flow_group_in); /* mlx5_vzalloc() may fall back to vmalloc memory, so kfree() is not valid here */
+ return err;
+}
+
+static void esw_destroy_vport_rx_group(struct mlx5_eswitch *esw)
+{
+ mlx5_destroy_flow_group(esw->offloads.vport_rx_group);
+}
+
+struct mlx5_flow_rule *
+mlx5_eswitch_create_vport_rx_rule(struct mlx5_eswitch *esw, int vport, u32 tirn)
+{
+ struct mlx5_flow_destination dest;
+ struct mlx5_flow_rule *flow_rule;
+ int match_header = MLX5_MATCH_MISC_PARAMETERS;
+ u32 *match_v, *match_c;
+ void *misc;
+
+ match_v = kzalloc(MLX5_ST_SZ_BYTES(fte_match_param), GFP_KERNEL);
+ match_c = kzalloc(MLX5_ST_SZ_BYTES(fte_match_param), GFP_KERNEL);
+ if (!match_v || !match_c) {
+ esw_warn(esw->dev, "Failed to alloc match parameters\n");
+ flow_rule = ERR_PTR(-ENOMEM);
+ goto out;
+ }
+
+ misc = MLX5_ADDR_OF(fte_match_param, match_v, misc_parameters);
+ MLX5_SET(fte_match_set_misc, misc, source_port, vport);
+
+ misc = MLX5_ADDR_OF(fte_match_param, match_c, misc_parameters);
+ MLX5_SET_TO_ONES(fte_match_set_misc, misc, source_port);
+
+ dest.type = MLX5_FLOW_DESTINATION_TYPE_TIR;
+ dest.tir_num = tirn;
+
+ flow_rule = mlx5_add_flow_rule(esw->offloads.ft_offloads, match_header, match_c,
+ match_v, MLX5_FLOW_CONTEXT_ACTION_FWD_DEST,
+ 0, &dest);
+ if (IS_ERR(flow_rule)) {
+ esw_warn(esw->dev, "fs offloads: Failed to add vport rx rule err %ld\n", PTR_ERR(flow_rule));
+ goto out;
+ }
+
+out:
+ kfree(match_v);
+ kfree(match_c);
+ return flow_rule;
+}
--
2.8.0
^ permalink raw reply related [flat|nested] 47+ messages in thread
* [PATCH net-next 08/16] net/devlink: Add E-Switch mode control
2016-06-27 16:07 [PATCH net-next 00/16] Mellanox 100G SRIOV E-Switch offload and VF representors Saeed Mahameed
` (6 preceding siblings ...)
2016-06-27 16:07 ` [PATCH net-next 07/16] net/mlx5: E-Switch, Add API to create vport rx rules Saeed Mahameed
@ 2016-06-27 16:07 ` Saeed Mahameed
2016-06-28 5:57 ` John Fastabend
2016-06-28 12:27 ` Jiri Pirko
2016-06-27 16:07 ` [PATCH net-next 09/16] net/mlx5: Add devlink interface Saeed Mahameed
` (7 subsequent siblings)
15 siblings, 2 replies; 47+ messages in thread
From: Saeed Mahameed @ 2016-06-27 16:07 UTC (permalink / raw)
To: David S. Miller
Cc: netdev, Or Gerlitz, Hadar Hen-Zion, Jiri Pirko, Andy Gospodarek,
Jesse Brandeburg, John Fastabend, Saeed Mahameed
From: Or Gerlitz <ogerlitz@mellanox.com>
Add the commands to set and show the mode of SRIOV E-Switch,
two modes are supported:
* legacy : operating in the "old" L2 based mode (DMAC --> VF vport)
* offloads : offloading SW rules/policy (e.g Bridge/FDB or TC/Flows based) set by the host OS
Signed-off-by: Or Gerlitz <ogerlitz@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
include/net/devlink.h | 3 ++
include/uapi/linux/devlink.h | 9 +++++
net/core/devlink.c | 87 ++++++++++++++++++++++++++++++++++++++++++++
3 files changed, 99 insertions(+)
diff --git a/include/net/devlink.h b/include/net/devlink.h
index 1d45b61..c99ffe8 100644
--- a/include/net/devlink.h
+++ b/include/net/devlink.h
@@ -90,6 +90,9 @@ struct devlink_ops {
u16 tc_index,
enum devlink_sb_pool_type pool_type,
u32 *p_cur, u32 *p_max);
+
+ int (*eswitch_mode_get)(struct devlink *devlink, u16 *p_mode);
+ int (*eswitch_mode_set)(struct devlink *devlink, u16 mode);
};
static inline void *devlink_priv(struct devlink *devlink)
diff --git a/include/uapi/linux/devlink.h b/include/uapi/linux/devlink.h
index ba0073b..dd7c1b4 100644
--- a/include/uapi/linux/devlink.h
+++ b/include/uapi/linux/devlink.h
@@ -57,6 +57,8 @@ enum devlink_command {
DEVLINK_CMD_SB_OCC_SNAPSHOT,
DEVLINK_CMD_SB_OCC_MAX_CLEAR,
+ DEVLINK_CMD_ESWITCH_MODE_GET,
+ DEVLINK_CMD_ESWITCH_MODE_SET,
/* add new commands above here */
__DEVLINK_CMD_MAX,
@@ -95,6 +97,12 @@ enum devlink_sb_threshold_type {
#define DEVLINK_SB_THRESHOLD_TO_ALPHA_MAX 20
+enum devlink_eswitch_mode {
+ DEVLINK_ESWITCH_MODE_NONE,
+ DEVLINK_ESWITCH_MODE_LEGACY,
+ DEVLINK_ESWITCH_MODE_OFFLOADS,
+};
+
enum devlink_attr {
/* don't change the order or add anything between, this is ABI! */
DEVLINK_ATTR_UNSPEC,
@@ -125,6 +133,7 @@ enum devlink_attr {
DEVLINK_ATTR_SB_TC_INDEX, /* u16 */
DEVLINK_ATTR_SB_OCC_CUR, /* u32 */
DEVLINK_ATTR_SB_OCC_MAX, /* u32 */
+ DEVLINK_ATTR_ESWITCH_MODE, /* u16 */
/* add new attributes above here, update the policy in devlink.c */
diff --git a/net/core/devlink.c b/net/core/devlink.c
index 933e8d4..b2e592a 100644
--- a/net/core/devlink.c
+++ b/net/core/devlink.c
@@ -1394,6 +1394,78 @@ static int devlink_nl_cmd_sb_occ_max_clear_doit(struct sk_buff *skb,
return -EOPNOTSUPP;
}
+static int devlink_eswitch_fill(struct sk_buff *msg, struct devlink *devlink,
+ enum devlink_command cmd, u32 portid,
+ u32 seq, int flags, u16 mode)
+{
+ void *hdr;
+
+ hdr = genlmsg_put(msg, portid, seq, &devlink_nl_family, flags, cmd);
+ if (!hdr)
+ return -EMSGSIZE;
+
+ if (devlink_nl_put_handle(msg, devlink))
+ goto nla_put_failure;
+
+ if (nla_put_u16(msg, DEVLINK_ATTR_ESWITCH_MODE, mode))
+ goto nla_put_failure;
+
+ genlmsg_end(msg, hdr);
+ return 0;
+
+nla_put_failure:
+ genlmsg_cancel(msg, hdr);
+ return -EMSGSIZE;
+}
+
+static int devlink_nl_cmd_eswitch_mode_get_doit(struct sk_buff *skb,
+ struct genl_info *info)
+{
+ struct devlink *devlink = info->user_ptr[0];
+ const struct devlink_ops *ops = devlink->ops;
+ struct sk_buff *msg;
+ u16 mode;
+ int err;
+
+ if (!ops || !ops->eswitch_mode_get)
+ return -EOPNOTSUPP;
+
+ err = ops->eswitch_mode_get(devlink, &mode);
+ if (err)
+ return err;
+
+ msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+ if (!msg)
+ return -ENOMEM;
+
+ err = devlink_eswitch_fill(msg, devlink, DEVLINK_CMD_ESWITCH_MODE_GET,
+ info->snd_portid, info->snd_seq, 0, mode);
+
+ if (err) {
+ nlmsg_free(msg);
+ return err;
+ }
+
+ return genlmsg_reply(msg, info);
+}
+
+static int devlink_nl_cmd_eswitch_mode_set_doit(struct sk_buff *skb,
+ struct genl_info *info)
+{
+ struct devlink *devlink = info->user_ptr[0];
+ const struct devlink_ops *ops = devlink->ops;
+ u16 mode;
+
+ if (!info->attrs[DEVLINK_ATTR_ESWITCH_MODE])
+ return -EINVAL;
+
+ mode = nla_get_u16(info->attrs[DEVLINK_ATTR_ESWITCH_MODE]);
+
+ if (ops && ops->eswitch_mode_set)
+ return ops->eswitch_mode_set(devlink, mode);
+ return -EOPNOTSUPP;
+}
+
static const struct nla_policy devlink_nl_policy[DEVLINK_ATTR_MAX + 1] = {
[DEVLINK_ATTR_BUS_NAME] = { .type = NLA_NUL_STRING },
[DEVLINK_ATTR_DEV_NAME] = { .type = NLA_NUL_STRING },
@@ -1407,6 +1479,7 @@ static const struct nla_policy devlink_nl_policy[DEVLINK_ATTR_MAX + 1] = {
[DEVLINK_ATTR_SB_POOL_THRESHOLD_TYPE] = { .type = NLA_U8 },
[DEVLINK_ATTR_SB_THRESHOLD] = { .type = NLA_U32 },
[DEVLINK_ATTR_SB_TC_INDEX] = { .type = NLA_U16 },
+ [DEVLINK_ATTR_ESWITCH_MODE] = { .type = NLA_U16 },
};
static const struct genl_ops devlink_nl_ops[] = {
@@ -1525,6 +1598,20 @@ static const struct genl_ops devlink_nl_ops[] = {
DEVLINK_NL_FLAG_NEED_SB |
DEVLINK_NL_FLAG_LOCK_PORTS,
},
+ {
+ .cmd = DEVLINK_CMD_ESWITCH_MODE_GET,
+ .doit = devlink_nl_cmd_eswitch_mode_get_doit,
+ .policy = devlink_nl_policy,
+ .flags = GENL_ADMIN_PERM,
+ .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK,
+ },
+ {
+ .cmd = DEVLINK_CMD_ESWITCH_MODE_SET,
+ .doit = devlink_nl_cmd_eswitch_mode_set_doit,
+ .policy = devlink_nl_policy,
+ .flags = GENL_ADMIN_PERM,
+ .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK,
+ },
};
/**
--
2.8.0
^ permalink raw reply related [flat|nested] 47+ messages in thread
* Re: [PATCH net-next 08/16] net/devlink: Add E-Switch mode control
2016-06-27 16:07 ` [PATCH net-next 08/16] net/devlink: Add E-Switch mode control Saeed Mahameed
@ 2016-06-28 5:57 ` John Fastabend
2016-06-28 10:25 ` Or Gerlitz
2016-06-28 12:27 ` Jiri Pirko
1 sibling, 1 reply; 47+ messages in thread
From: John Fastabend @ 2016-06-28 5:57 UTC (permalink / raw)
To: Saeed Mahameed, David S. Miller
Cc: netdev, Or Gerlitz, Hadar Hen-Zion, Jiri Pirko, Andy Gospodarek,
Jesse Brandeburg, John Fastabend
On 16-06-27 09:07 AM, Saeed Mahameed wrote:
> From: Or Gerlitz <ogerlitz@mellanox.com>
>
> Add the commands to set and show the mode of SRIOV E-Switch,
> two modes are supported:
>
> * legacy : operating in the "old" L2 based mode (DMAC --> VF vport)
> * offloads : offloading SW rules/policy (e.g Bridge/FDB or TC/Flows based) set by the host OS
>
> Signed-off-by: Or Gerlitz <ogerlitz@mellanox.com>
> Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
> ---
Hi,
Nice work overall also I really appreciated that the core networking
interfaces appear to be able to support this without any change.
On this patch though do we really need modes like this? My concern with
modes is twofold. One, it's another knob that some controller will have
to get right which I would prefer to avoid. And two I suspect switching
between the two modes flushes the tables or leaves them in some
unexpected state? At least I can't figure out what the expected should
be off-hand.
Could we instead continue to use the "legacy" mode by default by just
populating the fdb table correctly and then if users want to enable
the "offloads" mode they can modify the fdb tables by deleting entries
or adding them or just extending the dmac/vf mapping via 'tc'. This
would seem natural to me. The flooding rules in fdb might need to be
exposed a bit more cleanly to get the right default flooding behavior
etc. But to me at least this would be much cleaner. Everything will be
nicely defined and we wont have issues with drivers doing slightly
and subtle different defaults between legacy/offload and the transitions
between the states or on resets or etc. If users need to discover the
current configuration then they just query fdb, query tc, and the state
is known no need for any magic toggle switch as best I can see.
Otherwise I didn't review the mlx code but read the commit msgs and
it looks good. I'll take a closer look in the morning.
Thanks,
John
^ permalink raw reply [flat|nested] 47+ messages in thread
* Re: [PATCH net-next 08/16] net/devlink: Add E-Switch mode control
2016-06-28 5:57 ` John Fastabend
@ 2016-06-28 10:25 ` Or Gerlitz
2016-06-28 16:19 ` John Fastabend
0 siblings, 1 reply; 47+ messages in thread
From: Or Gerlitz @ 2016-06-28 10:25 UTC (permalink / raw)
To: John Fastabend
Cc: Saeed Mahameed, David S. Miller, netdev, Hadar Hen-Zion,
Jiri Pirko, Andy Gospodarek, Jesse Brandeburg, John Fastabend,
Ido Schimmel
On 6/28/2016 8:57 AM, John Fastabend wrote:
> On 16-06-27 09:07 AM, Saeed Mahameed wrote:
>> Add the commands to set and show the mode of SRIOV E-Switch, two modes are supported:
>>
>> * legacy : operating in the "old" L2 based mode (DMAC --> VF vport)
>> * offloads : offloading SW rules/policy (e.g Bridge/FDB or TC/Flows based) set by the host OS
>>
>> Nice work overall also I really appreciated that the core networking
>> interfaces appear to able to support this without any change.
thanks..
>> On this patch though do we really need modes like this? My concern with
>> modes is two fold. One its another knob that some controller will have
>> to get right which I would prefer to avoid. And two I suspect switching
>> between the two modes flushes the tables or leaves them in some
>> unexpected state? At least I can't figure out what the expected should
>> be off-hand.
Re the 1st concern (another knob), I think we do want that, see below
Re the 2nd concern, I will re-read the cover letter and change logs and
if needed clarify/improve: the transition is clean! When you are moving
from legacy to offloads or the other way around, nothing is left in
unexpected state, all HW forwarding tables as filled by the current
mode are flushed and next they are set as needed for the new mode.
>> Could we instead continue to use the "legacy" mode by default by just
>> populating the fdb table correctly and then if users want to enable
>> the "offloads" mode they can modify the fdb tables by deleting entries
>> or adding them or just extending the dmac/vf mapping via 'tc'. This
>> would seem natural to me. The flooding rules in fdb might need to be
>> exposed a bit more cleanly to get the right default flooding behavior
>> etc. But to me at least this would be much cleaner. Everything will be
>> nicely defined and we wont have issues with drivers doing slightly
>> and subtle different defaults between legacy/offload and the transitions
>> between the states or on resets or etc. If users need to discover the
>> current configuration then they just query fdb, query tc, and the state
>> is known no need for any magic toggle switch as best I can see.
Few comments here:
Each mode has its own way of the driver doing setup for the HW tables
and how population of the HW tables is done.
The offloads mode needs to create a black hole miss rule and
send-to-vport rules and create the tables so they can contain later
rules set by the kernel in a way which is HW/driver dependent.
The legacy mode creates the tables differently and populates them later
with rules set by
the driver and not the kernel.
Even if we put the different table setup issue aside, I don't think it
would be correct for bridge/tc to remove rules they didn't add, which is
needed under your proposal when moving from legacy type rules to
offloads mode. Querying is problematic too, since legacy could (and
does) involve some default rules set by the FW, e.g that deals with
outer world (== not belonging to VM on this host) MACs which are
invisible to the driver.
That legacy was here and we can't avoid handling it properly for which
this knob is needed. Note that a vendor can choose to put their default
to be offloads, hopefully over time, we will all go there :)
>> Otherwise I didn't review the mlx code but read the commit msgs and
>> it looks good. I'll take a closer look in the morning.
appreciated
^ permalink raw reply [flat|nested] 47+ messages in thread
* Re: [PATCH net-next 08/16] net/devlink: Add E-Switch mode control
2016-06-28 10:25 ` Or Gerlitz
@ 2016-06-28 16:19 ` John Fastabend
2016-06-28 17:19 ` John Fastabend
2016-06-29 9:44 ` Or Gerlitz
0 siblings, 2 replies; 47+ messages in thread
From: John Fastabend @ 2016-06-28 16:19 UTC (permalink / raw)
To: Or Gerlitz
Cc: Saeed Mahameed, David S. Miller, netdev, Hadar Hen-Zion,
Jiri Pirko, Andy Gospodarek, Jesse Brandeburg, John Fastabend,
Ido Schimmel
On 16-06-28 03:25 AM, Or Gerlitz wrote:
> On 6/28/2016 8:57 AM, John Fastabend wrote:
>> On 16-06-27 09:07 AM, Saeed Mahameed wrote:
>>> Add the commands to set and show the mode of SRIOV E-Switch, two
>>> modes are supported:
>>>
>>> * legacy : operating in the "old" L2 based mode (DMAC --> VF vport)
>>> * offloads : offloading SW rules/policy (e.g Bridge/FDB or TC/Flows
>>> based) set by the host OS
>>>
>>> Nice work overall also I really appreciated that the core networking
>>> interfaces appear to able to support this without any change.
>
> thanks..
>
>>> On this patch though do we really need modes like this? My concern with
>>> modes is two fold. One its another knob that some controller will have
>>> to get right which I would prefer to avoid. And two I suspect switching
>>> between the two modes flushes the tables or leaves them in some
>>> unexpected state? At least I can't figure out what the expected should
>>> be off-hand.
>
> Re the 1st concern (another knob), I think we do want that, see below
>
> Re the 2nd concern, I will re-read the cover letter and change logs and
> if needed clarify/improve: the transition is clean! When you are moving
> from legacy to offloads or the other way around, nothing is left in
> unexpected state, all HW forwarding tables as filled by the current
> mode are flushed and next they are set as needed for the new mode.
>
OK if I had read the entire patch series maybe I would have caught this
:)
>>> Could we instead continue to use the "legacy" mode by default by just
>>> populating the fdb table correctly and then if users want to enable
>>> the "offloads" mode they can modify the fdb tables by deleting entries
>>> or adding them or just extending the dmac/vf mapping via 'tc'. This
>>> would seem natural to me. The flooding rules in fdb might need to be
>>> exposed a bit more cleanly to get the right default flooding behavior
>>> etc. But to me at least this would be much cleaner. Everything will be
>>> nicely defined and we wont have issues with drivers doing slightly
>>> and subtle different defaults between legacy/offload and the transitions
>>> between the states or on resets or etc. If users need to discover the
>>> current configuration then they just query fdb, query tc, and the state
>>> is known no need for any magic toggle switch as best I can see.
>
>
> Few comments here:
>
> Each mode has it's own way of the driver doing setup for the HW tables
> and how population of the HW tables is done.
hmm so in the hardware I have there is actually a l2 table and various
other tables so I don't have any issue with doing table setup. I would
like to see a table_create/table_delete/table_show devlink commands at
some point though but I'm not there yet. This would allow users to
optimize the table slices if they cared to. But that is future work
IMO. Certainly not needed in this series at least. If you want I can
show you a patch I had for this against rocker but it was before devlink
so it would need some porting.
>
> The offloads mode needs to create a black hole miss rule and
> send-to-vport rules and create the tables so they can contain later
> rules set by the kernel in a way which is HW/driver dependent.
Agreed a black hole miss rule needs to be applied but rather than apply
it automatically with some toggle I would prefer to just add a 'tc' rule
for this. Or alternatively it can be added by configuring flooding
ports so that only a single port is in the flooding mode. This could
all be done via 'bridge fdb ...' and 'bridge link ...' today I believe.
Then the user defines the state and not the driver writer. It really is
cleaner in my opinion.
One oddball case I have is if I have two PF functions behind a single
network facing port. Yes its a bit strange but in this case its nice to
pick which host facing PF to flood on vs the driver picking one.
And send-to-vport rules I'm not entirely clear on what these actually
are used for. Is this a rule to match packets sent from a VF representer
netdev to the actual VF pcie device? If this is the case its seems to
me that any packet sent on a VF representer should be sent to the VF
directly and these rules can be created when the VF is created. Or did
you mean some other rule by this?
>
> The legacy mode creates the tables differently and populates them later
> with rule set by
> the driver and not the kernel.
>
> Even if we put the different table setup issue a side, I don't think it
> would be correct for bridge/tc to remove rules they didn't add, which is
> needed under your proposal when moving from legacy type rules to
> offloads mode. Querying is problematic too, since legacy could (and
> does) involve some default rules set by the FW, e.g that deals with
> outer world (== not belonging to VM on this host) MACs which are
> invisible to the driver.
But even legacy mode should report the correct fdb table and setup.
I don't think querying should be a problem if the driver reports the
configuration correctly. This allows us visibility into the driver
default case so we don't have to guess what driver X writer implemented.
>
> That legacy was here and we can't avoid handling it properly for which
> this knob is needed. Note that a vendor can choose to put their default
> to be offloads, hopefully over time, we will all go there :)
>
But you can come up in legacy mode and report it via the existing
mechanisms 'tc', 'bridge', etc. and then users can transition to any
mode they like using the tools.
I really don't think the switch here is necessary if you implement the
bridge hooks and tc hooks. cls_u32 can handle this for example and I
would expect flower can as well if you want to do mgmt via flow based
tc commands. And the bridge tool has the attributes for per port
flooding but not sure off-hand if its packed into the msg sent to the
driver. But we could fix that fairly easily in another patch series if
needed.
>>> Otherwise I didn't review the mlx code but read the commit msgs and
>>> it looks good. I'll take a closer look in the morning.
>
> appreciated
>
^ permalink raw reply [flat|nested] 47+ messages in thread
* Re: [PATCH net-next 08/16] net/devlink: Add E-Switch mode control
2016-06-28 16:19 ` John Fastabend
@ 2016-06-28 17:19 ` John Fastabend
2016-06-28 18:46 ` Jiri Pirko
2016-06-29 9:44 ` Or Gerlitz
1 sibling, 1 reply; 47+ messages in thread
From: John Fastabend @ 2016-06-28 17:19 UTC (permalink / raw)
To: Or Gerlitz
Cc: Saeed Mahameed, David S. Miller, netdev, Hadar Hen-Zion,
Jiri Pirko, Andy Gospodarek, Jesse Brandeburg, John Fastabend,
Ido Schimmel
On 16-06-28 09:19 AM, John Fastabend wrote:
> On 16-06-28 03:25 AM, Or Gerlitz wrote:
>> On 6/28/2016 8:57 AM, John Fastabend wrote:
>>> On 16-06-27 09:07 AM, Saeed Mahameed wrote:
>>>> Add the commands to set and show the mode of SRIOV E-Switch, two
>>>> modes are supported:
>>>>
>>>> * legacy : operating in the "old" L2 based mode (DMAC --> VF vport)
>>>> * offloads : offloading SW rules/policy (e.g Bridge/FDB or TC/Flows
>>>> based) set by the host OS
>>>>
>>>> Nice work overall also I really appreciated that the core networking
>>>> interfaces appear to able to support this without any change.
>>
>> thanks..
>>
>>>> On this patch though do we really need modes like this? My concern with
>>>> modes is two fold. One its another knob that some controller will have
>>>> to get right which I would prefer to avoid. And two I suspect switching
>>>> between the two modes flushes the tables or leaves them in some
>>>> unexpected state? At least I can't figure out what the expected should
>>>> be off-hand.
>>
>> Re the 1st concern (another knob), I think we do want that, see below
>>
>> Re the 2nd concern, I will re-read the cover letter and change logs and
>> if needed clarify/improve: the transition is clean! When you are moving
>> from legacy to offloads or the other way around, nothing is left in
>> unexpected state, all HW forwarding tables as filled by the current
>> mode are flushed and next they are set as needed for the new mode.
>>
>
> OK if I had read the entire patch series maybe I would have caught this
> :)
>
>>>> Could we instead continue to use the "legacy" mode by default by just
>>>> populating the fdb table correctly and then if users want to enable
>>>> the "offloads" mode they can modify the fdb tables by deleting entries
>>>> or adding them or just extending the dmac/vf mapping via 'tc'. This
>>>> would seem natural to me. The flooding rules in fdb might need to be
>>>> exposed a bit more cleanly to get the right default flooding behavior
>>>> etc. But to me at least this would be much cleaner. Everything will be
>>>> nicely defined and we wont have issues with drivers doing slightly
>>>> and subtle different defaults between legacy/offload and the transitions
>>>> between the states or on resets or etc. If users need to discover the
>>>> current configuration then they just query fdb, query tc, and the state
>>>> is known no need for any magic toggle switch as best I can see.
>>
>>
>> Few comments here:
>>
>> Each mode has it's own way of the driver doing setup for the HW tables
>> and how population of the HW tables is done.
>
> hmm so in the hardware I have there is actually a l2 table and various
> other tables so I don't have any issue with doing table setup. I would
> like to see a table_create/table_delete/table_show devlink commands at
> some point though but I'm not there yet. This would allow users to
> optimize the table slices if they cared to. But that is future work
> IMO. Certainly not needed in this series at least. If you want I can
> show you a patch I had for this against rocker but it was before devlink
> so it would need some porting.
>
>>
>> The offloads mode needs to create a black hole miss rule and
>> send-to-vport rules and create the tables so they can contain later
>> rules set by the kernel in a way which is HW/driver dependent.
>
> Agreed a black hole miss rule needs to be applied but rather than apply
> it automatically with some toggle I would prefer to just add a 'tc' rule
> for this. Or alternatively it can be added by configuring flooding
> ports so that only a single port is in the flooding mode. This could
> all be done via 'bridge fdb ...' and 'bridge link ...' today I believe.
> Then the user defines the state and not the driver writer. It really is
> cleaner in my opinion.
>
> One oddball case I have is if I have two PF functions behind a single
> network facing port. Yes its a bit strange but in this case its nice to
> pick which host facing PF to flood on vs the driver picking one.
>
> And send-to-vport rules I'm not entirely clear on what these actually
> are used for. Is this a rule to match packets sent from a VF representer
> netdev to the actual VF pcie device? If this is the case its seems to
> me that any packet sent on a VF representer should be sent to the VF
> directly and these rules can be created when the VF is created. Or did
> you mean some other rule by this?
>
>>
>> The legacy mode creates the tables differently and populates them later
>> with rule set by
>> the driver and not the kernel.
>>
>> Even if we put the different table setup issue a side, I don't think it
>> would be correct for bridge/tc to remove rules they didn't add, which is
>> needed under your proposal when moving from legacy type rules to
>> offloads mode. Querying is problematic too, since legacy could (and
>> does) involve some default rules set by the FW, e.g that deals with
>> outer world (== not belonging to VM on this host) MACs which are
>> invisible to the driver.
>
> But even legacy mode should report the correct fdb table and setup.
> I don't think querying should be a problem if the driver reports the
> configuration correctly. This allows us visibility into the driver
> default case so we don't have to guess what driver X writer implemented.
>
>>
>> That legacy was here and we can't avoid handling it properly for which
>> this knob is needed. Note that a vendor can choose to put their default
>> to be offloads, hopefully over time, we will all go there :)
>>
>
> But you can come up in legacy mode and report it via the existing
> mechanisms 'tc', 'bridge', etc. and then users can transition to any
> mode they like using the tools.
>
> I really don't think the switch here is necessary if you implement the
> bridge hooks and tc hooks. cls_u32 can handle this for example and I
> would expect flower can as well if you want to do mgmt via flow based
> tc commands. And the bridge tool has the attributes for per port
> flooding but not sure off-hand if its packed into the msg sent to the
> driver. But we could fix that fairly easily in another patch series if
> needed.
>
Actually, with a bit more thought, it might be nice to have a
flag to enable/disable creation of the vf netdev representer in case it
somehow causes issues with existing software. We typically
enable/disable features with ethtool feature flags, though, not via
devlink, so I think it would fit better as an ethtool flag, same as
all the other hardware features.
The above points in the last mail are more about how it influences the
forwarding rules in the switch and my preference would be that it
doesn't change how the forwarding works in the switch and instead the
forwarding state is managed via standard tools 'tc', 'bridge', etc. So
I think my comments are still relevant. However, as long as we get the
correct information back in any mode when we query the nic/switch,
I'm not too concerned. I suspect any software that actually uses this
will have to query and reconfigure either way; counting on driver
writers to get policy correct is not a stable way to write usermode
software.
All that said I don't plan to change the forwarding state this way
with the intel drivers when implementing the vf representer.
Yet another reason not to change the state of the forwarding rules is
that, even on older hardware that only supports l2 mac/vlan based
forwarding, having a VF representer is useful to configure the device
and send/recv some basic control packets (e.g. lldp). On these devices
l2 mac/vlan mode is the only one supported.
>>>> Otherwise I didn't review the mlx code but read the commit msgs and
>>>> it looks good. I'll take a closer look in the morning.
>>
>> appreciated
>>
>
^ permalink raw reply [flat|nested] 47+ messages in thread
* Re: [PATCH net-next 08/16] net/devlink: Add E-Switch mode control
2016-06-28 17:19 ` John Fastabend
@ 2016-06-28 18:46 ` Jiri Pirko
2016-06-28 19:04 ` Samudrala, Sridhar
0 siblings, 1 reply; 47+ messages in thread
From: Jiri Pirko @ 2016-06-28 18:46 UTC (permalink / raw)
To: John Fastabend
Cc: Or Gerlitz, Saeed Mahameed, David S. Miller, netdev,
Hadar Hen-Zion, Jiri Pirko, Andy Gospodarek, Jesse Brandeburg,
John Fastabend, Ido Schimmel
Tue, Jun 28, 2016 at 07:19:06PM CEST, john.fastabend@gmail.com wrote:
>On 16-06-28 09:19 AM, John Fastabend wrote:
>> On 16-06-28 03:25 AM, Or Gerlitz wrote:
>>> On 6/28/2016 8:57 AM, John Fastabend wrote:
>>>> On 16-06-27 09:07 AM, Saeed Mahameed wrote:
>>>>> Add the commands to set and show the mode of SRIOV E-Switch, two
>>>>> modes are supported:
>>>>>
>>>>> * legacy : operating in the "old" L2 based mode (DMAC --> VF vport)
>>>>> * offloads : offloading SW rules/policy (e.g Bridge/FDB or TC/Flows
>>>>> based) set by the host OS
>>>>>
>>>>> Nice work overall also I really appreciated that the core networking
>>>>> interfaces appear to able to support this without any change.
>>>
>>> thanks..
>>>
>>>>> On this patch though do we really need modes like this? My concern with
>>>>> modes is two fold. One its another knob that some controller will have
>>>>> to get right which I would prefer to avoid. And two I suspect switching
>>>>> between the two modes flushes the tables or leaves them in some
>>>>> unexpected state? At least I can't figure out what the expected should
>>>>> be off-hand.
>>>
>>> Re the 1st concern (another knob), I think we do want that, see below
>>>
>>> Re the 2nd concern, I will re-read the cover letter and change logs and
>>> if needed clarify/improve: the transition is clean! When you are moving
>>> from legacy to offloads or the other way around, nothing is left in
>>> unexpected state, all HW forwarding tables as filled by the current
>>> mode are flushed and next they are set as needed for the new mode.
>>>
>>
>> OK if I had read the entire patch series maybe I would have caught this
>> :)
>>
>>>>> Could we instead continue to use the "legacy" mode by default by just
>>>>> populating the fdb table correctly and then if users want to enable
>>>>> the "offloads" mode they can modify the fdb tables by deleting entries
>>>>> or adding them or just extending the dmac/vf mapping via 'tc'. This
>>>>> would seem natural to me. The flooding rules in fdb might need to be
>>>>> exposed a bit more cleanly to get the right default flooding behavior
>>>>> etc. But to me at least this would be much cleaner. Everything will be
>>>>> nicely defined and we wont have issues with drivers doing slightly
>>>>> and subtle different defaults between legacy/offload and the transitions
>>>>> between the states or on resets or etc. If users need to discover the
>>>>> current configuration then they just query fdb, query tc, and the state
>>>>> is known no need for any magic toggle switch as best I can see.
>>>
>>>
>>> Few comments here:
>>>
>>> Each mode has it's own way of the driver doing setup for the HW tables
>>> and how population of the HW tables is done.
>>
>> hmm so in the hardware I have there is actually a l2 table and various
>> other tables so I don't have any issue with doing table setup. I would
>> like to see a table_create/table_delete/table_show devlink commands at
>> some point though but I'm not there yet. This would allow users to
>> optimize the table slices if they cared to. But that is future work
>> IMO. Certainly not needed in this series at least. If you want I can
>> show you a patch I had for this against rocker but it was before devlink
>> so it would need some porting.
>>
>>>
>>> The offloads mode needs to create a black hole miss rule and
>>> send-to-vport rules and create the tables so they can contain later
>>> rules set by the kernel in a way which is HW/driver dependent.
>>
>> Agreed a black hole miss rule needs to be applied but rather than apply
>> it automatically with some toggle I would prefer to just add a 'tc' rule
>> for this. Or alternatively it can be added by configuring flooding
>> ports so that only a single port is in the flooding mode. This could
>> all be done via 'bridge fdb ...' and 'bridge link ...' today I believe.
>> Then the user defines the state and not the driver writer. It really is
>> cleaner in my opinion.
>>
>> One oddball case I have is if I have two PF functions behind a single
>> network facing port. Yes its a bit strange but in this case its nice to
>> pick which host facing PF to flood on vs the driver picking one.
>>
>> And send-to-vport rules I'm not entirely clear on what these actually
>> are used for. Is this a rule to match packets sent from a VF representer
>> netdev to the actual VF pcie device? If this is the case its seems to
>> me that any packet sent on a VF representer should be sent to the VF
>> directly and these rules can be created when the VF is created. Or did
>> you mean some other rule by this?
>>
>>>
>>> The legacy mode creates the tables differently and populates them later
>>> with rule set by
>>> the driver and not the kernel.
>>>
>>> Even if we put the different table setup issue a side, I don't think it
>>> would be correct for bridge/tc to remove rules they didn't add, which is
>>> needed under your proposal when moving from legacy type rules to
>>> offloads mode. Querying is problematic too, since legacy could (and
>>> does) involve some default rules set by the FW, e.g that deals with
>>> outer world (== not belonging to VM on this host) MACs which are
>>> invisible to the driver.
>>
>> But even legacy mode should report the correct fdb table and setup.
>> I don't think querying should be a problem if the driver reports the
>> configuration correctly. This allows us visibility into the driver
>> default case so we don't have to guess what driver X writer implemented.
>>
>>>
>>> That legacy was here and we can't avoid handling it properly for which
>>> this knob is needed. Note that a vendor can choose to put their default
>>> to be offloads, hopefully over time, we will all go there :)
>>>
>>
>> But you can come up in legacy mode and report it via the existing
>> mechanisms 'tc', 'bridge', etc. and then users can transition to any
>> mode they like using the tools.
>>
>> I really don't think the switch here is necessary if you implement the
>> bridge hooks and tc hooks. cls_u32 can handle this for example and I
>> would expect flower can as well if you want to do mgmt via flow based
>> tc commands. And the bridge tool has the attributes for per port
>> flooding but not sure off-hand if its packed into the msg sent to the
>> driver. But we could fix that fairly easily in another patch series if
>> needed.
>>
>
>Actually with a bit more thought it might be nice to have a
>flag to enable/disable creation of vf netdev representer in case it
>somehow causes issues with existing software. We typically
>enable/disable features with ethtool feature flags though not via
>devlink so I think it would fit better as an ethtool flag same as
>all the other hardware features.
This is not a property of a netdevice, but of a devlink device. That should
be a handle for creating/not creating representors. And I think that what
this patch is doing serves that purpose as well. For legacy mode, the
representors are not created; for offload/switchdev mode they are
created.
To me, it does not make one bit of sense to have this in ethtool.
>
>The above points in the last mail are more about how it influences the
>forwarding rules in the switch and my preference would be that it
>doesn't change how the forwarding works in the switch and instead the
>forwarding state is managed via standard tools 'tc', 'bridge', etc. So
>I think my comments are still relevant. However as long as when we
>query the nic/switch we get the correct information back in any mode
>I'm not too concerned I suspect any software that actually uses this
>will have to query and reconfigure either way counting on driver
>writers to get policy correct is not a stable way to write usermode
>software.
>
>All that said I don't plan to change the forwarding state this way
>with the intel drivers when implementing the vf representer.
>
>Yet another reason not to change the state of the forwarding rules is
>even on older hardware that only supports l2 mac/vlan based forwarding
>having a VF representer is useful to configure the device and send/recv
>some basic control packets (e.g. lldp). On these devices l2 mac/vlan
>mode is the only one supported.
>
>>>>> Otherwise I didn't review the mlx code but read the commit msgs and
>>>>> it looks good. I'll take a closer look in the morning.
>>>
>>> appreciated
>>>
>>
>
^ permalink raw reply [flat|nested] 47+ messages in thread
* Re: [PATCH net-next 08/16] net/devlink: Add E-Switch mode control
2016-06-28 18:46 ` Jiri Pirko
@ 2016-06-28 19:04 ` Samudrala, Sridhar
2016-06-28 19:12 ` Jiri Pirko
0 siblings, 1 reply; 47+ messages in thread
From: Samudrala, Sridhar @ 2016-06-28 19:04 UTC (permalink / raw)
To: Jiri Pirko, John Fastabend
Cc: Or Gerlitz, Saeed Mahameed, David S. Miller, netdev,
Hadar Hen-Zion, Jiri Pirko, Andy Gospodarek, Jesse Brandeburg,
John Fastabend, Ido Schimmel
On 6/28/2016 11:46 AM, Jiri Pirko wrote:
> Tue, Jun 28, 2016 at 07:19:06PM CEST, john.fastabend@gmail.com wrote:
>> On 16-06-28 09:19 AM, John Fastabend wrote:
>>> On 16-06-28 03:25 AM, Or Gerlitz wrote:
>>>> On 6/28/2016 8:57 AM, John Fastabend wrote:
>>>>> On 16-06-27 09:07 AM, Saeed Mahameed wrote:
>>>>>> Add the commands to set and show the mode of SRIOV E-Switch, two
>>>>>> modes are supported:
>>>>>>
>>>>>> * legacy : operating in the "old" L2 based mode (DMAC --> VF vport)
>>>>>> * offloads : offloading SW rules/policy (e.g Bridge/FDB or TC/Flows
>>>>>> based) set by the host OS
>>>>>>
>>>>>> Nice work overall also I really appreciated that the core networking
>>>>>> interfaces appear to able to support this without any change.
>>>> thanks..
>>>>
>>>>>> On this patch though do we really need modes like this? My concern with
>>>>>> modes is two fold. One its another knob that some controller will have
>>>>>> to get right which I would prefer to avoid. And two I suspect switching
>>>>>> between the two modes flushes the tables or leaves them in some
>>>>>> unexpected state? At least I can't figure out what the expected should
>>>>>> be off-hand.
>>>> Re the 1st concern (another knob), I think we do want that, see below
>>>>
>>>> Re the 2nd concern, I will re-read the cover letter and change logs and
>>>> if needed clarify/improve: the transition is clean! When you are moving
>>>> from legacy to offloads or the other way around, nothing is left in
>>>> unexpected state, all HW forwarding tables as filled by the current
>>>> mode are flushed and next they are set as needed for the new mode.
>>>>
>>> OK if I had read the entire patch series maybe I would have caught this
>>> :)
>>>
>>>>>> Could we instead continue to use the "legacy" mode by default by just
>>>>>> populating the fdb table correctly and then if users want to enable
>>>>>> the "offloads" mode they can modify the fdb tables by deleting entries
>>>>>> or adding them or just extending the dmac/vf mapping via 'tc'. This
>>>>>> would seem natural to me. The flooding rules in fdb might need to be
>>>>>> exposed a bit more cleanly to get the right default flooding behavior
>>>>>> etc. But to me at least this would be much cleaner. Everything will be
>>>>>> nicely defined and we wont have issues with drivers doing slightly
>>>>>> and subtle different defaults between legacy/offload and the transitions
>>>>>> between the states or on resets or etc. If users need to discover the
>>>>>> current configuration then they just query fdb, query tc, and the state
>>>>>> is known no need for any magic toggle switch as best I can see.
>>>>
>>>> Few comments here:
>>>>
>>>> Each mode has it's own way of the driver doing setup for the HW tables
>>>> and how population of the HW tables is done.
>>> hmm so in the hardware I have there is actually a l2 table and various
>>> other tables so I don't have any issue with doing table setup. I would
>>> like to see a table_create/table_delete/table_show devlink commands at
>>> some point though but I'm not there yet. This would allow users to
>>> optimize the table slices if they cared to. But that is future work
>>> IMO. Certainly not needed in this series at least. If you want I can
>>> show you a patch I had for this against rocker but it was before devlink
>>> so it would need some porting.
>>>
>>>> The offloads mode needs to create a black hole miss rule and
>>>> send-to-vport rules and create the tables so they can contain later
>>>> rules set by the kernel in a way which is HW/driver dependent.
>>> Agreed a black hole miss rule needs to be applied but rather than apply
>>> it automatically with some toggle I would prefer to just add a 'tc' rule
>>> for this. Or alternatively it can be added by configuring flooding
>>> ports so that only a single port is in the flooding mode. This could
>>> all be done via 'bridge fdb ...' and 'bridge link ...' today I believe.
>>> Then the user defines the state and not the driver writer. It really is
>>> cleaner in my opinion.
>>>
>>> One oddball case I have is if I have two PF functions behind a single
>>> network facing port. Yes its a bit strange but in this case its nice to
>>> pick which host facing PF to flood on vs the driver picking one.
>>>
>>> And send-to-vport rules I'm not entirely clear on what these actually
>>> are used for. Is this a rule to match packets sent from a VF representer
>>> netdev to the actual VF pcie device? If this is the case its seems to
>>> me that any packet sent on a VF representer should be sent to the VF
>>> directly and these rules can be created when the VF is created. Or did
>>> you mean some other rule by this?
>>>
>>>> The legacy mode creates the tables differently and populates them later
>>>> with rule set by
>>>> the driver and not the kernel.
>>>>
>>>> Even if we put the different table setup issue a side, I don't think it
>>>> would be correct for bridge/tc to remove rules they didn't add, which is
>>>> needed under your proposal when moving from legacy type rules to
>>>> offloads mode. Querying is problematic too, since legacy could (and
>>>> does) involve some default rules set by the FW, e.g that deals with
>>>> outer world (== not belonging to VM on this host) MACs which are
>>>> invisible to the driver.
>>> But even legacy mode should report the correct fdb table and setup.
>>> I don't think querying should be a problem if the driver reports the
>>> configuration correctly. This allows us visibility into the driver
>>> default case so we don't have to guess what driver X writer implemented.
>>>
>>>> That legacy was here and we can't avoid handling it properly for which
>>>> this knob is needed. Note that a vendor can choose to put their default
>>>> to be offloads, hopefully over time, we will all go there :)
>>>>
>>> But you can come up in legacy mode and report it via the existing
>>> mechanisms 'tc', 'bridge', etc. and then users can transition to any
>>> mode they like using the tools.
>>>
>>> I really don't think the switch here is necessary if you implement the
>>> bridge hooks and tc hooks. cls_u32 can handle this for example and I
>>> would expect flower can as well if you want to do mgmt via flow based
>>> tc commands. And the bridge tool has the attributes for per port
>>> flooding but not sure off-hand if its packed into the msg sent to the
>>> driver. But we could fix that fairly easily in another patch series if
>>> needed.
>>>
>> Actually with a bit more thought it might be nice to have a
>> flag to enable/disable creation of vf netdev representer in case it
>> somehow causes issues with existing software. We typically
>> enable/disable features with ethtool feature flags though not via
>> devlink so I think it would fit better as an ethtool flag same as
>> all the other hardware features.
> This is not a property of a netdevice, but a devlink device. That should
> be a handle of creating/not creating representors. And I think that what
> this patch is doing serves that purpose as well. For legacy mode, the
> representors are not created, for offload/switchdev mode they are
> created.
Even in legacy mode, I think there is value in creating VP representor
netdevs.
We are planning to expose VF statistics, ntuple filters, additional
fdb, vlan entries via this netdev for VFs in the default mode.
Isn't it possible to switch to offloads mode by deleting the 'legacy'
flow rules and adding 'offloads' flow rules from userspace?
>
> Does not make sense to have this in ethtool one bit to me.
>
>
>> The above points in the last mail are more about how it influences the
>> forwarding rules in the switch and my preference would be that it
>> doesn't change how the forwarding works in the switch and instead the
>> forwarding state is managed via standard tools 'tc', 'bridge', etc. So
>> I think my comments are still relevant. However as long as when we
>> query the nic/switch we get the correct information back in any mode
>> I'm not too concerned I suspect any software that actually uses this
>> will have to query and reconfigure either way counting on driver
>> writers to get policy correct is not a stable way to write usermode
>> software.
>>
>> All that said I don't plan to change the forwarding state this way
>> with the intel drivers when implementing the vf representer.
>>
>> Yet another reason not to change the state of the forwarding rules is
>> even on older hardware that only supports l2 mac/vlan based forwarding
>> having a VF representer is useful to configure the device and send/recv
>> some basic control packets (e.g. lldp). On these devices l2 mac/vlan
>> mode is the only one supported.
>>
>>>>>> Otherwise I didn't review the mlx code but read the commit msgs and
>>>>>> it looks good. I'll take a closer look in the morning.
>>>> appreciated
>>>>
^ permalink raw reply [flat|nested] 47+ messages in thread
* Re: [PATCH net-next 08/16] net/devlink: Add E-Switch mode control
2016-06-28 19:04 ` Samudrala, Sridhar
@ 2016-06-28 19:12 ` Jiri Pirko
2016-06-28 19:31 ` John Fastabend
0 siblings, 1 reply; 47+ messages in thread
From: Jiri Pirko @ 2016-06-28 19:12 UTC (permalink / raw)
To: Samudrala, Sridhar
Cc: John Fastabend, Or Gerlitz, Saeed Mahameed, David S. Miller,
netdev, Hadar Hen-Zion, Jiri Pirko, Andy Gospodarek,
Jesse Brandeburg, John Fastabend, Ido Schimmel
Tue, Jun 28, 2016 at 09:04:00PM CEST, sridhar.samudrala@intel.com wrote:
>
>
>On 6/28/2016 11:46 AM, Jiri Pirko wrote:
>>Tue, Jun 28, 2016 at 07:19:06PM CEST, john.fastabend@gmail.com wrote:
>>>On 16-06-28 09:19 AM, John Fastabend wrote:
>>>>On 16-06-28 03:25 AM, Or Gerlitz wrote:
>>>>>On 6/28/2016 8:57 AM, John Fastabend wrote:
>>>>>>On 16-06-27 09:07 AM, Saeed Mahameed wrote:
>>>>>>>Add the commands to set and show the mode of SRIOV E-Switch, two
>>>>>>>modes are supported:
>>>>>>>
>>>>>>>* legacy : operating in the "old" L2 based mode (DMAC --> VF vport)
>>>>>>>* offloads : offloading SW rules/policy (e.g Bridge/FDB or TC/Flows
>>>>>>>based) set by the host OS
>>>>>>>
>>>>>>>Nice work overall also I really appreciated that the core networking
>>>>>>>interfaces appear to able to support this without any change.
>>>>>thanks..
>>>>>
>>>>>>>On this patch though do we really need modes like this? My concern with
>>>>>>>modes is two fold. One its another knob that some controller will have
>>>>>>>to get right which I would prefer to avoid. And two I suspect switching
>>>>>>>between the two modes flushes the tables or leaves them in some
>>>>>>>unexpected state? At least I can't figure out what the expected should
>>>>>>>be off-hand.
>>>>>Re the 1st concern (another knob), I think we do want that, see below
>>>>>
>>>>>Re the 2nd concern, I will re-read the cover letter and change logs and
>>>>>if needed clarify/improve: the transition is clean! When you are moving
>>>>>from legacy to offloads or the other way around, nothing is left in
>>>>>unexpected state, all HW forwarding tables as filled by the current
>>>>>mode are flushed and next they are set as needed for the new mode.
>>>>>
>>>>OK if I had read the entire patch series maybe I would have caught this
>>>>:)
>>>>
>>>>>>>Could we instead continue to use the "legacy" mode by default by just
>>>>>>>populating the fdb table correctly and then if users want to enable
>>>>>>>the "offloads" mode they can modify the fdb tables by deleting entries
>>>>>>>or adding them or just extending the dmac/vf mapping via 'tc'. This
>>>>>>>would seem natural to me. The flooding rules in fdb might need to be
>>>>>>>exposed a bit more cleanly to get the right default flooding behavior
>>>>>>>etc. But to me at least this would be much cleaner. Everything will be
>>>>>>>nicely defined and we wont have issues with drivers doing slightly
>>>>>>>and subtle different defaults between legacy/offload and the transitions
>>>>>>>between the states or on resets or etc. If users need to discover the
>>>>>>>current configuration then they just query fdb, query tc, and the state
>>>>>>>is known no need for any magic toggle switch as best I can see.
>>>>>
>>>>>Few comments here:
>>>>>
>>>>>Each mode has it's own way of the driver doing setup for the HW tables
>>>>>and how population of the HW tables is done.
>>>>hmm so in the hardware I have there is actually a l2 table and various
>>>>other tables so I don't have any issue with doing table setup. I would
>>>>like to see a table_create/table_delete/table_show devlink commands at
>>>>some point though but I'm not there yet. This would allow users to
>>>>optimize the table slices if they cared to. But that is future work
>>>>IMO. Certainly not needed in this series at least. If you want I can
>>>>show you a patch I had for this against rocker but it was before devlink
>>>>so it would need some porting.
>>>>
>>>>>The offloads mode needs to create a black hole miss rule and
>>>>>send-to-vport rules and create the tables so they can contain later
>>>>>rules set by the kernel in a way which is HW/driver dependent.
>>>>Agreed a black hole miss rule needs to be applied but rather than apply
>>>>it automatically with some toggle I would prefer to just add a 'tc' rule
>>>>for this. Or alternatively it can be added by configuring flooding
>>>>ports so that only a single port is in the flooding mode. This could
>>>>all be done via 'bridge fdb ...' and 'bridge link ...' today I believe.
>>>>Then the user defines the state and not the driver writer. It really is
>>>>cleaner in my opinion.
>>>>
>>>>One oddball case I have is if I have two PF functions behind a single
>>>>network facing port. Yes its a bit strange but in this case its nice to
>>>>pick which host facing PF to flood on vs the driver picking one.
>>>>
>>>>And send-to-vport rules I'm not entirely clear on what these actually
>>>>are used for. Is this a rule to match packets sent from a VF representer
>>>>netdev to the actual VF pcie device? If this is the case its seems to
>>>>me that any packet sent on a VF representer should be sent to the VF
>>>>directly and these rules can be created when the VF is created. Or did
>>>>you mean some other rule by this?
>>>>
>>>>>The legacy mode creates the tables differently and populates them later
>>>>>with rule set by
>>>>>the driver and not the kernel.
>>>>>
>>>>>Even if we put the different table setup issue a side, I don't think it
>>>>>would be correct for bridge/tc to remove rules they didn't add, which is
>>>>>needed under your proposal when moving from legacy type rules to
>>>>>offloads mode. Querying is problematic too, since legacy could (and
>>>>>does) involve some default rules set by the FW, e.g that deals with
>>>>>outer world (== not belonging to VM on this host) MACs which are
>>>>>invisible to the driver.
>>>>But even legacy mode should report the correct fdb table and setup.
>>>>I don't think querying should be a problem if the driver reports the
>>>>configuration correctly. This allows us visibility into the driver
>>>>default case so we don't have to guess what driver X writer implemented.
>>>>
>>>>>That legacy was here and we can't avoid handling it properly for which
>>>>>this knob is needed. Note that a vendor can choose to put their default
>>>>>to be offloads, hopefully over time, we will all go there :)
>>>>>
>>>>But you can come up in legacy mode and report it via the existing
>>>>mechanisms 'tc', 'bridge', etc. and then users can transition to any
>>>>mode they like using the tools.
>>>>
>>>>I really don't think the switch here is necessary if you implement the
>>>>bridge hooks and tc hooks. cls_u32 can handle this for example and I
>>>>would expect flower can as well if you want to do mgmt via flow based
>>>>tc commands. And the bridge tool has the attributes for per port
>>>>flooding but not sure off-hand if its packed into the msg sent to the
>>>>driver. But we could fix that fairly easily in another patch series if
>>>>needed.
>>>>
>>>Actually with a bit more thought it might be nice to have a
>>>flag to enable/disable creation of vf netdev representer in case it
>>>somehow causes issues with existing software. We typically
>>>enable/disable features with ethtool feature flags though not via
>>>devlink so I think it would fit better as an ethtool flag same as
>>>all the other hardware features.
>>This is not a property of a netdevice, but a devlink device. That should
>>be a handle of creating/not creating representors. And I think that what
>>this patch is doing serves that purpose as well. For legacy mode, the
>>representors are not created, for offload/switchdev mode they are
>>created.
>
>Even in legacy mode, i think there is a value in creating VP representor
>netdevs.
Why?! Please, leave legacy be legacy. Use the new mode for implementing new
features. Don't make things any more complicated :(
>We are planning to expose VF statistics, ntuple filters, additional fdb,
>vlan entries via this
>netdev for VFs in the default mode.
>
>Isn't it possible to switch to offloads mode by deleting the 'legacy' flow
>rules and
>adding 'offloads' flow rules from userspace?
>
>
>>
>>Does not make sense to have this in ethtool one bit to me.
>>
>>
>>>The above points in the last mail are more about how it influences the
>>>forwarding rules in the switch and my preference would be that it
>>>doesn't change how the forwarding works in the switch and instead the
>>>forwarding state is managed via standard tools 'tc', 'bridge', etc. So
>>>I think my comments are still relevant. However as long as when we
>>>query the nic/switch we get the correct information back in any mode
>>>I'm not too concerned I suspect any software that actually uses this
>>>will have to query and reconfigure either way counting on driver
>>>writers to get policy correct is not a stable way to write usermode
>>>software.
>>>
>>>All that said I don't plan to change the forwarding state this way
>>>with the intel drivers when implementing the vf representer.
>>>
>>>Yet another reason not to change the state of the forwarding rules is
>>>even on older hardware that only supports l2 mac/vlan based forwarding
>>>having a VF representer is useful to configure the device and send/recv
>>>some basic control packets (e.g. lldp). On these devices l2 mac/vlan
>>>mode is the only one supported.
>>>
>>>>>>>Otherwise I didn't review the mlx code but read the commit msgs and
>>>>>>>it looks good. I'll take a closer look in the morning.
>>>>>appreciated
>>>>>
>
^ permalink raw reply [flat|nested] 47+ messages in thread
* Re: [PATCH net-next 08/16] net/devlink: Add E-Switch mode control
2016-06-28 19:12 ` Jiri Pirko
@ 2016-06-28 19:31 ` John Fastabend
2016-06-29 14:48 ` Or Gerlitz
0 siblings, 1 reply; 47+ messages in thread
From: John Fastabend @ 2016-06-28 19:31 UTC (permalink / raw)
To: Jiri Pirko, Samudrala, Sridhar
Cc: Or Gerlitz, Saeed Mahameed, David S. Miller, netdev,
Hadar Hen-Zion, Jiri Pirko, Andy Gospodarek, Jesse Brandeburg,
John Fastabend, Ido Schimmel
On 16-06-28 12:12 PM, Jiri Pirko wrote:
> Tue, Jun 28, 2016 at 09:04:00PM CEST, sridhar.samudrala@intel.com wrote:
>>
>>
>> On 6/28/2016 11:46 AM, Jiri Pirko wrote:
>>> Tue, Jun 28, 2016 at 07:19:06PM CEST, john.fastabend@gmail.com wrote:
>>>> On 16-06-28 09:19 AM, John Fastabend wrote:
>>>>> On 16-06-28 03:25 AM, Or Gerlitz wrote:
>>>>>> On 6/28/2016 8:57 AM, John Fastabend wrote:
>>>>>>> On 16-06-27 09:07 AM, Saeed Mahameed wrote:
>>>>>>>> Add the commands to set and show the mode of SRIOV E-Switch, two
>>>>>>>> modes are supported:
>>>>>>>>
>>>>>>>> * legacy : operating in the "old" L2 based mode (DMAC --> VF vport)
>>>>>>>> * offloads : offloading SW rules/policy (e.g Bridge/FDB or TC/Flows
>>>>>>>> based) set by the host OS
>>>>>>>>
>>>>>>>> Nice work overall also I really appreciated that the core networking
>>>>>>>> interfaces appear to be able to support this without any change.
>>>>>> thanks..
>>>>>>
>>>>>>>> On this patch though do we really need modes like this? My concern with
>>>>>>>> modes is two fold. One its another knob that some controller will have
>>>>>>>> to get right which I would prefer to avoid. And two I suspect switching
>>>>>>>> between the two modes flushes the tables or leaves them in some
>>>>>>>> unexpected state? At least I can't figure out what the expected should
>>>>>>>> be off-hand.
>>>>>> Re the 1st concern (another knob), I think we do want that, see below
>>>>>>
>>>>>> Re the 2nd concern, I will re-read the cover letter and change logs and
>>>>>> if needed clarify/improve: the transition is clean! When you are moving
>>>>>> from legacy to offloads or the other way around, nothing is left in
>>>>>> unexpected state, all HW forwarding tables as filled by the current
>>>>>> mode are flushed and next they are set as needed for the new mode.
>>>>>>
>>>>> OK if I had read the entire patch series maybe I would have caught this
>>>>> :)
>>>>>
>>>>>>>> Could we instead continue to use the "legacy" mode by default by just
>>>>>>>> populating the fdb table correctly and then if users want to enable
>>>>>>>> the "offloads" mode they can modify the fdb tables by deleting entries
>>>>>>>> or adding them or just extending the dmac/vf mapping via 'tc'. This
>>>>>>>> would seem natural to me. The flooding rules in fdb might need to be
>>>>>>>> exposed a bit more cleanly to get the right default flooding behavior
>>>>>>>> etc. But to me at least this would be much cleaner. Everything will be
>>>>>>>> nicely defined and we wont have issues with drivers doing slightly
>>>>>>>> and subtle different defaults between legacy/offload and the transitions
>>>>>>>> between the states or on resets or etc. If users need to discover the
>>>>>>>> current configuration then they just query fdb, query tc, and the state
>>>>>>>> is known no need for any magic toggle switch as best I can see.
>>>>>>
>>>>>> Few comments here:
>>>>>>
>>>>>> Each mode has its own way of the driver doing setup for the HW tables
>>>>>> and how population of the HW tables is done.
>>>>> hmm so in the hardware I have there is actually a l2 table and various
>>>>> other tables so I don't have any issue with doing table setup. I would
>>>>> like to see a table_create/table_delete/table_show devlink commands at
>>>>> some point though but I'm not there yet. This would allow users to
>>>>> optimize the table slices if they cared to. But that is future work
>>>>> IMO. Certainly not needed in this series at least. If you want I can
>>>>> show you a patch I had for this against rocker but it was before devlink
>>>>> so it would need some porting.
>>>>>
>>>>>> The offloads mode needs to create a black hole miss rule and
>>>>>> send-to-vport rules and create the tables so they can contain later
>>>>>> rules set by the kernel in a way which is HW/driver dependent.
>>>>> Agreed a black hole miss rule needs to be applied but rather than apply
>>>>> it automatically with some toggle I would prefer to just add a 'tc' rule
>>>>> for this. Or alternatively it can be added by configuring flooding
>>>>> ports so that only a single port is in the flooding mode. This could
>>>>> all be done via 'bridge fdb ...' and 'bridge link ...' today I believe.
>>>>> Then the user defines the state and not the driver writer. It really is
>>>>> cleaner in my opinion.
>>>>>
>>>>> One oddball case I have is if I have two PF functions behind a single
>>>>> network facing port. Yes it's a bit strange but in this case it's nice to
>>>>> pick which host facing PF to flood on vs the driver picking one.
>>>>>
>>>>> And send-to-vport rules I'm not entirely clear on what these actually
>>>>> are used for. Is this a rule to match packets sent from a VF representer
>>>>> netdev to the actual VF pcie device? If this is the case its seems to
>>>>> me that any packet sent on a VF representer should be sent to the VF
>>>>> directly and these rules can be created when the VF is created. Or did
>>>>> you mean some other rule by this?
>>>>>
>>>>>> The legacy mode creates the tables differently and populates them later
>>>>>> with rule set by
>>>>>> the driver and not the kernel.
>>>>>>
>>>>>> Even if we put the different table setup issue a side, I don't think it
>>>>>> would be correct for bridge/tc to remove rules they didn't add, which is
>>>>>> needed under your proposal when moving from legacy type rules to
>>>>>> offloads mode. Querying is problematic too, since legacy could (and
>>>>>> does) involve some default rules set by the FW, e.g that deals with
>>>>>> outer world (== not belonging to VM on this host) MACs which are
>>>>>> invisible to the driver.
>>>>> But even legacy mode should report the correct fdb table and setup.
>>>>> I don't think querying should be a problem if the driver reports the
>>>>> configuration correctly. This allows us visibility into the driver
>>>>> default case so we don't have to guess what driver X writer implemented.
>>>>>
>>>>>> That legacy was here and we can't avoid handling it properly for which
>>>>>> this knob is needed. Note that a vendor can choose to put their default
>>>>>> to be offloads, hopefully over time, we will all go there :)
>>>>>>
>>>>> But you can come up in legacy mode and report it via the existing
>>>>> mechanisms 'tc', 'bridge', etc. and then users can transition to any
>>>>> mode they like using the tools.
>>>>>
>>>>> I really don't think the switch here is necessary if you implement the
>>>>> bridge hooks and tc hooks. cls_u32 can handle this for example and I
>>>>> would expect flower can as well if you want to do mgmt via flow based
>>>>> tc commands. And the bridge tool has the attributes for per port
>>>>> flooding but not sure off-hand if its packed into the msg sent to the
>>>>> driver. But we could fix that fairly easily in another patch series if
>>>>> needed.
>>>>>
>>>> Actually with a bit more thought it might be nice to have a
>>>> flag to enable/disable creation of vf netdev representer in case it
>>>> somehow causes issues with existing software. We typically
>>>> enable/disable features with ethtool feature flags though not via
>>>> devlink so I think it would fit better as an ethtool flag same as
>>>> all the other hardware features.
>>> This is not a property of a netdevice, but a devlink device. That should
>>> be a handle of creating/not creating representors. And I think that what
>>> this patch is doing serves that purpose as well. For legacy mode, the
>>> representors are not created, for offload/switchdev mode they are
>>> created.
>>
>> Even in legacy mode, i think there is a value in creating VP representor
>> netdevs.
>
> Why?! Please, leave legacy be legacy. Use the new mode for implementing new
> features. Don't make things any more complicated :(
>
OK so how I read this is there are two things going on that are being
conflated together. Creating VF netdev's is linked to the PCIe
subsystems and brings VFs into the netdev model. This is a good thing
but doesn't need to be a global nic policy it can be per port hence
the ethtool flag vs devlink discussion. I don't actually have a use case
to have one port with VF netdevs and another without it so I'm not too
particular on this. Logically it looks like a per port setting because
the hardware has no issues with making one physical function create
a netdev for each of its VFs and the other one run without these
netdevs. This is why I called it out.
How this relates to bridge, tc, etc. is now you have a identifier to
configure instead of using strange 'ip link set ... vf#' commands. This
is great. But I see no reason the hardware has to make changes to
the existing tables or any of this. Before we used 'bridge fdb' and 'ip
link' now we can use bridge tools more effectively and can deprecate
the overloaded use of ip. But again I see no reason to thrash the
forwarding state of the switch because we happen to be adding VFs.
Having a set of fdb rules to forward MAC/Vlan pairs (as we do now)
seems like a perfectly reasonable default. And with this patch now
when I run 'fdb show' I can see the defaults.
Maybe I'm reading too much into the devlink flag names and if instead
you use a switch like the following,
VF representer : enable/disable the creation of VF netdevs to represent
the virtual functions on the PF
Much less complicated than magic switching between forwarding logic IMO
and you don't whack a default configuration that an entire stack (e.g.
libvirt) has been built to use.
>
>> We are planning to expose VF statistics, ntuple filters, additional fdb,
>> vlan entries via this
>> netdev for VFs in the default mode.
>>
>> Isn't it possible to switch to offloads mode by deleting the 'legacy' flow
>> rules and
>> adding 'offloads' flow rules from userspace?
>>
>>
>>>
>>> Does not make sense to have this in ethtool one bit to me.
>>>
>>>
>>>> The above points in the last mail are more about how it influences the
>>>> forwarding rules in the switch and my preference would be that it
>>>> doesn't change how the forwarding works in the switch and instead the
>>>> forwarding state is managed via standard tools 'tc', 'bridge', etc. So
>>>> I think my comments are still relevant. However as long as when we
>>>> query the nic/switch we get the correct information back in any mode
>>>> I'm not too concerned I suspect any software that actually uses this
>>>> will have to query and reconfigure either way counting on driver
>>>> writers to get policy correct is not a stable way to write usermode
>>>> software.
>>>>
>>>> All that said I don't plan to change the forwarding state this way
>>>> with the intel drivers when implementing the vf representer.
>>>>
>>>> Yet another reason not to change the state of the forwarding rules is
>>>> even on older hardware that only supports l2 mac/vlan based forwarding
>>>> having a VF representer is useful to configure the device and send/recv
>>>> some basic control packets (e.g. lldp). On these devices l2 mac/vlan
>>>> mode is the only one supported.
>>>>
>>>>>>>> Otherwise I didn't review the mlx code but read the commit msgs and
>>>>>>>> it looks good. I'll take a closer look in the morning.
>>>>>> appreciated
>>>>>>
>>
^ permalink raw reply [flat|nested] 47+ messages in thread
* Re: [PATCH net-next 08/16] net/devlink: Add E-Switch mode control
2016-06-28 19:31 ` John Fastabend
@ 2016-06-29 14:48 ` Or Gerlitz
2016-06-29 16:35 ` John Fastabend
0 siblings, 1 reply; 47+ messages in thread
From: Or Gerlitz @ 2016-06-29 14:48 UTC (permalink / raw)
To: John Fastabend, Jiri Pirko
Cc: Samudrala, Sridhar, Saeed Mahameed, David S. Miller, netdev,
Hadar Hen-Zion, Jiri Pirko, Andy Gospodarek, Jesse Brandeburg,
John Fastabend, Ido Schimmel, Tal Anker
On 6/28/2016 10:31 PM, John Fastabend wrote:
> On 16-06-28 12:12 PM, Jiri Pirko wrote:
>>
>> Why?! Please, leave legacy be legacy. Use the new mode for implementing new features. Don't make things any more complicated :(
>>
> OK so how I read this is there are two things going on that are being
> conflated together. Creating VF netdev's is linked to the PCIe
> subsystems and brings VFs into the netdev model. This is a good thing
> but doesn't need to be a global nic policy it can be per port hence
> the ethtool flag vs devlink discussion. I don't actually have a use case
> to have one port with VF netdevs and another without it so I'm not too
> particular on this. Logically it looks like a per port setting because
> the hardware has no issues with making one physical function create
> a netdev for each of its VFs and the other one run without these
> netdevs. This is why I called it out.
>
> How this relates to bridge, tc, etc. is now you have a identifier to
> configure instead of using strange 'ip link set ... vf#' commands. This
> is great. But I see no reason the hardware has to make changes to
> the existing tables or any of this. Before we used 'bridge fdb' and 'ip
> link' now we can use bridge tools more effectively and can deprecate
> the overloaded use of ip. But again I see no reason to thrash the
> forwarding state of the switch because we happen to be adding VFs.
> Having a set of fdb rules to forward MAC/Vlan pairs (as we do now)
> seems like a perfectly reasonable default. Add with this patch now
> when I run 'fdb show' I can see the defaults.
>
> Maybe I'm reading to much into the devlink flag names and if instead
> you use a switch like the following,
>
> VF representer : enable/disable the creation VF netdev's to represent
> the virtual functions on the PF
>
>
> Much less complicated then magic switching between forwarding logic IMO
> and you don't whack a default configuration that an entire stack (e.g.
> libvirt) has been built to use.
John,
I'll try to address here the core questions and arguments you brought.
Re letting the user to observe/modify the rules added by the
driver/firmware while legacy mode. Even if possible with bridge/fdb, it
will be really pragmatical and doesn't make sense to get that done for
the TC subsystem. So this isn't a well defined solution and anyway, as
you said, legacy mode enhancements is a different exercise. Personally,
I agree with Jiri, that we should let legacy be legacy and focus on adding
the new model.
The new model has a few building blocks, and by all means, having the VF
representors is not the full story, which is not magic but rather the
following:
1. VF (vport) representors netdevices + the needed mechanics
(send-to-vport rules that makes xmit on VF rep --> recv on VF)
2. handling HW data-path misses --> send to CPU or drop
3. ability to offload SW rules (tc/bridge/etc) using VF representors and
ingress qdiscs / bridge fdb rules / switchdev fdb rule, etc
The knob we suggested says that the system is put into a state where
1,2,3 are needed to make it full performance and functional one. This
submission includes parts 1 and 2, so the offloading of SW rules will be
done in a successive submission which uses TC offloads which are already
upstream (u32 or flower).
So... we're almost in agreement, do you have another name for the knob
that goes beyond creation/deletion of VF reps? maybe that would be it
for making a progress...
Or.
^ permalink raw reply [flat|nested] 47+ messages in thread
* Re: [PATCH net-next 08/16] net/devlink: Add E-Switch mode control
2016-06-29 14:48 ` Or Gerlitz
@ 2016-06-29 16:35 ` John Fastabend
2016-06-29 21:33 ` Or Gerlitz
0 siblings, 1 reply; 47+ messages in thread
From: John Fastabend @ 2016-06-29 16:35 UTC (permalink / raw)
To: Or Gerlitz, Jiri Pirko
Cc: Samudrala, Sridhar, Saeed Mahameed, David S. Miller, netdev,
Hadar Hen-Zion, Jiri Pirko, Andy Gospodarek, Jesse Brandeburg,
John Fastabend, Ido Schimmel, Tal Anker
On 16-06-29 07:48 AM, Or Gerlitz wrote:
> On 6/28/2016 10:31 PM, John Fastabend wrote:
>> On 16-06-28 12:12 PM, Jiri Pirko wrote:
>>>
>>> Why?! Please, leave legacy be legacy. Use the new mode for
>>> implementing new features. Don't make things any more complicated :(
>>>
>> OK so how I read this is there are two things going on that are being
>> conflated together. Creating VF netdev's is linked to the PCIe
>> subsystems and brings VFs into the netdev model. This is a good thing
>> but doesn't need to be a global nic policy it can be per port hence
>> the ethtool flag vs devlink discussion. I don't actually have a use case
>> to have one port with VF netdevs and another without it so I'm not too
>> particular on this. Logically it looks like a per port setting because
>> the hardware has no issues with making one physical function create
>> a netdev for each of its VFs and the other one run without these
>> netdevs. This is why I called it out.
>>
>> How this relates to bridge, tc, etc. is now you have a identifier to
>> configure instead of using strange 'ip link set ... vf#' commands. This
>> is great. But I see no reason the hardware has to make changes to
>> the existing tables or any of this. Before we used 'bridge fdb' and 'ip
>> link' now we can use bridge tools more effectively and can deprecate
>> the overloaded use of ip. But again I see no reason to thrash the
>> forwarding state of the switch because we happen to be adding VFs.
>> Having a set of fdb rules to forward MAC/Vlan pairs (as we do now)
>> seems like a perfectly reasonable default. Add with this patch now
>> when I run 'fdb show' I can see the defaults.
>>
>> Maybe I'm reading to much into the devlink flag names and if instead
>> you use a switch like the following,
>>
>> VF representer : enable/disable the creation VF netdev's to represent
>> the virtual functions on the PF
>>
>>
>> Much less complicated then magic switching between forwarding logic IMO
>> and you don't whack a default configuration that an entire stack (e.g.
>> libvirt) has been built to use.
>
>
> John,
>
> I'll try to address here the core questions and arguments you brought.
>
thanks. Also just to reiterate I really like the series just a few
details here.
> Re letting the user to observe/modify the rules added by the
> driver/firmware while legacy mode. Even if possible with bridge/fdb, it
> will be really pragmatical and doesn't make sense to get that donefor
> the TC subsystem. So this isn't a well defined solution and anyway, as
> you said, legacy mode enhancements is a different exercise. Personally,
> I agree with Jiri, that we should legacy be legacyand focus on adding
> the new model.
>
The ixgbe driver already supports bridge and tc commands without the VF
representer. Adding the VF representer to these drivers just extends
the existing support so we have an identifier for VFs and now the
redirect action works and the fdb commands can specify the VF netdevs.
I don't see this as a problem because we already do it today with
'ip' and bridge tools.
We are also slightly in disagreement about what the default should be
with VF netdevs. I think the default should be the same L2 mac/vlan
switch behavior and see no reason to change it by default just because
we added VF netdevs. The infrastructure libvirt/openstack/etc are built
around this default today. But I guess nothing in this series specifies
what the defaults of any given driver will be. VF netdevs are still
useful even on older hardware that only supports mac/vlan forwarding to
expose statistics and send/receive control frames such as lldp.
> The new model has few building blocks, and by all means, have the VF
> representors is not the full story, which is not magic but rather the
> following:
>
> 1. VF (vport) representors netdevices + the needed mechanics
> (send-to-vport rules that makes xmit on VF rep --> recv on VF)
>
We all agree on this. For me this should be its own knob VF netdevs or
no VF netdevs.
There is also my point that this is really a port attribute of the PCIe
configuration not a switch attribute.
> 2. handling HW data-patch misses --> send to CPU or drop
Yep need this also but we have a standard way to configure this already
with bridge and 'tc' so why have a toggle for it? Also you don't know
in the driver where I want to send missed packets. In some use cases I
have the VM is managing the system and in these cases I want to send
missed packets to a VF.
In ixgbe we get this for free (with the vf identifier netdevs) because
we have 'tc' and 'bridge' already hooked up. With 'tc' you can define
a wild card match with low priority and with 'bridge' model you can
setup the flood ports to do this.
>
> 3. ability to offload SW rules (tc/bridge/etc) using VF representors and
> ingress qdiscs / bridge fdb rules / switchdev fdb rule, etc
>
> The knob we suggested says that the system is put into a state where
> 1,2,3 are needed to make it full performance and functional one. This
> submission includes parts 1 and 2, so the offloading of SW rules will
> done in successive submission which uses TC offloads which are already
> upstream (u32 or flower).
>
> So... we're almost in agreement, do you have another name for the knob
> that goes beyond creation/deletion of VF reps? maybe that would be it
> for making a progress...
The sticking point for me is that (2) is not needed if you do (3)
correctly. Once you have implemented bridge and one of the 'tc'
classifiers, they can be used to specify the policy in (2) and you don't
have a chunk of policy being defined by the driver writer.
Just to put out an alternative if you add an ethtool feature flag 'VF
representer' so that I can specify enable/disable of VFs per port that
would resolve my concerns.
If you have this additional switch in devlink to hammer the datapath
between two switch modes that seems OK but I'm not sure who else other
than mlx drivers would use it. Additionally if you just used this
devlink hook to set the feature flag on each port and made it 'fixed'
from an ethtool perspective that would work for me as well. Then on
my devices that support VF representers per port I can configure it
and on the devices that can only do it globally it is configured with
this devlink thing.
Why I think the VF representer is a per port ethtool flag and not a
devlink option is my use case might be to assign a PF into a VM or
namespace where I don't want VF netdevs.
Thanks,
.John
>
> Or.
>
^ permalink raw reply [flat|nested] 47+ messages in thread
* Re: [PATCH net-next 08/16] net/devlink: Add E-Switch mode control
2016-06-29 16:35 ` John Fastabend
@ 2016-06-29 21:33 ` Or Gerlitz
2016-06-29 22:09 ` John Fastabend
0 siblings, 1 reply; 47+ messages in thread
From: Or Gerlitz @ 2016-06-29 21:33 UTC (permalink / raw)
To: John Fastabend
Cc: Or Gerlitz, Jiri Pirko, Samudrala, Sridhar, Saeed Mahameed,
David S. Miller, Linux Netdev List, Hadar Hen-Zion, Jiri Pirko,
Andy Gospodarek, Jesse Brandeburg, John Fastabend, Ido Schimmel,
Tal Anker
On Wed, Jun 29, 2016 at 7:35 PM, John Fastabend
<john.fastabend@gmail.com> wrote:
> On 16-06-29 07:48 AM, Or Gerlitz wrote:
>> On 6/28/2016 10:31 PM, John Fastabend wrote:
>>> On 16-06-28 12:12 PM, Jiri Pirko wrote:
>>>> Why?! Please, leave legacy be legacy. Use the new mode for
>>>> implementing new features. Don't make things any more complicated :(
[...]
>>> Maybe I'm reading to much into the devlink flag names and if instead
>>> you use a switch like the following,
>>> VF representer : enable/disable the creation VF netdev's to represent
>>> the virtual functions on the PF
>>> Much less complicated then magic switching between forwarding logic IMO
>>> and you don't whack a default configuration that an entire stack (e.g.
>>> libvirt) has been built to use.
>> Re letting the user to observe/modify the rules added by the
>> driver/firmware while legacy mode. Even if possible with bridge/fdb, it
>> will be really pragmatical and doesn't make sense to get that donefor
>> the TC subsystem. So this isn't a well defined solution and anyway, as
>> you said, legacy mode enhancements is a different exercise. Personally,
>> I agree with Jiri, that we should legacy be legacyand focus on adding
>> the new model.
> The ixgbe driver already supports bridge and tc commands without the VF
> representer. Adding the VF representer to these drivers just extends
> the existing support so we have an identifier for VFs and now the
> redirect action works and the fdb commands can specify the VF netdevs.
> I don't see this as a problem because we already do it today with
> 'ip' and bridge tools.
To be precise, for both ixgbe and mlx5, the existing tc support
(u32/ixgbe, flower/mlx5) is not for switching functionality but rather
for NIC-ish one, e.g drop, mark, etc. Indeed in ixgbe you added
redirect to VF, but this is only for south --> north (wire --> VF)
traffic, w.o the VF rep you can't do the other way around.
Just to clarify, to what exact bridge command support did you refer for ixgbe?
The forwarding done in the legacy mode is not well defined, and
different across vendors, adding there the VF reps will not make it
any better b/c some steering rules will be set by tc/bridge offloads
while other rules will be put by the driver.
I don't see how this takes us to better place.
> We are also slightly in disagreement about what the default should be
> with VF netdevs. I think the default should be the same L2 mac/vlan
> switch behavior and see no reason to change it by default just because
> we added VF netdevs. The infrastructure libvirt/openstack/etc are built
> around this default today. But I guess nothing in this series specifies
> what the defaults of any given driver will be. VF netdevs are still
> useful even on older hardware that only supports mac/vlan forwarding to
> expose statistics and send/receive control frames such as lldp.
Again, this is not about default engineering... and using the VF reps
(not VF netdevs) in legacy mode only makes it more cryptic in my
opinion. I agree some changes would be needed in openstack to support
the new model, but this is how progress is made... you can't always
keep all layers above you unchanged. Note that the VF reps behave the
same as tap devices (v-switch doing xmit on tap --> recv in VM, VM
sends --> recv on tap into the v-switch), so the change in open-stack
would not be that big.
[...]
> Why I think the VF representer is a per port ethtool flag and not a
> devlink option is my use case might be to assign a PF into a VM or
> namespace where I don't want VF netdevs.
again, we think the correct place to set how the eswitch is managed is
through eswitch manager PCI devices and not net devices and hence
ethtool is not the way to go.
Also, how do you want your e-switch to be managed in this case?
^ permalink raw reply [flat|nested] 47+ messages in thread
* Re: [PATCH net-next 08/16] net/devlink: Add E-Switch mode control
2016-06-29 21:33 ` Or Gerlitz
@ 2016-06-29 22:09 ` John Fastabend
2016-06-30 3:35 ` John Fastabend
0 siblings, 1 reply; 47+ messages in thread
From: John Fastabend @ 2016-06-29 22:09 UTC (permalink / raw)
To: Or Gerlitz
Cc: Or Gerlitz, Jiri Pirko, Samudrala, Sridhar, Saeed Mahameed,
David S. Miller, Linux Netdev List, Hadar Hen-Zion, Jiri Pirko,
Andy Gospodarek, Jesse Brandeburg, John Fastabend, Ido Schimmel,
Tal Anker
On 16-06-29 02:33 PM, Or Gerlitz wrote:
> On Wed, Jun 29, 2016 at 7:35 PM, John Fastabend
> <john.fastabend@gmail.com> wrote:
>> On 16-06-29 07:48 AM, Or Gerlitz wrote:
>>> On 6/28/2016 10:31 PM, John Fastabend wrote:
>>>> On 16-06-28 12:12 PM, Jiri Pirko wrote:
>
>>>>> Why?! Please, leave legacy be legacy. Use the new mode for
>>>>> implementing new features. Don't make things any more complicated :(
>
> [...]
>>>> Maybe I'm reading to much into the devlink flag names and if instead
>>>> you use a switch like the following,
>
>>>> VF representer : enable/disable the creation VF netdev's to represent
>>>> the virtual functions on the PF
>
>>>> Much less complicated then magic switching between forwarding logic IMO
>>>> and you don't whack a default configuration that an entire stack (e.g.
>>>> libvirt) has been built to use.
>
>>> Re letting the user to observe/modify the rules added by the
>>> driver/firmware while legacy mode. Even if possible with bridge/fdb, it
>>> will be really pragmatical and doesn't make sense to get that donefor
>>> the TC subsystem. So this isn't a well defined solution and anyway, as
>>> you said, legacy mode enhancements is a different exercise. Personally,
>>> I agree with Jiri, that we should legacy be legacyand focus on adding
>>> the new model.
>
>> The ixgbe driver already supports bridge and tc commands without the VF
>> representer. Adding the VF representer to these drivers just extends
>> the existing support so we have an identifier for VFs and now the
>> redirect action works and the fdb commands can specify the VF netdevs.
>> I don't see this as a problem because we already do it today with
>> 'ip' and bridge tools.
>
> To be precise, for both ixgbe and mlx5, the existing tc support
> (u32/ixgbe, flower/mlx5) is not for switching functionality but rather
> for NIC-ish one, e.g drop, mark, etc. Indeed in ixgbe you added
> redirect to VF, but this is only for south --> north (wire --> VF)
> traffic, w.o the VF rep you can't do the other way around.
>
Correct which is why we need the VF rep. So we are completely in
sync there.
> Just to clarify, to what exact bridge command support did you refer for ixgbe?
'bridge fdb' commands are supported today on the PF. But it's the
same story as above: we need the VF rep to also use it on the
VF representer.
Also 'bridge link' command for veb/vepa modes is supported and the
other link attributes could be supported with additional driver
support. No need for core changes here. But again yes only on the
PF so again we need the VF reps.
>
> The forwarding done in the legacy mode is not well defined, and
> different across vendors, adding there the VF reps will not make it
> any better b/c some steering rules will be set by tc/bridge offloads
> while other rules will be put by the driver.
> I don't see how this takes us to better place.
In legacy mode or any other mode you are defining some default policy
and rules.
In the legacy mode we use mac/vlan assigned l2 forwarding entries in the
hardware fdb which are seen when you query 'ip link' and 'bridge fdb'
today. And similarly can be modified today using 'ip link' and 'bridge
fdb' at least on the intel devices. It's not undefined in any way with
a quick query of the tools we can learn exactly what the configuration
is and even change it. This works fairly well with existing controllers
and stacks.
The limitations are 'ip' only supports a single MAC address per VF and
'tc' doesn't work on VF ports because when the VF is assigned to a VM
or namespace we lose visibility of it. Providing a VF rep for this
solves both of those problems.
In this new mode the default policy is to create a default miss rule
and implement no l2 forwarding rules. Unfortunately not all hardware
in use supports this default miss rule case but would still benefit
from having a VF rep. So we shouldn't make this a stipulation for
enabling VF reps. It also changes a default policy that has been in
place for years without IMO at least any compelling reason. It will
be easy enough to change the default l2 policy to a flow based model
with a few bridge/tc commands.
>
>> We are also slightly in disagreement about what the default should be
>> with VF netdevs. I think the default should be the same L2 mac/vlan
>> switch behavior and see no reason to change it by default just because
>> we added VF netdevs. The infrastructure libvirt/openstack/etc are built
>> around this default today. But I guess nothing in this series specifies
>> what the defaults of any given driver will be. VF netdevs are still
>> useful even on older hardware that only supports mac/vlan forwarding to
>> expose statistics and send/receive control frames such as lldp.
>
> Again, this is not about default engineering... and using the VF reps
> (not VF netdevs) in legacy mode only make it more cryptic to my
> opinion. I agree some changes would be needed in openstack to support
> the new model, but this is how progress is made... you can't always
> make all layer above you unchanged. Note that the VF reps behave the
> same as tap devices (v-switch doing xmit on tap --> recv in VM, VM
> sends --> recv on tap into the v-switch), so the change in open-stack
> would not be that big.
>
But in this case we have no reason to break the stack above us. The
currently deployed usage is L2 mac/vlan. As soon as you bind a vSwitch
or whatever mgmt agent to the device it can go ahead and manage the
switch putting it in the correct mode using the tooling in 'bridge' and
'tc'.
> [...]
>
>> Why I think the VF representer is a per port ethtool flag and not a
>> devlink option is my use case might be to assign a PF into a VM or
>> namespace where I don't want VF netdevs.
>
> again, we think the correct place to set how the eswitch is managed is
> through eswitch manager PCI devices and not net devices and hence
> ethtool is not the way to go.
>
> Also, how do you want your e-switch to be managed in this case?
>
In the case where I don't create vf netdevs on one of the PFs I'll
manage the forwarding tables via the existing mechanisms 'ip' and
'bridge'. However, it's likely not a big deal because 'ip' and 'bridge'
will continue to work even if VF reps are around. The ethtool/devlink
comment was more about pointing out that creating VFs does not
require you to manage your switch any differently. It's useful even on
devices that can't support flow based forwarding for statistics and
setting port attributes like mtu, etc.
.John
^ permalink raw reply [flat|nested] 47+ messages in thread
* Re: [PATCH net-next 08/16] net/devlink: Add E-Switch mode control
2016-06-29 22:09 ` John Fastabend
@ 2016-06-30 3:35 ` John Fastabend
2016-06-30 4:04 ` John Fastabend
0 siblings, 1 reply; 47+ messages in thread
From: John Fastabend @ 2016-06-30 3:35 UTC (permalink / raw)
To: Or Gerlitz
Cc: Or Gerlitz, Jiri Pirko, Samudrala, Sridhar, Saeed Mahameed,
David S. Miller, Linux Netdev List, Hadar Hen-Zion, Jiri Pirko,
Andy Gospodarek, Jesse Brandeburg, John Fastabend, Ido Schimmel,
Tal Anker
On 16-06-29 03:09 PM, John Fastabend wrote:
> On 16-06-29 02:33 PM, Or Gerlitz wrote:
>> On Wed, Jun 29, 2016 at 7:35 PM, John Fastabend
>> <john.fastabend@gmail.com> wrote:
>>> On 16-06-29 07:48 AM, Or Gerlitz wrote:
>>>> On 6/28/2016 10:31 PM, John Fastabend wrote:
>>>>> On 16-06-28 12:12 PM, Jiri Pirko wrote:
>>
>>>>>> Why?! Please, leave legacy be legacy. Use the new mode for
>>>>>> implementing new features. Don't make things any more complicated :(
>>
>> [...]
>>>>> Maybe I'm reading to much into the devlink flag names and if instead
>>>>> you use a switch like the following,
>>
>>>>> VF representer : enable/disable the creation VF netdev's to represent
>>>>> the virtual functions on the PF
>>
>>>>> Much less complicated then magic switching between forwarding logic IMO
>>>>> and you don't whack a default configuration that an entire stack (e.g.
>>>>> libvirt) has been built to use.
>>
>>>> Re letting the user to observe/modify the rules added by the
>>>> driver/firmware while legacy mode. Even if possible with bridge/fdb, it
>>>> will be really pragmatical and doesn't make sense to get that done for
>>>> the TC subsystem. So this isn't a well defined solution and anyway, as
>>>> you said, legacy mode enhancements is a different exercise. Personally,
>>>> I agree with Jiri, that we should legacy be legacy and focus on adding
>>>> the new model.
>>
>>> The ixgbe driver already supports bridge and tc commands without the VF
>>> representer. Adding the VF representer to these drivers just extends
>>> the existing support so we have an identifier for VFs and now the
>>> redirect action works and the fdb commands can specify the VF netdevs.
>>> I don't see this as a problem because we already do it today with
>>> 'ip' and bridge tools.
>>
>> To be precise, for both ixgbe and mlx5, the existing tc support
>> (u32/ixgbe, flower/mlx5) is not for switching functionality but rather
>> for NIC-ish one, e.g drop, mark, etc. Indeed in ixgbe you added
>> redirect to VF, but this is only for south --> north (wire --> VF)
>> traffic, w.o the VF rep you can't do the other way around.
>>
>
> Correct which is why we need the VF rep. So we are completely in
> sync there.
>
>> Just to clarify, to what exact bridge command support did you refer for ixgbe?
>
> 'bridge fdb' commands are supported today on the PF. But its the
> same story as above we need the VF rep to also use it on the
> VF representer
>
> Also 'bridge link' command for veb/vepa modes is supported and the
> other link attributes could be supported with additional driver
> support. No need for core changes here. But again yes only on the
> PF so again we need the VF reps.
>
>>
>> The forwarding done in the legacy mode is not well defined, and
>> different across vendors, adding there the VF reps will not make it
>> any better b/c some steering rules will be set by tc/bridge offloads
>> while other rules will be put by the driver.
>> I don't see how this takes us to better place.
>
> In legacy mode or any other mode you are defining some default policy
> and rules.
>
> In the legacy mode we use mac/vlan assigned l2 forwarding entries in the
> hardware fdb which are seen when you query 'ip link' and 'bridge fdb'
> today. And similarly can be modified today using 'ip link' and 'bridge
> fdb' at least on the intel devices. Its not undefined in any way with
> a quick query of the tools we can learn exactly what the configuration
> is and even change it. This works fairly well with existing controllers
> and stacks.
>
> The limitations are 'ip' only supports a single MAC address per VF and
> 'tc' doesn't work on VF ports because when the VF is assigned to a VM
> or namespace we lose visibility of it. Providing a VF rep for this
> solves both of those problems.
>
> In this new mode the default policy is to create a default miss rule
> and implement no l2 forwarding rules. Unfortunately not all hardware
> in use supports this default miss rule case but would still benefit
> from having a VF rep. So we shouldn't make this a stipulation for
> enabling VF reps. It also changes a default policy that has been in
> place for years without IMO at least any compelling reason. It will
> be easy enough to change the default l2 policy to a flow based model
> with a few bridge/tc commands.
>
>>
>>> We are also slightly in disagreement about what the default should be
>>> with VF netdevs. I think the default should be the same L2 mac/vlan
>>> switch behavior and see no reason to change it by default just because
>>> we added VF netdevs. The infrastructure libvirt/openstack/etc are built
>>> around this default today. But I guess nothing in this series specifies
>>> what the defaults of any given driver will be. VF netdevs are still
>>> useful even on older hardware that only supports mac/vlan forwarding to
>>> expose statistics and send/receive control frames such as lldp.
>>
>> Again, this is not about default engineering... and using the VF reps
>> (not VF netdevs) in legacy mode only make it more cryptic to my
>> opinion. I agree some changes would be needed in openstack to support
>> the new model, but this is how progress is made... you can't always
>> make all layer above you unchanged. Note that the VF reps behave the
>> same as tap devices (v-switch doing xmit on tap --> recv in VM, VM
>> sends --> recv on tap into the v-switch), so the change in open-stack
>> would not be that big.
>>
>
> But in this case we have no reason to break the stack above us. The
> currently deployed usage is L2 mac/vlan. As soon as you bind a vSwitch
> or whatever mgmt agent to the device it can go ahead and manage the
> switch putting it in the correct mode using the tooling in 'bridge' and
> 'tc'.
>
>
>> [...]
>>
>>> Why I think the VF representer is a per port ethtool flag and not a
>>> devlink option is my use case might be to assign a PF into a VM or
>>> namespace where I don't want VF netdevs.
>>
>> again, we think the correct place to set how the eswitch is managed is
>> through eswitch manager PCI devices and not net devices and hence
>> ethtool is not the way to go.
>>
>> Also, how do you want your e-switch to be managed in this case?
>>
>
> In the case where I don't create vf netdevs on one of the PFs I'll
> manage the forwarding tables via the existing mechanisms 'ip' and
> 'bridge'. However its likely not a big deal because 'ip' and 'bridge'
> will continue to work even if VF reps are around. The ethtool/devlink
> comment was more about pointing out that creating VFs does not
> require you to manage your switch any differently. Its useful even on
> devices that can't support flow based forwarding for statistics and
> setting port attributes like mtu, etc.
>
> .John
>
Probably bad form to respond to my own email, but just to highlight how
subtle the distinction is (hopefully not too much of a repeat),
Today in "legacy" mode each VF mac address is automatically added to
the fdb along with the PF mac address. If there is a miss in the table
(an unknown mac) the packet is sent to the PF but unless the PF is in
promisc mode the packet is dropped by the rx filter. I presume even
with the proposed model you would want to continue to enforce the
rx filter; otherwise, the instant you flip the mode, you are open to
receive unwanted traffic. The promisc mode semantics have been in place
for a long time so certainly don't want to break that. Can we agree on
the promisc point? Also bridges/vswitch/etc already set promisc mode
once they attach to the netdevs.
(assuming we agree on the promisc point?)
In your proposed model the only difference I can see is when the mode is
changed you don't want to add the VF mac address to the fdb table. How
about, rather than making this part of the mode selection, we pick one way
to do this in all cases: either add the VF mac addresses to the fdb or
do not do this. I have a preference for adding the VF mac addresses
because this is the current behavior. Then rename the devlink option
"VF reps" or something because that is what it is controlling.
The last thing to argue about is whether it's a port attribute a la ethtool
or a device attribute ala devlink. But maybe we can agree on everything
up to this point?
Thanks,
John
^ permalink raw reply [flat|nested] 47+ messages in thread
* Re: [PATCH net-next 08/16] net/devlink: Add E-Switch mode control
2016-06-30 3:35 ` John Fastabend
@ 2016-06-30 4:04 ` John Fastabend
2016-06-30 6:25 ` Jiri Pirko
0 siblings, 1 reply; 47+ messages in thread
From: John Fastabend @ 2016-06-30 4:04 UTC (permalink / raw)
To: Or Gerlitz
Cc: Or Gerlitz, Jiri Pirko, Samudrala, Sridhar, Saeed Mahameed,
David S. Miller, Linux Netdev List, Hadar Hen-Zion, Jiri Pirko,
Andy Gospodarek, Jesse Brandeburg, John Fastabend, Ido Schimmel,
Tal Anker
On 16-06-29 08:35 PM, John Fastabend wrote:
> On 16-06-29 03:09 PM, John Fastabend wrote:
>> On 16-06-29 02:33 PM, Or Gerlitz wrote:
>>> On Wed, Jun 29, 2016 at 7:35 PM, John Fastabend
>>> <john.fastabend@gmail.com> wrote:
>>>> On 16-06-29 07:48 AM, Or Gerlitz wrote:
>>>>> On 6/28/2016 10:31 PM, John Fastabend wrote:
>>>>>> On 16-06-28 12:12 PM, Jiri Pirko wrote:
>>>
>>>>>>> Why?! Please, leave legacy be legacy. Use the new mode for
>>>>>>> implementing new features. Don't make things any more complicated :(
>>>
>>> [...]
>>>>>> Maybe I'm reading to much into the devlink flag names and if instead
>>>>>> you use a switch like the following,
>>>
>>>>>> VF representer : enable/disable the creation VF netdev's to represent
>>>>>> the virtual functions on the PF
>>>
>>>>>> Much less complicated then magic switching between forwarding logic IMO
>>>>>> and you don't whack a default configuration that an entire stack (e.g.
>>>>>> libvirt) has been built to use.
>>>
>>>>> Re letting the user to observe/modify the rules added by the
>>>>> driver/firmware while legacy mode. Even if possible with bridge/fdb, it
>>>>> will be really pragmatical and doesn't make sense to get that done for
>>>>> the TC subsystem. So this isn't a well defined solution and anyway, as
>>>>> you said, legacy mode enhancements is a different exercise. Personally,
>>>>> I agree with Jiri, that we should legacy be legacy and focus on adding
>>>>> the new model.
>>>
>>>> The ixgbe driver already supports bridge and tc commands without the VF
>>>> representer. Adding the VF representer to these drivers just extends
>>>> the existing support so we have an identifier for VFs and now the
>>>> redirect action works and the fdb commands can specify the VF netdevs.
>>>> I don't see this as a problem because we already do it today with
>>>> 'ip' and bridge tools.
>>>
>>> To be precise, for both ixgbe and mlx5, the existing tc support
>>> (u32/ixgbe, flower/mlx5) is not for switching functionality but rather
>>> for NIC-ish one, e.g drop, mark, etc. Indeed in ixgbe you added
>>> redirect to VF, but this is only for south --> north (wire --> VF)
>>> traffic, w.o the VF rep you can't do the other way around.
>>>
>>
>> Correct which is why we need the VF rep. So we are completely in
>> sync there.
>>
>>> Just to clarify, to what exact bridge command support did you refer for ixgbe?
>>
>> 'bridge fdb' commands are supported today on the PF. But its the
>> same story as above we need the VF rep to also use it on the
>> VF representer
>>
>> Also 'bridge link' command for veb/vepa modes is supported and the
>> other link attributes could be supported with additional driver
>> support. No need for core changes here. But again yes only on the
>> PF so again we need the VF reps.
>>
>>>
>>> The forwarding done in the legacy mode is not well defined, and
>>> different across vendors, adding there the VF reps will not make it
>>> any better b/c some steering rules will be set by tc/bridge offloads
>>> while other rules will be put by the driver.
>>> I don't see how this takes us to better place.
>>
>> In legacy mode or any other mode you are defining some default policy
>> and rules.
>>
>> In the legacy mode we use mac/vlan assigned l2 forwarding entries in the
>> hardware fdb which are seen when you query 'ip link' and 'bridge fdb'
>> today. And similarly can be modified today using 'ip link' and 'bridge
>> fdb' at least on the intel devices. Its not undefined in any way with
>> a quick query of the tools we can learn exactly what the configuration
>> is and even change it. This works fairly well with existing controllers
>> and stacks.
>>
>> The limitations are 'ip' only supports a single MAC address per VF and
>> 'tc' doesn't work on VF ports because when the VF is assigned to a VM
>> or namespace we lose visibility of it. Providing a VF rep for this
>> solves both of those problems.
>>
>> In this new mode the default policy is to create a default miss rule
>> and implement no l2 forwarding rules. Unfortunately not all hardware
>> in use supports this default miss rule case but would still benefit
>> from having a VF rep. So we shouldn't make this a stipulation for
>> enabling VF reps. It also changes a default policy that has been in
>> place for years without IMO at least any compelling reason. It will
>> be easy enough to change the default l2 policy to a flow based model
>> with a few bridge/tc commands.
>>
>>>
>>>> We are also slightly in disagreement about what the default should be
>>>> with VF netdevs. I think the default should be the same L2 mac/vlan
>>>> switch behavior and see no reason to change it by default just because
>>>> we added VF netdevs. The infrastructure libvirt/openstack/etc are built
>>>> around this default today. But I guess nothing in this series specifies
>>>> what the defaults of any given driver will be. VF netdevs are still
>>>> useful even on older hardware that only supports mac/vlan forwarding to
>>>> expose statistics and send/receive control frames such as lldp.
>>>
>>> Again, this is not about default engineering... and using the VF reps
>>> (not VF netdevs) in legacy mode only make it more cryptic to my
>>> opinion. I agree some changes would be needed in openstack to support
>>> the new model, but this is how progress is made... you can't always
>>> make all layer above you unchanged. Note that the VF reps behave the
>>> same as tap devices (v-switch doing xmit on tap --> recv in VM, VM
>>> sends --> recv on tap into the v-switch), so the change in open-stack
>>> would not be that big.
>>>
>>
>> But in this case we have no reason to break the stack above us. The
>> currently deployed usage is L2 mac/vlan. As soon as you bind a vSwitch
>> or whatever mgmt agent to the device it can go ahead and manage the
>> switch putting it in the correct mode using the tooling in 'bridge' and
>> 'tc'.
>>
>>
>>> [...]
>>>
>>>> Why I think the VF representer is a per port ethtool flag and not a
>>>> devlink option is my use case might be to assign a PF into a VM or
>>>> namespace where I don't want VF netdevs.
>>>
>>> again, we think the correct place to set how the eswitch is managed is
>>> through eswitch manager PCI devices and not net devices and hence
>>> ethtool is not the way to go.
>>>
>>> Also, how do you want your e-switch to be managed in this case?
>>>
>>
>> In the case where I don't create vf netdevs on one of the PFs I'll
>> manage the forwarding tables via the existing mechanisms 'ip' and
>> 'bridge'. However its likely not a big deal because 'ip' and 'bridge'
>> will continue to work even if VF reps are around. The ethtool/devlink
>> comment was more about pointing out that creating VFs does not
>> require you to manage your switch any differently. Its useful even on
>> devices that can't support flow based forwarding for statistics and
>> setting port attributes like mtu, etc.
>>
>> .John
>>
>
> Probably bad form to respond to my own email but just to highlight how
> subtle the distinction is (hopefully not to much repeat),
>
> Today in "legacy" mode each VF mac address is automatically added to
> the fdb along with the PF mac address. If there is a miss in the table
> (an unknown mac) the packet is sent to the PF but unless the PF is in
> promisc mode the packet is dropped by the rx filter. I presume even
> with the proposed model you would want to continue to enforce the
> rx filter otherwise the instance you flip the mode you are open to
> receive unwanted traffic. The promisc mode semantics have been in place
> for a long time so certainly don't want to break that. Can we agree on
> the promisc point? Also bridges/vswitch/etc already set promisc mode
> once they attach to the netdevs.
>
> (assuming we agree on the promisc point?)
> In your proposed model the only difference I can see is when the mode is
> changed you don't want to add the VF mac address to the fdb table. How
> about rather than make this part of the mode selection pick one way to
> do this in all cases. Either add the VF mac addresses to the fdb or
> do not do this. I have a preference for adding the VF mac addresses
> because this is the current behavior. Then rename the devlink option
> "VF reps" or something because that is what it is controlling.
>
> The last thing to argue about is if its a port attribute ala ethtool
> or a device attribute ala devlink. But maybe we can agree on everything
> up to this point?
>
> Thanks,
> John
>
FWIW, after reviewing devlink and the items I want to put there in the
future, I've decided it makes sense to keep it in devlink (sorry, it took
me a day of emails to get here). If you can agree to the above and rename it
something like,
+enum devlink_eswitch_mode {
+ DEVLINK_ESWITCH_MODE_NONE,
+ DEVLINK_ESWITCH_MODE_LEGACY,
+ DEVLINK_ESWITCH_MODE_CREATE_VF_NETDEVS,
+};
I'll Ack it and implement it on the drivers I tend to work on.
.John
^ permalink raw reply [flat|nested] 47+ messages in thread
* Re: [PATCH net-next 08/16] net/devlink: Add E-Switch mode control
2016-06-30 4:04 ` John Fastabend
@ 2016-06-30 6:25 ` Jiri Pirko
2016-06-30 7:13 ` Samudrala, Sridhar
0 siblings, 1 reply; 47+ messages in thread
From: Jiri Pirko @ 2016-06-30 6:25 UTC (permalink / raw)
To: John Fastabend
Cc: Or Gerlitz, Or Gerlitz, Samudrala, Sridhar, Saeed Mahameed,
David S. Miller, Linux Netdev List, Hadar Hen-Zion, Jiri Pirko,
Andy Gospodarek, Jesse Brandeburg, John Fastabend, Ido Schimmel,
Tal Anker
Thu, Jun 30, 2016 at 06:04:39AM CEST, john.fastabend@gmail.com wrote:
>On 16-06-29 08:35 PM, John Fastabend wrote:
>> On 16-06-29 03:09 PM, John Fastabend wrote:
>>> On 16-06-29 02:33 PM, Or Gerlitz wrote:
>>>> On Wed, Jun 29, 2016 at 7:35 PM, John Fastabend
>>>> <john.fastabend@gmail.com> wrote:
>>>>> On 16-06-29 07:48 AM, Or Gerlitz wrote:
>>>>>> On 6/28/2016 10:31 PM, John Fastabend wrote:
>>>>>>> On 16-06-28 12:12 PM, Jiri Pirko wrote:
>>>>
>>>>>>>> Why?! Please, leave legacy be legacy. Use the new mode for
>>>>>>>> implementing new features. Don't make things any more complicated :(
>>>>
>>>> [...]
>>>>>>> Maybe I'm reading to much into the devlink flag names and if instead
>>>>>>> you use a switch like the following,
>>>>
>>>>>>> VF representer : enable/disable the creation VF netdev's to represent
>>>>>>> the virtual functions on the PF
>>>>
>>>>>>> Much less complicated then magic switching between forwarding logic IMO
>>>>>>> and you don't whack a default configuration that an entire stack (e.g.
>>>>>>> libvirt) has been built to use.
>>>>
>>>>>> Re letting the user to observe/modify the rules added by the
>>>>>> driver/firmware while legacy mode. Even if possible with bridge/fdb, it
>>>>>> will be really pragmatical and doesn't make sense to get that done for
>>>>>> the TC subsystem. So this isn't a well defined solution and anyway, as
>>>>>> you said, legacy mode enhancements is a different exercise. Personally,
>>>>>> I agree with Jiri, that we should legacy be legacy and focus on adding
>>>>>> the new model.
>>>>
>>>>> The ixgbe driver already supports bridge and tc commands without the VF
>>>>> representer. Adding the VF representer to these drivers just extends
>>>>> the existing support so we have an identifier for VFs and now the
>>>>> redirect action works and the fdb commands can specify the VF netdevs.
>>>>> I don't see this as a problem because we already do it today with
>>>>> 'ip' and bridge tools.
>>>>
>>>> To be precise, for both ixgbe and mlx5, the existing tc support
>>>> (u32/ixgbe, flower/mlx5) is not for switching functionality but rather
>>>> for NIC-ish one, e.g drop, mark, etc. Indeed in ixgbe you added
>>>> redirect to VF, but this is only for south --> north (wire --> VF)
>>>> traffic, w.o the VF rep you can't do the other way around.
>>>>
>>>
>>> Correct which is why we need the VF rep. So we are completely in
>>> sync there.
>>>
>>>> Just to clarify, to what exact bridge command support did you refer for ixgbe?
>>>
>>> 'bridge fdb' commands are supported today on the PF. But its the
>>> same story as above we need the VF rep to also use it on the
>>> VF representer
>>>
>>> Also 'bridge link' command for veb/vepa modes is supported and the
>>> other link attributes could be supported with additional driver
>>> support. No need for core changes here. But again yes only on the
>>> PF so again we need the VF reps.
>>>
>>>>
>>>> The forwarding done in the legacy mode is not well defined, and
>>>> different across vendors, adding there the VF reps will not make it
>>>> any better b/c some steering rules will be set by tc/bridge offloads
>>>> while other rules will be put by the driver.
>>>> I don't see how this takes us to better place.
>>>
>>> In legacy mode or any other mode you are defining some default policy
>>> and rules.
>>>
>>> In the legacy mode we use mac/vlan assigned l2 forwarding entries in the
>>> hardware fdb which are seen when you query 'ip link' and 'bridge fdb'
>>> today. And similarly can be modified today using 'ip link' and 'bridge
>>> fdb' at least on the intel devices. Its not undefined in any way with
>>> a quick query of the tools we can learn exactly what the configuration
>>> is and even change it. This works fairly well with existing controllers
>>> and stacks.
>>>
>>> The limitations are 'ip' only supports a single MAC address per VF and
>>> 'tc' doesn't work on VF ports because when the VF is assigned to a VM
>>> or namespace we lose visibility of it. Providing a VF rep for this
>>> solves both of those problems.
>>>
>>> In this new mode the default policy is to create a default miss rule
>>> and implement no l2 forwarding rules. Unfortunately not all hardware
>>> in use supports this default miss rule case but would still benefit
>>> from having a VF rep. So we shouldn't make this a stipulation for
>>> enabling VF reps. It also changes a default policy that has been in
>>> place for years without IMO at least any compelling reason. It will
>>> be easy enough to change the default l2 policy to a flow based model
>>> with a few bridge/tc commands.
>>>
>>>>
>>>>> We are also slightly in disagreement about what the default should be
>>>>> with VF netdevs. I think the default should be the same L2 mac/vlan
>>>>> switch behavior and see no reason to change it by default just because
>>>>> we added VF netdevs. The infrastructure libvirt/openstack/etc are built
>>>>> around this default today. But I guess nothing in this series specifies
>>>>> what the defaults of any given driver will be. VF netdevs are still
>>>>> useful even on older hardware that only supports mac/vlan forwarding to
>>>>> expose statistics and send/receive control frames such as lldp.
>>>>
>>>> Again, this is not about default engineering... and using the VF reps
>>>> (not VF netdevs) in legacy mode only make it more cryptic to my
>>>> opinion. I agree some changes would be needed in openstack to support
>>>> the new model, but this is how progress is made... you can't always
>>>> make all layer above you unchanged. Note that the VF reps behave the
>>>> same as tap devices (v-switch doing xmit on tap --> recv in VM, VM
>>>> sends --> recv on tap into the v-switch), so the change in open-stack
>>>> would not be that big.
>>>>
>>>
>>> But in this case we have no reason to break the stack above us. The
>>> currently deployed usage is L2 mac/vlan. As soon as you bind a vSwitch
>>> or whatever mgmt agent to the device it can go ahead and manage the
>>> switch putting it in the correct mode using the tooling in 'bridge' and
>>> 'tc'.
>>>
>>>
>>>> [...]
>>>>
>>>>> Why I think the VF representer is a per port ethtool flag and not a
>>>>> devlink option is my use case might be to assign a PF into a VM or
>>>>> namespace where I don't want VF netdevs.
>>>>
>>>> again, we think the correct place to set how the eswitch is managed is
>>>> through eswitch manager PCI devices and not net devices and hence
>>>> ethtool is not the way to go.
>>>>
>>>> Also, how do you want your e-switch to be managed in this case?
>>>>
>>>
>>> In the case where I don't create vf netdevs on one of the PFs I'll
>>> manage the forwarding tables via the existing mechanisms 'ip' and
>>> 'bridge'. However its likely not a big deal because 'ip' and 'bridge'
>>> will continue to work even if VF reps are around. The ethtool/devlink
>>> comment was more about pointing out that creating VFs does not
>>> require you to manage your switch any differently. Its useful even on
>>> devices that can't support flow based forwarding for statistics and
>>> setting port attributes like mtu, etc.
>>>
>>> .John
>>>
>>
>> Probably bad form to respond to my own email but just to highlight how
>> subtle the distinction is (hopefully not to much repeat),
>>
>> Today in "legacy" mode each VF mac address is automatically added to
>> the fdb along with the PF mac address. If there is a miss in the table
>> (an unknown mac) the packet is sent to the PF but unless the PF is in
>> promisc mode the packet is dropped by the rx filter. I presume even
>> with the proposed model you would want to continue to enforce the
>> rx filter otherwise the instance you flip the mode you are open to
>> receive unwanted traffic. The promisc mode semantics have been in place
>> for a long time so certainly don't want to break that. Can we agree on
>> the promisc point? Also bridges/vswitch/etc already set promisc mode
>> once they attach to the netdevs.
>>
>> (assuming we agree on the promisc point?)
>> In your proposed model the only difference I can see is when the mode is
>> changed you don't want to add the VF mac address to the fdb table. How
>> about rather than make this part of the mode selection pick one way to
>> do this in all cases. Either add the VF mac addresses to the fdb or
>> do not do this. I have a preference for adding the VF mac addresses
>> because this is the current behavior. Then rename the devlink option
>> "VF reps" or something because that is what it is controlling.
>>
>> The last thing to argue about is if its a port attribute ala ethtool
>> or a device attribute ala devlink. But maybe we can agree on everything
>> up to this point?
>>
>> Thanks,
>> John
>>
>
>FWIW reviewing devlink and items I want to put there in the future I've
>decided it makes sense to keep it in devlink (sorry took me a day of
>emails to get here). If you can agree to the above and rename it
>something like,
>
>+enum devlink_eswitch_mode {
>+ DEVLINK_ESWITCH_MODE_NONE,
>+ DEVLINK_ESWITCH_MODE_LEGACY,
>+ DEVLINK_ESWITCH_MODE_CREATE_VF_NETDEVS,
That is certainly a totally misleading name. The mode is not about
creating "VF netdevs".
The VF representors are created but just as a side effect. The "offload"
mode or maybe better "switchdev" mode is creating representor netdevs for
VFs because they are needed in order to be able to configure ESwitch in
the same way we configure physical switches - putting netdevs into
bridge/bond/ovs/whatever. You see stats on the representors. Basically,
they are the same as physical port representors on physical switch ASIC.
>+};
>
>I'll Ack it and implement it on the drivers I tend to work on.
>
>.John
>
>
^ permalink raw reply [flat|nested] 47+ messages in thread
* Re: [PATCH net-next 08/16] net/devlink: Add E-Switch mode control
2016-06-30 6:25 ` Jiri Pirko
@ 2016-06-30 7:13 ` Samudrala, Sridhar
2016-06-30 7:41 ` Jiri Pirko
0 siblings, 1 reply; 47+ messages in thread
From: Samudrala, Sridhar @ 2016-06-30 7:13 UTC (permalink / raw)
To: Jiri Pirko, John Fastabend
Cc: Or Gerlitz, Or Gerlitz, Saeed Mahameed, David S. Miller,
Linux Netdev List, Hadar Hen-Zion, Jiri Pirko, Andy Gospodarek,
Jesse Brandeburg, John Fastabend, Ido Schimmel, Tal Anker
On 6/29/2016 11:25 PM, Jiri Pirko wrote:
> Thu, Jun 30, 2016 at 06:04:39AM CEST, john.fastabend@gmail.com wrote:
>> On 16-06-29 08:35 PM, John Fastabend wrote:
>>> On 16-06-29 03:09 PM, John Fastabend wrote:
>>>> On 16-06-29 02:33 PM, Or Gerlitz wrote:
>>>>> On Wed, Jun 29, 2016 at 7:35 PM, John Fastabend
>>>>> <john.fastabend@gmail.com> wrote:
>>>>>> On 16-06-29 07:48 AM, Or Gerlitz wrote:
>>>>>>> On 6/28/2016 10:31 PM, John Fastabend wrote:
>>>>>>>> On 16-06-28 12:12 PM, Jiri Pirko wrote:
>>>>>>>>> Why?! Please, leave legacy be legacy. Use the new mode for
>>>>>>>>> implementing new features. Don't make things any more complicated :(
>>>>> [...]
>>>>>>>> Maybe I'm reading to much into the devlink flag names and if instead
>>>>>>>> you use a switch like the following,
>>>>>>>> VF representer : enable/disable the creation VF netdev's to represent
>>>>>>>> the virtual functions on the PF
>>>>>>>> Much less complicated then magic switching between forwarding logic IMO
>>>>>>>> and you don't whack a default configuration that an entire stack (e.g.
>>>>>>>> libvirt) has been built to use.
>>>>>>> Re letting the user to observe/modify the rules added by the
>>>>>>> driver/firmware while legacy mode. Even if possible with bridge/fdb, it
>>>>>>> will be really pragmatical and doesn't make sense to get that donefor
>>>>>>> the TC subsystem. So this isn't a well defined solution and anyway, as
>>>>>>> you said, legacy mode enhancements is a different exercise. Personally,
>>>>>>> I agree with Jiri, that we should legacy be legacyand focus on adding
>>>>>>> the new model.
>>>>>> The ixgbe driver already supports bridge and tc commands without the VF
>>>>>> representer. Adding the VF representer to these drivers just extends
>>>>>> the existing support so we have an identifier for VFs and now the
>>>>>> redirect action works and the fdb commands can specify the VF netdevs.
>>>>>> I don't see this as a problem because we already do it today with
>>>>>> 'ip' and bridge tools.
>>>>> To be precise, for both ixgbe and mlx5, the existing tc support
>>>>> (u32/ixgbe, flower/mlx5) is not for switching functionality but rather
>>>>> for NIC-ish one, e.g drop, mark, etc. Indeed in ixgbe you added
>>>>> redirect to VF, but this is only for south --> north (wire --> VF)
>>>>> traffic, w.o the VF rep you can't do the other way around.
>>>>>
>>>> Correct which is why we need the VF rep. So we are completely in
>>>> sync there.
>>>>
>>>>> Just to clarify, to what exact bridge command support did you refer for ixgbe?
>>>> 'bridge fdb' commands are supported today on the PF. But its the
>>>> same story as above we need the VF rep to also use it on the
>>>> VF representer
>>>>
>>>> Also 'bridge link' command for veb/vepa modes is supported and the
>>>> other link attributes could be supported with additional driver
>>>> support. No need for core changes here. But again yes only on the
>>>> PF so again we need the VF reps.
>>>>
>>>>> The forwarding done in the legacy mode is not well defined, and
>>>>> different across vendors, adding there the VF reps will not make it
>>>>> any better b/c some steering rules will be set by tc/bridge offloads
>>>>> while other rules will be put by the driver.
>>>>> I don't see how this takes us to better place.
>>>> In legacy mode or any other mode you are defining some default policy
>>>> and rules.
>>>>
>>>> In the legacy mode we use mac/vlan assigned l2 forwarding entries in the
>>>> hardware fdb which are seen when you query 'ip link' and 'bridge fdb'
>>>> today. And similarly can be modified today using 'ip link' and 'bridge
>>>> fdb' at least on the intel devices. Its not undefined in any way with
>>>> a quick query of the tools we can learn exactly what the configuration
>>>> is and even change it. This works fairly well with existing controllers
>>>> and stacks.
>>>>
>>>> The limitations are 'ip' only supports a single MAC address per VF and
>>>> 'tc' doesn't work on VF ports because when the VF is assigned to a VM
>>>> or namespace we lose visibility of it. Providing a VF rep for this
>>>> solves both of those problems.
>>>>
>>>> In this new mode the default policy is to create a default miss rule
>>>> and implement no l2 forwarding rules. Unfortunately not all hardware
>>>> in use supports this default miss rule case but would still benefit
>>>> from having a VF rep. So we shouldn't make this a stipulation for
>>>> enabling VF reps. It also changes a default policy that has been in
>>>> place for years without IMO at least any compelling reason. It will
>>>> be easy enough to change the default l2 policy to a flow based model
>>>> with a few bridge/tc commands.
>>>>
>>>>>> We are also slightly in disagreement about what the default should be
>>>>>> with VF netdevs. I think the default should be the same L2 mac/vlan
>>>>>> switch behavior and see no reason to change it by default just because
>>>>>> we added VF netdevs. The infrastructure libvirt/openstack/etc are built
>>>>>> around this default today. But I guess nothing in this series specifies
>>>>>> what the defaults of any given driver will be. VF netdevs are still
>>>>>> useful even on older hardware that only supports mac/vlan forwarding to
>>>>>> expose statistics and send/receive control frames such as lldp.
>>>>> Again, this is not about default engineering... and using the VF reps
>>>>> (not VF netdevs) in legacy mode only make it more cryptic to my
>>>>> opinion. I agree some changes would be needed in openstack to support
>>>>> the new model, but this is how progress is made... you can't always
>>>>> make all layer above you unchanged. Note that the VF reps behave the
>>>>> same as tap devices (v-switch doing xmit on tap --> recv in VM, VM
>>>>> sends --> recv on tap into the v-switch), so the change in open-stack
>>>>> would not be that big.
>>>>>
>>>> But in this case we have no reason to break the stack above us. The
>>>> currently deployed usage is L2 mac/vlan. As soon as you bind a vSwitch
>>>> or whatever mgmt agent to the device it can go ahead and manage the
>>>> switch putting it in the correct mode using the tooling in 'bridge' and
>>>> 'tc'.
>>>>
>>>>
>>>>> [...]
>>>>>
>>>>>> Why I think the VF representer is a per port ethtool flag and not a
>>>>>> devlink option is my use case might be to assign a PF into a VM or
>>>>>> namespace where I don't want VF netdevs.
>>>>> again, we think the correct place to set how the eswitch is managed is
>>>>> through eswitch manager PCI devices and not net devices and hence
>>>>> ethtool is not the way to go.
>>>>>
>>>>> Also, how do you want your e-switch to be managed in this case?
>>>>>
>>>> In the case where I don't create vf netdevs on one of the PFs I'll
>>>> manage the forwarding tables via the existing mechanisms 'ip' and
>>>> 'bridge'. However its likely not a big deal because 'ip' and 'bridge'
>>>> will continue to work even if VF reps are around. The ethtool/devlink
>>>> comment was more about pointing out that creating VFs does not
>>>> require you to manage your switch any differently. Its useful even on
>>>> devices that can't support flow based forwarding for statistics and
>>>> setting port attributes like mtu, etc.
>>>>
>>>> .John
>>>>
>>> Probably bad form to respond to my own email but just to highlight how
>>> subtle the distinction is (hopefully not to much repeat),
>>>
>>> Today in "legacy" mode each VF mac address is automatically added to
>>> the fdb along with the PF mac address. If there is a miss in the table
>>> (an unknown mac) the packet is sent to the PF but unless the PF is in
>>> promisc mode the packet is dropped by the rx filter. I presume even
>>> with the proposed model you would want to continue to enforce the
>>> rx filter otherwise the instance you flip the mode you are open to
>>> receive unwanted traffic. The promisc mode semantics have been in place
>>> for a long time so certainly don't want to break that. Can we agree on
>>> the promisc point? Also bridges/vswitch/etc already set promisc mode
>>> once they attach to the netdevs.
>>>
>>> (assuming we agree on the promisc point?)
>>> In your proposed model the only difference I can see is when the mode is
>>> changed you don't want to add the VF mac address to the fdb table. How
>>> about rather than make this part of the mode selection pick one way to
>>> do this in all cases. Either add the VF mac addresses to the fdb or
>>> do not do this. I have a preference for adding the VF mac addresses
>>> because this is the current behavior. Then rename the devlink option
>>> "VF reps" or something because that is what it is controlling.
>>>
>>> The last thing to argue about is if its a port attribute ala ethtool
>>> or a device attribute ala devlink. But maybe we can agree on everything
>>> up to this point?
>>>
>>> Thanks,
>>> John
>>>
>> FWIW reviewing devlink and items I want to put there in the future I've
>> decided it makes sense to keep it in devlink (sorry took me a day of
>> emails to get here). If you can agree to the above and rename it
>> something like,
>>
>> +enum devlink_eswitch_mode {
>> + DEVLINK_ESWITCH_MODE_NONE,
>> + DEVLINK_ESWITCH_MODE_LEGACY,
>> + DEVLINK_ESWITCH_MODE_CREATE_VF_NETDEVS,
> That is certainly totally misleading name. The mode is not about
> creating "VF netdevs".
>
> The VF representors are created but just as a side effect. The "offload"
> mode or maybe better "switchdev" mode is creating representor netdevs for
> VFs because they are needed in order to be able to configure ESwitch in
> the same way we configure physical switches - putting netdevs into
> bridge/bond/ovs/whatever. You see stats on the representors. Basicaly
> they are the same as physical port representors on physical switch ASIC.
Maybe we need 2 new modes:
- legacy+ mode, which only creates VF netdevs and lets the user configure and manage the switch via the standard bridge/tc/ip/ethtool interfaces
- 'offload' or 'switchdev' mode, which does more than just creating VF netdevs, if it is not possible to configure the switch into this mode via standard interfaces.
>
>
>
>> +};
>>
>> I'll Ack it and implement it on the drivers I tend to work on.
>>
>> .John
>>
>>
^ permalink raw reply [flat|nested] 47+ messages in thread
* Re: [PATCH net-next 08/16] net/devlink: Add E-Switch mode control
2016-06-30 7:13 ` Samudrala, Sridhar
@ 2016-06-30 7:41 ` Jiri Pirko
2016-06-30 7:57 ` John Fastabend
0 siblings, 1 reply; 47+ messages in thread
From: Jiri Pirko @ 2016-06-30 7:41 UTC (permalink / raw)
To: Samudrala, Sridhar
Cc: John Fastabend, Or Gerlitz, Or Gerlitz, Saeed Mahameed,
David S. Miller, Linux Netdev List, Hadar Hen-Zion, Jiri Pirko,
Andy Gospodarek, Jesse Brandeburg, John Fastabend, Ido Schimmel,
Tal Anker
Thu, Jun 30, 2016 at 09:13:55AM CEST, sridhar.samudrala@intel.com wrote:
>
>
>On 6/29/2016 11:25 PM, Jiri Pirko wrote:
>>Thu, Jun 30, 2016 at 06:04:39AM CEST, john.fastabend@gmail.com wrote:
>>>On 16-06-29 08:35 PM, John Fastabend wrote:
>>>>On 16-06-29 03:09 PM, John Fastabend wrote:
>>>>>On 16-06-29 02:33 PM, Or Gerlitz wrote:
>>>>>>On Wed, Jun 29, 2016 at 7:35 PM, John Fastabend
>>>>>><john.fastabend@gmail.com> wrote:
>>>>>>>On 16-06-29 07:48 AM, Or Gerlitz wrote:
>>>>>>>>On 6/28/2016 10:31 PM, John Fastabend wrote:
>>>>>>>>>On 16-06-28 12:12 PM, Jiri Pirko wrote:
>>>>>>>>>>Why?! Please, leave legacy be legacy. Use the new mode for
>>>>>>>>>>implementing new features. Don't make things any more complicated :(
>>>>>>[...]
>>>>>>>>>Maybe I'm reading to much into the devlink flag names and if instead
>>>>>>>>>you use a switch like the following,
>>>>>>>>> VF representer : enable/disable the creation VF netdev's to represent
>>>>>>>>> the virtual functions on the PF
>>>>>>>>>Much less complicated then magic switching between forwarding logic IMO
>>>>>>>>>and you don't whack a default configuration that an entire stack (e.g.
>>>>>>>>>libvirt) has been built to use.
>>>>>>>>Re letting the user to observe/modify the rules added by the
>>>>>>>>driver/firmware while legacy mode. Even if possible with bridge/fdb, it
>>>>>>>>will be really pragmatical and doesn't make sense to get that donefor
>>>>>>>>the TC subsystem. So this isn't a well defined solution and anyway, as
>>>>>>>>you said, legacy mode enhancements is a different exercise. Personally,
>>>>>>>>I agree with Jiri, that we should legacy be legacyand focus on adding
>>>>>>>>the new model.
>>>>>>>The ixgbe driver already supports bridge and tc commands without the VF
>>>>>>>representer. Adding the VF representer to these drivers just extends
>>>>>>>the existing support so we have an identifier for VFs and now the
>>>>>>>redirect action works and the fdb commands can specify the VF netdevs.
>>>>>>>I don't see this as a problem because we already do it today with
>>>>>>>'ip' and bridge tools.
>>>>>>To be precise, for both ixgbe and mlx5, the existing tc support
>>>>>>(u32/ixgbe, flower/mlx5) is not for switching functionality but rather
>>>>>>for NIC-ish one, e.g drop, mark, etc. Indeed in ixgbe you added
>>>>>>redirect to VF, but this is only for south --> north (wire --> VF)
>>>>>>traffic, w.o the VF rep you can't do the other way around.
>>>>>>
>>>>>Correct which is why we need the VF rep. So we are completely in
>>>>>sync there.
>>>>>
>>>>>>Just to clarify, to what exact bridge command support did you refer for ixgbe?
>>>>>'bridge fdb' commands are supported today on the PF. But its the
>>>>>same story as above we need the VF rep to also use it on the
>>>>>VF representer
>>>>>
>>>>>Also 'bridge link' command for veb/vepa modes is supported and the
>>>>>other link attributes could be supported with additional driver
>>>>>support. No need for core changes here. But again yes only on the
>>>>>PF so again we need the VF reps.
>>>>>
>>>>>>The forwarding done in the legacy mode is not well defined, and
>>>>>>different across vendors, adding there the VF reps will not make it
>>>>>>any better b/c some steering rules will be set by tc/bridge offloads
>>>>>>while other rules will be put by the driver.
>>>>>>I don't see how this takes us to better place.
>>>>>In legacy mode or any other mode you are defining some default policy
>>>>>and rules.
>>>>>
>>>>>In the legacy mode we use mac/vlan assigned l2 forwarding entries in the
>>>>>hardware fdb which are seen when you query 'ip link' and 'bridge fdb'
>>>>>today. And similarly can be modified today using 'ip link' and 'bridge
>>>>>fdb' at least on the intel devices. Its not undefined in any way with
>>>>>a quick query of the tools we can learn exactly what the configuration
>>>>>is and even change it. This works fairly well with existing controllers
>>>>>and stacks.
>>>>>
>>>>>The limitations are 'ip' only supports a single MAC address per VF and
>>>>>'tc' doesn't work on VF ports because when the VF is assigned to a VM
>>>>>or namespace we lose visibility of it. Providing a VF rep for this
>>>>>solves both of those problems.
>>>>>
>>>>>In this new mode the default policy is to create a default miss rule
>>>>>and implement no l2 forwarding rules. Unfortunately not all hardware
>>>>>in use supports this default miss rule case but would still benefit
>>>>>from having a VF rep. So we shouldn't make this a stipulation for
>>>>>enabling VF reps. It also changes a default policy that has been in
>>>>>place for years without IMO at least any compelling reason. It will
>>>>>be easy enough to change the default l2 policy to a flow based model
>>>>>with a few bridge/tc commands.
>>>>>
>>>>>>>We are also slightly in disagreement about what the default should be
>>>>>>>with VF netdevs. I think the default should be the same L2 mac/vlan
>>>>>>>switch behavior and see no reason to change it by default just because
>>>>>>>we added VF netdevs. The infrastructure libvirt/openstack/etc are built
>>>>>>>around this default today. But I guess nothing in this series specifies
>>>>>>>what the defaults of any given driver will be. VF netdevs are still
>>>>>>>useful even on older hardware that only supports mac/vlan forwarding to
>>>>>>>expose statistics and send/receive control frames such as lldp.
>>>>>>Again, this is not about default engineering... and using the VF reps
>>>>>>(not VF netdevs) in legacy mode only make it more cryptic to my
>>>>>>opinion. I agree some changes would be needed in openstack to support
>>>>>>the new model, but this is how progress is made... you can't always
>>>>>>make all layer above you unchanged. Note that the VF reps behave the
>>>>>>same as tap devices (v-switch doing xmit on tap --> recv in VM, VM
>>>>>>sends --> recv on tap into the v-switch), so the change in open-stack
>>>>>>would not be that big.
>>>>>>
>>>>>But in this case we have no reason to break the stack above us. The
>>>>>currently deployed usage is L2 mac/vlan. As soon as you bind a vSwitch
>>>>>or whatever mgmt agent to the device it can go ahead and manage the
>>>>>switch putting it in the correct mode using the tooling in 'bridge' and
>>>>>'tc'.
>>>>>
>>>>>
>>>>>>[...]
>>>>>>
>>>>>>>Why I think the VF representer is a per port ethtool flag and not a
>>>>>>>devlink option is my use case might be to assign a PF into a VM or
>>>>>>>namespace where I don't want VF netdevs.
>>>>>>again, we think the correct place to set how the eswitch is managed is
>>>>>>through eswitch manager PCI devices and not net devices and hence
>>>>>>ethtool is not the way to go.
>>>>>>
>>>>>>Also, how do you want your e-switch to be managed in this case?
>>>>>>
>>>>>In the case where I don't create vf netdevs on one of the PFs I'll
>>>>>manage the forwarding tables via the existing mechanisms 'ip' and
>>>>>'bridge'. However its likely not a big deal because 'ip' and 'bridge'
>>>>>will continue to work even if VF reps are around. The ethtool/devlink
>>>>>comment was more about pointing out that creating VFs does not
>>>>>require you to manage your switch any differently. Its useful even on
>>>>>devices that can't support flow based forwarding for statistics and
>>>>>setting port attributes like mtu, etc.
>>>>>
>>>>>.John
>>>>>
>>>>Probably bad form to respond to my own email but just to highlight how
>>>>subtle the distinction is (hopefully not to much repeat),
>>>>
>>>>Today in "legacy" mode each VF mac address is automatically added to
>>>>the fdb along with the PF mac address. If there is a miss in the table
>>>>(an unknown mac) the packet is sent to the PF but unless the PF is in
>>>>promisc mode the packet is dropped by the rx filter. I presume even
>>>>with the proposed model you would want to continue to enforce the
>>>>rx filter otherwise the instance you flip the mode you are open to
>>>>receive unwanted traffic. The promisc mode semantics have been in place
>>>>for a long time so certainly don't want to break that. Can we agree on
>>>>the promisc point? Also bridges/vswitch/etc already set promisc mode
>>>>once they attach to the netdevs.
>>>>
>>>>(assuming we agree on the promisc point?)
>>>>In your proposed model the only difference I can see is when the mode is
>>>>changed you don't want to add the VF mac address to the fdb table. How
>>>>about rather than make this part of the mode selection pick one way to
>>>>do this in all cases. Either add the VF mac addresses to the fdb or
>>>>do not do this. I have a preference for adding the VF mac addresses
>>>>because this is the current behavior. Then rename the devlink option
>>>>"VF reps" or something because that is what it is controlling.
>>>>
>>>>The last thing to argue about is if its a port attribute ala ethtool
>>>>or a device attribute ala devlink. But maybe we can agree on everything
>>>>up to this point?
>>>>
>>>>Thanks,
>>>>John
>>>>
>>>FWIW reviewing devlink and items I want to put there in the future I've
>>>decided it makes sense to keep it in devlink (sorry took me a day of
>>>emails to get here). If you can agree to the above and rename it
>>>something like,
>>>
>>>+enum devlink_eswitch_mode {
>>>+ DEVLINK_ESWITCH_MODE_NONE,
>>>+ DEVLINK_ESWITCH_MODE_LEGACY,
>>>+ DEVLINK_ESWITCH_MODE_CREATE_VF_NETDEVS,
>>That is certainly totally misleading name. The mode is not about
>>creating "VF netdevs".
>>
>>The VF representors are created but just as a side effect. The "offload"
>>mode or maybe better "switchdev" mode is creating representor netdevs for
>>VFs because they are needed in order to be able to configure ESwitch in
>>the same way we configure physical switches - putting netdevs into
>>bridge/bond/ovs/whatever. You see stats on the representors. Basicaly
>>they are the same as physical port representors on physical switch ASIC.
>
>May be we need 2 new modes
>- legacy+ mode which only creates VF netdevs and let the user configure and manage the switch via the standard bridge/tc/ip/ethtool interfaces
>- 'offload' or 'switchdev' mode that does more than just creating VF netdevs if it is not possible to configure the switch into this mode via standard interfaces.
What?
What you described as "legacy+" - "let the user configure and
manage the switch via the standard bridge/tc/ip/ethtool interfaces" - is
exactly the "offload/switchdev" mode.
The second mode you described is something that I don't get what you are
talking about...
Please forget about legacy. It's a mistake. Similar to SDKs :(
Let's work on getting the proper offload solution in.
>
>>
>>
>>
>>>+};
>>>
>>>I'll Ack it and implement it on the drivers I tend to work on.
>>>
>>>.John
>>>
>>>
>
^ permalink raw reply [flat|nested] 47+ messages in thread
* Re: [PATCH net-next 08/16] net/devlink: Add E-Switch mode control
2016-06-30 7:41 ` Jiri Pirko
@ 2016-06-30 7:57 ` John Fastabend
2016-06-30 10:52 ` Jiri Pirko
0 siblings, 1 reply; 47+ messages in thread
From: John Fastabend @ 2016-06-30 7:57 UTC (permalink / raw)
To: Jiri Pirko, Samudrala, Sridhar
Cc: Or Gerlitz, Or Gerlitz, Saeed Mahameed, David S. Miller,
Linux Netdev List, Hadar Hen-Zion, Jiri Pirko, Andy Gospodarek,
Jesse Brandeburg, John Fastabend, Ido Schimmel, Tal Anker
On 16-06-30 12:41 AM, Jiri Pirko wrote:
> Thu, Jun 30, 2016 at 09:13:55AM CEST, sridhar.samudrala@intel.com wrote:
>>
>>
>> On 6/29/2016 11:25 PM, Jiri Pirko wrote:
>>> Thu, Jun 30, 2016 at 06:04:39AM CEST, john.fastabend@gmail.com wrote:
>>>> On 16-06-29 08:35 PM, John Fastabend wrote:
>>>>> On 16-06-29 03:09 PM, John Fastabend wrote:
>>>>>> On 16-06-29 02:33 PM, Or Gerlitz wrote:
>>>>>>> On Wed, Jun 29, 2016 at 7:35 PM, John Fastabend
>>>>>>> <john.fastabend@gmail.com> wrote:
>>>>>>>> On 16-06-29 07:48 AM, Or Gerlitz wrote:
>>>>>>>>> On 6/28/2016 10:31 PM, John Fastabend wrote:
>>>>>>>>>> On 16-06-28 12:12 PM, Jiri Pirko wrote:
>>>>>>>>>>> Why?! Please, leave legacy be legacy. Use the new mode for
>>>>>>>>>>> implementing new features. Don't make things any more complicated :(
>>>>>>> [...]
>>>>>>>>>> Maybe I'm reading to much into the devlink flag names and if instead
>>>>>>>>>> you use a switch like the following,
>>>>>>>>>> VF representer : enable/disable the creation VF netdev's to represent
>>>>>>>>>> the virtual functions on the PF
>>>>>>>>>> Much less complicated then magic switching between forwarding logic IMO
>>>>>>>>>> and you don't whack a default configuration that an entire stack (e.g.
>>>>>>>>>> libvirt) has been built to use.
>>>>>>>>> Re letting the user to observe/modify the rules added by the
>>>>>>>>> driver/firmware while legacy mode. Even if possible with bridge/fdb, it
>>>>>>>>> will be really pragmatical and doesn't make sense to get that donefor
>>>>>>>>> the TC subsystem. So this isn't a well defined solution and anyway, as
>>>>>>>>> you said, legacy mode enhancements is a different exercise. Personally,
>>>>>>>>> I agree with Jiri, that we should legacy be legacyand focus on adding
>>>>>>>>> the new model.
>>>>>>>> The ixgbe driver already supports bridge and tc commands without the VF
>>>>>>>> representer. Adding the VF representer to these drivers just extends
>>>>>>>> the existing support so we have an identifier for VFs and now the
>>>>>>>> redirect action works and the fdb commands can specify the VF netdevs.
>>>>>>>> I don't see this as a problem because we already do it today with
>>>>>>>> 'ip' and bridge tools.
>>>>>>> To be precise, for both ixgbe and mlx5, the existing tc support
>>>>>>> (u32/ixgbe, flower/mlx5) is not for switching functionality but rather
>>>>>>> for NIC-ish one, e.g drop, mark, etc. Indeed in ixgbe you added
>>>>>>> redirect to VF, but this is only for south --> north (wire --> VF)
>>>>>>> traffic, w.o the VF rep you can't do the other way around.
>>>>>>>
>>>>>> Correct which is why we need the VF rep. So we are completely in
>>>>>> sync there.
>>>>>>
>>>>>>> Just to clarify, to what exact bridge command support did you refer for ixgbe?
>>>>>> 'bridge fdb' commands are supported today on the PF. But its the
>>>>>> same story as above we need the VF rep to also use it on the
>>>>>> VF representer
>>>>>>
>>>>>> Also 'bridge link' command for veb/vepa modes is supported and the
>>>>>> other link attributes could be supported with additional driver
>>>>>> support. No need for core changes here. But again yes only on the
>>>>>> PF so again we need the VF reps.
>>>>>>
>>>>>>> The forwarding done in the legacy mode is not well defined, and
>>>>>>> different across vendors, adding there the VF reps will not make it
>>>>>>> any better b/c some steering rules will be set by tc/bridge offloads
>>>>>>> while other rules will be put by the driver.
>>>>>>> I don't see how this takes us to better place.
>>>>>> In legacy mode or any other mode you are defining some default policy
>>>>>> and rules.
>>>>>>
>>>>>> In the legacy mode we use mac/vlan assigned l2 forwarding entries in the
>>>>>> hardware fdb which are seen when you query 'ip link' and 'bridge fdb'
>>>>>> today. And similarly can be modified today using 'ip link' and 'bridge
>>>>>> fdb' at least on the intel devices. Its not undefined in any way with
>>>>>> a quick query of the tools we can learn exactly what the configuration
>>>>>> is and even change it. This works fairly well with existing controllers
>>>>>> and stacks.
>>>>>>
>>>>>> The limitations are 'ip' only supports a single MAC address per VF and
>>>>>> 'tc' doesn't work on VF ports because when the VF is assigned to a VM
>>>>>> or namespace we lose visibility of it. Providing a VF rep for this
>>>>>> solves both of those problems.
>>>>>>
>>>>>> In this new mode the default policy is to create a default miss rule
>>>>>> and implement no l2 forwarding rules. Unfortunately not all hardware
>>>>>> in use supports this default miss rule case but would still benefit
>>>>>> from having a VF rep. So we shouldn't make this a stipulation for
>>>>>> enabling VF reps. It also changes a default policy that has been in
>>>>>> place for years without IMO at least any compelling reason. It will
>>>>>> be easy enough to change the default l2 policy to a flow based model
>>>>>> with a few bridge/tc commands.
>>>>>>
>>>>>>>> We are also slightly in disagreement about what the default should be
>>>>>>>> with VF netdevs. I think the default should be the same L2 mac/vlan
>>>>>>>> switch behavior and see no reason to change it by default just because
>>>>>>>> we added VF netdevs. The infrastructure libvirt/openstack/etc are built
>>>>>>>> around this default today. But I guess nothing in this series specifies
>>>>>>>> what the defaults of any given driver will be. VF netdevs are still
>>>>>>>> useful even on older hardware that only supports mac/vlan forwarding to
>>>>>>>> expose statistics and send/receive control frames such as lldp.
>>>>>>> Again, this is not about default engineering... and using the VF reps
>>>>>>> (not VF netdevs) in legacy mode only make it more cryptic to my
>>>>>>> opinion. I agree some changes would be needed in openstack to support
>>>>>>> the new model, but this is how progress is made... you can't always
>>>>>>> make all layer above you unchanged. Note that the VF reps behave the
>>>>>>> same as tap devices (v-switch doing xmit on tap --> recv in VM, VM
>>>>>>> sends --> recv on tap into the v-switch), so the change in open-stack
>>>>>>> would not be that big.
>>>>>>>
>>>>>> But in this case we have no reason to break the stack above us. The
>>>>>> currently deployed usage is L2 mac/vlan. As soon as you bind a vSwitch
>>>>>> or whatever mgmt agent to the device it can go ahead and manage the
>>>>>> switch putting it in the correct mode using the tooling in 'bridge' and
>>>>>> 'tc'.
>>>>>>
>>>>>>
>>>>>>> [...]
>>>>>>>
>>>>>>>> Why I think the VF representer is a per port ethtool flag and not a
>>>>>>>> devlink option is my use case might be to assign a PF into a VM or
>>>>>>>> namespace where I don't want VF netdevs.
>>>>>>> again, we think the correct place to set how the eswitch is managed is
>>>>>>> through eswitch manager PCI devices and not net devices and hence
>>>>>>> ethtool is not the way to go.
>>>>>>>
>>>>>>> Also, how do you want your e-switch to be managed in this case?
>>>>>>>
>>>>>> In the case where I don't create vf netdevs on one of the PFs I'll
>>>>>> manage the forwarding tables via the existing mechanisms 'ip' and
>>>>>> 'bridge'. However its likely not a big deal because 'ip' and 'bridge'
>>>>>> will continue to work even if VF reps are around. The ethtool/devlink
>>>>>> comment was more about pointing out that creating VFs does not
>>>>>> require you to manage your switch any differently. Its useful even on
>>>>>> devices that can't support flow based forwarding for statistics and
>>>>>> setting port attributes like mtu, etc.
>>>>>>
>>>>>> .John
>>>>>>
>>>>> Probably bad form to respond to my own email but just to highlight how
>>>>> subtle the distinction is (hopefully not to much repeat),
>>>>>
>>>>> Today in "legacy" mode each VF mac address is automatically added to
>>>>> the fdb along with the PF mac address. If there is a miss in the table
>>>>> (an unknown mac) the packet is sent to the PF but unless the PF is in
>>>>> promisc mode the packet is dropped by the rx filter. I presume even
>>>>> with the proposed model you would want to continue to enforce the
>>>>> rx filter otherwise the instance you flip the mode you are open to
>>>>> receive unwanted traffic. The promisc mode semantics have been in place
>>>>> for a long time so certainly don't want to break that. Can we agree on
>>>>> the promisc point? Also bridges/vswitch/etc already set promisc mode
>>>>> once they attach to the netdevs.
>>>>>
>>>>> (assuming we agree on the promisc point?)
>>>>> In your proposed model the only difference I can see is when the mode is
>>>>> changed you don't want to add the VF mac address to the fdb table. How
>>>>> about rather than make this part of the mode selection pick one way to
>>>>> do this in all cases. Either add the VF mac addresses to the fdb or
>>>>> do not do this. I have a preference for adding the VF mac addresses
>>>>> because this is the current behavior. Then rename the devlink option
>>>>> "VF reps" or something because that is what it is controlling.
>>>>>
>>>>> The last thing to argue about is if its a port attribute ala ethtool
>>>>> or a device attribute ala devlink. But maybe we can agree on everything
>>>>> up to this point?
>>>>>
>>>>> Thanks,
>>>>> John
>>>>>
>>>> FWIW reviewing devlink and items I want to put there in the future I've
>>>> decided it makes sense to keep it in devlink (sorry took me a day of
>>>> emails to get here). If you can agree to the above and rename it
>>>> something like,
>>>>
>>>> +enum devlink_eswitch_mode {
>>>> + DEVLINK_ESWITCH_MODE_NONE,
>>>> + DEVLINK_ESWITCH_MODE_LEGACY,
>>>> + DEVLINK_ESWITCH_MODE_CREATE_VF_NETDEVS,
>>> That is certainly totally misleading name. The mode is not about
>>> creating "VF netdevs".
>>>
>>> The VF representors are created but just as a side effect. The "offload"
>>> mode or maybe better "switchdev" mode is creating representor netdevs for
>>> VFs because they are needed in order to be able to configure ESwitch in
>>> the same way we configure physical switches - putting netdevs into
>>> bridge/bond/ovs/whatever. You see stats on the representors. Basicaly
>>> they are the same as physical port representors on physical switch ASIC.
>>
>> May be we need 2 new modes
>> - legacy+ mode which only creates VF netdevs and let the user configure and manage the switch via the standard bridge/tc/ip/ethtool interfaces
>> - 'offload' or 'switchdev' mode that does more than just creating VF netdevs if it is not possible to configure the switch into this mode via standard interfaces.
>
> What?
>
> That what you described as "legacy+" as "let the user configure and
> manage the switch via the standard bridge/tc/ip/ethtool interfaces" is
> exactly the "offload/switchdev" mode.
>
> The second mode you described is something that I don't get what you are
> talking about...
>
> Please forget about legacy. It's a mistake. Similar to SDKs :(
> Let's work on getting the proper offload solution in.
>
I think the point here is that switchdev is not needed to use the bridge,
tc, ip, and ethtool tools. By adding the VF representors we can continue
using 'tc', 'bridge', etc., and it is much more interesting because
we bring the VFs into the netdev world even without switchdev support,
which is nice. Adding switchdev of course gets you some extra goodies
like l3 and l2 learning if your nic supports it, but it's not strictly
required to see goodness from this patch. Without switchdev support
you get stats (big win), basic port configuration with ip link cmds,
tc and bridge fdb to name a few.
Also, we can't completely forget about legacy, though, because we have
infrastructure built around it and it's unlikely we can switch entirely
over in one shot. For example, the firewall application may switch over
to the new VF rep model while the libvirt VM manager continues to use
the 'ip link set ... vf #' model. There is no reason to stop this from
being supported; it's actually more work in the code to block it. We get
it for free.
I've come to the conclusion that we are just arguing over a name and
a bit of perspective. Calling it "offload" mode is OK with me, even
though legacy mode did offloading as well, just not as interesting
offloads. Whether the VF representors are the cause or the effect is
not all that important to me.
Whether drivers populate the fdb table with known MACs is a side issue
IMO (the thread Or and I got lost in) and doesn't need to hold up this
patch.
.John
^ permalink raw reply [flat|nested] 47+ messages in thread
* Re: [PATCH net-next 08/16] net/devlink: Add E-Switch mode control
2016-06-30 7:57 ` John Fastabend
@ 2016-06-30 10:52 ` Jiri Pirko
2016-06-30 14:24 ` Or Gerlitz
2016-06-30 15:40 ` John Fastabend
0 siblings, 2 replies; 47+ messages in thread
From: Jiri Pirko @ 2016-06-30 10:52 UTC (permalink / raw)
To: John Fastabend
Cc: Samudrala, Sridhar, Or Gerlitz, Or Gerlitz, Saeed Mahameed,
David S. Miller, Linux Netdev List, Hadar Hen-Zion, Jiri Pirko,
Andy Gospodarek, Jesse Brandeburg, John Fastabend, Ido Schimmel,
Tal Anker
Thu, Jun 30, 2016 at 09:57:21AM CEST, john.fastabend@gmail.com wrote:
>On 16-06-30 12:41 AM, Jiri Pirko wrote:
>> Thu, Jun 30, 2016 at 09:13:55AM CEST, sridhar.samudrala@intel.com wrote:
>>>
>>>
>>> On 6/29/2016 11:25 PM, Jiri Pirko wrote:
>>>> Thu, Jun 30, 2016 at 06:04:39AM CEST, john.fastabend@gmail.com wrote:
>>>>> On 16-06-29 08:35 PM, John Fastabend wrote:
>>>>>> On 16-06-29 03:09 PM, John Fastabend wrote:
>>>>>>> On 16-06-29 02:33 PM, Or Gerlitz wrote:
>>>>>>>> On Wed, Jun 29, 2016 at 7:35 PM, John Fastabend
>>>>>>>> <john.fastabend@gmail.com> wrote:
>>>>>>>>> On 16-06-29 07:48 AM, Or Gerlitz wrote:
>>>>>>>>>> On 6/28/2016 10:31 PM, John Fastabend wrote:
>>>>>>>>>>> On 16-06-28 12:12 PM, Jiri Pirko wrote:
>>>>>>>>>>>> Why?! Please, leave legacy be legacy. Use the new mode for
>>>>>>>>>>>> implementing new features. Don't make things any more complicated :(
>>>>>>>> [...]
>>>>>>>>>>> Maybe I'm reading to much into the devlink flag names and if instead
>>>>>>>>>>> you use a switch like the following,
>>>>>>>>>>> VF representer : enable/disable the creation VF netdev's to represent
>>>>>>>>>>> the virtual functions on the PF
>>>>>>>>>>> Much less complicated then magic switching between forwarding logic IMO
>>>>>>>>>>> and you don't whack a default configuration that an entire stack (e.g.
>>>>>>>>>>> libvirt) has been built to use.
>>>>>>>>>> Re letting the user to observe/modify the rules added by the
>>>>>>>>>> driver/firmware while legacy mode. Even if possible with bridge/fdb, it
>>>>>>>>>> will be really pragmatical and doesn't make sense to get that donefor
>>>>>>>>>> the TC subsystem. So this isn't a well defined solution and anyway, as
>>>>>>>>>> you said, legacy mode enhancements is a different exercise. Personally,
>>>>>>>>>> I agree with Jiri, that we should legacy be legacyand focus on adding
>>>>>>>>>> the new model.
>>>>>>>>> The ixgbe driver already supports bridge and tc commands without the VF
>>>>>>>>> representer. Adding the VF representer to these drivers just extends
>>>>>>>>> the existing support so we have an identifier for VFs and now the
>>>>>>>>> redirect action works and the fdb commands can specify the VF netdevs.
>>>>>>>>> I don't see this as a problem because we already do it today with
>>>>>>>>> 'ip' and bridge tools.
>>>>>>>> To be precise, for both ixgbe and mlx5, the existing tc support
>>>>>>>> (u32/ixgbe, flower/mlx5) is not for switching functionality but rather
>>>>>>>> for NIC-ish one, e.g drop, mark, etc. Indeed in ixgbe you added
>>>>>>>> redirect to VF, but this is only for south --> north (wire --> VF)
>>>>>>>> traffic, w.o the VF rep you can't do the other way around.
>>>>>>>>
>>>>>>> Correct which is why we need the VF rep. So we are completely in
>>>>>>> sync there.
>>>>>>>
>>>>>>>> Just to clarify, to what exact bridge command support did you refer for ixgbe?
>>>>>>> 'bridge fdb' commands are supported today on the PF. But its the
>>>>>>> same story as above we need the VF rep to also use it on the
>>>>>>> VF representer
>>>>>>>
>>>>>>> Also 'bridge link' command for veb/vepa modes is supported and the
>>>>>>> other link attributes could be supported with additional driver
>>>>>>> support. No need for core changes here. But again yes only on the
>>>>>>> PF so again we need the VF reps.
>>>>>>>
>>>>>>>> The forwarding done in the legacy mode is not well defined, and
>>>>>>>> different across vendors, adding there the VF reps will not make it
>>>>>>>> any better b/c some steering rules will be set by tc/bridge offloads
>>>>>>>> while other rules will be put by the driver.
>>>>>>>> I don't see how this takes us to better place.
>>>>>>> In legacy mode or any other mode you are defining some default policy
>>>>>>> and rules.
>>>>>>>
>>>>>>> In the legacy mode we use mac/vlan assigned l2 forwarding entries in the
>>>>>>> hardware fdb which are seen when you query 'ip link' and 'bridge fdb'
>>>>>>> today. And similarly can be modified today using 'ip link' and 'bridge
>>>>>>> fdb' at least on the intel devices. Its not undefined in any way with
>>>>>>> a quick query of the tools we can learn exactly what the configuration
>>>>>>> is and even change it. This works fairly well with existing controllers
>>>>>>> and stacks.
>>>>>>>
>>>>>>> The limitations are 'ip' only supports a single MAC address per VF and
>>>>>>> 'tc' doesn't work on VF ports because when the VF is assigned to a VM
>>>>>>> or namespace we lose visibility of it. Providing a VF rep for this
>>>>>>> solves both of those problems.
>>>>>>>
>>>>>>> In this new mode the default policy is to create a default miss rule
>>>>>>> and implement no l2 forwarding rules. Unfortunately not all hardware
>>>>>>> in use supports this default miss rule case but would still benefit
>>>>>> >from having a VF rep. So we shouldn't make this a stipulation for
>>>>>>> enabling VF reps. It also changes a default policy that has been in
>>>>>>> place for years without IMO at least any compelling reason. It will
>>>>>>> be easy enough to change the default l2 policy to a flow based model
>>>>>>> with a few bridge/tc commands.
>>>>>>>
>>>>>>>>> We are also slightly in disagreement about what the default should be
>>>>>>>>> with VF netdevs. I think the default should be the same L2 mac/vlan
>>>>>>>>> switch behavior and see no reason to change it by default just because
>>>>>>>>> we added VF netdevs. The infrastructure libvirt/openstack/etc are built
>>>>>>>>> around this default today. But I guess nothing in this series specifies
>>>>>>>>> what the defaults of any given driver will be. VF netdevs are still
>>>>>>>>> useful even on older hardware that only supports mac/vlan forwarding to
>>>>>>>>> expose statistics and send/receive control frames such as lldp.
>>>>>>>> Again, this is not about default engineering... and using the VF reps
>>>>>>>> (not VF netdevs) in legacy mode only make it more cryptic to my
>>>>>>>> opinion. I agree some changes would be needed in openstack to support
>>>>>>>> the new model, but this is how progress is made... you can't always
>>>>>>>> make all layer above you unchanged. Note that the VF reps behave the
>>>>>>>> same as tap devices (v-switch doing xmit on tap --> recv in VM, VM
>>>>>>>> sends --> recv on tap into the v-switch), so the change in open-stack
>>>>>>>> would not be that big.
>>>>>>>>
>>>>>>> But in this case we have no reason to break the stack above us. The
>>>>>>> currently deployed usage is L2 mac/vlan. As soon as you bind a vSwitch
>>>>>>> or whatever mgmt agent to the device it can go ahead and manage the
>>>>>>> switch putting it in the correct mode using the tooling in 'bridge' and
>>>>>>> 'tc'.
>>>>>>>
>>>>>>>
>>>>>>>> [...]
>>>>>>>>
>>>>>>>>> Why I think the VF representer is a per port ethtool flag and not a
>>>>>>>>> devlink option is my use case might be to assign a PF into a VM or
>>>>>>>>> namespace where I don't want VF netdevs.
>>>>>>>> again, we think the correct place to set how the eswitch is managed is
>>>>>>>> through eswitch manager PCI devices and not net devices and hence
>>>>>>>> ethtool is not the way to go.
>>>>>>>>
>>>>>>>> Also, how do you want your e-switch to be managed in this case?
>>>>>>>>
>>>>>>> In the case where I don't create vf netdevs on one of the PFs I'll
>>>>>>> manage the forwarding tables via the existing mechanisms 'ip' and
>>>>>>> 'bridge'. However its likely not a big deal because 'ip' and 'bridge'
>>>>>>> will continue to work even if VF reps are around. The ethtool/devlink
>>>>>>> comment was more about pointing out that creating VFs does not
>>>>>>> require you to manage your switch any differently. Its useful even on
>>>>>>> devices that can't support flow based forwarding for statistics and
>>>>>>> setting port attributes like mtu, etc.
>>>>>>>
>>>>>>> .John
>>>>>>>
>>>>>> Probably bad form to respond to my own email but just to highlight how
>>>>>> subtle the distinction is (hopefully not to much repeat),
>>>>>>
>>>>>> Today in "legacy" mode each VF mac address is automatically added to
>>>>>> the fdb along with the PF mac address. If there is a miss in the table
>>>>>> (an unknown mac) the packet is sent to the PF but unless the PF is in
>>>>>> promisc mode the packet is dropped by the rx filter. I presume even
>>>>>> with the proposed model you would want to continue to enforce the
>>>>>> rx filter otherwise the instance you flip the mode you are open to
>>>>>> receive unwanted traffic. The promisc mode semantics have been in place
>>>>>> for a long time so certainly don't want to break that. Can we agree on
>>>>>> the promisc point? Also bridges/vswitch/etc already set promisc mode
>>>>>> once they attach to the netdevs.
>>>>>>
>>>>>> (assuming we agree on the promisc point?)
>>>>>> In your proposed model the only difference I can see is when the mode is
>>>>>> changed you don't want to add the VF mac address to the fdb table. How
>>>>>> about rather than make this part of the mode selection pick one way to
>>>>>> do this in all cases. Either add the VF mac addresses to the fdb or
>>>>>> do not do this. I have a preference for adding the VF mac addresses
>>>>>> because this is the current behavior. Then rename the devlink option
>>>>>> "VF reps" or something because that is what it is controlling.
>>>>>>
>>>>>> The last thing to argue about is if its a port attribute ala ethtool
>>>>>> or a device attribute ala devlink. But maybe we can agree on everything
>>>>>> up to this point?
>>>>>>
>>>>>> Thanks,
>>>>>> John
>>>>>>
>>>>> FWIW reviewing devlink and items I want to put there in the future I've
>>>>> decided it makes sense to keep it in devlink (sorry took me a day of
>>>>> emails to get here). If you can agree to the above and rename it
>>>>> something like,
>>>>>
>>>>> +enum devlink_eswitch_mode {
>>>>> + DEVLINK_ESWITCH_MODE_NONE,
>>>>> + DEVLINK_ESWITCH_MODE_LEGACY,
>>>>> + DEVLINK_ESWITCH_MODE_CREATE_VF_NETDEVS,
>>>> That is certainly totally misleading name. The mode is not about
>>>> creating "VF netdevs".
>>>>
>>>> The VF representors are created but just as a side effect. The "offload"
>>>> mode or maybe better "switchdev" mode is creating representor netdevs for
>>>> VFs because they are needed in order to be able to configure ESwitch in
>>>> the same way we configure physical switches - putting netdevs into
>>>> bridge/bond/ovs/whatever. You see stats on the representors. Basicaly
>>>> they are the same as physical port representors on physical switch ASIC.
>>>
>>> May be we need 2 new modes
>>> - legacy+ mode which only creates VF netdevs and let the user configure and manage the switch via the standard bridge/tc/ip/ethtool interfaces
>>> - 'offload' or 'switchdev' mode that does more than just creating VF netdevs if it is not possible to configure the switch into this mode via standard interfaces.
>>
>> What?
>>
>> That what you described as "legacy+" as "let the user configure and
>> manage the switch via the standard bridge/tc/ip/ethtool interfaces" is
>> exactly the "offload/switchdev" mode.
>>
>> The second mode you described is something that I don't get what you are
>> talking about...
>>
>> Please forget about legacy. It's a mistake. Similar to SDKs :(
>> Let's work on getting the proper offload solution in.
>>
>
>I think the point here is switchdev is not needed to use bridge, tc,
>ip, and ethtool tools. By adding the VF representors we can continue
>using 'tc', 'bridge', etc. and it is much more interesting because
>we bring the VFs into the netdev world even without switchdev support
>this is nice. Adding switchdev of course gets you some extra goodies
>like l3 and l2 learning if your nic supports it but its not strictly
>required to see goodness from this patch. Without switchdev support
>you get stats (big win), basic port configuration with ip link cmds,
>tc and bridge fdb to name a few.
Why not have 2 modes:
1) legacy - the current solution, blackbox eswitch, undefined behaviour
2) switchdev - with representors, all features possible as on physical
switches, whitebox eswitch configured using standard tools?
I don't see *ANY* reason for a hybrid. That would only make already
complicated things much more complicated.
>
>Also we can't completely forget about legacy though because we have
>infrastructure built around it and its unlikely we can switch entirely
>over in one shot. For example the firewall application may switch over
>to the new VF rep model while the libvirt VM manager continues to use
>the 'ip link set ... vf #' model. No reason to stop this from being
>supported its actually more work in the code to block it. We get it for
>free.
Let legacy be legacy, I have no problem with that. New drivers would be
encouraged to implement only new switchdev mode.
>
>I've come to the conclusion that we are just arguing over a name and
>a bit of perspective calling it "offload" mode is OK with me even
>though legacy mode did offloading as well just not as interesting of
>offloads. If the VF representors are the cause or effect is not all
>that important to me.
Why not just call it MODE_SWITCHDEV? I believe that describes it best.
Everyone knows what that is about.
>
>If drivers populate the fdb table with known MACs is a side issue
>IMO (the thread Or and I got lost in) and doesn't need to hold up this
>patch.
>
>.John
>
>
>
^ permalink raw reply [flat|nested] 47+ messages in thread
* Re: [PATCH net-next 08/16] net/devlink: Add E-Switch mode control
2016-06-30 10:52 ` Jiri Pirko
@ 2016-06-30 14:24 ` Or Gerlitz
2016-06-30 15:40 ` John Fastabend
1 sibling, 0 replies; 47+ messages in thread
From: Or Gerlitz @ 2016-06-30 14:24 UTC (permalink / raw)
To: Jiri Pirko, John Fastabend
Cc: Samudrala, Sridhar, Or Gerlitz, Saeed Mahameed, David S. Miller,
Linux Netdev List, Hadar Hen-Zion, Jiri Pirko, Andy Gospodarek,
Jesse Brandeburg, John Fastabend, Ido Schimmel, Tal Anker
On Thu, Jun 30, 2016 at 1:52 PM, Jiri Pirko <jiri@resnulli.us> wrote:
> Why not to have 2 modes:
> 1) legacy - the current solution, blackbox eswitch, undefined behaviour
> 2) switchdev - with representors, all features possible as on physical
> switches, whitebox eswitch configured using standard tools?
Yep, this makes sense to me. I will rename the offloads mode to
switchdev, and we'll respin V2 with a few more small fixes and a
clarification in the change log of this patch.
^ permalink raw reply [flat|nested] 47+ messages in thread
* Re: [PATCH net-next 08/16] net/devlink: Add E-Switch mode control
2016-06-30 10:52 ` Jiri Pirko
2016-06-30 14:24 ` Or Gerlitz
@ 2016-06-30 15:40 ` John Fastabend
2016-06-30 15:53 ` Jiri Pirko
1 sibling, 1 reply; 47+ messages in thread
From: John Fastabend @ 2016-06-30 15:40 UTC (permalink / raw)
To: Jiri Pirko
Cc: Samudrala, Sridhar, Or Gerlitz, Or Gerlitz, Saeed Mahameed,
David S. Miller, Linux Netdev List, Hadar Hen-Zion, Jiri Pirko,
Andy Gospodarek, Jesse Brandeburg, John Fastabend, Ido Schimmel,
Tal Anker
On 16-06-30 03:52 AM, Jiri Pirko wrote:
> Thu, Jun 30, 2016 at 09:57:21AM CEST, john.fastabend@gmail.com wrote:
>> On 16-06-30 12:41 AM, Jiri Pirko wrote:
>>> Thu, Jun 30, 2016 at 09:13:55AM CEST, sridhar.samudrala@intel.com wrote:
>>>>
>>>>
>>>> On 6/29/2016 11:25 PM, Jiri Pirko wrote:
>>>>> Thu, Jun 30, 2016 at 06:04:39AM CEST, john.fastabend@gmail.com wrote:
>>>>>> On 16-06-29 08:35 PM, John Fastabend wrote:
>>>>>>> On 16-06-29 03:09 PM, John Fastabend wrote:
>>>>>>>> On 16-06-29 02:33 PM, Or Gerlitz wrote:
>>>>>>>>> On Wed, Jun 29, 2016 at 7:35 PM, John Fastabend
>>>>>>>>> <john.fastabend@gmail.com> wrote:
>>>>>>>>>> On 16-06-29 07:48 AM, Or Gerlitz wrote:
>>>>>>>>>>> On 6/28/2016 10:31 PM, John Fastabend wrote:
>>>>>>>>>>>> On 16-06-28 12:12 PM, Jiri Pirko wrote:
>>>>>>>>>>>>> Why?! Please, leave legacy be legacy. Use the new mode for
>>>>>>>>>>>>> implementing new features. Don't make things any more complicated :(
>>>>>>>>> [...]
>>>>>>>>>>>> Maybe I'm reading to much into the devlink flag names and if instead
>>>>>>>>>>>> you use a switch like the following,
>>>>>>>>>>>> VF representer : enable/disable the creation VF netdev's to represent
>>>>>>>>>>>> the virtual functions on the PF
>>>>>>>>>>>> Much less complicated then magic switching between forwarding logic IMO
>>>>>>>>>>>> and you don't whack a default configuration that an entire stack (e.g.
>>>>>>>>>>>> libvirt) has been built to use.
>>>>>>>>>>> Re letting the user to observe/modify the rules added by the
>>>>>>>>>>> driver/firmware while legacy mode. Even if possible with bridge/fdb, it
>>>>>>>>>>> will be really pragmatical and doesn't make sense to get that donefor
>>>>>>>>>>> the TC subsystem. So this isn't a well defined solution and anyway, as
>>>>>>>>>>> you said, legacy mode enhancements is a different exercise. Personally,
>>>>>>>>>>> I agree with Jiri, that we should legacy be legacyand focus on adding
>>>>>>>>>>> the new model.
>>>>>>>>>> The ixgbe driver already supports bridge and tc commands without the VF
>>>>>>>>>> representer. Adding the VF representer to these drivers just extends
>>>>>>>>>> the existing support so we have an identifier for VFs and now the
>>>>>>>>>> redirect action works and the fdb commands can specify the VF netdevs.
>>>>>>>>>> I don't see this as a problem because we already do it today with
>>>>>>>>>> 'ip' and bridge tools.
>>>>>>>>> To be precise, for both ixgbe and mlx5, the existing tc support
>>>>>>>>> (u32/ixgbe, flower/mlx5) is not for switching functionality but rather
>>>>>>>>> for NIC-ish one, e.g drop, mark, etc. Indeed in ixgbe you added
>>>>>>>>> redirect to VF, but this is only for south --> north (wire --> VF)
>>>>>>>>> traffic, w.o the VF rep you can't do the other way around.
>>>>>>>>>
>>>>>>>> Correct which is why we need the VF rep. So we are completely in
>>>>>>>> sync there.
>>>>>>>>
>>>>>>>>> Just to clarify, to what exact bridge command support did you refer for ixgbe?
>>>>>>>> 'bridge fdb' commands are supported today on the PF. But its the
>>>>>>>> same story as above we need the VF rep to also use it on the
>>>>>>>> VF representer
>>>>>>>>
>>>>>>>> Also 'bridge link' command for veb/vepa modes is supported and the
>>>>>>>> other link attributes could be supported with additional driver
>>>>>>>> support. No need for core changes here. But again yes only on the
>>>>>>>> PF so again we need the VF reps.
>>>>>>>>
>>>>>>>>> The forwarding done in the legacy mode is not well defined, and
>>>>>>>>> different across vendors, adding there the VF reps will not make it
>>>>>>>>> any better b/c some steering rules will be set by tc/bridge offloads
>>>>>>>>> while other rules will be put by the driver.
>>>>>>>>> I don't see how this takes us to better place.
>>>>>>>> In legacy mode or any other mode you are defining some default policy
>>>>>>>> and rules.
>>>>>>>>
>>>>>>>> In the legacy mode we use mac/vlan assigned l2 forwarding entries in the
>>>>>>>> hardware fdb which are seen when you query 'ip link' and 'bridge fdb'
>>>>>>>> today. And similarly can be modified today using 'ip link' and 'bridge
>>>>>>>> fdb' at least on the intel devices. Its not undefined in any way with
>>>>>>>> a quick query of the tools we can learn exactly what the configuration
>>>>>>>> is and even change it. This works fairly well with existing controllers
>>>>>>>> and stacks.
>>>>>>>>
>>>>>>>> The limitations are 'ip' only supports a single MAC address per VF and
>>>>>>>> 'tc' doesn't work on VF ports because when the VF is assigned to a VM
>>>>>>>> or namespace we lose visibility of it. Providing a VF rep for this
>>>>>>>> solves both of those problems.
>>>>>>>>
>>>>>>>> In this new mode the default policy is to create a default miss rule
>>>>>>>> and implement no l2 forwarding rules. Unfortunately not all hardware
>>>>>>>> in use supports this default miss rule case but would still benefit
>>>>>>> >from having a VF rep. So we shouldn't make this a stipulation for
>>>>>>>> enabling VF reps. It also changes a default policy that has been in
>>>>>>>> place for years without IMO at least any compelling reason. It will
>>>>>>>> be easy enough to change the default l2 policy to a flow based model
>>>>>>>> with a few bridge/tc commands.
>>>>>>>>
>>>>>>>>>> We are also slightly in disagreement about what the default should be
>>>>>>>>>> with VF netdevs. I think the default should be the same L2 mac/vlan
>>>>>>>>>> switch behavior and see no reason to change it by default just because
>>>>>>>>>> we added VF netdevs. The infrastructure libvirt/openstack/etc are built
>>>>>>>>>> around this default today. But I guess nothing in this series specifies
>>>>>>>>>> what the defaults of any given driver will be. VF netdevs are still
>>>>>>>>>> useful even on older hardware that only supports mac/vlan forwarding to
>>>>>>>>>> expose statistics and send/receive control frames such as lldp.
>>>>>>>>> Again, this is not about default engineering... and using the VF reps
>>>>>>>>> (not VF netdevs) in legacy mode only make it more cryptic to my
>>>>>>>>> opinion. I agree some changes would be needed in openstack to support
>>>>>>>>> the new model, but this is how progress is made... you can't always
>>>>>>>>> make all layer above you unchanged. Note that the VF reps behave the
>>>>>>>>> same as tap devices (v-switch doing xmit on tap --> recv in VM, VM
>>>>>>>>> sends --> recv on tap into the v-switch), so the change in open-stack
>>>>>>>>> would not be that big.
>>>>>>>>>
>>>>>>>> But in this case we have no reason to break the stack above us. The
>>>>>>>> currently deployed usage is L2 mac/vlan. As soon as you bind a vSwitch
>>>>>>>> or whatever mgmt agent to the device it can go ahead and manage the
>>>>>>>> switch putting it in the correct mode using the tooling in 'bridge' and
>>>>>>>> 'tc'.
>>>>>>>>
>>>>>>>>
>>>>>>>>> [...]
>>>>>>>>>
>>>>>>>>>> Why I think the VF representer is a per port ethtool flag and not a
>>>>>>>>>> devlink option is my use case might be to assign a PF into a VM or
>>>>>>>>>> namespace where I don't want VF netdevs.
>>>>>>>>> again, we think the correct place to set how the eswitch is managed is
>>>>>>>>> through eswitch manager PCI devices and not net devices and hence
>>>>>>>>> ethtool is not the way to go.
>>>>>>>>>
>>>>>>>>> Also, how do you want your e-switch to be managed in this case?
>>>>>>>>>
>>>>>>>> In the case where I don't create vf netdevs on one of the PFs I'll
>>>>>>>> manage the forwarding tables via the existing mechanisms 'ip' and
>>>>>>>> 'bridge'. However its likely not a big deal because 'ip' and 'bridge'
>>>>>>>> will continue to work even if VF reps are around. The ethtool/devlink
>>>>>>>> comment was more about pointing out that creating VFs does not
>>>>>>>> require you to manage your switch any differently. Its useful even on
>>>>>>>> devices that can't support flow based forwarding for statistics and
>>>>>>>> setting port attributes like mtu, etc.
>>>>>>>>
>>>>>>>> .John
>>>>>>>>
>>>>>>> Probably bad form to respond to my own email but just to highlight how
>>>>>>> subtle the distinction is (hopefully not to much repeat),
>>>>>>>
>>>>>>> Today in "legacy" mode each VF mac address is automatically added to
>>>>>>> the fdb along with the PF mac address. If there is a miss in the table
>>>>>>> (an unknown mac) the packet is sent to the PF but unless the PF is in
>>>>>>> promisc mode the packet is dropped by the rx filter. I presume even
>>>>>>> with the proposed model you would want to continue to enforce the
>>>>>>> rx filter otherwise the instance you flip the mode you are open to
>>>>>>> receive unwanted traffic. The promisc mode semantics have been in place
>>>>>>> for a long time so certainly don't want to break that. Can we agree on
>>>>>>> the promisc point? Also bridges/vswitch/etc already set promisc mode
>>>>>>> once they attach to the netdevs.
>>>>>>>
>>>>>>> (assuming we agree on the promisc point?)
>>>>>>> In your proposed model the only difference I can see is when the mode is
>>>>>>> changed you don't want to add the VF mac address to the fdb table. How
>>>>>>> about rather than make this part of the mode selection pick one way to
>>>>>>> do this in all cases. Either add the VF mac addresses to the fdb or
>>>>>>> do not do this. I have a preference for adding the VF mac addresses
>>>>>>> because this is the current behavior. Then rename the devlink option
>>>>>>> "VF reps" or something because that is what it is controlling.
>>>>>>>
>>>>>>> The last thing to argue about is if its a port attribute ala ethtool
>>>>>>> or a device attribute ala devlink. But maybe we can agree on everything
>>>>>>> up to this point?
>>>>>>>
>>>>>>> Thanks,
>>>>>>> John
>>>>>>>
>>>>>> FWIW reviewing devlink and items I want to put there in the future I've
>>>>>> decided it makes sense to keep it in devlink (sorry took me a day of
>>>>>> emails to get here). If you can agree to the above and rename it
>>>>>> something like,
>>>>>>
>>>>>> +enum devlink_eswitch_mode {
>>>>>> + DEVLINK_ESWITCH_MODE_NONE,
>>>>>> + DEVLINK_ESWITCH_MODE_LEGACY,
>>>>>> + DEVLINK_ESWITCH_MODE_CREATE_VF_NETDEVS,
>>>>> That is certainly totally misleading name. The mode is not about
>>>>> creating "VF netdevs".
>>>>>
>>>>> The VF representors are created but just as a side effect. The "offload"
>>>>> mode or maybe better "switchdev" mode is creating representor netdevs for
>>>>> VFs because they are needed in order to be able to configure ESwitch in
>>>>> the same way we configure physical switches - putting netdevs into
>>>>> bridge/bond/ovs/whatever. You see stats on the representors. Basicaly
>>>>> they are the same as physical port representors on physical switch ASIC.
>>>>
>>>> May be we need 2 new modes
>>>> - legacy+ mode which only creates VF netdevs and let the user configure and manage the switch via the standard bridge/tc/ip/ethtool interfaces
>>>> - 'offload' or 'switchdev' mode that does more than just creating VF netdevs if it is not possible to configure the switch into this mode via standard interfaces.
>>>
>>> What?
>>>
>>> That what you described as "legacy+" as "let the user configure and
>>> manage the switch via the standard bridge/tc/ip/ethtool interfaces" is
>>> exactly the "offload/switchdev" mode.
>>>
>>> The second mode you described is something that I don't get what you are
>>> talking about...
>>>
>>> Please forget about legacy. It's a mistake. Similar to SDKs :(
>>> Let's work on getting the proper offload solution in.
>>>
>>
>> I think the point here is switchdev is not needed to use bridge, tc,
>> ip, and ethtool tools. By adding the VF representors we can continue
>> using 'tc', 'bridge', etc. and it is much more interesting because
>> we bring the VFs into the netdev world even without switchdev support
>> this is nice. Adding switchdev of course gets you some extra goodies
>> like l3 and l2 learning if your nic supports it but its not strictly
>> required to see goodness from this patch. Without switchdev support
>> you get stats (big win), basic port configuration with ip link cmds,
>> tc and bridge fdb to name a few.
>
> Why not to have 2 modes:
>
> 1) legacy - the current solution, blackbox eswitch, undefined behaviour
> 2) switchdev - with representors, all features possible as on physical
> switches, whitebox eswitch configured using standard tools?
>
> I don't see *ANY* reason for a hybrid. That would only make things
> already complicated much more complicated.
>
>
>>
>> Also we can't completely forget about legacy though because we have
>> infrastructure built around it and its unlikely we can switch entirely
>> over in one shot. For example the firewall application may switch over
>> to the new VF rep model while the libvirt VM manager continues to use
>> the 'ip link set ... vf #' model. No reason to stop this from being
>> supported its actually more work in the code to block it. We get it for
>> free.
>
> Let legacy be legacy, I have no problem with that. New drivers would be
> encouraged to implement only new switchdev mode.
>
Nope, I disagree. There is no reason to break existing userspace here;
just continue to support the handful of ip commands and bridge commands
already supported. The code is already in the driver and supported.
In general, the kernel shouldn't break UAPI that is already in place.
>
>>
>> I've come to the conclusion that we are just arguing over a name and
>> a bit of perspective calling it "offload" mode is OK with me even
>> though legacy mode did offloading as well just not as interesting of
>> offloads. If the VF representors are the cause or effect is not all
>> that important to me.
>
> Why not call it just MODE_SWITCHDEV? I believe it describes it the best.
> Everyone knows what that is about.
>
This is fine, but it doesn't require that drivers actually register with
switchdev here to get the goodness.
>
>>
>> If drivers populate the fdb table with known MACs is a side issue
>> IMO (the thread Or and I got lost in) and doesn't need to hold up this
>> patch.
>>
>> .John
>>
>>
>>
^ permalink raw reply [flat|nested] 47+ messages in thread
* Re: [PATCH net-next 08/16] net/devlink: Add E-Switch mode control
2016-06-30 15:40 ` John Fastabend
@ 2016-06-30 15:53 ` Jiri Pirko
2016-06-30 16:29 ` John Fastabend
0 siblings, 1 reply; 47+ messages in thread
From: Jiri Pirko @ 2016-06-30 15:53 UTC (permalink / raw)
To: John Fastabend
Cc: Samudrala, Sridhar, Or Gerlitz, Or Gerlitz, Saeed Mahameed,
David S. Miller, Linux Netdev List, Hadar Hen-Zion, Jiri Pirko,
Andy Gospodarek, Jesse Brandeburg, John Fastabend, Ido Schimmel,
Tal Anker
Thu, Jun 30, 2016 at 05:40:57PM CEST, john.fastabend@gmail.com wrote:
>On 16-06-30 03:52 AM, Jiri Pirko wrote:
>> Thu, Jun 30, 2016 at 09:57:21AM CEST, john.fastabend@gmail.com wrote:
>>> On 16-06-30 12:41 AM, Jiri Pirko wrote:
>>>> Thu, Jun 30, 2016 at 09:13:55AM CEST, sridhar.samudrala@intel.com wrote:
>>>>>
>>>>>
>>>>> On 6/29/2016 11:25 PM, Jiri Pirko wrote:
>>>>>> Thu, Jun 30, 2016 at 06:04:39AM CEST, john.fastabend@gmail.com wrote:
>>>>>>> On 16-06-29 08:35 PM, John Fastabend wrote:
>>>>>>>> On 16-06-29 03:09 PM, John Fastabend wrote:
>>>>>>>>> On 16-06-29 02:33 PM, Or Gerlitz wrote:
>>>>>>>>>> On Wed, Jun 29, 2016 at 7:35 PM, John Fastabend
>>>>>>>>>> <john.fastabend@gmail.com> wrote:
>>>>>>>>>>> On 16-06-29 07:48 AM, Or Gerlitz wrote:
>>>>>>>>>>>> On 6/28/2016 10:31 PM, John Fastabend wrote:
>>>>>>>>>>>>> On 16-06-28 12:12 PM, Jiri Pirko wrote:
>>>>>>>>>>>>>> Why?! Please, leave legacy be legacy. Use the new mode for
>>>>>>>>>>>>>> implementing new features. Don't make things any more complicated :(
>>>>>>>>>> [...]
>>>>>>>>>>>>> Maybe I'm reading to much into the devlink flag names and if instead
>>>>>>>>>>>>> you use a switch like the following,
>>>>>>>>>>>>> VF representer : enable/disable the creation VF netdev's to represent
>>>>>>>>>>>>> the virtual functions on the PF
>>>>>>>>>>>>> Much less complicated then magic switching between forwarding logic IMO
>>>>>>>>>>>>> and you don't whack a default configuration that an entire stack (e.g.
>>>>>>>>>>>>> libvirt) has been built to use.
>>>>>>>>>>>> Re letting the user to observe/modify the rules added by the
>>>>>>>>>>>> driver/firmware while legacy mode. Even if possible with bridge/fdb, it
>>>>>>>>>>>> will be really pragmatical and doesn't make sense to get that done for
>>>>>>>>>>>> the TC subsystem. So this isn't a well defined solution and anyway, as
>>>>>>>>>>>> you said, legacy mode enhancements is a different exercise. Personally,
>>>>>>>>>>>> I agree with Jiri, that we should let legacy be legacy and focus on adding
>>>>>>>>>>>> the new model.
>>>>>>>>>>> The ixgbe driver already supports bridge and tc commands without the VF
>>>>>>>>>>> representer. Adding the VF representer to these drivers just extends
>>>>>>>>>>> the existing support so we have an identifier for VFs and now the
>>>>>>>>>>> redirect action works and the fdb commands can specify the VF netdevs.
>>>>>>>>>>> I don't see this as a problem because we already do it today with
>>>>>>>>>>> 'ip' and bridge tools.
>>>>>>>>>> To be precise, for both ixgbe and mlx5, the existing tc support
>>>>>>>>>> (u32/ixgbe, flower/mlx5) is not for switching functionality but rather
>>>>>>>>>> for NIC-ish one, e.g drop, mark, etc. Indeed in ixgbe you added
>>>>>>>>>> redirect to VF, but this is only for south --> north (wire --> VF)
>>>>>>>>>> traffic, w.o the VF rep you can't do the other way around.
>>>>>>>>>>
>>>>>>>>> Correct which is why we need the VF rep. So we are completely in
>>>>>>>>> sync there.
>>>>>>>>>
>>>>>>>>>> Just to clarify, to what exact bridge command support did you refer for ixgbe?
>>>>>>>>> 'bridge fdb' commands are supported today on the PF. But its the
>>>>>>>>> same story as above we need the VF rep to also use it on the
>>>>>>>>> VF representer
>>>>>>>>>
>>>>>>>>> Also 'bridge link' command for veb/vepa modes is supported and the
>>>>>>>>> other link attributes could be supported with additional driver
>>>>>>>>> support. No need for core changes here. But again yes only on the
>>>>>>>>> PF so again we need the VF reps.
>>>>>>>>>
>>>>>>>>>> The forwarding done in the legacy mode is not well defined, and
>>>>>>>>>> different across vendors, adding there the VF reps will not make it
>>>>>>>>>> any better b/c some steering rules will be set by tc/bridge offloads
>>>>>>>>>> while other rules will be put by the driver.
>>>>>>>>>> I don't see how this takes us to better place.
>>>>>>>>> In legacy mode or any other mode you are defining some default policy
>>>>>>>>> and rules.
>>>>>>>>>
>>>>>>>>> In the legacy mode we use mac/vlan assigned l2 forwarding entries in the
>>>>>>>>> hardware fdb which are seen when you query 'ip link' and 'bridge fdb'
>>>>>>>>> today. And similarly can be modified today using 'ip link' and 'bridge
>>>>>>>>> fdb' at least on the intel devices. Its not undefined in any way with
>>>>>>>>> a quick query of the tools we can learn exactly what the configuration
>>>>>>>>> is and even change it. This works fairly well with existing controllers
>>>>>>>>> and stacks.
>>>>>>>>>
>>>>>>>>> The limitations are 'ip' only supports a single MAC address per VF and
>>>>>>>>> 'tc' doesn't work on VF ports because when the VF is assigned to a VM
>>>>>>>>> or namespace we lose visibility of it. Providing a VF rep for this
>>>>>>>>> solves both of those problems.
>>>>>>>>>
>>>>>>>>> In this new mode the default policy is to create a default miss rule
>>>>>>>>> and implement no l2 forwarding rules. Unfortunately not all hardware
>>>>>>>>> in use supports this default miss rule case but would still benefit
>>>>>>>>> from having a VF rep. So we shouldn't make this a stipulation for
>>>>>>>>> enabling VF reps. It also changes a default policy that has been in
>>>>>>>>> place for years without IMO at least any compelling reason. It will
>>>>>>>>> be easy enough to change the default l2 policy to a flow based model
>>>>>>>>> with a few bridge/tc commands.
>>>>>>>>>
>>>>>>>>>>> We are also slightly in disagreement about what the default should be
>>>>>>>>>>> with VF netdevs. I think the default should be the same L2 mac/vlan
>>>>>>>>>>> switch behavior and see no reason to change it by default just because
>>>>>>>>>>> we added VF netdevs. The infrastructure libvirt/openstack/etc are built
>>>>>>>>>>> around this default today. But I guess nothing in this series specifies
>>>>>>>>>>> what the defaults of any given driver will be. VF netdevs are still
>>>>>>>>>>> useful even on older hardware that only supports mac/vlan forwarding to
>>>>>>>>>>> expose statistics and send/receive control frames such as lldp.
>>>>>>>>>> Again, this is not about default engineering... and using the VF reps
>>>>>>>>>> (not VF netdevs) in legacy mode only make it more cryptic to my
>>>>>>>>>> opinion. I agree some changes would be needed in openstack to support
>>>>>>>>>> the new model, but this is how progress is made... you can't always
>>>>>>>>>> make all layer above you unchanged. Note that the VF reps behave the
>>>>>>>>>> same as tap devices (v-switch doing xmit on tap --> recv in VM, VM
>>>>>>>>>> sends --> recv on tap into the v-switch), so the change in open-stack
>>>>>>>>>> would not be that big.
>>>>>>>>>>
>>>>>>>>> But in this case we have no reason to break the stack above us. The
>>>>>>>>> currently deployed usage is L2 mac/vlan. As soon as you bind a vSwitch
>>>>>>>>> or whatever mgmt agent to the device it can go ahead and manage the
>>>>>>>>> switch putting it in the correct mode using the tooling in 'bridge' and
>>>>>>>>> 'tc'.
>>>>>>>>>
>>>>>>>>>
>>>>>>>>>> [...]
>>>>>>>>>>
>>>>>>>>>>> Why I think the VF representer is a per port ethtool flag and not a
>>>>>>>>>>> devlink option is my use case might be to assign a PF into a VM or
>>>>>>>>>>> namespace where I don't want VF netdevs.
>>>>>>>>>> again, we think the correct place to set how the eswitch is managed is
>>>>>>>>>> through eswitch manager PCI devices and not net devices and hence
>>>>>>>>>> ethtool is not the way to go.
>>>>>>>>>>
>>>>>>>>>> Also, how do you want your e-switch to be managed in this case?
>>>>>>>>>>
>>>>>>>>> In the case where I don't create vf netdevs on one of the PFs I'll
>>>>>>>>> manage the forwarding tables via the existing mechanisms 'ip' and
>>>>>>>>> 'bridge'. However its likely not a big deal because 'ip' and 'bridge'
>>>>>>>>> will continue to work even if VF reps are around. The ethtool/devlink
>>>>>>>>> comment was more about pointing out that creating VFs does not
>>>>>>>>> require you to manage your switch any differently. Its useful even on
>>>>>>>>> devices that can't support flow based forwarding for statistics and
>>>>>>>>> setting port attributes like mtu, etc.
>>>>>>>>>
>>>>>>>>> .John
>>>>>>>>>
>>>>>>>> Probably bad form to respond to my own email but just to highlight how
>>>>>>>> subtle the distinction is (hopefully not to much repeat),
>>>>>>>>
>>>>>>>> Today in "legacy" mode each VF mac address is automatically added to
>>>>>>>> the fdb along with the PF mac address. If there is a miss in the table
>>>>>>>> (an unknown mac) the packet is sent to the PF but unless the PF is in
>>>>>>>> promisc mode the packet is dropped by the rx filter. I presume even
>>>>>>>> with the proposed model you would want to continue to enforce the
>>>>>>>> rx filter otherwise the instance you flip the mode you are open to
>>>>>>>> receive unwanted traffic. The promisc mode semantics have been in place
>>>>>>>> for a long time so certainly don't want to break that. Can we agree on
>>>>>>>> the promisc point? Also bridges/vswitch/etc already set promisc mode
>>>>>>>> once they attach to the netdevs.
>>>>>>>>
>>>>>>>> (assuming we agree on the promisc point?)
>>>>>>>> In your proposed model the only difference I can see is when the mode is
>>>>>>>> changed you don't want to add the VF mac address to the fdb table. How
>>>>>>>> about rather than make this part of the mode selection pick one way to
>>>>>>>> do this in all cases. Either add the VF mac addresses to the fdb or
>>>>>>>> do not do this. I have a preference for adding the VF mac addresses
>>>>>>>> because this is the current behavior. Then rename the devlink option
>>>>>>>> "VF reps" or something because that is what it is controlling.
>>>>>>>>
>>>>>>>> The last thing to argue about is if its a port attribute ala ethtool
>>>>>>>> or a device attribute ala devlink. But maybe we can agree on everything
>>>>>>>> up to this point?
>>>>>>>>
>>>>>>>> Thanks,
>>>>>>>> John
>>>>>>>>
>>>>>>> FWIW reviewing devlink and items I want to put there in the future I've
>>>>>>> decided it makes sense to keep it in devlink (sorry took me a day of
>>>>>>> emails to get here). If you can agree to the above and rename it
>>>>>>> something like,
>>>>>>>
>>>>>>> +enum devlink_eswitch_mode {
>>>>>>> + DEVLINK_ESWITCH_MODE_NONE,
>>>>>>> + DEVLINK_ESWITCH_MODE_LEGACY,
>>>>>>> + DEVLINK_ESWITCH_MODE_CREATE_VF_NETDEVS,
>>>>>> That is certainly totally misleading name. The mode is not about
>>>>>> creating "VF netdevs".
>>>>>>
>>>>>> The VF representors are created but just as a side effect. The "offload"
>>>>>> mode or maybe better "switchdev" mode is creating representor netdevs for
>>>>>> VFs because they are needed in order to be able to configure ESwitch in
>>>>>> the same way we configure physical switches - putting netdevs into
>>>>>> bridge/bond/ovs/whatever. You see stats on the representors. Basically
>>>>>> they are the same as physical port representors on physical switch ASIC.
>>>>>
>>>>> May be we need 2 new modes
>>>>> - legacy+ mode which only creates VF netdevs and let the user configure and manage the switch via the standard bridge/tc/ip/ethtool interfaces
>>>>> - 'offload' or 'switchdev' mode that does more than just creating VF netdevs if it is not possible to configure the switch into this mode via standard interfaces.
>>>>
>>>> What?
>>>>
>>>> That what you described as "legacy+" as "let the user configure and
>>>> manage the switch via the standard bridge/tc/ip/ethtool interfaces" is
>>>> exactly the "offload/switchdev" mode.
>>>>
>>>> The second mode you described is something that I don't get what you are
>>>> talking about...
>>>>
>>>> Please forget about legacy. It's a mistake. Similar to SDKs :(
>>>> Let's work on getting the proper offload solution in.
>>>>
>>>
>>> I think the point here is switchdev is not needed to use bridge, tc,
>>> ip, and ethtool tools. By adding the VF representors we can continue
>>> using 'tc', 'bridge', etc. and it is much more interesting because
>>> we bring the VFs into the netdev world even without switchdev support
>>> this is nice. Adding switchdev of course gets you some extra goodies
>>> like l3 and l2 learning if your nic supports it but its not strictly
>>> required to see goodness from this patch. Without switchdev support
>>> you get stats (big win), basic port configuration with ip link cmds,
>>> tc and bridge fdb to name a few.
>>
>> Why not to have 2 modes:
>>
>> 1) legacy - the current solution, blackbox eswitch, undefined behaviour
>> 2) switchdev - with representors, all features possible as on physical
>> switches, whitebox eswitch configured using standard tools?
>>
>> I don't see *ANY* reason for a hybrid. That would only make things
>> already complicated much more complicated.
>>
>>
>>>
>>> Also we can't completely forget about legacy though because we have
>>> infrastructure built around it and its unlikely we can switch entirely
>>> over in one shot. For example the firewall application may switch over
>>> to the new VF rep model while the libvirt VM manager continues to use
>>> the 'ip link set ... vf #' model. No reason to stop this from being
>>> supported its actually more work in the code to block it. We get it for
>>> free.
>>
>> Let legacy be legacy, I have no problem with that. New drivers would be
>> encouraged to implement only new switchdev mode.
>>
>
>Nope I disagree there is no reason to break existing userspace here just
>continue to support the handful of ip commands and bridge commands
>already supported. The code is already in the driver and supported.
>In general the kernel shouldn't break UAPI already in place.
Who is breaking existing userspace? I don't understand what breakage
you are referring to :(
>
>>
>>>
>>> I've come to the conclusion that we are just arguing over a name and
>>> a bit of perspective calling it "offload" mode is OK with me even
>>> though legacy mode did offloading as well just not as interesting of
>>> offloads. If the VF representors are the cause or effect is not all
>>> that important to me.
>>
>> Why not call it just MODE_SWITCHDEV? I believe it describes it the best.
>> Everyone knows what that is about.
>>
>
>This is fine but it doesn't require drivers actually register with
>switchdev here to get the goodness.
Switch id for ports, that is the only thing needed.
>
>>
>>>
>>> If drivers populate the fdb table with known MACs is a side issue
>>> IMO (the thread Or and I got lost in) and doesn't need to hold up this
>>> patch.
>>>
>>> .John
>>>
>>>
>>>
>
^ permalink raw reply [flat|nested] 47+ messages in thread
* Re: [PATCH net-next 08/16] net/devlink: Add E-Switch mode control
2016-06-30 15:53 ` Jiri Pirko
@ 2016-06-30 16:29 ` John Fastabend
0 siblings, 0 replies; 47+ messages in thread
From: John Fastabend @ 2016-06-30 16:29 UTC (permalink / raw)
To: Jiri Pirko
Cc: Samudrala, Sridhar, Or Gerlitz, Or Gerlitz, Saeed Mahameed,
David S. Miller, Linux Netdev List, Hadar Hen-Zion, Jiri Pirko,
Andy Gospodarek, Jesse Brandeburg, John Fastabend, Ido Schimmel,
Tal Anker
On 16-06-30 08:53 AM, Jiri Pirko wrote:
> Thu, Jun 30, 2016 at 05:40:57PM CEST, john.fastabend@gmail.com wrote:
>> On 16-06-30 03:52 AM, Jiri Pirko wrote:
>>> Thu, Jun 30, 2016 at 09:57:21AM CEST, john.fastabend@gmail.com wrote:
>>>> On 16-06-30 12:41 AM, Jiri Pirko wrote:
>>>>> Thu, Jun 30, 2016 at 09:13:55AM CEST, sridhar.samudrala@intel.com wrote:
>>>>>>
>>>>>>
>>>>>> On 6/29/2016 11:25 PM, Jiri Pirko wrote:
>>>>>>> Thu, Jun 30, 2016 at 06:04:39AM CEST, john.fastabend@gmail.com wrote:
>>>>>>>> On 16-06-29 08:35 PM, John Fastabend wrote:
>>>>>>>>> On 16-06-29 03:09 PM, John Fastabend wrote:
>>>>>>>>>> On 16-06-29 02:33 PM, Or Gerlitz wrote:
>>>>>>>>>>> On Wed, Jun 29, 2016 at 7:35 PM, John Fastabend
>>>>>>>>>>> <john.fastabend@gmail.com> wrote:
>>>>>>>>>>>> On 16-06-29 07:48 AM, Or Gerlitz wrote:
>>>>>>>>>>>>> On 6/28/2016 10:31 PM, John Fastabend wrote:
>>>>>>>>>>>>>> On 16-06-28 12:12 PM, Jiri Pirko wrote:
>>>>>>>>>>>>>>> Why?! Please, leave legacy be legacy. Use the new mode for
>>>>>>>>>>>>>>> implementing new features. Don't make things any more complicated :(
>>>>>>>>>>> [...]
>>>>>>>>>>>>>> Maybe I'm reading to much into the devlink flag names and if instead
>>>>>>>>>>>>>> you use a switch like the following,
>>>>>>>>>>>>>> VF representer : enable/disable the creation VF netdev's to represent
>>>>>>>>>>>>>> the virtual functions on the PF
>>>>>>>>>>>>>> Much less complicated then magic switching between forwarding logic IMO
>>>>>>>>>>>>>> and you don't whack a default configuration that an entire stack (e.g.
>>>>>>>>>>>>>> libvirt) has been built to use.
>>>>>>>>>>>>> Re letting the user to observe/modify the rules added by the
>>>>>>>>>>>>> driver/firmware while legacy mode. Even if possible with bridge/fdb, it
>>>>>>>>>>>>> will be really pragmatical and doesn't make sense to get that done for
>>>>>>>>>>>>> the TC subsystem. So this isn't a well defined solution and anyway, as
>>>>>>>>>>>>> you said, legacy mode enhancements is a different exercise. Personally,
>>>>>>>>>>>>> I agree with Jiri, that we should let legacy be legacy and focus on adding
>>>>>>>>>>>>> the new model.
>>>>>>>>>>>> The ixgbe driver already supports bridge and tc commands without the VF
>>>>>>>>>>>> representer. Adding the VF representer to these drivers just extends
>>>>>>>>>>>> the existing support so we have an identifier for VFs and now the
>>>>>>>>>>>> redirect action works and the fdb commands can specify the VF netdevs.
>>>>>>>>>>>> I don't see this as a problem because we already do it today with
>>>>>>>>>>>> 'ip' and bridge tools.
>>>>>>>>>>> To be precise, for both ixgbe and mlx5, the existing tc support
>>>>>>>>>>> (u32/ixgbe, flower/mlx5) is not for switching functionality but rather
>>>>>>>>>>> for NIC-ish one, e.g drop, mark, etc. Indeed in ixgbe you added
>>>>>>>>>>> redirect to VF, but this is only for south --> north (wire --> VF)
>>>>>>>>>>> traffic, w.o the VF rep you can't do the other way around.
>>>>>>>>>>>
>>>>>>>>>> Correct which is why we need the VF rep. So we are completely in
>>>>>>>>>> sync there.
>>>>>>>>>>
>>>>>>>>>>> Just to clarify, to what exact bridge command support did you refer for ixgbe?
>>>>>>>>>> 'bridge fdb' commands are supported today on the PF. But its the
>>>>>>>>>> same story as above we need the VF rep to also use it on the
>>>>>>>>>> VF representer
>>>>>>>>>>
>>>>>>>>>> Also 'bridge link' command for veb/vepa modes is supported and the
>>>>>>>>>> other link attributes could be supported with additional driver
>>>>>>>>>> support. No need for core changes here. But again yes only on the
>>>>>>>>>> PF so again we need the VF reps.
>>>>>>>>>>
>>>>>>>>>>> The forwarding done in the legacy mode is not well defined, and
>>>>>>>>>>> different across vendors, adding there the VF reps will not make it
>>>>>>>>>>> any better b/c some steering rules will be set by tc/bridge offloads
>>>>>>>>>>> while other rules will be put by the driver.
>>>>>>>>>>> I don't see how this takes us to better place.
>>>>>>>>>> In legacy mode or any other mode you are defining some default policy
>>>>>>>>>> and rules.
>>>>>>>>>>
>>>>>>>>>> In the legacy mode we use mac/vlan assigned l2 forwarding entries in the
>>>>>>>>>> hardware fdb which are seen when you query 'ip link' and 'bridge fdb'
>>>>>>>>>> today. And similarly can be modified today using 'ip link' and 'bridge
>>>>>>>>>> fdb' at least on the intel devices. Its not undefined in any way with
>>>>>>>>>> a quick query of the tools we can learn exactly what the configuration
>>>>>>>>>> is and even change it. This works fairly well with existing controllers
>>>>>>>>>> and stacks.
>>>>>>>>>>
>>>>>>>>>> The limitations are 'ip' only supports a single MAC address per VF and
>>>>>>>>>> 'tc' doesn't work on VF ports because when the VF is assigned to a VM
>>>>>>>>>> or namespace we lose visibility of it. Providing a VF rep for this
>>>>>>>>>> solves both of those problems.
>>>>>>>>>>
>>>>>>>>>> In this new mode the default policy is to create a default miss rule
>>>>>>>>>> and implement no l2 forwarding rules. Unfortunately not all hardware
>>>>>>>>>> in use supports this default miss rule case but would still benefit
>>>>>>>>>> from having a VF rep. So we shouldn't make this a stipulation for
>>>>>>>>>> enabling VF reps. It also changes a default policy that has been in
>>>>>>>>>> place for years without IMO at least any compelling reason. It will
>>>>>>>>>> be easy enough to change the default l2 policy to a flow based model
>>>>>>>>>> with a few bridge/tc commands.
>>>>>>>>>>
>>>>>>>>>>>> We are also slightly in disagreement about what the default should be
>>>>>>>>>>>> with VF netdevs. I think the default should be the same L2 mac/vlan
>>>>>>>>>>>> switch behavior and see no reason to change it by default just because
>>>>>>>>>>>> we added VF netdevs. The infrastructure libvirt/openstack/etc are built
>>>>>>>>>>>> around this default today. But I guess nothing in this series specifies
>>>>>>>>>>>> what the defaults of any given driver will be. VF netdevs are still
>>>>>>>>>>>> useful even on older hardware that only supports mac/vlan forwarding to
>>>>>>>>>>>> expose statistics and send/receive control frames such as lldp.
>>>>>>>>>>> Again, this is not about default engineering... and using the VF reps
>>>>>>>>>>> (not VF netdevs) in legacy mode only make it more cryptic to my
>>>>>>>>>>> opinion. I agree some changes would be needed in openstack to support
>>>>>>>>>>> the new model, but this is how progress is made... you can't always
>>>>>>>>>>> make all layer above you unchanged. Note that the VF reps behave the
>>>>>>>>>>> same as tap devices (v-switch doing xmit on tap --> recv in VM, VM
>>>>>>>>>>> sends --> recv on tap into the v-switch), so the change in open-stack
>>>>>>>>>>> would not be that big.
>>>>>>>>>>>
>>>>>>>>>> But in this case we have no reason to break the stack above us. The
>>>>>>>>>> currently deployed usage is L2 mac/vlan. As soon as you bind a vSwitch
>>>>>>>>>> or whatever mgmt agent to the device it can go ahead and manage the
>>>>>>>>>> switch putting it in the correct mode using the tooling in 'bridge' and
>>>>>>>>>> 'tc'.
>>>>>>>>>>
>>>>>>>>>>
>>>>>>>>>>> [...]
>>>>>>>>>>>
>>>>>>>>>>>> Why I think the VF representer is a per port ethtool flag and not a
>>>>>>>>>>>> devlink option is my use case might be to assign a PF into a VM or
>>>>>>>>>>>> namespace where I don't want VF netdevs.
>>>>>>>>>>> again, we think the correct place to set how the eswitch is managed is
>>>>>>>>>>> through eswitch manager PCI devices and not net devices and hence
>>>>>>>>>>> ethtool is not the way to go.
>>>>>>>>>>>
>>>>>>>>>>> Also, how do you want your e-switch to be managed in this case?
>>>>>>>>>>>
>>>>>>>>>> In the case where I don't create vf netdevs on one of the PFs I'll
>>>>>>>>>> manage the forwarding tables via the existing mechanisms 'ip' and
>>>>>>>>>> 'bridge'. However its likely not a big deal because 'ip' and 'bridge'
>>>>>>>>>> will continue to work even if VF reps are around. The ethtool/devlink
>>>>>>>>>> comment was more about pointing out that creating VFs does not
>>>>>>>>>> require you to manage your switch any differently. Its useful even on
>>>>>>>>>> devices that can't support flow based forwarding for statistics and
>>>>>>>>>> setting port attributes like mtu, etc.
>>>>>>>>>>
>>>>>>>>>> .John
>>>>>>>>>>
>>>>>>>>> Probably bad form to respond to my own email but just to highlight how
>>>>>>>>> subtle the distinction is (hopefully not to much repeat),
>>>>>>>>>
>>>>>>>>> Today in "legacy" mode each VF mac address is automatically added to
>>>>>>>>> the fdb along with the PF mac address. If there is a miss in the table
>>>>>>>>> (an unknown mac) the packet is sent to the PF but unless the PF is in
>>>>>>>>> promisc mode the packet is dropped by the rx filter. I presume even
>>>>>>>>> with the proposed model you would want to continue to enforce the
>>>>>>>>> rx filter otherwise the instance you flip the mode you are open to
>>>>>>>>> receive unwanted traffic. The promisc mode semantics have been in place
>>>>>>>>> for a long time so certainly don't want to break that. Can we agree on
>>>>>>>>> the promisc point? Also bridges/vswitch/etc already set promisc mode
>>>>>>>>> once they attach to the netdevs.
>>>>>>>>>
>>>>>>>>> (assuming we agree on the promisc point?)
>>>>>>>>> In your proposed model the only difference I can see is when the mode is
>>>>>>>>> changed you don't want to add the VF mac address to the fdb table. How
>>>>>>>>> about rather than make this part of the mode selection pick one way to
>>>>>>>>> do this in all cases. Either add the VF mac addresses to the fdb or
>>>>>>>>> do not do this. I have a preference for adding the VF mac addresses
>>>>>>>>> because this is the current behavior. Then rename the devlink option
>>>>>>>>> "VF reps" or something because that is what it is controlling.
>>>>>>>>>
>>>>>>>>> The last thing to argue about is if its a port attribute ala ethtool
>>>>>>>>> or a device attribute ala devlink. But maybe we can agree on everything
>>>>>>>>> up to this point?
>>>>>>>>>
>>>>>>>>> Thanks,
>>>>>>>>> John
>>>>>>>>>
>>>>>>>> FWIW reviewing devlink and items I want to put there in the future I've
>>>>>>>> decided it makes sense to keep it in devlink (sorry took me a day of
>>>>>>>> emails to get here). If you can agree to the above and rename it
>>>>>>>> something like,
>>>>>>>>
>>>>>>>> +enum devlink_eswitch_mode {
>>>>>>>> + DEVLINK_ESWITCH_MODE_NONE,
>>>>>>>> + DEVLINK_ESWITCH_MODE_LEGACY,
>>>>>>>> + DEVLINK_ESWITCH_MODE_CREATE_VF_NETDEVS,
>>>>>>> That is certainly totally misleading name. The mode is not about
>>>>>>> creating "VF netdevs".
>>>>>>>
>>>>>>> The VF representors are created but just as a side effect. The "offload"
>>>>>>> mode or maybe better "switchdev" mode is creating representor netdevs for
>>>>>>> VFs because they are needed in order to be able to configure ESwitch in
>>>>>>> the same way we configure physical switches - putting netdevs into
>>>>>>> bridge/bond/ovs/whatever. You see stats on the representors. Basically
>>>>>>> they are the same as physical port representors on physical switch ASIC.
>>>>>>
>>>>>> May be we need 2 new modes
>>>>>> - legacy+ mode which only creates VF netdevs and let the user configure and manage the switch via the standard bridge/tc/ip/ethtool interfaces
>>>>>> - 'offload' or 'switchdev' mode that does more than just creating VF netdevs if it is not possible to configure the switch into this mode via standard interfaces.
>>>>>
>>>>> What?
>>>>>
>>>>> That what you described as "legacy+" as "let the user configure and
>>>>> manage the switch via the standard bridge/tc/ip/ethtool interfaces" is
>>>>> exactly the "offload/switchdev" mode.
>>>>>
>>>>> The second mode you described is something that I don't get what you are
>>>>> talking about...
>>>>>
>>>>> Please forget about legacy. It's a mistake. Similar to SDKs :(
>>>>> Let's work on getting the proper offload solution in.
>>>>>
>>>>
>>>> I think the point here is switchdev is not needed to use bridge, tc,
>>>> ip, and ethtool tools. By adding the VF representors we can continue
>>>> using 'tc', 'bridge', etc. and it is much more interesting because
>>>> we bring the VFs into the netdev world even without switchdev support
>>>> this is nice. Adding switchdev of course gets you some extra goodies
>>>> like l3 and l2 learning if your nic supports it but its not strictly
>>>> required to see goodness from this patch. Without switchdev support
>>>> you get stats (big win), basic port configuration with ip link cmds,
>>>> tc and bridge fdb to name a few.
>>>
>>> Why not to have 2 modes:
>>>
>>> 1) legacy - the current solution, blackbox eswitch, undefined behaviour
>>> 2) switchdev - with representors, all features possible as on physical
>>> switches, whitebox eswitch configured using standard tools?
>>>
>>> I don't see *ANY* reason for a hybrid. That would only make things
>>> already complicated much more complicated.
>>>
>>>
>>>>
>>>> Also we can't completely forget about legacy though because we have
>>>> infrastructure built around it and its unlikely we can switch entirely
>>>> over in one shot. For example the firewall application may switch over
>>>> to the new VF rep model while the libvirt VM manager continues to use
>>>> the 'ip link set ... vf #' model. No reason to stop this from being
>>>> supported its actually more work in the code to block it. We get it for
>>>> free.
>>>
>>> Let legacy be legacy, I have no problem with that. New drivers would be
>>> encouraged to implement only new switchdev mode.
>>>
>>
>> Nope I disagree there is no reason to break existing userspace here just
>> continue to support the handful of ip commands and bridge commands
>> already supported. The code is already in the driver and supported.
>> In general the kernel shouldn't break UAPI already in place.
>
> Who is breaking existing userspace? I don't understand what breakage are
> you are referring to :(
>
When we switch to 'offload'/'switchdev' mode, please continue to support
the 'ip link set ... vf #' and 'bridge fdb' commands. These use
the following ops:
ndo_set_vf_*
ndo_get_vf_config
ndo_get_stats64
ndo_fdb_add
ndo_bridge_*
Then userspace continues to work. Note your patchset doesn't block these
ops so it should all continue to work.
>
>>
>>>
>>>>
>>>> I've come to the conclusion that we are just arguing over a name and
>>>> a bit of perspective calling it "offload" mode is OK with me even
>>>> though legacy mode did offloading as well just not as interesting of
>>>> offloads. If the VF representors are the cause or effect is not all
>>>> that important to me.
>>>
>>> Why not call it just MODE_SWITCHDEV? I believe it describes it the best.
>>> Everyone knows what that is about.
>>>
>>
>> This is fine but it doesn't require drivers actually register with
>> switchdev here to get the goodness.
>
> Switch id for ports, that is the only thing needed.
>
Fair enough. For many NICs where ports are isolated (e.g. a port cannot
forward to other ports and rules on one port do not affect other
ports) the switch id is the phys_port_id. But I agree that implementing the
switch id lets userspace figure out the hierarchy between physical
ports and switch domains, vs. the old way of assuming a NIC model that
binds independent switches to ports.
A generic wrapper to return the phys_port_id as the switch id would be
useful on a handful of NICs, but maybe we will see if it's worth the effort.
.John
^ permalink raw reply [flat|nested] 47+ messages in thread
* Re: [PATCH net-next 08/16] net/devlink: Add E-Switch mode control
2016-06-28 16:19 ` John Fastabend
2016-06-28 17:19 ` John Fastabend
@ 2016-06-29 9:44 ` Or Gerlitz
1 sibling, 0 replies; 47+ messages in thread
From: Or Gerlitz @ 2016-06-29 9:44 UTC (permalink / raw)
To: John Fastabend
Cc: Saeed Mahameed, David S. Miller, netdev, Hadar Hen-Zion,
Jiri Pirko, Andy Gospodarek, Jesse Brandeburg, John Fastabend,
Ido Schimmel, Tal Anker
On 6/28/2016 7:19 PM, John Fastabend wrote:
> On 16-06-28 03:25 AM, Or Gerlitz wrote:
>> On 6/28/2016 8:57 AM, John Fastabend wrote:
>>
> hmm so in the hardware I have there is actually a l2 table and various
> other tables so I don't have any issue with doing table setup. I would
> like to see a table_create/table_delete/table_show devlink commands at
> some point though but I'm not there yet. This would allow users to
> optimize the table slices if they cared to. But that is future work
> IMO. Certainly not needed in this series at least.
Agree that we could do that and agree that we need not do that now, as
was agreed (...) in Seville, we are not yet at the geography (== HW
tables and table graph) advertisement and setup class.
>
>> The offloads mode needs to create a black hole miss rule and
>> send-to-vport rules and create the tables so they can contain later
>> rules set by the kernel in a way which is HW/driver dependent.
> Agreed a black hole miss rule needs to be applied but rather than apply
> it automatically with some toggle I would prefer to just add a 'tc' rule
> for this. Or alternatively it can be added by configuring flooding
> ports so that only a single port is in the flooding mode. This could
> all be done via 'bridge fdb ...' and 'bridge link ...' today I believe.
> Then the user defines the state and not the driver writer. It really is
> cleaner in my opinion.
The black hole serves for throwing packets arriving from **anywhere**
and not matched to any other HW rule towards the CPU where the e-switch
manager runs. Hence, it would be correct in my opinion to have it set
by the e-switch manager and it means when some API/knob is applied on
PCI device and not network device, so tc and Co will not really serve
nicely for that.
> And send-to-vport rules I'm not entirely clear on what these actually
> are used for. Is this a rule to match packets sent from a VF representer
> netdev to the actual VF pcie device? If this is the case its seems to
> me that any packet sent on a VF representer should be sent to the VF
> directly and these rules can be created when the VF is created. Or did
> you mean some other rule by this?
YES, the send-to-vport rules provide the functionality which is
described in the cover letter and in the relevant commit/s: doing xmit
on a VF rep netdevice always ends up with the packet arriving at the VF
PCI device. We create these HW rules when in the offloads mode per each
VF/rep indeed.
So when a driver SRIOV logic wakes up in offloads mode (hopefully will
happen a lot soon...), they would (1) create VF reps (2) set these
rules, sure. Currently a transition from legacy to offloads is defined
and these two acts are then done on the transition, e.g. for mlx5 whose
current default is legacy.
Or.
^ permalink raw reply [flat|nested] 47+ messages in thread
* Re: [PATCH net-next 08/16] net/devlink: Add E-Switch mode control
2016-06-27 16:07 ` [PATCH net-next 08/16] net/devlink: Add E-Switch mode control Saeed Mahameed
2016-06-28 5:57 ` John Fastabend
@ 2016-06-28 12:27 ` Jiri Pirko
1 sibling, 0 replies; 47+ messages in thread
From: Jiri Pirko @ 2016-06-28 12:27 UTC (permalink / raw)
To: Saeed Mahameed
Cc: David S. Miller, netdev, Or Gerlitz, Hadar Hen-Zion, Jiri Pirko,
Andy Gospodarek, Jesse Brandeburg, John Fastabend
Mon, Jun 27, 2016 at 06:07:21PM CEST, saeedm@mellanox.com wrote:
>From: Or Gerlitz <ogerlitz@mellanox.com>
>
>Add the commands to set and show the mode of SRIOV E-Switch,
>two modes are supported:
>
>* legacy : operating in the "old" L2 based mode (DMAC --> VF vport)
>* offloads : offloading SW rules/policy (e.g Bridge/FDB or TC/Flows based) set by the host OS
>
>Signed-off-by: Or Gerlitz <ogerlitz@mellanox.com>
>Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
Acked-by: Jiri Pirko <jiri@mellanox.com>
Looks fine to me. Usable for many drivers of devices containing embedded
switch. We need this for clean transition from legacy handling of embedded
switches we have in drivers currently to new switchdev model.
Thanks!
^ permalink raw reply [flat|nested] 47+ messages in thread
* [PATCH net-next 09/16] net/mlx5: Add devlink interface
2016-06-27 16:07 [PATCH net-next 00/16] Mellanox 100G SRIOV E-Switch offload and VF representors Saeed Mahameed
` (7 preceding siblings ...)
2016-06-27 16:07 ` [PATCH net-next 08/16] net/devlink: Add E-Switch mode control Saeed Mahameed
@ 2016-06-27 16:07 ` Saeed Mahameed
2016-06-27 16:07 ` [PATCH net-next 10/16] net/mlx5e: Add devlink based SRIOV mode changes (legacy --> offloads) Saeed Mahameed
` (6 subsequent siblings)
15 siblings, 0 replies; 47+ messages in thread
From: Saeed Mahameed @ 2016-06-27 16:07 UTC (permalink / raw)
To: David S. Miller
Cc: netdev, Or Gerlitz, Hadar Hen-Zion, Jiri Pirko, Andy Gospodarek,
Jesse Brandeburg, John Fastabend, Saeed Mahameed
From: Or Gerlitz <ogerlitz@mellanox.com>
The devlink interface is initially used to set/get the mode of the SRIOV e-switch.
Currently, these are only stubs for get/set, down-stream patch will actually
fill them out.
Signed-off-by: Or Gerlitz <ogerlitz@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
drivers/net/ethernet/mellanox/mlx5/core/Kconfig | 1 +
drivers/net/ethernet/mellanox/mlx5/core/eswitch.h | 4 ++++
.../ethernet/mellanox/mlx5/core/eswitch_offloads.c | 10 +++++++++
drivers/net/ethernet/mellanox/mlx5/core/main.c | 26 ++++++++++++++++++----
4 files changed, 37 insertions(+), 4 deletions(-)
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/Kconfig b/drivers/net/ethernet/mellanox/mlx5/core/Kconfig
index 1cf722e..aae4688 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/Kconfig
+++ b/drivers/net/ethernet/mellanox/mlx5/core/Kconfig
@@ -4,6 +4,7 @@
config MLX5_CORE
tristate "Mellanox Technologies ConnectX-4 and Connect-IB core driver"
+ depends on MAY_USE_DEVLINK
depends on PCI
default n
---help---
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h
index cf959f7..7843f98 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h
@@ -35,6 +35,7 @@
#include <linux/if_ether.h>
#include <linux/if_link.h>
+#include <net/devlink.h>
#include <linux/mlx5/device.h>
#define MLX5_MAX_UC_PER_VPORT(dev) \
@@ -205,6 +206,9 @@ mlx5_eswitch_add_send_to_vport_rule(struct mlx5_eswitch *esw, int vport, u32 sqn
struct mlx5_flow_rule *
mlx5_eswitch_create_vport_rx_rule(struct mlx5_eswitch *esw, int vport, u32 tirn);
+int mlx5_devlink_eswitch_mode_set(struct devlink *devlink, u16 mode);
+int mlx5_devlink_eswitch_mode_get(struct devlink *devlink, u16 *mode);
+
#define MLX5_DEBUG_ESWITCH_MASK BIT(3)
#define esw_info(dev, format, ...) \
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
index 67ff1e8..3b3afbd 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
@@ -330,3 +330,13 @@ out:
kfree(match_c);
return flow_rule;
}
+
+int mlx5_devlink_eswitch_mode_set(struct devlink *devlink, u16 mode)
+{
+ return -EOPNOTSUPP;
+}
+
+int mlx5_devlink_eswitch_mode_get(struct devlink *devlink, u16 *mode)
+{
+ return -EOPNOTSUPP;
+}
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/main.c b/drivers/net/ethernet/mellanox/mlx5/core/main.c
index 08cae34..2abd387 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/main.c
@@ -51,6 +51,7 @@
#ifdef CONFIG_RFS_ACCEL
#include <linux/cpu_rmap.h>
#endif
+#include <net/devlink.h>
#include "mlx5_core.h"
#include "fs_core.h"
#ifdef CONFIG_MLX5_CORE_EN
@@ -1315,19 +1316,28 @@ struct mlx5_core_event_handler {
void *data);
};
+static const struct devlink_ops mlx5_devlink_ops = {
+#ifdef CONFIG_MLX5_CORE_EN
+ .eswitch_mode_set = mlx5_devlink_eswitch_mode_set,
+ .eswitch_mode_get = mlx5_devlink_eswitch_mode_get,
+#endif
+};
static int init_one(struct pci_dev *pdev,
const struct pci_device_id *id)
{
struct mlx5_core_dev *dev;
+ struct devlink *devlink;
struct mlx5_priv *priv;
int err;
- dev = kzalloc(sizeof(*dev), GFP_KERNEL);
- if (!dev) {
+ devlink = devlink_alloc(&mlx5_devlink_ops, sizeof(*dev));
+ if (!devlink) {
dev_err(&pdev->dev, "kzalloc failed\n");
return -ENOMEM;
}
+
+ dev = devlink_priv(devlink);
priv = &dev->priv;
priv->pci_dev_data = id->driver_data;
@@ -1364,15 +1374,21 @@ static int init_one(struct pci_dev *pdev,
goto clean_health;
}
+ err = devlink_register(devlink, &pdev->dev);
+ if (err)
+ goto clean_load;
+
return 0;
+clean_load:
+ mlx5_unload_one(dev, priv);
clean_health:
mlx5_health_cleanup(dev);
close_pci:
mlx5_pci_close(dev, priv);
clean_dev:
pci_set_drvdata(pdev, NULL);
- kfree(dev);
+ devlink_free(devlink);
return err;
}
@@ -1380,8 +1396,10 @@ clean_dev:
static void remove_one(struct pci_dev *pdev)
{
struct mlx5_core_dev *dev = pci_get_drvdata(pdev);
+ struct devlink *devlink = priv_to_devlink(dev);
struct mlx5_priv *priv = &dev->priv;
+ devlink_unregister(devlink);
if (mlx5_unload_one(dev, priv)) {
dev_err(&dev->pdev->dev, "mlx5_unload_one failed\n");
mlx5_health_cleanup(dev);
@@ -1390,7 +1408,7 @@ static void remove_one(struct pci_dev *pdev)
mlx5_health_cleanup(dev);
mlx5_pci_close(dev, priv);
pci_set_drvdata(pdev, NULL);
- kfree(dev);
+ devlink_free(devlink);
}
static pci_ers_result_t mlx5_pci_err_detected(struct pci_dev *pdev,
--
2.8.0
^ permalink raw reply related [flat|nested] 47+ messages in thread
* [PATCH net-next 10/16] net/mlx5e: Add devlink based SRIOV mode changes (legacy --> offloads)
2016-06-27 16:07 [PATCH net-next 00/16] Mellanox 100G SRIOV E-Switch offload and VF representors Saeed Mahameed
` (8 preceding siblings ...)
2016-06-27 16:07 ` [PATCH net-next 09/16] net/mlx5: Add devlink interface Saeed Mahameed
@ 2016-06-27 16:07 ` Saeed Mahameed
2016-06-28 13:42 ` Andy Gospodarek
2016-06-27 16:07 ` [PATCH net-next 11/16] net/mlx5e: Create NIC global resources only once Saeed Mahameed
` (5 subsequent siblings)
15 siblings, 1 reply; 47+ messages in thread
From: Saeed Mahameed @ 2016-06-27 16:07 UTC (permalink / raw)
To: David S. Miller
Cc: netdev, Or Gerlitz, Hadar Hen-Zion, Jiri Pirko, Andy Gospodarek,
Jesse Brandeburg, John Fastabend, Saeed Mahameed
From: Or Gerlitz <ogerlitz@mellanox.com>
Implement handlers for the devlink commands to get and set the SRIOV
E-Switch mode.
When turning to the offloads mode, we disable the e-switch and enable
it again in the new mode, create the NIC offloads table and create VF reps.
When turning to legacy mode, we remove the VF reps and the offloads
table, and re-initiate the e-switch in its legacy mode.
The actual creation/removal of the VF reps is done in downstream patches.
Signed-off-by: Or Gerlitz <ogerlitz@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
drivers/net/ethernet/mellanox/mlx5/core/eswitch.c | 12 ++-
.../ethernet/mellanox/mlx5/core/eswitch_offloads.c | 102 ++++++++++++++++++++-
2 files changed, 105 insertions(+), 9 deletions(-)
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c
index 1fc4cfd..12f509c 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c
@@ -81,8 +81,8 @@ enum {
MC_ADDR_CHANGE | \
PROMISC_CHANGE)
-int esw_create_offloads_fdb_table(struct mlx5_eswitch *esw, int nvports);
-void esw_destroy_offloads_fdb_table(struct mlx5_eswitch *esw);
+int esw_offloads_init(struct mlx5_eswitch *esw, int nvports);
+void esw_offloads_cleanup(struct mlx5_eswitch *esw, int nvports);
static int arm_vport_context_events_cmd(struct mlx5_core_dev *dev, u16 vport,
u32 events_mask)
@@ -1561,7 +1561,7 @@ int mlx5_eswitch_enable_sriov(struct mlx5_eswitch *esw, int nvfs, int mode)
if (mode == SRIOV_LEGACY)
err = esw_create_legacy_fdb_table(esw, nvfs + 1);
else
- err = esw_create_offloads_fdb_table(esw, nvfs + 1);
+ err = esw_offloads_init(esw, nvfs + 1);
if (err)
goto abort;
@@ -1581,6 +1581,7 @@ abort:
void mlx5_eswitch_disable_sriov(struct mlx5_eswitch *esw)
{
struct esw_mc_addr *mc_promisc;
+ int nvports;
int i;
if (!esw || !MLX5_CAP_GEN(esw->dev, vport_group_manager) ||
@@ -1591,6 +1592,7 @@ void mlx5_eswitch_disable_sriov(struct mlx5_eswitch *esw)
esw->enabled_vports, esw->mode);
mc_promisc = esw->mc_promisc;
+ nvports = esw->enabled_vports;
for (i = 0; i < esw->total_vports; i++)
esw_disable_vport(esw, i);
@@ -1600,8 +1602,8 @@ void mlx5_eswitch_disable_sriov(struct mlx5_eswitch *esw)
if (esw->mode == SRIOV_LEGACY)
esw_destroy_legacy_fdb_table(esw);
- else
- esw_destroy_offloads_fdb_table(esw);
+ else if (esw->mode == SRIOV_OFFLOADS)
+ esw_offloads_cleanup(esw, nvports);
esw->mode = SRIOV_NONE;
/* VPORT 0 (PF) must be enabled back with non-sriov configuration */
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
index 3b3afbd..a39af6b 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
@@ -114,7 +114,7 @@ out:
#define MAX_PF_SQ 256
-int esw_create_offloads_fdb_table(struct mlx5_eswitch *esw, int nvports)
+static int esw_create_offloads_fdb_table(struct mlx5_eswitch *esw, int nvports)
{
int inlen = MLX5_ST_SZ_BYTES(create_flow_group_in);
struct mlx5_core_dev *dev = esw->dev;
@@ -202,7 +202,7 @@ ns_err:
return err;
}
-void esw_destroy_offloads_fdb_table(struct mlx5_eswitch *esw)
+static void esw_destroy_offloads_fdb_table(struct mlx5_eswitch *esw)
{
if (!esw->fdb_table.fdb)
return;
@@ -331,12 +331,106 @@ out:
return flow_rule;
}
+static int esw_offloads_start(struct mlx5_eswitch *esw)
+{
+ int err, num_vfs = esw->dev->priv.sriov.num_vfs;
+
+ if (esw->mode != SRIOV_LEGACY) {
+ esw_warn(esw->dev, "Can't set offloads mode, SRIOV legacy not enabled\n");
+ return -EINVAL;
+ }
+
+ mlx5_eswitch_disable_sriov(esw);
+ err = mlx5_eswitch_enable_sriov(esw, num_vfs, SRIOV_OFFLOADS);
+ if (err)
+ esw_warn(esw->dev, "Failed set eswitch to offloads, err %d\n", err);
+ return err;
+}
+
+int esw_offloads_init(struct mlx5_eswitch *esw, int nvports)
+{
+ int err;
+
+ err = esw_create_offloads_fdb_table(esw, nvports);
+ if (err)
+ return err;
+
+ err = esw_create_offloads_table(esw);
+ if (err)
+ goto create_ft_err;
+
+ err = esw_create_vport_rx_group(esw);
+ if (err)
+ goto create_fg_err;
+
+ return 0;
+
+create_fg_err:
+ esw_destroy_offloads_table(esw);
+
+create_ft_err:
+ esw_destroy_offloads_fdb_table(esw);
+ return err;
+}
+
+static int esw_offloads_stop(struct mlx5_eswitch *esw)
+{
+ int err, num_vfs = esw->dev->priv.sriov.num_vfs;
+
+ mlx5_eswitch_disable_sriov(esw);
+ err = mlx5_eswitch_enable_sriov(esw, num_vfs, SRIOV_LEGACY);
+ if (err)
+ esw_warn(esw->dev, "Failed set eswitch legacy mode. err %d\n", err);
+
+ return err;
+}
+
+void esw_offloads_cleanup(struct mlx5_eswitch *esw, int nvports)
+{
+ esw_destroy_vport_rx_group(esw);
+ esw_destroy_offloads_table(esw);
+ esw_destroy_offloads_fdb_table(esw);
+}
+
int mlx5_devlink_eswitch_mode_set(struct devlink *devlink, u16 mode)
{
- return -EOPNOTSUPP;
+ struct mlx5_core_dev *dev;
+ u16 cur_mode;
+
+ dev = devlink_priv(devlink);
+
+ if (!MLX5_CAP_GEN(dev, vport_group_manager))
+ return -EOPNOTSUPP;
+
+ cur_mode = dev->priv.eswitch->mode;
+
+ if (cur_mode == SRIOV_NONE || mode == SRIOV_NONE)
+ return -EOPNOTSUPP;
+
+ if (cur_mode == mode)
+ return 0;
+
+ if (mode == SRIOV_OFFLOADS) /* current mode is legacy */
+ return esw_offloads_start(dev->priv.eswitch);
+ else if (mode == SRIOV_LEGACY) /* curreny mode is offloads */
+ return esw_offloads_stop(dev->priv.eswitch);
+ else
+ return -EINVAL;
}
int mlx5_devlink_eswitch_mode_get(struct devlink *devlink, u16 *mode)
{
- return -EOPNOTSUPP;
+ struct mlx5_core_dev *dev;
+
+ dev = devlink_priv(devlink);
+
+ if (!MLX5_CAP_GEN(dev, vport_group_manager))
+ return -EOPNOTSUPP;
+
+ if (dev->priv.eswitch->mode == SRIOV_NONE)
+ return -EOPNOTSUPP;
+
+ *mode = dev->priv.eswitch->mode;
+
+ return 0;
}
--
2.8.0
^ permalink raw reply related [flat|nested] 47+ messages in thread
* Re: [PATCH net-next 10/16] net/mlx5e: Add devlink based SRIOV mode changes (legacy --> offloads)
2016-06-27 16:07 ` [PATCH net-next 10/16] net/mlx5e: Add devlink based SRIOV mode changes (legacy --> offloads) Saeed Mahameed
@ 2016-06-28 13:42 ` Andy Gospodarek
2016-06-28 14:25 ` Or Gerlitz
0 siblings, 1 reply; 47+ messages in thread
From: Andy Gospodarek @ 2016-06-28 13:42 UTC (permalink / raw)
To: Saeed Mahameed
Cc: David S. Miller, netdev, Or Gerlitz, Hadar Hen-Zion, Jiri Pirko,
Jesse Brandeburg, John Fastabend
On Mon, Jun 27, 2016 at 07:07:23PM +0300, Saeed Mahameed wrote:
> From: Or Gerlitz <ogerlitz@mellanox.com>
>
> Implement handlers for the devlink commands to get and set the SRIOV
> E-Switch mode.
>
> When turning to the offloads mode, we disable the e-switch and enable
> it again in the new mode, create the NIC offloads table and create VF reps.
>
> When turning to legacy mode, we remove the VF reps and the offloads
> table, and re-initiate the e-switch in it's legacy mode.
>
> The actual creation/removal of the VF reps is done in downstream patches.
>
> Signed-off-by: Or Gerlitz <ogerlitz@mellanox.com>
> Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
> ---
> drivers/net/ethernet/mellanox/mlx5/core/eswitch.c | 12 ++-
> .../ethernet/mellanox/mlx5/core/eswitch_offloads.c | 102 ++++++++++++++++++++-
> 2 files changed, 105 insertions(+), 9 deletions(-)
>
[...]
> diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
> index 3b3afbd..a39af6b 100644
> --- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
> +++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
[...]
> int mlx5_devlink_eswitch_mode_set(struct devlink *devlink, u16 mode)
> {
> - return -EOPNOTSUPP;
> + struct mlx5_core_dev *dev;
> + u16 cur_mode;
> +
> + dev = devlink_priv(devlink);
> +
> + if (!MLX5_CAP_GEN(dev, vport_group_manager))
> + return -EOPNOTSUPP;
> +
> + cur_mode = dev->priv.eswitch->mode;
> +
> + if (cur_mode == SRIOV_NONE || mode == SRIOV_NONE)
> + return -EOPNOTSUPP;
> +
> + if (cur_mode == mode)
> + return 0;
> +
> + if (mode == SRIOV_OFFLOADS) /* current mode is legacy */
> + return esw_offloads_start(dev->priv.eswitch);
> + else if (mode == SRIOV_LEGACY) /* curreny mode is offloads */
> + return esw_offloads_stop(dev->priv.eswitch);
> + else
> + return -EINVAL;
> }
>
> int mlx5_devlink_eswitch_mode_get(struct devlink *devlink, u16 *mode)
> {
> - return -EOPNOTSUPP;
> + struct mlx5_core_dev *dev;
> +
> + dev = devlink_priv(devlink);
> +
> + if (!MLX5_CAP_GEN(dev, vport_group_manager))
> + return -EOPNOTSUPP;
> +
> + if (dev->priv.eswitch->mode == SRIOV_NONE)
> + return -EOPNOTSUPP;
> +
> + *mode = dev->priv.eswitch->mode;
> +
> + return 0;
> }
This is an _extremely_ minor nit, but I only bring it up since you are
leading the way here and your model may be one that other people
follow...
Internally you have an enum to track the SRIOV modes:
enum {
SRIOV_NONE,
SRIOV_LEGACY,
SRIOV_OFFLOADS
};
But patch 8 adds a new enum for devlink to track this as well.
enum devlink_eswitch_mode {
DEVLINK_ESWITCH_MODE_NONE,
DEVLINK_ESWITCH_MODE_LEGACY,
DEVLINK_ESWITCH_MODE_OFFLOADS,
};
Would it make sense at some point to use the devlink modes in the driver
so it's less to track?
Again, this is an extremely _minor_ concern. The rest of the set looks
great and I like the architectural decisions made here. Awesome work
all around!
^ permalink raw reply [flat|nested] 47+ messages in thread
* Re: [PATCH net-next 10/16] net/mlx5e: Add devlink based SRIOV mode changes (legacy --> offloads)
2016-06-28 13:42 ` Andy Gospodarek
@ 2016-06-28 14:25 ` Or Gerlitz
2016-06-28 14:49 ` Andy Gospodarek
0 siblings, 1 reply; 47+ messages in thread
From: Or Gerlitz @ 2016-06-28 14:25 UTC (permalink / raw)
To: Andy Gospodarek, Saeed Mahameed
Cc: David S. Miller, netdev, Hadar Hen-Zion, Jiri Pirko,
Jesse Brandeburg, John Fastabend
On 6/28/2016 4:42 PM, Andy Gospodarek wrote:
> On Mon, Jun 27, 2016 at 07:07:23PM +0300, Saeed Mahameed wrote:
>> From: Or Gerlitz <ogerlitz@mellanox.com>
>>
>> Implement handlers for the devlink commands to get and set the SRIOV
>> E-Switch mode.
>>
>> When turning to the offloads mode, we disable the e-switch and enable
>> it again in the new mode, create the NIC offloads table and create VF reps.
>>
>> When turning to legacy mode, we remove the VF reps and the offloads
>> table, and re-initiate the e-switch in it's legacy mode.
>>
>> The actual creation/removal of the VF reps is done in downstream patches.
>>
>> Signed-off-by: Or Gerlitz <ogerlitz@mellanox.com>
>> Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
>> ---
>> drivers/net/ethernet/mellanox/mlx5/core/eswitch.c | 12 ++-
>> .../ethernet/mellanox/mlx5/core/eswitch_offloads.c | 102 ++++++++++++++++++++-
>> 2 files changed, 105 insertions(+), 9 deletions(-)
>>
> [...]
>> diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
>> index 3b3afbd..a39af6b 100644
>> --- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
>> +++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
> [...]
>> int mlx5_devlink_eswitch_mode_set(struct devlink *devlink, u16 mode)
>> {
>> - return -EOPNOTSUPP;
>> + struct mlx5_core_dev *dev;
>> + u16 cur_mode;
>> +
>> + dev = devlink_priv(devlink);
>> +
>> + if (!MLX5_CAP_GEN(dev, vport_group_manager))
>> + return -EOPNOTSUPP;
>> +
>> + cur_mode = dev->priv.eswitch->mode;
>> +
>> + if (cur_mode == SRIOV_NONE || mode == SRIOV_NONE)
>> + return -EOPNOTSUPP;
>> +
>> + if (cur_mode == mode)
>> + return 0;
>> +
>> + if (mode == SRIOV_OFFLOADS) /* current mode is legacy */
>> + return esw_offloads_start(dev->priv.eswitch);
>> + else if (mode == SRIOV_LEGACY) /* curreny mode is offloads */
>> + return esw_offloads_stop(dev->priv.eswitch);
>> + else
>> + return -EINVAL;
>> }
>>
>>
>> This is an _extremely_ minor nit, but I only bring it up since you are
>> leading the way here and your model may be one that other people
>> follow...
>>
>> Internally you have a enum to track the SRIOV modes:
>>
>> enum {
>> SRIOV_NONE,
>> SRIOV_LEGACY,
>> SRIOV_OFFLOADS
>> };
>>
>> But patch 8 adds a new enum for devlink to track this as well.
>>
>> enum devlink_eswitch_mode {
>> DEVLINK_ESWITCH_MODE_NONE,
>> DEVLINK_ESWITCH_MODE_LEGACY,
>> DEVLINK_ESWITCH_MODE_OFFLOADS,
>> };
>>
Andy,
In mlx5 we have an eswitch driver instance also when not in sriov
mode; in that case the mlx5 eswitch mode is called sriov_none,
which is maybe not a very successful name, I'll look into that.
On the devlink/system level, the eswitch modes are relevant only for
SRIOV, you can see in the mlx5 set function that we return error when in
the none mode or asked to go there.
So... with your comment, I realize now that I forgot to remove
DEVLINK_ESWITCH_MODE_NONE value from the submission.
> Would it make sense at some point to use the devlink modes in the driver
> so it's less to track?
This makes it a bit problematic for mlx5 to use the
DEVLINK_ESWITCH_MODE_YYY values internally.
> Again, this is an extremely _minor_ concern. The rest of the set looks
> great and I like the architectural decisions made here. Awesome work
> all around!
^ permalink raw reply [flat|nested] 47+ messages in thread
* Re: [PATCH net-next 10/16] net/mlx5e: Add devlink based SRIOV mode changes (legacy --> offloads)
2016-06-28 14:25 ` Or Gerlitz
@ 2016-06-28 14:49 ` Andy Gospodarek
0 siblings, 0 replies; 47+ messages in thread
From: Andy Gospodarek @ 2016-06-28 14:49 UTC (permalink / raw)
To: Or Gerlitz
Cc: Saeed Mahameed, David S. Miller, netdev, Hadar Hen-Zion,
Jiri Pirko, Jesse Brandeburg, John Fastabend
On Tue, Jun 28, 2016 at 05:25:11PM +0300, Or Gerlitz wrote:
> On 6/28/2016 4:42 PM, Andy Gospodarek wrote:
> >On Mon, Jun 27, 2016 at 07:07:23PM +0300, Saeed Mahameed wrote:
> >>From: Or Gerlitz <ogerlitz@mellanox.com>
> >>
> >>Implement handlers for the devlink commands to get and set the SRIOV
> >>E-Switch mode.
> >>
> >>When turning to the offloads mode, we disable the e-switch and enable
> >>it again in the new mode, create the NIC offloads table and create VF reps.
> >>
> >>When turning to legacy mode, we remove the VF reps and the offloads
> >>table, and re-initiate the e-switch in it's legacy mode.
> >>
> >>The actual creation/removal of the VF reps is done in downstream patches.
> >>
> >>Signed-off-by: Or Gerlitz <ogerlitz@mellanox.com>
> >>Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
> >>---
> >> drivers/net/ethernet/mellanox/mlx5/core/eswitch.c | 12 ++-
> >> .../ethernet/mellanox/mlx5/core/eswitch_offloads.c | 102 ++++++++++++++++++++-
> >> 2 files changed, 105 insertions(+), 9 deletions(-)
> >>
> >[...]
> >>diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
> >>index 3b3afbd..a39af6b 100644
> >>--- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
> >>+++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
> >[...]
> >> int mlx5_devlink_eswitch_mode_set(struct devlink *devlink, u16 mode)
> >> {
> >>- return -EOPNOTSUPP;
> >>+ struct mlx5_core_dev *dev;
> >>+ u16 cur_mode;
> >>+
> >>+ dev = devlink_priv(devlink);
> >>+
> >>+ if (!MLX5_CAP_GEN(dev, vport_group_manager))
> >>+ return -EOPNOTSUPP;
> >>+
> >>+ cur_mode = dev->priv.eswitch->mode;
> >>+
> >>+ if (cur_mode == SRIOV_NONE || mode == SRIOV_NONE)
> >>+ return -EOPNOTSUPP;
> >>+
> >>+ if (cur_mode == mode)
> >>+ return 0;
> >>+
> >>+ if (mode == SRIOV_OFFLOADS) /* current mode is legacy */
> >>+ return esw_offloads_start(dev->priv.eswitch);
> >>+ else if (mode == SRIOV_LEGACY) /* curreny mode is offloads */
> >>+ return esw_offloads_stop(dev->priv.eswitch);
> >>+ else
> >>+ return -EINVAL;
> >> }
> >>
> >>This is an _extremely_ minor nit, but I only bring it up since you are
> >>leading the way here and your model may be one that other people
> >>follow...
> >>
> >>Internally you have a enum to track the SRIOV modes:
> >>
> >>enum {
> >> SRIOV_NONE,
> >> SRIOV_LEGACY,
> >> SRIOV_OFFLOADS
> >>};
> >>
> >>But patch 8 adds a new enum for devlink to track this as well.
> >>
> >>enum devlink_eswitch_mode {
> >> DEVLINK_ESWITCH_MODE_NONE,
> >> DEVLINK_ESWITCH_MODE_LEGACY,
> >> DEVLINK_ESWITCH_MODE_OFFLOADS,
> >>};
> >>
>
>
> Andy,
>
> In mlx5 we're having an eswitch driver instance also when not in sriov mode
> where on that case the mlx5 eswitch mode is called sriov_none, which is
> maybe not a very successful name, I'll look on that.
>
> On the devlink/system level, the eswitch modes are relevant only for SRIOV,
> you can see in the mlx5 set function that we return error when in the none
> mode or asked to go there.
>
> So... with your comment, I realize now that I forgot to remove
> DEVLINK_ESWITCH_MODE_NONE value from the submission.
>
> >Would it make sense at some point to use the devlink modes in the driver
> >so it's less to track?
>
> This makes it a bit problematic for mlx5 to use the DEVLINK_ESWITCH_MODE_YYY
> values internally.
If you planned to remove DEVLINK_ESWITCH_MODE_NONE then I could see how
using these in mlx5 would be problematic. Thinking about it for just a
minute, I can see the value in dropping DEVLINK_ESWITCH_MODE_NONE. If the
driver supports the ability to set the eswitch mode, then it should
report an actual mode other than none.
If you remove DEVLINK_ESWITCH_MODE_NONE, then obviously
mlx5_devlink_eswitch_mode_set/get will need to change a bit as well as
there will need to be a mapping between the two values since the enums
would no longer be the same. Easy fix, though. :-)
> >Again, this is an extremely _minor_ concern. The rest of the set looks
> >great and I like the architectural decisions made here. Awesome work
> >all around!
^ permalink raw reply [flat|nested] 47+ messages in thread
* [PATCH net-next 11/16] net/mlx5e: Create NIC global resources only once
2016-06-27 16:07 [PATCH net-next 00/16] Mellanox 100G SRIOV E-Switch offload and VF representors Saeed Mahameed
` (9 preceding siblings ...)
2016-06-27 16:07 ` [PATCH net-next 10/16] net/mlx5e: Add devlink based SRIOV mode changes (legacy --> offloads) Saeed Mahameed
@ 2016-06-27 16:07 ` Saeed Mahameed
2016-06-27 16:07 ` [PATCH net-next 12/16] net/mlx5e: TIRs management refactoring Saeed Mahameed
` (4 subsequent siblings)
15 siblings, 0 replies; 47+ messages in thread
From: Saeed Mahameed @ 2016-06-27 16:07 UTC (permalink / raw)
To: David S. Miller
Cc: netdev, Or Gerlitz, Hadar Hen-Zion, Jiri Pirko, Andy Gospodarek,
Jesse Brandeburg, John Fastabend, Saeed Mahameed
From: Hadar Hen Zion <hadarh@mellanox.com>
To allow creating more than one netdev over the same PCI function, we
change the driver such that global NIC resources are created once and
later be shared amongst all the mlx5e netdevs running over that port.
Move the CQ UAR, PD (pdn), Transport Domain (tdn), MKey resources from
being kept in the mlx5e priv part to a new resources structure
(mlx5e_resources) placed under the mlx5_core device.
This patch doesn't add any new functionality.
Signed-off-by: Hadar Hen Zion <hadarh@mellanox.com>
Reviewed-by: Or Gerlitz <ogerlitz@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
drivers/net/ethernet/mellanox/mlx5/core/Makefile | 6 +-
drivers/net/ethernet/mellanox/mlx5/core/en.h | 6 +-
.../net/ethernet/mellanox/mlx5/core/en_common.c | 112 +++++++++++++++++++
drivers/net/ethernet/mellanox/mlx5/core/en_main.c | 124 +++++++--------------
include/linux/mlx5/driver.h | 13 +++
5 files changed, 171 insertions(+), 90 deletions(-)
create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/en_common.c
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/Makefile b/drivers/net/ethernet/mellanox/mlx5/core/Makefile
index 96f1826..9b14dad 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/Makefile
+++ b/drivers/net/ethernet/mellanox/mlx5/core/Makefile
@@ -6,8 +6,8 @@ mlx5_core-y := main.o cmd.o debugfs.o fw.o eq.o uar.o pagealloc.o \
fs_counters.o rl.o
mlx5_core-$(CONFIG_MLX5_CORE_EN) += wq.o eswitch.o eswitch_offloads.o \
- en_main.o en_fs.o en_ethtool.o en_tx.o en_rx.o \
- en_rx_am.o en_txrx.o en_clock.o vxlan.o en_tc.o \
- en_arfs.o
+ en_main.o en_common.o en_fs.o en_ethtool.o en_tx.o \
+ en_rx.o en_rx_am.o en_txrx.o en_clock.o vxlan.o \
+ en_tc.o en_arfs.o
mlx5_core-$(CONFIG_MLX5_CORE_EN_DCB) += en_dcbnl.o
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en.h b/drivers/net/ethernet/mellanox/mlx5/core/en.h
index da885c0..da93bf55 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en.h
@@ -570,10 +570,6 @@ struct mlx5e_priv {
unsigned long state;
struct mutex state_lock; /* Protects Interface state */
- struct mlx5_uar cq_uar;
- u32 pdn;
- u32 tdn;
- struct mlx5_core_mkey mkey;
struct mlx5_core_mkey umr_mkey;
struct mlx5e_rq drop_rq;
@@ -788,5 +784,7 @@ int mlx5e_rx_flow_steer(struct net_device *dev, const struct sk_buff *skb,
#endif
u16 mlx5e_get_max_inline_cap(struct mlx5_core_dev *mdev);
+int mlx5e_create_mdev_resources(struct mlx5_core_dev *mdev);
+void mlx5e_destroy_mdev_resources(struct mlx5_core_dev *mdev);
#endif /* __MLX5_EN_H__ */
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_common.c b/drivers/net/ethernet/mellanox/mlx5/core/en_common.c
new file mode 100644
index 0000000..33b3732
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_common.c
@@ -0,0 +1,112 @@
+/*
+ * Copyright (c) 2016, Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "en.h"
+
+/* mlx5e global resources should be placed in this file.
+ * Global resources are common to all the netdevices crated on the same nic.
+ */
+
+static int mlx5e_create_mkey(struct mlx5_core_dev *mdev, u32 pdn,
+ struct mlx5_core_mkey *mkey)
+{
+ struct mlx5_create_mkey_mbox_in *in;
+ int err;
+
+ in = mlx5_vzalloc(sizeof(*in));
+ if (!in)
+ return -ENOMEM;
+
+ in->seg.flags = MLX5_PERM_LOCAL_WRITE |
+ MLX5_PERM_LOCAL_READ |
+ MLX5_ACCESS_MODE_PA;
+ in->seg.flags_pd = cpu_to_be32(pdn | MLX5_MKEY_LEN64);
+ in->seg.qpn_mkey7_0 = cpu_to_be32(0xffffff << 8);
+
+ err = mlx5_core_create_mkey(mdev, mkey, in, sizeof(*in), NULL, NULL,
+ NULL);
+
+ kvfree(in);
+
+ return err;
+}
+
+int mlx5e_create_mdev_resources(struct mlx5_core_dev *mdev)
+{
+ struct mlx5e_resources *res = &mdev->mlx5e_res;
+ int err;
+
+ err = mlx5_alloc_map_uar(mdev, &res->cq_uar, false);
+ if (err) {
+ mlx5_core_err(mdev, "alloc_map uar failed, %d\n", err);
+ return err;
+ }
+
+ err = mlx5_core_alloc_pd(mdev, &res->pdn);
+ if (err) {
+ mlx5_core_err(mdev, "alloc pd failed, %d\n", err);
+ goto err_unmap_free_uar;
+ }
+
+ err = mlx5_core_alloc_transport_domain(mdev, &res->td.tdn);
+ if (err) {
+ mlx5_core_err(mdev, "alloc td failed, %d\n", err);
+ goto err_dealloc_pd;
+ }
+
+ err = mlx5e_create_mkey(mdev, res->pdn, &res->mkey);
+ if (err) {
+ mlx5_core_err(mdev, "create mkey failed, %d\n", err);
+ goto err_dealloc_transport_domain;
+ }
+
+ return 0;
+
+err_dealloc_transport_domain:
+ mlx5_core_dealloc_transport_domain(mdev, res->td.tdn);
+err_dealloc_pd:
+ mlx5_core_dealloc_pd(mdev, res->pdn);
+err_unmap_free_uar:
+ mlx5_unmap_free_uar(mdev, &res->cq_uar);
+
+ return err;
+}
+
+void mlx5e_destroy_mdev_resources(struct mlx5_core_dev *mdev)
+{
+ struct mlx5e_resources *res = &mdev->mlx5e_res;
+
+ mlx5_core_destroy_mkey(mdev, &res->mkey);
+ mlx5_core_dealloc_transport_domain(mdev, res->td.tdn);
+ mlx5_core_dealloc_pd(mdev, res->pdn);
+ mlx5_unmap_free_uar(mdev, &res->cq_uar);
+}
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
index 02a0f17..bd3bd61 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
@@ -859,7 +859,7 @@ static int mlx5e_create_cq(struct mlx5e_channel *c,
mcq->comp = mlx5e_completion_event;
mcq->event = mlx5e_cq_error_event;
mcq->irqn = irqn;
- mcq->uar = &priv->cq_uar;
+ mcq->uar = &mdev->mlx5e_res.cq_uar;
for (i = 0; i < mlx5_cqwq_get_size(&cq->wq); i++) {
struct mlx5_cqe64 *cqe = mlx5_cqwq_get_wqe(&cq->wq, i);
@@ -1137,7 +1137,7 @@ static int mlx5e_open_channel(struct mlx5e_priv *priv, int ix,
c->cpu = cpu;
c->pdev = &priv->mdev->pdev->dev;
c->netdev = priv->netdev;
- c->mkey_be = cpu_to_be32(priv->mkey.key);
+ c->mkey_be = cpu_to_be32(priv->mdev->mlx5e_res.mkey.key);
c->num_tc = priv->params.num_tc;
if (priv->params.rx_am_enabled)
@@ -1253,7 +1253,7 @@ static void mlx5e_build_rq_param(struct mlx5e_priv *priv,
MLX5_SET(wq, wq, end_padding_mode, MLX5_WQ_END_PAD_MODE_ALIGN);
MLX5_SET(wq, wq, log_wq_stride, ilog2(sizeof(struct mlx5e_rx_wqe)));
MLX5_SET(wq, wq, log_wq_sz, priv->params.log_rq_size);
- MLX5_SET(wq, wq, pd, priv->pdn);
+ MLX5_SET(wq, wq, pd, priv->mdev->mlx5e_res.pdn);
MLX5_SET(rqc, rqc, counter_set_id, priv->q_counter);
param->wq.buf_numa_node = dev_to_node(&priv->mdev->pdev->dev);
@@ -1278,7 +1278,7 @@ static void mlx5e_build_sq_param_common(struct mlx5e_priv *priv,
void *wq = MLX5_ADDR_OF(sqc, sqc, wq);
MLX5_SET(wq, wq, log_wq_stride, ilog2(MLX5_SEND_WQE_BB));
- MLX5_SET(wq, wq, pd, priv->pdn);
+ MLX5_SET(wq, wq, pd, priv->mdev->mlx5e_res.pdn);
param->wq.buf_numa_node = dev_to_node(&priv->mdev->pdev->dev);
}
@@ -1300,7 +1300,7 @@ static void mlx5e_build_common_cq_param(struct mlx5e_priv *priv,
{
void *cqc = param->cqc;
- MLX5_SET(cqc, cqc, uar_page, priv->cq_uar.index);
+ MLX5_SET(cqc, cqc, uar_page, priv->mdev->mlx5e_res.cq_uar.index);
}
static void mlx5e_build_rx_cq_param(struct mlx5e_priv *priv,
@@ -1921,7 +1921,7 @@ static int mlx5e_create_drop_cq(struct mlx5e_priv *priv,
mcq->comp = mlx5e_completion_event;
mcq->event = mlx5e_cq_error_event;
mcq->irqn = irqn;
- mcq->uar = &priv->cq_uar;
+ mcq->uar = &mdev->mlx5e_res.cq_uar;
cq->priv = priv;
@@ -1987,7 +1987,7 @@ static int mlx5e_create_tis(struct mlx5e_priv *priv, int tc)
memset(in, 0, sizeof(in));
MLX5_SET(tisc, tisc, prio, tc << 1);
- MLX5_SET(tisc, tisc, transport_domain, priv->tdn);
+ MLX5_SET(tisc, tisc, transport_domain, mdev->mlx5e_res.td.tdn);
return mlx5_core_create_tis(mdev, in, sizeof(in), &priv->tisn[tc]);
}
@@ -2030,7 +2030,7 @@ static void mlx5e_build_indir_tir_ctx(struct mlx5e_priv *priv, u32 *tirc,
{
void *hfso = MLX5_ADDR_OF(tirc, tirc, rx_hash_field_selector_outer);
- MLX5_SET(tirc, tirc, transport_domain, priv->tdn);
+ MLX5_SET(tirc, tirc, transport_domain, priv->mdev->mlx5e_res.td.tdn);
#define MLX5_HASH_IP (MLX5_HASH_FIELD_SEL_SRC_IP |\
MLX5_HASH_FIELD_SEL_DST_IP)
@@ -2137,7 +2137,7 @@ static void mlx5e_build_indir_tir_ctx(struct mlx5e_priv *priv, u32 *tirc,
static void mlx5e_build_direct_tir_ctx(struct mlx5e_priv *priv, u32 *tirc,
u32 rqtn)
{
- MLX5_SET(tirc, tirc, transport_domain, priv->tdn);
+ MLX5_SET(tirc, tirc, transport_domain, priv->mdev->mlx5e_res.td.tdn);
mlx5e_build_tir_ctx_lro(tirc, priv);
@@ -3083,31 +3083,6 @@ static void mlx5e_build_netdev(struct net_device *netdev)
mlx5e_set_netdev_dev_addr(netdev);
}
-static int mlx5e_create_mkey(struct mlx5e_priv *priv, u32 pdn,
- struct mlx5_core_mkey *mkey)
-{
- struct mlx5_core_dev *mdev = priv->mdev;
- struct mlx5_create_mkey_mbox_in *in;
- int err;
-
- in = mlx5_vzalloc(sizeof(*in));
- if (!in)
- return -ENOMEM;
-
- in->seg.flags = MLX5_PERM_LOCAL_WRITE |
- MLX5_PERM_LOCAL_READ |
- MLX5_ACCESS_MODE_PA;
- in->seg.flags_pd = cpu_to_be32(pdn | MLX5_MKEY_LEN64);
- in->seg.qpn_mkey7_0 = cpu_to_be32(0xffffff << 8);
-
- err = mlx5_core_create_mkey(mdev, mkey, in, sizeof(*in), NULL, NULL,
- NULL);
-
- kvfree(in);
-
- return err;
-}
-
static void mlx5e_create_q_counter(struct mlx5e_priv *priv)
{
struct mlx5_core_dev *mdev = priv->mdev;
@@ -3150,7 +3125,7 @@ static int mlx5e_create_umr_mkey(struct mlx5e_priv *priv)
MLX5_ACCESS_MODE_MTT;
mkc->qpn_mkey7_0 = cpu_to_be32(0xffffff << 8);
- mkc->flags_pd = cpu_to_be32(priv->pdn);
+ mkc->flags_pd = cpu_to_be32(mdev->mlx5e_res.pdn);
mkc->len = cpu_to_be64(npages << PAGE_SHIFT);
mkc->xlt_oct_size = cpu_to_be32(mlx5e_get_mtt_octw(npages));
mkc->log2_page_size = PAGE_SHIFT;
@@ -3170,9 +3145,6 @@ static void *mlx5e_create_netdev(struct mlx5_core_dev *mdev)
int nch = mlx5e_get_max_num_channels(mdev);
int err;
- if (mlx5e_check_required_hca_cap(mdev))
- return NULL;
-
netdev = alloc_etherdev_mqs(sizeof(struct mlx5e_priv),
nch * MLX5E_MAX_NUM_TC,
nch);
@@ -3192,34 +3164,10 @@ static void *mlx5e_create_netdev(struct mlx5_core_dev *mdev)
if (!priv->wq)
goto err_free_netdev;
- err = mlx5_alloc_map_uar(mdev, &priv->cq_uar, false);
- if (err) {
- mlx5_core_err(mdev, "alloc_map uar failed, %d\n", err);
- goto err_destroy_wq;
- }
-
- err = mlx5_core_alloc_pd(mdev, &priv->pdn);
- if (err) {
- mlx5_core_err(mdev, "alloc pd failed, %d\n", err);
- goto err_unmap_free_uar;
- }
-
- err = mlx5_core_alloc_transport_domain(mdev, &priv->tdn);
- if (err) {
- mlx5_core_err(mdev, "alloc td failed, %d\n", err);
- goto err_dealloc_pd;
- }
-
- err = mlx5e_create_mkey(priv, priv->pdn, &priv->mkey);
- if (err) {
- mlx5_core_err(mdev, "create mkey failed, %d\n", err);
- goto err_dealloc_transport_domain;
- }
-
err = mlx5e_create_umr_mkey(priv);
if (err) {
mlx5_core_err(mdev, "create umr mkey failed, %d\n", err);
- goto err_destroy_mkey;
+ goto err_destroy_wq;
}
err = mlx5e_create_tises(priv);
@@ -3305,18 +3253,6 @@ err_destroy_tises:
err_destroy_umr_mkey:
mlx5_core_destroy_mkey(mdev, &priv->umr_mkey);
-err_destroy_mkey:
- mlx5_core_destroy_mkey(mdev, &priv->mkey);
-
-err_dealloc_transport_domain:
- mlx5_core_dealloc_transport_domain(mdev, priv->tdn);
-
-err_dealloc_pd:
- mlx5_core_dealloc_pd(mdev, priv->pdn);
-
-err_unmap_free_uar:
- mlx5_unmap_free_uar(mdev, &priv->cq_uar);
-
err_destroy_wq:
destroy_workqueue(priv->wq);
@@ -3326,9 +3262,27 @@ err_free_netdev:
return NULL;
}
-static void mlx5e_destroy_netdev(struct mlx5_core_dev *mdev, void *vpriv)
+static void *mlx5e_add(struct mlx5_core_dev *mdev)
+{
+ void *ret;
+
+ if (mlx5e_check_required_hca_cap(mdev))
+ return NULL;
+
+ if (mlx5e_create_mdev_resources(mdev))
+ return NULL;
+
+ ret = mlx5e_create_netdev(mdev);
+ if (!ret) {
+ mlx5e_destroy_mdev_resources(mdev);
+ return NULL;
+ }
+ return ret;
+}
+
+static void mlx5e_destroy_netdev(struct mlx5_core_dev *mdev,
+ struct mlx5e_priv *priv)
{
- struct mlx5e_priv *priv = vpriv;
struct net_device *netdev = priv->netdev;
set_bit(MLX5E_STATE_DESTROYING, &priv->state);
@@ -3352,10 +3306,6 @@ static void mlx5e_destroy_netdev(struct mlx5_core_dev *mdev, void *vpriv)
mlx5e_close_drop_rq(priv);
mlx5e_destroy_tises(priv);
mlx5_core_destroy_mkey(priv->mdev, &priv->umr_mkey);
- mlx5_core_destroy_mkey(priv->mdev, &priv->mkey);
- mlx5_core_dealloc_transport_domain(priv->mdev, priv->tdn);
- mlx5_core_dealloc_pd(priv->mdev, priv->pdn);
- mlx5_unmap_free_uar(priv->mdev, &priv->cq_uar);
cancel_delayed_work_sync(&priv->update_stats_work);
destroy_workqueue(priv->wq);
@@ -3363,6 +3313,14 @@ static void mlx5e_destroy_netdev(struct mlx5_core_dev *mdev, void *vpriv)
free_netdev(netdev);
}
+static void mlx5e_remove(struct mlx5_core_dev *mdev, void *vpriv)
+{
+ struct mlx5e_priv *priv = vpriv;
+
+ mlx5e_destroy_netdev(mdev, priv);
+ mlx5e_destroy_mdev_resources(mdev);
+}
+
static void *mlx5e_get_netdev(void *vpriv)
{
struct mlx5e_priv *priv = vpriv;
@@ -3371,8 +3329,8 @@ static void *mlx5e_get_netdev(void *vpriv)
}
static struct mlx5_interface mlx5e_interface = {
- .add = mlx5e_create_netdev,
- .remove = mlx5e_destroy_netdev,
+ .add = mlx5e_add,
+ .remove = mlx5e_remove,
.event = mlx5e_async_event,
.protocol = MLX5_INTERFACE_PROTOCOL_ETH,
.get_dev = mlx5e_get_netdev,
diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h
index 46260fd..e22b345 100644
--- a/include/linux/mlx5/driver.h
+++ b/include/linux/mlx5/driver.h
@@ -578,6 +578,18 @@ enum mlx5_pci_status {
MLX5_PCI_STATUS_ENABLED,
};
+struct mlx5_td {
+ struct list_head tirs_list;
+ u32 tdn;
+};
+
+struct mlx5e_resources {
+ struct mlx5_uar cq_uar;
+ u32 pdn;
+ struct mlx5_td td;
+ struct mlx5_core_mkey mkey;
+};
+
struct mlx5_core_dev {
struct pci_dev *pdev;
/* sync pci state */
@@ -602,6 +614,7 @@ struct mlx5_core_dev {
struct mlx5_profile *profile;
atomic_t num_qps;
u32 issi;
+ struct mlx5e_resources mlx5e_res;
#ifdef CONFIG_RFS_ACCEL
struct cpu_rmap *rmap;
#endif
--
2.8.0
^ permalink raw reply related [flat|nested] 47+ messages in thread
* [PATCH net-next 12/16] net/mlx5e: TIRs management refactoring
2016-06-27 16:07 [PATCH net-next 00/16] Mellanox 100G SRIOV E-Switch offload and VF representors Saeed Mahameed
` (10 preceding siblings ...)
2016-06-27 16:07 ` [PATCH net-next 11/16] net/mlx5e: Create NIC global resources only once Saeed Mahameed
@ 2016-06-27 16:07 ` Saeed Mahameed
2016-06-27 16:07 ` [PATCH net-next 13/16] net/mlx5e: Mark enabled RQTs instances explicitly Saeed Mahameed
` (3 subsequent siblings)
15 siblings, 0 replies; 47+ messages in thread
From: Saeed Mahameed @ 2016-06-27 16:07 UTC (permalink / raw)
To: David S. Miller
Cc: netdev, Or Gerlitz, Hadar Hen-Zion, Jiri Pirko, Andy Gospodarek,
Jesse Brandeburg, John Fastabend, Saeed Mahameed
From: Hadar Hen Zion <hadarh@mellanox.com>
The current refresh tirs self loopback mechanism refreshes all the tirs
belonging to the same mlx5e instance to prevent self loopback by packets
sent over any ring of that instance. This mechanism relies on all the
tirs/tises of an instance to be created with the same transport domain
number (tdn).
Change the driver to refresh all the tirs created under the same tdn
regardless of which mlx5e netdev instance they belong to.
This behaviour is needed for introducing new mlx5e instances which serve
to represent SRIOV VFs. The representors and the PF share vport used for
E-Switch management, and we want to avoid NIC level HW loopback between
them, e.g. when sending broadcast packets. To achieve that, both the
representors and the PF NIC will share the tdn.
This patch doesn't add any new functionality.
Signed-off-by: Hadar Hen Zion <hadarh@mellanox.com>
Reviewed-by: Or Gerlitz <ogerlitz@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
drivers/net/ethernet/mellanox/mlx5/core/en.h | 12 +++--
drivers/net/ethernet/mellanox/mlx5/core/en_arfs.c | 14 +++---
.../net/ethernet/mellanox/mlx5/core/en_common.c | 48 +++++++++++++++++++
.../net/ethernet/mellanox/mlx5/core/en_ethtool.c | 2 +-
drivers/net/ethernet/mellanox/mlx5/core/en_fs.c | 2 +-
drivers/net/ethernet/mellanox/mlx5/core/en_main.c | 56 +++++-----------------
6 files changed, 77 insertions(+), 57 deletions(-)
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en.h b/drivers/net/ethernet/mellanox/mlx5/core/en.h
index da93bf55..ded3f96 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en.h
@@ -552,9 +552,10 @@ struct mlx5e_flow_steering {
struct mlx5e_arfs_tables arfs;
};
-struct mlx5e_direct_tir {
+struct mlx5e_tir {
u32 tirn;
u32 rqtn;
+ struct list_head list;
};
enum {
@@ -576,8 +577,8 @@ struct mlx5e_priv {
struct mlx5e_channel **channel;
u32 tisn[MLX5E_MAX_NUM_TC];
u32 indir_rqtn;
- u32 indir_tirn[MLX5E_NUM_INDIR_TIRS];
- struct mlx5e_direct_tir direct_tir[MLX5E_MAX_NUM_CHANNELS];
+ struct mlx5e_tir indir_tir[MLX5E_NUM_INDIR_TIRS];
+ struct mlx5e_tir direct_tir[MLX5E_MAX_NUM_CHANNELS];
u32 tx_rates[MLX5E_MAX_NUM_SQS];
struct mlx5e_flow_steering fs;
@@ -784,7 +785,12 @@ int mlx5e_rx_flow_steer(struct net_device *dev, const struct sk_buff *skb,
#endif
u16 mlx5e_get_max_inline_cap(struct mlx5_core_dev *mdev);
+int mlx5e_create_tir(struct mlx5_core_dev *mdev,
+ struct mlx5e_tir *tir, u32 *in, int inlen);
+void mlx5e_destroy_tir(struct mlx5_core_dev *mdev,
+ struct mlx5e_tir *tir);
int mlx5e_create_mdev_resources(struct mlx5_core_dev *mdev);
void mlx5e_destroy_mdev_resources(struct mlx5_core_dev *mdev);
+int mlx5e_refresh_tirs_self_loopback_enable(struct mlx5_core_dev *mdev);
#endif /* __MLX5_EN_H__ */
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_arfs.c b/drivers/net/ethernet/mellanox/mlx5/core/en_arfs.c
index 3515e78..10f18d4 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_arfs.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_arfs.c
@@ -93,14 +93,14 @@ static enum mlx5e_traffic_types arfs_get_tt(enum arfs_type type)
static int arfs_disable(struct mlx5e_priv *priv)
{
struct mlx5_flow_destination dest;
- u32 *tirn = priv->indir_tirn;
+ struct mlx5e_tir *tir = priv->indir_tir;
int err = 0;
int tt;
int i;
dest.type = MLX5_FLOW_DESTINATION_TYPE_TIR;
for (i = 0; i < ARFS_NUM_TYPES; i++) {
- dest.tir_num = tirn[i];
+ dest.tir_num = tir[i].tirn;
tt = arfs_get_tt(i);
/* Modify ttc rules destination to bypass the aRFS tables*/
err = mlx5_modify_rule_destination(priv->fs.ttc.rules[tt],
@@ -176,7 +176,7 @@ static int arfs_add_default_rule(struct mlx5e_priv *priv,
struct arfs_table *arfs_t = &priv->fs.arfs.arfs_tables[type];
struct mlx5_flow_destination dest;
u8 match_criteria_enable = 0;
- u32 *tirn = priv->indir_tirn;
+ struct mlx5e_tir *tir = priv->indir_tir;
u32 *match_criteria;
u32 *match_value;
int err = 0;
@@ -192,16 +192,16 @@ static int arfs_add_default_rule(struct mlx5e_priv *priv,
dest.type = MLX5_FLOW_DESTINATION_TYPE_TIR;
switch (type) {
case ARFS_IPV4_TCP:
- dest.tir_num = tirn[MLX5E_TT_IPV4_TCP];
+ dest.tir_num = tir[MLX5E_TT_IPV4_TCP].tirn;
break;
case ARFS_IPV4_UDP:
- dest.tir_num = tirn[MLX5E_TT_IPV4_UDP];
+ dest.tir_num = tir[MLX5E_TT_IPV4_UDP].tirn;
break;
case ARFS_IPV6_TCP:
- dest.tir_num = tirn[MLX5E_TT_IPV6_TCP];
+ dest.tir_num = tir[MLX5E_TT_IPV6_TCP].tirn;
break;
case ARFS_IPV6_UDP:
- dest.tir_num = tirn[MLX5E_TT_IPV6_UDP];
+ dest.tir_num = tir[MLX5E_TT_IPV6_UDP].tirn;
break;
default:
err = -EINVAL;
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_common.c b/drivers/net/ethernet/mellanox/mlx5/core/en_common.c
index 33b3732..673043c 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_common.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_common.c
@@ -36,6 +36,27 @@
* Global resources are common to all the netdevices crated on the same nic.
*/
+int mlx5e_create_tir(struct mlx5_core_dev *mdev,
+ struct mlx5e_tir *tir, u32 *in, int inlen)
+{
+ int err;
+
+ err = mlx5_core_create_tir(mdev, in, inlen, &tir->tirn);
+ if (err)
+ return err;
+
+ list_add(&tir->list, &mdev->mlx5e_res.td.tirs_list);
+
+ return 0;
+}
+
+void mlx5e_destroy_tir(struct mlx5_core_dev *mdev,
+ struct mlx5e_tir *tir)
+{
+ mlx5_core_destroy_tir(mdev, tir->tirn);
+ list_del(&tir->list);
+}
+
static int mlx5e_create_mkey(struct mlx5_core_dev *mdev, u32 pdn,
struct mlx5_core_mkey *mkey)
{
@@ -89,6 +110,8 @@ int mlx5e_create_mdev_resources(struct mlx5_core_dev *mdev)
goto err_dealloc_transport_domain;
}
+ INIT_LIST_HEAD(&mdev->mlx5e_res.td.tirs_list);
+
return 0;
err_dealloc_transport_domain:
@@ -110,3 +133,28 @@ void mlx5e_destroy_mdev_resources(struct mlx5_core_dev *mdev)
mlx5_core_dealloc_pd(mdev, res->pdn);
mlx5_unmap_free_uar(mdev, &res->cq_uar);
}
+
+/* Send MODIFY_TIR (bitmask.self_lb_en set) to every TIR on the mdev list. */
+int mlx5e_refresh_tirs_self_loopback_enable(struct mlx5_core_dev *mdev)
+{
+	int inlen = MLX5_ST_SZ_BYTES(modify_tir_in);
+	struct mlx5e_tir *tir;
+	int err = 0;
+	void *in;
+
+	in = mlx5_vzalloc(inlen);
+	if (!in)
+		return -ENOMEM;
+
+	MLX5_SET(modify_tir_in, in, bitmask.self_lb_en, 1);
+
+	list_for_each_entry(tir, &mdev->mlx5e_res.td.tirs_list, list) {
+		err = mlx5_core_modify_tir(mdev, tir->tirn, in, inlen);
+		if (err)
+			break;
+	}
+
+	/* Free the command buffer on both the success and error paths. */
+	kvfree(in);
+	return err;
+}
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c b/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c
index 39a4d96..877cf68 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c
@@ -876,7 +876,7 @@ static void mlx5e_modify_tirs_hash(struct mlx5e_priv *priv, void *in, int inlen)
mlx5e_build_tir_ctx_hash(tirc, priv);
for (i = 0; i < MLX5E_NUM_INDIR_TIRS; i++)
- mlx5_core_modify_tir(mdev, priv->indir_tirn[i], in, inlen);
+ mlx5_core_modify_tir(mdev, priv->indir_tir[i].tirn, in, inlen);
}
static int mlx5e_set_rxfh(struct net_device *dev, const u32 *indir,
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_fs.c b/drivers/net/ethernet/mellanox/mlx5/core/en_fs.c
index b327400..606e69b 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_fs.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_fs.c
@@ -655,7 +655,7 @@ static int mlx5e_generate_ttc_table_rules(struct mlx5e_priv *priv)
if (tt == MLX5E_TT_ANY)
dest.tir_num = priv->direct_tir[0].tirn;
else
- dest.tir_num = priv->indir_tirn[tt];
+ dest.tir_num = priv->indir_tir[tt].tirn;
rules[tt] = mlx5e_generate_ttc_rule(priv, ft, &dest,
ttc_rules[tt].etype,
ttc_rules[tt].proto);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
index bd3bd61..808dff4 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
@@ -1662,7 +1662,7 @@ static int mlx5e_modify_tirs_lro(struct mlx5e_priv *priv)
mlx5e_build_tir_ctx_lro(tirc, priv);
for (tt = 0; tt < MLX5E_NUM_INDIR_TIRS; tt++) {
- err = mlx5_core_modify_tir(mdev, priv->indir_tirn[tt], in,
+ err = mlx5_core_modify_tir(mdev, priv->indir_tir[tt].tirn, in,
inlen);
if (err)
goto free_in;
@@ -1681,40 +1681,6 @@ free_in:
return err;
}
-static int mlx5e_refresh_tirs_self_loopback_enable(struct mlx5e_priv *priv)
-{
- void *in;
- int inlen;
- int err;
- int i;
-
- inlen = MLX5_ST_SZ_BYTES(modify_tir_in);
- in = mlx5_vzalloc(inlen);
- if (!in)
- return -ENOMEM;
-
- MLX5_SET(modify_tir_in, in, bitmask.self_lb_en, 1);
-
- for (i = 0; i < MLX5E_NUM_INDIR_TIRS; i++) {
- err = mlx5_core_modify_tir(priv->mdev, priv->indir_tirn[i], in,
- inlen);
- if (err)
- return err;
- }
-
- for (i = 0; i < priv->params.num_channels; i++) {
- err = mlx5_core_modify_tir(priv->mdev,
- priv->direct_tir[i].tirn, in,
- inlen);
- if (err)
- return err;
- }
-
- kvfree(in);
-
- return 0;
-}
-
static int mlx5e_set_mtu(struct mlx5e_priv *priv, u16 mtu)
{
struct mlx5_core_dev *mdev = priv->mdev;
@@ -1805,7 +1771,7 @@ int mlx5e_open_locked(struct net_device *netdev)
goto err_clear_state_opened_flag;
}
- err = mlx5e_refresh_tirs_self_loopback_enable(priv);
+ err = mlx5e_refresh_tirs_self_loopback_enable(priv->mdev);
if (err) {
netdev_err(netdev, "%s: mlx5e_refresh_tirs_self_loopback_enable failed, %d\n",
__func__, err);
@@ -2149,9 +2115,9 @@ static void mlx5e_build_direct_tir_ctx(struct mlx5e_priv *priv, u32 *tirc,
static int mlx5e_create_tirs(struct mlx5e_priv *priv)
{
int nch = mlx5e_get_max_num_channels(priv->mdev);
+ struct mlx5e_tir *tir;
void *tirc;
int inlen;
- u32 *tirn;
int err;
u32 *in;
int ix;
@@ -2165,10 +2131,10 @@ static int mlx5e_create_tirs(struct mlx5e_priv *priv)
/* indirect tirs */
for (tt = 0; tt < MLX5E_NUM_INDIR_TIRS; tt++) {
memset(in, 0, inlen);
- tirn = &priv->indir_tirn[tt];
+ tir = &priv->indir_tir[tt];
tirc = MLX5_ADDR_OF(create_tir_in, in, ctx);
mlx5e_build_indir_tir_ctx(priv, tirc, tt);
- err = mlx5_core_create_tir(priv->mdev, in, inlen, tirn);
+ err = mlx5e_create_tir(priv->mdev, tir, in, inlen);
if (err)
goto err_destroy_tirs;
}
@@ -2176,11 +2142,11 @@ static int mlx5e_create_tirs(struct mlx5e_priv *priv)
/* direct tirs */
for (ix = 0; ix < nch; ix++) {
memset(in, 0, inlen);
- tirn = &priv->direct_tir[ix].tirn;
+ tir = &priv->direct_tir[ix];
tirc = MLX5_ADDR_OF(create_tir_in, in, ctx);
mlx5e_build_direct_tir_ctx(priv, tirc,
priv->direct_tir[ix].rqtn);
- err = mlx5_core_create_tir(priv->mdev, in, inlen, tirn);
+ err = mlx5e_create_tir(priv->mdev, tir, in, inlen);
if (err)
goto err_destroy_ch_tirs;
}
@@ -2191,11 +2157,11 @@ static int mlx5e_create_tirs(struct mlx5e_priv *priv)
err_destroy_ch_tirs:
for (ix--; ix >= 0; ix--)
- mlx5_core_destroy_tir(priv->mdev, priv->direct_tir[ix].tirn);
+ mlx5e_destroy_tir(priv->mdev, &priv->direct_tir[ix]);
err_destroy_tirs:
for (tt--; tt >= 0; tt--)
- mlx5_core_destroy_tir(priv->mdev, priv->indir_tirn[tt]);
+ mlx5e_destroy_tir(priv->mdev, &priv->indir_tir[tt]);
kvfree(in);
@@ -2208,10 +2174,10 @@ static void mlx5e_destroy_tirs(struct mlx5e_priv *priv)
int i;
for (i = 0; i < nch; i++)
- mlx5_core_destroy_tir(priv->mdev, priv->direct_tir[i].tirn);
+ mlx5e_destroy_tir(priv->mdev, &priv->direct_tir[i]);
for (i = 0; i < MLX5E_NUM_INDIR_TIRS; i++)
- mlx5_core_destroy_tir(priv->mdev, priv->indir_tirn[i]);
+ mlx5e_destroy_tir(priv->mdev, &priv->indir_tir[i]);
}
int mlx5e_modify_rqs_vsd(struct mlx5e_priv *priv, bool vsd)
--
2.8.0
^ permalink raw reply related [flat|nested] 47+ messages in thread
* [PATCH net-next 13/16] net/mlx5e: Mark enabled RQTs instances explicitly
2016-06-27 16:07 [PATCH net-next 00/16] Mellanox 100G SRIOV E-Switch offload and VF representors Saeed Mahameed
` (11 preceding siblings ...)
2016-06-27 16:07 ` [PATCH net-next 12/16] net/mlx5e: TIRs management refactoring Saeed Mahameed
@ 2016-06-27 16:07 ` Saeed Mahameed
2016-06-27 16:07 ` [PATCH net-next 14/16] net/mlx5e: Add support for multiple profiles Saeed Mahameed
` (2 subsequent siblings)
15 siblings, 0 replies; 47+ messages in thread
From: Saeed Mahameed @ 2016-06-27 16:07 UTC (permalink / raw)
To: David S. Miller
Cc: netdev, Or Gerlitz, Hadar Hen-Zion, Jiri Pirko, Andy Gospodarek,
Jesse Brandeburg, John Fastabend, Saeed Mahameed
From: Hadar Hen Zion <hadarh@mellanox.com>
In the current driver implementation two types of receive queue
tables (RQTs) are in use - direct and indirect.
Change the driver to mark each newly created RQT (direct or indirect)
as "enabled". This behaviour is needed for introducing new mlx5e
instances which serve to represent SRIOV VFs.
The VF representors will have only one type of RQTs (direct).
An "enabled" flag is added to each RQT to allow better handling
and code sharing between the representors and the nic netdevices.
This patch doesn't add any new functionality.
Signed-off-by: Hadar Hen Zion <hadarh@mellanox.com>
Reviewed-by: Or Gerlitz <ogerlitz@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
drivers/net/ethernet/mellanox/mlx5/core/en.h | 13 +++++--
.../net/ethernet/mellanox/mlx5/core/en_ethtool.c | 2 +-
drivers/net/ethernet/mellanox/mlx5/core/en_main.c | 45 +++++++++++++---------
3 files changed, 37 insertions(+), 23 deletions(-)
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en.h b/drivers/net/ethernet/mellanox/mlx5/core/en.h
index ded3f96..1843a4c 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en.h
@@ -552,10 +552,15 @@ struct mlx5e_flow_steering {
struct mlx5e_arfs_tables arfs;
};
-struct mlx5e_tir {
- u32 tirn;
+struct mlx5e_rqt {
u32 rqtn;
- struct list_head list;
+ bool enabled;
+};
+
+struct mlx5e_tir {
+ u32 tirn;
+ struct mlx5e_rqt rqt;
+ struct list_head list;
};
enum {
@@ -576,7 +581,7 @@ struct mlx5e_priv {
struct mlx5e_channel **channel;
u32 tisn[MLX5E_MAX_NUM_TC];
- u32 indir_rqtn;
+ struct mlx5e_rqt indir_rqt;
struct mlx5e_tir indir_tir[MLX5E_NUM_INDIR_TIRS];
struct mlx5e_tir direct_tir[MLX5E_MAX_NUM_CHANNELS];
u32 tx_rates[MLX5E_MAX_NUM_SQS];
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c b/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c
index 877cf68..7c5c477 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c
@@ -898,7 +898,7 @@ static int mlx5e_set_rxfh(struct net_device *dev, const u32 *indir,
mutex_lock(&priv->state_lock);
if (indir) {
- u32 rqtn = priv->indir_rqtn;
+ u32 rqtn = priv->indir_rqt.rqtn;
memcpy(priv->params.indirection_rqt, indir,
sizeof(priv->params.indirection_rqt));
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
index 808dff4..db890b2 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
@@ -1487,7 +1487,8 @@ static void mlx5e_fill_direct_rqt_rqn(struct mlx5e_priv *priv, void *rqtc,
MLX5_SET(rqtc, rqtc, rq_num[0], rqn);
}
-static int mlx5e_create_rqt(struct mlx5e_priv *priv, int sz, int ix, u32 *rqtn)
+static int mlx5e_create_rqt(struct mlx5e_priv *priv, int sz,
+ int ix, struct mlx5e_rqt *rqt)
{
struct mlx5_core_dev *mdev = priv->mdev;
void *rqtc;
@@ -1510,34 +1511,37 @@ static int mlx5e_create_rqt(struct mlx5e_priv *priv, int sz, int ix, u32 *rqtn)
else
mlx5e_fill_direct_rqt_rqn(priv, rqtc, ix);
- err = mlx5_core_create_rqt(mdev, in, inlen, rqtn);
+ err = mlx5_core_create_rqt(mdev, in, inlen, &rqt->rqtn);
+ if (!err)
+ rqt->enabled = true;
kvfree(in);
return err;
}
-static void mlx5e_destroy_rqt(struct mlx5e_priv *priv, u32 rqtn)
+static void mlx5e_destroy_rqt(struct mlx5e_priv *priv, struct mlx5e_rqt *rqt)
{
- mlx5_core_destroy_rqt(priv->mdev, rqtn);
+ rqt->enabled = false;
+ mlx5_core_destroy_rqt(priv->mdev, rqt->rqtn);
}
static int mlx5e_create_rqts(struct mlx5e_priv *priv)
{
int nch = mlx5e_get_max_num_channels(priv->mdev);
- u32 *rqtn;
+ struct mlx5e_rqt *rqt;
int err;
int ix;
/* Indirect RQT */
- rqtn = &priv->indir_rqtn;
- err = mlx5e_create_rqt(priv, MLX5E_INDIR_RQT_SIZE, 0, rqtn);
+ rqt = &priv->indir_rqt;
+ err = mlx5e_create_rqt(priv, MLX5E_INDIR_RQT_SIZE, 0, rqt);
if (err)
return err;
/* Direct RQTs */
for (ix = 0; ix < nch; ix++) {
- rqtn = &priv->direct_tir[ix].rqtn;
- err = mlx5e_create_rqt(priv, 1 /*size */, ix, rqtn);
+ rqt = &priv->direct_tir[ix].rqt;
+ err = mlx5e_create_rqt(priv, 1 /*size */, ix, rqt);
if (err)
goto err_destroy_rqts;
}
@@ -1546,9 +1550,9 @@ static int mlx5e_create_rqts(struct mlx5e_priv *priv)
err_destroy_rqts:
for (ix--; ix >= 0; ix--)
- mlx5e_destroy_rqt(priv, priv->direct_tir[ix].rqtn);
+ mlx5e_destroy_rqt(priv, &priv->direct_tir[ix].rqt);
- mlx5e_destroy_rqt(priv, priv->indir_rqtn);
+ mlx5e_destroy_rqt(priv, &priv->indir_rqt);
return err;
}
@@ -1559,9 +1563,9 @@ static void mlx5e_destroy_rqts(struct mlx5e_priv *priv)
int i;
for (i = 0; i < nch; i++)
- mlx5e_destroy_rqt(priv, priv->direct_tir[i].rqtn);
+ mlx5e_destroy_rqt(priv, &priv->direct_tir[i].rqt);
- mlx5e_destroy_rqt(priv, priv->indir_rqtn);
+ mlx5e_destroy_rqt(priv, &priv->indir_rqt);
}
int mlx5e_redirect_rqt(struct mlx5e_priv *priv, u32 rqtn, int sz, int ix)
@@ -1599,10 +1603,15 @@ static void mlx5e_redirect_rqts(struct mlx5e_priv *priv)
u32 rqtn;
int ix;
- rqtn = priv->indir_rqtn;
- mlx5e_redirect_rqt(priv, rqtn, MLX5E_INDIR_RQT_SIZE, 0);
+ if (priv->indir_rqt.enabled) {
+ rqtn = priv->indir_rqt.rqtn;
+ mlx5e_redirect_rqt(priv, rqtn, MLX5E_INDIR_RQT_SIZE, 0);
+ }
+
for (ix = 0; ix < priv->params.num_channels; ix++) {
- rqtn = priv->direct_tir[ix].rqtn;
+ if (!priv->direct_tir[ix].rqt.enabled)
+ continue;
+ rqtn = priv->direct_tir[ix].rqt.rqtn;
mlx5e_redirect_rqt(priv, rqtn, 1, ix);
}
}
@@ -2013,7 +2022,7 @@ static void mlx5e_build_indir_tir_ctx(struct mlx5e_priv *priv, u32 *tirc,
mlx5e_build_tir_ctx_lro(tirc, priv);
MLX5_SET(tirc, tirc, disp_type, MLX5_TIRC_DISP_TYPE_INDIRECT);
- MLX5_SET(tirc, tirc, indirect_table, priv->indir_rqtn);
+ MLX5_SET(tirc, tirc, indirect_table, priv->indir_rqt.rqtn);
mlx5e_build_tir_ctx_hash(tirc, priv);
switch (tt) {
@@ -2145,7 +2154,7 @@ static int mlx5e_create_tirs(struct mlx5e_priv *priv)
tir = &priv->direct_tir[ix];
tirc = MLX5_ADDR_OF(create_tir_in, in, ctx);
mlx5e_build_direct_tir_ctx(priv, tirc,
- priv->direct_tir[ix].rqtn);
+ priv->direct_tir[ix].rqt.rqtn);
err = mlx5e_create_tir(priv->mdev, tir, in, inlen);
if (err)
goto err_destroy_ch_tirs;
--
2.8.0
^ permalink raw reply related [flat|nested] 47+ messages in thread
* [PATCH net-next 14/16] net/mlx5e: Add support for multiple profiles
2016-06-27 16:07 [PATCH net-next 00/16] Mellanox 100G SRIOV E-Switch offload and VF representors Saeed Mahameed
` (12 preceding siblings ...)
2016-06-27 16:07 ` [PATCH net-next 13/16] net/mlx5e: Mark enabled RQTs instances explicitly Saeed Mahameed
@ 2016-06-27 16:07 ` Saeed Mahameed
2016-06-27 16:07 ` [PATCH net-next 15/16] net/mlx5: Add Representors registration API Saeed Mahameed
2016-06-27 16:07 ` [PATCH net-next 16/16] net/mlx5e: Introduce SRIOV VF representors Saeed Mahameed
15 siblings, 0 replies; 47+ messages in thread
From: Saeed Mahameed @ 2016-06-27 16:07 UTC (permalink / raw)
To: David S. Miller
Cc: netdev, Or Gerlitz, Hadar Hen-Zion, Jiri Pirko, Andy Gospodarek,
Jesse Brandeburg, John Fastabend, Saeed Mahameed
From: Hadar Hen Zion <hadarh@mellanox.com>
To allow support in representor netdevices where we create more than one
netdevice per NIC, add profiles to the mlx5e driver. The profiling
allows for creation of mlx5e instances with different characteristics.
Each profile implements its own behavior using a set of function pointers
defined in struct mlx5e_profile. This is done to allow for avoiding complex
per-profile branching in the code.
Currently only the profile for the conventional NIC is implemented,
which is of use when a netdev is created upon pci probe.
This patch doesn't add any new functionality.
Signed-off-by: Hadar Hen Zion <hadarh@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
drivers/net/ethernet/mellanox/mlx5/core/en.h | 17 ++
drivers/net/ethernet/mellanox/mlx5/core/en_main.c | 341 ++++++++++++++--------
2 files changed, 240 insertions(+), 118 deletions(-)
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en.h b/drivers/net/ethernet/mellanox/mlx5/core/en.h
index 1843a4c..8d4d2b2 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en.h
@@ -568,6 +568,22 @@ enum {
MLX5E_NIC_PRIO
};
+struct mlx5e_profile {
+ void (*init)(struct mlx5_core_dev *mdev,
+ struct net_device *netdev,
+ const struct mlx5e_profile *profile);
+ void (*cleanup)(struct mlx5e_priv *priv);
+ int (*init_rx)(struct mlx5e_priv *priv);
+ void (*cleanup_rx)(struct mlx5e_priv *priv);
+ int (*init_tx)(struct mlx5e_priv *priv);
+ void (*cleanup_tx)(struct mlx5e_priv *priv);
+ void (*enable)(struct mlx5e_priv *priv);
+ void (*disable)(struct mlx5e_priv *priv);
+ void (*update_stats)(struct mlx5e_priv *priv);
+ int (*max_nch)(struct mlx5_core_dev *mdev);
+ int max_tc;
+};
+
struct mlx5e_priv {
/* priv data path fields - start */
struct mlx5e_sq **txq_to_sq_map;
@@ -601,6 +617,7 @@ struct mlx5e_priv {
struct mlx5e_stats stats;
struct mlx5e_tstamp tstamp;
u16 q_counter;
+ const struct mlx5e_profile *profile;
};
enum mlx5e_link_mode {
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
index db890b2..8ffe68b 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
@@ -234,7 +234,7 @@ static void mlx5e_update_stats_work(struct work_struct *work)
update_stats_work);
mutex_lock(&priv->state_lock);
if (test_bit(MLX5E_STATE_OPENED, &priv->state)) {
- mlx5e_update_stats(priv);
+ priv->profile->update_stats(priv);
queue_delayed_work(priv->wq, dwork,
msecs_to_jiffies(MLX5E_UPDATE_STATS_INTERVAL));
}
@@ -1037,7 +1037,7 @@ static void mlx5e_build_channeltc_to_txq_map(struct mlx5e_priv *priv, int ix)
{
int i;
- for (i = 0; i < MLX5E_MAX_NUM_TC; i++)
+ for (i = 0; i < priv->profile->max_tc; i++)
priv->channeltc_to_txq_map[ix][i] =
ix + i * priv->params.num_channels;
}
@@ -1525,21 +1525,20 @@ static void mlx5e_destroy_rqt(struct mlx5e_priv *priv, struct mlx5e_rqt *rqt)
mlx5_core_destroy_rqt(priv->mdev, rqt->rqtn);
}
-static int mlx5e_create_rqts(struct mlx5e_priv *priv)
+static int mlx5e_create_indirect_rqts(struct mlx5e_priv *priv)
+{
+ struct mlx5e_rqt *rqt = &priv->indir_rqt;
+
+ return mlx5e_create_rqt(priv, MLX5E_INDIR_RQT_SIZE, 0, rqt);
+}
+
+static int mlx5e_create_direct_rqts(struct mlx5e_priv *priv)
{
- int nch = mlx5e_get_max_num_channels(priv->mdev);
struct mlx5e_rqt *rqt;
int err;
int ix;
- /* Indirect RQT */
- rqt = &priv->indir_rqt;
- err = mlx5e_create_rqt(priv, MLX5E_INDIR_RQT_SIZE, 0, rqt);
- if (err)
- return err;
-
- /* Direct RQTs */
- for (ix = 0; ix < nch; ix++) {
+ for (ix = 0; ix < priv->profile->max_nch(priv->mdev); ix++) {
rqt = &priv->direct_tir[ix].rqt;
err = mlx5e_create_rqt(priv, 1 /*size */, ix, rqt);
if (err)
@@ -1552,22 +1551,9 @@ err_destroy_rqts:
for (ix--; ix >= 0; ix--)
mlx5e_destroy_rqt(priv, &priv->direct_tir[ix].rqt);
- mlx5e_destroy_rqt(priv, &priv->indir_rqt);
-
return err;
}
-static void mlx5e_destroy_rqts(struct mlx5e_priv *priv)
-{
- int nch = mlx5e_get_max_num_channels(priv->mdev);
- int i;
-
- for (i = 0; i < nch; i++)
- mlx5e_destroy_rqt(priv, &priv->direct_tir[i].rqt);
-
- mlx5e_destroy_rqt(priv, &priv->indir_rqt);
-}
-
int mlx5e_redirect_rqt(struct mlx5e_priv *priv, u32 rqtn, int sz, int ix)
{
struct mlx5_core_dev *mdev = priv->mdev;
@@ -1677,7 +1663,7 @@ static int mlx5e_modify_tirs_lro(struct mlx5e_priv *priv)
goto free_in;
}
- for (ix = 0; ix < mlx5e_get_max_num_channels(mdev); ix++) {
+ for (ix = 0; ix < priv->profile->max_nch(priv->mdev); ix++) {
err = mlx5_core_modify_tir(mdev, priv->direct_tir[ix].tirn,
in, inlen);
if (err)
@@ -1977,7 +1963,7 @@ static int mlx5e_create_tises(struct mlx5e_priv *priv)
int err;
int tc;
- for (tc = 0; tc < MLX5E_MAX_NUM_TC; tc++) {
+ for (tc = 0; tc < priv->profile->max_tc; tc++) {
err = mlx5e_create_tis(priv, tc);
if (err)
goto err_close_tises;
@@ -1992,11 +1978,11 @@ err_close_tises:
return err;
}
-static void mlx5e_destroy_tises(struct mlx5e_priv *priv)
+static void mlx5e_cleanup_nic_tx(struct mlx5e_priv *priv)
{
int tc;
- for (tc = 0; tc < MLX5E_MAX_NUM_TC; tc++)
+ for (tc = 0; tc < priv->profile->max_tc; tc++)
mlx5e_destroy_tis(priv, tc);
}
@@ -2121,15 +2107,13 @@ static void mlx5e_build_direct_tir_ctx(struct mlx5e_priv *priv, u32 *tirc,
MLX5_SET(tirc, tirc, rx_hash_fn, MLX5_RX_HASH_FN_INVERTED_XOR8);
}
-static int mlx5e_create_tirs(struct mlx5e_priv *priv)
+static int mlx5e_create_indirect_tirs(struct mlx5e_priv *priv)
{
- int nch = mlx5e_get_max_num_channels(priv->mdev);
struct mlx5e_tir *tir;
void *tirc;
int inlen;
int err;
u32 *in;
- int ix;
int tt;
inlen = MLX5_ST_SZ_BYTES(create_tir_in);
@@ -2137,7 +2121,6 @@ static int mlx5e_create_tirs(struct mlx5e_priv *priv)
if (!in)
return -ENOMEM;
- /* indirect tirs */
for (tt = 0; tt < MLX5E_NUM_INDIR_TIRS; tt++) {
memset(in, 0, inlen);
tir = &priv->indir_tir[tt];
@@ -2148,7 +2131,34 @@ static int mlx5e_create_tirs(struct mlx5e_priv *priv)
goto err_destroy_tirs;
}
- /* direct tirs */
+ kvfree(in);
+
+ return 0;
+
+err_destroy_tirs:
+ for (tt--; tt >= 0; tt--)
+ mlx5e_destroy_tir(priv->mdev, &priv->indir_tir[tt]);
+
+ kvfree(in);
+
+ return err;
+}
+
+static int mlx5e_create_direct_tirs(struct mlx5e_priv *priv)
+{
+ int nch = priv->profile->max_nch(priv->mdev);
+ struct mlx5e_tir *tir;
+ void *tirc;
+ int inlen;
+ int err;
+ u32 *in;
+ int ix;
+
+ inlen = MLX5_ST_SZ_BYTES(create_tir_in);
+ in = mlx5_vzalloc(inlen);
+ if (!in)
+ return -ENOMEM;
+
for (ix = 0; ix < nch; ix++) {
memset(in, 0, inlen);
tir = &priv->direct_tir[ix];
@@ -2168,27 +2178,28 @@ err_destroy_ch_tirs:
for (ix--; ix >= 0; ix--)
mlx5e_destroy_tir(priv->mdev, &priv->direct_tir[ix]);
-err_destroy_tirs:
- for (tt--; tt >= 0; tt--)
- mlx5e_destroy_tir(priv->mdev, &priv->indir_tir[tt]);
-
kvfree(in);
return err;
}
-static void mlx5e_destroy_tirs(struct mlx5e_priv *priv)
+static void mlx5e_destroy_indirect_tirs(struct mlx5e_priv *priv)
{
- int nch = mlx5e_get_max_num_channels(priv->mdev);
int i;
- for (i = 0; i < nch; i++)
- mlx5e_destroy_tir(priv->mdev, &priv->direct_tir[i]);
-
for (i = 0; i < MLX5E_NUM_INDIR_TIRS; i++)
mlx5e_destroy_tir(priv->mdev, &priv->indir_tir[i]);
}
+static void mlx5e_destroy_direct_tirs(struct mlx5e_priv *priv)
+{
+ int nch = priv->profile->max_nch(priv->mdev);
+ int i;
+
+ for (i = 0; i < nch; i++)
+ mlx5e_destroy_tir(priv->mdev, &priv->direct_tir[i]);
+}
+
int mlx5e_modify_rqs_vsd(struct mlx5e_priv *priv, bool vsd)
{
int err = 0;
@@ -2868,9 +2879,9 @@ void mlx5e_set_rx_cq_mode_params(struct mlx5e_params *params, u8 cq_period_mode)
MLX5E_PARAMS_DEFAULT_RX_CQ_MODERATION_USEC_FROM_CQE;
}
-static void mlx5e_build_netdev_priv(struct mlx5_core_dev *mdev,
- struct net_device *netdev,
- int num_channels)
+static void mlx5e_build_nic_netdev_priv(struct mlx5_core_dev *mdev,
+ struct net_device *netdev,
+ const struct mlx5e_profile *profile)
{
struct mlx5e_priv *priv = netdev_priv(netdev);
u32 link_speed = 0;
@@ -2939,7 +2950,7 @@ static void mlx5e_build_netdev_priv(struct mlx5_core_dev *mdev,
sizeof(priv->params.toeplitz_hash_key));
mlx5e_build_default_indir_rqt(mdev, priv->params.indirection_rqt,
- MLX5E_INDIR_RQT_SIZE, num_channels);
+ MLX5E_INDIR_RQT_SIZE, profile->max_nch(mdev));
priv->params.lro_wqe_sz =
MLX5E_PARAMS_DEFAULT_LRO_WQE_SZ;
@@ -2950,7 +2961,8 @@ static void mlx5e_build_netdev_priv(struct mlx5_core_dev *mdev,
priv->mdev = mdev;
priv->netdev = netdev;
- priv->params.num_channels = num_channels;
+ priv->params.num_channels = profile->max_nch(mdev);
+ priv->profile = profile;
#ifdef CONFIG_MLX5_CORE_EN_DCB
mlx5e_ets_init(priv);
@@ -2975,7 +2987,7 @@ static void mlx5e_set_netdev_dev_addr(struct net_device *netdev)
}
}
-static void mlx5e_build_netdev(struct net_device *netdev)
+static void mlx5e_build_nic_netdev(struct net_device *netdev)
{
struct mlx5e_priv *priv = netdev_priv(netdev);
struct mlx5_core_dev *mdev = priv->mdev;
@@ -3085,7 +3097,7 @@ static int mlx5e_create_umr_mkey(struct mlx5e_priv *priv)
struct mlx5_mkey_seg *mkc;
int inlen = sizeof(*in);
u64 npages =
- mlx5e_get_max_num_channels(mdev) * MLX5_CHANNEL_MAX_NUM_MTTS;
+ priv->profile->max_nch(mdev) * MLX5_CHANNEL_MAX_NUM_MTTS;
int err;
in = mlx5_vzalloc(inlen);
@@ -3113,23 +3125,159 @@ static int mlx5e_create_umr_mkey(struct mlx5e_priv *priv)
return err;
}
-static void *mlx5e_create_netdev(struct mlx5_core_dev *mdev)
+static void mlx5e_nic_init(struct mlx5_core_dev *mdev,
+ struct net_device *netdev,
+ const struct mlx5e_profile *profile)
+{
+ struct mlx5e_priv *priv = netdev_priv(netdev);
+
+ mlx5e_build_nic_netdev_priv(mdev, netdev, profile);
+ mlx5e_build_nic_netdev(netdev);
+ mlx5e_vxlan_init(priv);
+}
+
+static void mlx5e_nic_cleanup(struct mlx5e_priv *priv)
+{
+ mlx5e_vxlan_cleanup(priv);
+}
+
+static int mlx5e_init_nic_rx(struct mlx5e_priv *priv)
+{
+ struct mlx5_core_dev *mdev = priv->mdev;
+ int err;
+ int i;
+
+ err = mlx5e_create_indirect_rqts(priv);
+ if (err) {
+ mlx5_core_warn(mdev, "create indirect rqts failed, %d\n", err);
+ return err;
+ }
+
+ err = mlx5e_create_direct_rqts(priv);
+ if (err) {
+ mlx5_core_warn(mdev, "create direct rqts failed, %d\n", err);
+ goto err_destroy_indirect_rqts;
+ }
+
+ err = mlx5e_create_indirect_tirs(priv);
+ if (err) {
+ mlx5_core_warn(mdev, "create indirect tirs failed, %d\n", err);
+ goto err_destroy_direct_rqts;
+ }
+
+ err = mlx5e_create_direct_tirs(priv);
+ if (err) {
+ mlx5_core_warn(mdev, "create direct tirs failed, %d\n", err);
+ goto err_destroy_indirect_tirs;
+ }
+
+ err = mlx5e_create_flow_steering(priv);
+ if (err) {
+ mlx5_core_warn(mdev, "create flow steering failed, %d\n", err);
+ goto err_destroy_direct_tirs;
+ }
+
+ err = mlx5e_tc_init(priv);
+ if (err)
+ goto err_destroy_flow_steering;
+
+ return 0;
+
+err_destroy_flow_steering:
+ mlx5e_destroy_flow_steering(priv);
+err_destroy_direct_tirs:
+ mlx5e_destroy_direct_tirs(priv);
+err_destroy_indirect_tirs:
+ mlx5e_destroy_indirect_tirs(priv);
+err_destroy_direct_rqts:
+ for (i = 0; i < priv->profile->max_nch(mdev); i++)
+ mlx5e_destroy_rqt(priv, &priv->direct_tir[i].rqt);
+err_destroy_indirect_rqts:
+ mlx5e_destroy_rqt(priv, &priv->indir_rqt);
+ return err;
+}
+
+static void mlx5e_cleanup_nic_rx(struct mlx5e_priv *priv)
+{
+ int i;
+
+ mlx5e_tc_cleanup(priv);
+ mlx5e_destroy_flow_steering(priv);
+ mlx5e_destroy_direct_tirs(priv);
+ mlx5e_destroy_indirect_tirs(priv);
+ for (i = 0; i < priv->profile->max_nch(priv->mdev); i++)
+ mlx5e_destroy_rqt(priv, &priv->direct_tir[i].rqt);
+ mlx5e_destroy_rqt(priv, &priv->indir_rqt);
+}
+
+static int mlx5e_init_nic_tx(struct mlx5e_priv *priv)
+{
+ int err;
+
+ err = mlx5e_create_tises(priv);
+ if (err) {
+ mlx5_core_warn(priv->mdev, "create tises failed, %d\n", err);
+ return err;
+ }
+
+#ifdef CONFIG_MLX5_CORE_EN_DCB
+ mlx5e_dcbnl_ieee_setets_core(priv, &priv->params.ets);
+#endif
+ return 0;
+}
+
+static void mlx5e_nic_enable(struct mlx5e_priv *priv)
+{
+ struct net_device *netdev = priv->netdev;
+ struct mlx5_core_dev *mdev = priv->mdev;
+
+ if (mlx5e_vxlan_allowed(mdev)) {
+ rtnl_lock();
+ udp_tunnel_get_rx_info(netdev);
+ rtnl_unlock();
+ }
+
+ mlx5e_enable_async_events(priv);
+ queue_work(priv->wq, &priv->set_rx_mode_work);
+}
+
+static void mlx5e_nic_disable(struct mlx5e_priv *priv)
+{
+ queue_work(priv->wq, &priv->set_rx_mode_work);
+ mlx5e_disable_async_events(priv);
+}
+
+static const struct mlx5e_profile mlx5e_nic_profile = {
+ .init = mlx5e_nic_init,
+ .cleanup = mlx5e_nic_cleanup,
+ .init_rx = mlx5e_init_nic_rx,
+ .cleanup_rx = mlx5e_cleanup_nic_rx,
+ .init_tx = mlx5e_init_nic_tx,
+ .cleanup_tx = mlx5e_cleanup_nic_tx,
+ .enable = mlx5e_nic_enable,
+ .disable = mlx5e_nic_disable,
+ .update_stats = mlx5e_update_stats,
+ .max_nch = mlx5e_get_max_num_channels,
+ .max_tc = MLX5E_MAX_NUM_TC,
+};
+
+static void *mlx5e_create_netdev(struct mlx5_core_dev *mdev,
+ const struct mlx5e_profile *profile)
{
struct net_device *netdev;
struct mlx5e_priv *priv;
- int nch = mlx5e_get_max_num_channels(mdev);
+ int nch = profile->max_nch(mdev);
int err;
netdev = alloc_etherdev_mqs(sizeof(struct mlx5e_priv),
- nch * MLX5E_MAX_NUM_TC,
+ nch * profile->max_tc,
nch);
if (!netdev) {
mlx5_core_err(mdev, "alloc_etherdev_mqs() failed\n");
return NULL;
}
- mlx5e_build_netdev_priv(mdev, netdev, nch);
- mlx5e_build_netdev(netdev);
+ profile->init(mdev, netdev, profile);
netif_carrier_off(netdev);
@@ -3145,85 +3293,44 @@ static void *mlx5e_create_netdev(struct mlx5_core_dev *mdev)
goto err_destroy_wq;
}
- err = mlx5e_create_tises(priv);
- if (err) {
- mlx5_core_warn(mdev, "create tises failed, %d\n", err);
+ err = profile->init_tx(priv);
+ if (err)
goto err_destroy_umr_mkey;
- }
err = mlx5e_open_drop_rq(priv);
if (err) {
mlx5_core_err(mdev, "open drop rq failed, %d\n", err);
- goto err_destroy_tises;
+ goto err_cleanup_tx;
}
- err = mlx5e_create_rqts(priv);
- if (err) {
- mlx5_core_warn(mdev, "create rqts failed, %d\n", err);
+ err = profile->init_rx(priv);
+ if (err)
goto err_close_drop_rq;
- }
-
- err = mlx5e_create_tirs(priv);
- if (err) {
- mlx5_core_warn(mdev, "create tirs failed, %d\n", err);
- goto err_destroy_rqts;
- }
-
- err = mlx5e_create_flow_steering(priv);
- if (err) {
- mlx5_core_warn(mdev, "create flow steering failed, %d\n", err);
- goto err_destroy_tirs;
- }
mlx5e_create_q_counter(priv);
mlx5e_init_l2_addr(priv);
- mlx5e_vxlan_init(priv);
-
- err = mlx5e_tc_init(priv);
- if (err)
- goto err_dealloc_q_counters;
-
-#ifdef CONFIG_MLX5_CORE_EN_DCB
- mlx5e_dcbnl_ieee_setets_core(priv, &priv->params.ets);
-#endif
-
err = register_netdev(netdev);
if (err) {
mlx5_core_err(mdev, "register_netdev failed, %d\n", err);
- goto err_tc_cleanup;
- }
-
- if (mlx5e_vxlan_allowed(mdev)) {
- rtnl_lock();
- udp_tunnel_get_rx_info(netdev);
- rtnl_unlock();
+ goto err_dealloc_q_counters;
}
- mlx5e_enable_async_events(priv);
- queue_work(priv->wq, &priv->set_rx_mode_work);
+ if (profile->enable)
+ profile->enable(priv);
return priv;
-err_tc_cleanup:
- mlx5e_tc_cleanup(priv);
-
err_dealloc_q_counters:
mlx5e_destroy_q_counter(priv);
- mlx5e_destroy_flow_steering(priv);
-
-err_destroy_tirs:
- mlx5e_destroy_tirs(priv);
-
-err_destroy_rqts:
- mlx5e_destroy_rqts(priv);
+ profile->cleanup_rx(priv);
err_close_drop_rq:
mlx5e_close_drop_rq(priv);
-err_destroy_tises:
- mlx5e_destroy_tises(priv);
+err_cleanup_tx:
+ profile->cleanup_tx(priv);
err_destroy_umr_mkey:
mlx5_core_destroy_mkey(mdev, &priv->umr_mkey);
@@ -3247,7 +3354,7 @@ static void *mlx5e_add(struct mlx5_core_dev *mdev)
if (mlx5e_create_mdev_resources(mdev))
return NULL;
- ret = mlx5e_create_netdev(mdev);
+ ret = mlx5e_create_netdev(mdev, &mlx5e_nic_profile);
if (!ret) {
mlx5e_destroy_mdev_resources(mdev);
return NULL;
@@ -3255,15 +3362,15 @@ static void *mlx5e_add(struct mlx5_core_dev *mdev)
return ret;
}
-static void mlx5e_destroy_netdev(struct mlx5_core_dev *mdev,
- struct mlx5e_priv *priv)
+static void mlx5e_destroy_netdev(struct mlx5_core_dev *mdev, struct mlx5e_priv *priv)
{
+ const struct mlx5e_profile *profile = priv->profile;
struct net_device *netdev = priv->netdev;
set_bit(MLX5E_STATE_DESTROYING, &priv->state);
+ if (profile->disable)
+ profile->disable(priv);
- queue_work(priv->wq, &priv->set_rx_mode_work);
- mlx5e_disable_async_events(priv);
flush_workqueue(priv->wq);
if (test_bit(MLX5_INTERFACE_STATE_SHUTDOWN, &mdev->intf_state)) {
netif_device_detach(netdev);
@@ -3272,17 +3379,15 @@ static void mlx5e_destroy_netdev(struct mlx5_core_dev *mdev,
unregister_netdev(netdev);
}
- mlx5e_tc_cleanup(priv);
- mlx5e_vxlan_cleanup(priv);
mlx5e_destroy_q_counter(priv);
- mlx5e_destroy_flow_steering(priv);
- mlx5e_destroy_tirs(priv);
- mlx5e_destroy_rqts(priv);
+ profile->cleanup_rx(priv);
mlx5e_close_drop_rq(priv);
- mlx5e_destroy_tises(priv);
+ profile->cleanup_tx(priv);
mlx5_core_destroy_mkey(priv->mdev, &priv->umr_mkey);
cancel_delayed_work_sync(&priv->update_stats_work);
destroy_workqueue(priv->wq);
+ if (profile->cleanup)
+ profile->cleanup(priv);
if (!test_bit(MLX5_INTERFACE_STATE_SHUTDOWN, &mdev->intf_state))
free_netdev(netdev);
--
2.8.0
^ permalink raw reply related [flat|nested] 47+ messages in thread
* [PATCH net-next 15/16] net/mlx5: Add Representors registration API
2016-06-27 16:07 [PATCH net-next 00/16] Mellanox 100G SRIOV E-Switch offload and VF representors Saeed Mahameed
` (13 preceding siblings ...)
2016-06-27 16:07 ` [PATCH net-next 14/16] net/mlx5e: Add support for multiple profiles Saeed Mahameed
@ 2016-06-27 16:07 ` Saeed Mahameed
2016-06-27 16:07 ` [PATCH net-next 16/16] net/mlx5e: Introduce SRIOV VF representors Saeed Mahameed
15 siblings, 0 replies; 47+ messages in thread
From: Saeed Mahameed @ 2016-06-27 16:07 UTC (permalink / raw)
To: David S. Miller
Cc: netdev, Or Gerlitz, Hadar Hen-Zion, Jiri Pirko, Andy Gospodarek,
Jesse Brandeburg, John Fastabend, Saeed Mahameed
From: Hadar Hen Zion <hadarh@mellanox.com>
Introduce E-Switch registration/unregister representors functions.
Those functions are called by the mlx5e driver when the PF NIC is
created upon pci probe action regardless of the E-Switch mode (NONE,
LEGACY or OFFLOADS).
Adding a basic E-Switch database that will hold the vport representors
upon creation.
This patch doesn't add any new functionality.
Signed-off-by: Hadar Hen Zion <hadarh@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
drivers/net/ethernet/mellanox/mlx5/core/en.h | 3 +-
drivers/net/ethernet/mellanox/mlx5/core/en_main.c | 60 +++++++++++++++++++---
drivers/net/ethernet/mellanox/mlx5/core/eswitch.c | 10 ++++
drivers/net/ethernet/mellanox/mlx5/core/eswitch.h | 12 +++++
.../ethernet/mellanox/mlx5/core/eswitch_offloads.c | 19 +++++++
5 files changed, 97 insertions(+), 7 deletions(-)
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en.h b/drivers/net/ethernet/mellanox/mlx5/core/en.h
index 8d4d2b2..f61255c 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en.h
@@ -571,7 +571,7 @@ enum {
struct mlx5e_profile {
void (*init)(struct mlx5_core_dev *mdev,
struct net_device *netdev,
- const struct mlx5e_profile *profile);
+ const struct mlx5e_profile *profile, void *ppriv);
void (*cleanup)(struct mlx5e_priv *priv);
int (*init_rx)(struct mlx5e_priv *priv);
void (*cleanup_rx)(struct mlx5e_priv *priv);
@@ -618,6 +618,7 @@ struct mlx5e_priv {
struct mlx5e_tstamp tstamp;
u16 q_counter;
const struct mlx5e_profile *profile;
+ void *ppriv;
};
enum mlx5e_link_mode {
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
index 8ffe68b..bfe3a4c 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
@@ -2881,7 +2881,8 @@ void mlx5e_set_rx_cq_mode_params(struct mlx5e_params *params, u8 cq_period_mode)
static void mlx5e_build_nic_netdev_priv(struct mlx5_core_dev *mdev,
struct net_device *netdev,
- const struct mlx5e_profile *profile)
+ const struct mlx5e_profile *profile,
+ void *ppriv)
{
struct mlx5e_priv *priv = netdev_priv(netdev);
u32 link_speed = 0;
@@ -2963,6 +2964,7 @@ static void mlx5e_build_nic_netdev_priv(struct mlx5_core_dev *mdev,
priv->netdev = netdev;
priv->params.num_channels = profile->max_nch(mdev);
priv->profile = profile;
+ priv->ppriv = ppriv;
#ifdef CONFIG_MLX5_CORE_EN_DCB
mlx5e_ets_init(priv);
@@ -3127,18 +3129,25 @@ static int mlx5e_create_umr_mkey(struct mlx5e_priv *priv)
static void mlx5e_nic_init(struct mlx5_core_dev *mdev,
struct net_device *netdev,
- const struct mlx5e_profile *profile)
+ const struct mlx5e_profile *profile,
+ void *ppriv)
{
struct mlx5e_priv *priv = netdev_priv(netdev);
- mlx5e_build_nic_netdev_priv(mdev, netdev, profile);
+ mlx5e_build_nic_netdev_priv(mdev, netdev, profile, ppriv);
mlx5e_build_nic_netdev(netdev);
mlx5e_vxlan_init(priv);
}
static void mlx5e_nic_cleanup(struct mlx5e_priv *priv)
{
+ struct mlx5_core_dev *mdev = priv->mdev;
+ struct mlx5_eswitch *esw = mdev->priv.eswitch;
+
mlx5e_vxlan_cleanup(priv);
+
+ if (MLX5_CAP_GEN(mdev, vport_group_manager))
+ mlx5_eswitch_unregister_vport_rep(esw, 0);
}
static int mlx5e_init_nic_rx(struct mlx5e_priv *priv)
@@ -3230,6 +3239,8 @@ static void mlx5e_nic_enable(struct mlx5e_priv *priv)
{
struct net_device *netdev = priv->netdev;
struct mlx5_core_dev *mdev = priv->mdev;
+ struct mlx5_eswitch *esw = mdev->priv.eswitch;
+ struct mlx5_eswitch_rep rep;
if (mlx5e_vxlan_allowed(mdev)) {
rtnl_lock();
@@ -3239,6 +3250,12 @@ static void mlx5e_nic_enable(struct mlx5e_priv *priv)
mlx5e_enable_async_events(priv);
queue_work(priv->wq, &priv->set_rx_mode_work);
+
+ if (MLX5_CAP_GEN(mdev, vport_group_manager)) {
+ rep.vport = 0;
+ rep.priv_data = priv;
+ mlx5_eswitch_register_vport_rep(esw, &rep);
+ }
}
static void mlx5e_nic_disable(struct mlx5e_priv *priv)
@@ -3262,7 +3279,7 @@ static const struct mlx5e_profile mlx5e_nic_profile = {
};
static void *mlx5e_create_netdev(struct mlx5_core_dev *mdev,
- const struct mlx5e_profile *profile)
+ const struct mlx5e_profile *profile, void *ppriv)
{
struct net_device *netdev;
struct mlx5e_priv *priv;
@@ -3277,7 +3294,7 @@ static void *mlx5e_create_netdev(struct mlx5_core_dev *mdev,
return NULL;
}
- profile->init(mdev, netdev, profile);
+ profile->init(mdev, netdev, profile, ppriv);
netif_carrier_off(netdev);
@@ -3344,8 +3361,27 @@ err_free_netdev:
return NULL;
}
+static void mlx5e_register_vport_rep(struct mlx5_core_dev *mdev)
+{
+ struct mlx5_eswitch *esw = mdev->priv.eswitch;
+ int total_vfs = MLX5_TOTAL_VPORTS(mdev);
+ int vport;
+
+ if (!MLX5_CAP_GEN(mdev, vport_group_manager))
+ return;
+
+ for (vport = 1; vport < total_vfs; vport++) {
+ struct mlx5_eswitch_rep rep;
+
+ rep.vport = vport;
+ mlx5_eswitch_register_vport_rep(esw, &rep);
+ }
+}
+
static void *mlx5e_add(struct mlx5_core_dev *mdev)
{
+ struct mlx5_eswitch *esw = mdev->priv.eswitch;
+ void *ppriv = NULL;
void *ret;
if (mlx5e_check_required_hca_cap(mdev))
@@ -3354,7 +3390,12 @@ static void *mlx5e_add(struct mlx5_core_dev *mdev)
if (mlx5e_create_mdev_resources(mdev))
return NULL;
- ret = mlx5e_create_netdev(mdev, &mlx5e_nic_profile);
+ mlx5e_register_vport_rep(mdev);
+
+ if (MLX5_CAP_GEN(mdev, vport_group_manager))
+ ppriv = &esw->offloads.vport_reps[0];
+
+ ret = mlx5e_create_netdev(mdev, &mlx5e_nic_profile, ppriv);
if (!ret) {
mlx5e_destroy_mdev_resources(mdev);
return NULL;
@@ -3395,9 +3436,16 @@ static void mlx5e_destroy_netdev(struct mlx5_core_dev *mdev, struct mlx5e_priv *
static void mlx5e_remove(struct mlx5_core_dev *mdev, void *vpriv)
{
+ struct mlx5_eswitch *esw = mdev->priv.eswitch;
+ int total_vfs = MLX5_TOTAL_VPORTS(mdev);
struct mlx5e_priv *priv = vpriv;
+ int vport;
mlx5e_destroy_netdev(mdev, priv);
+
+ for (vport = 1; vport < total_vfs; vport++)
+ mlx5_eswitch_unregister_vport_rep(esw, vport);
+
mlx5e_destroy_mdev_resources(mdev);
}
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c
index 12f509c..f0a9735 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c
@@ -1663,6 +1663,14 @@ int mlx5_eswitch_init(struct mlx5_core_dev *dev)
goto abort;
}
+ esw->offloads.vport_reps =
+ kzalloc(total_vports * sizeof(struct mlx5_eswitch_rep),
+ GFP_KERNEL);
+ if (!esw->offloads.vport_reps) {
+ err = -ENOMEM;
+ goto abort;
+ }
+
mutex_init(&esw->state_lock);
for (vport_num = 0; vport_num < total_vports; vport_num++) {
@@ -1687,6 +1695,7 @@ abort:
destroy_workqueue(esw->work_queue);
kfree(esw->l2_table.bitmap);
kfree(esw->vports);
+ kfree(esw->offloads.vport_reps);
kfree(esw);
return err;
}
@@ -1704,6 +1713,7 @@ void mlx5_eswitch_cleanup(struct mlx5_eswitch *esw)
destroy_workqueue(esw->work_queue);
kfree(esw->l2_table.bitmap);
kfree(esw->mc_promisc);
+ kfree(esw->offloads.vport_reps);
kfree(esw->vports);
kfree(esw);
}
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h
index 7843f98..ffe5eab 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h
@@ -156,9 +156,17 @@ enum {
SRIOV_OFFLOADS
};
+
+struct mlx5_eswitch_rep {
+ u16 vport;
+ void *priv_data;
+ bool valid;
+};
+
struct mlx5_esw_offload {
struct mlx5_flow_table *ft_offloads;
struct mlx5_flow_group *vport_rx_group;
+ struct mlx5_eswitch_rep *vport_reps;
};
struct mlx5_eswitch {
@@ -208,6 +216,10 @@ mlx5_eswitch_create_vport_rx_rule(struct mlx5_eswitch *esw, int vport, u32 tirn)
int mlx5_devlink_eswitch_mode_set(struct devlink *devlink, u16 mode);
int mlx5_devlink_eswitch_mode_get(struct devlink *devlink, u16 *mode);
+void mlx5_eswitch_register_vport_rep(struct mlx5_eswitch *esw,
+ struct mlx5_eswitch_rep *rep);
+void mlx5_eswitch_unregister_vport_rep(struct mlx5_eswitch *esw,
+ int vport);
#define MLX5_DEBUG_ESWITCH_MASK BIT(3)
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
index a39af6b..a7d5568c 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
@@ -434,3 +434,22 @@ int mlx5_devlink_eswitch_mode_get(struct devlink *devlink, u16 *mode)
return 0;
}
+
+void mlx5_eswitch_register_vport_rep(struct mlx5_eswitch *esw,
+ struct mlx5_eswitch_rep *rep)
+{
+ struct mlx5_esw_offload *offloads = &esw->offloads;
+
+ memcpy(&offloads->vport_reps[rep->vport], rep,
+ sizeof(struct mlx5_eswitch_rep));
+
+ offloads->vport_reps[rep->vport].valid = true;
+}
+
+void mlx5_eswitch_unregister_vport_rep(struct mlx5_eswitch *esw,
+ int vport)
+{
+ struct mlx5_esw_offload *offloads = &esw->offloads;
+
+ offloads->vport_reps[vport].valid = false;
+}
--
2.8.0
^ permalink raw reply related [flat|nested] 47+ messages in thread
* [PATCH net-next 16/16] net/mlx5e: Introduce SRIOV VF representors
2016-06-27 16:07 [PATCH net-next 00/16] Mellanox 100G SRIOV E-Switch offload and VF representors Saeed Mahameed
` (14 preceding siblings ...)
2016-06-27 16:07 ` [PATCH net-next 15/16] net/mlx5: Add Representors registration API Saeed Mahameed
@ 2016-06-27 16:07 ` Saeed Mahameed
15 siblings, 0 replies; 47+ messages in thread
From: Saeed Mahameed @ 2016-06-27 16:07 UTC (permalink / raw)
To: David S. Miller
Cc: netdev, Or Gerlitz, Hadar Hen-Zion, Jiri Pirko, Andy Gospodarek,
Jesse Brandeburg, John Fastabend, Saeed Mahameed
From: Hadar Hen Zion <hadarh@mellanox.com>
Implement the relevant profile functions to create mlx5e driver instance
serving as VF representor. When SRIOV offloads mode is enabled, each VF
will have a representor netdevice instance on the host.
To do that, we also export a set of shared service functions from en_main.c,
such that they can be used by both NIC and representor netdevs.
The newly created representor netdevice has a basic set of net_device_ops
which are the same ndo functions as the NIC netdevice and an ndo of its
own for phys port name.
The profiling infrastructure allows sharing code between the NIC and the
vport representor even though the representor has only a subset of the
NIC functionality.
The VF reps and the PF which is used in that mode to represent the uplink,
expose switchdev ops. Currently the only op supported is attr get for the
port parent ID which here serves to identify net-devices belonging to the
same HW E-Switch. Other than that, no offloading is implemented and hence
switching functionality is achieved if one sets SW switching rules, e.g.
using tc, bridge or ovs.
Port phys name (ndo_get_phys_port_name) is implemented to allow exporting
to user-space the VF vport number and along with the switchdev port parent
id (phys_switch_id) enable a udev-based consistent naming scheme:
SUBSYSTEM=="net", ACTION=="add", ATTR{phys_switch_id}=="<phys_switch_id>", \
ATTR{phys_port_name}!="", NAME="$PF_NIC$attr{phys_port_name}"
where phys_switch_id is exposed by the PF (and VF reps) and $PF_NIC is
the name of the PF netdevice.
Signed-off-by: Hadar Hen Zion <hadarh@mellanox.com>
Signed-off-by: Or Gerlitz <ogerlitz@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
drivers/net/ethernet/mellanox/mlx5/core/Makefile | 2 +-
drivers/net/ethernet/mellanox/mlx5/core/en.h | 28 ++
drivers/net/ethernet/mellanox/mlx5/core/en_main.c | 53 ++-
drivers/net/ethernet/mellanox/mlx5/core/en_rep.c | 387 +++++++++++++++++++++
drivers/net/ethernet/mellanox/mlx5/core/eswitch.h | 20 +-
.../ethernet/mellanox/mlx5/core/eswitch_offloads.c | 96 ++++-
6 files changed, 567 insertions(+), 19 deletions(-)
create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/en_rep.c
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/Makefile b/drivers/net/ethernet/mellanox/mlx5/core/Makefile
index 9b14dad..a574dea 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/Makefile
+++ b/drivers/net/ethernet/mellanox/mlx5/core/Makefile
@@ -8,6 +8,6 @@ mlx5_core-y := main.o cmd.o debugfs.o fw.o eq.o uar.o pagealloc.o \
mlx5_core-$(CONFIG_MLX5_CORE_EN) += wq.o eswitch.o eswitch_offloads.o \
en_main.o en_common.o en_fs.o en_ethtool.o en_tx.o \
en_rx.o en_rx_am.o en_txrx.o en_clock.o vxlan.o \
- en_tc.o en_arfs.o
+ en_tc.o en_arfs.o en_rep.o
mlx5_core-$(CONFIG_MLX5_CORE_EN_DCB) += en_dcbnl.o
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en.h b/drivers/net/ethernet/mellanox/mlx5/core/en.h
index f61255c..5912a02 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en.h
@@ -44,6 +44,7 @@
#include <linux/mlx5/vport.h>
#include <linux/mlx5/transobj.h>
#include <linux/rhashtable.h>
+#include <net/switchdev.h>
#include "wq.h"
#include "mlx5_core.h"
#include "en_stats.h"
@@ -816,4 +817,31 @@ int mlx5e_create_mdev_resources(struct mlx5_core_dev *mdev);
void mlx5e_destroy_mdev_resources(struct mlx5_core_dev *mdev);
int mlx5e_refresh_tirs_self_loopback_enable(struct mlx5_core_dev *mdev);
+struct mlx5_eswitch_rep;
+int mlx5e_vport_rep_load(struct mlx5_eswitch *esw,
+ struct mlx5_eswitch_rep *rep);
+void mlx5e_vport_rep_unload(struct mlx5_eswitch *esw,
+ struct mlx5_eswitch_rep *rep);
+int mlx5e_nic_rep_load(struct mlx5_eswitch *esw, struct mlx5_eswitch_rep *rep);
+void mlx5e_nic_rep_unload(struct mlx5_eswitch *esw,
+ struct mlx5_eswitch_rep *rep);
+int mlx5e_add_sqs_fwd_rules(struct mlx5e_priv *priv);
+void mlx5e_remove_sqs_fwd_rules(struct mlx5e_priv *priv);
+int mlx5e_attr_get(struct net_device *dev, struct switchdev_attr *attr);
+
+int mlx5e_create_direct_rqts(struct mlx5e_priv *priv);
+void mlx5e_destroy_rqt(struct mlx5e_priv *priv, struct mlx5e_rqt *rqt);
+int mlx5e_create_direct_tirs(struct mlx5e_priv *priv);
+void mlx5e_destroy_direct_tirs(struct mlx5e_priv *priv);
+int mlx5e_create_tises(struct mlx5e_priv *priv);
+void mlx5e_cleanup_nic_tx(struct mlx5e_priv *priv);
+int mlx5e_close(struct net_device *netdev);
+int mlx5e_open(struct net_device *netdev);
+void mlx5e_update_stats_work(struct work_struct *work);
+void *mlx5e_create_netdev(struct mlx5_core_dev *mdev,
+ const struct mlx5e_profile *profile, void *ppriv);
+void mlx5e_destroy_netdev(struct mlx5_core_dev *mdev, struct mlx5e_priv *priv);
+struct rtnl_link_stats64 *
+mlx5e_get_stats(struct net_device *dev, struct rtnl_link_stats64 *stats);
+
#endif /* __MLX5_EN_H__ */
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
index bfe3a4c..11ce66c 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
@@ -227,7 +227,7 @@ void mlx5e_update_stats(struct mlx5e_priv *priv)
mlx5e_update_sw_counters(priv);
}
-static void mlx5e_update_stats_work(struct work_struct *work)
+void mlx5e_update_stats_work(struct work_struct *work)
{
struct delayed_work *dwork = to_delayed_work(work);
struct mlx5e_priv *priv = container_of(dwork, struct mlx5e_priv,
@@ -1519,7 +1519,7 @@ static int mlx5e_create_rqt(struct mlx5e_priv *priv, int sz,
return err;
}
-static void mlx5e_destroy_rqt(struct mlx5e_priv *priv, struct mlx5e_rqt *rqt)
+void mlx5e_destroy_rqt(struct mlx5e_priv *priv, struct mlx5e_rqt *rqt)
{
rqt->enabled = false;
mlx5_core_destroy_rqt(priv->mdev, rqt->rqtn);
@@ -1532,7 +1532,7 @@ static int mlx5e_create_indirect_rqts(struct mlx5e_priv *priv)
return mlx5e_create_rqt(priv, MLX5E_INDIR_RQT_SIZE, 0, rqt);
}
-static int mlx5e_create_direct_rqts(struct mlx5e_priv *priv)
+int mlx5e_create_direct_rqts(struct mlx5e_priv *priv)
{
struct mlx5e_rqt *rqt;
int err;
@@ -1744,6 +1744,7 @@ static void mlx5e_netdev_set_tcs(struct net_device *netdev)
int mlx5e_open_locked(struct net_device *netdev)
{
struct mlx5e_priv *priv = netdev_priv(netdev);
+ struct mlx5_core_dev *mdev = priv->mdev;
int num_txqs;
int err;
@@ -1779,9 +1780,14 @@ int mlx5e_open_locked(struct net_device *netdev)
#ifdef CONFIG_RFS_ACCEL
priv->netdev->rx_cpu_rmap = priv->mdev->rmap;
#endif
+ if (priv->profile->update_stats)
+ queue_delayed_work(priv->wq, &priv->update_stats_work, 0);
- queue_delayed_work(priv->wq, &priv->update_stats_work, 0);
-
+ if (MLX5_CAP_GEN(mdev, vport_group_manager)) {
+ err = mlx5e_add_sqs_fwd_rules(priv);
+ if (err)
+ goto err_close_channels;
+ }
return 0;
err_close_channels:
@@ -1791,7 +1797,7 @@ err_clear_state_opened_flag:
return err;
}
-static int mlx5e_open(struct net_device *netdev)
+int mlx5e_open(struct net_device *netdev)
{
struct mlx5e_priv *priv = netdev_priv(netdev);
int err;
@@ -1806,6 +1812,7 @@ static int mlx5e_open(struct net_device *netdev)
int mlx5e_close_locked(struct net_device *netdev)
{
struct mlx5e_priv *priv = netdev_priv(netdev);
+ struct mlx5_core_dev *mdev = priv->mdev;
/* May already be CLOSED in case a previous configuration operation
* (e.g RX/TX queue size change) that involves close&open failed.
@@ -1815,6 +1822,9 @@ int mlx5e_close_locked(struct net_device *netdev)
clear_bit(MLX5E_STATE_OPENED, &priv->state);
+ if (MLX5_CAP_GEN(mdev, vport_group_manager))
+ mlx5e_remove_sqs_fwd_rules(priv);
+
mlx5e_timestamp_cleanup(priv);
netif_carrier_off(priv->netdev);
mlx5e_redirect_rqts(priv);
@@ -1823,7 +1833,7 @@ int mlx5e_close_locked(struct net_device *netdev)
return 0;
}
-static int mlx5e_close(struct net_device *netdev)
+int mlx5e_close(struct net_device *netdev)
{
struct mlx5e_priv *priv = netdev_priv(netdev);
int err;
@@ -1958,7 +1968,7 @@ static void mlx5e_destroy_tis(struct mlx5e_priv *priv, int tc)
mlx5_core_destroy_tis(priv->mdev, priv->tisn[tc]);
}
-static int mlx5e_create_tises(struct mlx5e_priv *priv)
+int mlx5e_create_tises(struct mlx5e_priv *priv)
{
int err;
int tc;
@@ -1978,7 +1988,7 @@ err_close_tises:
return err;
}
-static void mlx5e_cleanup_nic_tx(struct mlx5e_priv *priv)
+void mlx5e_cleanup_nic_tx(struct mlx5e_priv *priv)
{
int tc;
@@ -2144,7 +2154,7 @@ err_destroy_tirs:
return err;
}
-static int mlx5e_create_direct_tirs(struct mlx5e_priv *priv)
+int mlx5e_create_direct_tirs(struct mlx5e_priv *priv)
{
int nch = priv->profile->max_nch(priv->mdev);
struct mlx5e_tir *tir;
@@ -2191,7 +2201,7 @@ static void mlx5e_destroy_indirect_tirs(struct mlx5e_priv *priv)
mlx5e_destroy_tir(priv->mdev, &priv->indir_tir[i]);
}
-static void mlx5e_destroy_direct_tirs(struct mlx5e_priv *priv)
+void mlx5e_destroy_direct_tirs(struct mlx5e_priv *priv)
{
int nch = priv->profile->max_nch(priv->mdev);
int i;
@@ -2271,7 +2281,7 @@ mqprio:
return mlx5e_setup_tc(dev, tc->tc);
}
-static struct rtnl_link_stats64 *
+struct rtnl_link_stats64 *
mlx5e_get_stats(struct net_device *dev, struct rtnl_link_stats64 *stats)
{
struct mlx5e_priv *priv = netdev_priv(dev);
@@ -2989,6 +2999,10 @@ static void mlx5e_set_netdev_dev_addr(struct net_device *netdev)
}
}
+/* switchdev hooks of the NIC netdev: only the port attribute getter is set */
+static const struct switchdev_ops mlx5e_switchdev_ops = {
+	.switchdev_port_attr_get	= mlx5e_attr_get,
+};
+
static void mlx5e_build_nic_netdev(struct net_device *netdev)
{
struct mlx5e_priv *priv = netdev_priv(netdev);
@@ -3070,6 +3084,11 @@ static void mlx5e_build_nic_netdev(struct net_device *netdev)
netdev->priv_flags |= IFF_UNICAST_FLT;
mlx5e_set_netdev_dev_addr(netdev);
+
+#ifdef CONFIG_NET_SWITCHDEV
+ if (MLX5_CAP_GEN(mdev, vport_group_manager))
+ netdev->switchdev_ops = &mlx5e_switchdev_ops;
+#endif
}
static void mlx5e_create_q_counter(struct mlx5e_priv *priv)
@@ -3252,6 +3271,8 @@ static void mlx5e_nic_enable(struct mlx5e_priv *priv)
queue_work(priv->wq, &priv->set_rx_mode_work);
if (MLX5_CAP_GEN(mdev, vport_group_manager)) {
+ rep.load = mlx5e_nic_rep_load;
+ rep.unload = mlx5e_nic_rep_unload;
rep.vport = 0;
rep.priv_data = priv;
mlx5_eswitch_register_vport_rep(esw, &rep);
@@ -3278,8 +3299,8 @@ static const struct mlx5e_profile mlx5e_nic_profile = {
.max_tc = MLX5E_MAX_NUM_TC,
};
-static void *mlx5e_create_netdev(struct mlx5_core_dev *mdev,
- const struct mlx5e_profile *profile, void *ppriv)
+void *mlx5e_create_netdev(struct mlx5_core_dev *mdev,
+ const struct mlx5e_profile *profile, void *ppriv)
{
struct net_device *netdev;
struct mlx5e_priv *priv;
@@ -3373,6 +3394,8 @@ static void mlx5e_register_vport_rep(struct mlx5_core_dev *mdev)
for (vport = 1; vport < total_vfs; vport++) {
struct mlx5_eswitch_rep rep;
+ rep.load = mlx5e_vport_rep_load;
+ rep.unload = mlx5e_vport_rep_unload;
rep.vport = vport;
mlx5_eswitch_register_vport_rep(esw, &rep);
}
@@ -3403,7 +3426,7 @@ static void *mlx5e_add(struct mlx5_core_dev *mdev)
return ret;
}
-static void mlx5e_destroy_netdev(struct mlx5_core_dev *mdev, struct mlx5e_priv *priv)
+void mlx5e_destroy_netdev(struct mlx5_core_dev *mdev, struct mlx5e_priv *priv)
{
const struct mlx5e_profile *profile = priv->profile;
struct net_device *netdev = priv->netdev;
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c b/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c
new file mode 100644
index 0000000..f806cfb
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c
@@ -0,0 +1,387 @@
+/*
+ * Copyright (c) 2016, Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <generated/utsrelease.h>
+#include <linux/mlx5/fs.h>
+#include <net/switchdev.h>
+
+#include "eswitch.h"
+#include "en.h"
+
+static const char mlx5e_rep_driver_name[] = "mlx5e_rep";
+
+/*
+ * ethtool get_drvinfo hook for representor netdevs: fills in the driver
+ * name (mlx5e_rep_driver_name) and reports UTS_RELEASE as the version.
+ */
+static void mlx5e_rep_get_drvinfo(struct net_device *dev,
+				  struct ethtool_drvinfo *drvinfo)
+{
+	strlcpy(drvinfo->version, UTS_RELEASE, sizeof(drvinfo->version));
+	strlcpy(drvinfo->driver, mlx5e_rep_driver_name,
+		sizeof(drvinfo->driver));
+}
+
+#define NUM_VPORT_REP_COUNTERS 4
+
+/*
+ * ethtool get_strings hook: for ETH_SS_STATS, copy the names of the first
+ * NUM_VPORT_REP_COUNTERS entries of sw_stats_desc into @data, one name per
+ * ETH_GSTRING_LEN-sized slot. Other string sets are left untouched.
+ */
+static void mlx5e_rep_get_strings(struct net_device *dev,
+				  u32 stringset, uint8_t *data)
+{
+	uint8_t *slot = data;
+	int idx;
+
+	if (stringset != ETH_SS_STATS)
+		return;
+
+	for (idx = 0; idx < NUM_VPORT_REP_COUNTERS; idx++) {
+		strcpy(slot, sw_stats_desc[idx].name);
+		slot += ETH_GSTRING_LEN;
+	}
+}
+
+/*
+ * Recompute the software counters of a representor from scratch: the sw
+ * stats block is zeroed, then RX packets/bytes are summed over every
+ * channel's RQ and TX packets/bytes over every channel's per-TC SQ.
+ */
+static void mlx5e_update_sw_rep_counters(struct mlx5e_priv *priv)
+{
+	struct mlx5e_sw_stats *sw = &priv->stats.sw;
+	int ch, tc;
+
+	memset(sw, 0, sizeof(*sw));
+
+	for (ch = 0; ch < priv->params.num_channels; ch++) {
+		struct mlx5e_channel *chan = priv->channel[ch];
+		struct mlx5e_rq_stats *rq = &chan->rq.stats;
+
+		sw->rx_packets += rq->packets;
+		sw->rx_bytes += rq->bytes;
+
+		for (tc = 0; tc < priv->params.num_tc; tc++) {
+			struct mlx5e_sq_stats *sq = &chan->sq[tc].stats;
+
+			sw->tx_packets += sq->packets;
+			sw->tx_bytes += sq->bytes;
+		}
+	}
+}
+
+/*
+ * ethtool get_ethtool_stats hook: export the first NUM_VPORT_REP_COUNTERS
+ * software counters into @data. The counters are refreshed first, under
+ * the state lock, but only while the netdev is in the OPENED state;
+ * otherwise the previously stored values are reported.
+ */
+static void mlx5e_rep_get_ethtool_stats(struct net_device *dev,
+					struct ethtool_stats *stats, u64 *data)
+{
+	struct mlx5e_priv *priv = netdev_priv(dev);
+	int idx = 0;
+
+	if (data == NULL)
+		return;
+
+	mutex_lock(&priv->state_lock);
+	if (test_bit(MLX5E_STATE_OPENED, &priv->state))
+		mlx5e_update_sw_rep_counters(priv);
+	mutex_unlock(&priv->state_lock);
+
+	while (idx < NUM_VPORT_REP_COUNTERS) {
+		data[idx] = MLX5E_READ_CTR64_CPU(&priv->stats.sw,
+						 sw_stats_desc, idx);
+		idx++;
+	}
+}
+
+/*
+ * ethtool get_sset_count hook: only the statistics string set is supported.
+ * Returns NUM_VPORT_REP_COUNTERS for ETH_SS_STATS, -EOPNOTSUPP otherwise.
+ */
+static int mlx5e_rep_get_sset_count(struct net_device *dev, int sset)
+{
+	if (sset == ETH_SS_STATS)
+		return NUM_VPORT_REP_COUNTERS;
+
+	return -EOPNOTSUPP;
+}
+
+/* ethtool operations exposed by a representor netdev */
+static const struct ethtool_ops mlx5e_rep_ethtool_ops = {
+	.get_link		= ethtool_op_get_link,
+	.get_drvinfo		= mlx5e_rep_get_drvinfo,
+	.get_sset_count		= mlx5e_rep_get_sset_count,
+	.get_strings		= mlx5e_rep_get_strings,
+	.get_ethtool_stats	= mlx5e_rep_get_ethtool_stats,
+};
+
+/*
+ * switchdev port attribute getter, used by both the NIC netdev and the
+ * representor netdevs.
+ *
+ * Only SWITCHDEV_ATTR_ID_PORT_PARENT_ID is answered; the parent ID reported
+ * is the MAC address queried for NIC vport 0. Any other attribute, and any
+ * request made while the e-switch mode is SRIOV_NONE, gets -EOPNOTSUPP.
+ *
+ * Returns 0 on success or a negative errno (including a failure of the
+ * vport MAC query).
+ */
+int mlx5e_attr_get(struct net_device *dev, struct switchdev_attr *attr)
+{
+	struct mlx5e_priv *priv = netdev_priv(dev);
+	struct mlx5_eswitch *esw = priv->mdev->priv.eswitch;
+	u8 mac[ETH_ALEN];
+	int err;
+
+	if (esw->mode == SRIOV_NONE)
+		return -EOPNOTSUPP;
+
+	switch (attr->id) {
+	case SWITCHDEV_ATTR_ID_PORT_PARENT_ID:
+		/*
+		 * Bail out if the query fails: 'mac' would otherwise still
+		 * hold uninitialized stack bytes that must not be reported
+		 * as the parent ID.
+		 */
+		err = mlx5_query_nic_vport_mac_address(priv->mdev, 0, mac);
+		if (err)
+			return err;
+
+		attr->u.ppid.id_len = ETH_ALEN;
+		memcpy(&attr->u.ppid.id, mac, ETH_ALEN);
+		break;
+	default:
+		return -EOPNOTSUPP;
+	}
+
+	return 0;
+}
+
+/*
+ * Gather the SQ numbers of every channel/TC of @priv into a temporary array
+ * and pass them to mlx5_eswitch_sqs2vport_start() for the rep stored in
+ * priv->ppriv. The temporary array is freed before returning.
+ *
+ * Returns 0 on success, -ENOMEM if the array cannot be allocated, or the
+ * error returned by mlx5_eswitch_sqs2vport_start().
+ */
+int mlx5e_add_sqs_fwd_rules(struct mlx5e_priv *priv)
+{
+	struct mlx5_eswitch *esw = priv->mdev->priv.eswitch;
+	struct mlx5_eswitch_rep *rep = priv->ppriv;
+	struct mlx5e_channel *c;
+	int n, tc, err, num_sqs = 0;
+	u16 *sqs;
+
+	/*
+	 * NOTE(review): the array element type is u16 because that is what
+	 * mlx5_eswitch_sqs2vport_start() accepts, while the rule helper it
+	 * feeds takes a u32 sqn. Confirm SQ numbers always fit in 16 bits,
+	 * otherwise both sides need widening.
+	 */
+	sqs = kcalloc(priv->params.num_channels * priv->params.num_tc,
+		      sizeof(*sqs), GFP_KERNEL);
+	if (!sqs)
+		return -ENOMEM;
+
+	for (n = 0; n < priv->params.num_channels; n++) {
+		c = priv->channel[n];
+		for (tc = 0; tc < c->num_tc; tc++)
+			sqs[num_sqs++] = c->sq[tc].sqn;
+	}
+
+	err = mlx5_eswitch_sqs2vport_start(esw, rep, sqs, num_sqs);
+
+	kfree(sqs);
+	return err;
+}
+
+/*
+ * load() callback of the NIC's own rep: when the netdev kept in
+ * rep->priv_data is already OPENED, install its SQ forwarding rules now.
+ * A netdev that is not open needs nothing here and 0 is returned.
+ */
+int mlx5e_nic_rep_load(struct mlx5_eswitch *esw, struct mlx5_eswitch_rep *rep)
+{
+	struct mlx5e_priv *priv = rep->priv_data;
+
+	if (!test_bit(MLX5E_STATE_OPENED, &priv->state))
+		return 0;
+
+	return mlx5e_add_sqs_fwd_rules(priv);
+}
+
+/*
+ * Ask the e-switch to drop the per-SQ rules recorded for the rep stored in
+ * priv->ppriv.
+ */
+void mlx5e_remove_sqs_fwd_rules(struct mlx5e_priv *priv)
+{
+	struct mlx5_eswitch_rep *rep = priv->ppriv;
+
+	mlx5_eswitch_sqs2vport_stop(priv->mdev->priv.eswitch, rep);
+}
+
+/*
+ * unload() callback of the NIC's own rep: remove the SQ forwarding rules,
+ * but only if the netdev kept in rep->priv_data is currently OPENED.
+ */
+void mlx5e_nic_rep_unload(struct mlx5_eswitch *esw,
+			  struct mlx5_eswitch_rep *rep)
+{
+	struct mlx5e_priv *priv = rep->priv_data;
+
+	if (!test_bit(MLX5E_STATE_OPENED, &priv->state))
+		return;
+
+	mlx5e_remove_sqs_fwd_rules(priv);
+}
+
+/*
+ * ndo_get_phys_port_name hook: the name is the rep's vport number minus
+ * one, printed in decimal. Returns 0 on success, or -EOPNOTSUPP when the
+ * text does not fit in the @len bytes of @buf.
+ */
+static int mlx5e_rep_get_phys_port_name(struct net_device *dev,
+					char *buf, size_t len)
+{
+	struct mlx5e_priv *priv = netdev_priv(dev);
+	struct mlx5_eswitch_rep *rep = priv->ppriv;
+	int written = snprintf(buf, len, "%d", rep->vport - 1);
+
+	return written >= len ? -EOPNOTSUPP : 0;
+}
+
+/* switchdev hooks of a representor netdev: only the attribute getter is set */
+static const struct switchdev_ops mlx5e_rep_switchdev_ops = {
+	.switchdev_port_attr_get	= mlx5e_attr_get,
+};
+
+/*
+ * net_device operations of a representor netdev; open/stop/xmit/stats are
+ * the regular mlx5e handlers, only the phys port name hook is rep specific.
+ */
+static const struct net_device_ops mlx5e_netdev_ops_rep = {
+	.ndo_open		= mlx5e_open,
+	.ndo_stop		= mlx5e_close,
+	.ndo_start_xmit		= mlx5e_xmit,
+	.ndo_get_stats64	= mlx5e_get_stats,
+	.ndo_get_phys_port_name	= mlx5e_rep_get_phys_port_name,
+};
+
+/*
+ * Fill in the mlx5e private area of a representor netdev: back-pointers
+ * (@mdev, @netdev, @profile, @ppriv), minimum-size SQ/RQ parameters with a
+ * linked-list RQ, a single TC, the channel count given by the profile, the
+ * state lock and the delayed stats work.
+ */
+static void mlx5e_build_rep_netdev_priv(struct mlx5_core_dev *mdev,
+					struct net_device *netdev,
+					const struct mlx5e_profile *profile,
+					void *ppriv)
+{
+	struct mlx5e_priv *priv = netdev_priv(netdev);
+	u8 cq_period_mode;
+
+	if (MLX5_CAP_GEN(mdev, cq_period_start_from_cqe))
+		cq_period_mode = MLX5_CQ_PERIOD_MODE_START_FROM_CQE;
+	else
+		cq_period_mode = MLX5_CQ_PERIOD_MODE_START_FROM_EQE;
+
+	/* back-pointers */
+	priv->mdev = mdev;
+	priv->netdev = netdev;
+	priv->profile = profile;
+	priv->ppriv = ppriv;
+
+	/* queue sizing: smallest supported SQ and RQ */
+	priv->params.log_sq_size = MLX5E_PARAMS_MINIMUM_LOG_SQ_SIZE;
+	priv->params.rq_wq_type = MLX5_WQ_TYPE_LINKED_LIST;
+	priv->params.log_rq_size = MLX5E_PARAMS_MINIMUM_LOG_RQ_SIZE;
+	priv->params.min_rx_wqes =
+		mlx5_min_rx_wqes(priv->params.rq_wq_type,
+				 BIT(priv->params.log_rq_size));
+
+	/* RX completion moderation */
+	priv->params.rx_am_enabled = MLX5_CAP_GEN(mdev, cq_moderation);
+	mlx5e_set_rx_cq_mode_params(&priv->params, cq_period_mode);
+
+	priv->params.tx_max_inline = mlx5e_get_max_inline_cap(mdev);
+	priv->params.num_tc = 1;
+	priv->params.lro_wqe_sz = MLX5E_PARAMS_DEFAULT_LRO_WQE_SZ;
+	priv->params.num_channels = profile->max_nch(mdev);
+
+	mutex_init(&priv->state_lock);
+	INIT_DELAYED_WORK(&priv->update_stats_work, mlx5e_update_stats_work);
+}
+
+/*
+ * Hook up the representor netdev: rep netdev/ethtool (and, when switchdev
+ * is configured, switchdev) operation tables, a 15 second TX watchdog, the
+ * NETIF_F_VLAN_CHALLENGED feature bit and a random MAC address.
+ */
+static void mlx5e_build_rep_netdev(struct net_device *netdev)
+{
+	netdev->netdev_ops = &mlx5e_netdev_ops_rep;
+	netdev->ethtool_ops = &mlx5e_rep_ethtool_ops;
+#ifdef CONFIG_NET_SWITCHDEV
+	netdev->switchdev_ops = &mlx5e_rep_switchdev_ops;
+#endif
+
+	netdev->watchdog_timeo = 15 * HZ;
+	netdev->features |= NETIF_F_VLAN_CHALLENGED;
+
+	eth_hw_addr_random(netdev);
+}
+
+/*
+ * Profile init hook of a representor: set up the private area first, then
+ * the net_device itself.
+ */
+static void mlx5e_init_rep(struct mlx5_core_dev *mdev,
+			   struct net_device *netdev,
+			   const struct mlx5e_profile *profile,
+			   void *ppriv)
+{
+	mlx5e_build_rep_netdev_priv(mdev, netdev, profile, ppriv);
+
+	mlx5e_build_rep_netdev(netdev);
+}
+
+/*
+ * Profile init_rx hook of a representor: create the direct RQTs and TIRs,
+ * then install the e-switch vport RX rule that points at the first direct
+ * TIR and remember it in the rep. Everything created so far is torn down
+ * again on failure.
+ *
+ * Returns 0 on success or a negative errno.
+ */
+static int mlx5e_init_rep_rx(struct mlx5e_priv *priv)
+{
+	struct mlx5_core_dev *mdev = priv->mdev;
+	struct mlx5_eswitch *esw = mdev->priv.eswitch;
+	struct mlx5_eswitch_rep *rep = priv->ppriv;
+	struct mlx5_flow_rule *rx_rule;
+	int ch;
+	int err;
+
+	err = mlx5e_create_direct_rqts(priv);
+	if (err) {
+		mlx5_core_warn(mdev, "create direct rqts failed, %d\n", err);
+		return err;
+	}
+
+	err = mlx5e_create_direct_tirs(priv);
+	if (err) {
+		mlx5_core_warn(mdev, "create direct tirs failed, %d\n", err);
+		goto err_destroy_direct_rqts;
+	}
+
+	rx_rule = mlx5_eswitch_create_vport_rx_rule(esw, rep->vport,
+						    priv->direct_tir[0].tirn);
+	if (!IS_ERR(rx_rule)) {
+		rep->vport_rx_rule = rx_rule;
+		return 0;
+	}
+	err = PTR_ERR(rx_rule);
+
+	/* rule creation failed: unwind TIRs, then fall through to the RQTs */
+	mlx5e_destroy_direct_tirs(priv);
+err_destroy_direct_rqts:
+	for (ch = 0; ch < priv->params.num_channels; ch++)
+		mlx5e_destroy_rqt(priv, &priv->direct_tir[ch].rqt);
+	return err;
+}
+
+/*
+ * Profile cleanup_rx hook of a representor: delete the vport RX rule, then
+ * destroy the direct TIRs and the RQT of every channel.
+ */
+static void mlx5e_cleanup_rep_rx(struct mlx5e_priv *priv)
+{
+	struct mlx5_eswitch_rep *rep = priv->ppriv;
+	int ch = 0;
+
+	mlx5_del_flow_rule(rep->vport_rx_rule);
+	mlx5e_destroy_direct_tirs(priv);
+
+	while (ch < priv->params.num_channels) {
+		mlx5e_destroy_rqt(priv, &priv->direct_tir[ch].rqt);
+		ch++;
+	}
+}
+
+/*
+ * Profile init_tx hook of a representor: create the TISes, logging a
+ * warning on failure. Returns the result of mlx5e_create_tises().
+ */
+static int mlx5e_init_rep_tx(struct mlx5e_priv *priv)
+{
+	int err = mlx5e_create_tises(priv);
+
+	if (err)
+		mlx5_core_warn(priv->mdev, "create tises failed, %d\n", err);
+
+	return err;
+}
+
+#define MLX5E_PORT_REPRESENTOR_NCH 1
+
+/* Profile max_nch hook: a representor always reports a fixed channel count */
+static int mlx5e_get_rep_max_num_channels(struct mlx5_core_dev *mdev)
+{
+	return MLX5E_PORT_REPRESENTOR_NCH;
+}
+
+/*
+ * mlx5e profile used for VF representor netdevs. It is only ever passed as
+ * a 'const struct mlx5e_profile *' (see mlx5e_create_netdev()), so it is
+ * declared const, like the NIC profile.
+ */
+static const struct mlx5e_profile mlx5e_rep_profile = {
+	.init		= mlx5e_init_rep,
+	.init_rx	= mlx5e_init_rep_rx,
+	.cleanup_rx	= mlx5e_cleanup_rep_rx,
+	.init_tx	= mlx5e_init_rep_tx,
+	.cleanup_tx	= mlx5e_cleanup_nic_tx,
+	.update_stats	= mlx5e_update_sw_rep_counters,
+	.max_nch	= mlx5e_get_rep_max_num_channels,
+	.max_tc		= 1,
+};
+
+/*
+ * load() callback of a VF vport rep: create a netdev with the representor
+ * profile and keep the result in rep->priv_data.
+ *
+ * Returns 0 on success, -EINVAL (after a warning) if creation failed.
+ */
+int mlx5e_vport_rep_load(struct mlx5_eswitch *esw,
+			 struct mlx5_eswitch_rep *rep)
+{
+	rep->priv_data = mlx5e_create_netdev(esw->dev, &mlx5e_rep_profile, rep);
+	if (rep->priv_data)
+		return 0;
+
+	pr_warn("Failed to create representor for vport %d\n",
+		rep->vport);
+	return -EINVAL;
+}
+
+/*
+ * unload() callback of a VF vport rep: destroy the netdev whose private
+ * data is kept in rep->priv_data.
+ */
+void mlx5e_vport_rep_unload(struct mlx5_eswitch *esw,
+			    struct mlx5_eswitch_rep *rep)
+{
+	struct mlx5e_priv *rep_priv = rep->priv_data;
+
+	mlx5e_destroy_netdev(esw->dev, rep_priv);
+}
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h
index ffe5eab..7b45e6a 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h
@@ -47,6 +47,8 @@
#define MLX5_L2_ADDR_HASH_SIZE (BIT(BITS_PER_BYTE))
#define MLX5_L2_ADDR_HASH(addr) (addr[5])
+#define FDB_UPLINK_VPORT 0xffff
+
/* L2 -mac address based- hash helpers */
struct l2addr_node {
struct hlist_node hlist;
@@ -156,10 +158,20 @@ enum {
SRIOV_OFFLOADS
};
+/* One send-to-vport rule installed on behalf of a rep's send queue */
+struct mlx5_esw_sq {
+	struct mlx5_flow_rule	*send_to_vport_rule;	/* the installed rule */
+	struct list_head	 list;	/* entry in the rep's vport_sqs_list */
+};
struct mlx5_eswitch_rep {
+ int (*load)(struct mlx5_eswitch *esw,
+ struct mlx5_eswitch_rep *rep);
+ void (*unload)(struct mlx5_eswitch *esw,
+ struct mlx5_eswitch_rep *rep);
u16 vport;
+ struct mlx5_flow_rule *vport_rx_rule;
void *priv_data;
+ struct list_head vport_sqs_list;
bool valid;
};
@@ -208,12 +220,16 @@ int mlx5_eswitch_get_vport_config(struct mlx5_eswitch *esw,
int mlx5_eswitch_get_vport_stats(struct mlx5_eswitch *esw,
int vport,
struct ifla_vf_stats *vf_stats);
-struct mlx5_flow_rule *
-mlx5_eswitch_add_send_to_vport_rule(struct mlx5_eswitch *esw, int vport, u32 sqn);
struct mlx5_flow_rule *
mlx5_eswitch_create_vport_rx_rule(struct mlx5_eswitch *esw, int vport, u32 tirn);
+int mlx5_eswitch_sqs2vport_start(struct mlx5_eswitch *esw,
+ struct mlx5_eswitch_rep *rep,
+ u16 *sqns_array, int sqns_num);
+void mlx5_eswitch_sqs2vport_stop(struct mlx5_eswitch *esw,
+ struct mlx5_eswitch_rep *rep);
+
int mlx5_devlink_eswitch_mode_set(struct devlink *devlink, u16 mode);
int mlx5_devlink_eswitch_mode_get(struct devlink *devlink, u16 *mode);
void mlx5_eswitch_register_vport_rep(struct mlx5_eswitch *esw,
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
index a7d5568c..04544be 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
@@ -38,7 +38,7 @@
#include "mlx5_core.h"
#include "eswitch.h"
-struct mlx5_flow_rule *
+static struct mlx5_flow_rule *
mlx5_eswitch_add_send_to_vport_rule(struct mlx5_eswitch *esw, int vport, u32 sqn)
{
struct mlx5_flow_destination dest;
@@ -77,6 +77,63 @@ out:
return flow_rule;
}
+/*
+ * Remove every send-to-vport rule recorded on @rep's vport_sqs_list and
+ * free the list entries. Does nothing unless the e-switch is in
+ * SRIOV_OFFLOADS mode.
+ */
+void mlx5_eswitch_sqs2vport_stop(struct mlx5_eswitch *esw,
+				 struct mlx5_eswitch_rep *rep)
+{
+	struct mlx5_esw_sq *entry, *next;
+
+	if (esw->mode != SRIOV_OFFLOADS)
+		return;
+
+	list_for_each_entry_safe(entry, next, &rep->vport_sqs_list, list) {
+		list_del(&entry->list);
+		mlx5_del_flow_rule(entry->send_to_vport_rule);
+		kfree(entry);
+	}
+}
+
+/*
+ * Install one send-to-vport rule for each of the @sqns_num SQ numbers in
+ * @sqns_array and record it on @rep's vport_sqs_list. A rep whose vport is
+ * 0 is addressed as FDB_UPLINK_VPORT in the rules. On any failure all rules
+ * recorded on the list are removed again.
+ *
+ * Returns 0 on success (or when the e-switch is not in SRIOV_OFFLOADS mode,
+ * in which case nothing is done), otherwise a negative errno.
+ */
+int mlx5_eswitch_sqs2vport_start(struct mlx5_eswitch *esw,
+				 struct mlx5_eswitch_rep *rep,
+				 u16 *sqns_array, int sqns_num)
+{
+	struct mlx5_flow_rule *rule;
+	struct mlx5_esw_sq *entry;
+	int dst_vport;
+	int idx;
+	int err;
+
+	if (esw->mode != SRIOV_OFFLOADS)
+		return 0;
+
+	dst_vport = rep->vport;
+	if (dst_vport == 0)
+		dst_vport = FDB_UPLINK_VPORT;
+
+	for (idx = 0; idx < sqns_num; idx++) {
+		entry = kzalloc(sizeof(*entry), GFP_KERNEL);
+		if (!entry) {
+			err = -ENOMEM;
+			goto undo;
+		}
+
+		/* Add re-inject rule to the PF/representor sqs */
+		rule = mlx5_eswitch_add_send_to_vport_rule(esw, dst_vport,
+							   sqns_array[idx]);
+		if (IS_ERR(rule)) {
+			err = PTR_ERR(rule);
+			kfree(entry);
+			goto undo;
+		}
+
+		entry->send_to_vport_rule = rule;
+		list_add(&entry->list, &rep->vport_sqs_list);
+	}
+
+	return 0;
+
+undo:
+	mlx5_eswitch_sqs2vport_stop(esw, rep);
+	return err;
+}
+
static int esw_add_fdb_miss_rule(struct mlx5_eswitch *esw)
{
struct mlx5_flow_destination dest;
@@ -349,6 +406,8 @@ static int esw_offloads_start(struct mlx5_eswitch *esw)
int esw_offloads_init(struct mlx5_eswitch *esw, int nvports)
{
+ struct mlx5_eswitch_rep *rep;
+ int vport;
int err;
err = esw_create_offloads_fdb_table(esw, nvports);
@@ -363,8 +422,26 @@ int esw_offloads_init(struct mlx5_eswitch *esw, int nvports)
if (err)
goto create_fg_err;
+ for (vport = 0; vport < nvports; vport++) {
+ rep = &esw->offloads.vport_reps[vport];
+ if (!rep->valid)
+ continue;
+
+ err = rep->load(esw, rep);
+ if (err)
+ goto err_reps;
+ }
return 0;
+err_reps:
+ for (vport--; vport >= 0; vport--) {
+ rep = &esw->offloads.vport_reps[vport];
+ if (!rep->valid)
+ continue;
+ rep->unload(esw, rep);
+ }
+ esw_destroy_vport_rx_group(esw);
+
create_fg_err:
esw_destroy_offloads_table(esw);
@@ -387,6 +464,16 @@ static int esw_offloads_stop(struct mlx5_eswitch *esw)
void esw_offloads_cleanup(struct mlx5_eswitch *esw, int nvports)
{
+ struct mlx5_eswitch_rep *rep;
+ int vport;
+
+ for (vport = 0; vport < nvports; vport++) {
+ rep = &esw->offloads.vport_reps[vport];
+ if (!rep->valid)
+ continue;
+ rep->unload(esw, rep);
+ }
+
esw_destroy_vport_rx_group(esw);
esw_destroy_offloads_table(esw);
esw_destroy_offloads_fdb_table(esw);
@@ -443,6 +530,7 @@ void mlx5_eswitch_register_vport_rep(struct mlx5_eswitch *esw,
memcpy(&offloads->vport_reps[rep->vport], rep,
sizeof(struct mlx5_eswitch_rep));
+ INIT_LIST_HEAD(&offloads->vport_reps[rep->vport].vport_sqs_list);
offloads->vport_reps[rep->vport].valid = true;
}
@@ -450,6 +538,12 @@ void mlx5_eswitch_unregister_vport_rep(struct mlx5_eswitch *esw,
int vport)
{
struct mlx5_esw_offload *offloads = &esw->offloads;
+ struct mlx5_eswitch_rep *rep;
+
+ rep = &offloads->vport_reps[vport];
+
+ if (esw->mode == SRIOV_OFFLOADS && esw->vports[vport].enabled)
+ rep->unload(esw, rep);
offloads->vport_reps[vport].valid = false;
}
--
2.8.0
^ permalink raw reply related [flat|nested] 47+ messages in thread