The following patchset opens support for offloading tuples in parallel. Patches for netfilter replace the flow table block lock with rw sem, and use a work entry per offload command, so they can be run in parallel under rw sem read lock. MLX5 patch removes the unguarded ct entries list, and instead uses rhashtable's iterator to support the above. --- Changelog: v1->v2: Rebased on dave's net-next, HEAD: 5bb7357 ("Merge branch 'cls_flower-Use-extack-in-fl_set_key'") Paul Blakey (3): netfilter: flowtable: Use rw sem as flow block lock netfilter: flowtable: Use work entry per offload command net/mlx5: CT: Use rhashtable's ct entries instead of a seperate list drivers/net/ethernet/mellanox/mlx5/core/en/tc_ct.c | 19 +++----- include/net/netfilter/nf_flow_table.h | 2 +- net/netfilter/nf_flow_table_core.c | 11 +++-- net/netfilter/nf_flow_table_offload.c | 50 ++++++++-------------- 4 files changed, 30 insertions(+), 52 deletions(-) -- 1.8.3.1
Currently flow offload threads are synchronized by the flow block mutex. Use rw lock instead to increase flow insertion (read) concurrency. Signed-off-by: Paul Blakey <paulb@mellanox.com> Reviewed-by: Oz Shlomo <ozsh@mellanox.com> --- include/net/netfilter/nf_flow_table.h | 2 +- net/netfilter/nf_flow_table_core.c | 11 +++++------ net/netfilter/nf_flow_table_offload.c | 4 ++-- 3 files changed, 8 insertions(+), 9 deletions(-) diff --git a/include/net/netfilter/nf_flow_table.h b/include/net/netfilter/nf_flow_table.h index f523ea8..956193b 100644 --- a/include/net/netfilter/nf_flow_table.h +++ b/include/net/netfilter/nf_flow_table.h @@ -73,7 +73,7 @@ struct nf_flowtable { struct delayed_work gc_work; unsigned int flags; struct flow_block flow_block; - struct mutex flow_block_lock; /* Guards flow_block */ + struct rw_semaphore flow_block_lock; /* Guards flow_block */ possible_net_t net; }; diff --git a/net/netfilter/nf_flow_table_core.c b/net/netfilter/nf_flow_table_core.c index d523c2a..c0cb7949 100644 --- a/net/netfilter/nf_flow_table_core.c +++ b/net/netfilter/nf_flow_table_core.c @@ -392,7 +392,7 @@ int nf_flow_table_offload_add_cb(struct nf_flowtable *flow_table, struct flow_block_cb *block_cb; int err = 0; - mutex_lock(&flow_table->flow_block_lock); + down_write(&flow_table->flow_block_lock); block_cb = flow_block_cb_lookup(block, cb, cb_priv); if (block_cb) { err = -EEXIST; @@ -408,7 +408,7 @@ int nf_flow_table_offload_add_cb(struct nf_flowtable *flow_table, list_add_tail(&block_cb->list, &block->cb_list); unlock: - mutex_unlock(&flow_table->flow_block_lock); + up_write(&flow_table->flow_block_lock); return err; } EXPORT_SYMBOL_GPL(nf_flow_table_offload_add_cb); @@ -419,13 +419,13 @@ void nf_flow_table_offload_del_cb(struct nf_flowtable *flow_table, struct flow_block *block = &flow_table->flow_block; struct flow_block_cb *block_cb; - mutex_lock(&flow_table->flow_block_lock); + down_write(&flow_table->flow_block_lock); block_cb = flow_block_cb_lookup(block, cb, cb_priv); if (block_cb) list_del(&block_cb->list); else WARN_ON(true); - mutex_unlock(&flow_table->flow_block_lock); + up_write(&flow_table->flow_block_lock); } EXPORT_SYMBOL_GPL(nf_flow_table_offload_del_cb); @@ -551,7 +551,7 @@ int nf_flow_table_init(struct nf_flowtable *flowtable) INIT_DEFERRABLE_WORK(&flowtable->gc_work, nf_flow_offload_work_gc); flow_block_init(&flowtable->flow_block); - mutex_init(&flowtable->flow_block_lock); + init_rwsem(&flowtable->flow_block_lock); err = rhashtable_init(&flowtable->rhashtable, &nf_flow_offload_rhash_params); @@ -617,7 +617,6 @@ void nf_flow_table_free(struct nf_flowtable *flow_table) nf_flow_table_iterate(flow_table, nf_flow_offload_gc_step, flow_table); rhashtable_destroy(&flow_table->rhashtable); - mutex_destroy(&flow_table->flow_block_lock); } EXPORT_SYMBOL_GPL(nf_flow_table_free); diff --git a/net/netfilter/nf_flow_table_offload.c b/net/netfilter/nf_flow_table_offload.c index 024a916..dbd3685 100644 --- a/net/netfilter/nf_flow_table_offload.c +++ b/net/netfilter/nf_flow_table_offload.c @@ -692,7 +692,7 @@ static int nf_flow_offload_tuple(struct nf_flowtable *flowtable, if (cmd == FLOW_CLS_REPLACE) cls_flow.rule = flow_rule->rule; - mutex_lock(&flowtable->flow_block_lock); + down_read(&flowtable->flow_block_lock); list_for_each_entry(block_cb, block_cb_list, list) { err = block_cb->cb(TC_SETUP_CLSFLOWER, &cls_flow, block_cb->cb_priv); @@ -701,7 +701,7 @@ static int nf_flow_offload_tuple(struct nf_flowtable *flowtable, i++; } - mutex_unlock(&flowtable->flow_block_lock); + up_read(&flowtable->flow_block_lock); if (cmd == FLOW_CLS_STATS) memcpy(stats, &cls_flow.stats, sizeof(*stats)); -- 1.8.3.1
To allow offload commands to execute in parallel, create workqueue for flow table offload, and use a work entry per offload command. Signed-off-by: Paul Blakey <paulb@mellanox.com> Reviewed-by: Oz Shlomo <ozsh@mellanox.com> --- net/netfilter/nf_flow_table_offload.c | 46 ++++++++++++----------------------- 1 file changed, 15 insertions(+), 31 deletions(-) diff --git a/net/netfilter/nf_flow_table_offload.c b/net/netfilter/nf_flow_table_offload.c index dbd3685..527114a 100644 --- a/net/netfilter/nf_flow_table_offload.c +++ b/net/netfilter/nf_flow_table_offload.c @@ -12,9 +12,7 @@ #include <net/netfilter/nf_conntrack_core.h> #include <net/netfilter/nf_conntrack_tuple.h> -static struct work_struct nf_flow_offload_work; -static DEFINE_SPINLOCK(flow_offload_pending_list_lock); -static LIST_HEAD(flow_offload_pending_list); +static struct workqueue_struct *nf_flow_offload_wq; struct flow_offload_work { struct list_head list; @@ -22,6 +20,7 @@ struct flow_offload_work { int priority; struct nf_flowtable *flowtable; struct flow_offload *flow; + struct work_struct work; }; #define NF_FLOW_DISSECTOR(__match, __type, __field) \ @@ -789,15 +788,10 @@ static void flow_offload_work_stats(struct flow_offload_work *offload) static void flow_offload_work_handler(struct work_struct *work) { - struct flow_offload_work *offload, *next; - LIST_HEAD(offload_pending_list); - - spin_lock_bh(&flow_offload_pending_list_lock); - list_replace_init(&flow_offload_pending_list, &offload_pending_list); - spin_unlock_bh(&flow_offload_pending_list_lock); + struct flow_offload_work *offload; - list_for_each_entry_safe(offload, next, &offload_pending_list, list) { - switch (offload->cmd) { + offload = container_of(work, struct flow_offload_work, work); + switch (offload->cmd) { case FLOW_CLS_REPLACE: flow_offload_work_add(offload); break; @@ -809,19 +803,14 @@ static void flow_offload_work_handler(struct work_struct *work) break; default: WARN_ON_ONCE(1); - } - list_del(&offload->list); - kfree(offload); } + + kfree(offload); } static void flow_offload_queue_work(struct flow_offload_work *offload) { - spin_lock_bh(&flow_offload_pending_list_lock); - list_add_tail(&offload->list, &flow_offload_pending_list); - spin_unlock_bh(&flow_offload_pending_list_lock); - - schedule_work(&nf_flow_offload_work); + queue_work(nf_flow_offload_wq, &offload->work); } static struct flow_offload_work * @@ -838,6 +827,7 @@ static void flow_offload_queue_work(struct flow_offload_work *offload) offload->flow = flow; offload->priority = flowtable->priority; offload->flowtable = flowtable; + INIT_WORK(&offload->work, flow_offload_work_handler); return offload; } @@ -888,7 +878,7 @@ void nf_flow_offload_stats(struct nf_flowtable *flowtable, void nf_flow_table_offload_flush(struct nf_flowtable *flowtable) { if (nf_flowtable_hw_offload(flowtable)) - flush_work(&nf_flow_offload_work); + flush_workqueue(nf_flow_offload_wq); } static int nf_flow_table_block_setup(struct nf_flowtable *flowtable, @@ -1053,7 +1043,10 @@ static void nf_flow_table_indr_block_cb(struct net_device *dev, int nf_flow_table_offload_init(void) { - INIT_WORK(&nf_flow_offload_work, flow_offload_work_handler); + nf_flow_offload_wq = alloc_workqueue("nf_flow_table_offload", + WQ_UNBOUND | WQ_MEM_RECLAIM, 0); + if (!nf_flow_offload_wq) + return -ENOMEM; flow_indr_add_block_cb(&block_ing_entry); @@ -1062,15 +1055,6 @@ int nf_flow_table_offload_init(void) void nf_flow_table_offload_exit(void) { - struct flow_offload_work *offload, *next; - LIST_HEAD(offload_pending_list); - flow_indr_del_block_cb(&block_ing_entry); - - cancel_work_sync(&nf_flow_offload_work); - - list_for_each_entry_safe(offload, next, &offload_pending_list, list) { - list_del(&offload->list); - kfree(offload); - } + destroy_workqueue(nf_flow_offload_wq); } -- 1.8.3.1
CT entries list is only used while freeing a ct zone flow table to go over all the ct entries offloaded on that zone/table, and flush the table. Rhashtable already provides an api to go over all the inserted entries. Use it instead, and remove the list. Signed-off-by: Paul Blakey <paulb@mellanox.com> Reviewed-by: Oz Shlomo <ozsh@mellanox.com> --- drivers/net/ethernet/mellanox/mlx5/core/en/tc_ct.c | 19 +++++++------------ 1 file changed, 7 insertions(+), 12 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/tc_ct.c b/drivers/net/ethernet/mellanox/mlx5/core/en/tc_ct.c index a22ad6b..afc8ac3 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/tc_ct.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/tc_ct.c @@ -67,11 +67,9 @@ struct mlx5_ct_ft { struct nf_flowtable *nf_ft; struct mlx5_tc_ct_priv *ct_priv; struct rhashtable ct_entries_ht; - struct list_head ct_entries_list; }; struct mlx5_ct_entry { - struct list_head list; u16 zone; struct rhash_head node; struct flow_rule *flow_rule; @@ -617,8 +615,6 @@ struct mlx5_ct_entry { if (err) goto err_insert; - list_add(&entry->list, &ft->ct_entries_list); - return 0; err_insert: @@ -646,7 +642,6 @@ struct mlx5_ct_entry { WARN_ON(rhashtable_remove_fast(&ft->ct_entries_ht, &entry->node, cts_ht_params)); - list_del(&entry->list); kfree(entry); return 0; @@ -817,7 +812,6 @@ struct mlx5_ct_entry { ft->zone = zone; ft->nf_ft = nf_ft; ft->ct_priv = ct_priv; - INIT_LIST_HEAD(&ft->ct_entries_list); refcount_set(&ft->refcount, 1); err = rhashtable_init(&ft->ct_entries_ht, &cts_ht_params); @@ -846,12 +840,12 @@ struct mlx5_ct_entry { } static void -mlx5_tc_ct_flush_ft(struct mlx5_tc_ct_priv *ct_priv, struct mlx5_ct_ft *ft) +mlx5_tc_ct_flush_ft_entry(void *ptr, void *arg) { - struct mlx5_ct_entry *entry; + struct mlx5_tc_ct_priv *ct_priv = arg; + struct mlx5_ct_entry *entry = ptr; - list_for_each_entry(entry, &ft->ct_entries_list, list) - mlx5_tc_ct_entry_del_rules(ft->ct_priv, entry); + mlx5_tc_ct_entry_del_rules(ct_priv, entry); } static void @@ -862,9 +856,10 @@ struct mlx5_ct_entry { nf_flow_table_offload_del_cb(ft->nf_ft, mlx5_tc_ct_block_flow_offload, ft); - mlx5_tc_ct_flush_ft(ct_priv, ft); rhashtable_remove_fast(&ct_priv->zone_ht, &ft->node, zone_params); - rhashtable_destroy(&ft->ct_entries_ht); + rhashtable_free_and_destroy(&ft->ct_entries_ht, + mlx5_tc_ct_flush_ft_entry, + ct_priv); kfree(ft); } -- 1.8.3.1
On Fri, Mar 27, 2020 at 12:12:28PM +0300, Paul Blakey wrote:
> The following patchset opens support for offloading tuples in parallel.
>
> Patches for netfilter replace the flow table block lock with rw sem,
> and use a work entry per offload command, so they can be run in
> parallel under rw sem read lock.
I'll apply 1/3 and 2/3 to nf-next.
@Saeed: please handle patch 3/3 through the MLX5 driver tree.
Thanks.
On Fri, Mar 27, 2020 at 12:12:29PM +0300, Paul Blakey wrote:
> Currently flow offload threads are synchronized by the flow block mutex.
> Use rw lock instead to increase flow insertion (read) concurrency.
Applied to nf-next, thanks.
On Fri, Mar 27, 2020 at 12:12:30PM +0300, Paul Blakey wrote:
> To allow offload commands to execute in parallel, create workqueue
> for flow table offload, and use a work entry per offload command.
Also applied to nf-next, thanks.