linux-kernel.vger.kernel.org archive mirror
* [PATCH rdma-next 0/2] Add gratuitous ARP support to RDMA-CM
@ 2022-04-04 12:27 Leon Romanovsky
  2022-04-04 12:27 ` [PATCH rdma-next 1/2] RDMA/core: Add an rb_tree that stores cm_ids sorted by ifindex and remote IP Leon Romanovsky
                   ` (2 more replies)
  0 siblings, 3 replies; 7+ messages in thread
From: Leon Romanovsky @ 2022-04-04 12:27 UTC (permalink / raw)
  To: Jason Gunthorpe
  Cc: Leon Romanovsky, linux-kernel, linux-rdma, Mark Zhang, Patrisious Haddad

From: Leon Romanovsky <leonro@nvidia.com>

In this series, Patrisious adds gratuitous ARP support to RDMA-CM, in
order to speed up migration failover from one node to another.

Thanks

Patrisious Haddad (2):
  RDMA/core: Add an rb_tree that stores cm_ids sorted by ifindex and
    remote IP
  RDMA/core: Add a netevent notifier to cma

 drivers/infiniband/core/cma.c      | 260 +++++++++++++++++++++++++++--
 drivers/infiniband/core/cma_priv.h |   1 +
 2 files changed, 249 insertions(+), 12 deletions(-)

-- 
2.35.1



* [PATCH rdma-next 1/2] RDMA/core: Add an rb_tree that stores cm_ids sorted by ifindex and remote IP
  2022-04-04 12:27 [PATCH rdma-next 0/2] Add gratuitous ARP support to RDMA-CM Leon Romanovsky
@ 2022-04-04 12:27 ` Leon Romanovsky
  2022-05-10 23:56   ` Jason Gunthorpe
  2022-04-04 12:27 ` [PATCH rdma-next 2/2] RDMA/core: Add a netevent notifier to cma Leon Romanovsky
  2022-04-29 16:07 ` [PATCH rdma-next 0/2] Add gratuitous ARP support to RDMA-CM Olga Kornievskaia
  2 siblings, 1 reply; 7+ messages in thread
From: Leon Romanovsky @ 2022-04-04 12:27 UTC (permalink / raw)
  To: Jason Gunthorpe; +Cc: Patrisious Haddad, linux-kernel, linux-rdma, Mark Zhang

From: Patrisious Haddad <phaddad@nvidia.com>

Add a tree to the cma that keeps track of all the rdma_id_private
channels created while in RoCE mode.

The IDs are sorted first by their netdevice ifindex and then by their
destination IP. IDs that share both an ifindex and a destination IP live
at the same tree node, since each node's payload is a list of all the
IDs with that key.

The tree allows fast lookup of IDs by ifindex and IP address, which is
useful for promptly identifying the net_events relevant to them.
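The next patch in the series uses the lookup roughly like this (a
condensed sketch of the helpers added below, where "ifindex" and "dst"
come from the caller):

	struct id_table_entry *entry;
	struct rdma_id_private *id_priv;

	/* one tree node covers every ID with this (ifindex, dst IP) key */
	entry = node_from_ndev_ip(&id_table, ifindex, dst);
	if (entry)
		list_for_each_entry(id_priv, &entry->id_list, id_list_entry)
			/* act on each matching cm_id */;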

Signed-off-by: Patrisious Haddad <phaddad@nvidia.com>
Reviewed-by: Mark Zhang <markzhang@nvidia.com>
Signed-off-by: Leon Romanovsky <leonro@nvidia.com>
---
 drivers/infiniband/core/cma.c      | 156 ++++++++++++++++++++++++++---
 drivers/infiniband/core/cma_priv.h |   1 +
 2 files changed, 145 insertions(+), 12 deletions(-)

diff --git a/drivers/infiniband/core/cma.c b/drivers/infiniband/core/cma.c
index fabca5e51e3d..bfe2b70daf39 100644
--- a/drivers/infiniband/core/cma.c
+++ b/drivers/infiniband/core/cma.c
@@ -11,6 +11,7 @@
 #include <linux/in6.h>
 #include <linux/mutex.h>
 #include <linux/random.h>
+#include <linux/rbtree.h>
 #include <linux/igmp.h>
 #include <linux/xarray.h>
 #include <linux/inetdevice.h>
@@ -168,6 +169,9 @@ static struct ib_sa_client sa_client;
 static LIST_HEAD(dev_list);
 static LIST_HEAD(listen_any_list);
 static DEFINE_MUTEX(lock);
+static struct rb_root id_table = RB_ROOT;
+/* Serialize operations of id_table tree */
+static DEFINE_SPINLOCK(id_table_lock);
 static struct workqueue_struct *cma_wq;
 static unsigned int cma_pernet_id;
 
@@ -202,6 +206,11 @@ struct xarray *cma_pernet_xa(struct net *net, enum rdma_ucm_port_space ps)
 	}
 }
 
+struct id_table_entry {
+	struct list_head id_list;
+	struct rb_node rb_node;
+};
+
 struct cma_device {
 	struct list_head	list;
 	struct ib_device	*device;
@@ -420,11 +429,21 @@ static inline u8 cma_get_ip_ver(const struct cma_hdr *hdr)
 	return hdr->ip_version >> 4;
 }
 
-static inline void cma_set_ip_ver(struct cma_hdr *hdr, u8 ip_ver)
+static void cma_set_ip_ver(struct cma_hdr *hdr, u8 ip_ver)
 {
 	hdr->ip_version = (ip_ver << 4) | (hdr->ip_version & 0xF);
 }
 
+static struct sockaddr *cma_src_addr(struct rdma_id_private *id_priv)
+{
+	return (struct sockaddr *)&id_priv->id.route.addr.src_addr;
+}
+
+static inline struct sockaddr *cma_dst_addr(struct rdma_id_private *id_priv)
+{
+	return (struct sockaddr *)&id_priv->id.route.addr.dst_addr;
+}
+
 static int cma_igmp_send(struct net_device *ndev, union ib_gid *mgid, bool join)
 {
 	struct in_device *in_dev = NULL;
@@ -445,6 +464,124 @@ static int cma_igmp_send(struct net_device *ndev, union ib_gid *mgid, bool join)
 	return (in_dev) ? 0 : -ENODEV;
 }
 
+static int compare_netdev_and_ip(int ifindex_a, struct sockaddr *sa,
+				 int ifindex_b, struct sockaddr *sb)
+{
+	if (ifindex_a != ifindex_b)
+		return ifindex_a - ifindex_b;
+
+	if (sa->sa_family != sb->sa_family)
+		return sa->sa_family - sb->sa_family;
+
+	if (sa->sa_family == AF_INET)
+		return (int)__be32_to_cpu(
+			       ((struct sockaddr_in *)sa)->sin_addr.s_addr) -
+		       (int)__be32_to_cpu(
+			       ((struct sockaddr_in *)sb)->sin_addr.s_addr);
+
+	return memcmp((char *)&((struct sockaddr_in6 *)sa)->sin6_addr,
+		      (char *)&((struct sockaddr_in6 *)sb)->sin6_addr,
+		      sizeof(((struct sockaddr_in6 *)sa)->sin6_addr));
+}
+
+static int cma_add_id_to_tree(struct rdma_id_private *node_id_priv)
+{
+	struct rb_node **new = &id_table.rb_node, *parent = NULL;
+	struct id_table_entry *this, *node;
+	struct rdma_id_private *id_priv;
+	unsigned long flags;
+	int result;
+
+	node = kzalloc(sizeof(*node), GFP_KERNEL);
+	if (!node)
+		return -ENOMEM;
+
+	spin_lock_irqsave(&id_table_lock, flags);
+	while (*new) {
+		this = container_of(*new, struct id_table_entry, rb_node);
+		id_priv = list_first_entry(
+			&this->id_list, struct rdma_id_private, id_list_entry);
+		result = compare_netdev_and_ip(
+			node_id_priv->id.route.addr.dev_addr.bound_dev_if,
+			cma_dst_addr(node_id_priv),
+			id_priv->id.route.addr.dev_addr.bound_dev_if,
+			cma_dst_addr(id_priv));
+
+		parent = *new;
+		if (result < 0)
+			new = &((*new)->rb_left);
+		else if (result > 0)
+			new = &((*new)->rb_right);
+		else {
+			list_add_tail(&node_id_priv->id_list_entry,
+				      &this->id_list);
+			kfree(node);
+			goto unlock;
+		}
+	}
+
+	INIT_LIST_HEAD(&node->id_list);
+	list_add_tail(&node_id_priv->id_list_entry, &node->id_list);
+
+	rb_link_node(&node->rb_node, parent, new);
+	rb_insert_color(&node->rb_node, &id_table);
+
+unlock:
+	spin_unlock_irqrestore(&id_table_lock, flags);
+	return 0;
+}
+
+static struct id_table_entry *
+node_from_ndev_ip(struct rb_root *root, int ifindex, struct sockaddr *sa)
+{
+	struct rb_node *node = root->rb_node;
+	struct rdma_id_private *node_id_priv;
+	struct id_table_entry *data;
+	int result;
+
+	while (node) {
+		data = container_of(node, struct id_table_entry, rb_node);
+		node_id_priv = list_first_entry(
+			&data->id_list, struct rdma_id_private, id_list_entry);
+		result = compare_netdev_and_ip(
+			ifindex, sa,
+			node_id_priv->id.route.addr.dev_addr.bound_dev_if,
+			cma_dst_addr(node_id_priv));
+		if (result < 0)
+			node = node->rb_left;
+		else if (result > 0)
+			node = node->rb_right;
+		else
+			return data;
+	}
+
+	return NULL;
+}
+
+static void cma_remove_id_from_tree(struct rdma_id_private *id_priv)
+{
+	struct id_table_entry *data;
+	unsigned long flags;
+
+	spin_lock_irqsave(&id_table_lock, flags);
+	if (list_empty(&id_priv->id_list_entry))
+		goto out;
+
+	data = node_from_ndev_ip(&id_table,
+				 id_priv->id.route.addr.dev_addr.bound_dev_if,
+				 cma_dst_addr(id_priv));
+	if (!data)
+		goto out;
+
+	list_del_init(&id_priv->id_list_entry);
+	if (list_empty(&data->id_list)) {
+		rb_erase(&data->rb_node, &id_table);
+		kfree(data);
+	}
+out:
+	spin_unlock_irqrestore(&id_table_lock, flags);
+}
+
 static void _cma_attach_to_dev(struct rdma_id_private *id_priv,
 			       struct cma_device *cma_dev)
 {
@@ -481,16 +618,6 @@ static void cma_release_dev(struct rdma_id_private *id_priv)
 	mutex_unlock(&lock);
 }
 
-static inline struct sockaddr *cma_src_addr(struct rdma_id_private *id_priv)
-{
-	return (struct sockaddr *) &id_priv->id.route.addr.src_addr;
-}
-
-static inline struct sockaddr *cma_dst_addr(struct rdma_id_private *id_priv)
-{
-	return (struct sockaddr *) &id_priv->id.route.addr.dst_addr;
-}
-
 static inline unsigned short cma_family(struct rdma_id_private *id_priv)
 {
 	return id_priv->id.route.addr.src_addr.ss_family;
@@ -861,6 +988,7 @@ __rdma_create_id(struct net *net, rdma_cm_event_handler event_handler,
 	refcount_set(&id_priv->refcount, 1);
 	mutex_init(&id_priv->handler_mutex);
 	INIT_LIST_HEAD(&id_priv->device_item);
+	INIT_LIST_HEAD(&id_priv->id_list_entry);
 	INIT_LIST_HEAD(&id_priv->listen_list);
 	INIT_LIST_HEAD(&id_priv->mc_list);
 	get_random_bytes(&id_priv->seq_num, sizeof id_priv->seq_num);
@@ -1883,6 +2011,7 @@ static void _destroy_id(struct rdma_id_private *id_priv,
 	cma_cancel_operation(id_priv, state);
 
 	rdma_restrack_del(&id_priv->res);
+	cma_remove_id_from_tree(id_priv);
 	if (id_priv->cma_dev) {
 		if (rdma_cap_ib_cm(id_priv->id.device, 1)) {
 			if (id_priv->cm_id.ib)
@@ -3172,8 +3301,11 @@ int rdma_resolve_route(struct rdma_cm_id *id, unsigned long timeout_ms)
 	cma_id_get(id_priv);
 	if (rdma_cap_ib_sa(id->device, id->port_num))
 		ret = cma_resolve_ib_route(id_priv, timeout_ms);
-	else if (rdma_protocol_roce(id->device, id->port_num))
+	else if (rdma_protocol_roce(id->device, id->port_num)) {
 		ret = cma_resolve_iboe_route(id_priv);
+		if (!ret)
+			cma_add_id_to_tree(id_priv);
+	}
 	else if (rdma_protocol_iwarp(id->device, id->port_num))
 		ret = cma_resolve_iw_route(id_priv);
 	else
diff --git a/drivers/infiniband/core/cma_priv.h b/drivers/infiniband/core/cma_priv.h
index 757a0ef79872..b7354c94cf1b 100644
--- a/drivers/infiniband/core/cma_priv.h
+++ b/drivers/infiniband/core/cma_priv.h
@@ -64,6 +64,7 @@ struct rdma_id_private {
 		struct list_head listen_item;
 		struct list_head listen_list;
 	};
+	struct list_head        id_list_entry;
 	struct cma_device	*cma_dev;
 	struct list_head	mc_list;
 
-- 
2.35.1



* [PATCH rdma-next 2/2] RDMA/core: Add a netevent notifier to cma
  2022-04-04 12:27 [PATCH rdma-next 0/2] Add gratuitous ARP support to RDMA-CM Leon Romanovsky
  2022-04-04 12:27 ` [PATCH rdma-next 1/2] RDMA/core: Add an rb_tree that stores cm_ids sorted by ifindex and remote IP Leon Romanovsky
@ 2022-04-04 12:27 ` Leon Romanovsky
  2022-05-11  0:04   ` Jason Gunthorpe
  2022-04-29 16:07 ` [PATCH rdma-next 0/2] Add gratuitous ARP support to RDMA-CM Olga Kornievskaia
  2 siblings, 1 reply; 7+ messages in thread
From: Leon Romanovsky @ 2022-04-04 12:27 UTC (permalink / raw)
  To: Jason Gunthorpe; +Cc: Patrisious Haddad, linux-kernel, linux-rdma, Mark Zhang

From: Patrisious Haddad <phaddad@nvidia.com>

Add a netevent callback to the cma, mainly to catch NETEVENT_NEIGH_UPDATE.

Previously, when a system with a failover MAC mechanism changed its MAC
address during a CM connection attempt, the RDMA-CM would take a long time
to disconnect and time out because it kept using the stale MAC address.

Now, when we get a NETEVENT_NEIGH_UPDATE, we check whether it is due to a
failover MAC change; if so, we destroy the CM ID immediately and notify the
user, sparing the unnecessary wait for the timeout.
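To the application this just means the usual unreachable event arrives
right away instead of after the timeout; e.g. with librdmacm (a minimal
user-space sketch, not part of this patch):

	struct rdma_cm_event *ev;

	if (!rdma_get_cm_event(channel, &ev)) {	/* 0 on success */
		if (ev->event == RDMA_CM_EVENT_UNREACHABLE)
			handle_unreachable(ev->id);	/* hypothetical; status is -ETIMEDOUT */
		rdma_ack_cm_event(ev);
	}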

Signed-off-by: Patrisious Haddad <phaddad@nvidia.com>
Reviewed-by: Mark Zhang <markzhang@nvidia.com>
Signed-off-by: Leon Romanovsky <leonro@nvidia.com>
---
 drivers/infiniband/core/cma.c | 104 ++++++++++++++++++++++++++++++++++
 1 file changed, 104 insertions(+)

diff --git a/drivers/infiniband/core/cma.c b/drivers/infiniband/core/cma.c
index bfe2b70daf39..c26fec94d032 100644
--- a/drivers/infiniband/core/cma.c
+++ b/drivers/infiniband/core/cma.c
@@ -21,6 +21,7 @@
 
 #include <net/net_namespace.h>
 #include <net/netns/generic.h>
+#include <net/netevent.h>
 #include <net/tcp.h>
 #include <net/ipv6.h>
 #include <net/ip_fib.h>
@@ -173,6 +174,7 @@ static struct rb_root id_table = RB_ROOT;
 /* Serialize operations of id_table tree */
 static DEFINE_SPINLOCK(id_table_lock);
 static struct workqueue_struct *cma_wq;
+static struct workqueue_struct *cma_netevent_wq;
 static unsigned int cma_pernet_id;
 
 struct cma_pernet {
@@ -373,6 +375,11 @@ struct cma_work {
 	struct rdma_cm_event	event;
 };
 
+struct cma_netevent_work {
+	struct work_struct work;
+	struct rdma_id_private *id_priv;
+};
+
 union cma_ip_addr {
 	struct in6_addr ip6;
 	struct {
@@ -5054,10 +5061,95 @@ static int cma_netdev_callback(struct notifier_block *self, unsigned long event,
 	return ret;
 }
 
+static void cma_netevent_work_handler(struct work_struct *_work)
+{
+	struct cma_netevent_work *network =
+		container_of(_work, struct cma_netevent_work, work);
+	struct rdma_cm_event event = {};
+
+	mutex_lock(&network->id_priv->handler_mutex);
+
+	if (READ_ONCE(network->id_priv->state) == RDMA_CM_DESTROYING ||
+	    READ_ONCE(network->id_priv->state) == RDMA_CM_DEVICE_REMOVAL)
+		goto out_unlock;
+
+	event.event = RDMA_CM_EVENT_UNREACHABLE;
+	event.status = -ETIMEDOUT;
+
+	if (cma_cm_event_handler(network->id_priv, &event)) {
+		__acquire(&network->id_priv->handler_mutex);
+		network->id_priv->cm_id.ib = NULL;
+		cma_id_put(network->id_priv);
+		destroy_id_handler_unlock(network->id_priv);
+		kfree(network);
+		return;
+	}
+
+out_unlock:
+	mutex_unlock(&network->id_priv->handler_mutex);
+	cma_id_put(network->id_priv);
+	kfree(network);
+}
+
+static int cma_netevent_callback(struct notifier_block *self,
+				 unsigned long event, void *ctx)
+{
+	struct id_table_entry *ips_node = NULL;
+	struct rdma_id_private *current_id;
+	struct cma_netevent_work *network;
+	struct neighbour *neigh = ctx;
+	unsigned long flags;
+
+	if (event != NETEVENT_NEIGH_UPDATE)
+		return NOTIFY_DONE;
+
+	spin_lock_irqsave(&id_table_lock, flags);
+	if (neigh->tbl->family == AF_INET6) {
+		struct sockaddr_in6 neigh_sock_6;
+
+		neigh_sock_6.sin6_family = AF_INET6;
+		neigh_sock_6.sin6_addr = *(struct in6_addr *)neigh->primary_key;
+		ips_node = node_from_ndev_ip(&id_table, neigh->dev->ifindex,
+					     (struct sockaddr *)&neigh_sock_6);
+	} else if (neigh->tbl->family == AF_INET) {
+		struct sockaddr_in neigh_sock_4;
+
+		neigh_sock_4.sin_family = AF_INET;
+		neigh_sock_4.sin_addr.s_addr = *(__be32 *)(neigh->primary_key);
+		ips_node = node_from_ndev_ip(&id_table, neigh->dev->ifindex,
+					     (struct sockaddr *)&neigh_sock_4);
+	} else
+		goto out;
+
+	if (!ips_node)
+		goto out;
+
+	list_for_each_entry(current_id, &ips_node->id_list, id_list_entry) {
+		if (!memcmp(current_id->id.route.addr.dev_addr.dst_dev_addr,
+			   neigh->ha, ETH_ALEN))
+			continue;
+		network = kzalloc(sizeof(*network), GFP_ATOMIC);
+		if (!network)
+			goto out;
+
+		INIT_WORK(&network->work, cma_netevent_work_handler);
+		network->id_priv = current_id;
+		cma_id_get(current_id);
+		queue_work(cma_netevent_wq, &network->work);
+	}
+out:
+	spin_unlock_irqrestore(&id_table_lock, flags);
+	return NOTIFY_DONE;
+}
+
 static struct notifier_block cma_nb = {
 	.notifier_call = cma_netdev_callback
 };
 
+static struct notifier_block cma_netevent_cb = {
+	.notifier_call = cma_netevent_callback
+};
+
 static void cma_send_device_removal_put(struct rdma_id_private *id_priv)
 {
 	struct rdma_cm_event event = { .event = RDMA_CM_EVENT_DEVICE_REMOVAL };
@@ -5274,12 +5366,19 @@ static int __init cma_init(void)
 	if (!cma_wq)
 		return -ENOMEM;
 
+	cma_netevent_wq = alloc_ordered_workqueue("rdma_cm_netevent", 0);
+	if (!cma_netevent_wq) {
+		ret = -ENOMEM;
+		goto err_netevent_wq;
+	}
+
 	ret = register_pernet_subsys(&cma_pernet_operations);
 	if (ret)
 		goto err_wq;
 
 	ib_sa_register_client(&sa_client);
 	register_netdevice_notifier(&cma_nb);
+	register_netevent_notifier(&cma_netevent_cb);
 
 	ret = ib_register_client(&cma_client);
 	if (ret)
@@ -5294,10 +5393,13 @@ static int __init cma_init(void)
 err_ib:
 	ib_unregister_client(&cma_client);
 err:
+	unregister_netevent_notifier(&cma_netevent_cb);
 	unregister_netdevice_notifier(&cma_nb);
 	ib_sa_unregister_client(&sa_client);
 	unregister_pernet_subsys(&cma_pernet_operations);
 err_wq:
+	destroy_workqueue(cma_netevent_wq);
+err_netevent_wq:
 	destroy_workqueue(cma_wq);
 	return ret;
 }
@@ -5306,9 +5408,11 @@ static void __exit cma_cleanup(void)
 {
 	cma_configfs_exit();
 	ib_unregister_client(&cma_client);
+	unregister_netevent_notifier(&cma_netevent_cb);
 	unregister_netdevice_notifier(&cma_nb);
 	ib_sa_unregister_client(&sa_client);
 	unregister_pernet_subsys(&cma_pernet_operations);
+	destroy_workqueue(cma_netevent_wq);
 	destroy_workqueue(cma_wq);
 }
 
-- 
2.35.1



* Re: [PATCH rdma-next 0/2] Add gratuitous ARP support to RDMA-CM
  2022-04-04 12:27 [PATCH rdma-next 0/2] Add gratuitous ARP support to RDMA-CM Leon Romanovsky
  2022-04-04 12:27 ` [PATCH rdma-next 1/2] RDMA/core: Add an rb_tree that stores cm_ids sorted by ifindex and remote IP Leon Romanovsky
  2022-04-04 12:27 ` [PATCH rdma-next 2/2] RDMA/core: Add a netevent notifier to cma Leon Romanovsky
@ 2022-04-29 16:07 ` Olga Kornievskaia
  2022-04-29 16:23   ` Leon Romanovsky
  2 siblings, 1 reply; 7+ messages in thread
From: Olga Kornievskaia @ 2022-04-29 16:07 UTC (permalink / raw)
  To: Leon Romanovsky
  Cc: Jason Gunthorpe, Leon Romanovsky, linux-kernel, linux-rdma,
	Mark Zhang, Patrisious Haddad

Hi Leon,

Are these the 2 patches that are supposed to fix: [Bug 214523] New:
RDMA Mellanox RoCE drivers are unresponsive to ARP updates during a
reconnect?

I could be wrong but I don't think they made it into the 5.18 pull, correct?

Thank you.

On Mon, Apr 4, 2022 at 8:36 AM Leon Romanovsky <leon@kernel.org> wrote:
>
> From: Leon Romanovsky <leonro@nvidia.com>
>
> In this series, Patrisious adds gratuitous ARP support to RDMA-CM, in
> order to speed up migration failover from one node to another.
>
> Thanks
>
> Patrisious Haddad (2):
>   RDMA/core: Add an rb_tree that stores cm_ids sorted by ifindex and
>     remote IP
>   RDMA/core: Add a netevent notifier to cma
>
>  drivers/infiniband/core/cma.c      | 260 +++++++++++++++++++++++++++--
>  drivers/infiniband/core/cma_priv.h |   1 +
>  2 files changed, 249 insertions(+), 12 deletions(-)
>
> --
> 2.35.1
>


* Re: [PATCH rdma-next 0/2] Add gratuitous ARP support to RDMA-CM
  2022-04-29 16:07 ` [PATCH rdma-next 0/2] Add gratuitous ARP support to RDMA-CM Olga Kornievskaia
@ 2022-04-29 16:23   ` Leon Romanovsky
  0 siblings, 0 replies; 7+ messages in thread
From: Leon Romanovsky @ 2022-04-29 16:23 UTC (permalink / raw)
  To: Olga Kornievskaia
  Cc: Jason Gunthorpe, linux-kernel, linux-rdma, Mark Zhang, Patrisious Haddad

On Fri, Apr 29, 2022 at 12:07:09PM -0400, Olga Kornievskaia wrote:
> Hi Leon,
> 
> Are these the 2 patches that are supposed to fix: [Bug 214523] New:
> RDMA Mellanox RoCE drivers are unresponsive to ARP updates during a
> reconnect?

Yes, you are right.

> 
> I could be wrong but I don't think they made it into the 5.18 pull, correct?

This is correct too.

Thanks

> 
> Thank you.
> 
> On Mon, Apr 4, 2022 at 8:36 AM Leon Romanovsky <leon@kernel.org> wrote:
> >
> > From: Leon Romanovsky <leonro@nvidia.com>
> >
> > In this series, Patrisious adds gratuitous ARP support to RDMA-CM, in
> > order to speed up migration failover from one node to another.
> >
> > Thanks
> >
> > Patrisious Haddad (2):
> >   RDMA/core: Add an rb_tree that stores cm_ids sorted by ifindex and
> >     remote IP
> >   RDMA/core: Add a netevent notifier to cma
> >
> >  drivers/infiniband/core/cma.c      | 260 +++++++++++++++++++++++++++--
> >  drivers/infiniband/core/cma_priv.h |   1 +
> >  2 files changed, 249 insertions(+), 12 deletions(-)
> >
> > --
> > 2.35.1
> >


* Re: [PATCH rdma-next 1/2] RDMA/core: Add an rb_tree that stores cm_ids sorted by ifindex and remote IP
  2022-04-04 12:27 ` [PATCH rdma-next 1/2] RDMA/core: Add an rb_tree that stores cm_ids sorted by ifindex and remote IP Leon Romanovsky
@ 2022-05-10 23:56   ` Jason Gunthorpe
  0 siblings, 0 replies; 7+ messages in thread
From: Jason Gunthorpe @ 2022-05-10 23:56 UTC (permalink / raw)
  To: Leon Romanovsky; +Cc: Patrisious Haddad, linux-kernel, linux-rdma, Mark Zhang

On Mon, Apr 04, 2022 at 03:27:26PM +0300, Leon Romanovsky wrote:
> +static int compare_netdev_and_ip(int ifindex_a, struct sockaddr *sa,
> +				 int ifindex_b, struct sockaddr *sb)
> +{
> +	if (ifindex_a != ifindex_b)
> +		return ifindex_a - ifindex_b;

These subtraction tricks don't work if the difference can overflow:
INT_MAX - INT_MIN is undefined behaviour.
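For illustration, an overflow-safe three-way compare could look like
this (a hypothetical replacement, not code from the patch):

	static int cmp_int(int a, int b)
	{
		if (a != b)
			return a < b ? -1 : 1;
		return 0;
	}

so e.g. the ifindex check becomes "return cmp_int(ifindex_a, ifindex_b);".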


> +static int cma_add_id_to_tree(struct rdma_id_private *node_id_priv)
> +{
> +	struct rb_node **new = &id_table.rb_node, *parent = NULL;

This read of rb_node has to be under the spinlock

> +	struct id_table_entry *this, *node;
> +	struct rdma_id_private *id_priv;
> +	unsigned long flags;
> +	int result;
> +
> +	node = kzalloc(sizeof(*node), GFP_KERNEL);
> +	if (!node)
> +		return -ENOMEM;
> +
> +	spin_lock_irqsave(&id_table_lock, flags);
> +	while (*new) {
> +		this = container_of(*new, struct id_table_entry, rb_node);

Because rebalancing can alter the head
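i.e. the initial read of the root needs to move under the lock as well,
roughly (a sketch of the fix):

	spin_lock_irqsave(&id_table_lock, flags);
	/* Read the root only while holding id_table_lock; a concurrent
	 * insert or erase may rebalance the tree and move the head. */
	new = &id_table.rb_node;
	while (*new) {
		...
	}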

> +		id_priv = list_first_entry(
> +			&this->id_list, struct rdma_id_private, id_list_entry);
> +		result = compare_netdev_and_ip(
> +			node_id_priv->id.route.addr.dev_addr.bound_dev_if,
> +			cma_dst_addr(node_id_priv),
> +			id_priv->id.route.addr.dev_addr.bound_dev_if,
> +			cma_dst_addr(id_priv));

This pattern keeps repeating; one of the arguments to the compare should
just be the id_table_entry *, with the list_first_entry() etc. done
inside the compare.
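Something like this hypothetical helper, built from the patch's own
functions, would do:

	static int compare_netdev_and_ip_entry(int ifindex, struct sockaddr *sa,
					       struct id_table_entry *entry)
	{
		struct rdma_id_private *id_priv = list_first_entry(
			&entry->id_list, struct rdma_id_private, id_list_entry);

		return compare_netdev_and_ip(
			ifindex, sa,
			id_priv->id.route.addr.dev_addr.bound_dev_if,
			cma_dst_addr(id_priv));
	}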

Jason


* Re: [PATCH rdma-next 2/2] RDMA/core: Add a netevent notifier to cma
  2022-04-04 12:27 ` [PATCH rdma-next 2/2] RDMA/core: Add a netevent notifier to cma Leon Romanovsky
@ 2022-05-11  0:04   ` Jason Gunthorpe
  0 siblings, 0 replies; 7+ messages in thread
From: Jason Gunthorpe @ 2022-05-11  0:04 UTC (permalink / raw)
  To: Leon Romanovsky; +Cc: Patrisious Haddad, linux-kernel, linux-rdma, Mark Zhang

On Mon, Apr 04, 2022 at 03:27:27PM +0300, Leon Romanovsky wrote:

> @@ -5054,10 +5061,95 @@ static int cma_netdev_callback(struct notifier_block *self, unsigned long event,
>  	return ret;
>  }
>  
> +static void cma_netevent_work_handler(struct work_struct *_work)
> +{
> +	struct cma_netevent_work *network =
> +		container_of(_work, struct cma_netevent_work, work);
> +	struct rdma_cm_event event = {};
> +
> +	mutex_lock(&network->id_priv->handler_mutex);
> +
> +	if (READ_ONCE(network->id_priv->state) == RDMA_CM_DESTROYING ||
> +	    READ_ONCE(network->id_priv->state) == RDMA_CM_DEVICE_REMOVAL)
> +		goto out_unlock;
> +
> +	event.event = RDMA_CM_EVENT_UNREACHABLE;
> +	event.status = -ETIMEDOUT;
> +
> +	if (cma_cm_event_handler(network->id_priv, &event)) {
> +		__acquire(&network->id_priv->handler_mutex);

??

> +		network->id_priv->cm_id.ib = NULL;
> +		cma_id_put(network->id_priv);
> +		destroy_id_handler_unlock(network->id_priv);
> +		kfree(network);
> +		return;
> +	}
> +
> +out_unlock:
> +	mutex_unlock(&network->id_priv->handler_mutex);
> +	cma_id_put(network->id_priv);
> +	kfree(network);
> +}
> +
> +static int cma_netevent_callback(struct notifier_block *self,
> +				 unsigned long event, void *ctx)
> +{
> +	struct id_table_entry *ips_node = NULL;
> +	struct rdma_id_private *current_id;
> +	struct cma_netevent_work *network;
> +	struct neighbour *neigh = ctx;
> +	unsigned long flags;
> +
> +	if (event != NETEVENT_NEIGH_UPDATE)
> +		return NOTIFY_DONE;
> +
> +	spin_lock_irqsave(&id_table_lock, flags);
> +	if (neigh->tbl->family == AF_INET6) {
> +		struct sockaddr_in6 neigh_sock_6;
> +
> +		neigh_sock_6.sin6_family = AF_INET6;
> +		neigh_sock_6.sin6_addr = *(struct in6_addr *)neigh->primary_key;
> +		ips_node = node_from_ndev_ip(&id_table, neigh->dev->ifindex,
> +					     (struct sockaddr *)&neigh_sock_6);
> +	} else if (neigh->tbl->family == AF_INET) {
> +		struct sockaddr_in neigh_sock_4;
> +
> +		neigh_sock_4.sin_family = AF_INET;
> +		neigh_sock_4.sin_addr.s_addr = *(__be32 *)(neigh->primary_key);
> +		ips_node = node_from_ndev_ip(&id_table, neigh->dev->ifindex,
> +					     (struct sockaddr *)&neigh_sock_4);
> +	} else
> +		goto out;
> +
> +	if (!ips_node)
> +		goto out;
> +
> +	list_for_each_entry(current_id, &ips_node->id_list, id_list_entry) {
> +		if (!memcmp(current_id->id.route.addr.dev_addr.dst_dev_addr,
> +			   neigh->ha, ETH_ALEN))
> +			continue;
> +		network = kzalloc(sizeof(*network), GFP_ATOMIC);
> +		if (!network)
> +			goto out;
> +
> +		INIT_WORK(&network->work, cma_netevent_work_handler);
> +		network->id_priv = current_id;
> +		cma_id_get(current_id);
> +		queue_work(cma_netevent_wq, &network->work);

It is pretty ugly that we need to do an atomic allocation for every
matching id.

It would be better to add the work directly to the rdma_cm_id and just
waste that memory.
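i.e. roughly this (a sketch; "id_net_work" is a hypothetical
work_struct field added to struct rdma_id_private and INIT_WORK()ed
once at ID creation):

	/* in cma_netevent_callback(), replacing the kzalloc() */
	cma_id_get(current_id);
	if (!queue_work(cma_netevent_wq, &current_id->id_net_work))
		cma_id_put(current_id);	/* work was already queued */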

> +	cma_netevent_wq = alloc_ordered_workqueue("rdma_cm_netevent", 0);
> +	if (!cma_netevent_wq) {
> +		ret = -ENOMEM;
> +		goto err_netevent_wq;
> +	}

Why do we need another WQ? Why does it need to be ordered?

Jason

