* [PATCH v3 for-next 01/33] IB/core: Add RoCE GID cache
From: Somnath Kotur @ 2015-03-25 21:19 UTC (permalink / raw)
  To: roland-DgEjT+Ai2ygdnm+yROfE0A
  Cc: linux-rdma-u79uwXL29TY76Z2rM5mHXA, Matan Barak, Somnath Kotur

From: Matan Barak <matanb-VPRAkNaXOzVWk0Htik3J/w@public.gmane.org>

In order to manage multiple types, vlans and MACs per GID, we
need to store them alongside the GID itself. We store the net
device as well, since GIDs must sometimes be handled according
to the net device they came from. Since populating the GID table
should be identical for every RoCE provider, the GID table should
be handled in ib_core.

Add a GID cache table that supports lockless find alongside add
and delete of GIDs. The lockless nature comes from using a unique
sequence number per table entry and detecting, on the read side,
that this sequence wasn't changed by a concurrent write.
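
The read side is essentially a hand-rolled seqcount. A minimal
sketch of the retry pattern (read_gid_entry is illustrative only,
not a helper added by this patch):

	static int read_gid_entry(struct ib_roce_gid_cache *cache, int ix,
				  union ib_gid *gid)
	{
		struct ib_roce_gid_cache_entry *entry = &cache->data_vec[ix];
		unsigned int seq;

		do {
			seq = ACCESS_ONCE(entry->seq);
			if (seq == (unsigned int)-1)
				return -EAGAIN;	/* a writer owns the entry */
			smp_rmb();	/* read seq before the entry data */
			*gid = entry->gid;
			smp_rmb();	/* read the data before re-checking seq */
		} while (seq != ACCESS_ONCE(entry->seq));

		return 0;
	}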

To use this RoCE GID cache table, providers must implement a
modify_gid callback. The table is managed exclusively by
roce_gid_cache; the provider just needs to write
the data to the hardware.
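
The provider-side contract is a single callback. A hedged skeleton
of what a vendor driver might implement (demo_modify_gid and the
HW-programming steps are placeholders, not part of this patch):

	static int demo_modify_gid(struct ib_device *device, u8 port_num,
				   unsigned int index, const union ib_gid *gid,
				   const struct ib_gid_attr *attr, void **context)
	{
		if (!memcmp(gid, &zgid, sizeof(*gid))) {
			/* Deletion: clear the HW GID table entry at @index and
			 * free any per-entry state; the core clears *context
			 * afterwards.
			 */
			kfree(*context);
			return 0;
		}
		/* Addition/modification: program gid/attr into the HW GID
		 * table at @index; *context may hold driver-private state.
		 */
		return 0;
	}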

Signed-off-by: Matan Barak <matanb-VPRAkNaXOzVWk0Htik3J/w@public.gmane.org>
Signed-off-by: Somnath Kotur <somnath.kotur-laKkSmNT4hbQT0dZR+AlfA@public.gmane.org>
---
 drivers/infiniband/core/Makefile         |   3 +-
 drivers/infiniband/core/core_priv.h      |  24 ++
 drivers/infiniband/core/roce_gid_cache.c | 518 +++++++++++++++++++++++++++++++
 drivers/infiniband/hw/mlx4/main.c        |   2 -
 include/rdma/ib_verbs.h                  |  55 +++-
 5 files changed, 598 insertions(+), 4 deletions(-)
 create mode 100644 drivers/infiniband/core/roce_gid_cache.c

diff --git a/drivers/infiniband/core/Makefile b/drivers/infiniband/core/Makefile
index acf7367..9b63bdf 100644
--- a/drivers/infiniband/core/Makefile
+++ b/drivers/infiniband/core/Makefile
@@ -9,7 +9,8 @@ obj-$(CONFIG_INFINIBAND_USER_ACCESS) +=	ib_uverbs.o ib_ucm.o \
 					$(user_access-y)
 
 ib_core-y :=			packer.o ud_header.o verbs.o sysfs.o \
-				device.o fmr_pool.o cache.o netlink.o
+				device.o fmr_pool.o cache.o netlink.o \
+				roce_gid_cache.o
 ib_core-$(CONFIG_INFINIBAND_USER_MEM) += umem.o
 ib_core-$(CONFIG_INFINIBAND_ON_DEMAND_PAGING) += umem_odp.o umem_rbtree.o
 
diff --git a/drivers/infiniband/core/core_priv.h b/drivers/infiniband/core/core_priv.h
index 87d1936..a502daa 100644
--- a/drivers/infiniband/core/core_priv.h
+++ b/drivers/infiniband/core/core_priv.h
@@ -35,6 +35,7 @@
 
 #include <linux/list.h>
 #include <linux/spinlock.h>
+#include <net/net_namespace.h>
 
 #include <rdma/ib_verbs.h>
 
@@ -51,4 +52,27 @@ void ib_cache_cleanup(void);
 
 int ib_resolve_eth_l2_attrs(struct ib_qp *qp,
 			    struct ib_qp_attr *qp_attr, int *qp_attr_mask);
+
+int roce_gid_cache_get_gid(struct ib_device *ib_dev, u8 port, int index,
+			   union ib_gid *gid, struct ib_gid_attr *attr);
+
+int roce_gid_cache_find_gid(struct ib_device *ib_dev, union ib_gid *gid,
+			    enum ib_gid_type gid_type, struct net *net,
+			    int if_index, u8 *port, u16 *index);
+
+int roce_gid_cache_find_gid_by_port(struct ib_device *ib_dev, union ib_gid *gid,
+				    enum ib_gid_type gid_type, u8 port,
+				    struct net *net, int if_index, u16 *index);
+
+int roce_gid_cache_is_active(struct ib_device *ib_dev, u8 port);
+
+int roce_add_gid(struct ib_device *ib_dev, u8 port,
+		 union ib_gid *gid, struct ib_gid_attr *attr);
+
+int roce_del_gid(struct ib_device *ib_dev, u8 port,
+		 union ib_gid *gid, struct ib_gid_attr *attr);
+
+int roce_del_all_netdev_gids(struct ib_device *ib_dev, u8 port,
+			     struct net_device *ndev);
+
 #endif /* _CORE_PRIV_H */
diff --git a/drivers/infiniband/core/roce_gid_cache.c b/drivers/infiniband/core/roce_gid_cache.c
new file mode 100644
index 0000000..80f364a
--- /dev/null
+++ b/drivers/infiniband/core/roce_gid_cache.c
@@ -0,0 +1,518 @@
+/*
+ * Copyright (c) 2015, Mellanox Technologies inc.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/slab.h>
+#include <linux/netdevice.h>
+#include <linux/rtnetlink.h>
+#include <rdma/ib_cache.h>
+
+#include "core_priv.h"
+
+union ib_gid zgid;
+EXPORT_SYMBOL_GPL(zgid);
+
+static const struct ib_gid_attr zattr;
+
+enum gid_attr_find_mask {
+	GID_ATTR_FIND_MASK_GID_TYPE	= 1UL << 0,
+	GID_ATTR_FIND_MASK_NETDEV	= 1UL << 1,
+};
+
+static inline int start_port(struct ib_device *ib_dev)
+{
+	return (ib_dev->node_type == RDMA_NODE_IB_SWITCH) ? 0 : 1;
+}
+
+struct dev_put_rcu {
+	struct rcu_head		rcu;
+	struct net_device	*ndev;
+};
+
+static void put_ndev(struct rcu_head *rcu)
+{
+	struct dev_put_rcu *put_rcu =
+		container_of(rcu, struct dev_put_rcu, rcu);
+
+	dev_put(put_rcu->ndev);
+	kfree(put_rcu);
+}
+
+static int write_gid(struct ib_device *ib_dev, u8 port,
+		     struct ib_roce_gid_cache *cache, int ix,
+		     const union ib_gid *gid,
+		     const struct ib_gid_attr *attr)
+{
+	unsigned int orig_seq;
+	int ret;
+	struct dev_put_rcu	*put_rcu;
+	struct net_device *old_net_dev;
+
+	orig_seq = cache->data_vec[ix].seq;
+	cache->data_vec[ix].seq = -1;
+	/* Ensure that all readers will see an invalid sequence
+	 * identifier before starting the actual GID update.
+	 */
+	smp_wmb();
+
+	ret = ib_dev->modify_gid(ib_dev, port, ix, gid, attr,
+				 &cache->data_vec[ix].context);
+
+	old_net_dev = cache->data_vec[ix].attr.ndev;
+	if (old_net_dev && old_net_dev != attr->ndev) {
+		put_rcu = kmalloc(sizeof(*put_rcu), GFP_KERNEL);
+		if (put_rcu) {
+			put_rcu->ndev = old_net_dev;
+			call_rcu(&put_rcu->rcu, put_ndev);
+		} else {
+			pr_warn("roce_gid_cache: can't allocate rcu context, using synchronize\n");
+			synchronize_rcu();
+			dev_put(old_net_dev);
+		}
+	}
+	/* if modify_gid failed, just delete the old gid */
+	if (ret || !memcmp(gid, &zgid, sizeof(*gid))) {
+		gid = &zgid;
+		attr = &zattr;
+		cache->data_vec[ix].context = NULL;
+	}
+	memcpy(&cache->data_vec[ix].gid, gid, sizeof(*gid));
+	memcpy(&cache->data_vec[ix].attr, attr, sizeof(*attr));
+	if (cache->data_vec[ix].attr.ndev &&
+	    cache->data_vec[ix].attr.ndev != old_net_dev)
+		dev_hold(cache->data_vec[ix].attr.ndev);
+
+	/* Ensure that all cached gid data updating is finished before
+	 * marking the entry as available.
+	 */
+	smp_wmb();
+
+	if (++orig_seq == (unsigned int)-1)
+		orig_seq = 0;
+	ACCESS_ONCE(cache->data_vec[ix].seq) = orig_seq;
+
+	if (!ret) {
+		struct ib_event event;
+
+		event.device		= ib_dev;
+		event.element.port_num	= port;
+		event.event		= IB_EVENT_GID_CHANGE;
+
+		ib_dispatch_event(&event);
+	}
+	return ret;
+}
+
+static int find_gid(struct ib_roce_gid_cache *cache, union ib_gid *gid,
+		    const struct ib_gid_attr *val, unsigned long mask)
+{
+	int i;
+	unsigned int orig_seq;
+
+	for (i = 0; i < cache->sz; i++) {
+		struct ib_gid_attr *attr = &cache->data_vec[i].attr;
+
+		orig_seq = cache->data_vec[i].seq;
+		if (orig_seq == -1)
+			continue;
+		/* Make sure the sequence number we remember was read
+		 * before the gid cache entry content is read.
+		 */
+		smp_rmb();
+
+		if (mask & GID_ATTR_FIND_MASK_GID_TYPE &&
+		    attr->gid_type != val->gid_type)
+			continue;
+
+		if (memcmp(gid, &cache->data_vec[i].gid, sizeof(*gid)))
+			continue;
+
+		if (mask & GID_ATTR_FIND_MASK_NETDEV &&
+		    attr->ndev != val->ndev)
+			continue;
+
+		/* We have a match, verify that the data we
+		 * compared is valid. Make sure that the
+		 * sequence number we read is the last to be
+		 * read.
+		 */
+		smp_rmb();
+		if (orig_seq == ACCESS_ONCE(cache->data_vec[i].seq))
+			return i;
+		/* The sequence number changed under our feet,
+		 * the GID entry is invalid. Continue to the
+		 * next entry.
+		 */
+	}
+
+	return -1;
+}
+
+int roce_add_gid(struct ib_device *ib_dev, u8 port,
+		 union ib_gid *gid, struct ib_gid_attr *attr)
+{
+	struct ib_roce_gid_cache *cache;
+	int ix;
+	int ret = 0;
+
+	if (!ib_dev->cache.roce_gid_cache)
+		return -ENOSYS;
+
+	cache = ib_dev->cache.roce_gid_cache[port - start_port(ib_dev)];
+
+	if (!cache || !cache->active)
+		return -ENOSYS;
+
+	if (!memcmp(gid, &zgid, sizeof(*gid)))
+		return -EINVAL;
+
+	mutex_lock(&cache->lock);
+
+	ix = find_gid(cache, gid, attr, GID_ATTR_FIND_MASK_GID_TYPE |
+		      GID_ATTR_FIND_MASK_NETDEV);
+	if (ix >= 0)
+		goto out_unlock;
+
+	ix = find_gid(cache, &zgid, NULL, 0);
+	if (ix < 0) {
+		ret = -ENOSPC;
+		goto out_unlock;
+	}
+
+	write_gid(ib_dev, port, cache, ix, gid, attr);
+
+out_unlock:
+	mutex_unlock(&cache->lock);
+	return ret;
+}
+
+int roce_del_gid(struct ib_device *ib_dev, u8 port,
+		 union ib_gid *gid, struct ib_gid_attr *attr)
+{
+	struct ib_roce_gid_cache *cache;
+	int ix;
+
+	if (!ib_dev->cache.roce_gid_cache)
+		return 0;
+
+	cache  = ib_dev->cache.roce_gid_cache[port - start_port(ib_dev)];
+
+	if (!cache || !cache->active)
+		return -ENOSYS;
+
+	mutex_lock(&cache->lock);
+
+	ix = find_gid(cache, gid, attr,
+		      GID_ATTR_FIND_MASK_GID_TYPE |
+		      GID_ATTR_FIND_MASK_NETDEV);
+	if (ix < 0)
+		goto out_unlock;
+
+	write_gid(ib_dev, port, cache, ix, &zgid, &zattr);
+
+out_unlock:
+	mutex_unlock(&cache->lock);
+	return 0;
+}
+
+int roce_del_all_netdev_gids(struct ib_device *ib_dev, u8 port,
+			     struct net_device *ndev)
+{
+	struct ib_roce_gid_cache *cache;
+	int ix;
+
+	if (!ib_dev->cache.roce_gid_cache)
+		return 0;
+
+	cache  = ib_dev->cache.roce_gid_cache[port - start_port(ib_dev)];
+
+	if (!cache || !cache->active)
+		return -ENOSYS;
+
+	mutex_lock(&cache->lock);
+
+	for (ix = 0; ix < cache->sz; ix++)
+		if (cache->data_vec[ix].attr.ndev == ndev)
+			write_gid(ib_dev, port, cache, ix, &zgid, &zattr);
+
+	mutex_unlock(&cache->lock);
+	return 0;
+}
+
+int roce_gid_cache_get_gid(struct ib_device *ib_dev, u8 port, int index,
+			   union ib_gid *gid, struct ib_gid_attr *attr)
+{
+	struct ib_roce_gid_cache *cache;
+	union ib_gid local_gid;
+	struct ib_gid_attr local_attr;
+	unsigned int orig_seq;
+
+	if (!ib_dev->cache.roce_gid_cache)
+		return -EINVAL;
+
+	cache = ib_dev->cache.roce_gid_cache[port - start_port(ib_dev)];
+
+	if (!cache || !cache->active)
+		return -ENOSYS;
+
+	if (index < 0 || index >= cache->sz)
+		return -EINVAL;
+
+	orig_seq = ACCESS_ONCE(cache->data_vec[index].seq);
+	/* Make sure we read the sequence number before copying the
+	 * gid to local storage. */
+	smp_rmb();
+
+	memcpy(&local_gid, &cache->data_vec[index].gid, sizeof(local_gid));
+	memcpy(&local_attr, &cache->data_vec[index].attr, sizeof(local_attr));
+	/* Ensure the local copy completed reading before verifying
+	 * the new sequence number. */
+	smp_rmb();
+
+	if (orig_seq == -1 ||
+	    orig_seq != ACCESS_ONCE(cache->data_vec[index].seq))
+		return -EAGAIN;
+
+	memcpy(gid, &local_gid, sizeof(*gid));
+	if (attr)
+		memcpy(attr, &local_attr, sizeof(*attr));
+	return 0;
+}
+
+static int _roce_gid_cache_find_gid(struct ib_device *ib_dev, union ib_gid *gid,
+				    const struct ib_gid_attr *val,
+				    unsigned long mask,
+				    u8 *port, u16 *index)
+{
+	struct ib_roce_gid_cache *cache;
+	u8 p;
+	int local_index;
+
+	if (!ib_dev->cache.roce_gid_cache)
+		return -ENOENT;
+
+	for (p = 0; p < ib_dev->phys_port_cnt; p++) {
+		if (rdma_port_get_link_layer(ib_dev, p + start_port(ib_dev)) !=
+		    IB_LINK_LAYER_ETHERNET)
+			continue;
+		cache = ib_dev->cache.roce_gid_cache[p];
+		if (!cache || !cache->active)
+			continue;
+		local_index = find_gid(cache, gid, val, mask);
+		if (local_index >= 0) {
+			if (index)
+				*index = local_index;
+			if (port)
+				*port = p + start_port(ib_dev);
+			return 0;
+		}
+	}
+
+	return -ENOENT;
+}
+
+static int get_netdev_from_ifindex(struct net *net, int if_index,
+				   struct ib_gid_attr *gid_attr_val)
+{
+	if (if_index && net) {
+		rcu_read_lock();
+		gid_attr_val->ndev = dev_get_by_index_rcu(net, if_index);
+		rcu_read_unlock();
+		if (gid_attr_val->ndev)
+			return GID_ATTR_FIND_MASK_NETDEV;
+	}
+	return 0;
+}
+
+int roce_gid_cache_find_gid(struct ib_device *ib_dev, union ib_gid *gid,
+			    enum ib_gid_type gid_type, struct net *net,
+			    int if_index, u8 *port, u16 *index)
+{
+	unsigned long mask = GID_ATTR_FIND_MASK_GID |
+			     GID_ATTR_FIND_MASK_GID_TYPE;
+	struct ib_gid_attr gid_attr_val = {.gid_type = gid_type};
+
+	mask |= get_netdev_from_ifindex(net, if_index, &gid_attr_val);
+
+	return _roce_gid_cache_find_gid(ib_dev, gid, &gid_attr_val,
+					mask, port, index);
+}
+
+int roce_gid_cache_find_gid_by_port(struct ib_device *ib_dev, union ib_gid *gid,
+				    enum ib_gid_type gid_type, u8 port,
+				    struct net *net, int if_index, u16 *index)
+{
+	int local_index;
+	struct ib_roce_gid_cache *cache;
+	unsigned long mask = GID_ATTR_FIND_MASK_GID_TYPE;
+	struct ib_gid_attr val = {.gid_type = gid_type};
+
+	if (!ib_dev->cache.roce_gid_cache || port < start_port(ib_dev) ||
+	    port >= (start_port(ib_dev) + ib_dev->phys_port_cnt))
+		return -ENOENT;
+
+	cache = ib_dev->cache.roce_gid_cache[port - start_port(ib_dev)];
+	if (!cache || !cache->active)
+		return -ENOENT;
+
+	mask |= get_netdev_from_ifindex(net, if_index, &val);
+
+	local_index = find_gid(cache, gid, &val, mask);
+	if (local_index >= 0) {
+		if (index)
+			*index = local_index;
+		return 0;
+	}
+
+	return -ENOENT;
+}
+
+static struct ib_roce_gid_cache *alloc_roce_gid_cache(int sz)
+{
+	struct ib_roce_gid_cache *cache =
+		kzalloc(sizeof(struct ib_roce_gid_cache), GFP_KERNEL);
+	if (!cache)
+		return NULL;
+
+	cache->data_vec = kcalloc(sz, sizeof(*cache->data_vec), GFP_KERNEL);
+	if (!cache->data_vec)
+		goto err_free_cache;
+
+	mutex_init(&cache->lock);
+
+	cache->sz = sz;
+
+	return cache;
+
+err_free_cache:
+	kfree(cache);
+	return NULL;
+}
+
+static void free_roce_gid_cache(struct ib_device *ib_dev, u8 port)
+{
+	int i;
+	struct ib_roce_gid_cache *cache =
+		ib_dev->cache.roce_gid_cache[port - 1];
+
+	if (!cache)
+		return;
+
+	for (i = 0; i < cache->sz; ++i) {
+		if (memcmp(&cache->data_vec[i].gid, &zgid,
+			   sizeof(cache->data_vec[i].gid)))
+		    write_gid(ib_dev, port, cache, i, &zgid, &zattr);
+	}
+	kfree(cache->data_vec);
+	kfree(cache);
+}
+
+static void set_roce_gid_cache_active(struct ib_roce_gid_cache *cache,
+				      int active)
+{
+	if (!cache)
+		return;
+
+	cache->active = active;
+}
+
+static int roce_gid_cache_setup_one(struct ib_device *ib_dev)
+{
+	u8 port;
+	int err = 0;
+
+	if (!ib_dev->modify_gid)
+		return -ENOSYS;
+
+	ib_dev->cache.roce_gid_cache =
+		kcalloc(ib_dev->phys_port_cnt,
+			sizeof(*ib_dev->cache.roce_gid_cache), GFP_KERNEL);
+
+	if (!ib_dev->cache.roce_gid_cache) {
+		pr_warn("failed to allocate roce addr cache for %s\n",
+			ib_dev->name);
+		return -ENOMEM;
+	}
+
+	for (port = 0; port < ib_dev->phys_port_cnt; port++) {
+		if (rdma_port_get_link_layer(ib_dev, port + start_port(ib_dev))
+		    != IB_LINK_LAYER_ETHERNET)
+			continue;
+		ib_dev->cache.roce_gid_cache[port] =
+			alloc_roce_gid_cache(ib_dev->gid_tbl_len[port]);
+		if (!ib_dev->cache.roce_gid_cache[port]) {
+			err = -ENOMEM;
+			goto rollback_cache_setup;
+		}
+	}
+	return 0;
+
+rollback_cache_setup:
+	for (port = 1; port <= ib_dev->phys_port_cnt; port++)
+		free_roce_gid_cache(ib_dev, port);
+
+	kfree(ib_dev->cache.roce_gid_cache);
+	ib_dev->cache.roce_gid_cache = NULL;
+	return err;
+}
+
+static void roce_gid_cache_cleanup_one(struct ib_device *ib_dev)
+{
+	u8 port;
+
+	if (!ib_dev->cache.roce_gid_cache)
+		return;
+
+	for (port = 1; port <= ib_dev->phys_port_cnt; port++)
+		free_roce_gid_cache(ib_dev, port);
+
+	kfree(ib_dev->cache.roce_gid_cache);
+	ib_dev->cache.roce_gid_cache = NULL;
+}
+
+static void roce_gid_cache_set_active_state(struct ib_device *ib_dev,
+					    int active)
+{
+	u8 port;
+
+	if (!ib_dev->cache.roce_gid_cache)
+		return;
+
+	for (port = 0; port < ib_dev->phys_port_cnt; port++)
+		set_roce_gid_cache_active(ib_dev->cache.roce_gid_cache[port],
+					  active);
+}
+
+int roce_gid_cache_is_active(struct ib_device *ib_dev, u8 port)
+{
+	return ib_dev->cache.roce_gid_cache &&
+		ib_dev->cache.roce_gid_cache[port - start_port(ib_dev)]->active;
+}
diff --git a/drivers/infiniband/hw/mlx4/main.c b/drivers/infiniband/hw/mlx4/main.c
index 5261665..6fa5e49 100644
--- a/drivers/infiniband/hw/mlx4/main.c
+++ b/drivers/infiniband/hw/mlx4/main.c
@@ -93,8 +93,6 @@ static void init_query_mad(struct ib_smp *mad)
 	mad->method	   = IB_MGMT_METHOD_GET;
 }
 
-static union ib_gid zgid;
-
 static int check_flow_steering_support(struct mlx4_dev *dev)
 {
 	int eth_num_ports = 0;
diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h
index 65994a1..1866595 100644
--- a/include/rdma/ib_verbs.h
+++ b/include/rdma/ib_verbs.h
@@ -64,6 +64,36 @@ union ib_gid {
 	} global;
 };
 
+extern union ib_gid zgid;
+
+enum ib_gid_type {
+	/* If link layer is Ethernet, this is RoCE V1 */
+	IB_GID_TYPE_IB        = 0,
+	IB_GID_TYPE_ROCE_V2   = 1,
+	IB_GID_TYPE_SIZE
+};
+
+struct ib_gid_attr {
+	enum ib_gid_type	gid_type;
+	struct net_device	*ndev;
+};
+
+struct ib_roce_gid_cache_entry {
+	/* seq number of -1 indicates entry being changed. */
+	unsigned int        seq;
+	union ib_gid        gid;
+	struct ib_gid_attr  attr;
+	void		   *context;
+};
+
+struct ib_roce_gid_cache {
+	int		     active;
+	int                  sz;
+	/* locking against multiple writes in data_vec */
+	struct mutex         lock;
+	struct ib_roce_gid_cache_entry *data_vec;
+};
+
 enum rdma_node_type {
 	/* IB values map to NodeInfo:NodeType. */
 	RDMA_NODE_IB_CA 	= 1,
@@ -265,7 +295,9 @@ enum ib_port_cap_flags {
 	IB_PORT_BOOT_MGMT_SUP			= 1 << 23,
 	IB_PORT_LINK_LATENCY_SUP		= 1 << 24,
 	IB_PORT_CLIENT_REG_SUP			= 1 << 25,
-	IB_PORT_IP_BASED_GIDS			= 1 << 26
+	IB_PORT_IP_BASED_GIDS			= 1 << 26,
+	IB_PORT_ROCE				= 1 << 27,
+	IB_PORT_ROCE_V2				= 1 << 28,
 };
 
 enum ib_port_width {
@@ -1431,6 +1463,7 @@ struct ib_cache {
 	struct ib_pkey_cache  **pkey_cache;
 	struct ib_gid_cache   **gid_cache;
 	u8                     *lmc_cache;
+	struct ib_roce_gid_cache **roce_gid_cache;
 };
 
 struct ib_dma_mapping_ops {
@@ -1506,6 +1539,26 @@ struct ib_device {
 	int		           (*query_gid)(struct ib_device *device,
 						u8 port_num, int index,
 						union ib_gid *gid);
+	/* When calling modify_gid, the HW vendor's driver should
+	 * modify the gid of device @device at gid index @index of
+	 * port @port to be @gid. Meta-info of that gid (for example,
+	 * the network device related to this gid) is available
+	 * at @attr. @context allows the HW vendor driver to store extra
+	 * information together with a GID entry. The HW vendor may allocate
+	 * memory to contain this information and store it in @context when a
+	 * new GID entry is written. Upon the deletion of a GID entry,
+	 * the HW vendor must free any allocated memory. The caller will clear
+	 * @context afterwards. GID deletion is done by passing the zero gid.
+	 * Params are consistent until the next call of modify_gid.
+	 * The function should return 0 on success or an error otherwise.
+	 * The function may be called concurrently for different ports.
+	 */
+	int		           (*modify_gid)(struct ib_device *device,
+						 u8 port_num,
+						 unsigned int index,
+						 const union ib_gid *gid,
+						 const struct ib_gid_attr *attr,
+						 void **context);
 	int		           (*query_pkey)(struct ib_device *device,
 						 u8 port_num, u16 index, u16 *pkey);
 	int		           (*modify_device)(struct ib_device *device,
-- 
2.1.0


* [PATCH v3 for-next 02/33] IB/core: Add kref to IB devices
From: Somnath Kotur @ 2015-03-25 21:19 UTC (permalink / raw)
  To: roland-DgEjT+Ai2ygdnm+yROfE0A
  Cc: linux-rdma-u79uwXL29TY76Z2rM5mHXA, Matan Barak, Somnath Kotur

From: Matan Barak <matanb-VPRAkNaXOzVWk0Htik3J/w@public.gmane.org>

Previously, we used the device_mutex lock in order to protect
the device list. That meant that in order to guarantee a
device wasn't freed while we used it, we had to lock all
devices.

Add a kref per IB device. Before an IB device
is unregistered, we wait until it is no longer held.
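
A sketch of the intended usage, assuming the caller found ib_dev
under device_mutex (use_device() is a placeholder):

	mutex_lock(&device_mutex);
	ib_device_hold(ib_dev);	/* ib_dev can't be freed from here on */
	mutex_unlock(&device_mutex);

	use_device(ib_dev);

	ib_device_put(ib_dev);	/* the last put completes &ib_dev->free */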

Signed-off-by: Matan Barak <matanb-VPRAkNaXOzVWk0Htik3J/w@public.gmane.org>
Signed-off-by: Somnath Kotur <somnath.kotur-laKkSmNT4hbQT0dZR+AlfA@public.gmane.org>
---
 drivers/infiniband/core/device.c | 41 ++++++++++++++++++++++++++++++++++++++++
 include/rdma/ib_verbs.h          |  6 ++++++
 2 files changed, 47 insertions(+)

diff --git a/drivers/infiniband/core/device.c b/drivers/infiniband/core/device.c
index 18c1ece..8616a95 100644
--- a/drivers/infiniband/core/device.c
+++ b/drivers/infiniband/core/device.c
@@ -261,6 +261,39 @@ out:
 	return ret;
 }
 
+static void ib_device_complete_cb(struct kref *kref)
+{
+	struct ib_device *device = container_of(kref, struct ib_device,
+						refcount);
+
+	if (device->reg_state >= IB_DEV_UNREGISTERING)
+		complete(&device->free);
+}
+
+/**
+ * ib_device_hold - increase the reference count of device
+ * @device: ib device to prevent from being free'd
+ *
+ * Prevent the device from being free'd.
+ */
+void ib_device_hold(struct ib_device *device)
+{
+	kref_get(&device->refcount);
+}
+EXPORT_SYMBOL(ib_device_hold);
+
+/**
+ * ib_device_put - decrease the reference count of device
+ * @device: ib device whose reference count to decrease
+ *
+ * Puts the ib_device and allows it to be free'd.
+ */
+int ib_device_put(struct ib_device *device)
+{
+	return kref_put(&device->refcount, ib_device_complete_cb);
+}
+EXPORT_SYMBOL(ib_device_put);
+
 /**
  * ib_register_device - Register an IB device with IB core
  * @device:Device to register
@@ -312,6 +345,9 @@ int ib_register_device(struct ib_device *device,
 
 	list_add_tail(&device->core_list, &device_list);
 
+	kref_init(&device->refcount);
+	init_completion(&device->free);
+
 	device->reg_state = IB_DEV_REGISTERED;
 
 	{
@@ -342,6 +378,8 @@ void ib_unregister_device(struct ib_device *device)
 
 	mutex_lock(&device_mutex);
 
+	device->reg_state = IB_DEV_UNREGISTERING;
+
 	list_for_each_entry_reverse(client, &client_list, list)
 		if (client->remove)
 			client->remove(device);
@@ -355,6 +393,9 @@ void ib_unregister_device(struct ib_device *device)
 
 	ib_device_unregister_sysfs(device);
 
+	ib_device_put(device);
+	wait_for_completion(&device->free);
+
 	spin_lock_irqsave(&device->client_data_lock, flags);
 	list_for_each_entry_safe(context, tmp, &device->client_data_list, list)
 		kfree(context);
diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h
index 1866595..a7593b0 100644
--- a/include/rdma/ib_verbs.h
+++ b/include/rdma/ib_verbs.h
@@ -1716,6 +1716,7 @@ struct ib_device {
 	enum {
 		IB_DEV_UNINITIALIZED,
 		IB_DEV_REGISTERED,
+		IB_DEV_UNREGISTERING,
 		IB_DEV_UNREGISTERED
 	}                            reg_state;
 
@@ -1728,6 +1729,8 @@ struct ib_device {
 	u32			     local_dma_lkey;
 	u8                           node_type;
 	u8                           phys_port_cnt;
+	struct kref		     refcount;
+	struct completion	     free;
 };
 
 struct ib_client {
@@ -1741,6 +1744,9 @@ struct ib_client {
 struct ib_device *ib_alloc_device(size_t size);
 void ib_dealloc_device(struct ib_device *device);
 
+void ib_device_hold(struct ib_device *device);
+int ib_device_put(struct ib_device *device);
+
 int ib_register_device(struct ib_device *device,
 		       int (*port_callback)(struct ib_device *,
 					    u8, struct kobject *));
-- 
2.1.0


* [PATCH v3 for-next 03/33] IB/core: Add RoCE GID population
From: Somnath Kotur @ 2015-03-25 21:19 UTC (permalink / raw)
  To: roland-DgEjT+Ai2ygdnm+yROfE0A
  Cc: linux-rdma-u79uwXL29TY76Z2rM5mHXA, Matan Barak, Somnath Kotur

From: Matan Barak <matanb-VPRAkNaXOzVWk0Htik3J/w@public.gmane.org>

In order to populate the GID table, we need to listen for
events:
(a) IB device has been added or removed - used in order
    to allocate/deallocate the cache and populate
    the GID table internally.
(b) inet events - add new GIDs (according to the IP addresses)
    to the table.
(c) netdev up/down/change_addr - if a netdev is stacked on top
    of our RoCE device, we need to add/delete its IPs.

When an event is received, multiple entries (each with a
different GID type) are added.
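
All of these event paths funnel into one enumeration helper; for
instance, the netdev event work boils down to calls of this shape
(the names are from this patch):

	/* add GIDs for all IPs of ndev on every (ib_dev, port) whose
	 * lower Ethernet device matches ndev
	 */
	ib_enum_roce_ports_of_netdev(is_eth_port_of_netdev, ndev,
				     add_netdev_ips, ndev);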

Signed-off-by: Matan Barak <matanb-VPRAkNaXOzVWk0Htik3J/w@public.gmane.org>
Signed-off-by: Somnath Kotur <somnath.kotur-laKkSmNT4hbQT0dZR+AlfA@public.gmane.org>
---
 drivers/infiniband/core/Makefile         |   2 +-
 drivers/infiniband/core/core_priv.h      |  26 ++
 drivers/infiniband/core/device.c         |  80 +++++
 drivers/infiniband/core/roce_gid_cache.c |  68 ++++
 drivers/infiniband/core/roce_gid_mgmt.c  | 516 +++++++++++++++++++++++++++++++
 include/rdma/ib_addr.h                   |   2 +-
 include/rdma/ib_verbs.h                  |   9 +
 7 files changed, 701 insertions(+), 2 deletions(-)
 create mode 100644 drivers/infiniband/core/roce_gid_mgmt.c

diff --git a/drivers/infiniband/core/Makefile b/drivers/infiniband/core/Makefile
index 9b63bdf..2c94963 100644
--- a/drivers/infiniband/core/Makefile
+++ b/drivers/infiniband/core/Makefile
@@ -10,7 +10,7 @@ obj-$(CONFIG_INFINIBAND_USER_ACCESS) +=	ib_uverbs.o ib_ucm.o \
 
 ib_core-y :=			packer.o ud_header.o verbs.o sysfs.o \
 				device.o fmr_pool.o cache.o netlink.o \
-				roce_gid_cache.o
+				roce_gid_cache.o roce_gid_mgmt.o
 ib_core-$(CONFIG_INFINIBAND_USER_MEM) += umem.o
 ib_core-$(CONFIG_INFINIBAND_ON_DEMAND_PAGING) += umem_odp.o umem_rbtree.o
 
diff --git a/drivers/infiniband/core/core_priv.h b/drivers/infiniband/core/core_priv.h
index a502daa..12797d9 100644
--- a/drivers/infiniband/core/core_priv.h
+++ b/drivers/infiniband/core/core_priv.h
@@ -39,6 +39,8 @@
 
 #include <rdma/ib_verbs.h>
 
+extern struct workqueue_struct *roce_gid_mgmt_wq;
+
 int  ib_device_register_sysfs(struct ib_device *device,
 			      int (*port_callback)(struct ib_device *,
 						   u8, struct kobject *));
@@ -53,6 +55,22 @@ void ib_cache_cleanup(void);
 int ib_resolve_eth_l2_attrs(struct ib_qp *qp,
 			    struct ib_qp_attr *qp_attr, int *qp_attr_mask);
 
+typedef void (*roce_netdev_callback)(struct ib_device *device, u8 port,
+	      struct net_device *idev, void *cookie);
+
+typedef int (*roce_netdev_filter)(struct ib_device *device, u8 port,
+	     struct net_device *idev, void *cookie);
+
+void ib_dev_roce_ports_of_netdev(struct ib_device *ib_dev,
+				 roce_netdev_filter filter,
+				 void *filter_cookie,
+				 roce_netdev_callback cb,
+				 void *cookie);
+void ib_enum_roce_ports_of_netdev(roce_netdev_filter filter,
+				  void *filter_cookie,
+				  roce_netdev_callback cb,
+				  void *cookie);
+
 int roce_gid_cache_get_gid(struct ib_device *ib_dev, u8 port, int index,
 			   union ib_gid *gid, struct ib_gid_attr *attr);
 
@@ -66,6 +84,9 @@ int roce_gid_cache_find_gid_by_port(struct ib_device *ib_dev, union ib_gid *gid,
 
 int roce_gid_cache_is_active(struct ib_device *ib_dev, u8 port);
 
+int roce_gid_cache_setup(void);
+void roce_gid_cache_cleanup(void);
+
 int roce_add_gid(struct ib_device *ib_dev, u8 port,
 		 union ib_gid *gid, struct ib_gid_attr *attr);
 
@@ -75,4 +96,9 @@ int roce_del_gid(struct ib_device *ib_dev, u8 port,
 int roce_del_all_netdev_gids(struct ib_device *ib_dev, u8 port,
 			     struct net_device *ndev);
 
+int roce_gid_mgmt_init(void);
+void roce_gid_mgmt_cleanup(void);
+
+int roce_rescan_device(struct ib_device *ib_dev);
+
 #endif /* _CORE_PRIV_H */
diff --git a/drivers/infiniband/core/device.c b/drivers/infiniband/core/device.c
index 8616a95..5ce57bf 100644
--- a/drivers/infiniband/core/device.c
+++ b/drivers/infiniband/core/device.c
@@ -39,6 +39,7 @@
 #include <linux/init.h>
 #include <linux/mutex.h>
 #include <rdma/rdma_netlink.h>
+#include <rdma/ib_addr.h>
 
 #include "core_priv.h"
 
@@ -640,6 +641,82 @@ int ib_query_gid(struct ib_device *device,
 EXPORT_SYMBOL(ib_query_gid);
 
 /**
+ * ib_dev_roce_ports_of_netdev - enumerate RoCE ports of an ibdev
+ *				 with respect to a netdev
+ * @ib_dev: IB device we want to query
+ * @filter: Should we call the callback?
+ * @filter_cookie: Cookie passed to filter
+ * @cb: Callback to call for each matching RoCE port
+ * @cookie: Cookie passed back to the callback
+ *
+ * Enumerates all of the physical RoCE ports of ib_dev
+ * which are relaying Ethernet packets to a specific
+ * (possibly virtual) netdevice according to filter.
+ */
+void ib_dev_roce_ports_of_netdev(struct ib_device *ib_dev,
+				 roce_netdev_filter filter,
+				 void *filter_cookie,
+				 roce_netdev_callback cb,
+				 void *cookie)
+{
+	u8 port;
+
+	if (ib_dev->modify_gid)
+		for (port = start_port(ib_dev); port <= end_port(ib_dev);
+		     port++)
+			if (ib_dev->get_link_layer(ib_dev, port) ==
+			    IB_LINK_LAYER_ETHERNET) {
+				struct net_device *idev = NULL;
+
+				rcu_read_lock();
+				if (ib_dev->get_netdev)
+					idev = ib_dev->get_netdev(ib_dev, port);
+
+				if (idev &&
+				    idev->reg_state >= NETREG_UNREGISTERED)
+					idev = NULL;
+
+				if (idev)
+					dev_hold(idev);
+
+				rcu_read_unlock();
+
+				if (filter(ib_dev, port, idev, filter_cookie))
+					cb(ib_dev, port, idev, cookie);
+
+				if (idev)
+					dev_put(idev);
+			}
+}
+
+/**
+ * ib_enum_roce_ports_of_netdev - enumerate RoCE ports of a netdev
+ * @filter: Should we call the callback?
+ * @filter_cookie: Cookie passed to filter
+ * @cb: Callback to call for each matching RoCE port
+ * @cookie: Cookie passed back to the callback
+ *
+ * Enumerates all of the physical RoCE ports which are relaying
+ * Ethernet packets to a specific (possibly virtual) netdevice
+ * according to filter.
+ */
+void ib_enum_roce_ports_of_netdev(roce_netdev_filter filter,
+				  void *filter_cookie,
+				  roce_netdev_callback cb,
+				  void *cookie)
+{
+	struct ib_device *dev;
+
+	mutex_lock(&device_mutex);
+
+	list_for_each_entry(dev, &device_list, core_list)
+		ib_dev_roce_ports_of_netdev(dev, filter, filter_cookie, cb,
+					    cookie);
+
+	mutex_unlock(&device_mutex);
+}
+
+/**
  * ib_query_pkey - Get P_Key table entry
  * @device:Device to query
  * @port_num:Port number to query
@@ -794,6 +871,8 @@ static int __init ib_core_init(void)
 		goto err_sysfs;
 	}
 
+	roce_gid_cache_setup();
+
 	ret = ib_cache_setup();
 	if (ret) {
 		printk(KERN_WARNING "Couldn't set up InfiniBand P_Key/GID cache\n");
@@ -815,6 +894,7 @@ err:
 
 static void __exit ib_core_cleanup(void)
 {
+	roce_gid_cache_cleanup();
 	ib_cache_cleanup();
 	ibnl_cleanup();
 	ib_sysfs_cleanup();
diff --git a/drivers/infiniband/core/roce_gid_cache.c b/drivers/infiniband/core/roce_gid_cache.c
index 80f364a..1d0f841 100644
--- a/drivers/infiniband/core/roce_gid_cache.c
+++ b/drivers/infiniband/core/roce_gid_cache.c
@@ -516,3 +516,71 @@ int roce_gid_cache_is_active(struct ib_device *ib_dev, u8 port)
 	return ib_dev->cache.roce_gid_cache &&
 		ib_dev->cache.roce_gid_cache[port - start_port(ib_dev)]->active;
 }
+
+static void roce_gid_cache_client_setup_one(struct ib_device *ib_dev)
+{
+	if (!roce_gid_cache_setup_one(ib_dev)) {
+		roce_gid_cache_set_active_state(ib_dev, 1);
+		if (roce_rescan_device(ib_dev)) {
+			roce_gid_cache_set_active_state(ib_dev, 0);
+			roce_gid_cache_cleanup_one(ib_dev);
+		}
+	}
+}
+
+static void roce_gid_cache_client_cleanup_work_handler(struct work_struct *work)
+{
+	struct ib_cache *ib_cache = container_of(work, struct ib_cache,
+						 roce_gid_cache_cleanup_work);
+	struct ib_device *ib_dev = container_of(ib_cache, struct ib_device,
+						cache);
+
+	/* Make sure no gid update task is still referencing this device */
+	flush_workqueue(roce_gid_mgmt_wq);
+
+	/* No need to flush the system wq, even though we use it in
+	 * roce_rescan_device, because we are guaranteed to run this
+	 * on the system_wq after roce_rescan_device.
+	 */
+
+	roce_gid_cache_cleanup_one(ib_dev);
+	ib_device_put(ib_dev);
+}
+
+static void roce_gid_cache_client_cleanup_one_work(struct ib_device *ib_dev)
+{
+	ib_device_hold(ib_dev);
+	INIT_WORK(&ib_dev->cache.roce_gid_cache_cleanup_work,
+		  roce_gid_cache_client_cleanup_work_handler);
+	schedule_work(&ib_dev->cache.roce_gid_cache_cleanup_work);
+}
+
+static void roce_gid_cache_client_cleanup_one(struct ib_device *ib_dev)
+{
+	roce_gid_cache_set_active_state(ib_dev, 0);
+	roce_gid_cache_client_cleanup_one_work(ib_dev);
+}
+
+static struct ib_client cache_client = {
+	.name   = "roce_gid_cache",
+	.add    = roce_gid_cache_client_setup_one,
+	.remove = roce_gid_cache_client_cleanup_one
+};
+
+int __init roce_gid_cache_setup(void)
+{
+	roce_gid_mgmt_init();
+
+	return ib_register_client(&cache_client);
+}
+
+void __exit roce_gid_cache_cleanup(void)
+{
+	ib_unregister_client(&cache_client);
+
+	roce_gid_mgmt_cleanup();
+
+	flush_workqueue(system_wq);
+
+	rcu_barrier();
+}
diff --git a/drivers/infiniband/core/roce_gid_mgmt.c b/drivers/infiniband/core/roce_gid_mgmt.c
new file mode 100644
index 0000000..d51138c
--- /dev/null
+++ b/drivers/infiniband/core/roce_gid_mgmt.c
@@ -0,0 +1,516 @@
+/*
+ * Copyright (c) 2015, Mellanox Technologies inc.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "core_priv.h"
+
+#include <linux/in.h>
+#include <linux/in6.h>
+
+/* For in6_dev_get/in6_dev_put */
+#include <net/addrconf.h>
+
+#include <rdma/ib_cache.h>
+#include <rdma/ib_addr.h>
+
+struct workqueue_struct *roce_gid_mgmt_wq;
+
+enum gid_op_type {
+	GID_DEL = 0,
+	GID_ADD
+};
+
+struct  update_gid_event_work {
+	struct work_struct work;
+	union ib_gid       gid;
+	struct ib_gid_attr gid_attr;
+	enum gid_op_type gid_op;
+};
+
+#define ROCE_NETDEV_CALLBACK_SZ		2
+struct netdev_event_work_cmd {
+	roce_netdev_callback	cb;
+	roce_netdev_filter	filter;
+};
+
+struct netdev_event_work {
+	struct work_struct		work;
+	struct netdev_event_work_cmd	cmds[ROCE_NETDEV_CALLBACK_SZ];
+	struct net_device		*ndev;
+};
+
+struct roce_rescan_work {
+	struct work_struct	work;
+	struct ib_device	*ib_dev;
+};
+
+static const struct {
+	int flag_mask;
+	enum ib_gid_type gid_type;
+} PORT_CAP_TO_GID_TYPE[] = {
+	{IB_PORT_ROCE_V2,   IB_GID_TYPE_ROCE_V2},
+	{IB_PORT_ROCE,      IB_GID_TYPE_IB},
+};
+
+#define CAP_TO_GID_TABLE_SIZE	ARRAY_SIZE(PORT_CAP_TO_GID_TYPE)
+
+static void update_gid(enum gid_op_type gid_op, struct ib_device *ib_dev,
+		       u8 port, union ib_gid *gid,
+		       struct ib_gid_attr *gid_attr)
+{
+	struct ib_port_attr pattr;
+	int i;
+	int err;
+
+	err = ib_query_port(ib_dev, port, &pattr);
+	if (err) {
+		pr_warn("update_gid: ib_query_port() failed for %s, %d\n",
+			ib_dev->name, err);
+	}
+
+	for (i = 0; i < CAP_TO_GID_TABLE_SIZE; i++) {
+		if (pattr.port_cap_flags & PORT_CAP_TO_GID_TYPE[i].flag_mask) {
+			gid_attr->gid_type =
+				PORT_CAP_TO_GID_TYPE[i].gid_type;
+			switch (gid_op) {
+			case GID_ADD:
+				roce_add_gid(ib_dev, port,
+					     gid, gid_attr);
+				break;
+			case GID_DEL:
+				roce_del_gid(ib_dev, port,
+					     gid, gid_attr);
+				break;
+			}
+		}
+	}
+}
+
+static int is_eth_port_of_netdev(struct ib_device *ib_dev, u8 port,
+				 struct net_device *idev, void *cookie)
+{
+	struct net_device *rdev;
+	struct net_device *mdev;
+	struct net_device *ndev = (struct net_device *)cookie;
+
+	if (!idev)
+		return 0;
+
+	rcu_read_lock();
+	mdev = netdev_master_upper_dev_get_rcu(idev);
+	rdev = rdma_vlan_dev_real_dev(ndev);
+	rcu_read_unlock();
+
+	return (rdev ? rdev : ndev) == (mdev ? mdev : idev);
+}
+
+static int pass_all_filter(struct ib_device *ib_dev, u8 port,
+			   struct net_device *idev, void *cookie)
+{
+	return 1;
+}
+
+static void netdevice_event_work_handler(struct work_struct *_work)
+{
+	struct netdev_event_work *work =
+		container_of(_work, struct netdev_event_work, work);
+	unsigned int i;
+
+	for (i = 0; i < ARRAY_SIZE(work->cmds) && work->cmds[i].cb; i++)
+		ib_enum_roce_ports_of_netdev(work->cmds[i].filter, work->ndev,
+					     work->cmds[i].cb, work->ndev);
+
+	dev_put(work->ndev);
+	kfree(work);
+}
+
+static void update_gid_ip(enum gid_op_type gid_op,
+			  struct ib_device *ib_dev,
+			  u8 port, struct net_device *ndev,
+			  const struct sockaddr *addr)
+{
+	union ib_gid gid;
+	struct ib_gid_attr gid_attr;
+
+	rdma_ip2gid(addr, &gid);
+	memset(&gid_attr, 0, sizeof(gid_attr));
+	gid_attr.ndev = ndev;
+
+	update_gid(gid_op, ib_dev, port, &gid, &gid_attr);
+}
+
+static void enum_netdev_ipv4_ips(struct ib_device *ib_dev,
+				 u8 port, struct net_device *ndev)
+{
+	struct in_device *in_dev;
+
+	if (ndev->reg_state >= NETREG_UNREGISTERING)
+		return;
+
+	in_dev = in_dev_get(ndev);
+	if (!in_dev)
+		return;
+
+	for_ifa(in_dev) {
+		struct sockaddr_in ip;
+
+		ip.sin_family = AF_INET;
+		ip.sin_addr.s_addr = ifa->ifa_address;
+		update_gid_ip(GID_ADD, ib_dev, port, ndev,
+			      (struct sockaddr *)&ip);
+	}
+	endfor_ifa(in_dev);
+
+	in_dev_put(in_dev);
+}
+
+#if IS_ENABLED(CONFIG_IPV6)
+static void enum_netdev_ipv6_ips(struct ib_device *ib_dev,
+				 u8 port, struct net_device *ndev)
+{
+	struct inet6_ifaddr *ifp;
+	struct inet6_dev *in6_dev;
+	struct sin6_list {
+		struct list_head	list;
+		struct sockaddr_in6	sin6;
+	};
+	struct sin6_list *sin6_iter;
+	struct sin6_list *sin6_temp;
+	struct ib_gid_attr gid_attr = {.ndev = ndev};
+	LIST_HEAD(sin6_list);
+
+	if (ndev->reg_state >= NETREG_UNREGISTERING)
+		return;
+
+	in6_dev = in6_dev_get(ndev);
+	if (!in6_dev)
+		return;
+
+	read_lock_bh(&in6_dev->lock);
+	list_for_each_entry(ifp, &in6_dev->addr_list, if_list) {
+		struct sin6_list *entry = kzalloc(sizeof(*entry), GFP_ATOMIC);
+
+		if (!entry) {
+			pr_warn("roce_gid_mgmt: couldn't allocate entry for IPv6 update\n");
+			continue;
+		}
+
+		entry->sin6.sin6_family = AF_INET6;
+		entry->sin6.sin6_addr = ifp->addr;
+		list_add_tail(&entry->list, &sin6_list);
+	}
+	read_unlock_bh(&in6_dev->lock);
+
+	in6_dev_put(in6_dev);
+
+	list_for_each_entry_safe(sin6_iter, sin6_temp, &sin6_list, list) {
+		union ib_gid	gid;
+
+		rdma_ip2gid((const struct sockaddr *)&sin6_iter->sin6, &gid);
+		update_gid(GID_ADD, ib_dev, port, &gid, &gid_attr);
+		list_del(&sin6_iter->list);
+		kfree(sin6_iter);
+	}
+}
+#endif
+
+static void add_netdev_ips(struct ib_device *ib_dev, u8 port,
+			   struct net_device *idev, void *cookie)
+{
+	struct net_device *ndev = (struct net_device *)cookie;
+
+	enum_netdev_ipv4_ips(ib_dev, port, ndev);
+#if IS_ENABLED(CONFIG_IPV6)
+	enum_netdev_ipv6_ips(ib_dev, port, ndev);
+#endif
+}
+
+static void del_netdev_ips(struct ib_device *ib_dev, u8 port,
+			   struct net_device *idev, void *cookie)
+{
+	struct net_device *ndev = (struct net_device *)cookie;
+
+	roce_del_all_netdev_gids(ib_dev, port, ndev);
+}
+
+static int netdevice_event(struct notifier_block *this, unsigned long event,
+			   void *ptr)
+{
+	static const struct netdev_event_work_cmd add_cmd = {
+		.cb = add_netdev_ips, .filter = is_eth_port_of_netdev};
+	static const struct netdev_event_work_cmd del_cmd = {
+		.cb = del_netdev_ips, .filter = pass_all_filter};
+	struct net_device *ndev = netdev_notifier_info_to_dev(ptr);
+	struct netdev_event_work *ndev_work;
+	struct netdev_event_work_cmd cmds[ROCE_NETDEV_CALLBACK_SZ] = { {NULL} };
+
+	if (ndev->type != ARPHRD_ETHER)
+		return NOTIFY_DONE;
+
+	switch (event) {
+	case NETDEV_REGISTER:
+	case NETDEV_UP:
+		cmds[0] = add_cmd;
+		break;
+
+	case NETDEV_UNREGISTER:
+		if (ndev->reg_state < NETREG_UNREGISTERED)
+			cmds[0] = del_cmd;
+		else
+			return NOTIFY_DONE;
+		break;
+
+	case NETDEV_CHANGEADDR:
+		cmds[0] = del_cmd;
+		cmds[1] = add_cmd;
+		break;
+	default:
+		return NOTIFY_DONE;
+	}
+
+	ndev_work = kmalloc(sizeof(*ndev_work), GFP_KERNEL);
+	if (!ndev_work) {
+		pr_warn("roce_gid_mgmt: can't allocate work for netdevice_event\n");
+		return NOTIFY_DONE;
+	}
+
+	memcpy(ndev_work->cmds, cmds, sizeof(ndev_work->cmds));
+	ndev_work->ndev = ndev;
+	dev_hold(ndev);
+	INIT_WORK(&ndev_work->work, netdevice_event_work_handler);
+
+	queue_work(roce_gid_mgmt_wq, &ndev_work->work);
+
+	return NOTIFY_DONE;
+}
+
+static void callback_for_addr_gid_device_scan(struct ib_device *device,
+					      u8 port,
+					      struct net_device *idev,
+					      void *cookie)
+{
+	struct update_gid_event_work *parsed = cookie;
+
+	return update_gid(parsed->gid_op, device,
+			  port, &parsed->gid,
+			  &parsed->gid_attr);
+}
+
+static void update_gid_event_work_handler(struct work_struct *_work)
+{
+	struct update_gid_event_work *work =
+		container_of(_work, struct update_gid_event_work, work);
+
+	ib_enum_roce_ports_of_netdev(is_eth_port_of_netdev, work->gid_attr.ndev,
+				     callback_for_addr_gid_device_scan, work);
+
+	dev_put(work->gid_attr.ndev);
+	kfree(work);
+}
+
+static int addr_event(struct notifier_block *this, unsigned long event,
+		      struct sockaddr *sa, struct net_device *ndev)
+{
+	struct update_gid_event_work *work;
+	enum gid_op_type gid_op;
+
+	if (ndev->type != ARPHRD_ETHER)
+		return NOTIFY_DONE;
+
+	switch (event) {
+	case NETDEV_UP:
+		gid_op = GID_ADD;
+		break;
+
+	case NETDEV_DOWN:
+		gid_op = GID_DEL;
+		break;
+
+	default:
+		return NOTIFY_DONE;
+	}
+
+	work = kmalloc(sizeof(*work), GFP_ATOMIC);
+	if (!work) {
+		pr_warn("roce_gid_mgmt: Couldn't allocate work for addr_event\n");
+		return NOTIFY_DONE;
+	}
+
+	INIT_WORK(&work->work, update_gid_event_work_handler);
+
+	rdma_ip2gid(sa, &work->gid);
+	work->gid_op = gid_op;
+
+	memset(&work->gid_attr, 0, sizeof(work->gid_attr));
+	dev_hold(ndev);
+	work->gid_attr.ndev   = ndev;
+
+	queue_work(roce_gid_mgmt_wq, &work->work);
+
+	return NOTIFY_DONE;
+}
+
+static void enum_all_gids_of_dev_cb(struct ib_device *ib_dev,
+				    u8 port,
+				    struct net_device *idev,
+				    void *cookie)
+{
+	struct net *net;
+	struct net_device *ndev;
+
+	/* Lock the rtnl to make sure the netdevs do not move under
+	 * our feet
+	 */
+	rtnl_lock();
+	for_each_net(net)
+		for_each_netdev(net, ndev)
+			if (is_eth_port_of_netdev(ib_dev, port, idev, ndev))
+				add_netdev_ips(ib_dev, port, idev, ndev);
+	rtnl_unlock();
+}
+
+/* This function will rescan all of the network devices in the system
+ * and add their gids, as needed, to the relevant RoCE devices. Will
+ * take rtnl and the IB device list mutexes. Must not be called from
+ * ib_wq or deadlock will happen. */
+static void enum_all_gids_of_dev(struct ib_device *ib_dev)
+{
+	ib_dev_roce_ports_of_netdev(ib_dev, pass_all_filter, NULL,
+				    enum_all_gids_of_dev_cb, NULL);
+}
+
+static int inetaddr_event(struct notifier_block *this, unsigned long event,
+			  void *ptr)
+{
+	struct sockaddr_in	in;
+	struct net_device	*ndev;
+	struct in_ifaddr	*ifa = ptr;
+
+	in.sin_family = AF_INET;
+	in.sin_addr.s_addr = ifa->ifa_address;
+	ndev = ifa->ifa_dev->dev;
+
+	return addr_event(this, event, (struct sockaddr *)&in, ndev);
+}
+
+#if IS_ENABLED(CONFIG_IPV6)
+static int inet6addr_event(struct notifier_block *this, unsigned long event,
+			   void *ptr)
+{
+	struct sockaddr_in6	in6;
+	struct net_device	*ndev;
+	struct inet6_ifaddr	*ifa6 = ptr;
+
+	in6.sin6_family = AF_INET6;
+	in6.sin6_addr = ifa6->addr;
+	ndev = ifa6->idev->dev;
+
+	return addr_event(this, event, (struct sockaddr *)&in6, ndev);
+}
+#endif
+
+static struct notifier_block nb_netdevice = {
+	.notifier_call = netdevice_event
+};
+
+static struct notifier_block nb_inetaddr = {
+	.notifier_call = inetaddr_event
+};
+
+#if IS_ENABLED(CONFIG_IPV6)
+static struct notifier_block nb_inet6addr = {
+	.notifier_call = inet6addr_event
+};
+#endif
+
+static void roce_rescan_device_work_handler(struct work_struct *_work)
+{
+	struct roce_rescan_work *work =
+		container_of(_work, struct roce_rescan_work, work);
+
+	enum_all_gids_of_dev(work->ib_dev);
+	kfree(work);
+}
+
+/* Caller must flush system workqueue before removing the ib_device */
+int roce_rescan_device(struct ib_device *ib_dev)
+{
+	struct roce_rescan_work *work = kmalloc(sizeof(*work), GFP_KERNEL);
+
+	if (!work)
+		return -ENOMEM;
+
+	work->ib_dev = ib_dev;
+	INIT_WORK(&work->work, roce_rescan_device_work_handler);
+	schedule_work(&work->work);
+
+	return 0;
+}
+
+int __init roce_gid_mgmt_init(void)
+{
+	roce_gid_mgmt_wq = alloc_ordered_workqueue("roce_gid_mgmt_wq", 0);
+
+	if (!roce_gid_mgmt_wq) {
+		pr_warn("roce_gid_mgmt: can't allocate work queue\n");
+		return -ENOMEM;
+	}
+
+	register_inetaddr_notifier(&nb_inetaddr);
+#if IS_ENABLED(CONFIG_IPV6)
+	register_inet6addr_notifier(&nb_inet6addr);
+#endif
+	/* We rely on the netdevice notifier to enumerate all
+	 * existing devices in the system. Register to this notifier
+	 * last to make sure we will not miss any IP add/del
+	 * callbacks.
+	 */
+	register_netdevice_notifier(&nb_netdevice);
+
+	return 0;
+}
+
+void __exit roce_gid_mgmt_cleanup(void)
+{
+#if IS_ENABLED(CONFIG_IPV6)
+	unregister_inet6addr_notifier(&nb_inet6addr);
+#endif
+	unregister_inetaddr_notifier(&nb_inetaddr);
+	unregister_netdevice_notifier(&nb_netdevice);
+	/* Ensure all gid deletion tasks complete before we go down,
+	 * to avoid any reference to free'd memory. By the time
+	 * ib-core is removed, all physical devices have been removed,
+	 * so no issue with remaining hardware contexts.
+	 */
+	synchronize_rcu();
+	drain_workqueue(roce_gid_mgmt_wq);
+	destroy_workqueue(roce_gid_mgmt_wq);
+}
diff --git a/include/rdma/ib_addr.h b/include/rdma/ib_addr.h
index ce55906..3cf32d1 100644
--- a/include/rdma/ib_addr.h
+++ b/include/rdma/ib_addr.h
@@ -142,7 +142,7 @@ static inline u16 rdma_vlan_dev_vlan_id(const struct net_device *dev)
 		vlan_dev_vlan_id(dev) : 0xffff;
 }
 
-static inline int rdma_ip2gid(struct sockaddr *addr, union ib_gid *gid)
+static inline int rdma_ip2gid(const struct sockaddr *addr, union ib_gid *gid)
 {
 	switch (addr->sa_family) {
 	case AF_INET:
diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h
index a7593b0..1bc13b1 100644
--- a/include/rdma/ib_verbs.h
+++ b/include/rdma/ib_verbs.h
@@ -1464,6 +1464,7 @@ struct ib_cache {
 	struct ib_gid_cache   **gid_cache;
 	u8                     *lmc_cache;
 	struct ib_roce_gid_cache **roce_gid_cache;
+	struct work_struct	roce_gid_cache_cleanup_work;
 };
 
 struct ib_dma_mapping_ops {
@@ -1536,6 +1537,14 @@ struct ib_device {
 						 struct ib_port_attr *port_attr);
 	enum rdma_link_layer	   (*get_link_layer)(struct ib_device *device,
 						     u8 port_num);
+	/* When calling get_netdev, the HW vendor's driver should return the
+	 * net device of device @device at port @port_num. The function
+	 * is called under rtnl_lock. The HW vendor's device driver must
+	 * guarantee to return NULL before the net device has reached
+	 * NETDEV_UNREGISTER_FINAL state.
+	 */
+	struct net_device	  *(*get_netdev)(struct ib_device *device,
+						 u8 port_num);
 	int		           (*query_gid)(struct ib_device *device,
 						u8 port_num, int index,
 						union ib_gid *gid);
-- 
2.1.0


* [PATCH v3 for-next 04/33] IB/core: Add default GID for RoCE GID Cache
From: Somnath Kotur @ 2015-03-25 21:19 UTC (permalink / raw)
  To: roland-DgEjT+Ai2ygdnm+yROfE0A
  Cc: linux-rdma-u79uwXL29TY76Z2rM5mHXA, Matan Barak, Somnath Kotur

From: Matan Barak <matanb-VPRAkNaXOzVWk0Htik3J/w@public.gmane.org>

When RoCE is used, a default GID address should be generated
for every supported RoCE type. These default GID addresses are
generated based on the IPv6 link-local address, but in contrast
to the GIDs based on the regular IPv6 link-local addresses (as
we generate a GID per IP address), these GIDs are also available
when the net device is down (in order to support loopback).
Moreover, these default GID addresses can't be deleted.
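
The derivation mirrors the IPv6 link-local interface identifier;
a sketch of what make_default_gid in this patch computes (ndev is
the port's net device):

	union ib_gid gid;

	gid.global.subnet_prefix = cpu_to_be64(0xfe80000000000000LL);
	addrconf_ifid_eui48(&gid.raw[8], ndev);	/* MAC -> modified EUI-64 */
	/* e.g. MAC 00:11:22:33:44:55 yields fe80::211:22ff:fe33:4455 */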

Signed-off-by: Matan Barak <matanb-VPRAkNaXOzVWk0Htik3J/w@public.gmane.org>
Signed-off-by: Somnath Kotur <somnath.kotur-laKkSmNT4hbQT0dZR+AlfA@public.gmane.org>
---
 drivers/infiniband/core/core_priv.h      |  12 +++
 drivers/infiniband/core/roce_gid_cache.c | 179 ++++++++++++++++++++++++++++---
 drivers/infiniband/core/roce_gid_mgmt.c  |  43 ++++++--
 include/net/addrconf.h                   |  31 ++++++
 include/rdma/ib_verbs.h                  |   1 +
 net/ipv6/addrconf.c                      |  31 ------
 6 files changed, 243 insertions(+), 54 deletions(-)

diff --git a/drivers/infiniband/core/core_priv.h b/drivers/infiniband/core/core_priv.h
index 12797d9..128d2b3 100644
--- a/drivers/infiniband/core/core_priv.h
+++ b/drivers/infiniband/core/core_priv.h
@@ -84,6 +84,16 @@ int roce_gid_cache_find_gid_by_port(struct ib_device *ib_dev, union ib_gid *gid,
 
 int roce_gid_cache_is_active(struct ib_device *ib_dev, u8 port);
 
+enum roce_gid_cache_default_mode {
+	ROCE_GID_CACHE_DEFAULT_MODE_SET,
+	ROCE_GID_CACHE_DEFAULT_MODE_DELETE
+};
+
+void roce_gid_cache_set_default_gid(struct ib_device *ib_dev, u8 port,
+				    struct net_device *ndev,
+				    unsigned long gid_type_mask,
+				    enum roce_gid_cache_default_mode mode);
+
 int roce_gid_cache_setup(void);
 void roce_gid_cache_cleanup(void);
 
@@ -100,5 +110,7 @@ int roce_gid_mgmt_init(void);
 void roce_gid_mgmt_cleanup(void);
 
 int roce_rescan_device(struct ib_device *ib_dev);
+unsigned long roce_gid_type_mask_support(struct ib_device *ib_dev, u8 port);
+
 
 #endif /* _CORE_PRIV_H */
diff --git a/drivers/infiniband/core/roce_gid_cache.c b/drivers/infiniband/core/roce_gid_cache.c
index 1d0f841..1f30dad 100644
--- a/drivers/infiniband/core/roce_gid_cache.c
+++ b/drivers/infiniband/core/roce_gid_cache.c
@@ -34,6 +34,7 @@
 #include <linux/netdevice.h>
 #include <linux/rtnetlink.h>
 #include <rdma/ib_cache.h>
+#include <net/addrconf.h>
 
 #include "core_priv.h"
 
@@ -43,8 +44,10 @@ EXPORT_SYMBOL_GPL(zgid);
 static const struct ib_gid_attr zattr;
 
 enum gid_attr_find_mask {
-	GID_ATTR_FIND_MASK_GID_TYPE	= 1UL << 0,
-	GID_ATTR_FIND_MASK_NETDEV	= 1UL << 1,
+	GID_ATTR_FIND_MASK_GID		= 1UL << 0,
+	GID_ATTR_FIND_MASK_GID_TYPE	= 1UL << 1,
+	GID_ATTR_FIND_MASK_NETDEV	= 1UL << 2,
+	GID_ATTR_FIND_MASK_DEFAULT	= 1UL << 3,
 };
 
 static inline int start_port(struct ib_device *ib_dev)
@@ -69,7 +72,8 @@ static void put_ndev(struct rcu_head *rcu)
 static int write_gid(struct ib_device *ib_dev, u8 port,
 		     struct ib_roce_gid_cache *cache, int ix,
 		     const union ib_gid *gid,
-		     const struct ib_gid_attr *attr)
+		     const struct ib_gid_attr *attr,
+		     bool  default_gid)
 {
 	unsigned int orig_seq;
 	int ret;
@@ -83,6 +87,7 @@ static int write_gid(struct ib_device *ib_dev, u8 port,
 	 */
 	smp_wmb();
 
+	cache->data_vec[ix].default_gid = default_gid;
 	ret = ib_dev->modify_gid(ib_dev, port, ix, gid, attr,
 				 &cache->data_vec[ix].context);
 
@@ -132,7 +137,8 @@ static int write_gid(struct ib_device *ib_dev, u8 port,
 }
 
 static int find_gid(struct ib_roce_gid_cache *cache, union ib_gid *gid,
-		    const struct ib_gid_attr *val, unsigned long mask)
+		    const struct ib_gid_attr *val, bool default_gid,
+		    unsigned long mask)
 {
 	int i;
 	unsigned int orig_seq;
@@ -152,13 +158,18 @@ static int find_gid(struct ib_roce_gid_cache *cache, union ib_gid *gid,
 		    attr->gid_type != val->gid_type)
 			continue;
 
-		if (memcmp(gid, &cache->data_vec[i].gid, sizeof(*gid)))
+		if (mask & GID_ATTR_FIND_MASK_GID &&
+		    memcmp(gid, &cache->data_vec[i].gid, sizeof(*gid)))
 			continue;
 
 		if (mask & GID_ATTR_FIND_MASK_NETDEV &&
 		    attr->ndev != val->ndev)
 			continue;
 
+		if (mask & GID_ATTR_FIND_MASK_DEFAULT &&
+		    cache->data_vec[i].default_gid != default_gid)
+			continue;
+
 		/* We have a match, verify that the data we
 		 * compared is valid. Make sure that the
 		 * sequence number we read is the last to be
@@ -176,12 +187,19 @@ static int find_gid(struct ib_roce_gid_cache *cache, union ib_gid *gid,
 	return -1;
 }
 
+static void make_default_gid(struct net_device *dev, union ib_gid *gid)
+{
+	gid->global.subnet_prefix = cpu_to_be64(0xfe80000000000000LL);
+	addrconf_ifid_eui48(&gid->raw[8], dev);
+}
+
 int roce_add_gid(struct ib_device *ib_dev, u8 port,
 		 union ib_gid *gid, struct ib_gid_attr *attr)
 {
 	struct ib_roce_gid_cache *cache;
 	int ix;
 	int ret = 0;
+	struct net_device *idev;
 
 	if (!ib_dev->cache.roce_gid_cache)
 		return -ENOSYS;
@@ -194,20 +212,38 @@ int roce_add_gid(struct ib_device *ib_dev, u8 port,
 	if (!memcmp(gid, &zgid, sizeof(*gid)))
 		return -EINVAL;
 
+	if (ib_dev->get_netdev) {
+		rcu_read_lock();
+		idev = ib_dev->get_netdev(ib_dev, port);
+		if (idev && attr->ndev != idev) {
+			union ib_gid default_gid;
+
+			/* Adding default GIDs is not permitted */
+			make_default_gid(idev, &default_gid);
+			if (!memcmp(gid, &default_gid, sizeof(*gid))) {
+				rcu_read_unlock();
+				return -EPERM;
+			}
+		}
+		rcu_read_unlock();
+	}
+
 	mutex_lock(&cache->lock);
 
-	ix = find_gid(cache, gid, attr, GID_ATTR_FIND_MASK_GID_TYPE |
+	ix = find_gid(cache, gid, attr, false, GID_ATTR_FIND_MASK_GID |
+		      GID_ATTR_FIND_MASK_GID_TYPE |
 		      GID_ATTR_FIND_MASK_NETDEV);
 	if (ix >= 0)
 		goto out_unlock;
 
-	ix = find_gid(cache, &zgid, NULL, 0);
+	ix = find_gid(cache, &zgid, NULL, false, GID_ATTR_FIND_MASK_GID |
+		      GID_ATTR_FIND_MASK_DEFAULT);
 	if (ix < 0) {
 		ret = -ENOSPC;
 		goto out_unlock;
 	}
 
-	write_gid(ib_dev, port, cache, ix, gid, attr);
+	write_gid(ib_dev, port, cache, ix, gid, attr, false);
 
 out_unlock:
 	mutex_unlock(&cache->lock);
@@ -218,6 +254,7 @@ int roce_del_gid(struct ib_device *ib_dev, u8 port,
 		 union ib_gid *gid, struct ib_gid_attr *attr)
 {
 	struct ib_roce_gid_cache *cache;
+	union ib_gid default_gid;
 	int ix;
 
 	if (!ib_dev->cache.roce_gid_cache)
@@ -228,15 +265,24 @@ int roce_del_gid(struct ib_device *ib_dev, u8 port,
 	if (!cache || !cache->active)
 		return -ENOSYS;
 
+	if (attr->ndev) {
+		/* Deleting default GIDs is not permitted */
+		make_default_gid(attr->ndev, &default_gid);
+		if (!memcmp(gid, &default_gid, sizeof(*gid)))
+			return -EPERM;
+	}
+
 	mutex_lock(&cache->lock);
 
-	ix = find_gid(cache, gid, attr,
+	ix = find_gid(cache, gid, attr, false,
+		      GID_ATTR_FIND_MASK_GID	  |
 		      GID_ATTR_FIND_MASK_GID_TYPE |
-		      GID_ATTR_FIND_MASK_NETDEV);
+		      GID_ATTR_FIND_MASK_NETDEV	  |
+		      GID_ATTR_FIND_MASK_DEFAULT);
 	if (ix < 0)
 		goto out_unlock;
 
-	write_gid(ib_dev, port, cache, ix, &zgid, &zattr);
+	write_gid(ib_dev, port, cache, ix, &zgid, &zattr, false);
 
 out_unlock:
 	mutex_unlock(&cache->lock);
@@ -261,7 +307,7 @@ int roce_del_all_netdev_gids(struct ib_device *ib_dev, u8 port,
 
 	for (ix = 0; ix < cache->sz; ix++)
 		if (cache->data_vec[ix].attr.ndev == ndev)
-			write_gid(ib_dev, port, cache, ix, &zgid, &zattr);
+			write_gid(ib_dev, port, cache, ix, &zgid, &zattr, false);
 
 	mutex_unlock(&cache->lock);
 	return 0;
@@ -326,7 +372,7 @@ static int _roce_gid_cache_find_gid(struct ib_device *ib_dev, union ib_gid *gid,
 		cache = ib_dev->cache.roce_gid_cache[p];
 		if (!cache || !cache->active)
 			continue;
-		local_index = find_gid(cache, gid, val, mask);
+		local_index = find_gid(cache, gid, val, false, mask);
 		if (local_index >= 0) {
 			if (index)
 				*index = local_index;
@@ -372,7 +418,8 @@ int roce_gid_cache_find_gid_by_port(struct ib_device *ib_dev, union ib_gid *gid,
 {
 	int local_index;
 	struct ib_roce_gid_cache *cache;
-	unsigned long mask = GID_ATTR_FIND_MASK_GID_TYPE;
+	unsigned long mask = GID_ATTR_FIND_MASK_GID |
+			     GID_ATTR_FIND_MASK_GID_TYPE;
 	struct ib_gid_attr val = {.gid_type = gid_type};
 
 	if (!ib_dev->cache.roce_gid_cache || port < start_port(ib_dev) ||
@@ -385,7 +432,7 @@ int roce_gid_cache_find_gid_by_port(struct ib_device *ib_dev, union ib_gid *gid,
 
 	mask |= get_netdev_from_ifindex(net, if_index, &val);
 
-	local_index = find_gid(cache, gid, &val, mask);
+	local_index = find_gid(cache, gid, &val, false, mask);
 	if (local_index >= 0) {
 		if (index)
 			*index = local_index;
@@ -429,7 +476,8 @@ static void free_roce_gid_cache(struct ib_device *ib_dev, u8 port)
 	for (i = 0; i < cache->sz; ++i) {
 		if (memcmp(&cache->data_vec[i].gid, &zgid,
 			   sizeof(cache->data_vec[i].gid)))
-		    write_gid(ib_dev, port, cache, i, &zgid, &zattr);
+		    write_gid(ib_dev, port, cache, i, &zgid, &zattr,
+			      cache->data_vec[i].default_gid);
 	}
 	kfree(cache->data_vec);
 	kfree(cache);
@@ -444,6 +492,101 @@ static void set_roce_gid_cache_active(struct ib_roce_gid_cache *cache,
 	cache->active = active;
 }
 
+void roce_gid_cache_set_default_gid(struct ib_device *ib_dev, u8 port,
+				    struct net_device *ndev,
+				    unsigned long gid_type_mask,
+				    enum roce_gid_cache_default_mode mode)
+{
+	union ib_gid gid;
+	struct ib_gid_attr gid_attr;
+	struct ib_gid_attr zattr_type = zattr;
+	struct ib_roce_gid_cache *cache;
+	unsigned int gid_type;
+
+	cache  = ib_dev->cache.roce_gid_cache[port - 1];
+
+	if (!cache)
+		return;
+
+	make_default_gid(ndev, &gid);
+	memset(&gid_attr, 0, sizeof(gid_attr));
+	gid_attr.ndev = ndev;
+	for (gid_type = 0; gid_type < IB_GID_TYPE_SIZE; ++gid_type) {
+		int ix;
+		union ib_gid current_gid;
+		struct ib_gid_attr current_gid_attr;
+
+		if (1UL << gid_type & ~gid_type_mask)
+			continue;
+
+		gid_attr.gid_type = gid_type;
+
+		ix = find_gid(cache, &gid, &gid_attr, true,
+			      GID_ATTR_FIND_MASK_GID_TYPE |
+			      GID_ATTR_FIND_MASK_DEFAULT);
+
+		if (ix < 0) {
+			pr_warn("roce_gid_cache: couldn't find index for default gid type %u\n",
+				gid_type);
+			continue;
+		}
+
+		zattr_type.gid_type = gid_type;
+
+		mutex_lock(&cache->lock);
+		if (!roce_gid_cache_get_gid(ib_dev, port, ix,
+					    &current_gid, &current_gid_attr) &&
+		    mode == ROCE_GID_CACHE_DEFAULT_MODE_SET &&
+		    !memcmp(&gid, &current_gid, sizeof(gid)) &&
+		    !memcmp(&gid_attr, &current_gid_attr, sizeof(gid_attr))) {
+			mutex_unlock(&cache->lock);
+			continue;
+		}
+
+		if ((memcmp(&current_gid, &zgid, sizeof(current_gid)) ||
+		     memcmp(&current_gid_attr, &zattr_type,
+			    sizeof(current_gid_attr))) &&
+		    write_gid(ib_dev, port, cache, ix, &zgid, &zattr, true)) {
+			pr_warn("roce_gid_cache: can't delete index %d for default gid %pI6\n",
+				ix, gid.raw);
+			mutex_unlock(&cache->lock);
+			continue;
+		}
+
+		if (mode == ROCE_GID_CACHE_DEFAULT_MODE_SET)
+			if (write_gid(ib_dev, port, cache, ix, &gid, &gid_attr,
+				      true))
+				pr_warn("roce_gid_cache: unable to add default gid %pI6\n",
+					gid.raw);
+
+		mutex_unlock(&cache->lock);
+	}
+}
+
+static int roce_gid_cache_reserve_default(struct ib_device *ib_dev, u8 port)
+{
+	unsigned int i;
+	unsigned long roce_gid_type_mask;
+	unsigned int num_default_gids;
+	struct ib_roce_gid_cache *cache;
+
+	cache  = ib_dev->cache.roce_gid_cache[port - 1];
+
+	roce_gid_type_mask = roce_gid_type_mask_support(ib_dev, port);
+	num_default_gids = hweight_long(roce_gid_type_mask);
+	for (i = 0; i < num_default_gids && i < cache->sz; i++) {
+		struct ib_roce_gid_cache_entry *entry =
+			&cache->data_vec[i];
+
+		entry->default_gid = true;
+		entry->attr.gid_type = find_next_bit(&roce_gid_type_mask,
+						     BITS_PER_LONG,
+						     i);
+	}
+
+	return 0;
+}
+
 static int roce_gid_cache_setup_one(struct ib_device *ib_dev)
 {
 	u8 port;
@@ -472,6 +615,10 @@ static int roce_gid_cache_setup_one(struct ib_device *ib_dev)
 			err = -ENOMEM;
 			goto rollback_cache_setup;
 		}
+
+		err = roce_gid_cache_reserve_default(ib_dev, port + 1);
+		if (err)
+			goto rollback_cache_setup;
 	}
 	return 0;
 
diff --git a/drivers/infiniband/core/roce_gid_mgmt.c b/drivers/infiniband/core/roce_gid_mgmt.c
index d51138c..c0cbb23 100644
--- a/drivers/infiniband/core/roce_gid_mgmt.c
+++ b/drivers/infiniband/core/roce_gid_mgmt.c
@@ -82,24 +82,37 @@ static const struct {
 
 #define CAP_TO_GID_TABLE_SIZE	ARRAY_SIZE(PORT_CAP_TO_GID_TYPE)
 
-static void update_gid(enum gid_op_type gid_op, struct ib_device *ib_dev,
-		       u8 port, union ib_gid *gid,
-		       struct ib_gid_attr *gid_attr)
+unsigned long roce_gid_type_mask_support(struct ib_device *ib_dev, u8 port)
 {
 	struct ib_port_attr pattr;
 	int i;
 	int err;
+	unsigned int ret_flags = 0;
 
 	err = ib_query_port(ib_dev, port, &pattr);
 	if (err) {
 		pr_warn("update_gid: ib_query_port() failed for %s, %d\n",
 			ib_dev->name, err);
+		return 0;
 	}
 
-	for (i = 0; i < CAP_TO_GID_TABLE_SIZE; i++) {
-		if (pattr.port_cap_flags & PORT_CAP_TO_GID_TYPE[i].flag_mask) {
-			gid_attr->gid_type =
-				PORT_CAP_TO_GID_TYPE[i].gid_type;
+	for (i = 0; i < CAP_TO_GID_TABLE_SIZE; i++)
+		if (pattr.port_cap_flags & PORT_CAP_TO_GID_TYPE[i].flag_mask)
+			ret_flags |= 1UL << PORT_CAP_TO_GID_TYPE[i].gid_type;
+
+	return ret_flags;
+}
+
+static void update_gid(enum gid_op_type gid_op, struct ib_device *ib_dev,
+		       u8 port, union ib_gid *gid,
+		       struct ib_gid_attr *gid_attr)
+{
+	int i;
+	unsigned long gid_type_mask = roce_gid_type_mask_support(ib_dev, port);
+
+	for (i = 0; i < IB_GID_TYPE_SIZE; i++) {
+		if ((1UL << i) & gid_type_mask) {
+			gid_attr->gid_type = i;
 			switch (gid_op) {
 			case GID_ADD:
 				roce_add_gid(ib_dev, port,
@@ -167,6 +180,21 @@ static void update_gid_ip(enum gid_op_type gid_op,
 	update_gid(gid_op, ib_dev, port, &gid, &gid_attr);
 }
 
+static void enum_netdev_default_gids(struct ib_device *ib_dev,
+				     u8 port, struct net_device *ndev,
+				     struct net_device *idev)
+{
+	unsigned long gid_type_mask;
+
+	if (idev != ndev)
+		return;
+
+	gid_type_mask = roce_gid_type_mask_support(ib_dev, port);
+
+	roce_gid_cache_set_default_gid(ib_dev, port, idev, gid_type_mask,
+				       ROCE_GID_CACHE_DEFAULT_MODE_SET);
+}
+
 static void enum_netdev_ipv4_ips(struct ib_device *ib_dev,
 				 u8 port, struct net_device *ndev)
 {
@@ -247,6 +275,7 @@ static void add_netdev_ips(struct ib_device *ib_dev, u8 port,
 {
 	struct net_device *ndev = (struct net_device *)cookie;
 
+	enum_netdev_default_gids(ib_dev, port, ndev, idev);
 	enum_netdev_ipv4_ips(ib_dev, port, ndev);
 #if IS_ENABLED(CONFIG_IPV6)
 	enum_netdev_ipv6_ips(ib_dev, port, ndev);
diff --git a/include/net/addrconf.h b/include/net/addrconf.h
index d13573b..378bf82 100644
--- a/include/net/addrconf.h
+++ b/include/net/addrconf.h
@@ -88,6 +88,37 @@ int ipv6_rcv_saddr_equal(const struct sock *sk, const struct sock *sk2);
 void addrconf_join_solict(struct net_device *dev, const struct in6_addr *addr);
 void addrconf_leave_solict(struct inet6_dev *idev, const struct in6_addr *addr);
 
+static inline int addrconf_ifid_eui48(u8 *eui, struct net_device *dev)
+{
+	if (dev->addr_len != ETH_ALEN)
+		return -1;
+	memcpy(eui, dev->dev_addr, 3);
+	memcpy(eui + 5, dev->dev_addr + 3, 3);
+
+	/*
+	 * The zSeries OSA network cards can be shared among various
+	 * OS instances, but the OSA cards have only one MAC address.
+	 * This leads to duplicate address conflicts in conjunction
+	 * with IPv6 if more than one instance uses the same card.
+	 *
+	 * The driver for these cards can deliver a unique 16-bit
+	 * identifier for each instance sharing the same card.  It is
+	 * placed instead of 0xFFFE in the interface identifier.  The
+	 * "u" bit of the interface identifier is not inverted in this
+	 * case.  Hence the resulting interface identifier has local
+	 * scope according to RFC2373.
+	 */
+	if (dev->dev_id) {
+		eui[3] = (dev->dev_id >> 8) & 0xFF;
+		eui[4] = dev->dev_id & 0xFF;
+	} else {
+		eui[3] = 0xFF;
+		eui[4] = 0xFE;
+		eui[0] ^= 2;
+	}
+	return 0;
+}
+
 static inline unsigned long addrconf_timeout_fixup(u32 timeout,
 						   unsigned int unit)
 {
diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h
index 1bc13b1..3956863 100644
--- a/include/rdma/ib_verbs.h
+++ b/include/rdma/ib_verbs.h
@@ -84,6 +84,7 @@ struct ib_roce_gid_cache_entry {
 	union ib_gid        gid;
 	struct ib_gid_attr  attr;
 	void		   *context;
+	bool		    default_gid;
 };
 
 struct ib_roce_gid_cache {
diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index f7c8bbe..1ad323d 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -1765,37 +1765,6 @@ static void addrconf_leave_anycast(struct inet6_ifaddr *ifp)
 	__ipv6_dev_ac_dec(ifp->idev, &addr);
 }
 
-static int addrconf_ifid_eui48(u8 *eui, struct net_device *dev)
-{
-	if (dev->addr_len != ETH_ALEN)
-		return -1;
-	memcpy(eui, dev->dev_addr, 3);
-	memcpy(eui + 5, dev->dev_addr + 3, 3);
-
-	/*
-	 * The zSeries OSA network cards can be shared among various
-	 * OS instances, but the OSA cards have only one MAC address.
-	 * This leads to duplicate address conflicts in conjunction
-	 * with IPv6 if more than one instance uses the same card.
-	 *
-	 * The driver for these cards can deliver a unique 16-bit
-	 * identifier for each instance sharing the same card.  It is
-	 * placed instead of 0xFFFE in the interface identifier.  The
-	 * "u" bit of the interface identifier is not inverted in this
-	 * case.  Hence the resulting interface identifier has local
-	 * scope according to RFC2373.
-	 */
-	if (dev->dev_id) {
-		eui[3] = (dev->dev_id >> 8) & 0xFF;
-		eui[4] = dev->dev_id & 0xFF;
-	} else {
-		eui[3] = 0xFF;
-		eui[4] = 0xFE;
-		eui[0] ^= 2;
-	}
-	return 0;
-}
-
 static int addrconf_ifid_eui64(u8 *eui, struct net_device *dev)
 {
 	if (dev->addr_len != IEEE802154_ADDR_LEN)
-- 
2.1.0
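
For reference, the default GID reserved above is fully determined by
the slave netdev's MAC address: make_default_gid() combines the
fe80::/64 link-local prefix with the modified EUI-64 interface ID
produced by addrconf_ifid_eui48(). Below is a minimal standalone
sketch of that derivation (not part of the patch, and ignoring the
zSeries dev_id special case):

#include <stdio.h>
#include <stdint.h>
#include <string.h>

/* Mirrors make_default_gid() + addrconf_ifid_eui48() for dev_id == 0. */
static void default_gid_from_mac(const uint8_t mac[6], uint8_t gid[16])
{
	memset(gid, 0, 16);
	gid[0] = 0xfe;			/* link-local prefix fe80::/64 */
	gid[1] = 0x80;
	memcpy(gid + 8, mac, 3);	/* EUI-48 -> EUI-64 ... */
	gid[11] = 0xff;			/* ... with ff:fe in the middle */
	gid[12] = 0xfe;
	memcpy(gid + 13, mac + 3, 3);
	gid[8] ^= 2;			/* flip the universal/local bit */
}

int main(void)
{
	const uint8_t mac[6] = { 0x00, 0x11, 0x22, 0x33, 0x44, 0x55 };
	uint8_t gid[16];
	int i;

	default_gid_from_mac(mac, gid);
	for (i = 0; i < 16; i += 2)
		printf("%02x%02x%c", gid[i], gid[i + 1], i < 14 ? ':' : '\n');
	return 0;
}

For a MAC of 00:11:22:33:44:55 this prints
fe80:0000:0000:0000:0211:22ff:fe33:4455; the 02 in byte 8 is the
flipped universal/local bit of the EUI-64.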


* [PATCH v3 for-next 05/33] net/bonding: make DRV macros private
       [not found] ` <1427318422-12004-1-git-send-email-somnath.kotur-laKkSmNT4hbQT0dZR+AlfA@public.gmane.org>
                     ` (3 preceding siblings ...)
  2015-03-25 21:19   ` [PATCH v3 for-next 04/33] IB/core: Add default GID for RoCE GID Cache Somnath Kotur
@ 2015-03-25 21:19   ` Somnath Kotur
  2015-03-25 21:19   ` [PATCH v3 for-next 06/33] net: Add info for NETDEV_CHANGEUPPER event Somnath Kotur
                     ` (27 subsequent siblings)
  32 siblings, 0 replies; 82+ messages in thread
From: Somnath Kotur @ 2015-03-25 21:19 UTC (permalink / raw)
  To: roland-DgEjT+Ai2ygdnm+yROfE0A
  Cc: linux-rdma-u79uwXL29TY76Z2rM5mHXA, Matan Barak, Somnath Kotur

From: Matan Barak <matanb-VPRAkNaXOzVWk0Htik3J/w@public.gmane.org>

The bonding module currently defines four macros with
general names that pollute the global namespace:
DRV_VERSION
DRV_RELDATE
DRV_NAME
DRV_DESCRIPTION

Fix that by moving those defines into a private
bonding_priv.h header file.

Signed-off-by: Matan Barak <matanb-VPRAkNaXOzVWk0Htik3J/w@public.gmane.org>
Signed-off-by: Somnath Kotur <somnath.kotur-laKkSmNT4hbQT0dZR+AlfA@public.gmane.org>
---
 drivers/net/bonding/bond_main.c    |  2 ++
 drivers/net/bonding/bond_procfs.c  |  1 +
 drivers/net/bonding/bonding_priv.h | 26 ++++++++++++++++++++++++++
 include/net/bonding.h              |  7 -------
 4 files changed, 29 insertions(+), 7 deletions(-)
 create mode 100644 drivers/net/bonding/bonding_priv.h

diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c
index 468c70e..55f2d3e 100644
--- a/drivers/net/bonding/bond_main.c
+++ b/drivers/net/bonding/bond_main.c
@@ -81,6 +81,8 @@
 #include <net/bond_3ad.h>
 #include <net/bond_alb.h>
 
+#include "bonding_priv.h"
+
 /*---------------------------- Module parameters ----------------------------*/
 
 /* monitor all links that often (in milliseconds). <=0 disables monitoring */
diff --git a/drivers/net/bonding/bond_procfs.c b/drivers/net/bonding/bond_procfs.c
index 976f5ad..b50a002 100644
--- a/drivers/net/bonding/bond_procfs.c
+++ b/drivers/net/bonding/bond_procfs.c
@@ -4,6 +4,7 @@
 #include <net/netns/generic.h>
 #include <net/bonding.h>
 
+#include "bonding_priv.h"
 
 static void *bond_info_seq_start(struct seq_file *seq, loff_t *pos)
 	__acquires(RCU)
diff --git a/drivers/net/bonding/bonding_priv.h b/drivers/net/bonding/bonding_priv.h
new file mode 100644
index 0000000..c093e91
--- /dev/null
+++ b/drivers/net/bonding/bonding_priv.h
@@ -0,0 +1,26 @@
+/*
+ * Bond several ethernet interfaces into a Cisco, running 'Etherchannel'.
+ *
+ * Portions are (c) Copyright 1995 Simon "Guru Aleph-Null" Janes
+ * NCM: Network and Communications Management, Inc.
+ *
+ * BUT, I'm the one who modified it for ethernet, so:
+ * (c) Copyright 1999, Thomas Davis, tadavis-/3juihCSby0@public.gmane.org
+ *
+ *	This software may be used and distributed according to the terms
+ *	of the GNU Public License, incorporated herein by reference.
+ *
+ */
+
+#ifndef _BONDING_PRIV_H
+#define _BONDING_PRIV_H
+
+#define DRV_VERSION	"3.7.1"
+#define DRV_RELDATE	"April 27, 2011"
+#define DRV_NAME	"bonding"
+#define DRV_DESCRIPTION	"Ethernet Channel Bonding Driver"
+
+#define bond_version DRV_DESCRIPTION ": v" DRV_VERSION " (" DRV_RELDATE ")\n"
+
+#endif
+
diff --git a/include/net/bonding.h b/include/net/bonding.h
index 4c2b0f4..a124173 100644
--- a/include/net/bonding.h
+++ b/include/net/bonding.h
@@ -30,13 +30,6 @@
 #include <net/bond_alb.h>
 #include <net/bond_options.h>
 
-#define DRV_VERSION	"3.7.1"
-#define DRV_RELDATE	"April 27, 2011"
-#define DRV_NAME	"bonding"
-#define DRV_DESCRIPTION	"Ethernet Channel Bonding Driver"
-
-#define bond_version DRV_DESCRIPTION ": v" DRV_VERSION " (" DRV_RELDATE ")\n"
-
 #define BOND_MAX_ARP_TARGETS	16
 
 #define BOND_DEFAULT_MIIMON	100
-- 
2.1.0


* [PATCH v3 for-next 06/33] net: Add info for NETDEV_CHANGEUPPER event
       [not found] ` <1427318422-12004-1-git-send-email-somnath.kotur-laKkSmNT4hbQT0dZR+AlfA@public.gmane.org>
                     ` (4 preceding siblings ...)
  2015-03-25 21:19   ` [PATCH v3 for-next 05/33] net/bonding: make DRV macros private Somnath Kotur
@ 2015-03-25 21:19   ` Somnath Kotur
  2015-03-25 21:19   ` [PATCH v3 for-next 07/33] IB/core: Add RoCE cache bonding support Somnath Kotur
                     ` (26 subsequent siblings)
  32 siblings, 0 replies; 82+ messages in thread
From: Somnath Kotur @ 2015-03-25 21:19 UTC (permalink / raw)
  To: roland-DgEjT+Ai2ygdnm+yROfE0A
  Cc: linux-rdma-u79uwXL29TY76Z2rM5mHXA, Matan Barak, Somnath Kotur

From: Matan Barak <matanb-VPRAkNaXOzVWk0Htik3J/w@public.gmane.org>

Consumers of the NETDEV_CHANGEUPPER event sometimes want
to know which upper device was linked or unlinked and
which operation was carried out. Add this extra
information to the notifier info block.
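
For illustration, a hypothetical consumer (not part of this patch)
could recover both pieces of information in its notifier callback;
container_of() on the info pointer is valid because the
netdev_notifier_info member is placed first in the new struct:

static int my_netdev_event(struct notifier_block *this,
			   unsigned long event, void *ptr)
{
	struct net_device *ndev = netdev_notifier_info_to_dev(ptr);

	if (event == NETDEV_CHANGEUPPER) {
		struct netdev_changeupper_info *info =
			container_of(ptr, struct netdev_changeupper_info,
				     info);

		pr_info("%s: %s upper %s\n", ndev->name,
			info->event == NETDEV_CHANGEUPPER_LINK ?
			"linked to" : "unlinked from",
			info->upper->name);
	}
	return NOTIFY_DONE;
}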

Signed-off-by: Matan Barak <matanb-VPRAkNaXOzVWk0Htik3J/w@public.gmane.org>
Signed-off-by: Somnath Kotur <somnath.kotur-laKkSmNT4hbQT0dZR+AlfA@public.gmane.org>
---
 include/linux/netdevice.h | 14 ++++++++++++++
 net/core/dev.c            | 12 ++++++++++--
 2 files changed, 24 insertions(+), 2 deletions(-)

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index f36f7d3..599d7c8 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -3466,6 +3466,20 @@ struct sk_buff *__skb_gso_segment(struct sk_buff *skb,
 struct sk_buff *skb_mac_gso_segment(struct sk_buff *skb,
 				    netdev_features_t features);
 
+enum netdev_changeupper_event {
+	NETDEV_CHANGEUPPER_LINK,
+	NETDEV_CHANGEUPPER_UNLINK,
+};
+
+struct netdev_changeupper_info {
+	struct netdev_notifier_info	info; /* must be first */
+	enum netdev_changeupper_event	event;
+	struct net_device		*upper;
+};
+
+void netdev_changeupper_info_change(struct net_device *dev,
+				    struct netdev_changeupper_info *info);
+
 struct netdev_bonding_info {
 	ifslave	slave;
 	ifbond	master;
diff --git a/net/core/dev.c b/net/core/dev.c
index ea714fc..1ef1bd5 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -5118,6 +5118,7 @@ static int __netdev_upper_dev_link(struct net_device *dev,
 				   void *private)
 {
 	struct netdev_adjacent *i, *j, *to_i, *to_j;
+	struct netdev_changeupper_info changeupper_info;
 	int ret = 0;
 
 	ASSERT_RTNL();
@@ -5173,7 +5174,10 @@ static int __netdev_upper_dev_link(struct net_device *dev,
 			goto rollback_lower_mesh;
 	}
 
-	call_netdevice_notifiers(NETDEV_CHANGEUPPER, dev);
+	changeupper_info.event = NETDEV_CHANGEUPPER_LINK;
+	changeupper_info.upper = upper_dev;
+	call_netdevice_notifiers_info(NETDEV_CHANGEUPPER, dev,
+				      &changeupper_info.info);
 	return 0;
 
 rollback_lower_mesh:
@@ -5269,6 +5273,7 @@ void netdev_upper_dev_unlink(struct net_device *dev,
 			     struct net_device *upper_dev)
 {
 	struct netdev_adjacent *i, *j;
+	struct netdev_changeupper_info changeupper_info;
 	ASSERT_RTNL();
 
 	__netdev_adjacent_dev_unlink_neighbour(dev, upper_dev);
@@ -5290,7 +5295,10 @@ void netdev_upper_dev_unlink(struct net_device *dev,
 	list_for_each_entry(i, &upper_dev->all_adj_list.upper, list)
 		__netdev_adjacent_dev_unlink(dev, i->dev);
 
-	call_netdevice_notifiers(NETDEV_CHANGEUPPER, dev);
+	changeupper_info.event = NETDEV_CHANGEUPPER_UNLINK;
+	changeupper_info.upper = upper_dev;
+	call_netdevice_notifiers_info(NETDEV_CHANGEUPPER, dev,
+				      &changeupper_info.info);
 }
 EXPORT_SYMBOL(netdev_upper_dev_unlink);
 
-- 
2.1.0


* [PATCH v3 for-next 07/33] IB/core: Add RoCE cache bonding support
       [not found] ` <1427318422-12004-1-git-send-email-somnath.kotur-laKkSmNT4hbQT0dZR+AlfA@public.gmane.org>
                     ` (5 preceding siblings ...)
  2015-03-25 21:19   ` [PATCH v3 for-next 06/33] net: Add info for NETDEV_CHANGEUPPER event Somnath Kotur
@ 2015-03-25 21:19   ` Somnath Kotur
  2015-03-25 21:19   ` [PATCH v3 for-next 08/33] IB/core: GID attribute should be returned from verbs API and cache API Somnath Kotur
                     ` (25 subsequent siblings)
  32 siblings, 0 replies; 82+ messages in thread
From: Somnath Kotur @ 2015-03-25 21:19 UTC (permalink / raw)
  To: roland-DgEjT+Ai2ygdnm+yROfE0A
  Cc: linux-rdma-u79uwXL29TY76Z2rM5mHXA, Matan Barak, Somnath Kotur

From: Matan Barak <matanb-VPRAkNaXOzVWk0Htik3J/w@public.gmane.org>

Bonding needs unique handling: when working in
active-backup mode, only the currently selected slave
should occupy the default GIDs and the master's GID.
Listen to bonding events and add the required GIDs only
for the active slave in the RoCE GID cache table.
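
The core of the decision can be sketched as follows; this is
illustrative only (is_eth_active_slave_of_bonding() below is the
authoritative version, and netif_is_bond_master() stands in for the
patch's IS_NETDEV_BONDING_MASTER macro):

static bool slave_owns_bond_gids(struct net_device *slave,
				 struct net_device *master)
{
	struct net_device *active;

	/* Not enslaved to a bond: the netdev keeps its own GIDs. */
	if (!master || !netif_is_bond_master(master))
		return true;

	rcu_read_lock();
	active = bond_option_active_slave_get_rcu(netdev_priv(master));
	rcu_read_unlock();

	/* NULL active slave: no active-backup semantics to enforce. */
	return !active || active == slave;
}

On NETDEV_BONDING_FAILOVER the GIDs are removed from the outgoing
slave and re-added for the newly active one, so the bond's addresses
follow the failover.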

Signed-off-by: Matan Barak <matanb-VPRAkNaXOzVWk0Htik3J/w@public.gmane.org>
Signed-off-by: Somnath Kotur <somnath.kotur-laKkSmNT4hbQT0dZR+AlfA@public.gmane.org>
---
 drivers/infiniband/core/roce_gid_mgmt.c | 291 ++++++++++++++++++++++++++++++--
 drivers/net/bonding/bond_options.c      |  13 --
 include/net/bonding.h                   |   7 +
 3 files changed, 282 insertions(+), 29 deletions(-)

diff --git a/drivers/infiniband/core/roce_gid_mgmt.c b/drivers/infiniband/core/roce_gid_mgmt.c
index c0cbb23..362327f 100644
--- a/drivers/infiniband/core/roce_gid_mgmt.c
+++ b/drivers/infiniband/core/roce_gid_mgmt.c
@@ -37,6 +37,7 @@
 
 /* For in6_dev_get/in6_dev_put */
 #include <net/addrconf.h>
+#include <net/bonding.h>
 
 #include <rdma/ib_cache.h>
 #include <rdma/ib_addr.h>
@@ -55,16 +56,17 @@ struct  update_gid_event_work {
 	enum gid_op_type gid_op;
 };
 
-#define ROCE_NETDEV_CALLBACK_SZ		2
+#define ROCE_NETDEV_CALLBACK_SZ		3
 struct netdev_event_work_cmd {
 	roce_netdev_callback	cb;
 	roce_netdev_filter	filter;
+	struct net_device	*ndev;
+	struct net_device	*f_ndev;
 };
 
 struct netdev_event_work {
 	struct work_struct		work;
 	struct netdev_event_work_cmd	cmds[ROCE_NETDEV_CALLBACK_SZ];
-	struct net_device		*ndev;
 };
 
 struct roce_rescan_work {
@@ -127,22 +129,96 @@ static void update_gid(enum gid_op_type gid_op, struct ib_device *ib_dev,
 	}
 }
 
+#define IS_NETDEV_BONDING_MASTER(ndev)	\
+	(((ndev)->priv_flags &		\
+	  (IFF_BONDING | IFF_MASTER)) == (IFF_BONDING | IFF_MASTER))
+
+enum bonding_slave_state {
+	BONDING_SLAVE_STATE_ACTIVE	= 1UL << 0,
+	BONDING_SLAVE_STATE_INACTIVE	= 1UL << 1,
+	BONDING_SLAVE_STATE_NA		= 1UL << 2,
+};
+
+static enum bonding_slave_state is_eth_active_slave_of_bonding(struct net_device *idev,
+							       struct net_device *upper)
+{
+	if (upper && IS_NETDEV_BONDING_MASTER(upper)) {
+		struct net_device *pdev;
+
+		rcu_read_lock();
+		pdev = bond_option_active_slave_get_rcu(netdev_priv(upper));
+		rcu_read_unlock();
+		if (pdev)
+			return idev == pdev ? BONDING_SLAVE_STATE_ACTIVE :
+				BONDING_SLAVE_STATE_INACTIVE;
+	}
+
+	return BONDING_SLAVE_STATE_NA;
+}
+
+static bool is_upper_dev_rcu(struct net_device *dev, struct net_device *upper)
+{
+	struct net_device *_upper = NULL;
+	struct list_head *iter;
+
+	rcu_read_lock();
+	netdev_for_each_all_upper_dev_rcu(dev, _upper, iter) {
+		if (_upper == upper)
+			break;
+	}
+
+	rcu_read_unlock();
+	return _upper == upper;
+}
+
+static int _is_eth_port_of_netdev(struct ib_device *ib_dev, u8 port,
+				  struct net_device *idev, void *cookie,
+				  unsigned long bond_state)
+{
+	struct net_device *ndev = (struct net_device *)cookie;
+	struct net_device *rdev;
+	int res;
+
+	if (!idev)
+		return 0;
+
+	rcu_read_lock();
+	rdev = rdma_vlan_dev_real_dev(ndev);
+	if (!rdev)
+		rdev = ndev;
+
+	res = ((is_upper_dev_rcu(idev, ndev) &&
+	       (is_eth_active_slave_of_bonding(idev, rdev) &
+		bond_state)) ||
+	       rdev == idev);
+
+	rcu_read_unlock();
+	return res;
+}
+
 static int is_eth_port_of_netdev(struct ib_device *ib_dev, u8 port,
 				 struct net_device *idev, void *cookie)
 {
-	struct net_device *rdev;
-	struct net_device *mdev;
-	struct net_device *ndev = (struct net_device *)cookie;
+	return _is_eth_port_of_netdev(ib_dev, port, idev, cookie,
+				      BONDING_SLAVE_STATE_ACTIVE |
+				      BONDING_SLAVE_STATE_NA);
+}
 
+static int is_eth_port_inactive_slave(struct ib_device *ib_dev, u8 port,
+				      struct net_device *idev, void *cookie)
+{
+	struct net_device *mdev;
+	int res;
 	if (!idev)
 		return 0;
 
 	rcu_read_lock();
 	mdev = netdev_master_upper_dev_get_rcu(idev);
-	rdev = rdma_vlan_dev_real_dev(ndev);
+	res = is_eth_active_slave_of_bonding(idev, mdev) ==
+		BONDING_SLAVE_STATE_INACTIVE;
 	rcu_read_unlock();
 
-	return (rdev ? rdev : ndev) == (mdev ? mdev : idev);
+	return res;
 }
 
 static int pass_all_filter(struct ib_device *ib_dev, u8 port,
@@ -151,17 +227,49 @@ static int pass_all_filter(struct ib_device *ib_dev, u8 port,
 	return 1;
 }
 
+static int upper_device_filter(struct ib_device *ib_dev, u8 port,
+			       struct net_device *idev, void *cookie)
+{
+	struct net_device *ndev = (struct net_device *)cookie;
+
+	return idev == ndev || is_upper_dev_rcu(idev, ndev);
+}
+
+static int bonding_slaves_filter(struct ib_device *ib_dev, u8 port,
+				 struct net_device *idev, void *cookie)
+{
+	struct net_device *rdev;
+	struct net_device *ndev = (struct net_device *)cookie;
+	int res;
+
+	rdev = rdma_vlan_dev_real_dev(ndev);
+
+	ndev = rdev ? rdev : ndev;
+	if (!idev || !IS_NETDEV_BONDING_MASTER(ndev))
+		return 0;
+
+	rcu_read_lock();
+	res = is_upper_dev_rcu(idev, ndev);
+	rcu_read_unlock();
+
+	return res;
+}
+
 static void netdevice_event_work_handler(struct work_struct *_work)
 {
 	struct netdev_event_work *work =
 		container_of(_work, struct netdev_event_work, work);
 	unsigned int i;
 
-	for (i = 0; i < ARRAY_SIZE(work->cmds) && work->cmds[i].cb; i++)
-		ib_enum_roce_ports_of_netdev(work->cmds[i].filter, work->ndev,
-					     work->cmds[i].cb, work->ndev);
+	for (i = 0; i < ARRAY_SIZE(work->cmds) && work->cmds[i].cb; i++) {
+		ib_enum_roce_ports_of_netdev(work->cmds[i].filter,
+					     work->cmds[i].f_ndev,
+					     work->cmds[i].cb,
+					     work->cmds[i].ndev);
+		dev_put(work->cmds[i].ndev);
+		dev_put(work->cmds[i].f_ndev);
+	}
 
-	dev_put(work->ndev);
 	kfree(work);
 }
 
@@ -186,8 +294,16 @@ static void enum_netdev_default_gids(struct ib_device *ib_dev,
 {
 	unsigned long gid_type_mask;
 
-	if (idev != ndev)
+	rcu_read_lock();
+	if (!idev ||
+	    ((idev != ndev && !is_upper_dev_rcu(idev, ndev)) ||
+	     is_eth_active_slave_of_bonding(idev,
+					    netdev_master_upper_dev_get_rcu(idev)) ==
+	     BONDING_SLAVE_STATE_INACTIVE)) {
+		rcu_read_unlock();
 		return;
+	}
+	rcu_read_unlock();
 
 	gid_type_mask = roce_gid_type_mask_support(ib_dev, port);
 
@@ -195,6 +311,37 @@ static void enum_netdev_default_gids(struct ib_device *ib_dev,
 				       ROCE_GID_CACHE_DEFAULT_MODE_SET);
 }
 
+static void bond_delete_netdev_default_gids(struct ib_device *ib_dev,
+					    u8 port, struct net_device *ndev,
+					    struct net_device *idev)
+{
+	struct net_device *rdev = rdma_vlan_dev_real_dev(ndev);
+
+	if (!idev)
+		return;
+
+	if (!rdev)
+		rdev = ndev;
+
+	rcu_read_lock();
+
+	if (is_upper_dev_rcu(idev, ndev) &&
+	    is_eth_active_slave_of_bonding(idev, rdev) ==
+	    BONDING_SLAVE_STATE_INACTIVE) {
+		unsigned long gid_type_mask;
+
+		rcu_read_unlock();
+
+		gid_type_mask = roce_gid_type_mask_support(ib_dev, port);
+
+		roce_gid_cache_set_default_gid(ib_dev, port, idev,
+					       gid_type_mask,
+					       ROCE_GID_CACHE_DEFAULT_MODE_DELETE);
+	} else {
+		rcu_read_unlock();
+	}
+}
+
 static void enum_netdev_ipv4_ips(struct ib_device *ib_dev,
 				 u8 port, struct net_device *ndev)
 {
@@ -290,6 +437,72 @@ static void del_netdev_ips(struct ib_device *ib_dev, u8 port,
 	roce_del_all_netdev_gids(ib_dev, port, ndev);
 }
 
+static void del_netdev_upper_ips(struct ib_device *ib_dev, u8 port,
+				 struct net_device *idev, void *cookie)
+{
+	struct net_device *ndev = (struct net_device *)cookie;
+	struct upper_list {
+		struct list_head list;
+		struct net_device *upper;
+	};
+	struct net_device *upper;
+	struct list_head *iter;
+	struct upper_list *upper_iter;
+	struct upper_list *upper_temp;
+	LIST_HEAD(upper_list);
+
+	rcu_read_lock();
+	netdev_for_each_all_upper_dev_rcu(ndev, upper, iter) {
+		struct upper_list *entry = kmalloc(sizeof(*entry),
+						   GFP_ATOMIC);
+
+		if (!entry) {
+			pr_info("roce_gid_mgmt: couldn't allocate entry to delete ndev\n");
+			continue;
+		}
+
+		list_add_tail(&entry->list, &upper_list);
+		dev_hold(upper);
+		entry->upper = upper;
+	}
+	rcu_read_unlock();
+
+	roce_del_all_netdev_gids(ib_dev, port, ndev);
+	list_for_each_entry_safe(upper_iter, upper_temp, &upper_list,
+				 list) {
+		roce_del_all_netdev_gids(ib_dev, port,
+					 upper_iter->upper);
+		dev_put(upper_iter->upper);
+		list_del(&upper_iter->list);
+		kfree(upper_iter);
+	}
+}
+
+static void del_netdev_default_ips_join(struct ib_device *ib_dev, u8 port,
+					struct net_device *idev, void *cookie)
+{
+	struct net_device *mdev;
+
+	rcu_read_lock();
+	mdev = netdev_master_upper_dev_get_rcu(idev);
+	if (mdev)
+		dev_hold(mdev);
+	rcu_read_unlock();
+
+	if (mdev) {
+		bond_delete_netdev_default_gids(ib_dev, port, mdev, idev);
+		dev_put(mdev);
+	}
+}
+
+static void del_netdev_default_ips(struct ib_device *ib_dev, u8 port,
+				   struct net_device *idev, void *cookie)
+{
+	struct net_device *ndev = (struct net_device *)cookie;
+
+	bond_delete_netdev_default_gids(ib_dev, port, ndev, idev);
+}
+
 static int netdevice_event(struct notifier_block *this, unsigned long event,
 			   void *ptr)
 {
@@ -297,9 +510,20 @@ static int netdevice_event(struct notifier_block *this, unsigned long event,
 		.cb = add_netdev_ips, .filter = is_eth_port_of_netdev};
 	static const struct netdev_event_work_cmd del_cmd = {
 		.cb = del_netdev_ips, .filter = pass_all_filter};
+	static const struct netdev_event_work_cmd bonding_default_del_cmd_join = {
+		.cb = del_netdev_default_ips_join, .filter = is_eth_port_inactive_slave};
+	static const struct netdev_event_work_cmd bonding_default_del_cmd = {
+		.cb = del_netdev_default_ips, .filter = is_eth_port_inactive_slave};
+	static const struct netdev_event_work_cmd default_del_cmd = {
+		.cb = del_netdev_default_ips, .filter = pass_all_filter};
+	static const struct netdev_event_work_cmd bonding_event_ips_del_cmd = {
+		.cb = del_netdev_ips, .filter = bonding_slaves_filter};
+	static const struct netdev_event_work_cmd upper_ips_del_cmd = {
+		.cb = del_netdev_upper_ips, .filter = upper_device_filter};
 	struct net_device *ndev = netdev_notifier_info_to_dev(ptr);
 	struct netdev_event_work *ndev_work;
 	struct netdev_event_work_cmd cmds[ROCE_NETDEV_CALLBACK_SZ] = { {NULL} };
+	unsigned int i;
 
 	if (ndev->type != ARPHRD_ETHER)
 		return NOTIFY_DONE;
@@ -307,7 +531,8 @@ static int netdevice_event(struct notifier_block *this, unsigned long event,
 	switch (event) {
 	case NETDEV_REGISTER:
 	case NETDEV_UP:
-		cmds[0] = add_cmd;
+		cmds[0] = bonding_default_del_cmd_join;
+		cmds[1] = add_cmd;
 		break;
 
 	case NETDEV_UNREGISTER:
@@ -318,9 +543,37 @@ static int netdevice_event(struct notifier_block *this, unsigned long event,
 		break;
 
 	case NETDEV_CHANGEADDR:
-		cmds[0] = del_cmd;
+		cmds[0] = default_del_cmd;
 		cmds[1] = add_cmd;
 		break;
+
+	case NETDEV_CHANGEUPPER:
+		{
+			struct netdev_changeupper_info *changeupper_info =
+				container_of(ptr, struct netdev_changeupper_info, info);
+
+			if (changeupper_info->event ==
+			    NETDEV_CHANGEUPPER_UNLINK) {
+				cmds[0] = upper_ips_del_cmd;
+				cmds[0].ndev = changeupper_info->upper;
+				cmds[1] = add_cmd;
+			} else if (changeupper_info->event ==
+				   NETDEV_CHANGEUPPER_LINK) {
+				cmds[0] = bonding_default_del_cmd;
+				cmds[0].ndev = changeupper_info->upper;
+				cmds[1] = add_cmd;
+				cmds[1].ndev = changeupper_info->upper;
+				cmds[1].f_ndev = changeupper_info->upper;
+			}
+		}
+	break;
+
+	case NETDEV_BONDING_FAILOVER:
+		cmds[0] = bonding_event_ips_del_cmd;
+		cmds[1] = bonding_default_del_cmd_join;
+		cmds[2] = add_cmd;
+		break;
+
 	default:
 		return NOTIFY_DONE;
 	}
@@ -332,8 +585,14 @@ static int netdevice_event(struct notifier_block *this, unsigned long event,
 	}
 
 	memcpy(ndev_work->cmds, cmds, sizeof(ndev_work->cmds));
-	ndev_work->ndev = ndev;
-	dev_hold(ndev);
+	for (i = 0; i < ARRAY_SIZE(ndev_work->cmds) && ndev_work->cmds[i].cb; i++) {
+		if (!ndev_work->cmds[i].ndev)
+			ndev_work->cmds[i].ndev = ndev;
+		if (!ndev_work->cmds[i].f_ndev)
+			ndev_work->cmds[i].f_ndev = ndev;
+		dev_hold(ndev_work->cmds[i].ndev);
+		dev_hold(ndev_work->cmds[i].f_ndev);
+	}
 	INIT_WORK(&ndev_work->work, netdevice_event_work_handler);
 
 	queue_work(roce_gid_mgmt_wq, &ndev_work->work);
diff --git a/drivers/net/bonding/bond_options.c b/drivers/net/bonding/bond_options.c
index 1a61cc9..42e30e5 100644
--- a/drivers/net/bonding/bond_options.c
+++ b/drivers/net/bonding/bond_options.c
@@ -690,19 +690,6 @@ static int bond_option_mode_set(struct bonding *bond,
 	return 0;
 }
 
-static struct net_device *__bond_option_active_slave_get(struct bonding *bond,
-							 struct slave *slave)
-{
-	return bond_uses_primary(bond) && slave ? slave->dev : NULL;
-}
-
-struct net_device *bond_option_active_slave_get_rcu(struct bonding *bond)
-{
-	struct slave *slave = rcu_dereference(bond->curr_active_slave);
-
-	return __bond_option_active_slave_get(bond, slave);
-}
-
 static int bond_option_active_slave_set(struct bonding *bond,
 					const struct bond_opt_value *newval)
 {
diff --git a/include/net/bonding.h b/include/net/bonding.h
index a124173..23952a6 100644
--- a/include/net/bonding.h
+++ b/include/net/bonding.h
@@ -307,6 +307,13 @@ static inline bool bond_uses_primary(struct bonding *bond)
 	return bond_mode_uses_primary(BOND_MODE(bond));
 }
 
+static inline struct net_device *bond_option_active_slave_get_rcu(struct bonding *bond)
+{
+	struct slave *slave = rcu_dereference(bond->curr_active_slave);
+
+	return bond_uses_primary(bond) && slave ? slave->dev : NULL;
+}
+
 static inline bool bond_slave_is_up(struct slave *slave)
 {
 	return netif_running(slave->dev) && netif_carrier_ok(slave->dev);
-- 
2.1.0


* [PATCH v3 for-next 08/33] IB/core: GID attribute should be returned from verbs API and cache API
       [not found] ` <1427318422-12004-1-git-send-email-somnath.kotur-laKkSmNT4hbQT0dZR+AlfA@public.gmane.org>
                     ` (6 preceding siblings ...)
  2015-03-25 21:19   ` [PATCH v3 for-next 07/33] IB/core: Add RoCE cache bonding support Somnath Kotur
@ 2015-03-25 21:19   ` Somnath Kotur
  2015-03-25 21:19   ` [PATCH v3 for-next 09/33] IB/core: Report gid_type and gid_ndev through sysfs Somnath Kotur
                     ` (24 subsequent siblings)
  32 siblings, 0 replies; 82+ messages in thread
From: Somnath Kotur @ 2015-03-25 21:19 UTC (permalink / raw)
  To: roland-DgEjT+Ai2ygdnm+yROfE0A
  Cc: linux-rdma-u79uwXL29TY76Z2rM5mHXA, Matan Barak, Somnath Kotur

From: Matan Barak <matanb-VPRAkNaXOzVWk0Htik3J/w@public.gmane.org>

Along with the GID itself, we now store the GID's attributes.
These attributes carry important meta information regarding
the GID, for example its netdevice. Thus, this information
needs to be returned from the APIs as well. This patch changes
the following APIs:
(a) ib_get_cached_gid
(b) ib_find_cached_gid
(c) ib_find_cached_gid_by_port
(d) ib_query_gid

It also updates the callers of those APIs to use the RoCE
GID cache when needed.
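
A hypothetical RoCE-side caller (names below are illustrative, not
from this patch) now receives the attributes alongside the GID, while
IB callers simply pass NULL for the new parameter:

static void dump_first_gid(struct ib_device *device, u8 port_num)
{
	union ib_gid gid;
	struct ib_gid_attr attr;

	/* attr may only be non-NULL on ports served by the RoCE cache */
	if (ib_query_gid(device, port_num, 0, &gid, &attr))
		return;

	pr_info("%s port %u gid %pI6 type %u ndev %s\n",
		device->name, port_num, gid.raw, attr.gid_type,
		attr.ndev ? attr.ndev->name : "none");
}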

Signed-off-by: Matan Barak <matanb-VPRAkNaXOzVWk0Htik3J/w@public.gmane.org>
Signed-off-by: Somnath Kotur <somnath.kotur-laKkSmNT4hbQT0dZR+AlfA@public.gmane.org>
---
 drivers/infiniband/core/cache.c                | 225 +++++++++++++++++++++----
 drivers/infiniband/core/cm.c                   |   6 +-
 drivers/infiniband/core/cma.c                  |  84 ++++++---
 drivers/infiniband/core/device.c               |  29 +++-
 drivers/infiniband/core/mad.c                  |   2 +-
 drivers/infiniband/core/multicast.c            |   3 +-
 drivers/infiniband/core/sa_query.c             |   7 +-
 drivers/infiniband/core/sysfs.c                |   2 +-
 drivers/infiniband/core/uverbs_marshall.c      |   4 +-
 drivers/infiniband/core/verbs.c                |   7 +-
 drivers/infiniband/hw/mlx4/qp.c                |   5 +-
 drivers/infiniband/hw/mthca/mthca_av.c         |   2 +-
 drivers/infiniband/ulp/ipoib/ipoib_main.c      |   2 +-
 drivers/infiniband/ulp/ipoib/ipoib_multicast.c |   2 +-
 drivers/infiniband/ulp/srp/ib_srp.c            |   2 +-
 drivers/infiniband/ulp/srpt/ib_srpt.c          |   3 +-
 include/rdma/ib_cache.h                        |  44 ++++-
 include/rdma/ib_sa.h                           |   4 +-
 include/rdma/ib_verbs.h                        |   7 +-
 19 files changed, 352 insertions(+), 88 deletions(-)

diff --git a/drivers/infiniband/core/cache.c b/drivers/infiniband/core/cache.c
index 80f6cf2..882d491 100644
--- a/drivers/infiniband/core/cache.c
+++ b/drivers/infiniband/core/cache.c
@@ -42,6 +42,8 @@
 
 #include "core_priv.h"
 
+#define __IB_ONLY
+
 struct ib_pkey_cache {
 	int             table_len;
 	u16             table[0];
@@ -69,16 +71,16 @@ static inline int end_port(struct ib_device *device)
 		0 : device->phys_port_cnt;
 }
 
-int ib_get_cached_gid(struct ib_device *device,
-		      u8                port_num,
-		      int               index,
-		      union ib_gid     *gid)
+static int __IB_ONLY __ib_get_cached_gid(struct ib_device *device,
+					 u8                port_num,
+					 int               index,
+					 union ib_gid     *gid)
 {
 	struct ib_gid_cache *cache;
 	unsigned long flags;
 	int ret = 0;
 
-	if (port_num < start_port(device) || port_num > end_port(device))
+	if (!device->cache.gid_cache)
 		return -EINVAL;
 
 	read_lock_irqsave(&device->cache.lock, flags);
@@ -94,43 +96,183 @@ int ib_get_cached_gid(struct ib_device *device,
 
 	return ret;
 }
+
+int ib_cache_use_roce_gid_cache(struct ib_device *device, u8 port_num)
+{
+	if (rdma_port_get_link_layer(device, port_num) ==
+	    IB_LINK_LAYER_ETHERNET) {
+		if (device->cache.roce_gid_cache)
+			return 0;
+		else
+			return -EAGAIN;
+	}
+
+	return -EINVAL;
+}
+EXPORT_SYMBOL(ib_cache_use_roce_gid_cache);
+
+int ib_get_cached_gid(struct ib_device *device,
+		      u8                port_num,
+		      int               index,
+		      union ib_gid     *gid,
+		      struct ib_gid_attr *attr)
+{
+	int ret;
+
+	if (port_num < start_port(device) || port_num > end_port(device))
+		return -EINVAL;
+
+	ret = ib_cache_use_roce_gid_cache(device, port_num);
+	if (!ret)
+		return roce_gid_cache_get_gid(device, port_num, index, gid,
+					      attr);
+
+	if (ret == -EAGAIN)
+		return ret;
+
+	ret = __ib_get_cached_gid(device, port_num, index, gid);
+
+	if (!ret && attr) {
+		memset(attr, 0, sizeof(*attr));
+		attr->gid_type = IB_GID_TYPE_IB;
+	}
+
+	return ret;
+}
 EXPORT_SYMBOL(ib_get_cached_gid);
 
-int ib_find_cached_gid(struct ib_device *device,
-		       union ib_gid	*gid,
-		       u8               *port_num,
-		       u16              *index)
+static int __IB_ONLY ___ib_find_cached_gid_by_port(struct ib_device *device,
+						   u8               port_num,
+						   const union ib_gid *gid,
+						   u16              *index)
 {
 	struct ib_gid_cache *cache;
+	u8 p = port_num - start_port(device);
+	int i;
+
+	if (!ib_cache_use_roce_gid_cache(device, port_num))
+		return -ENOSYS;
+
+	cache = device->cache.gid_cache[p];
+	for (i = 0; i < cache->table_len; ++i) {
+		if (!memcmp(gid, &cache->table[i], sizeof(*gid))) {
+			if (index)
+				*index = i;
+			return 0;
+		}
+	}
+
+	return -ENOENT;
+}
+
+static int __IB_ONLY __ib_find_cached_gid_by_port(struct ib_device *device,
+						  u8		    port_num,
+						  union ib_gid     *gid,
+						  u16              *index)
+{
+	unsigned long flags;
+	u16 found_index;
+	int ret;
+
+	if (index)
+		*index = -1;
+
+	read_lock_irqsave(&device->cache.lock, flags);
+
+	ret = ___ib_find_cached_gid_by_port(device, port_num, gid,
+					    &found_index);
+
+	read_unlock_irqrestore(&device->cache.lock, flags);
+
+	if (!ret && index)
+		*index = found_index;
+
+	return ret;
+}
+
+static int __IB_ONLY __ib_find_cached_gid(struct ib_device *device,
+					  union ib_gid     *gid,
+					  u8               *port_num,
+					  u16              *index)
+{
 	unsigned long flags;
-	int p, i;
+	u16 found_index;
+	int p;
 	int ret = -ENOENT;
 
-	*port_num = -1;
+	if (port_num)
+		*port_num = -1;
 	if (index)
 		*index = -1;
 
 	read_lock_irqsave(&device->cache.lock, flags);
 
-	for (p = 0; p <= end_port(device) - start_port(device); ++p) {
-		cache = device->cache.gid_cache[p];
-		for (i = 0; i < cache->table_len; ++i) {
-			if (!memcmp(gid, &cache->table[i], sizeof *gid)) {
-				*port_num = p + start_port(device);
-				if (index)
-					*index = i;
-				ret = 0;
-				goto found;
-			}
+	for (p = start_port(device); p <= end_port(device); ++p) {
+		if (!___ib_find_cached_gid_by_port(device, p, gid,
+						   &found_index)) {
+			if (port_num)
+				*port_num = p;
+			ret = 0;
+			break;
 		}
 	}
-found:
+
 	read_unlock_irqrestore(&device->cache.lock, flags);
 
+	if (!ret && index)
+		*index = found_index;
+
+	return ret;
+}
+
+int ib_find_cached_gid(struct ib_device *device,
+		       union ib_gid	*gid,
+		       enum ib_gid_type gid_type,
+		       struct net	*net,
+		       int		if_index,
+		       u8               *port_num,
+		       u16              *index)
+{
+	int ret = -ENOENT;
+
+	/* Look for a RoCE device with the specified GID. */
+	if (device->cache.roce_gid_cache)
+		ret = roce_gid_cache_find_gid(device, gid, gid_type, net,
+					      if_index, port_num, index);
+
+	/* If no RoCE devices with the specified GID, look for IB device. */
+	if (ret && gid_type == IB_GID_TYPE_IB)
+		ret =  __ib_find_cached_gid(device, gid, port_num, index);
+
 	return ret;
 }
 EXPORT_SYMBOL(ib_find_cached_gid);
 
+int ib_find_cached_gid_by_port(struct ib_device *device,
+			       union ib_gid	*gid,
+			       enum ib_gid_type gid_type,
+			       u8               port_num,
+			       struct net	*net,
+			       int		if_index,
+			       u16              *index)
+{
+	int ret = -ENOENT;
+
+	/* Look for a RoCE device with the specified GID. */
+	if (!ib_cache_use_roce_gid_cache(device, port_num))
+		return roce_gid_cache_find_gid_by_port(device, gid, gid_type,
+						       port_num, net, if_index,
+						       index);
+
+	/* If no RoCE devices with the specified GID, look for IB device. */
+	if (gid_type == IB_GID_TYPE_IB)
+		ret = __ib_find_cached_gid_by_port(device, port_num,
+						   gid, index);
+
+	return ret;
+}
+EXPORT_SYMBOL(ib_find_cached_gid_by_port);
+
 int ib_get_cached_pkey(struct ib_device *device,
 		       u8                port_num,
 		       int               index,
@@ -254,9 +396,12 @@ static void ib_cache_update(struct ib_device *device,
 {
 	struct ib_port_attr       *tprops = NULL;
 	struct ib_pkey_cache      *pkey_cache = NULL, *old_pkey_cache;
-	struct ib_gid_cache       *gid_cache = NULL, *old_gid_cache;
+	struct ib_gid_cache       *gid_cache = NULL, *old_gid_cache = NULL;
 	int                        i;
 	int                        ret;
+	bool			   use_roce_gid_cache =
+					!ib_cache_use_roce_gid_cache(device,
+								     port);
 
 	tprops = kmalloc(sizeof *tprops, GFP_KERNEL);
 	if (!tprops)
@@ -276,12 +421,14 @@ static void ib_cache_update(struct ib_device *device,
 
 	pkey_cache->table_len = tprops->pkey_tbl_len;
 
-	gid_cache = kmalloc(sizeof *gid_cache + tprops->gid_tbl_len *
-			    sizeof *gid_cache->table, GFP_KERNEL);
-	if (!gid_cache)
-		goto err;
+	if (!use_roce_gid_cache) {
+		gid_cache = kmalloc(sizeof(*gid_cache) + tprops->gid_tbl_len *
+			    sizeof(*gid_cache->table), GFP_KERNEL);
+		if (!gid_cache)
+			goto err;
 
-	gid_cache->table_len = tprops->gid_tbl_len;
+		gid_cache->table_len = tprops->gid_tbl_len;
+	}
 
 	for (i = 0; i < pkey_cache->table_len; ++i) {
 		ret = ib_query_pkey(device, port, i, pkey_cache->table + i);
@@ -292,22 +439,28 @@ static void ib_cache_update(struct ib_device *device,
 		}
 	}
 
-	for (i = 0; i < gid_cache->table_len; ++i) {
-		ret = ib_query_gid(device, port, i, gid_cache->table + i);
-		if (ret) {
-			printk(KERN_WARNING "ib_query_gid failed (%d) for %s (index %d)\n",
-			       ret, device->name, i);
-			goto err;
+	if (!use_roce_gid_cache) {
+		for (i = 0;  i < gid_cache->table_len; ++i) {
+			ret = ib_query_gid(device, port, i,
+					   gid_cache->table + i, NULL);
+			if (ret) {
+				printk(KERN_WARNING "ib_query_gid failed (%d) for %s (index %d)\n",
+				       ret, device->name, i);
+				goto err;
+			}
 		}
 	}
 
 	write_lock_irq(&device->cache.lock);
 
 	old_pkey_cache = device->cache.pkey_cache[port - start_port(device)];
-	old_gid_cache  = device->cache.gid_cache [port - start_port(device)];
+	if (!use_roce_gid_cache)
+		old_gid_cache  =
+			device->cache.gid_cache[port - start_port(device)];
 
 	device->cache.pkey_cache[port - start_port(device)] = pkey_cache;
-	device->cache.gid_cache [port - start_port(device)] = gid_cache;
+	if (!use_roce_gid_cache)
+		device->cache.gid_cache[port - start_port(device)] = gid_cache;
 
 	device->cache.lmc_cache[port - start_port(device)] = tprops->lmc;
 
diff --git a/drivers/infiniband/core/cm.c b/drivers/infiniband/core/cm.c
index e28a494..d88f2ae 100644
--- a/drivers/infiniband/core/cm.c
+++ b/drivers/infiniband/core/cm.c
@@ -360,6 +360,8 @@ static int cm_init_av_by_path(struct ib_sa_path_rec *path, struct cm_av *av)
 	read_lock_irqsave(&cm.device_lock, flags);
 	list_for_each_entry(cm_dev, &cm.device_list, list) {
 		if (!ib_find_cached_gid(cm_dev->ib_device, &path->sgid,
+					IB_GID_TYPE_IB, path->net,
+					path->ifindex,
 					&p, NULL)) {
 			port = cm_dev->port[p-1];
 			break;
@@ -379,7 +381,6 @@ static int cm_init_av_by_path(struct ib_sa_path_rec *path, struct cm_av *av)
 	ib_init_ah_from_path(cm_dev->ib_device, port->port_num, path,
 			     &av->ah_attr);
 	av->timeout = path->packet_life_time + 1;
-	memcpy(av->smac, path->smac, sizeof(av->smac));
 
 	av->valid = 1;
 	return 0;
@@ -1566,7 +1567,8 @@ static int cm_req_handler(struct cm_work *work)
 	ret = cm_init_av_by_path(&work->path[0], &cm_id_priv->av);
 	if (ret) {
 		ib_get_cached_gid(work->port->cm_dev->ib_device,
-				  work->port->port_num, 0, &work->path[0].sgid);
+				  work->port->port_num, 0, &work->path[0].sgid,
+				  NULL);
 		ib_send_cm_rej(cm_id, IB_CM_REJ_INVALID_GID,
 			       &work->path[0].sgid, sizeof work->path[0].sgid,
 			       NULL, 0);
diff --git a/drivers/infiniband/core/cma.c b/drivers/infiniband/core/cma.c
index d570030..335def9 100644
--- a/drivers/infiniband/core/cma.c
+++ b/drivers/infiniband/core/cma.c
@@ -356,7 +356,7 @@ static int cma_acquire_dev(struct rdma_id_private *id_priv,
 	struct cma_device *cma_dev;
 	union ib_gid gid, iboe_gid;
 	int ret = -ENODEV;
-	u8 port, found_port;
+	u8 port;
 	enum rdma_link_layer dev_ll = dev_addr->dev_type == ARPHRD_INFINIBAND ?
 		IB_LINK_LAYER_INFINIBAND : IB_LINK_LAYER_ETHERNET;
 
@@ -375,16 +375,28 @@ static int cma_acquire_dev(struct rdma_id_private *id_priv,
 				     listen_id_priv->id.port_num) == dev_ll) {
 		cma_dev = listen_id_priv->cma_dev;
 		port = listen_id_priv->id.port_num;
-		if (rdma_node_get_transport(cma_dev->device->node_type) == RDMA_TRANSPORT_IB &&
-		    rdma_port_get_link_layer(cma_dev->device, port) == IB_LINK_LAYER_ETHERNET)
-			ret = ib_find_cached_gid(cma_dev->device, &iboe_gid,
-						 &found_port, NULL);
-		else
-			ret = ib_find_cached_gid(cma_dev->device, &gid,
-						 &found_port, NULL);
+		if (rdma_node_get_transport(cma_dev->device->node_type) ==
+		    RDMA_TRANSPORT_IB &&
+		    rdma_port_get_link_layer(cma_dev->device, port) ==
+		    IB_LINK_LAYER_ETHERNET) {
+			int if_index =
+				id_priv->id.route.addr.dev_addr.bound_dev_if;
+
+			ret = ib_find_cached_gid_by_port(cma_dev->device,
+							 &iboe_gid,
+							 IB_GID_TYPE_IB,
+							 port,
+							 &init_net,
+							 if_index,
+							 NULL);
+		} else {
+			ret = ib_find_cached_gid_by_port(cma_dev->device, &gid,
+							 IB_GID_TYPE_IB, port,
+							 NULL, 0, NULL);
+		}
 
-		if (!ret && (port  == found_port)) {
-			id_priv->id.port_num = found_port;
+		if (!ret) {
+			id_priv->id.port_num = port;
 			goto out;
 		}
 	}
@@ -394,15 +406,34 @@ static int cma_acquire_dev(struct rdma_id_private *id_priv,
 			    listen_id_priv->cma_dev == cma_dev &&
 			    listen_id_priv->id.port_num == port)
 				continue;
-			if (rdma_port_get_link_layer(cma_dev->device, port) == dev_ll) {
-				if (rdma_node_get_transport(cma_dev->device->node_type) == RDMA_TRANSPORT_IB &&
-				    rdma_port_get_link_layer(cma_dev->device, port) == IB_LINK_LAYER_ETHERNET)
-					ret = ib_find_cached_gid(cma_dev->device, &iboe_gid, &found_port, NULL);
-				else
-					ret = ib_find_cached_gid(cma_dev->device, &gid, &found_port, NULL);
-
-				if (!ret && (port == found_port)) {
-					id_priv->id.port_num = found_port;
+			if (rdma_port_get_link_layer(cma_dev->device, port) ==
+			    dev_ll) {
+				if (rdma_node_get_transport(cma_dev->device->node_type) ==
+				    RDMA_TRANSPORT_IB &&
+				    rdma_port_get_link_layer(cma_dev->device, port) ==
+				    IB_LINK_LAYER_ETHERNET) {
+					int if_index =
+						id_priv->id.route.addr.dev_addr.bound_dev_if;
+
+					ret = ib_find_cached_gid_by_port(cma_dev->device,
+									 &iboe_gid,
+									 IB_GID_TYPE_IB,
+									 port,
+									 &init_net,
+									 if_index,
+									 NULL);
+				} else {
+					ret = ib_find_cached_gid_by_port(cma_dev->device,
+									 &gid,
+									 IB_GID_TYPE_IB,
+									 port,
+									 NULL,
+									 0,
+									 NULL);
+				}
+
+				if (!ret) {
+					id_priv->id.port_num = port;
 					goto out;
 				}
 			}
@@ -442,7 +473,9 @@ static int cma_resolve_ib_dev(struct rdma_id_private *id_priv)
 			if (ib_find_cached_pkey(cur_dev->device, p, pkey, &index))
 				continue;
 
-			for (i = 0; !ib_get_cached_gid(cur_dev->device, p, i, &gid); i++) {
+			for (i = 0; !ib_get_cached_gid(cur_dev->device, p, i,
+						       &gid, NULL);
+			     i++) {
 				if (!memcmp(&gid, dgid, sizeof(gid))) {
 					cma_dev = cur_dev;
 					sgid = gid;
@@ -629,7 +662,7 @@ static int cma_modify_qp_rtr(struct rdma_id_private *id_priv,
 		goto out;
 
 	ret = ib_query_gid(id_priv->id.device, id_priv->id.port_num,
-			   qp_attr.ah_attr.grh.sgid_index, &sgid);
+			   qp_attr.ah_attr.grh.sgid_index, &sgid, NULL);
 	if (ret)
 		goto out;
 
@@ -1908,16 +1941,17 @@ static int cma_resolve_iboe_route(struct rdma_id_private *id_priv)
 
 	route->num_paths = 1;
 
-	if (addr->dev_addr.bound_dev_if)
+	if (addr->dev_addr.bound_dev_if) {
 		ndev = dev_get_by_index(&init_net, addr->dev_addr.bound_dev_if);
+		route->path_rec->net = &init_net;
+		route->path_rec->ifindex = addr->dev_addr.bound_dev_if;
+	}
 	if (!ndev) {
 		ret = -ENODEV;
 		goto err2;
 	}
 
-	route->path_rec->vlan_id = rdma_vlan_dev_vlan_id(ndev);
 	memcpy(route->path_rec->dmac, addr->dev_addr.dst_dev_addr, ETH_ALEN);
-	memcpy(route->path_rec->smac, ndev->dev_addr, ndev->addr_len);
 
 	rdma_ip2gid((struct sockaddr *)&id_priv->id.route.addr.src_addr,
 		    &route->path_rec->sgid);
@@ -2051,7 +2085,7 @@ static int cma_bind_loopback(struct rdma_id_private *id_priv)
 	p = 1;
 
 port_found:
-	ret = ib_get_cached_gid(cma_dev->device, p, 0, &gid);
+	ret = ib_get_cached_gid(cma_dev->device, p, 0, &gid, NULL);
 	if (ret)
 		goto out;
 
diff --git a/drivers/infiniband/core/device.c b/drivers/infiniband/core/device.c
index 5ce57bf..d42bbda 100644
--- a/drivers/infiniband/core/device.c
+++ b/drivers/infiniband/core/device.c
@@ -40,6 +40,7 @@
 #include <linux/mutex.h>
 #include <rdma/rdma_netlink.h>
 #include <rdma/ib_addr.h>
+#include <rdma/ib_cache.h>
 
 #include "core_priv.h"
 
@@ -630,12 +631,21 @@ EXPORT_SYMBOL(ib_query_port);
  * @port_num:Port number to query
  * @index:GID table index to query
  * @gid:Returned GID
+ * @attr: Returned GID's attribute (only in RoCE)
  *
  * ib_query_gid() fetches the specified GID table entry.
  */
 int ib_query_gid(struct ib_device *device,
-		 u8 port_num, int index, union ib_gid *gid)
+		 u8 port_num, int index, union ib_gid *gid,
+		 struct ib_gid_attr *attr)
 {
+	if (!ib_cache_use_roce_gid_cache(device, port_num))
+		return roce_gid_cache_get_gid(device, port_num, index, gid,
+					      attr);
+
+	if (attr)
+		return -EINVAL;
+
 	return device->query_gid(device, port_num, index, gid);
 }
 EXPORT_SYMBOL(ib_query_gid);
@@ -784,19 +794,32 @@ EXPORT_SYMBOL(ib_modify_port);
  *   a specified GID value occurs.
  * @device: The device to query.
  * @gid: The GID value to search for.
+ * @gid_type: Type of GID.
+ * @net: The namespace to search this GID in (RoCE only).
+ *	 Valid only if if_index != 0.
+ * @if_index: The if_index assigned with this GID (RoCE only).
  * @port_num: The port number of the device where the GID value was found.
  * @index: The index into the GID table where the GID was found.  This
  *   parameter may be NULL.
  */
 int ib_find_gid(struct ib_device *device, union ib_gid *gid,
-		u8 *port_num, u16 *index)
+		enum ib_gid_type gid_type, struct net *net,
+		int if_index, u8 *port_num, u16 *index)
 {
 	union ib_gid tmp_gid;
 	int ret, port, i;
 
+	if (device->cache.roce_gid_cache &&
+	    !roce_gid_cache_find_gid(device, gid, gid_type, net, if_index,
+				     port_num, index))
+		return 0;
+
 	for (port = start_port(device); port <= end_port(device); ++port) {
+		if (!ib_cache_use_roce_gid_cache(device, port))
+			continue;
+
 		for (i = 0; i < device->gid_tbl_len[port - start_port(device)]; ++i) {
-			ret = ib_query_gid(device, port, i, &tmp_gid);
+			ret = ib_query_gid(device, port, i, &tmp_gid, NULL);
 			if (ret)
 				return ret;
 			if (!memcmp(&tmp_gid, gid, sizeof *gid)) {
diff --git a/drivers/infiniband/core/mad.c b/drivers/infiniband/core/mad.c
index 74c30f4..5d59cce 100644
--- a/drivers/infiniband/core/mad.c
+++ b/drivers/infiniband/core/mad.c
@@ -1791,7 +1791,7 @@ static inline int rcv_has_same_gid(struct ib_mad_agent_private *mad_agent_priv,
 					  ((1 << lmc) - 1)));
 		} else {
 			if (ib_get_cached_gid(device, port_num,
-					      attr.grh.sgid_index, &sgid))
+					      attr.grh.sgid_index, &sgid, NULL))
 				return 0;
 			return !memcmp(sgid.raw, rwc->recv_buf.grh->dgid.raw,
 				       16);
diff --git a/drivers/infiniband/core/multicast.c b/drivers/infiniband/core/multicast.c
index fa17b55..f1927f1 100644
--- a/drivers/infiniband/core/multicast.c
+++ b/drivers/infiniband/core/multicast.c
@@ -729,7 +729,8 @@ int ib_init_ah_from_mcmember(struct ib_device *device, u8 port_num,
 	u16 gid_index;
 	u8 p;
 
-	ret = ib_find_cached_gid(device, &rec->port_gid, &p, &gid_index);
+	ret = ib_find_cached_gid(device, &rec->port_gid, IB_GID_TYPE_IB,
+				 NULL, 0, &p, &gid_index);
 	if (ret)
 		return ret;
 
diff --git a/drivers/infiniband/core/sa_query.c b/drivers/infiniband/core/sa_query.c
index c38f030..5b20237 100644
--- a/drivers/infiniband/core/sa_query.c
+++ b/drivers/infiniband/core/sa_query.c
@@ -546,7 +546,8 @@ int ib_init_ah_from_path(struct ib_device *device, u8 port_num,
 		ah_attr->ah_flags = IB_AH_GRH;
 		ah_attr->grh.dgid = rec->dgid;
 
-		ret = ib_find_cached_gid(device, &rec->sgid, &port_num,
+		ret = ib_find_cached_gid(device, &rec->sgid, IB_GID_TYPE_IB,
+					 rec->net, rec->ifindex, &port_num,
 					 &gid_index);
 		if (ret)
 			return ret;
@@ -677,9 +678,9 @@ static void ib_sa_path_rec_callback(struct ib_sa_query *sa_query,
 
 		ib_unpack(path_rec_table, ARRAY_SIZE(path_rec_table),
 			  mad->data, &rec);
-		rec.vlan_id = 0xffff;
+		rec.net = NULL;
+		rec.ifindex = 0;
 		memset(rec.dmac, 0, ETH_ALEN);
-		memset(rec.smac, 0, ETH_ALEN);
 		query->callback(status, &rec, query->context);
 	} else
 		query->callback(status, NULL, query->context);
diff --git a/drivers/infiniband/core/sysfs.c b/drivers/infiniband/core/sysfs.c
index cbd0383..5cee246 100644
--- a/drivers/infiniband/core/sysfs.c
+++ b/drivers/infiniband/core/sysfs.c
@@ -289,7 +289,7 @@ static ssize_t show_port_gid(struct ib_port *p, struct port_attribute *attr,
 	union ib_gid gid;
 	ssize_t ret;
 
-	ret = ib_query_gid(p->ibdev, p->port_num, tab_attr->index, &gid);
+	ret = ib_query_gid(p->ibdev, p->port_num, tab_attr->index, &gid, NULL);
 	if (ret)
 		return ret;
 
diff --git a/drivers/infiniband/core/uverbs_marshall.c b/drivers/infiniband/core/uverbs_marshall.c
index abd9724..7d2f14c 100644
--- a/drivers/infiniband/core/uverbs_marshall.c
+++ b/drivers/infiniband/core/uverbs_marshall.c
@@ -141,8 +141,8 @@ void ib_copy_path_rec_from_user(struct ib_sa_path_rec *dst,
 	dst->preference		= src->preference;
 	dst->packet_life_time_selector = src->packet_life_time_selector;
 
-	memset(dst->smac, 0, sizeof(dst->smac));
 	memset(dst->dmac, 0, sizeof(dst->dmac));
-	dst->vlan_id = 0xffff;
+	dst->net = NULL;
+	dst->ifindex = 0;
 }
 EXPORT_SYMBOL(ib_copy_path_rec_from_user);
diff --git a/drivers/infiniband/core/verbs.c b/drivers/infiniband/core/verbs.c
index f93eb8d..1fe3e71 100644
--- a/drivers/infiniband/core/verbs.c
+++ b/drivers/infiniband/core/verbs.c
@@ -229,8 +229,8 @@ int ib_init_ah_from_wc(struct ib_device *device, u8 port_num, struct ib_wc *wc,
 		ah_attr->ah_flags = IB_AH_GRH;
 		ah_attr->grh.dgid = grh->sgid;
 
-		ret = ib_find_cached_gid(device, &grh->dgid, &port_num,
-					 &gid_index);
+		ret = ib_find_cached_gid(device, &grh->dgid, IB_GID_TYPE_IB,
+					 NULL, 0, &port_num, &gid_index);
 		if (ret)
 			return ret;
 
@@ -873,7 +873,8 @@ int ib_resolve_eth_l2_attrs(struct ib_qp *qp,
 	if ((*qp_attr_mask & IB_QP_AV)  &&
 	    (rdma_port_get_link_layer(qp->device, qp_attr->ah_attr.port_num) == IB_LINK_LAYER_ETHERNET)) {
 		ret = ib_query_gid(qp->device, qp_attr->ah_attr.port_num,
-				   qp_attr->ah_attr.grh.sgid_index, &sgid);
+				   qp_attr->ah_attr.grh.sgid_index, &sgid,
+				   NULL);
 		if (ret)
 			goto out;
 		if (rdma_link_local_addr((struct in6_addr *)qp_attr->ah_attr.grh.dgid.raw)) {
diff --git a/drivers/infiniband/hw/mlx4/qp.c b/drivers/infiniband/hw/mlx4/qp.c
index 0a11f8e..b9ed4f1 100644
--- a/drivers/infiniband/hw/mlx4/qp.c
+++ b/drivers/infiniband/hw/mlx4/qp.c
@@ -2156,7 +2156,8 @@ static int build_mlx_header(struct mlx4_ib_sqp *sqp, struct ib_send_wr *wr,
 		} else  {
 			err = ib_get_cached_gid(ib_dev,
 						be32_to_cpu(ah->av.ib.port_pd) >> 24,
-						ah->av.ib.gid_index, &sgid);
+						ah->av.ib.gid_index, &sgid,
+						NULL);
 			if (err)
 				return err;
 		}
@@ -2198,7 +2199,7 @@ static int build_mlx_header(struct mlx4_ib_sqp *sqp, struct ib_send_wr *wr,
 			ib_get_cached_gid(ib_dev,
 					  be32_to_cpu(ah->av.ib.port_pd) >> 24,
 					  ah->av.ib.gid_index,
-					  &sqp->ud_header.grh.source_gid);
+					  &sqp->ud_header.grh.source_gid, NULL);
 		}
 		memcpy(sqp->ud_header.grh.destination_gid.raw,
 		       ah->av.ib.dgid, 16);
diff --git a/drivers/infiniband/hw/mthca/mthca_av.c b/drivers/infiniband/hw/mthca/mthca_av.c
index 32f6c63..bcac294 100644
--- a/drivers/infiniband/hw/mthca/mthca_av.c
+++ b/drivers/infiniband/hw/mthca/mthca_av.c
@@ -281,7 +281,7 @@ int mthca_read_ah(struct mthca_dev *dev, struct mthca_ah *ah,
 		ib_get_cached_gid(&dev->ib_dev,
 				  be32_to_cpu(ah->av->port_pd) >> 24,
 				  ah->av->gid_index % dev->limits.gid_table_len,
-				  &header->grh.source_gid);
+				  &header->grh.source_gid, NULL);
 		memcpy(header->grh.destination_gid.raw,
 		       ah->av->dgid, 16);
 	}
diff --git a/drivers/infiniband/ulp/ipoib/ipoib_main.c b/drivers/infiniband/ulp/ipoib/ipoib_main.c
index 58b5aa3..fa2899a 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib_main.c
+++ b/drivers/infiniband/ulp/ipoib/ipoib_main.c
@@ -1580,7 +1580,7 @@ static struct net_device *ipoib_add_port(const char *format,
 	priv->dev->broadcast[8] = priv->pkey >> 8;
 	priv->dev->broadcast[9] = priv->pkey & 0xff;
 
-	result = ib_query_gid(hca, port, 0, &priv->local_gid);
+	result = ib_query_gid(hca, port, 0, &priv->local_gid, NULL);
 	if (result) {
 		printk(KERN_WARNING "%s: ib_query_gid port %d failed (ret = %d)\n",
 		       hca->name, port, result);
diff --git a/drivers/infiniband/ulp/ipoib/ipoib_multicast.c b/drivers/infiniband/ulp/ipoib/ipoib_multicast.c
index ffb83b5..74ceed6 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib_multicast.c
+++ b/drivers/infiniband/ulp/ipoib/ipoib_multicast.c
@@ -531,7 +531,7 @@ void ipoib_mcast_join_task(struct work_struct *work)
 	}
 	priv->local_lid = port_attr.lid;
 
-	if (ib_query_gid(priv->ca, priv->port, 0, &priv->local_gid))
+	if (ib_query_gid(priv->ca, priv->port, 0, &priv->local_gid, NULL))
 		ipoib_warn(priv, "ib_query_gid() failed\n");
 	else
 		memcpy(priv->dev->dev_addr + 4, priv->local_gid.raw, sizeof (union ib_gid));
diff --git a/drivers/infiniband/ulp/srp/ib_srp.c b/drivers/infiniband/ulp/srp/ib_srp.c
index 0747c05..e72d92f 100644
--- a/drivers/infiniband/ulp/srp/ib_srp.c
+++ b/drivers/infiniband/ulp/srp/ib_srp.c
@@ -3205,7 +3205,7 @@ static ssize_t srp_create_target(struct device *dev,
 	INIT_WORK(&target->tl_err_work, srp_tl_err_work);
 	INIT_WORK(&target->remove_work, srp_remove_work);
 	spin_lock_init(&target->lock);
-	ret = ib_query_gid(ibdev, host->port, 0, &target->sgid);
+	ret = ib_query_gid(ibdev, host->port, 0, &target->sgid, NULL);
 	if (ret)
 		goto err;
 
diff --git a/drivers/infiniband/ulp/srpt/ib_srpt.c b/drivers/infiniband/ulp/srpt/ib_srpt.c
index eb694dd..e174bd2 100644
--- a/drivers/infiniband/ulp/srpt/ib_srpt.c
+++ b/drivers/infiniband/ulp/srpt/ib_srpt.c
@@ -547,7 +547,8 @@ static int srpt_refresh_port(struct srpt_port *sport)
 	sport->sm_lid = port_attr.sm_lid;
 	sport->lid = port_attr.lid;
 
-	ret = ib_query_gid(sport->sdev->device, sport->port, 0, &sport->gid);
+	ret = ib_query_gid(sport->sdev->device, sport->port, 0, &sport->gid,
+			   NULL);
 	if (ret)
 		goto err_query_port;
 
diff --git a/include/rdma/ib_cache.h b/include/rdma/ib_cache.h
index ad9a3c2..36b72bf 100644
--- a/include/rdma/ib_cache.h
+++ b/include/rdma/ib_cache.h
@@ -36,6 +36,17 @@
 #define _IB_CACHE_H
 
 #include <rdma/ib_verbs.h>
+#include <net/net_namespace.h>
+
+/**
+ * ib_cache_use_roce_gid_cache - Returns whether the device uses roce gid cache
+ * @device: The device to query
+ * @port_num: The port number of the device to query.
+ *
+ * ib_cache_use_roce_gid_cache() returns 0 if this port uses the roce_gid_cache
+ * to store GIDs and an error otherwise.
+ */
+int ib_cache_use_roce_gid_cache(struct ib_device *device, u8 port_num);
 
 /**
  * ib_get_cached_gid - Returns a cached GID table entry
@@ -43,6 +54,7 @@
  * @port_num: The port number of the device to query.
  * @index: The index into the cached GID table to query.
  * @gid: The GID value found at the specified index.
+ * @attr: The GID attribute found at the specified index (only in RoCE).
  *
  * ib_get_cached_gid() fetches the specified GID table entry stored in
  * the local software cache.
@@ -50,13 +62,17 @@
 int ib_get_cached_gid(struct ib_device    *device,
 		      u8                   port_num,
 		      int                  index,
-		      union ib_gid        *gid);
+		      union ib_gid        *gid,
+		      struct ib_gid_attr  *attr);
 
 /**
  * ib_find_cached_gid - Returns the port number and GID table index where
  *   a specified GID value occurs.
  * @device: The device to query.
  * @gid: The GID value to search for.
+ * @gid_type: The GID type to search for.
+ * @net: In RoCE, the namespace of the device.
+ * @if_index: In RoCE, the if_index of the device. Zero means ignore.
  * @port_num: The port number of the device where the GID value was found.
  * @index: The index into the cached GID table where the GID was found.  This
  *   parameter may be NULL.
@@ -66,10 +82,36 @@ int ib_get_cached_gid(struct ib_device    *device,
  */
 int ib_find_cached_gid(struct ib_device *device,
 		       union ib_gid	*gid,
+		       enum ib_gid_type gid_type,
+		       struct net	  *net,
+		       int		   if_index,
 		       u8               *port_num,
 		       u16              *index);
 
 /**
+ * ib_find_cached_gid_by_port - Returns the GID table index where a specified
+ * GID value occurs
+ * @device: The device to query.
+ * @gid: The GID value to search for.
+ * @gid_type: The GID type to search for.
+ * @port_num: The port number of the device where the GID value should be
+ *   searched.
+ * @net: In RoCE, the namespace of the device.
+ * @if_index: In RoCE, the if_index of the device. Zero means ignore.
+ * @index: The index into the cached GID table where the GID was found.  This
+ *   parameter may be NULL.
+ *
+ * ib_find_cached_gid_by_port() searches for the specified GID value in
+ * the local software cache.
+ */
+int ib_find_cached_gid_by_port(struct ib_device *device,
+			       union ib_gid	*gid,
+			       enum ib_gid_type gid_type,
+			       u8               port_num,
+			       struct net	*net,
+			       int		if_index,
+			       u16              *index);
+/**
  * ib_get_cached_pkey - Returns a cached PKey table entry
  * @device: The device to query.
  * @port_num: The port number of the device to query.
diff --git a/include/rdma/ib_sa.h b/include/rdma/ib_sa.h
index 7e071a6..6a1b994 100644
--- a/include/rdma/ib_sa.h
+++ b/include/rdma/ib_sa.h
@@ -156,7 +156,9 @@ struct ib_sa_path_rec {
 	u8           preference;
 	u8           smac[ETH_ALEN];
 	u8           dmac[ETH_ALEN];
-	u16	     vlan_id;
+	u16          vlan_id;
+	int	     ifindex;
+	struct net  *net;
 };
 
 #define IB_SA_MCMEMBER_REC_MGID				IB_SA_COMP_MASK( 0)
diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h
index 3956863..2d662e0 100644
--- a/include/rdma/ib_verbs.h
+++ b/include/rdma/ib_verbs.h
@@ -48,6 +48,7 @@
 #include <linux/rwsem.h>
 #include <linux/scatterlist.h>
 #include <linux/workqueue.h>
+#include <net/net_namespace.h>
 #include <uapi/linux/if_ether.h>
 
 #include <linux/atomic.h>
@@ -1813,7 +1814,8 @@ enum rdma_link_layer rdma_port_get_link_layer(struct ib_device *device,
 					       u8 port_num);
 
 int ib_query_gid(struct ib_device *device,
-		 u8 port_num, int index, union ib_gid *gid);
+		 u8 port_num, int index, union ib_gid *gid,
+		 struct ib_gid_attr *attr);
 
 int ib_query_pkey(struct ib_device *device,
 		  u8 port_num, u16 index, u16 *pkey);
@@ -1827,7 +1829,8 @@ int ib_modify_port(struct ib_device *device,
 		   struct ib_port_modify *port_modify);
 
 int ib_find_gid(struct ib_device *device, union ib_gid *gid,
-		u8 *port_num, u16 *index);
+		enum ib_gid_type gid_type, struct net *net,
+		int if_index, u8 *port_num, u16 *index);
 
 int ib_find_pkey(struct ib_device *device,
 		 u8 port_num, u16 pkey, u16 *index);
-- 
2.1.0

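For illustration, a minimal sketch (not part of the patch) of how a kernel
consumer might call the extended ib_query_gid() above. The function name and
error handling are placeholders; note that passing a non-NULL attr for a port
that is not backed by the RoCE GID cache returns -EINVAL, and that the
attribute's ndev is only guaranteed valid under rcu_read_lock(), as the sysfs
code in a later patch of this series does.

#include <linux/printk.h>
#include <rdma/ib_verbs.h>

static int example_dump_gid0(struct ib_device *dev, u8 port)
{
	union ib_gid gid;
	struct ib_gid_attr attr;
	int ret;

	rcu_read_lock();
	ret = ib_query_gid(dev, port, 0, &gid, &attr);
	if (!ret)
		pr_info("GID %pI6, type %d, ndev %s\n", gid.raw,
			attr.gid_type, attr.ndev ? attr.ndev->name : "none");
	rcu_read_unlock();

	return ret;
}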

* [PATCH v3 for-next 09/33] IB/core: Report gid_type and gid_ndev through sysfs
       [not found] ` <1427318422-12004-1-git-send-email-somnath.kotur-laKkSmNT4hbQT0dZR+AlfA@public.gmane.org>
                     ` (7 preceding siblings ...)
  2015-03-25 21:19   ` [PATCH v3 for-next 08/33] IB/core: GID attribute should be returned from verbs API and cache API Somnath Kotur
@ 2015-03-25 21:19   ` Somnath Kotur
  2015-03-25 21:19   ` [PATCH v3 for-next 10/33] IB/core: Support find sgid index using a filter function Somnath Kotur
                     ` (23 subsequent siblings)
  32 siblings, 0 replies; 82+ messages in thread
From: Somnath Kotur @ 2015-03-25 21:19 UTC (permalink / raw)
  To: roland-DgEjT+Ai2ygdnm+yROfE0A
  Cc: linux-rdma-u79uwXL29TY76Z2rM5mHXA, Matan Barak, Somnath Kotur

From: Matan Barak <matanb-VPRAkNaXOzVWk0Htik3J/w@public.gmane.org>

Since we've added GID attributes to the RoCE GID table,
users need a convenient way to query them.
Adding the GID type and related net device to IB's sysfs.

The new attributes are available in:
/sys/class/infiniband/<device>/ports/<port>/gid_attrs/ndevs/<index>
/sys/class/infiniband/<device>/ports/<port>/gid_attrs/types/<index>

The <index> corresponds to the index of the respective GID in:
/sys/class/infiniband/<device>/ports/<port>/gids/<index>

Signed-off-by: Matan Barak <matanb-VPRAkNaXOzVWk0Htik3J/w@public.gmane.org>
Signed-off-by: Somnath Kotur <somnath.kotur-laKkSmNT4hbQT0dZR+AlfA@public.gmane.org>
---
 drivers/infiniband/core/core_priv.h      |   2 +
 drivers/infiniband/core/roce_gid_cache.c |  13 +++
 drivers/infiniband/core/sysfs.c          | 184 ++++++++++++++++++++++++++++++-
 3 files changed, 197 insertions(+), 2 deletions(-)

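For illustration, a minimal userspace sketch (not part of the patch) that
reads one of the new attributes; the device name "mlx4_0", port 1 and index 0
are placeholder values.

#include <stdio.h>

int main(void)
{
	char buf[64];
	FILE *f = fopen("/sys/class/infiniband/mlx4_0/ports/1/gid_attrs/types/0",
			"r");

	if (!f)
		return 1;
	if (fgets(buf, sizeof(buf), f))
		printf("GID 0 type: %s", buf);	/* entry ends with '\n' */
	fclose(f);

	return 0;
}
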
diff --git a/drivers/infiniband/core/core_priv.h b/drivers/infiniband/core/core_priv.h
index 128d2b3..b5bbbdf 100644
--- a/drivers/infiniband/core/core_priv.h
+++ b/drivers/infiniband/core/core_priv.h
@@ -71,6 +71,8 @@ void ib_enum_roce_ports_of_netdev(roce_netdev_filter filter,
 				  roce_netdev_callback cb,
 				  void *cookie);
 
+const char *roce_gid_cache_type_str(enum ib_gid_type gid_type);
+
 int roce_gid_cache_get_gid(struct ib_device *ib_dev, u8 port, int index,
 			   union ib_gid *gid, struct ib_gid_attr *attr);
 
diff --git a/drivers/infiniband/core/roce_gid_cache.c b/drivers/infiniband/core/roce_gid_cache.c
index 1f30dad..b6180eb 100644
--- a/drivers/infiniband/core/roce_gid_cache.c
+++ b/drivers/infiniband/core/roce_gid_cache.c
@@ -50,6 +50,11 @@ enum gid_attr_find_mask {
 	GID_ATTR_FIND_MASK_DEFAULT	= 1UL << 3,
 };
 
+static const char * const gid_type_str[] = {
+	[IB_GID_TYPE_IB]	= "IB/RoCE V1\n",
+	[IB_GID_TYPE_ROCE_V2]	= "RoCE V2\n",
+};
+
 static inline int start_port(struct ib_device *ib_dev)
 {
 	return (ib_dev->node_type == RDMA_NODE_IB_SWITCH) ? 0 : 1;
@@ -60,6 +65,14 @@ struct dev_put_rcu {
 	struct net_device	*ndev;
 };
 
+const char *roce_gid_cache_type_str(enum ib_gid_type gid_type)
+{
+	if (gid_type < ARRAY_SIZE(gid_type_str) && gid_type_str[gid_type])
+		return gid_type_str[gid_type];
+
+	return "Invalid GID type";
+}
+
 static void put_ndev(struct rcu_head *rcu)
 {
 	struct dev_put_rcu *put_rcu =
diff --git a/drivers/infiniband/core/sysfs.c b/drivers/infiniband/core/sysfs.c
index 5cee246..887c2f8 100644
--- a/drivers/infiniband/core/sysfs.c
+++ b/drivers/infiniband/core/sysfs.c
@@ -37,12 +37,22 @@
 #include <linux/slab.h>
 #include <linux/stat.h>
 #include <linux/string.h>
+#include <linux/netdevice.h>
 
 #include <rdma/ib_mad.h>
 
+struct ib_port;
+
+struct gid_attr_group {
+	struct ib_port		*port;
+	struct kobject		kobj;
+	struct attribute_group	ndev;
+	struct attribute_group	type;
+};
 struct ib_port {
 	struct kobject         kobj;
 	struct ib_device      *ibdev;
+	struct gid_attr_group *gid_attr_group;
 	struct attribute_group gid_group;
 	struct attribute_group pkey_group;
 	u8                     port_num;
@@ -84,6 +94,24 @@ static const struct sysfs_ops port_sysfs_ops = {
 	.show = port_attr_show
 };
 
+static ssize_t gid_attr_show(struct kobject *kobj,
+			     struct attribute *attr, char *buf)
+{
+	struct port_attribute *port_attr =
+		container_of(attr, struct port_attribute, attr);
+	struct ib_port *p = container_of(kobj, struct gid_attr_group,
+					 kobj)->port;
+
+	if (!port_attr->show)
+		return -EIO;
+
+	return port_attr->show(p, port_attr, buf);
+}
+
+static const struct sysfs_ops gid_attr_sysfs_ops = {
+	.show = gid_attr_show
+};
+
 static ssize_t state_show(struct ib_port *p, struct port_attribute *unused,
 			  char *buf)
 {
@@ -281,6 +309,44 @@ static struct attribute *port_default_attrs[] = {
 	NULL
 };
 
+static ssize_t print_ndev(struct ib_gid_attr *gid_attr, char *buf)
+{
+	if (!gid_attr->ndev)
+		return -EINVAL;
+
+	return sprintf(buf, "%s\n", gid_attr->ndev->name);
+}
+
+static ssize_t print_gid_type(struct ib_gid_attr *gid_attr, char *buf)
+{
+	return sprintf(buf, "%s", roce_gid_cache_type_str(gid_attr->gid_type));
+}
+
+static ssize_t _show_port_gid_attr(struct ib_port *p,
+				   struct port_attribute *attr,
+				   char *buf,
+				   ssize_t (*print)(struct ib_gid_attr *gid_attr,
+						    char *buf))
+{
+	struct port_table_attribute *tab_attr =
+		container_of(attr, struct port_table_attribute, attr);
+	union ib_gid gid;
+	struct ib_gid_attr gid_attr;
+	ssize_t ret;
+
+	rcu_read_lock();
+	ret = ib_query_gid(p->ibdev, p->port_num, tab_attr->index, &gid,
+			   &gid_attr);
+	if (ret)
+		goto err;
+
+	ret = print(&gid_attr, buf);
+
+err:
+	rcu_read_unlock();
+	return ret;
+}
+
 static ssize_t show_port_gid(struct ib_port *p, struct port_attribute *attr,
 			     char *buf)
 {
@@ -296,6 +364,19 @@ static ssize_t show_port_gid(struct ib_port *p, struct port_attribute *attr,
 	return sprintf(buf, "%pI6\n", gid.raw);
 }
 
+static ssize_t show_port_gid_attr_ndev(struct ib_port *p,
+				       struct port_attribute *attr, char *buf)
+{
+	return _show_port_gid_attr(p, attr, buf, print_ndev);
+}
+
+static ssize_t show_port_gid_attr_gid_type(struct ib_port *p,
+					   struct port_attribute *attr,
+					   char *buf)
+{
+	return _show_port_gid_attr(p, attr, buf, print_gid_type);
+}
+
 static ssize_t show_port_pkey(struct ib_port *p, struct port_attribute *attr,
 			      char *buf)
 {
@@ -446,12 +527,41 @@ static void ib_port_release(struct kobject *kobj)
 	kfree(p);
 }
 
+static void ib_port_gid_attr_release(struct kobject *kobj)
+{
+	struct gid_attr_group *g = container_of(kobj, struct gid_attr_group,
+						kobj);
+	struct attribute *a;
+	int i;
+
+	if (g->ndev.attrs) {
+		for (i = 0; (a = g->ndev.attrs[i]); ++i)
+			kfree(a);
+
+		kfree(g->ndev.attrs);
+	}
+
+	if (g->type.attrs) {
+		for (i = 0; (a = g->type.attrs[i]); ++i)
+			kfree(a);
+
+		kfree(g->type.attrs);
+	}
+
+	kfree(g);
+}
+
 static struct kobj_type port_type = {
 	.release       = ib_port_release,
 	.sysfs_ops     = &port_sysfs_ops,
 	.default_attrs = port_default_attrs
 };
 
+static struct kobj_type gid_attr_type = {
+	.sysfs_ops	= &gid_attr_sysfs_ops,
+	.release	= ib_port_gid_attr_release
+};
+
 static void ib_device_release(struct device *device)
 {
 	struct ib_device *dev = container_of(device, struct ib_device, dev);
@@ -545,9 +655,23 @@ static int add_port(struct ib_device *device, int port_num,
 		return ret;
 	}
 
+	p->gid_attr_group = kzalloc(sizeof(*p->gid_attr_group), GFP_KERNEL);
+	if (!p->gid_attr_group) {
+		ret = -ENOMEM;
+		goto err_put;
+	}
+
+	p->gid_attr_group->port = p;
+	ret = kobject_init_and_add(&p->gid_attr_group->kobj, &gid_attr_type,
+				   &p->kobj, "gid_attrs");
+	if (ret) {
+		kfree(p->gid_attr_group);
+		goto err_put;
+	}
+
 	ret = sysfs_create_group(&p->kobj, &pma_group);
 	if (ret)
-		goto err_put;
+		goto err_put_gid_attrs;
 
 	p->gid_group.name  = "gids";
 	p->gid_group.attrs = alloc_group_attrs(show_port_gid, attr.gid_tbl_len);
@@ -560,12 +684,38 @@ static int add_port(struct ib_device *device, int port_num,
 	if (ret)
 		goto err_free_gid;
 
+	p->gid_attr_group->ndev.name = "ndevs";
+	p->gid_attr_group->ndev.attrs = alloc_group_attrs(show_port_gid_attr_ndev,
+							  attr.gid_tbl_len);
+	if (!p->gid_attr_group->ndev.attrs) {
+		ret = -ENOMEM;
+		goto err_remove_gid;
+	}
+
+	ret = sysfs_create_group(&p->gid_attr_group->kobj,
+				 &p->gid_attr_group->ndev);
+	if (ret)
+		goto err_free_gid_ndev;
+
+	p->gid_attr_group->type.name = "types";
+	p->gid_attr_group->type.attrs = alloc_group_attrs(show_port_gid_attr_gid_type,
+							  attr.gid_tbl_len);
+	if (!p->gid_attr_group->type.attrs) {
+		ret = -ENOMEM;
+		goto err_remove_gid_ndev;
+	}
+
+	ret = sysfs_create_group(&p->gid_attr_group->kobj,
+				 &p->gid_attr_group->type);
+	if (ret)
+		goto err_free_gid_type;
+
 	p->pkey_group.name  = "pkeys";
 	p->pkey_group.attrs = alloc_group_attrs(show_port_pkey,
 						attr.pkey_tbl_len);
 	if (!p->pkey_group.attrs) {
 		ret = -ENOMEM;
-		goto err_remove_gid;
+		goto err_remove_gid_type;
 	}
 
 	ret = sysfs_create_group(&p->kobj, &p->pkey_group);
@@ -593,6 +743,28 @@ err_free_pkey:
 	kfree(p->pkey_group.attrs);
 	p->pkey_group.attrs = NULL;
 
+err_remove_gid_type:
+	sysfs_remove_group(&p->gid_attr_group->kobj,
+			   &p->gid_attr_group->type);
+
+err_free_gid_type:
+	for (i = 0; i < attr.gid_tbl_len; ++i)
+		kfree(p->gid_attr_group->type.attrs[i]);
+
+	kfree(p->gid_attr_group->type.attrs);
+	p->gid_attr_group->type.attrs = NULL;
+
+err_remove_gid_ndev:
+	sysfs_remove_group(&p->gid_attr_group->kobj,
+			   &p->gid_attr_group->ndev);
+
+err_free_gid_ndev:
+	for (i = 0; i < attr.gid_tbl_len; ++i)
+		kfree(p->gid_attr_group->ndev.attrs[i]);
+
+	kfree(p->gid_attr_group->ndev.attrs);
+	p->gid_attr_group->ndev.attrs = NULL;
+
 err_remove_gid:
 	sysfs_remove_group(&p->kobj, &p->gid_group);
 
@@ -606,6 +778,9 @@ err_free_gid:
 err_remove_pma:
 	sysfs_remove_group(&p->kobj, &pma_group);
 
+err_put_gid_attrs:
+	kobject_put(&p->gid_attr_group->kobj);
+
 err_put:
 	kobject_put(&p->kobj);
 	return ret;
@@ -826,6 +1001,11 @@ static void free_port_list_attributes(struct ib_device *device)
 		sysfs_remove_group(p, &pma_group);
 		sysfs_remove_group(p, &port->pkey_group);
 		sysfs_remove_group(p, &port->gid_group);
+		sysfs_remove_group(&port->gid_attr_group->kobj,
+				   &port->gid_attr_group->ndev);
+		sysfs_remove_group(&port->gid_attr_group->kobj,
+				   &port->gid_attr_group->type);
+		kobject_put(&port->gid_attr_group->kobj);
 		kobject_put(p);
 	}
 
-- 
2.1.0

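For illustration, a compressed sketch (not part of the patch) of the
kobject/attribute-group pattern add_port() follows above, reduced to a single
group; the function name is a placeholder and the real code creates and
unwinds several groups.

#include <linux/kobject.h>
#include <linux/slab.h>
#include <linux/sysfs.h>

static int example_add_gid_attr_group(struct kobject *port_kobj,
				      struct gid_attr_group *g)
{
	int ret;

	ret = kobject_init_and_add(&g->kobj, &gid_attr_type, port_kobj,
				   "gid_attrs");
	if (ret) {
		kfree(g);		/* kobject was never exposed */
		return ret;
	}

	ret = sysfs_create_group(&g->kobj, &g->ndev);
	if (ret)
		kobject_put(&g->kobj);	/* release callback frees g */

	return ret;
}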

* [PATCH v3 for-next 10/33] IB/core: Support find sgid index using a filter function
       [not found] ` <1427318422-12004-1-git-send-email-somnath.kotur-laKkSmNT4hbQT0dZR+AlfA@public.gmane.org>
                     ` (8 preceding siblings ...)
  2015-03-25 21:19   ` [PATCH v3 for-next 09/33] IB/core: Report gid_type and gid_ndev through sysfs Somnath Kotur
@ 2015-03-25 21:19   ` Somnath Kotur
  2015-03-25 21:20   ` [PATCH v3 for-next 11/33] IB/core: Modify ib_verbs and cma in order to use roce_gid_cache Somnath Kotur
                     ` (22 subsequent siblings)
  32 siblings, 0 replies; 82+ messages in thread
From: Somnath Kotur @ 2015-03-25 21:19 UTC (permalink / raw)
  To: roland-DgEjT+Ai2ygdnm+yROfE0A
  Cc: linux-rdma-u79uwXL29TY76Z2rM5mHXA, Matan Barak, Somnath Kotur

From: Matan Barak <matanb-VPRAkNaXOzVWk0Htik3J/w@public.gmane.org>

Sometimes an sgid index needs to be found based on several parameters.
For example, when the CM gets a packet from the network, it needs to
find an sgid_index that matches the appropriate L2 attributes
of the packet. Extending the cache's API to include Ethernet L2
attributes is problematic, since they may be vastly extended
in the future. As a result, we add a find function that
takes a user filter function and searches the GID table
until a match is found.

Signed-off-by: Matan Barak <matanb-VPRAkNaXOzVWk0Htik3J/w@public.gmane.org>
Signed-off-by: Somnath Kotur <somnath.kotur-laKkSmNT4hbQT0dZR+AlfA@public.gmane.org>
---
 drivers/infiniband/core/cache.c          | 24 ++++++++++++
 drivers/infiniband/core/core_priv.h      |  9 +++++
 drivers/infiniband/core/roce_gid_cache.c | 66 ++++++++++++++++++++++++++++++++
 include/rdma/ib_cache.h                  | 27 +++++++++++++
 4 files changed, 126 insertions(+)

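For illustration, a sketch (not part of the patch) of a filter a caller could
pass to the new ib_find_gid_by_filter(), matching on the VLAN id of the GID's
net device much like a later patch in this series does. The function names
and the VLAN id 100 are placeholders.

#include <linux/if_vlan.h>
#include <rdma/ib_cache.h>

static bool match_vlan_filter(const union ib_gid *gid,
			      const struct ib_gid_attr *attr,
			      void *context)
{
	u16 wanted = *(u16 *)context;

	/* Runs in an atomic context; attr->ndev is valid for its duration. */
	return attr->ndev && is_vlan_dev(attr->ndev) &&
	       vlan_dev_vlan_id(attr->ndev) == wanted;
}

static int example_find(struct ib_device *device, union ib_gid *gid, u8 port)
{
	u16 wanted = 100;	/* placeholder VLAN id */
	u16 index;

	return ib_find_gid_by_filter(device, gid, port, match_vlan_filter,
				     &wanted, &index);
}
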
diff --git a/drivers/infiniband/core/cache.c b/drivers/infiniband/core/cache.c
index 882d491..ae86fe8 100644
--- a/drivers/infiniband/core/cache.c
+++ b/drivers/infiniband/core/cache.c
@@ -273,6 +273,30 @@ int ib_find_cached_gid_by_port(struct ib_device *device,
 }
 EXPORT_SYMBOL(ib_find_cached_gid_by_port);
 
+int ib_find_gid_by_filter(struct ib_device *device,
+			  union ib_gid *gid,
+			  u8 port_num,
+			  bool (*filter)(const union ib_gid *gid,
+					 const struct ib_gid_attr *,
+					 void *),
+			  void *context, u16 *index)
+{
+	/* Look for a RoCE device with the specified GID. */
+	if (!ib_cache_use_roce_gid_cache(device, port_num))
+		return roce_gid_cache_find_gid_by_filter(device, gid,
+							 port_num, filter,
+							 context, index);
+
+	/* Only RoCE GID cache supports filter function */
+	if (filter)
+		return -ENOSYS;
+
+	/* If no RoCE devices with the specified GID, look for IB device. */
+	return __ib_find_cached_gid_by_port(device, port_num,
+					    gid, index);
+}
+EXPORT_SYMBOL(ib_find_gid_by_filter);
+
 int ib_get_cached_pkey(struct ib_device *device,
 		       u8                port_num,
 		       int               index,
diff --git a/drivers/infiniband/core/core_priv.h b/drivers/infiniband/core/core_priv.h
index b5bbbdf..949844c 100644
--- a/drivers/infiniband/core/core_priv.h
+++ b/drivers/infiniband/core/core_priv.h
@@ -84,6 +84,15 @@ int roce_gid_cache_find_gid_by_port(struct ib_device *ib_dev, union ib_gid *gid,
 				    enum ib_gid_type gid_type, u8 port,
 				    struct net *net, int if_index, u16 *index);
 
+int roce_gid_cache_find_gid_by_filter(struct ib_device *ib_dev,
+				      union ib_gid *gid,
+				      u8 port,
+				      bool (*filter)(const union ib_gid *gid,
+						     const struct ib_gid_attr *,
+						     void *),
+				      void *context,
+				      u16 *index);
+
 int roce_gid_cache_is_active(struct ib_device *ib_dev, u8 port);
 
 enum roce_gid_cache_default_mode {
diff --git a/drivers/infiniband/core/roce_gid_cache.c b/drivers/infiniband/core/roce_gid_cache.c
index b6180eb..bd51d97 100644
--- a/drivers/infiniband/core/roce_gid_cache.c
+++ b/drivers/infiniband/core/roce_gid_cache.c
@@ -455,6 +455,72 @@ int roce_gid_cache_find_gid_by_port(struct ib_device *ib_dev, union ib_gid *gid,
 	return -ENOENT;
 }
 
+int roce_gid_cache_find_gid_by_filter(struct ib_device *ib_dev,
+				      union ib_gid *gid,
+				      u8 port,
+				      bool (*filter)(const union ib_gid *,
+						     const struct ib_gid_attr *,
+						     void *),
+				      void *context,
+				      u16 *index)
+{
+	struct ib_roce_gid_cache *cache;
+	unsigned int i;
+	bool found = false;
+
+	if (!ib_dev->cache.roce_gid_cache)
+		return -ENOSYS;
+
+	if (port < start_port(ib_dev) ||
+	    port >= start_port(ib_dev) + ib_dev->phys_port_cnt ||
+	    rdma_port_get_link_layer(ib_dev, port) !=
+		IB_LINK_LAYER_ETHERNET)
+		return -ENOSYS;
+
+	cache = ib_dev->cache.roce_gid_cache[port - start_port(ib_dev)];
+
+	if (!cache || !cache->active)
+		return -ENOENT;
+
+	for (i = 0; i < cache->sz; i++) {
+		unsigned int orig_seq;
+		struct ib_gid_attr attr;
+
+		orig_seq = cache->data_vec[i].seq;
+		if (orig_seq == -1)
+			continue;
+		/* Make sure the sequence number we remember was read
+		 * before the gid cache entry content is read.
+		 */
+		smp_rmb();
+
+		if (memcmp(gid, &cache->data_vec[i].gid, sizeof(*gid)))
+			continue;
+
+		memcpy(&attr, &cache->data_vec[i].attr, sizeof(attr));
+
+		rcu_read_lock();
+
+		/* Make sure we finished reading the attribute */
+		smp_rmb();
+		if (orig_seq == ACCESS_ONCE(cache->data_vec[i].seq))
+			if (!filter || filter(gid, &attr, context))
+				found = true;
+
+		rcu_read_unlock();
+
+		if (found)
+			break;
+	}
+
+	if (!found)
+		return -ENOENT;
+
+	if (index)
+		*index = i;
+	return 0;
+}
+
 static struct ib_roce_gid_cache *alloc_roce_gid_cache(int sz)
 {
 	struct ib_roce_gid_cache *cache =
diff --git a/include/rdma/ib_cache.h b/include/rdma/ib_cache.h
index 36b72bf..5ed728c 100644
--- a/include/rdma/ib_cache.h
+++ b/include/rdma/ib_cache.h
@@ -111,6 +111,33 @@ int ib_find_cached_gid_by_port(struct ib_device *device,
 			       struct net	*net,
 			       int		if_index,
 			       u16              *index);
+
+/**
+ * ib_find_gid_by_filter - Returns the GID table index where a specified
+ * GID value occurs
+ * @device: The device to query.
+ * @gid: The GID value to search for.
+ * @port_num: The port number of the device where the GID value should be
+ *   searched.
+ * @filter: The filter function is executed on any matching GID in the table.
+ *   If the filter function returns true, the corresponding index is returned,
+ *   otherwise, we continue searching the GID table. It's guaranteed that
+ *   while filter is executed, the ndev field is valid and the structure won't
+ *   change. filter is executed in an atomic context. filter must be NULL
+ *   when RoCE GID cache isn't supported on the respective device's port.
+ * @index: The index into the cached GID table where the GID was found.  This
+ *   parameter may be NULL.
+ *
+ * ib_find_gid_by_filter() searches for the specified GID value in
+ * the local software cache.
+ */
+int ib_find_gid_by_filter(struct ib_device *device,
+			  union ib_gid *gid,
+			  u8 port_num,
+			  bool (*filter)(const union ib_gid *gid,
+					 const struct ib_gid_attr *,
+					 void *),
+			  void *context, u16 *index);
 /**
  * ib_get_cached_pkey - Returns a cached PKey table entry
  * @device: The device to query.
-- 
2.1.0

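The table read above relies on a per-entry sequence number rather than a
lock. For illustration, a generic sketch (not part of the patch) of that
validated-read pattern, reduced to a single integer payload where the real
cache copies a GID and its attribute: the writer bumps seq around every
update, so a reader that sees the same seq before and after copying knows it
observed a consistent snapshot.

#include <asm/barrier.h>
#include <linux/compiler.h>

struct example_entry {
	unsigned int seq;	/* -1 marks an invalid entry */
	int payload;
};

static bool example_read(struct example_entry *e, int *out)
{
	unsigned int seq = ACCESS_ONCE(e->seq);

	if (seq == -1)
		return false;
	smp_rmb();	/* read seq before reading the payload */

	*out = e->payload;

	smp_rmb();	/* read the payload before re-checking seq */
	return seq == ACCESS_ONCE(e->seq);
}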

* [PATCH v3 for-next 11/33] IB/core: Modify ib_verbs and cma in order to use roce_gid_cache
       [not found] ` <1427318422-12004-1-git-send-email-somnath.kotur-laKkSmNT4hbQT0dZR+AlfA@public.gmane.org>
                     ` (9 preceding siblings ...)
  2015-03-25 21:19   ` [PATCH v3 for-next 10/33] IB/core: Support find sgid index using a filter function Somnath Kotur
@ 2015-03-25 21:20   ` Somnath Kotur
  2015-03-25 21:20   ` [PATCH v3 for-next 12/33] IB/core: Add gid_type to path and rdma_id_private Somnath Kotur
                     ` (21 subsequent siblings)
  32 siblings, 0 replies; 82+ messages in thread
From: Somnath Kotur @ 2015-03-25 21:20 UTC (permalink / raw)
  To: roland-DgEjT+Ai2ygdnm+yROfE0A
  Cc: linux-rdma-u79uwXL29TY76Z2rM5mHXA, Matan Barak, Somnath Kotur

From: Matan Barak <matanb-VPRAkNaXOzVWk0Htik3J/w@public.gmane.org>

Previously, we resolved the dmac and took the smac and vlan
from the resolved address. This is now changed to finding a net
device that matches the IP and vlan of the network packet, and
querying the RoCE GID cache for this net device, GID and GID type.

ocrdma driver changes were done by Somnath Kotur <Somnath.Kotur-iH1Dq9VlAzfQT0dZR+AlfA@public.gmane.org>

Signed-off-by: Matan Barak <matanb-VPRAkNaXOzVWk0Htik3J/w@public.gmane.org>
Signed-off-by: Somnath Kotur <somnath.kotur-laKkSmNT4hbQT0dZR+AlfA@public.gmane.org>
---
 drivers/infiniband/core/addr.c           |   3 +-
 drivers/infiniband/core/cm.c             |  30 ------
 drivers/infiniband/core/cma.c            |   9 --
 drivers/infiniband/core/core_priv.h      |   4 +-
 drivers/infiniband/core/sa_query.c       |   4 -
 drivers/infiniband/core/ucma.c           |   1 -
 drivers/infiniband/core/uverbs_cmd.c     |   3 +-
 drivers/infiniband/core/verbs.c          | 162 ++++++++++++++++++-------------
 drivers/infiniband/hw/mlx4/ah.c          |  15 ++-
 drivers/infiniband/hw/mlx4/mad.c         |  12 ++-
 drivers/infiniband/hw/mlx4/mcg.c         |   2 +-
 drivers/infiniband/hw/mlx4/mlx4_ib.h     |   2 +-
 drivers/infiniband/hw/mlx4/qp.c          |  48 +++++++--
 drivers/infiniband/hw/ocrdma/ocrdma.h    |   1 +
 drivers/infiniband/hw/ocrdma/ocrdma_ah.c |  20 ++--
 drivers/infiniband/hw/ocrdma/ocrdma_hw.c |  17 ++--
 include/rdma/ib_addr.h                   |   2 +-
 include/rdma/ib_sa.h                     |   2 -
 include/rdma/ib_verbs.h                  |  11 +--
 19 files changed, 190 insertions(+), 158 deletions(-)

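For illustration, a condensed sketch (not part of the patch) of the lookup
pattern this patch spreads through the providers: derive the VLAN and source
MAC from the net device attached to the cached sgid entry instead of carrying
them in ib_ah_attr/ib_qp_attr. The function name is a placeholder; compare
create_iboe_ah() and ocrdma_set_av_params() below.

#include <linux/if_vlan.h>
#include <rdma/ib_cache.h>

static int example_l2_from_sgid(struct ib_device *dev, u8 port,
				int sgid_index, u16 *vlan, u8 *smac)
{
	union ib_gid sgid;
	struct ib_gid_attr attr;
	int ret;

	*vlan = 0xffff;		/* no VLAN unless the ndev says otherwise */

	rcu_read_lock();
	ret = ib_get_cached_gid(dev, port, sgid_index, &sgid, &attr);
	if (!ret && attr.ndev) {
		if (is_vlan_dev(attr.ndev))
			*vlan = vlan_dev_vlan_id(attr.ndev);
		memcpy(smac, attr.ndev->dev_addr, ETH_ALEN);
	}
	rcu_read_unlock();

	return ret;
}
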
diff --git a/drivers/infiniband/core/addr.c b/drivers/infiniband/core/addr.c
index f80da50..43af7f5 100644
--- a/drivers/infiniband/core/addr.c
+++ b/drivers/infiniband/core/addr.c
@@ -458,7 +458,7 @@ static void resolve_cb(int status, struct sockaddr *src_addr,
 }
 
 int rdma_addr_find_dmac_by_grh(union ib_gid *sgid, union ib_gid *dgid, u8 *dmac,
-			       u16 *vlan_id)
+			       u16 *vlan_id, int if_index)
 {
 	int ret = 0;
 	struct rdma_dev_addr dev_addr;
@@ -481,6 +481,7 @@ int rdma_addr_find_dmac_by_grh(union ib_gid *sgid, union ib_gid *dgid, u8 *dmac,
 		return ret;
 
 	memset(&dev_addr, 0, sizeof(dev_addr));
+	dev_addr.bound_dev_if = if_index;
 
 	ctx.addr = &dev_addr;
 	init_completion(&ctx.comp);
diff --git a/drivers/infiniband/core/cm.c b/drivers/infiniband/core/cm.c
index d88f2ae..7974e74 100644
--- a/drivers/infiniband/core/cm.c
+++ b/drivers/infiniband/core/cm.c
@@ -178,8 +178,6 @@ struct cm_av {
 	struct ib_ah_attr ah_attr;
 	u16 pkey_index;
 	u8 timeout;
-	u8  valid;
-	u8  smac[ETH_ALEN];
 };
 
 struct cm_work {
@@ -382,7 +380,6 @@ static int cm_init_av_by_path(struct ib_sa_path_rec *path, struct cm_av *av)
 			     &av->ah_attr);
 	av->timeout = path->packet_life_time + 1;
 
-	av->valid = 1;
 	return 0;
 }
 
@@ -1563,7 +1560,6 @@ static int cm_req_handler(struct cm_work *work)
 	cm_format_paths_from_req(req_msg, &work->path[0], &work->path[1]);
 
 	memcpy(work->path[0].dmac, cm_id_priv->av.ah_attr.dmac, ETH_ALEN);
-	work->path[0].vlan_id = cm_id_priv->av.ah_attr.vlan_id;
 	ret = cm_init_av_by_path(&work->path[0], &cm_id_priv->av);
 	if (ret) {
 		ib_get_cached_gid(work->port->cm_dev->ib_device,
@@ -3511,32 +3507,6 @@ static int cm_init_qp_rtr_attr(struct cm_id_private *cm_id_priv,
 		*qp_attr_mask = IB_QP_STATE | IB_QP_AV | IB_QP_PATH_MTU |
 				IB_QP_DEST_QPN | IB_QP_RQ_PSN;
 		qp_attr->ah_attr = cm_id_priv->av.ah_attr;
-		if (!cm_id_priv->av.valid) {
-			spin_unlock_irqrestore(&cm_id_priv->lock, flags);
-			return -EINVAL;
-		}
-		if (cm_id_priv->av.ah_attr.vlan_id != 0xffff) {
-			qp_attr->vlan_id = cm_id_priv->av.ah_attr.vlan_id;
-			*qp_attr_mask |= IB_QP_VID;
-		}
-		if (!is_zero_ether_addr(cm_id_priv->av.smac)) {
-			memcpy(qp_attr->smac, cm_id_priv->av.smac,
-			       sizeof(qp_attr->smac));
-			*qp_attr_mask |= IB_QP_SMAC;
-		}
-		if (cm_id_priv->alt_av.valid) {
-			if (cm_id_priv->alt_av.ah_attr.vlan_id != 0xffff) {
-				qp_attr->alt_vlan_id =
-					cm_id_priv->alt_av.ah_attr.vlan_id;
-				*qp_attr_mask |= IB_QP_ALT_VID;
-			}
-			if (!is_zero_ether_addr(cm_id_priv->alt_av.smac)) {
-				memcpy(qp_attr->alt_smac,
-				       cm_id_priv->alt_av.smac,
-				       sizeof(qp_attr->alt_smac));
-				*qp_attr_mask |= IB_QP_ALT_SMAC;
-			}
-		}
 		qp_attr->path_mtu = cm_id_priv->path_mtu;
 		qp_attr->dest_qp_num = be32_to_cpu(cm_id_priv->remote_qpn);
 		qp_attr->rq_psn = be32_to_cpu(cm_id_priv->rq_psn);
diff --git a/drivers/infiniband/core/cma.c b/drivers/infiniband/core/cma.c
index 335def9..659676c 100644
--- a/drivers/infiniband/core/cma.c
+++ b/drivers/infiniband/core/cma.c
@@ -666,15 +666,6 @@ static int cma_modify_qp_rtr(struct rdma_id_private *id_priv,
 	if (ret)
 		goto out;
 
-	if (rdma_node_get_transport(id_priv->cma_dev->device->node_type)
-	    == RDMA_TRANSPORT_IB &&
-	    rdma_port_get_link_layer(id_priv->id.device, id_priv->id.port_num)
-	    == IB_LINK_LAYER_ETHERNET) {
-		ret = rdma_addr_find_smac_by_sgid(&sgid, qp_attr.smac, NULL);
-
-		if (ret)
-			goto out;
-	}
 	if (conn_param)
 		qp_attr.max_dest_rd_atomic = conn_param->responder_resources;
 	ret = ib_modify_qp(id_priv->id.qp, &qp_attr, qp_attr_mask);
diff --git a/drivers/infiniband/core/core_priv.h b/drivers/infiniband/core/core_priv.h
index 949844c..94db13d 100644
--- a/drivers/infiniband/core/core_priv.h
+++ b/drivers/infiniband/core/core_priv.h
@@ -52,8 +52,8 @@ void ib_sysfs_cleanup(void);
 int  ib_cache_setup(void);
 void ib_cache_cleanup(void);
 
-int ib_resolve_eth_l2_attrs(struct ib_qp *qp,
-			    struct ib_qp_attr *qp_attr, int *qp_attr_mask);
+int ib_resolve_eth_dmac(struct ib_qp *qp,
+			struct ib_qp_attr *qp_attr, int *qp_attr_mask);
 
 typedef void (*roce_netdev_callback)(struct ib_device *device, u8 port,
 	      struct net_device *idev, void *cookie);
diff --git a/drivers/infiniband/core/sa_query.c b/drivers/infiniband/core/sa_query.c
index 5b20237..705b6b8 100644
--- a/drivers/infiniband/core/sa_query.c
+++ b/drivers/infiniband/core/sa_query.c
@@ -559,11 +559,7 @@ int ib_init_ah_from_path(struct ib_device *device, u8 port_num,
 	}
 	if (force_grh) {
 		memcpy(ah_attr->dmac, rec->dmac, ETH_ALEN);
-		ah_attr->vlan_id = rec->vlan_id;
-	} else {
-		ah_attr->vlan_id = 0xffff;
 	}
-
 	return 0;
 }
 EXPORT_SYMBOL(ib_init_ah_from_path);
diff --git a/drivers/infiniband/core/ucma.c b/drivers/infiniband/core/ucma.c
index 45d67e9..5eacda4 100644
--- a/drivers/infiniband/core/ucma.c
+++ b/drivers/infiniband/core/ucma.c
@@ -1125,7 +1125,6 @@ static int ucma_set_ib_path(struct ucma_context *ctx,
 		return -EINVAL;
 
 	memset(&sa_path, 0, sizeof(sa_path));
-	sa_path.vlan_id = 0xffff;
 
 	ib_sa_unpack_path(path_data->path_rec, &sa_path);
 	ret = rdma_set_ib_paths(ctx->cm_id, &sa_path, 1);
diff --git a/drivers/infiniband/core/uverbs_cmd.c b/drivers/infiniband/core/uverbs_cmd.c
index a9f0489..0cf1360 100644
--- a/drivers/infiniband/core/uverbs_cmd.c
+++ b/drivers/infiniband/core/uverbs_cmd.c
@@ -2095,7 +2095,7 @@ ssize_t ib_uverbs_modify_qp(struct ib_uverbs_file *file,
 	attr->alt_ah_attr.port_num 	    = cmd.alt_dest.port_num;
 
 	if (qp->real_qp == qp) {
-		ret = ib_resolve_eth_l2_attrs(qp, attr, &cmd.attr_mask);
+		ret = ib_resolve_eth_dmac(qp, attr, &cmd.attr_mask);
 		if (ret)
 			goto release_qp;
 		ret = qp->device->modify_qp(qp, attr,
@@ -2559,7 +2559,6 @@ ssize_t ib_uverbs_create_ah(struct ib_uverbs_file *file,
 	attr.grh.sgid_index    = cmd.attr.grh.sgid_index;
 	attr.grh.hop_limit     = cmd.attr.grh.hop_limit;
 	attr.grh.traffic_class = cmd.attr.grh.traffic_class;
-	attr.vlan_id           = 0;
 	memset(&attr.dmac, 0, sizeof(attr.dmac));
 	memcpy(attr.grh.dgid.raw, cmd.attr.grh.dgid, 16);
 
diff --git a/drivers/infiniband/core/verbs.c b/drivers/infiniband/core/verbs.c
index 1fe3e71..2f5fd7a 100644
--- a/drivers/infiniband/core/verbs.c
+++ b/drivers/infiniband/core/verbs.c
@@ -41,6 +41,9 @@
 #include <linux/export.h>
 #include <linux/string.h>
 #include <linux/slab.h>
+#include <linux/in.h>
+#include <linux/in6.h>
+#include <net/addrconf.h>
 
 #include <rdma/ib_verbs.h>
 #include <rdma/ib_cache.h>
@@ -192,6 +195,35 @@ struct ib_ah *ib_create_ah(struct ib_pd *pd, struct ib_ah_attr *ah_attr)
 }
 EXPORT_SYMBOL(ib_create_ah);
 
+struct find_gid_index_context {
+	u16 vlan_id;
+};
+
+static bool find_gid_index(const union ib_gid *gid,
+			   const struct ib_gid_attr *gid_attr,
+			   void *context)
+{
+	struct find_gid_index_context *ctx =
+		(struct find_gid_index_context *)context;
+
+	if ((!!(ctx->vlan_id != 0xffff) == !is_vlan_dev(gid_attr->ndev)) ||
+	    (is_vlan_dev(gid_attr->ndev) &&
+	     vlan_dev_vlan_id(gid_attr->ndev) != ctx->vlan_id))
+		return false;
+
+	return true;
+}
+
+static int get_sgid_index_from_eth(struct ib_device *device, u8 port_num,
+				   u16 vlan_id, union ib_gid *sgid,
+				   u16 *gid_index)
+{
+	struct find_gid_index_context context = {.vlan_id = vlan_id};
+
+	return ib_find_gid_by_filter(device, sgid, port_num, find_gid_index,
+				     &context, gid_index);
+}
+
 int ib_init_ah_from_wc(struct ib_device *device, u8 port_num, struct ib_wc *wc,
 		       struct ib_grh *grh, struct ib_ah_attr *ah_attr)
 {
@@ -203,21 +235,30 @@ int ib_init_ah_from_wc(struct ib_device *device, u8 port_num, struct ib_wc *wc,
 
 	memset(ah_attr, 0, sizeof *ah_attr);
 	if (is_eth) {
+		u16 vlan_id = wc->wc_flags & IB_WC_WITH_VLAN ?
+				wc->vlan_id : 0xffff;
+
 		if (!(wc->wc_flags & IB_WC_GRH))
 			return -EPROTOTYPE;
 
-		if (wc->wc_flags & IB_WC_WITH_SMAC &&
-		    wc->wc_flags & IB_WC_WITH_VLAN) {
-			memcpy(ah_attr->dmac, wc->smac, ETH_ALEN);
-			ah_attr->vlan_id = wc->vlan_id;
-		} else {
+		if (!(wc->wc_flags & IB_WC_WITH_SMAC) ||
+		    !(wc->wc_flags & IB_WC_WITH_VLAN)) {
 			ret = rdma_addr_find_dmac_by_grh(&grh->dgid, &grh->sgid,
-					ah_attr->dmac, &ah_attr->vlan_id);
+							 ah_attr->dmac,
+							 wc->wc_flags & IB_WC_WITH_VLAN ?
+							 NULL : &vlan_id,
+							 0);
 			if (ret)
 				return ret;
 		}
-	} else {
-		ah_attr->vlan_id = 0xffff;
+
+		ret = get_sgid_index_from_eth(device, port_num, vlan_id,
+					      &grh->dgid, &gid_index);
+		if (ret)
+			return ret;
+
+		if (wc->wc_flags & IB_WC_WITH_SMAC)
+			memcpy(ah_attr->dmac, wc->smac, ETH_ALEN);
 	}
 
 	ah_attr->dlid = wc->slid;
@@ -229,10 +270,14 @@ int ib_init_ah_from_wc(struct ib_device *device, u8 port_num, struct ib_wc *wc,
 		ah_attr->ah_flags = IB_AH_GRH;
 		ah_attr->grh.dgid = grh->sgid;
 
-		ret = ib_find_cached_gid(device, &grh->dgid, IB_GID_TYPE_IB,
-					 NULL, 0, &port_num, &gid_index);
-		if (ret)
-			return ret;
+		if (!is_eth) {
+			ret = ib_find_cached_gid_by_port(device, &grh->dgid,
+							 IB_GID_TYPE_IB,
+							 port_num, NULL, 0,
+							 &gid_index);
+			if (ret)
+				return ret;
+		}
 
 		ah_attr->grh.sgid_index = (u8) gid_index;
 		flow_class = be32_to_cpu(grh->version_tclass_flow);
@@ -502,9 +547,7 @@ EXPORT_SYMBOL(ib_create_qp);
 static const struct {
 	int			valid;
 	enum ib_qp_attr_mask	req_param[IB_QPT_MAX];
-	enum ib_qp_attr_mask	req_param_add_eth[IB_QPT_MAX];
 	enum ib_qp_attr_mask	opt_param[IB_QPT_MAX];
-	enum ib_qp_attr_mask	opt_param_add_eth[IB_QPT_MAX];
 } qp_state_table[IB_QPS_ERR + 1][IB_QPS_ERR + 1] = {
 	[IB_QPS_RESET] = {
 		[IB_QPS_RESET] = { .valid = 1 },
@@ -585,12 +628,6 @@ static const struct {
 						IB_QP_MAX_DEST_RD_ATOMIC	|
 						IB_QP_MIN_RNR_TIMER),
 			},
-			.req_param_add_eth = {
-				[IB_QPT_RC]  = (IB_QP_SMAC),
-				[IB_QPT_UC]  = (IB_QP_SMAC),
-				[IB_QPT_XRC_INI]  = (IB_QP_SMAC),
-				[IB_QPT_XRC_TGT]  = (IB_QP_SMAC)
-			},
 			.opt_param = {
 				 [IB_QPT_UD]  = (IB_QP_PKEY_INDEX		|
 						 IB_QP_QKEY),
@@ -611,21 +648,7 @@ static const struct {
 				 [IB_QPT_GSI] = (IB_QP_PKEY_INDEX		|
 						 IB_QP_QKEY),
 			 },
-			.opt_param_add_eth = {
-				[IB_QPT_RC]  = (IB_QP_ALT_SMAC			|
-						IB_QP_VID			|
-						IB_QP_ALT_VID),
-				[IB_QPT_UC]  = (IB_QP_ALT_SMAC			|
-						IB_QP_VID			|
-						IB_QP_ALT_VID),
-				[IB_QPT_XRC_INI]  = (IB_QP_ALT_SMAC			|
-						IB_QP_VID			|
-						IB_QP_ALT_VID),
-				[IB_QPT_XRC_TGT]  = (IB_QP_ALT_SMAC			|
-						IB_QP_VID			|
-						IB_QP_ALT_VID)
-			}
-		}
+		},
 	},
 	[IB_QPS_RTR]   = {
 		[IB_QPS_RESET] = { .valid = 1 },
@@ -847,13 +870,6 @@ int ib_modify_qp_is_ok(enum ib_qp_state cur_state, enum ib_qp_state next_state,
 	req_param = qp_state_table[cur_state][next_state].req_param[type];
 	opt_param = qp_state_table[cur_state][next_state].opt_param[type];
 
-	if (ll == IB_LINK_LAYER_ETHERNET) {
-		req_param |= qp_state_table[cur_state][next_state].
-			req_param_add_eth[type];
-		opt_param |= qp_state_table[cur_state][next_state].
-			opt_param_add_eth[type];
-	}
-
 	if ((mask & req_param) != req_param)
 		return 0;
 
@@ -864,41 +880,55 @@ int ib_modify_qp_is_ok(enum ib_qp_state cur_state, enum ib_qp_state next_state,
 }
 EXPORT_SYMBOL(ib_modify_qp_is_ok);
 
-int ib_resolve_eth_l2_attrs(struct ib_qp *qp,
-			    struct ib_qp_attr *qp_attr, int *qp_attr_mask)
+int ib_resolve_eth_dmac(struct ib_qp *qp,
+			struct ib_qp_attr *qp_attr, int *qp_attr_mask)
 {
 	int           ret = 0;
-	union ib_gid  sgid;
+	u8	      start_port = qp->device->node_type == RDMA_NODE_IB_SWITCH ? 0 : 1;
 
 	if ((*qp_attr_mask & IB_QP_AV)  &&
-	    (rdma_port_get_link_layer(qp->device, qp_attr->ah_attr.port_num) == IB_LINK_LAYER_ETHERNET)) {
-		ret = ib_query_gid(qp->device, qp_attr->ah_attr.port_num,
-				   qp_attr->ah_attr.grh.sgid_index, &sgid,
-				   NULL);
-		if (ret)
-			goto out;
+	    (qp_attr->ah_attr.port_num >= start_port) &&
+	    (qp_attr->ah_attr.port_num < start_port + qp->device->phys_port_cnt) &&
+	    (rdma_port_get_link_layer(qp->device, qp_attr->ah_attr.port_num) ==
+	     IB_LINK_LAYER_ETHERNET)) {
 		if (rdma_link_local_addr((struct in6_addr *)qp_attr->ah_attr.grh.dgid.raw)) {
-			rdma_get_ll_mac((struct in6_addr *)qp_attr->ah_attr.grh.dgid.raw, qp_attr->ah_attr.dmac);
-			rdma_get_ll_mac((struct in6_addr *)sgid.raw, qp_attr->smac);
-			if (!(*qp_attr_mask & IB_QP_VID))
-				qp_attr->vlan_id = rdma_get_vlan_id(&sgid);
+			rdma_get_ll_mac((struct in6_addr *)qp_attr->ah_attr.grh.dgid.raw,
+					qp_attr->ah_attr.dmac);
 		} else {
-			ret = rdma_addr_find_dmac_by_grh(&sgid, &qp_attr->ah_attr.grh.dgid,
-					qp_attr->ah_attr.dmac, &qp_attr->vlan_id);
-			if (ret)
-				goto out;
-			ret = rdma_addr_find_smac_by_sgid(&sgid, qp_attr->smac, NULL);
-			if (ret)
+			union ib_gid		sgid;
+			struct ib_gid_attr	sgid_attr;
+			int			ifindex;
+
+			rcu_read_lock();
+			ret = ib_query_gid(qp->device,
+					   qp_attr->ah_attr.port_num,
+					   qp_attr->ah_attr.grh.sgid_index,
+					   &sgid, &sgid_attr);
+
+			if (ret || !sgid_attr.ndev) {
+				if (!ret)
+					ret = -ENXIO;
+				rcu_read_unlock();
 				goto out;
+			}
+
+			dev_hold(sgid_attr.ndev);
+			ifindex = sgid_attr.ndev->ifindex;
+
+			rcu_read_unlock();
+
+			ret = rdma_addr_find_dmac_by_grh(&sgid,
+							 &qp_attr->ah_attr.grh.dgid,
+							 qp_attr->ah_attr.dmac,
+							 NULL, ifindex);
+
+			dev_put(sgid_attr.ndev);
 		}
-		*qp_attr_mask |= IB_QP_SMAC;
-		if (qp_attr->vlan_id < 0xFFFF)
-			*qp_attr_mask |= IB_QP_VID;
 	}
 out:
 	return ret;
 }
-EXPORT_SYMBOL(ib_resolve_eth_l2_attrs);
+EXPORT_SYMBOL(ib_resolve_eth_dmac);
 
 
 int ib_modify_qp(struct ib_qp *qp,
@@ -907,7 +937,7 @@ int ib_modify_qp(struct ib_qp *qp,
 {
 	int ret;
 
-	ret = ib_resolve_eth_l2_attrs(qp, qp_attr, &qp_attr_mask);
+	ret = ib_resolve_eth_dmac(qp, qp_attr, &qp_attr_mask);
 	if (ret)
 		return ret;
 
diff --git a/drivers/infiniband/hw/mlx4/ah.c b/drivers/infiniband/hw/mlx4/ah.c
index f50a546..aaeeb60 100644
--- a/drivers/infiniband/hw/mlx4/ah.c
+++ b/drivers/infiniband/hw/mlx4/ah.c
@@ -76,7 +76,9 @@ static struct ib_ah *create_iboe_ah(struct ib_pd *pd, struct ib_ah_attr *ah_attr
 	struct mlx4_dev *dev = ibdev->dev;
 	int is_mcast = 0;
 	struct in6_addr in6;
-	u16 vlan_tag;
+	u16 vlan_tag = 0xffff;
+	union ib_gid sgid;
+	struct ib_gid_attr gid_attr;
 
 	memcpy(&in6, ah_attr->grh.dgid.raw, sizeof(in6));
 	if (rdma_is_multicast_addr(&in6)) {
@@ -85,7 +87,16 @@ static struct ib_ah *create_iboe_ah(struct ib_pd *pd, struct ib_ah_attr *ah_attr
 	} else {
 		memcpy(ah->av.eth.mac, ah_attr->dmac, ETH_ALEN);
 	}
-	vlan_tag = ah_attr->vlan_id;
+	rcu_read_lock();
+	ib_get_cached_gid(pd->device, ah_attr->port_num,
+			  ah_attr->grh.sgid_index, &sgid, &gid_attr);
+	memset(ah->av.eth.s_mac, 0, ETH_ALEN);
+	if (gid_attr.ndev) {
+		if (is_vlan_dev(gid_attr.ndev))
+			vlan_tag = vlan_dev_vlan_id(gid_attr.ndev);
+		memcpy(ah->av.eth.s_mac, gid_attr.ndev->dev_addr, ETH_ALEN);
+	}
+	rcu_read_unlock();
 	if (vlan_tag < 0x1000)
 		vlan_tag |= (ah_attr->sl & 7) << 13;
 	ah->av.eth.port_pd = cpu_to_be32(to_mpd(pd)->pdn | (ah_attr->port_num << 24));
diff --git a/drivers/infiniband/hw/mlx4/mad.c b/drivers/infiniband/hw/mlx4/mad.c
index 82a7dd8..e686e95 100644
--- a/drivers/infiniband/hw/mlx4/mad.c
+++ b/drivers/infiniband/hw/mlx4/mad.c
@@ -1154,7 +1154,7 @@ static int is_proxy_qp0(struct mlx4_ib_dev *dev, int qpn, int slave)
 int mlx4_ib_send_to_wire(struct mlx4_ib_dev *dev, int slave, u8 port,
 			 enum ib_qp_type dest_qpt, u16 pkey_index,
 			 u32 remote_qpn, u32 qkey, struct ib_ah_attr *attr,
-			 u8 *s_mac, struct ib_mad *mad)
+			 u8 *s_mac, u16 vlan_id, struct ib_mad *mad)
 {
 	struct ib_sge list;
 	struct ib_send_wr wr, *bad_wr;
@@ -1241,6 +1241,9 @@ int mlx4_ib_send_to_wire(struct mlx4_ib_dev *dev, int slave, u8 port,
 	wr.send_flags = IB_SEND_SIGNALED;
 	if (s_mac)
 		memcpy(to_mah(ah)->av.eth.s_mac, s_mac, 6);
+	if (vlan_id < 0x1000)
+		vlan_id |= (attr->sl & 7) << 13;
+	to_mah(ah)->av.eth.vlan = cpu_to_be16(vlan_id);
 
 
 	ret = ib_post_send(send_qp, &wr, &bad_wr);
@@ -1277,6 +1280,7 @@ static void mlx4_ib_multiplex_mad(struct mlx4_ib_demux_pv_ctx *ctx, struct ib_wc
 	u8 *slave_id;
 	int slave;
 	int port;
+	u16 vlan_id;
 
 	/* Get slave that sent this packet */
 	if (wc->src_qp < dev->dev->phys_caps.base_proxy_sqpn ||
@@ -1362,10 +1366,10 @@ static void mlx4_ib_multiplex_mad(struct mlx4_ib_demux_pv_ctx *ctx, struct ib_wc
 		return;
 	ah_attr.port_num = port;
 	memcpy(ah_attr.dmac, tunnel->hdr.mac, 6);
-	ah_attr.vlan_id = be16_to_cpu(tunnel->hdr.vlan);
+	vlan_id = be16_to_cpu(tunnel->hdr.vlan);
 	/* if slave have default vlan use it */
 	mlx4_get_slave_default_vlan(dev->dev, ctx->port, slave,
-				    &ah_attr.vlan_id, &ah_attr.sl);
+				    &vlan_id, &ah_attr.sl);
 
 	mlx4_ib_send_to_wire(dev, slave, ctx->port,
 			     is_proxy_qp0(dev, wc->src_qp, slave) ?
@@ -1373,7 +1377,7 @@ static void mlx4_ib_multiplex_mad(struct mlx4_ib_demux_pv_ctx *ctx, struct ib_wc
 			     be16_to_cpu(tunnel->hdr.pkey_index),
 			     be32_to_cpu(tunnel->hdr.remote_qpn),
 			     be32_to_cpu(tunnel->hdr.qkey),
-			     &ah_attr, wc->smac, &tunnel->mad);
+			     &ah_attr, wc->smac, vlan_id, &tunnel->mad);
 }
 
 static int mlx4_ib_alloc_pv_bufs(struct mlx4_ib_demux_pv_ctx *ctx,
diff --git a/drivers/infiniband/hw/mlx4/mcg.c b/drivers/infiniband/hw/mlx4/mcg.c
index ed327e6..86bc158 100644
--- a/drivers/infiniband/hw/mlx4/mcg.c
+++ b/drivers/infiniband/hw/mlx4/mcg.c
@@ -217,7 +217,7 @@ static int send_mad_to_wire(struct mlx4_ib_demux_ctx *ctx, struct ib_mad *mad)
 	spin_unlock(&dev->sm_lock);
 	return mlx4_ib_send_to_wire(dev, mlx4_master_func_num(dev->dev),
 				    ctx->port, IB_QPT_GSI, 0, 1, IB_QP1_QKEY,
-				    &ah_attr, NULL, mad);
+				    &ah_attr, NULL, 0xffff, mad);
 }
 
 static int send_mad_to_slave(int slave, struct mlx4_ib_demux_ctx *ctx,
diff --git a/drivers/infiniband/hw/mlx4/mlx4_ib.h b/drivers/infiniband/hw/mlx4/mlx4_ib.h
index 721540c..42fe035 100644
--- a/drivers/infiniband/hw/mlx4/mlx4_ib.h
+++ b/drivers/infiniband/hw/mlx4/mlx4_ib.h
@@ -761,7 +761,7 @@ int mlx4_ib_send_to_slave(struct mlx4_ib_dev *dev, int slave, u8 port,
 int mlx4_ib_send_to_wire(struct mlx4_ib_dev *dev, int slave, u8 port,
 			 enum ib_qp_type dest_qpt, u16 pkey_index, u32 remote_qpn,
 			 u32 qkey, struct ib_ah_attr *attr, u8 *s_mac,
-			 struct ib_mad *mad);
+			 u16 vlan_id, struct ib_mad *mad);
 
 __be64 mlx4_ib_get_new_demux_tid(struct mlx4_ib_demux_ctx *ctx);
 
diff --git a/drivers/infiniband/hw/mlx4/qp.c b/drivers/infiniband/hw/mlx4/qp.c
index b9ed4f1..6f6d0db 100644
--- a/drivers/infiniband/hw/mlx4/qp.c
+++ b/drivers/infiniband/hw/mlx4/qp.c
@@ -1351,11 +1351,12 @@ static int _mlx4_set_path(struct mlx4_ib_dev *dev, const struct ib_ah_attr *ah,
 static int mlx4_set_path(struct mlx4_ib_dev *dev, const struct ib_qp_attr *qp,
 			 enum ib_qp_attr_mask qp_attr_mask,
 			 struct mlx4_ib_qp *mqp,
-			 struct mlx4_qp_path *path, u8 port)
+			 struct mlx4_qp_path *path, u8 port,
+			 u16 vlan_id, u8 *smac)
 {
 	return _mlx4_set_path(dev, &qp->ah_attr,
-			      mlx4_mac_to_u64((u8 *)qp->smac),
-			      (qp_attr_mask & IB_QP_VID) ? qp->vlan_id : 0xffff,
+			      mlx4_mac_to_u64(smac),
+			      vlan_id,
 			      path, &mqp->pri, port);
 }
 
@@ -1366,9 +1367,8 @@ static int mlx4_set_alt_path(struct mlx4_ib_dev *dev,
 			     struct mlx4_qp_path *path, u8 port)
 {
 	return _mlx4_set_path(dev, &qp->alt_ah_attr,
-			      mlx4_mac_to_u64((u8 *)qp->alt_smac),
-			      (qp_attr_mask & IB_QP_ALT_VID) ?
-			      qp->alt_vlan_id : 0xffff,
+			      0,
+			      0xffff,
 			      path, &mqp->alt, port);
 }
 
@@ -1384,7 +1384,8 @@ static void update_mcg_macs(struct mlx4_ib_dev *dev, struct mlx4_ib_qp *qp)
 	}
 }
 
-static int handle_eth_ud_smac_index(struct mlx4_ib_dev *dev, struct mlx4_ib_qp *qp, u8 *smac,
+static int handle_eth_ud_smac_index(struct mlx4_ib_dev *dev,
+				    struct mlx4_ib_qp *qp,
 				    struct mlx4_qp_context *context)
 {
 	u64 u64_mac;
@@ -1524,9 +1525,34 @@ static int __mlx4_ib_modify_qp(struct ib_qp *ibqp,
 	}
 
 	if (attr_mask & IB_QP_AV) {
+		u8 port_num = attr_mask & IB_QP_PORT ? attr->port_num : qp->port;
+		union ib_gid gid;
+		struct ib_gid_attr gid_attr = {.gid_type = IB_GID_TYPE_IB};
+		u16 vlan = 0xffff;
+		u8 smac[ETH_ALEN];
+		int status = 0;
+
+		if (rdma_port_get_link_layer(&dev->ib_dev, qp->port) ==
+		    IB_LINK_LAYER_ETHERNET &&
+		    attr->ah_attr.ah_flags & IB_AH_GRH) {
+			int index = attr->ah_attr.grh.sgid_index;
+
+			rcu_read_lock();
+			status = ib_get_cached_gid(ibqp->device, port_num,
+						   index, &gid, &gid_attr);
+			if (!status && !memcmp(&gid, &zgid, sizeof(gid)))
+				status = -ENOENT;
+			if (!status) {
+				vlan = rdma_vlan_dev_vlan_id(gid_attr.ndev);
+				memcpy(smac, gid_attr.ndev->dev_addr, ETH_ALEN);
+			}
+			rcu_read_unlock();
+		}
+		if (status)
+			goto out;
+
 		if (mlx4_set_path(dev, attr, attr_mask, qp, &context->pri_path,
-				  attr_mask & IB_QP_PORT ?
-				  attr->port_num : qp->port))
+				  port_num, vlan, smac))
 			goto out;
 
 		optpar |= (MLX4_QP_OPTPAR_PRIMARY_ADDR_PATH |
@@ -1663,7 +1689,7 @@ static int __mlx4_ib_modify_qp(struct ib_qp *ibqp,
 			if (qp->mlx4_ib_qp_type == MLX4_IB_QPT_UD ||
 			    qp->mlx4_ib_qp_type == MLX4_IB_QPT_PROXY_GSI ||
 			    qp->mlx4_ib_qp_type == MLX4_IB_QPT_TUN_GSI) {
-				err = handle_eth_ud_smac_index(dev, qp, (u8 *)attr->smac, context);
+				err = handle_eth_ud_smac_index(dev, qp, context);
 				if (err) {
 					err = -EINVAL;
 					goto out;
@@ -2158,6 +2184,8 @@ static int build_mlx_header(struct mlx4_ib_sqp *sqp, struct ib_send_wr *wr,
 						be32_to_cpu(ah->av.ib.port_pd) >> 24,
 						ah->av.ib.gid_index, &sgid,
 						NULL);
+			if (!err && !memcmp(&sgid, &zgid, sizeof(sgid)))
+				err = -ENOENT;
 			if (err)
 				return err;
 		}
diff --git a/drivers/infiniband/hw/ocrdma/ocrdma.h b/drivers/infiniband/hw/ocrdma/ocrdma.h
index c9780d9..16ee36e 100644
--- a/drivers/infiniband/hw/ocrdma/ocrdma.h
+++ b/drivers/infiniband/hw/ocrdma/ocrdma.h
@@ -36,6 +36,7 @@
 #include <rdma/ib_verbs.h>
 #include <rdma/ib_user_verbs.h>
 #include <rdma/ib_addr.h>
+#include <rdma/ib_cache.h>
 
 #include <be_roce.h>
 #include "ocrdma_sli.h"
diff --git a/drivers/infiniband/hw/ocrdma/ocrdma_ah.c b/drivers/infiniband/hw/ocrdma/ocrdma_ah.c
index d812904..7ecd230 100644
--- a/drivers/infiniband/hw/ocrdma/ocrdma_ah.c
+++ b/drivers/infiniband/hw/ocrdma/ocrdma_ah.c
@@ -41,10 +41,9 @@
 
 static inline int set_av_attr(struct ocrdma_dev *dev, struct ocrdma_ah *ah,
 			struct ib_ah_attr *attr, union ib_gid *sgid,
-			int pdid, bool *isvlan)
+			int pdid, bool *isvlan, u16 vlan_tag)
 {
 	int status = 0;
-	u16 vlan_tag;
 	struct ocrdma_eth_vlan eth;
 	struct ocrdma_grh grh;
 	int eth_sz;
@@ -53,7 +52,6 @@ static inline int set_av_attr(struct ocrdma_dev *dev, struct ocrdma_ah *ah,
 	memset(&grh, 0, sizeof(grh));
 
 	/* VLAN */
-	vlan_tag = attr->vlan_id;
 	if (!vlan_tag || (vlan_tag > 0xFFF))
 		vlan_tag = dev->pvid;
 	if (vlan_tag && (vlan_tag < 0x1000)) {
@@ -94,9 +92,11 @@ static inline int set_av_attr(struct ocrdma_dev *dev, struct ocrdma_ah *ah,
 struct ib_ah *ocrdma_create_ah(struct ib_pd *ibpd, struct ib_ah_attr *attr)
 {
 	u32 *ahid_addr;
-	bool isvlan = false;
 	int status;
 	struct ocrdma_ah *ah;
+	bool isvlan = false;
+	u16 vlan_tag = 0xffff;
+	struct ib_gid_attr sgid_attr;
 	struct ocrdma_pd *pd = get_ocrdma_pd(ibpd);
 	struct ocrdma_dev *dev = get_ocrdma_dev(ibpd->device);
 	union ib_gid sgid;
@@ -114,16 +114,22 @@ struct ib_ah *ocrdma_create_ah(struct ib_pd *ibpd, struct ib_ah_attr *attr)
 	if (status)
 		goto av_err;
 
-	status = ocrdma_query_gid(&dev->ibdev, 1, attr->grh.sgid_index, &sgid);
+	rcu_read_lock();
+	status = ib_get_cached_gid(&dev->ibdev, 1, attr->grh.sgid_index, &sgid,
+				   &sgid_attr);
 	if (status) {
 		pr_err("%s(): Failed to query sgid, status = %d\n",
 		      __func__, status);
 		goto av_conf_err;
 	}
+	if (sgid_attr.ndev && is_vlan_dev(sgid_attr.ndev))
+		vlan_tag = vlan_dev_vlan_id(sgid_attr.ndev);
+	rcu_read_unlock();
 
 	if (pd->uctx) {
 		status = rdma_addr_find_dmac_by_grh(&sgid, &attr->grh.dgid,
-                                        attr->dmac, &attr->vlan_id);
+						    attr->dmac, &vlan_tag,
+						    sgid_attr.ndev->ifindex);
 		if (status) {
 			pr_err("%s(): Failed to resolve dmac from gid." 
 				"status = %d\n", __func__, status);
@@ -131,7 +137,7 @@ struct ib_ah *ocrdma_create_ah(struct ib_pd *ibpd, struct ib_ah_attr *attr)
 		}
 	}
 
-	status = set_av_attr(dev, ah, attr, &sgid, pd->id, &isvlan);
+	status = set_av_attr(dev, ah, attr, &sgid, pd->id, &isvlan, vlan_tag);
 	if (status)
 		goto av_conf_err;
 
diff --git a/drivers/infiniband/hw/ocrdma/ocrdma_hw.c b/drivers/infiniband/hw/ocrdma/ocrdma_hw.c
index 0c9e959..e5f0244 100644
--- a/drivers/infiniband/hw/ocrdma/ocrdma_hw.c
+++ b/drivers/infiniband/hw/ocrdma/ocrdma_hw.c
@@ -2428,7 +2428,8 @@ static int ocrdma_set_av_params(struct ocrdma_qp *qp,
 	int status;
 	struct ib_ah_attr *ah_attr = &attrs->ah_attr;
 	union ib_gid sgid, zgid;
-	u32 vlan_id;
+	struct ib_gid_attr sgid_attr;
+	u32 vlan_id = 0xffff;
 	u8 mac_addr[6];
 	struct ocrdma_dev *dev = get_ocrdma_dev(qp->ibqp.device);
 
@@ -2446,10 +2447,15 @@ static int ocrdma_set_av_params(struct ocrdma_qp *qp,
 	cmd->flags |= OCRDMA_QP_PARA_FLOW_LBL_VALID;
 	memcpy(&cmd->params.dgid[0], &ah_attr->grh.dgid.raw[0],
 	       sizeof(cmd->params.dgid));
-	status = ocrdma_query_gid(&dev->ibdev, 1,
-			ah_attr->grh.sgid_index, &sgid);
-	if (status)
-		return status;
+
+	rcu_read_lock();
+	status = ib_get_cached_gid(&dev->ibdev, 1, ah_attr->grh.sgid_index,
+				   &sgid, &sgid_attr);
+	if (!status) {
+		vlan_id = rdma_vlan_dev_vlan_id(sgid_attr.ndev);
+		memcpy(mac_addr, sgid_attr.ndev->dev_addr, ETH_ALEN);
+	}
+	rcu_read_unlock();
 
 	memset(&zgid, 0, sizeof(zgid));
 	if (!memcmp(&sgid, &zgid, sizeof(zgid)))
@@ -2467,7 +2473,6 @@ static int ocrdma_set_av_params(struct ocrdma_qp *qp,
 	ocrdma_cpu_to_le32(&cmd->params.sgid[0], sizeof(cmd->params.sgid));
 	cmd->params.vlan_dmac_b4_to_b5 = mac_addr[4] | (mac_addr[5] << 8);
 	if (attr_mask & IB_QP_VID) {
-		vlan_id = attrs->vlan_id;
 		cmd->params.vlan_dmac_b4_to_b5 |=
 		    vlan_id << OCRDMA_QP_PARAMS_VLAN_SHIFT;
 		cmd->flags |= OCRDMA_QP_PARA_VLAN_EN_VALID;
diff --git a/include/rdma/ib_addr.h b/include/rdma/ib_addr.h
index 3cf32d1..0dfaaa7 100644
--- a/include/rdma/ib_addr.h
+++ b/include/rdma/ib_addr.h
@@ -112,7 +112,7 @@ int rdma_addr_size(struct sockaddr *addr);
 
 int rdma_addr_find_smac_by_sgid(union ib_gid *sgid, u8 *smac, u16 *vlan_id);
 int rdma_addr_find_dmac_by_grh(union ib_gid *sgid, union ib_gid *dgid, u8 *smac,
-			       u16 *vlan_id);
+			       u16 *vlan_id, int if_index);
 
 static inline u16 ib_addr_get_pkey(struct rdma_dev_addr *dev_addr)
 {
diff --git a/include/rdma/ib_sa.h b/include/rdma/ib_sa.h
index 6a1b994..eea01e6 100644
--- a/include/rdma/ib_sa.h
+++ b/include/rdma/ib_sa.h
@@ -154,9 +154,7 @@ struct ib_sa_path_rec {
 	u8           packet_life_time_selector;
 	u8           packet_life_time;
 	u8           preference;
-	u8           smac[ETH_ALEN];
 	u8           dmac[ETH_ALEN];
-	u16          vlan_id;
 	int	     ifindex;
 	struct net  *net;
 };
diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h
index 2d662e0..d8e4dd0 100644
--- a/include/rdma/ib_verbs.h
+++ b/include/rdma/ib_verbs.h
@@ -74,6 +74,8 @@ enum ib_gid_type {
 	IB_GID_TYPE_SIZE
 };
 
+#define ROCE_V2_UDP_DPORT	1021
+
 struct ib_gid_attr {
 	enum ib_gid_type	gid_type;
 	struct net_device	*ndev;
@@ -669,7 +671,6 @@ struct ib_ah_attr {
 	u8			ah_flags;
 	u8			port_num;
 	u8			dmac[ETH_ALEN];
-	u16			vlan_id;
 };
 
 enum ib_wc_status {
@@ -927,10 +928,6 @@ enum ib_qp_attr_mask {
 	IB_QP_PATH_MIG_STATE		= (1<<18),
 	IB_QP_CAP			= (1<<19),
 	IB_QP_DEST_QPN			= (1<<20),
-	IB_QP_SMAC			= (1<<21),
-	IB_QP_ALT_SMAC			= (1<<22),
-	IB_QP_VID			= (1<<23),
-	IB_QP_ALT_VID			= (1<<24),
 };
 
 enum ib_qp_state {
@@ -980,10 +977,6 @@ struct ib_qp_attr {
 	u8			rnr_retry;
 	u8			alt_port_num;
 	u8			alt_timeout;
-	u8			smac[ETH_ALEN];
-	u8			alt_smac[ETH_ALEN];
-	u16			vlan_id;
-	u16			alt_vlan_id;
 };
 
 enum ib_wr_opcode {
-- 
2.1.0


* [PATCH v3 for-next 12/33] IB/core: Add gid_type to path and rdma_id_private
       [not found] ` <1427318422-12004-1-git-send-email-somnath.kotur-laKkSmNT4hbQT0dZR+AlfA@public.gmane.org>
                     ` (10 preceding siblings ...)
  2015-03-25 21:20   ` [PATCH v3 for-next 11/33] IB/core: Modify ib_verbs and cma in order to use roce_gid_cache Somnath Kotur
@ 2015-03-25 21:20   ` Somnath Kotur
  2015-03-25 21:20   ` [PATCH v3 for-next 13/33] IB/core: Add rdma_network_type to wc Somnath Kotur
                     ` (20 subsequent siblings)
  32 siblings, 0 replies; 82+ messages in thread
From: Somnath Kotur @ 2015-03-25 21:20 UTC (permalink / raw)
  To: roland-DgEjT+Ai2ygdnm+yROfE0A
  Cc: linux-rdma-u79uwXL29TY76Z2rM5mHXA, Matan Barak, Somnath Kotur

From: Matan Barak <matanb-VPRAkNaXOzVWk0Htik3J/w@public.gmane.org>

When using the RDMA CM, we want to take the gid_type from
the rdma_id_private. This is mandatory before adding
an API from user-space/configfs that sets
the gid_type of a CM connection.
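
A minimal sketch of the resulting lookup (local variable names here
are illustrative only): the path record's gid_type now feeds the
cached GID search instead of a hard-coded IB_GID_TYPE_IB:

	struct ib_sa_path_rec rec;
	u8 port;
	u16 index;
	int ret;

	rec.gid_type = IB_GID_TYPE_IB;	/* default, as before */
	/* ... fill in rec.sgid, rec.net and rec.ifindex ... */
	ret = ib_find_cached_gid(device, &rec.sgid, rec.gid_type,
				 rec.net, rec.ifindex, &port, &index);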

Signed-off-by: Matan Barak <matanb-VPRAkNaXOzVWk0Htik3J/w@public.gmane.org>
Signed-off-by: Somnath Kotur <somnath.kotur-laKkSmNT4hbQT0dZR+AlfA@public.gmane.org>
---
 drivers/infiniband/core/cm.c              | 19 ++++++++++++++-----
 drivers/infiniband/core/cma.c             |  2 ++
 drivers/infiniband/core/sa_query.c        |  3 ++-
 drivers/infiniband/core/uverbs_marshall.c |  1 +
 include/rdma/ib_sa.h                      |  1 +
 5 files changed, 20 insertions(+), 6 deletions(-)

diff --git a/drivers/infiniband/core/cm.c b/drivers/infiniband/core/cm.c
index 7974e74..22dac05 100644
--- a/drivers/infiniband/core/cm.c
+++ b/drivers/infiniband/core/cm.c
@@ -358,9 +358,8 @@ static int cm_init_av_by_path(struct ib_sa_path_rec *path, struct cm_av *av)
 	read_lock_irqsave(&cm.device_lock, flags);
 	list_for_each_entry(cm_dev, &cm.device_list, list) {
 		if (!ib_find_cached_gid(cm_dev->ib_device, &path->sgid,
-					IB_GID_TYPE_IB, path->net,
-					path->ifindex,
-					&p, NULL)) {
+					path->gid_type, path->net,
+					path->ifindex, &p, NULL)) {
 			port = cm_dev->port[p-1];
 			break;
 		}
@@ -1521,6 +1520,8 @@ static int cm_req_handler(struct cm_work *work)
 	struct ib_cm_id *cm_id;
 	struct cm_id_private *cm_id_priv, *listen_cm_id_priv;
 	struct cm_req_msg *req_msg;
+	union ib_gid gid;
+	struct ib_gid_attr gid_attr;
 	int ret;
 
 	req_msg = (struct cm_req_msg *)work->mad_recv_wc->recv_buf.mad;
@@ -1560,11 +1561,19 @@ static int cm_req_handler(struct cm_work *work)
 	cm_format_paths_from_req(req_msg, &work->path[0], &work->path[1]);
 
 	memcpy(work->path[0].dmac, cm_id_priv->av.ah_attr.dmac, ETH_ALEN);
-	ret = cm_init_av_by_path(&work->path[0], &cm_id_priv->av);
+	ret = ib_get_cached_gid(work->port->cm_dev->ib_device,
+				work->port->port_num,
+				cm_id_priv->av.ah_attr.grh.sgid_index,
+				&gid, &gid_attr);
+	if (!ret) {
+		work->path[0].gid_type = gid_attr.gid_type;
+		ret = cm_init_av_by_path(&work->path[0], &cm_id_priv->av);
+	}
 	if (ret) {
 		ib_get_cached_gid(work->port->cm_dev->ib_device,
 				  work->port->port_num, 0, &work->path[0].sgid,
-				  NULL);
+				  &gid_attr);
+		work->path[0].gid_type = gid_attr.gid_type;
 		ib_send_cm_rej(cm_id, IB_CM_REJ_INVALID_GID,
 			       &work->path[0].sgid, sizeof work->path[0].sgid,
 			       NULL, 0);
diff --git a/drivers/infiniband/core/cma.c b/drivers/infiniband/core/cma.c
index 659676c..9afa410 100644
--- a/drivers/infiniband/core/cma.c
+++ b/drivers/infiniband/core/cma.c
@@ -146,6 +146,7 @@ struct rdma_id_private {
 	u8			tos;
 	u8			reuseaddr;
 	u8			afonly;
+	enum ib_gid_type	gid_type;
 };
 
 struct cma_multicast {
@@ -1936,6 +1937,7 @@ static int cma_resolve_iboe_route(struct rdma_id_private *id_priv)
 		ndev = dev_get_by_index(&init_net, addr->dev_addr.bound_dev_if);
 		route->path_rec->net = &init_net;
 		route->path_rec->ifindex = addr->dev_addr.bound_dev_if;
+		route->path_rec->gid_type = id_priv->gid_type;
 	}
 	if (!ndev) {
 		ret = -ENODEV;
diff --git a/drivers/infiniband/core/sa_query.c b/drivers/infiniband/core/sa_query.c
index 705b6b8..f770049 100644
--- a/drivers/infiniband/core/sa_query.c
+++ b/drivers/infiniband/core/sa_query.c
@@ -546,7 +546,7 @@ int ib_init_ah_from_path(struct ib_device *device, u8 port_num,
 		ah_attr->ah_flags = IB_AH_GRH;
 		ah_attr->grh.dgid = rec->dgid;
 
-		ret = ib_find_cached_gid(device, &rec->sgid, IB_GID_TYPE_IB,
+		ret = ib_find_cached_gid(device, &rec->sgid, rec->gid_type,
 					 rec->net, rec->ifindex, &port_num,
 					 &gid_index);
 		if (ret)
@@ -676,6 +676,7 @@ static void ib_sa_path_rec_callback(struct ib_sa_query *sa_query,
 			  mad->data, &rec);
 		rec.net = NULL;
 		rec.ifindex = 0;
+		rec.gid_type = IB_GID_TYPE_IB;
 		memset(rec.dmac, 0, ETH_ALEN);
 		query->callback(status, &rec, query->context);
 	} else
diff --git a/drivers/infiniband/core/uverbs_marshall.c b/drivers/infiniband/core/uverbs_marshall.c
index 7d2f14c..af020f8 100644
--- a/drivers/infiniband/core/uverbs_marshall.c
+++ b/drivers/infiniband/core/uverbs_marshall.c
@@ -144,5 +144,6 @@ void ib_copy_path_rec_from_user(struct ib_sa_path_rec *dst,
 	memset(dst->dmac, 0, sizeof(dst->dmac));
 	dst->net = NULL;
 	dst->ifindex = 0;
+	dst->gid_type = IB_GID_TYPE_IB;
 }
 EXPORT_SYMBOL(ib_copy_path_rec_from_user);
diff --git a/include/rdma/ib_sa.h b/include/rdma/ib_sa.h
index eea01e6..61bc231 100644
--- a/include/rdma/ib_sa.h
+++ b/include/rdma/ib_sa.h
@@ -157,6 +157,7 @@ struct ib_sa_path_rec {
 	u8           dmac[ETH_ALEN];
 	int	     ifindex;
 	struct net  *net;
+	enum ib_gid_type gid_type;
 };
 
 #define IB_SA_MCMEMBER_REC_MGID				IB_SA_COMP_MASK( 0)
-- 
2.1.0


* [PATCH v3 for-next 13/33] IB/core: Add rdma_network_type to wc
       [not found] ` <1427318422-12004-1-git-send-email-somnath.kotur-laKkSmNT4hbQT0dZR+AlfA@public.gmane.org>
                     ` (11 preceding siblings ...)
  2015-03-25 21:20   ` [PATCH v3 for-next 12/33] IB/core: Add gid_type to path and rdma_id_private Somnath Kotur
@ 2015-03-25 21:20   ` Somnath Kotur
  2015-03-25 21:20   ` [PATCH v3 for-next 14/33] IB/cma: Add configfs for rdma_cm Somnath Kotur
                     ` (19 subsequent siblings)
  32 siblings, 0 replies; 82+ messages in thread
From: Somnath Kotur @ 2015-03-25 21:20 UTC (permalink / raw)
  To: roland-DgEjT+Ai2ygdnm+yROfE0A
  Cc: linux-rdma-u79uwXL29TY76Z2rM5mHXA, Matan Barak, Somnath Kotur

From: Matan Barak <matanb-VPRAkNaXOzVWk0Htik3J/w@public.gmane.org>

Providers should tell IB core a work completion's network type.
This is used in order to search for the proper GID in the
GID table. When using HCAs that can't provide this info,
IB core inspects the packet headers itself and extracts
the GID type from them.
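
A consumer polling a CQ can consume the new information roughly like
this (a sketch; error handling omitted):

	struct ib_wc wc;
	enum rdma_network_type net_type = RDMA_NETWORK_IB;

	if (wc.wc_flags & IB_WC_WITH_NETWORK_HDR_TYPE)
		net_type = wc.network_hdr_type;	/* provider-supplied */
	/* otherwise ib_init_ah_from_wc() derives it by calling
	 * ib_get_net_type_by_grh() on the received header
	 */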

Signed-off-by: Matan Barak <matanb-VPRAkNaXOzVWk0Htik3J/w@public.gmane.org>
Signed-off-by: Somnath Kotur <somnath.kotur-laKkSmNT4hbQT0dZR+AlfA@public.gmane.org>
---
 drivers/infiniband/core/verbs.c | 106 ++++++++++++++++++++++++++++++++++++++--
 include/rdma/ib_verbs.h         |  30 ++++++++++++
 2 files changed, 131 insertions(+), 5 deletions(-)

diff --git a/drivers/infiniband/core/verbs.c b/drivers/infiniband/core/verbs.c
index 2f5fd7a..2e7ccad 100644
--- a/drivers/infiniband/core/verbs.c
+++ b/drivers/infiniband/core/verbs.c
@@ -195,8 +195,84 @@ struct ib_ah *ib_create_ah(struct ib_pd *pd, struct ib_ah_attr *ah_attr)
 }
 EXPORT_SYMBOL(ib_create_ah);
 
+static int ib_get_grh_header_version(const void *h)
+{
+	const struct iphdr *ip4h = (struct iphdr *)(h + 20);
+	struct iphdr ip4h_checked;
+	const struct ipv6hdr *ip6h = (struct ipv6hdr *)h;
+
+	if (ip6h->version != 6)
+		return (ip4h->version == 4) ? 4 : 0;
+	/* version may be 6 or 4 */
+	if (ip4h->ihl != 5) /* IPv4 header length must be 5 for RoCE v2 */
+		return 6;
+	/* Verify checksum.
+	 * We can't write on scattered buffers so we need to copy to a
+	 * temp buffer.
+	 */
+	memcpy(&ip4h_checked, ip4h, sizeof(ip4h_checked));
+	ip4h_checked.check = 0;
+	ip4h_checked.check = ip_fast_csum((u8 *)&ip4h_checked, 5);
+	/* if IPv4 header checksum is OK, believe it */
+	if (ip4h->check == ip4h_checked.check)
+		return 4;
+	return 6;
+}
+
+static int ib_get_dgid_sgid_by_grh(const void *h,
+				   enum rdma_network_type net_type,
+				   union ib_gid *dgid, union ib_gid *sgid)
+{
+	switch (net_type) {
+	case RDMA_NETWORK_IPV4: {
+		const struct iphdr *ip4h = (struct iphdr *)(h + 20);
+
+		ipv6_addr_set_v4mapped(ip4h->daddr, (struct in6_addr *)dgid);
+		ipv6_addr_set_v4mapped(ip4h->saddr, (struct in6_addr *)sgid);
+		return 0;
+	}
+	case RDMA_NETWORK_IPV6: {
+		struct ipv6hdr *ip6h = (struct ipv6hdr *)h;
+
+		memcpy(dgid, &ip6h->daddr, sizeof(*dgid));
+		memcpy(sgid, &ip6h->saddr, sizeof(*sgid));
+		return 0;
+	}
+	case RDMA_NETWORK_IB: {
+		struct ib_grh *grh = (struct ib_grh *)h;
+
+		memcpy(dgid, &grh->dgid, sizeof(*dgid));
+		memcpy(sgid, &grh->sgid, sizeof(*sgid));
+		return 0;
+	}
+	}
+
+	return -EINVAL;
+}
+
+static enum rdma_network_type ib_get_net_type_by_grh(struct ib_device *device,
+						     u8 port_num,
+						     const struct ib_grh *grh)
+{
+	int grh_version;
+
+	if (rdma_port_get_link_layer(device, port_num) == IB_LINK_LAYER_INFINIBAND)
+		return RDMA_NETWORK_IB;
+
+	grh_version = ib_get_grh_header_version(grh);
+
+	if (grh_version == 4)
+		return RDMA_NETWORK_IPV4;
+
+	if (grh->next_hdr == IPPROTO_UDP)
+		return RDMA_NETWORK_IPV6;
+
+	return RDMA_NETWORK_IB;
+}
+
 struct find_gid_index_context {
 	u16 vlan_id;
+	enum ib_gid_type gid_type;
 };
 
 static bool find_gid_index(const union ib_gid *gid,
@@ -206,6 +282,9 @@ static bool find_gid_index(const union ib_gid *gid,
 	struct find_gid_index_context *ctx =
 		(struct find_gid_index_context *)context;
 
+	if (ctx->gid_type != gid_attr->gid_type)
+		return false;
+
 	if ((!!(ctx->vlan_id != 0xffff) == !is_vlan_dev(gid_attr->ndev)) ||
 	    (is_vlan_dev(gid_attr->ndev) &&
 	     vlan_dev_vlan_id(gid_attr->ndev) != ctx->vlan_id))
@@ -216,9 +295,11 @@ static bool find_gid_index(const union ib_gid *gid,
 
 static int get_sgid_index_from_eth(struct ib_device *device, u8 port_num,
 				   u16 vlan_id, union ib_gid *sgid,
+				   enum ib_gid_type gid_type,
 				   u16 *gid_index)
 {
-	struct find_gid_index_context context = {.vlan_id = vlan_id};
+	struct find_gid_index_context context = {.vlan_id = vlan_id,
+						 .gid_type = gid_type};
 
 	return ib_find_gid_by_filter(device, sgid, port_num, find_gid_index,
 				     &context, gid_index);
@@ -232,9 +313,24 @@ int ib_init_ah_from_wc(struct ib_device *device, u8 port_num, struct ib_wc *wc,
 	int ret;
 	int is_eth = (rdma_port_get_link_layer(device, port_num) ==
 			IB_LINK_LAYER_ETHERNET);
+	enum rdma_network_type net_type = RDMA_NETWORK_IB;
+	enum ib_gid_type gid_type = IB_GID_TYPE_IB;
+	union ib_gid dgid;
+	union ib_gid sgid;
 
 	memset(ah_attr, 0, sizeof *ah_attr);
 	if (is_eth) {
+		if (wc->wc_flags & IB_WC_WITH_NETWORK_HDR_TYPE)
+			net_type = wc->network_hdr_type;
+		else
+			net_type = ib_get_net_type_by_grh(device, port_num, grh);
+		gid_type = ib_network_to_gid_type(net_type);
+	}
+	ret = ib_get_dgid_sgid_by_grh(grh, net_type, &dgid, &sgid);
+	if (ret)
+		return ret;
+
+	if (is_eth) {
 		u16 vlan_id = wc->wc_flags & IB_WC_WITH_VLAN ?
 				wc->vlan_id : 0xffff;
 
@@ -243,7 +339,7 @@ int ib_init_ah_from_wc(struct ib_device *device, u8 port_num, struct ib_wc *wc,
 
 		if (!(wc->wc_flags & IB_WC_WITH_SMAC) ||
 		    !(wc->wc_flags & IB_WC_WITH_VLAN)) {
-			ret = rdma_addr_find_dmac_by_grh(&grh->dgid, &grh->sgid,
+			ret = rdma_addr_find_dmac_by_grh(&dgid, &sgid,
 							 ah_attr->dmac,
 							 wc->wc_flags & IB_WC_WITH_VLAN ?
 							 NULL : &vlan_id,
@@ -253,7 +349,7 @@ int ib_init_ah_from_wc(struct ib_device *device, u8 port_num, struct ib_wc *wc,
 		}
 
 		ret = get_sgid_index_from_eth(device, port_num, vlan_id,
-					      &grh->dgid, &gid_index);
+					      &dgid, gid_type, &gid_index);
 		if (ret)
 			return ret;
 
@@ -268,10 +364,10 @@ int ib_init_ah_from_wc(struct ib_device *device, u8 port_num, struct ib_wc *wc,
 
 	if (wc->wc_flags & IB_WC_GRH) {
 		ah_attr->ah_flags = IB_AH_GRH;
-		ah_attr->grh.dgid = grh->sgid;
+		ah_attr->grh.dgid = sgid;
 
 		if (!is_eth) {
-			ret = ib_find_cached_gid_by_port(device, &grh->dgid,
+			ret = ib_find_cached_gid_by_port(device, &dgid,
 							 IB_GID_TYPE_IB,
 							 port_num, NULL, 0,
 							 &gid_index);
diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h
index d8e4dd0..9de9e62 100644
--- a/include/rdma/ib_verbs.h
+++ b/include/rdma/ib_verbs.h
@@ -50,6 +50,7 @@
 #include <linux/workqueue.h>
 #include <net/net_namespace.h>
 #include <uapi/linux/if_ether.h>
+#include <net/ipv6.h>
 
 #include <linux/atomic.h>
 #include <linux/mmu_notifier.h>
@@ -118,6 +119,33 @@ enum rdma_transport_type {
 __attribute_const__ enum rdma_transport_type
 rdma_node_get_transport(enum rdma_node_type node_type);
 
+enum rdma_network_type {
+	RDMA_NETWORK_IB,
+	RDMA_NETWORK_IPV4,
+	RDMA_NETWORK_IPV6
+};
+
+static inline enum ib_gid_type ib_network_to_gid_type(enum rdma_network_type network_type)
+{
+	if (network_type == RDMA_NETWORK_IPV4 ||
+	    network_type == RDMA_NETWORK_IPV6)
+		return IB_GID_TYPE_ROCE_V2;
+
+	return IB_GID_TYPE_IB;
+}
+
+static inline enum rdma_network_type ib_gid_to_network_type(enum ib_gid_type gid_type,
+							    union ib_gid *gid)
+{
+	if (gid_type == IB_GID_TYPE_IB)
+		return RDMA_NETWORK_IB;
+
+	if (ipv6_addr_v4mapped((struct in6_addr *)gid))
+		return RDMA_NETWORK_IPV4;
+	else
+		return RDMA_NETWORK_IPV6;
+}
+
 enum rdma_link_layer {
 	IB_LINK_LAYER_UNSPECIFIED,
 	IB_LINK_LAYER_INFINIBAND,
@@ -725,6 +753,7 @@ enum ib_wc_flags {
 	IB_WC_IP_CSUM_OK	= (1<<3),
 	IB_WC_WITH_SMAC		= (1<<4),
 	IB_WC_WITH_VLAN		= (1<<5),
+	IB_WC_WITH_NETWORK_HDR_TYPE	= (1<<6),
 };
 
 struct ib_wc {
@@ -747,6 +776,7 @@ struct ib_wc {
 	u8			port_num;	/* valid only for DR SMPs on switches */
 	u8			smac[ETH_ALEN];
 	u16			vlan_id;
+	u8			network_hdr_type;
 };
 
 enum ib_cq_notify_flags {
-- 
2.1.0


* [PATCH v3 for-next 14/33] IB/cma: Add configfs for rdma_cm
       [not found] ` <1427318422-12004-1-git-send-email-somnath.kotur-laKkSmNT4hbQT0dZR+AlfA@public.gmane.org>
                     ` (12 preceding siblings ...)
  2015-03-25 21:20   ` [PATCH v3 for-next 13/33] IB/core: Add rdma_network_type to wc Somnath Kotur
@ 2015-03-25 21:20   ` Somnath Kotur
  2015-03-25 21:20   ` [PATCH v3 for-next 15/33] IB/Core: Changes to the IB Core infrastructure for RoCEv2 support Somnath Kotur
                     ` (18 subsequent siblings)
  32 siblings, 0 replies; 82+ messages in thread
From: Somnath Kotur @ 2015-03-25 21:20 UTC (permalink / raw)
  To: roland-DgEjT+Ai2ygdnm+yROfE0A
  Cc: linux-rdma-u79uwXL29TY76Z2rM5mHXA, Matan Barak, Somnath Kotur

From: Matan Barak <matanb-VPRAkNaXOzVWk0Htik3J/w@public.gmane.org>

Users would like to control the behaviour of rdma_cm.
For example, old applications that don't set the
required RoCE GID type could still be executed on RoCE V2
networks. In order to support this configuration,
we implement a configfs for rdma_cm.

In order to use the configfs, one needs to mount it and
mkdir <IB device name> inside the rdma_cm directory.

The patch adds support for a single configuration file,
default_roce_mode. The mode can either be IB & RoCEv1 or
RoCEv2.
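
As a usage sketch (the device name mlx4_0 and the mount point are
examples only; the accepted mode strings are whatever reading
default_roce_mode reports):

	# mount -t configfs none /sys/kernel/config
	# mkdir /sys/kernel/config/rdma_cm/mlx4_0
	# cat /sys/kernel/config/rdma_cm/mlx4_0/default_roce_mode
	# echo -n "<mode>" > /sys/kernel/config/rdma_cm/mlx4_0/default_roce_mode

Note the -n: store_default_roce_mode() matches the raw buffer against
the mode strings with strcmp(), so a trailing newline would not match.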

Signed-off-by: Matan Barak <matanb-VPRAkNaXOzVWk0Htik3J/w@public.gmane.org>
Signed-off-by: Somnath Kotur <somnath.kotur-laKkSmNT4hbQT0dZR+AlfA@public.gmane.org>
---
 drivers/infiniband/Kconfig               |   5 +
 drivers/infiniband/core/Makefile         |   2 +
 drivers/infiniband/core/cma.c            |  54 +++++++-
 drivers/infiniband/core/cma_configfs.c   | 222 +++++++++++++++++++++++++++++++
 drivers/infiniband/core/core_priv.h      |  15 +++
 drivers/infiniband/core/roce_gid_cache.c |  13 ++
 6 files changed, 307 insertions(+), 4 deletions(-)
 create mode 100644 drivers/infiniband/core/cma_configfs.c

diff --git a/drivers/infiniband/Kconfig b/drivers/infiniband/Kconfig
index b899531..20bda60 100644
--- a/drivers/infiniband/Kconfig
+++ b/drivers/infiniband/Kconfig
@@ -54,6 +54,11 @@ config INFINIBAND_ADDR_TRANS
 	depends on INFINIBAND
 	default y
 
+config INFINIBAND_ADDR_TRANS_CONFIGFS
+	bool
+	depends on INFINIBAND_ADDR_TRANS && CONFIGFS_FS
+	default y
+
 source "drivers/infiniband/hw/mthca/Kconfig"
 source "drivers/infiniband/hw/ipath/Kconfig"
 source "drivers/infiniband/hw/qib/Kconfig"
diff --git a/drivers/infiniband/core/Makefile b/drivers/infiniband/core/Makefile
index 2c94963..f6bc8c5 100644
--- a/drivers/infiniband/core/Makefile
+++ b/drivers/infiniband/core/Makefile
@@ -24,6 +24,8 @@ iw_cm-y :=			iwcm.o iwpm_util.o iwpm_msg.o
 
 rdma_cm-y :=			cma.o
 
+rdma_cm-$(CONFIG_INFINIBAND_ADDR_TRANS_CONFIGFS) += cma_configfs.o
+
 rdma_ucm-y :=			ucma.o
 
 ib_addr-y :=			addr.o
diff --git a/drivers/infiniband/core/cma.c b/drivers/infiniband/core/cma.c
index 9afa410..8dec040 100644
--- a/drivers/infiniband/core/cma.c
+++ b/drivers/infiniband/core/cma.c
@@ -55,6 +55,7 @@
 #include <rdma/ib_cm.h>
 #include <rdma/ib_sa.h>
 #include <rdma/iw_cm.h>
+#include "core_priv.h"
 
 MODULE_AUTHOR("Sean Hefty");
 MODULE_DESCRIPTION("Generic RDMA CM Agent");
@@ -91,6 +92,7 @@ struct cma_device {
 	struct completion	comp;
 	atomic_t		refcount;
 	struct list_head	id_list;
+	enum ib_gid_type	default_gid_type;
 };
 
 struct rdma_bind_list {
@@ -103,6 +105,42 @@ enum {
 	CMA_OPTION_AFONLY,
 };
 
+void cma_ref_dev(struct cma_device *cma_dev)
+{
+	atomic_inc(&cma_dev->refcount);
+}
+
+struct cma_device *cma_enum_devices_by_ibdev(cma_device_filter	filter,
+					     void		*cookie)
+{
+	struct cma_device *cma_dev;
+	struct cma_device *found_cma_dev = NULL;
+
+	mutex_lock(&lock);
+
+	list_for_each_entry(cma_dev, &dev_list, list)
+		if (filter(cma_dev->device, cookie)) {
+			found_cma_dev = cma_dev;
+			break;
+		}
+
+	if (found_cma_dev)
+		cma_ref_dev(found_cma_dev);
+	mutex_unlock(&lock);
+	return found_cma_dev;
+}
+
+enum ib_gid_type cma_get_default_gid_type(struct cma_device *cma_dev)
+{
+	return cma_dev->default_gid_type;
+}
+
+void cma_set_default_gid_type(struct cma_device *cma_dev,
+			      enum ib_gid_type default_gid_type)
+{
+	cma_dev->default_gid_type = default_gid_type;
+}
+
 /*
  * Device removal can occur at anytime, so we need extra handling to
  * serialize notifying the user of device removal with other callbacks.
@@ -248,15 +286,16 @@ static inline void cma_set_ip_ver(struct cma_hdr *hdr, u8 ip_ver)
 static void cma_attach_to_dev(struct rdma_id_private *id_priv,
 			      struct cma_device *cma_dev)
 {
-	atomic_inc(&cma_dev->refcount);
+	cma_ref_dev(cma_dev);
 	id_priv->cma_dev = cma_dev;
+	id_priv->gid_type = cma_dev->default_gid_type;
 	id_priv->id.device = cma_dev->device;
 	id_priv->id.route.addr.dev_addr.transport =
 		rdma_node_get_transport(cma_dev->device->node_type);
 	list_add_tail(&id_priv->list, &cma_dev->id_list);
 }
 
-static inline void cma_deref_dev(struct cma_device *cma_dev)
+void cma_deref_dev(struct cma_device *cma_dev)
 {
 	if (atomic_dec_and_test(&cma_dev->refcount))
 		complete(&cma_dev->comp);
@@ -385,7 +424,7 @@ static int cma_acquire_dev(struct rdma_id_private *id_priv,
 
 			ret = ib_find_cached_gid_by_port(cma_dev->device,
 							 &iboe_gid,
-							 IB_GID_TYPE_IB,
+							 cma_dev->default_gid_type,
 							 port,
 							 &init_net,
 							 if_index,
@@ -418,7 +457,7 @@ static int cma_acquire_dev(struct rdma_id_private *id_priv,
 
 					ret = ib_find_cached_gid_by_port(cma_dev->device,
 									 &iboe_gid,
-									 IB_GID_TYPE_IB,
+									 cma_dev->default_gid_type,
 									 port,
 									 &init_net,
 									 if_index,
@@ -3521,6 +3560,7 @@ static void cma_add_one(struct ib_device *device)
 		return;
 
 	cma_dev->device = device;
+	cma_dev->default_gid_type = IB_GID_TYPE_IB;
 
 	init_completion(&cma_dev->comp);
 	atomic_set(&cma_dev->refcount, 1);
@@ -3701,6 +3741,9 @@ static int __init cma_init(void)
 
 	if (ibnl_add_client(RDMA_NL_RDMA_CM, RDMA_NL_RDMA_CM_NUM_OPS, cma_cb_table))
 		printk(KERN_WARNING "RDMA CMA: failed to add netlink callback\n");
+#if IS_ENABLED(CONFIG_INFINIBAND_ADDR_TRANS_CONFIGFS)
+	cma_configfs_init();
+#endif
 
 	return 0;
 
@@ -3714,6 +3757,9 @@ err:
 
 static void __exit cma_cleanup(void)
 {
+#if IS_ENABLED(CONFIG_INFINIBAND_ADDR_TRANS_CONFIGFS)
+	cma_configfs_exit();
+#endif
 	ibnl_remove_client(RDMA_NL_RDMA_CM);
 	ib_unregister_client(&cma_client);
 	unregister_netdevice_notifier(&cma_nb);
diff --git a/drivers/infiniband/core/cma_configfs.c b/drivers/infiniband/core/cma_configfs.c
new file mode 100644
index 0000000..9a87210
--- /dev/null
+++ b/drivers/infiniband/core/cma_configfs.c
@@ -0,0 +1,222 @@
+/*
+ * Copyright (c) 2015, Mellanox Technologies inc.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/configfs.h>
+#include <rdma/ib_verbs.h>
+#include "core_priv.h"
+
+struct cma_device;
+
+struct cma_dev_group {
+	struct config_item	item;
+};
+
+struct cma_configfs_attr {
+	struct configfs_attribute	attr;
+	ssize_t				(*show)(struct cma_device *cma_dev,
+						struct cma_dev_group *group,
+						char *buf);
+	ssize_t				(*store)(struct cma_device *cma_dev,
+						 struct cma_dev_group *group,
+						 const char *buf, size_t count);
+};
+
+static struct cma_dev_group *to_dev_group(struct config_item *item)
+{
+	return item ?
+		container_of(item, struct cma_dev_group, item) :
+		NULL;
+}
+
+static ssize_t show_default_roce_mode(struct cma_device *cma_dev,
+				      struct cma_dev_group *group,
+				      char *buf)
+{
+	return sprintf(buf, "%s",
+		       roce_gid_cache_type_str(cma_get_default_gid_type(cma_dev)));
+}
+
+static ssize_t store_default_roce_mode(struct cma_device *cma_dev,
+				       struct cma_dev_group *group,
+				       const char *buf, size_t count)
+{
+	int gid_type = roce_gid_cache_parse_gid_str(buf);
+
+	if (gid_type < 0)
+		return -EINVAL;
+
+	cma_set_default_gid_type(cma_dev, gid_type);
+
+	return strnlen(buf, count);
+}
+
+#define CMA_PARAM_ATTR_RW(_name)				\
+static struct cma_configfs_attr cma_configfs_attr_##_name =	\
+	__CONFIGFS_ATTR(_name, S_IRUGO | S_IWUSR, show_##_name, store_##_name)
+
+CMA_PARAM_ATTR_RW(default_roce_mode);
+
+static bool filter_by_name(struct ib_device *ib_dev, void *cookie)
+{
+	return !strcmp(ib_dev->name, cookie);
+}
+
+static ssize_t cma_configfs_attr_show(struct config_item *item,
+				      struct configfs_attribute *attr,
+				      char *buf)
+{
+	ssize_t ret = -EINVAL;
+	struct cma_device *cma_dev =
+		cma_enum_devices_by_ibdev(filter_by_name, config_item_name(item));
+	struct cma_dev_group *group = to_dev_group(item);
+	struct cma_configfs_attr *ca =
+		container_of(attr, struct cma_configfs_attr, attr);
+
+	if (!cma_dev)
+		return -ENODEV;
+
+	if (ca->show)
+		ret = ca->show(cma_dev, group, buf);
+
+	cma_deref_dev(cma_dev);
+	return ret;
+}
+
+static ssize_t cma_configfs_attr_store(struct config_item *item,
+				       struct configfs_attribute *attr,
+				       const char *buf, size_t count)
+{
+	ssize_t ret = -EINVAL;
+	struct cma_device *cma_dev =
+		cma_enum_devices_by_ibdev(filter_by_name, config_item_name(item));
+	struct cma_dev_group *group = to_dev_group(item);
+	struct cma_configfs_attr *ca =
+		container_of(attr, struct cma_configfs_attr, attr);
+
+	if (!cma_dev)
+		return -ENODEV;
+
+	if (ca->store)
+		ret = ca->store(cma_dev, group, buf, count);
+
+	cma_deref_dev(cma_dev);
+	return ret;
+}
+
+static struct configfs_attribute *cma_configfs_attributes[] = {
+	&cma_configfs_attr_default_roce_mode.attr,
+	NULL,
+};
+
+static void cma_configfs_attr_release(struct config_item *item)
+{
+	kfree(to_dev_group(item));
+}
+
+static struct configfs_item_operations cma_item_ops = {
+	.show_attribute		= cma_configfs_attr_show,
+	.store_attribute	= cma_configfs_attr_store,
+	.release		= cma_configfs_attr_release,
+};
+
+static struct config_item_type cma_item_type = {
+	.ct_attrs	= cma_configfs_attributes,
+	.ct_item_ops	= &cma_item_ops,
+	.ct_owner	= THIS_MODULE
+};
+
+static struct config_item *make_cma_dev(struct config_group *group,
+					const char *name)
+{
+	int err = -EINVAL;
+	struct cma_device *cma_dev = cma_enum_devices_by_ibdev(filter_by_name,
+							       (void *)name);
+	struct cma_dev_group *cma_dev_group = NULL;
+
+	if (!cma_dev)
+		goto fail;
+
+	cma_dev_group = kzalloc(sizeof(*cma_dev_group), GFP_KERNEL);
+
+	if (!cma_dev_group) {
+		err = -ENOMEM;
+		goto fail;
+	}
+
+	config_item_init_type_name(&cma_dev_group->item, name, &cma_item_type);
+
+	cma_deref_dev(cma_dev);
+	return &cma_dev_group->item;
+
+fail:
+	if (cma_dev)
+		cma_deref_dev(cma_dev);
+	kfree(cma_dev_group);
+	return ERR_PTR(err);
+}
+
+static void drop_cma_dev(struct config_group *group,
+			 struct config_item *item)
+{
+	config_item_put(item);
+}
+
+static struct configfs_group_operations cma_subsys_group_ops = {
+	.make_item	= make_cma_dev,
+	.drop_item	= drop_cma_dev,
+};
+
+static struct config_item_type cma_subsys_type = {
+	.ct_group_ops	= &cma_subsys_group_ops,
+	.ct_owner	= THIS_MODULE,
+};
+
+static struct configfs_subsystem cma_subsys = {
+	.su_group	= {
+		.cg_item	= {
+			.ci_namebuf	= "rdma_cm",
+			.ci_type	= &cma_subsys_type,
+		},
+	},
+};
+
+int __init cma_configfs_init(void)
+{
+	config_group_init(&cma_subsys.su_group);
+	mutex_init(&cma_subsys.su_mutex);
+	return configfs_register_subsystem(&cma_subsys);
+}
+
+void __exit cma_configfs_exit(void)
+{
+	configfs_unregister_subsystem(&cma_subsys);
+}
diff --git a/drivers/infiniband/core/core_priv.h b/drivers/infiniband/core/core_priv.h
index 94db13d..2d9ebe1 100644
--- a/drivers/infiniband/core/core_priv.h
+++ b/drivers/infiniband/core/core_priv.h
@@ -39,6 +39,20 @@
 
 #include <rdma/ib_verbs.h>
 
+#if IS_ENABLED(CONFIG_INFINIBAND_ADDR_TRANS_CONFIGFS)
+int cma_configfs_init(void);
+void cma_configfs_exit(void);
+#endif
+struct cma_device;
+typedef bool (*cma_device_filter)(struct ib_device *, void *);
+struct cma_device *cma_enum_devices_by_ibdev(cma_device_filter	filter,
+					     void		*cookie);
+enum ib_gid_type cma_get_default_gid_type(struct cma_device *cma_dev);
+void cma_set_default_gid_type(struct cma_device *cma_dev,
+			      enum ib_gid_type default_gid_type);
+void cma_ref_dev(struct cma_device *cma_dev);
+void cma_deref_dev(struct cma_device *cma_dev);
+
 extern struct workqueue_struct *roce_gid_mgmt_wq;
 
 int  ib_device_register_sysfs(struct ib_device *device,
@@ -72,6 +86,7 @@ void ib_enum_roce_ports_of_netdev(roce_netdev_filter filter,
 				  void *cookie);
 
 const char *roce_gid_cache_type_str(enum ib_gid_type gid_type);
+int roce_gid_cache_parse_gid_str(const char *buf);
 
 int roce_gid_cache_get_gid(struct ib_device *ib_dev, u8 port, int index,
 			   union ib_gid *gid, struct ib_gid_attr *attr);
diff --git a/drivers/infiniband/core/roce_gid_cache.c b/drivers/infiniband/core/roce_gid_cache.c
index bd51d97..f02215b 100644
--- a/drivers/infiniband/core/roce_gid_cache.c
+++ b/drivers/infiniband/core/roce_gid_cache.c
@@ -72,6 +72,19 @@ const char *roce_gid_cache_type_str(enum ib_gid_type gid_type)
 
 	return "Invalid GID type";
 }
+EXPORT_SYMBOL_GPL(roce_gid_cache_type_str);
+
+int roce_gid_cache_parse_gid_str(const char *buf)
+{
+	unsigned int i;
+
+	for (i = 0; i < ARRAY_SIZE(gid_type_str); ++i)
+		if (gid_type_str[i] && !strcmp(buf, gid_type_str[i]))
+			return i;
+
+	return -EINVAL;
+}
+EXPORT_SYMBOL_GPL(roce_gid_cache_parse_gid_str);
 
 static void put_ndev(struct rcu_head *rcu)
 {
-- 
2.1.0


* [PATCH v3 for-next 15/33] IB/Core: Changes to the IB Core infrastructure for RoCEv2 support
       [not found] ` <1427318422-12004-1-git-send-email-somnath.kotur-laKkSmNT4hbQT0dZR+AlfA@public.gmane.org>
                     ` (13 preceding siblings ...)
  2015-03-25 21:20   ` [PATCH v3 for-next 14/33] IB/cma: Add configfs for rdma_cm Somnath Kotur
@ 2015-03-25 21:20   ` Somnath Kotur
  2015-03-25 21:20   ` [PATCH v3 for-next 16/33] RDMA/ocrdma: Changes in driver to incorporate the moving of GID Table mgmt to IB/Core Somnath Kotur
                     ` (17 subsequent siblings)
  32 siblings, 0 replies; 82+ messages in thread
From: Somnath Kotur @ 2015-03-25 21:20 UTC (permalink / raw)
  To: roland-DgEjT+Ai2ygdnm+yROfE0A
  Cc: linux-rdma-u79uwXL29TY76Z2rM5mHXA, Somnath Kotur

1. Choose the sgid_index and type from all the matching entries in
   RDMA-CM based on the hint from the IP stack. A destination that the
   stack routes through a gateway cannot be reached by the L2-scoped
   RoCE V1 protocol, so the routing result is a natural hint for
   choosing RoCE V2.
2. Set the hop_limit for the IP packet based on the above hint from
   the IP stack.
3. Define a RDMA_NETWORK enum type (see the sketch below).
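
A condensed view of how the hint travels through the hunks below
(sketch only):

	/* addr4_resolve()/addr6_resolve() record routed destinations */
	if (rt->rt_uses_gateway)
		addr->network = RDMA_NETWORK_IPV4;

	/* cma_resolve_iboe_route() turns that into GID type/hop limit */
	if (addr->dev_addr.network != RDMA_NETWORK_IB) {
		route->path_rec->gid_type =
			ib_network_to_gid_type(addr->dev_addr.network);
		route->path_rec->hop_limit = IPV6_DEFAULT_HOPLIMIT;
	}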

Signed-off-by: Somnath Kotur <somnath.kotur-laKkSmNT4hbQT0dZR+AlfA@public.gmane.org>
---
 drivers/infiniband/core/addr.c  |  8 +++++
 drivers/infiniband/core/cma.c   | 10 +++++-
 drivers/infiniband/core/verbs.c | 77 ++++++++++++++++++++++-------------------
 include/rdma/ib_addr.h          |  1 +
 include/rdma/ib_verbs.h         |  9 +++++
 5 files changed, 68 insertions(+), 37 deletions(-)

diff --git a/drivers/infiniband/core/addr.c b/drivers/infiniband/core/addr.c
index 43af7f5..da24c0e 100644
--- a/drivers/infiniband/core/addr.c
+++ b/drivers/infiniband/core/addr.c
@@ -257,6 +257,9 @@ static int addr4_resolve(struct sockaddr_in *src_in,
 		goto put;
 	}
 
+	if (rt->rt_uses_gateway)
+		addr->network = RDMA_NETWORK_IPV4;
+
 	ret = dst_fetch_ha(&rt->dst, addr, &fl4.daddr);
 put:
 	ip_rt_put(rt);
@@ -271,6 +274,7 @@ static int addr6_resolve(struct sockaddr_in6 *src_in,
 {
 	struct flowi6 fl6;
 	struct dst_entry *dst;
+	struct rt6_info *rt;
 	int ret;
 
 	memset(&fl6, 0, sizeof fl6);
@@ -282,6 +286,7 @@ static int addr6_resolve(struct sockaddr_in6 *src_in,
 	if ((ret = dst->error))
 		goto put;
 
+	rt = (struct rt6_info *)dst;
 	if (ipv6_addr_any(&fl6.saddr)) {
 		ret = ipv6_dev_get_saddr(&init_net, ip6_dst_idev(dst)->dev,
 					 &fl6.daddr, 0, &fl6.saddr);
@@ -305,6 +310,9 @@ static int addr6_resolve(struct sockaddr_in6 *src_in,
 		goto put;
 	}
 
+	if (rt->rt6i_flags & RTF_GATEWAY)
+		addr->network = RDMA_NETWORK_IPV6;
+
 	ret = dst_fetch_ha(dst, addr, &fl6.daddr);
 put:
 	dst_release(dst);
diff --git a/drivers/infiniband/core/cma.c b/drivers/infiniband/core/cma.c
index 8dec040..6f345e2 100644
--- a/drivers/infiniband/core/cma.c
+++ b/drivers/infiniband/core/cma.c
@@ -1952,6 +1952,7 @@ static int cma_resolve_iboe_route(struct rdma_id_private *id_priv)
 {
 	struct rdma_route *route = &id_priv->id.route;
 	struct rdma_addr *addr = &route->addr;
+	enum ib_gid_type network_gid_type;
 	struct cma_work *work;
 	int ret;
 	struct net_device *ndev = NULL;
@@ -1990,7 +1991,14 @@ static int cma_resolve_iboe_route(struct rdma_id_private *id_priv)
 	rdma_ip2gid((struct sockaddr *)&id_priv->id.route.addr.dst_addr,
 		    &route->path_rec->dgid);
 
-	route->path_rec->hop_limit = 1;
+	/* Use the hint from IP Stack to select GID Type */
+	network_gid_type = ib_network_to_gid_type(addr->dev_addr.network);
+	if (addr->dev_addr.network != RDMA_NETWORK_IB) {
+		route->path_rec->gid_type = network_gid_type;
+		route->path_rec->hop_limit = IPV6_DEFAULT_HOPLIMIT;
+	} else {
+		route->path_rec->hop_limit = 1;
+	}
 	route->path_rec->reversible = 1;
 	route->path_rec->pkey = cpu_to_be16(0xffff);
 	route->path_rec->mtu_selector = IB_SA_EQ;
diff --git a/drivers/infiniband/core/verbs.c b/drivers/infiniband/core/verbs.c
index 2e7ccad..3586996 100644
--- a/drivers/infiniband/core/verbs.c
+++ b/drivers/infiniband/core/verbs.c
@@ -195,11 +195,11 @@ struct ib_ah *ib_create_ah(struct ib_pd *pd, struct ib_ah_attr *ah_attr)
 }
 EXPORT_SYMBOL(ib_create_ah);
 
-static int ib_get_grh_header_version(const void *h)
+static int ib_get_grh_header_version(const union rdma_network_hdr *h)
 {
-	const struct iphdr *ip4h = (struct iphdr *)(h + 20);
+	const struct iphdr *ip4h = (struct iphdr *)&h->roce4grh;
 	struct iphdr ip4h_checked;
-	const struct ipv6hdr *ip6h = (struct ipv6hdr *)h;
+	const struct ipv6hdr *ip6h = (struct ipv6hdr *)&h->ibgrh;
 
 	if (ip6h->version != 6)
 		return (ip4h->version == 4) ? 4 : 0;
@@ -219,37 +219,6 @@ static int ib_get_grh_header_version(const void *h)
 	return 6;
 }
 
-static int ib_get_dgid_sgid_by_grh(const void *h,
-				   enum rdma_network_type net_type,
-				   union ib_gid *dgid, union ib_gid *sgid)
-{
-	switch (net_type) {
-	case RDMA_NETWORK_IPV4: {
-		const struct iphdr *ip4h = (struct iphdr *)(h + 20);
-
-		ipv6_addr_set_v4mapped(ip4h->daddr, (struct in6_addr *)dgid);
-		ipv6_addr_set_v4mapped(ip4h->saddr, (struct in6_addr *)sgid);
-		return 0;
-	}
-	case RDMA_NETWORK_IPV6: {
-		struct ipv6hdr *ip6h = (struct ipv6hdr *)h;
-
-		memcpy(dgid, &ip6h->daddr, sizeof(*dgid));
-		memcpy(sgid, &ip6h->saddr, sizeof(*sgid));
-		return 0;
-	}
-	case RDMA_NETWORK_IB: {
-		struct ib_grh *grh = (struct ib_grh *)h;
-
-		memcpy(dgid, &grh->dgid, sizeof(*dgid));
-		memcpy(sgid, &grh->sgid, sizeof(*sgid));
-		return 0;
-	}
-	}
-
-	return -EINVAL;
-}
-
 static enum rdma_network_type ib_get_net_type_by_grh(struct ib_device *device,
 						     u8 port_num,
 						     const struct ib_grh *grh)
@@ -259,7 +228,7 @@ static enum rdma_network_type ib_get_net_type_by_grh(struct ib_device *device,
 	if (rdma_port_get_link_layer(device, port_num) == IB_LINK_LAYER_INFINIBAND)
 		return RDMA_NETWORK_IB;
 
-	grh_version = ib_get_grh_header_version(grh);
+	grh_version = ib_get_grh_header_version((union rdma_network_hdr *)grh);
 
 	if (grh_version == 4)
 		return RDMA_NETWORK_IPV4;
@@ -305,6 +274,38 @@ static int get_sgid_index_from_eth(struct ib_device *device, u8 port_num,
 				     &context, gid_index);
 }
 
+static int get_gids_from_rdma_hdr(union rdma_network_hdr *hdr,
+				  enum rdma_network_type net_type,
+				  union ib_gid *sgid, union ib_gid *dgid)
+{
+	struct sockaddr_in  src_in;
+	struct sockaddr_in  dst_in;
+	__be32 src_saddr, dst_saddr;
+
+	if (!sgid || !dgid)
+		return -EINVAL;
+
+	if (net_type == RDMA_NETWORK_IPV4) {
+		memcpy(&src_in.sin_addr.s_addr,
+		       &hdr->roce4grh.saddr, 4);
+		memcpy(&dst_in.sin_addr.s_addr,
+		       &hdr->roce4grh.daddr, 4);
+		src_saddr = src_in.sin_addr.s_addr;
+		dst_saddr = dst_in.sin_addr.s_addr;
+		ipv6_addr_set_v4mapped(src_saddr,
+				       (struct in6_addr *)sgid);
+		ipv6_addr_set_v4mapped(dst_saddr,
+				       (struct in6_addr *)dgid);
+		return 0;
+	} else if (net_type == RDMA_NETWORK_IPV6 ||
+		   net_type == RDMA_NETWORK_IB) {
+		*dgid = hdr->ibgrh.dgid;
+		*sgid = hdr->ibgrh.sgid;
+		return 0;
+	} else
+		return -EINVAL;
+}
+
 int ib_init_ah_from_wc(struct ib_device *device, u8 port_num, struct ib_wc *wc,
 		       struct ib_grh *grh, struct ib_ah_attr *ah_attr)
 {
@@ -326,7 +327,8 @@ int ib_init_ah_from_wc(struct ib_device *device, u8 port_num, struct ib_wc *wc,
 			net_type = ib_get_net_type_by_grh(device, port_num, grh);
 		gid_type = ib_network_to_gid_type(net_type);
 	}
-	ret = ib_get_dgid_sgid_by_grh(grh, net_type, &dgid, &sgid);
+	ret = get_gids_from_rdma_hdr((union rdma_network_hdr *)grh, net_type,
+				     &sgid, &dgid);
 	if (ret)
 		return ret;
 
@@ -1007,6 +1009,9 @@ int ib_resolve_eth_dmac(struct ib_qp *qp,
 				rcu_read_unlock();
 				goto out;
 			}
+			if (sgid_attr.gid_type == IB_GID_TYPE_ROCE_V2)
+				qp_attr->ah_attr.grh.hop_limit =
+							IPV6_DEFAULT_HOPLIMIT;
 
 			dev_hold(sgid_attr.ndev);
 			ifindex = sgid_attr.ndev->ifindex;
diff --git a/include/rdma/ib_addr.h b/include/rdma/ib_addr.h
index 0dfaaa7..80afbf7 100644
--- a/include/rdma/ib_addr.h
+++ b/include/rdma/ib_addr.h
@@ -71,6 +71,7 @@ struct rdma_dev_addr {
 	unsigned short dev_type;
 	int bound_dev_if;
 	enum rdma_transport_type transport;
+	enum rdma_network_type network;
 };
 
 /**
diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h
index 9de9e62..846db44 100644
--- a/include/rdma/ib_verbs.h
+++ b/include/rdma/ib_verbs.h
@@ -51,6 +51,7 @@
 #include <net/net_namespace.h>
 #include <uapi/linux/if_ether.h>
 #include <net/ipv6.h>
+#include <net/ip.h>
 
 #include <linux/atomic.h>
 #include <linux/mmu_notifier.h>
@@ -517,6 +518,14 @@ struct ib_grh {
 	union ib_gid	dgid;
 };
 
+union rdma_network_hdr {
+	struct ib_grh ibgrh;
+	struct {
+		u8		reserved[20];
+		struct iphdr	roce4grh;
+	};
+};
+
 enum {
 	IB_MULTICAST_QPN = 0xffffff
 };
-- 
2.1.0


* [PATCH v3 for-next 16/33] RDMA/ocrdma: Changes in driver to incorporate the moving of GID Table mgmt to IB/Core.
       [not found] ` <1427318422-12004-1-git-send-email-somnath.kotur-laKkSmNT4hbQT0dZR+AlfA@public.gmane.org>
                     ` (14 preceding siblings ...)
  2015-03-25 21:20   ` [PATCH v3 for-next 15/33] IB/Core: Changes to the IB Core infrastructure for RoCEv2 support Somnath Kotur
@ 2015-03-25 21:20   ` Somnath Kotur
  2015-03-25 21:20   ` [PATCH v3 for-next 17/33] RDMA/ocrdma: changes to support RoCE-v2 in UD path Somnath Kotur
                     ` (16 subsequent siblings)
  32 siblings, 0 replies; 82+ messages in thread
From: Somnath Kotur @ 2015-03-25 21:20 UTC (permalink / raw)
  To: roland-DgEjT+Ai2ygdnm+yROfE0A
  Cc: linux-rdma-u79uwXL29TY76Z2rM5mHXA, Somnath Kotur, Devesh Sharma

1. Check and set port capability flags to indicate RoCE V2 support.
2. Change the query_gid hook to return values from the IB/Core GID
   management APIs.
3. Get rid of all the netdev notifier chain subscription code as well
   as the maintenance of the SGID table in memory.
4. Implement the get_netdev hook in the driver (see the sketch below).
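
With the GID table owned by IB/Core, the driver side reduces to wiring
up two hooks (a condensed view of the registration hunk below):

	dev->ibdev.get_netdev = ocrdma_get_netdev;	/* which netdev
							 * backs the port */
	dev->ibdev.modify_gid = ocrdma_modify_gid;	/* called on cache
							 * changes; currently
							 * a no-op stub for
							 * ocrdma */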

Signed-off-by: Somnath Kotur <somnath.kotur-laKkSmNT4hbQT0dZR+AlfA@public.gmane.org>
Signed-off-by: Devesh Sharma <devesh.sharma-laKkSmNT4hbQT0dZR+AlfA@public.gmane.org>
---
 drivers/infiniband/hw/ocrdma/ocrdma.h       |  10 ++
 drivers/infiniband/hw/ocrdma/ocrdma_hw.c    |   3 +
 drivers/infiniband/hw/ocrdma/ocrdma_main.c  | 233 +---------------------------
 drivers/infiniband/hw/ocrdma/ocrdma_sli.h   |  13 ++
 drivers/infiniband/hw/ocrdma/ocrdma_verbs.c |  33 +++-
 drivers/infiniband/hw/ocrdma/ocrdma_verbs.h |   4 +
 6 files changed, 64 insertions(+), 232 deletions(-)

diff --git a/drivers/infiniband/hw/ocrdma/ocrdma.h b/drivers/infiniband/hw/ocrdma/ocrdma.h
index 16ee36e..97f971a 100644
--- a/drivers/infiniband/hw/ocrdma/ocrdma.h
+++ b/drivers/infiniband/hw/ocrdma/ocrdma.h
@@ -100,6 +100,7 @@ struct ocrdma_dev_attr {
 	u8 local_ca_ack_delay;
 	u8 ird;
 	u8 num_ird_pages;
+	u8 roce_flags;
 };
 
 struct ocrdma_dma_mem {
@@ -575,4 +576,13 @@ static inline u8 ocrdma_is_enabled_and_synced(u32 state)
 		(state & OCRDMA_STATE_FLAG_SYNC);
 }
 
+static inline bool ocrdma_is_rocev2_supported(struct ocrdma_dev *dev)
+{
+	return (dev->attr.roce_flags & (OCRDMA_L3_TYPE_IPV4 <<
+					OCRDMA_ROUDP_FLAGS_SHIFT) ||
+		dev->attr.roce_flags & (OCRDMA_L3_TYPE_IPV6 <<
+					OCRDMA_ROUDP_FLAGS_SHIFT)) ?
+								true : false;
+}
+
 #endif
diff --git a/drivers/infiniband/hw/ocrdma/ocrdma_hw.c b/drivers/infiniband/hw/ocrdma/ocrdma_hw.c
index e5f0244..20f9e8f 100644
--- a/drivers/infiniband/hw/ocrdma/ocrdma_hw.c
+++ b/drivers/infiniband/hw/ocrdma/ocrdma_hw.c
@@ -1112,6 +1112,9 @@ static void ocrdma_get_attr(struct ocrdma_dev *dev,
 	attr->local_ca_ack_delay = (rsp->max_pd_ca_ack_delay &
 				    OCRDMA_MBX_QUERY_CFG_CA_ACK_DELAY_MASK) >>
 	    OCRDMA_MBX_QUERY_CFG_CA_ACK_DELAY_SHIFT;
+	attr->roce_flags = (rsp->max_pd_ca_ack_delay &
+				OCRDMA_MBX_QUERY_CFG_L3_TYPE_MASK) >>
+				OCRDMA_MBX_QUERY_CFG_L3_TYPE_SHIFT;
 	attr->max_mw = rsp->max_mw;
 	attr->max_mr = rsp->max_mr;
 	attr->max_mr_size = ((u64)rsp->max_mr_size_hi << 32) |
diff --git a/drivers/infiniband/hw/ocrdma/ocrdma_main.c b/drivers/infiniband/hw/ocrdma/ocrdma_main.c
index 7a2b59a..a81492f 100644
--- a/drivers/infiniband/hw/ocrdma/ocrdma_main.c
+++ b/drivers/infiniband/hw/ocrdma/ocrdma_main.c
@@ -51,8 +51,6 @@ static LIST_HEAD(ocrdma_dev_list);
 static DEFINE_SPINLOCK(ocrdma_devlist_lock);
 static DEFINE_IDR(ocrdma_dev_id);
 
-static union ib_gid ocrdma_zero_sgid;
-
 void ocrdma_get_guid(struct ocrdma_dev *dev, u8 *guid)
 {
 	u8 mac_addr[6];
@@ -67,135 +65,6 @@ void ocrdma_get_guid(struct ocrdma_dev *dev, u8 *guid)
 	guid[6] = mac_addr[4];
 	guid[7] = mac_addr[5];
 }
-
-static bool ocrdma_add_sgid(struct ocrdma_dev *dev, union ib_gid *new_sgid)
-{
-	int i;
-	unsigned long flags;
-
-	memset(&ocrdma_zero_sgid, 0, sizeof(union ib_gid));
-
-
-	spin_lock_irqsave(&dev->sgid_lock, flags);
-	for (i = 0; i < OCRDMA_MAX_SGID; i++) {
-		if (!memcmp(&dev->sgid_tbl[i], &ocrdma_zero_sgid,
-			    sizeof(union ib_gid))) {
-			/* found free entry */
-			memcpy(&dev->sgid_tbl[i], new_sgid,
-			       sizeof(union ib_gid));
-			spin_unlock_irqrestore(&dev->sgid_lock, flags);
-			return true;
-		} else if (!memcmp(&dev->sgid_tbl[i], new_sgid,
-				   sizeof(union ib_gid))) {
-			/* entry already present, no addition is required. */
-			spin_unlock_irqrestore(&dev->sgid_lock, flags);
-			return false;
-		}
-	}
-	spin_unlock_irqrestore(&dev->sgid_lock, flags);
-	return false;
-}
-
-static bool ocrdma_del_sgid(struct ocrdma_dev *dev, union ib_gid *sgid)
-{
-	int found = false;
-	int i;
-	unsigned long flags;
-
-
-	spin_lock_irqsave(&dev->sgid_lock, flags);
-	/* first is default sgid, which cannot be deleted. */
-	for (i = 1; i < OCRDMA_MAX_SGID; i++) {
-		if (!memcmp(&dev->sgid_tbl[i], sgid, sizeof(union ib_gid))) {
-			/* found matching entry */
-			memset(&dev->sgid_tbl[i], 0, sizeof(union ib_gid));
-			found = true;
-			break;
-		}
-	}
-	spin_unlock_irqrestore(&dev->sgid_lock, flags);
-	return found;
-}
-
-static int ocrdma_addr_event(unsigned long event, struct net_device *netdev,
-			     union ib_gid *gid)
-{
-	struct ib_event gid_event;
-	struct ocrdma_dev *dev;
-	bool found = false;
-	bool updated = false;
-	bool is_vlan = false;
-
-	is_vlan = netdev->priv_flags & IFF_802_1Q_VLAN;
-	if (is_vlan)
-		netdev = rdma_vlan_dev_real_dev(netdev);
-
-	rcu_read_lock();
-	list_for_each_entry_rcu(dev, &ocrdma_dev_list, entry) {
-		if (dev->nic_info.netdev == netdev) {
-			found = true;
-			break;
-		}
-	}
-	rcu_read_unlock();
-
-	if (!found)
-		return NOTIFY_DONE;
-
-	mutex_lock(&dev->dev_lock);
-	switch (event) {
-	case NETDEV_UP:
-		updated = ocrdma_add_sgid(dev, gid);
-		break;
-	case NETDEV_DOWN:
-		updated = ocrdma_del_sgid(dev, gid);
-		break;
-	default:
-		break;
-	}
-	if (updated) {
-		/* GID table updated, notify the consumers about it */
-		gid_event.device = &dev->ibdev;
-		gid_event.element.port_num = 1;
-		gid_event.event = IB_EVENT_GID_CHANGE;
-		ib_dispatch_event(&gid_event);
-	}
-	mutex_unlock(&dev->dev_lock);
-	return NOTIFY_OK;
-}
-
-static int ocrdma_inetaddr_event(struct notifier_block *notifier,
-				  unsigned long event, void *ptr)
-{
-	struct in_ifaddr *ifa = ptr;
-	union ib_gid gid;
-	struct net_device *netdev = ifa->ifa_dev->dev;
-
-	ipv6_addr_set_v4mapped(ifa->ifa_address, (struct in6_addr *)&gid);
-	return ocrdma_addr_event(event, netdev, &gid);
-}
-
-static struct notifier_block ocrdma_inetaddr_notifier = {
-	.notifier_call = ocrdma_inetaddr_event
-};
-
-#if IS_ENABLED(CONFIG_IPV6)
-
-static int ocrdma_inet6addr_event(struct notifier_block *notifier,
-				  unsigned long event, void *ptr)
-{
-	struct inet6_ifaddr *ifa = (struct inet6_ifaddr *)ptr;
-	union  ib_gid *gid = (union ib_gid *)&ifa->addr;
-	struct net_device *netdev = ifa->idev->dev;
-	return ocrdma_addr_event(event, netdev, gid);
-}
-
-static struct notifier_block ocrdma_inet6addr_notifier = {
-	.notifier_call = ocrdma_inet6addr_event
-};
-
-#endif /* IPV6 and VLAN */
-
 static enum rdma_link_layer ocrdma_link_layer(struct ib_device *device,
 					      u8 port_num)
 {
@@ -246,6 +115,8 @@ static int ocrdma_register_device(struct ocrdma_dev *dev)
 	dev->ibdev.query_port = ocrdma_query_port;
 	dev->ibdev.modify_port = ocrdma_modify_port;
 	dev->ibdev.query_gid = ocrdma_query_gid;
+	dev->ibdev.get_netdev = ocrdma_get_netdev;
+	dev->ibdev.modify_gid = ocrdma_modify_gid;
 	dev->ibdev.get_link_layer = ocrdma_link_layer;
 	dev->ibdev.alloc_pd = ocrdma_alloc_pd;
 	dev->ibdev.dealloc_pd = ocrdma_dealloc_pd;
@@ -307,12 +178,6 @@ static int ocrdma_register_device(struct ocrdma_dev *dev)
 static int ocrdma_alloc_resources(struct ocrdma_dev *dev)
 {
 	mutex_init(&dev->dev_lock);
-	dev->sgid_tbl = kzalloc(sizeof(union ib_gid) *
-				OCRDMA_MAX_SGID, GFP_KERNEL);
-	if (!dev->sgid_tbl)
-		goto alloc_err;
-	spin_lock_init(&dev->sgid_lock);
-
 	dev->cq_tbl = kzalloc(sizeof(struct ocrdma_cq *) *
 			      OCRDMA_MAX_CQ, GFP_KERNEL);
 	if (!dev->cq_tbl)
@@ -344,7 +209,6 @@ static void ocrdma_free_resources(struct ocrdma_dev *dev)
 	kfree(dev->stag_arr);
 	kfree(dev->qp_tbl);
 	kfree(dev->cq_tbl);
-	kfree(dev->sgid_tbl);
 }
 
 /* OCRDMA sysfs interface */
@@ -390,68 +254,6 @@ static void ocrdma_remove_sysfiles(struct ocrdma_dev *dev)
 		device_remove_file(&dev->ibdev.dev, ocrdma_attributes[i]);
 }
 
-static void ocrdma_add_default_sgid(struct ocrdma_dev *dev)
-{
-	/* GID Index 0 - Invariant manufacturer-assigned EUI-64 */
-	union ib_gid *sgid = &dev->sgid_tbl[0];
-
-	sgid->global.subnet_prefix = cpu_to_be64(0xfe80000000000000LL);
-	ocrdma_get_guid(dev, &sgid->raw[8]);
-}
-
-static void ocrdma_init_ipv4_gids(struct ocrdma_dev *dev,
-				  struct net_device *net)
-{
-	struct in_device *in_dev;
-	union ib_gid gid;
-	in_dev = in_dev_get(net);
-	if (in_dev) {
-		for_ifa(in_dev) {
-			ipv6_addr_set_v4mapped(ifa->ifa_address,
-					       (struct in6_addr *)&gid);
-			ocrdma_add_sgid(dev, &gid);
-		}
-		endfor_ifa(in_dev);
-		in_dev_put(in_dev);
-	}
-}
-
-static void ocrdma_init_ipv6_gids(struct ocrdma_dev *dev,
-				  struct net_device *net)
-{
-#if IS_ENABLED(CONFIG_IPV6)
-	struct inet6_dev *in6_dev;
-	union ib_gid  *pgid;
-	struct inet6_ifaddr *ifp;
-	in6_dev = in6_dev_get(net);
-	if (in6_dev) {
-		read_lock_bh(&in6_dev->lock);
-		list_for_each_entry(ifp, &in6_dev->addr_list, if_list) {
-			pgid = (union ib_gid *)&ifp->addr;
-			ocrdma_add_sgid(dev, pgid);
-		}
-		read_unlock_bh(&in6_dev->lock);
-		in6_dev_put(in6_dev);
-	}
-#endif
-}
-
-static void ocrdma_init_gid_table(struct ocrdma_dev *dev)
-{
-	struct  net_device *net_dev;
-
-	for_each_netdev(&init_net, net_dev) {
-		struct net_device *real_dev = rdma_vlan_dev_real_dev(net_dev) ?
-				rdma_vlan_dev_real_dev(net_dev) : net_dev;
-
-		if (real_dev == dev->nic_info.netdev) {
-			ocrdma_add_default_sgid(dev);
-			ocrdma_init_ipv4_gids(dev, net_dev);
-			ocrdma_init_ipv6_gids(dev, net_dev);
-		}
-	}
-}
-
 static struct ocrdma_dev *ocrdma_add(struct be_dev_info *dev_info)
 {
 	int status = 0, i;
@@ -480,7 +282,6 @@ static struct ocrdma_dev *ocrdma_add(struct be_dev_info *dev_info)
 		goto alloc_err;
 
 	ocrdma_init_service_level(dev);
-	ocrdma_init_gid_table(dev);
 	status = ocrdma_register_device(dev);
 	if (status)
 		goto alloc_err;
@@ -627,34 +428,12 @@ static struct ocrdma_driver ocrdma_drv = {
 	.be_abi_version		= OCRDMA_BE_ROCE_ABI_VERSION,
 };
 
-static void ocrdma_unregister_inet6addr_notifier(void)
-{
-#if IS_ENABLED(CONFIG_IPV6)
-	unregister_inet6addr_notifier(&ocrdma_inet6addr_notifier);
-#endif
-}
-
-static void ocrdma_unregister_inetaddr_notifier(void)
-{
-	unregister_inetaddr_notifier(&ocrdma_inetaddr_notifier);
-}
-
 static int __init ocrdma_init_module(void)
 {
 	int status;
 
 	ocrdma_init_debugfs();
 
-	status = register_inetaddr_notifier(&ocrdma_inetaddr_notifier);
-	if (status)
-		return status;
-
-#if IS_ENABLED(CONFIG_IPV6)
-	status = register_inet6addr_notifier(&ocrdma_inet6addr_notifier);
-	if (status)
-		goto err_notifier6;
-#endif
-
 	status = be_roce_register_driver(&ocrdma_drv);
 	if (status)
 		goto err_be_reg;
@@ -662,19 +441,13 @@ static int __init ocrdma_init_module(void)
 	return 0;
 
 err_be_reg:
-#if IS_ENABLED(CONFIG_IPV6)
-	ocrdma_unregister_inet6addr_notifier();
-err_notifier6:
-#endif
-	ocrdma_unregister_inetaddr_notifier();
+
 	return status;
 }
 
 static void __exit ocrdma_exit_module(void)
 {
 	be_roce_unregister_driver(&ocrdma_drv);
-	ocrdma_unregister_inet6addr_notifier();
-	ocrdma_unregister_inetaddr_notifier();
 	ocrdma_rem_debugfs();
 }
 
diff --git a/drivers/infiniband/hw/ocrdma/ocrdma_sli.h b/drivers/infiniband/hw/ocrdma/ocrdma_sli.h
index 243c87c..6b74eb9 100644
--- a/drivers/infiniband/hw/ocrdma/ocrdma_sli.h
+++ b/drivers/infiniband/hw/ocrdma/ocrdma_sli.h
@@ -125,6 +125,14 @@ enum {
 	OCRDMA_DB_RQ_SHIFT		= 24
 };
 
+enum {
+	OCRDMA_L3_TYPE_IB_GRH   = 0x00,
+	OCRDMA_L3_TYPE_IPV4     = 0x01,
+	OCRDMA_L3_TYPE_IPV6     = 0x02
+};
+
+#define OCRDMA_ROUDP_FLAGS_SHIFT	0x03
+
 #define OCRDMA_DB_CQ_RING_ID_MASK       0x3FF	/* bits 0 - 9 */
 #define OCRDMA_DB_CQ_RING_ID_EXT_MASK  0x0C00	/* bits 10-11 of qid at 12-11 */
 /* qid #2 msbits at 12-11 */
@@ -488,6 +496,9 @@ enum {
 	OCRDMA_MBX_QUERY_CFG_CA_ACK_DELAY_SHIFT		= 8,
 	OCRDMA_MBX_QUERY_CFG_CA_ACK_DELAY_MASK		= 0xFF <<
 				OCRDMA_MBX_QUERY_CFG_CA_ACK_DELAY_SHIFT,
+	OCRDMA_MBX_QUERY_CFG_L3_TYPE_SHIFT		 = 0,
+	OCRDMA_MBX_QUERY_CFG_L3_TYPE_MASK		= 0xFF <<
+				OCRDMA_MBX_QUERY_CFG_L3_TYPE_SHIFT,
 
 	OCRDMA_MBX_QUERY_CFG_MAX_SEND_SGE_SHIFT		= 0,
 	OCRDMA_MBX_QUERY_CFG_MAX_SEND_SGE_MASK		= 0xFFFF,
@@ -1049,6 +1060,8 @@ enum {
 	OCRDMA_QP_PARAMS_STATE_MASK		= BIT(5) | BIT(6) | BIT(7),
 	OCRDMA_QP_PARAMS_FLAGS_SQD_ASYNC	= BIT(8),
 	OCRDMA_QP_PARAMS_FLAGS_INB_ATEN		= BIT(9),
+	OCRDMA_QP_PARAMS_FLAGS_L3_TYPE_SHIFT	= 11,
+	OCRDMA_QP_PARAMS_FLAGS_L3_TYPE_MASK	= BIT(11) | BIT(12) | BIT(13),
 	OCRDMA_QP_PARAMS_MAX_SGE_RECV_SHIFT	= 16,
 	OCRDMA_QP_PARAMS_MAX_SGE_RECV_MASK	= 0xFFFF <<
 					OCRDMA_QP_PARAMS_MAX_SGE_RECV_SHIFT,
diff --git a/drivers/infiniband/hw/ocrdma/ocrdma_verbs.c b/drivers/infiniband/hw/ocrdma/ocrdma_verbs.c
index 8771755..47413c3 100644
--- a/drivers/infiniband/hw/ocrdma/ocrdma_verbs.c
+++ b/drivers/infiniband/hw/ocrdma/ocrdma_verbs.c
@@ -31,6 +31,7 @@
 #include <rdma/iw_cm.h>
 #include <rdma/ib_umem.h>
 #include <rdma/ib_addr.h>
+#include <rdma/ib_cache.h>
 
 #include "ocrdma.h"
 #include "ocrdma_hw.h"
@@ -49,6 +50,7 @@ int ocrdma_query_pkey(struct ib_device *ibdev, u8 port, u16 index, u16 *pkey)
 int ocrdma_query_gid(struct ib_device *ibdev, u8 port,
 		     int index, union ib_gid *sgid)
 {
+	int ret;
 	struct ocrdma_dev *dev;
 
 	dev = get_ocrdma_dev(ibdev);
@@ -56,7 +58,22 @@ int ocrdma_query_gid(struct ib_device *ibdev, u8 port,
 	if (index >= OCRDMA_MAX_SGID)
 		return -EINVAL;
 
-	memcpy(sgid, &dev->sgid_tbl[index], sizeof(*sgid));
+	ret = ib_get_cached_gid(ibdev, port, index, sgid, NULL);
+	if (ret == -EAGAIN) {
+		memcpy(sgid, &zgid, sizeof(*sgid));
+		return 0;
+	}
+
+	return ret;
+}
+
+int ocrdma_modify_gid(struct ib_device *ibdev, u8 port_num, unsigned int index,
+		      const union ib_gid *gid, const struct ib_gid_attr *attr,
+		      void **context)
+{
+	struct ocrdma_dev *dev;
+
+	dev = get_ocrdma_dev(ibdev);
 
 	return 0;
 }
@@ -106,6 +123,15 @@ int ocrdma_query_device(struct ib_device *ibdev, struct ib_device_attr *attr)
 	return 0;
 }
 
+struct net_device *ocrdma_get_netdev(struct ib_device *ibdev, u8 port_num)
+{
+	struct ocrdma_dev *dev = get_ocrdma_dev(ibdev);
+
+	if (dev)
+		return dev->nic_info.netdev;
+
+	return NULL;
+}
 static inline void get_link_speed_and_width(struct ocrdma_dev *dev,
 					    u8 *ib_speed, u8 *ib_width)
 {
@@ -175,7 +201,10 @@ int ocrdma_query_port(struct ib_device *ibdev,
 	props->port_cap_flags =
 	    IB_PORT_CM_SUP |
 	    IB_PORT_REINIT_SUP |
-	    IB_PORT_DEVICE_MGMT_SUP | IB_PORT_VENDOR_CLASS_SUP | IB_PORT_IP_BASED_GIDS;
+	    IB_PORT_DEVICE_MGMT_SUP | IB_PORT_VENDOR_CLASS_SUP |
+	    IB_PORT_IP_BASED_GIDS | IB_PORT_ROCE;
+	if (ocrdma_is_rocev2_supported(dev))
+		props->port_cap_flags |= IB_PORT_ROCE_V2;
 	props->gid_tbl_len = OCRDMA_MAX_SGID;
 	props->pkey_tbl_len = 1;
 	props->bad_pkey_cntr = 0;
diff --git a/drivers/infiniband/hw/ocrdma/ocrdma_verbs.h b/drivers/infiniband/hw/ocrdma/ocrdma_verbs.h
index b8f7853..8204182 100644
--- a/drivers/infiniband/hw/ocrdma/ocrdma_verbs.h
+++ b/drivers/infiniband/hw/ocrdma/ocrdma_verbs.h
@@ -44,6 +44,10 @@ int ocrdma_modify_port(struct ib_device *, u8 port, int mask,
 void ocrdma_get_guid(struct ocrdma_dev *, u8 *guid);
 int ocrdma_query_gid(struct ib_device *, u8 port,
 		     int index, union ib_gid *gid);
+struct net_device *ocrdma_get_netdev(struct ib_device *device, u8 port_num);
+int ocrdma_modify_gid(struct ib_device *ibdev, u8 port_num, unsigned int index,
+		      const union ib_gid *gid, const struct ib_gid_attr *attr,
+		      void **context);
 int ocrdma_query_pkey(struct ib_device *, u8 port, u16 index, u16 *pkey);
 
 struct ib_ucontext *ocrdma_alloc_ucontext(struct ib_device *,
-- 
2.1.0


* [PATCH v3 for-next 17/33] RDMA/ocrdma: changes to support RoCE-v2 in UD path
       [not found] ` <1427318422-12004-1-git-send-email-somnath.kotur-laKkSmNT4hbQT0dZR+AlfA@public.gmane.org>
                     ` (15 preceding siblings ...)
  2015-03-25 21:20   ` [PATCH v3 for-next 16/33] RDMA/ocrdma: Changes in driver to incorporate the moving of GID Table mgmt to IB/Core Somnath Kotur
@ 2015-03-25 21:20   ` Somnath Kotur
  2015-03-25 21:20   ` [PATCH v3 for-next 18/33] RDMA/ocrdma: changes to support RoCE-v2 in RC path Somnath Kotur
                     ` (15 subsequent siblings)
  32 siblings, 0 replies; 82+ messages in thread
From: Somnath Kotur @ 2015-03-25 21:20 UTC (permalink / raw)
  To: roland-DgEjT+Ai2ygdnm+yROfE0A
  Cc: linux-rdma-u79uwXL29TY76Z2rM5mHXA, Devesh Sharma, Somnath Kotur

From: Devesh Sharma <devesh.sharma-laKkSmNT4hbQT0dZR+AlfA@public.gmane.org>

To support RoCE-v2 in the UD path, this patch makes the following
changes to the existing UD implementation.

1. AH creation resolves the GID type for a given index.
2. The protocol header is built based on that GID type.
3. The work completion reports the l3-type if the f/w supports RoCE-v2,
   and sets the IB_WC_WITH_NETWORK_HDR_TYPE flag in wc->wc_flags
   (a consumer-side sketch follows below).
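
A consumer-side sketch of the new completion fields; handle_ud_recv() is
illustrative only and assumes the wc->network_hdr_type field and the
RDMA_NETWORK_* values introduced earlier in this series:

	static void handle_ud_recv(struct ib_wc *wc)
	{
		if (!(wc->wc_flags & IB_WC_WITH_NETWORK_HDR_TYPE))
			return;	/* f/w predates RoCE-v2, no l3-type reported */

		switch (wc->network_hdr_type) {
		case RDMA_NETWORK_IPV4:
			/* receive buffer starts with an IPv4 header */
			break;
		case RDMA_NETWORK_IPV6:
			/* receive buffer starts with an IPv6 header */
			break;
		default:
			/* plain GRH (RoCE v1 / IB) */
			break;
		}
	}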

Signed-off-by: Somnath Kotur <somnath.kotur-laKkSmNT4hbQT0dZR+AlfA@public.gmane.org>
Signed-off-by: Devesh Sharma <devesh.sharma-laKkSmNT4hbQT0dZR+AlfA@public.gmane.org>
---
 drivers/infiniband/hw/ocrdma/ocrdma.h       |  1 +
 drivers/infiniband/hw/ocrdma/ocrdma_ah.c    | 69 ++++++++++++++++++++++++-----
 drivers/infiniband/hw/ocrdma/ocrdma_sli.h   |  5 ++-
 drivers/infiniband/hw/ocrdma/ocrdma_verbs.c | 23 ++++++++--
 4 files changed, 81 insertions(+), 17 deletions(-)

diff --git a/drivers/infiniband/hw/ocrdma/ocrdma.h b/drivers/infiniband/hw/ocrdma/ocrdma.h
index 97f971a..302fd0e 100644
--- a/drivers/infiniband/hw/ocrdma/ocrdma.h
+++ b/drivers/infiniband/hw/ocrdma/ocrdma.h
@@ -341,6 +341,7 @@ struct ocrdma_ah {
 	struct ocrdma_av *av;
 	u16 sgid_index;
 	u32 id;
+	u8 hdr_type;
 };
 
 struct ocrdma_qp_hwq_info {
diff --git a/drivers/infiniband/hw/ocrdma/ocrdma_ah.c b/drivers/infiniband/hw/ocrdma/ocrdma_ah.c
index 7ecd230..1bb72a0 100644
--- a/drivers/infiniband/hw/ocrdma/ocrdma_ah.c
+++ b/drivers/infiniband/hw/ocrdma/ocrdma_ah.c
@@ -39,6 +39,20 @@
 
 #define OCRDMA_VID_PCP_SHIFT	0xD
 
+static u16 ocrdma_hdr_type_to_proto_num(u8 hdr_type)
+{
+	switch (hdr_type) {
+	case OCRDMA_L3_TYPE_IB_GRH:
+		return (u16)0x8915;
+	case OCRDMA_L3_TYPE_IPV4:
+		return (u16)0x0800;
+	case OCRDMA_L3_TYPE_IPV6:
+		return (u16)0x86dd;
+	default:
+		return 0;
+	}
+}
+
 static inline int set_av_attr(struct ocrdma_dev *dev, struct ocrdma_ah *ah,
 			struct ib_ah_attr *attr, union ib_gid *sgid,
 			int pdid, bool *isvlan, u16 vlan_tag)
@@ -47,22 +61,33 @@ static inline int set_av_attr(struct ocrdma_dev *dev, struct ocrdma_ah *ah,
 	struct ocrdma_eth_vlan eth;
 	struct ocrdma_grh grh;
 	int eth_sz;
+	u16 proto_num = 0;
+	u8 nxthdr = 0x11;
+	struct iphdr ipv4;
+	union {
+		struct sockaddr     _sockaddr;
+		struct sockaddr_in  _sockaddr_in;
+		struct sockaddr_in6 _sockaddr_in6;
+	} sgid_addr, dgid_addr;
 
 	memset(&eth, 0, sizeof(eth));
 	memset(&grh, 0, sizeof(grh));
+	/* Protocol Number */
+	proto_num = ocrdma_hdr_type_to_proto_num(ah->hdr_type);
+	nxthdr = (proto_num == 0x8915) ? 0x1b : 0x11;
 
 	/* VLAN */
 	if (!vlan_tag || (vlan_tag > 0xFFF))
 		vlan_tag = dev->pvid;
 	if (vlan_tag && (vlan_tag < 0x1000)) {
 		eth.eth_type = cpu_to_be16(0x8100);
-		eth.roce_eth_type = cpu_to_be16(OCRDMA_ROCE_ETH_TYPE);
+		eth.roce_eth_type = cpu_to_be16(proto_num);
 		vlan_tag |= (dev->sl & 0x07) << OCRDMA_VID_PCP_SHIFT;
 		eth.vlan_tag = cpu_to_be16(vlan_tag);
 		eth_sz = sizeof(struct ocrdma_eth_vlan);
 		*isvlan = true;
 	} else {
-		eth.eth_type = cpu_to_be16(OCRDMA_ROCE_ETH_TYPE);
+		eth.eth_type = cpu_to_be16(proto_num);
 		eth_sz = sizeof(struct ocrdma_eth_basic);
 	}
 	/* MAC */
@@ -71,18 +96,34 @@ static inline int set_av_attr(struct ocrdma_dev *dev, struct ocrdma_ah *ah,
 	if (status)
 		return status;
 	ah->sgid_index = attr->grh.sgid_index;
-	memcpy(&grh.sgid[0], sgid->raw, sizeof(union ib_gid));
-	memcpy(&grh.dgid[0], attr->grh.dgid.raw, sizeof(attr->grh.dgid.raw));
-
-	grh.tclass_flow = cpu_to_be32((6 << 28) |
-			(attr->grh.traffic_class << 24) |
-			attr->grh.flow_label);
-	/* 0x1b is next header value in GRH */
-	grh.pdid_hoplimit = cpu_to_be32((pdid << 16) |
-			(0x1b << 8) | attr->grh.hop_limit);
 	/* Eth HDR */
 	memcpy(&ah->av->eth_hdr, &eth, eth_sz);
-	memcpy((u8 *)ah->av + eth_sz, &grh, sizeof(struct ocrdma_grh));
+	if (ah->hdr_type == RDMA_NETWORK_IPV4) {
+		*((__be16 *)&ipv4) = htons((4 << 12) | (5 << 8) |
+					   attr->grh.traffic_class);
+		ipv4.id = cpu_to_be16(pdid);
+		ipv4.frag_off = htons(IP_DF);
+		ipv4.tot_len = htons(0);
+		ipv4.ttl = attr->grh.hop_limit;
+		ipv4.protocol = nxthdr;
+		rdma_gid2ip(&sgid_addr._sockaddr, sgid);
+		ipv4.saddr = sgid_addr._sockaddr_in.sin_addr.s_addr;
+		rdma_gid2ip(&dgid_addr._sockaddr, &attr->grh.dgid);
+		ipv4.daddr = dgid_addr._sockaddr_in.sin_addr.s_addr;
+		memcpy((u8 *)ah->av + eth_sz, &ipv4, sizeof(struct iphdr));
+	} else {
+		memcpy(&grh.sgid[0], sgid->raw, sizeof(union ib_gid));
+		grh.tclass_flow = cpu_to_be32((6 << 28) |
+					      (attr->grh.traffic_class << 24) |
+					      attr->grh.flow_label);
+		memcpy(&grh.dgid[0], attr->grh.dgid.raw,
+		       sizeof(attr->grh.dgid.raw));
+		/* 0x1b is next header value in GRH */
+		grh.pdid_hoplimit = cpu_to_be32((pdid << 16) |
+						(nxthdr << 8) |
+						attr->grh.hop_limit);
+		memcpy((u8 *)ah->av + eth_sz, &grh, sizeof(struct ocrdma_grh));
+	}
 	if (*isvlan)
 		ah->av->valid |= OCRDMA_AV_VLAN_VALID;
 	ah->av->valid = cpu_to_le32(ah->av->valid);
@@ -106,6 +147,7 @@ struct ib_ah *ocrdma_create_ah(struct ib_pd *ibpd, struct ib_ah_attr *attr)
 
 	if (atomic_cmpxchg(&dev->update_sl, 1, 0))
 		ocrdma_init_service_level(dev);
+
 	ah = kzalloc(sizeof(*ah), GFP_ATOMIC);
 	if (!ah)
 		return ERR_PTR(-ENOMEM);
@@ -126,6 +168,9 @@ struct ib_ah *ocrdma_create_ah(struct ib_pd *ibpd, struct ib_ah_attr *attr)
 		vlan_tag = vlan_dev_vlan_id(sgid_attr.ndev);
 	rcu_read_unlock();
 
+	/* Get network header type for this GID */
+	ah->hdr_type = ib_gid_to_network_type(sgid_attr.gid_type, &sgid);
+
 	if (pd->uctx) {
 		status = rdma_addr_find_dmac_by_grh(&sgid, &attr->grh.dgid,
 						    attr->dmac, &vlan_tag,
diff --git a/drivers/infiniband/hw/ocrdma/ocrdma_sli.h b/drivers/infiniband/hw/ocrdma/ocrdma_sli.h
index 6b74eb9..4fb68ee 100644
--- a/drivers/infiniband/hw/ocrdma/ocrdma_sli.h
+++ b/drivers/infiniband/hw/ocrdma/ocrdma_sli.h
@@ -1681,8 +1681,11 @@ enum {
 
 	/* w1 */
 	OCRDMA_CQE_UD_XFER_LEN_SHIFT	= 16,
+	OCRDMA_CQE_UD_XFER_LEN_MASK     = 0x1FFF,
 	OCRDMA_CQE_PKEY_SHIFT		= 0,
 	OCRDMA_CQE_PKEY_MASK		= 0xFFFF,
+	OCRDMA_CQE_UD_L3TYPE_SHIFT      = 29,
+	OCRDMA_CQE_UD_L3TYPE_MASK       = 0x07,
 
 	/* w2 */
 	OCRDMA_CQE_QPN_SHIFT		= 0,
@@ -1807,7 +1810,7 @@ struct ocrdma_ewqe_ud_hdr {
 	u32 rsvd_dest_qpn;
 	u32 qkey;
 	u32 rsvd_ahid;
-	u32 rsvd;
+	u32 hdr_type;
 };
 
 /* extended wqe followed by hdr_wqe for Fast Memory register */
diff --git a/drivers/infiniband/hw/ocrdma/ocrdma_verbs.c b/drivers/infiniband/hw/ocrdma/ocrdma_verbs.c
index 47413c3..0444850 100644
--- a/drivers/infiniband/hw/ocrdma/ocrdma_verbs.c
+++ b/drivers/infiniband/hw/ocrdma/ocrdma_verbs.c
@@ -31,7 +31,6 @@
 #include <rdma/iw_cm.h>
 #include <rdma/ib_umem.h>
 #include <rdma/ib_addr.h>
-#include <rdma/ib_cache.h>
 
 #include "ocrdma.h"
 #include "ocrdma_hw.h"
@@ -1963,6 +1962,7 @@ static void ocrdma_build_ud_hdr(struct ocrdma_qp *qp,
 	else
 		ud_hdr->qkey = wr->wr.ud.remote_qkey;
 	ud_hdr->rsvd_ahid = ah->id;
+	ud_hdr->hdr_type = ah->hdr_type;
 	if (ah->av->valid & OCRDMA_AV_VLAN_VALID)
 		hdr->cw |= (OCRDMA_FLAG_AH_VLAN_PR << OCRDMA_WQE_FLAGS_SHIFT);
 }
@@ -2698,9 +2698,11 @@ static bool ocrdma_poll_scqe(struct ocrdma_qp *qp, struct ocrdma_cqe *cqe,
 	return expand;
 }
 
-static int ocrdma_update_ud_rcqe(struct ib_wc *ibwc, struct ocrdma_cqe *cqe)
+static int ocrdma_update_ud_rcqe(struct ocrdma_dev *dev, struct ib_wc *ibwc,
+				 struct ocrdma_cqe *cqe)
 {
 	int status;
+	u16 hdr_type = 0;
 
 	status = (le32_to_cpu(cqe->flags_status_srcqpn) &
 		OCRDMA_CQE_UD_STATUS_MASK) >> OCRDMA_CQE_UD_STATUS_SHIFT;
@@ -2710,7 +2712,17 @@ static int ocrdma_update_ud_rcqe(struct ib_wc *ibwc, struct ocrdma_cqe *cqe)
 						OCRDMA_CQE_PKEY_MASK;
 	ibwc->wc_flags = IB_WC_GRH;
 	ibwc->byte_len = (le32_to_cpu(cqe->ud.rxlen_pkey) >>
-					OCRDMA_CQE_UD_XFER_LEN_SHIFT);
+			  OCRDMA_CQE_UD_XFER_LEN_SHIFT) &
+			  OCRDMA_CQE_UD_XFER_LEN_MASK;
+
+	if (ocrdma_is_rocev2_supported(dev)) {
+		hdr_type = (le32_to_cpu(cqe->ud.rxlen_pkey) >>
+			    OCRDMA_CQE_UD_L3TYPE_SHIFT) &
+			    OCRDMA_CQE_UD_L3TYPE_MASK;
+		ibwc->wc_flags |= IB_WC_WITH_NETWORK_HDR_TYPE;
+		ibwc->network_hdr_type = hdr_type;
+	}
+
 	return status;
 }
 
@@ -2773,12 +2785,15 @@ static bool ocrdma_poll_err_rcqe(struct ocrdma_qp *qp, struct ocrdma_cqe *cqe,
 static void ocrdma_poll_success_rcqe(struct ocrdma_qp *qp,
 				     struct ocrdma_cqe *cqe, struct ib_wc *ibwc)
 {
+	struct ocrdma_dev *dev;
+
+	dev = get_ocrdma_dev(qp->ibqp.device);
 	ibwc->opcode = IB_WC_RECV;
 	ibwc->qp = &qp->ibqp;
 	ibwc->status = IB_WC_SUCCESS;
 
 	if (qp->qp_type == IB_QPT_UD || qp->qp_type == IB_QPT_GSI)
-		ocrdma_update_ud_rcqe(ibwc, cqe);
+		ocrdma_update_ud_rcqe(dev, ibwc, cqe);
 	else
 		ibwc->byte_len = le32_to_cpu(cqe->rq.rxlen);
 
-- 
2.1.0


* [PATCH v3 for-next 18/33] RDMA/ocrdma: changes to support RoCE-v2 in RC path
       [not found] ` <1427318422-12004-1-git-send-email-somnath.kotur-laKkSmNT4hbQT0dZR+AlfA@public.gmane.org>
                     ` (16 preceding siblings ...)
  2015-03-25 21:20   ` [PATCH v3 for-next 17/33] RDMA/ocrdma: changes to support RoCE-v2 in UD path Somnath Kotur
@ 2015-03-25 21:20   ` Somnath Kotur
  2015-03-25 21:20   ` [PATCH v3 for-next 19/33] RDMA/ocrdma: changes to support user AH creation Somnath Kotur
                     ` (14 subsequent siblings)
  32 siblings, 0 replies; 82+ messages in thread
From: Somnath Kotur @ 2015-03-25 21:20 UTC (permalink / raw)
  To: roland-DgEjT+Ai2ygdnm+yROfE0A
  Cc: linux-rdma-u79uwXL29TY76Z2rM5mHXA, Devesh Sharma, Somnath Kotur

From: Devesh Sharma <devesh.sharma-laKkSmNT4hbQT0dZR+AlfA@public.gmane.org>

To support RoCE-v2 in the RC path, this patch implements the following
changes:
1. Get the GID type for a given sgid.
2. Based on the GID type, extract the IPv4 L3 addresses and hand them
   to the FW (see the conversion sketch below).
3. Provide the l3-type to the FW.
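
A minimal sketch of the GID-to-IPv4 conversion, assuming the
rdma_gid2ip() helper used by this series; an RoCE-v2 IPv4 GID is a
v4-mapped IPv6 address, so the 32-bit address occupies the last four
bytes of the GID:

	static __be32 gid_to_ipv4(union ib_gid *gid)
	{
		union {
			struct sockaddr     _sockaddr;
			struct sockaddr_in  _sockaddr_in;
			struct sockaddr_in6 _sockaddr_in6;
		} addr;

		/* fills a sockaddr_in when the GID is v4-mapped */
		rdma_gid2ip(&addr._sockaddr, gid);
		return addr._sockaddr_in.sin_addr.s_addr;
	}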

Signed-off-by: Somnath Kotur <somnath.kotur-laKkSmNT4hbQT0dZR+AlfA@public.gmane.org>
Signed-off-by: Devesh Sharma <devesh.sharma-laKkSmNT4hbQT0dZR+AlfA@public.gmane.org>
---
 drivers/infiniband/hw/ocrdma/ocrdma_hw.c | 30 ++++++++++++++++++++++++++++--
 1 file changed, 28 insertions(+), 2 deletions(-)

diff --git a/drivers/infiniband/hw/ocrdma/ocrdma_hw.c b/drivers/infiniband/hw/ocrdma/ocrdma_hw.c
index 20f9e8f..147fccf 100644
--- a/drivers/infiniband/hw/ocrdma/ocrdma_hw.c
+++ b/drivers/infiniband/hw/ocrdma/ocrdma_hw.c
@@ -2433,7 +2433,13 @@ static int ocrdma_set_av_params(struct ocrdma_qp *qp,
 	union ib_gid sgid, zgid;
 	struct ib_gid_attr sgid_attr;
 	u32 vlan_id = 0xffff;
-	u8 mac_addr[6];
+	u8 mac_addr[6], hdr_type;
+	union {
+		struct sockaddr     _sockaddr;
+		struct sockaddr_in  _sockaddr_in;
+		struct sockaddr_in6 _sockaddr_in6;
+	} sgid_addr, dgid_addr;
+
 	struct ocrdma_dev *dev = get_ocrdma_dev(qp->ibqp.device);
 
 	if ((ah_attr->ah_flags & IB_AH_GRH) == 0)
@@ -2448,6 +2454,8 @@ static int ocrdma_set_av_params(struct ocrdma_qp *qp,
 	cmd->params.hop_lmt_rq_psn |=
 	    (ah_attr->grh.hop_limit << OCRDMA_QP_PARAMS_HOP_LMT_SHIFT);
 	cmd->flags |= OCRDMA_QP_PARA_FLOW_LBL_VALID;
+
+	/* GIDs */
 	memcpy(&cmd->params.dgid[0], &ah_attr->grh.dgid.raw[0],
 	       sizeof(cmd->params.dgid));
 
@@ -2471,17 +2479,35 @@ static int ocrdma_set_av_params(struct ocrdma_qp *qp,
 		return status;
 	cmd->params.dmac_b0_to_b3 = mac_addr[0] | (mac_addr[1] << 8) |
 				(mac_addr[2] << 16) | (mac_addr[3] << 24);
+	hdr_type = ib_gid_to_network_type(sgid_attr.gid_type, &sgid);
+	if (hdr_type == RDMA_NETWORK_IPV4) {
+		status = rdma_gid2ip(&sgid_addr._sockaddr, &sgid);
+		if (status)
+			return status;
+		status = rdma_gid2ip(&dgid_addr._sockaddr, &ah_attr->grh.dgid);
+		if (status)
+			return status;
+		memcpy(&cmd->params.dgid[0],
+		       &dgid_addr._sockaddr_in.sin_addr.s_addr, 4);
+		memcpy(&cmd->params.sgid[0],
+		       &sgid_addr._sockaddr_in.sin_addr.s_addr, 4);
+	}
 	/* convert them to LE format. */
 	ocrdma_cpu_to_le32(&cmd->params.dgid[0], sizeof(cmd->params.dgid));
 	ocrdma_cpu_to_le32(&cmd->params.sgid[0], sizeof(cmd->params.sgid));
 	cmd->params.vlan_dmac_b4_to_b5 = mac_addr[4] | (mac_addr[5] << 8);
-	if (attr_mask & IB_QP_VID) {
+	if (vlan_id < 0x1000) {
 		cmd->params.vlan_dmac_b4_to_b5 |=
 		    vlan_id << OCRDMA_QP_PARAMS_VLAN_SHIFT;
 		cmd->flags |= OCRDMA_QP_PARA_VLAN_EN_VALID;
 		cmd->params.rnt_rc_sl_fl |=
 			(dev->sl & 0x07) << OCRDMA_QP_PARAMS_SL_SHIFT;
 	}
+
+	cmd->params.max_sge_recv_flags |=
+					 ((hdr_type <<
+					 OCRDMA_QP_PARAMS_FLAGS_L3_TYPE_SHIFT) &
+					 OCRDMA_QP_PARAMS_FLAGS_L3_TYPE_MASK);
 	return 0;
 }
 
-- 
2.1.0


* [PATCH v3 for-next 19/33] RDMA/ocrdma: changes to support user AH creation
       [not found] ` <1427318422-12004-1-git-send-email-somnath.kotur-laKkSmNT4hbQT0dZR+AlfA@public.gmane.org>
                     ` (17 preceding siblings ...)
  2015-03-25 21:20   ` [PATCH v3 for-next 18/33] RDMA/ocrdma: changes to support RoCE-v2 in RC path Somnath Kotur
@ 2015-03-25 21:20   ` Somnath Kotur
  2015-03-25 21:20   ` [PATCH v3 for-next 20/33] IB/mlx4: Remove gid table management for RoCE Somnath Kotur
                     ` (13 subsequent siblings)
  32 siblings, 0 replies; 82+ messages in thread
From: Somnath Kotur @ 2015-03-25 21:20 UTC (permalink / raw)
  To: roland-DgEjT+Ai2ygdnm+yROfE0A
  Cc: linux-rdma-u79uwXL29TY76Z2rM5mHXA, Devesh Sharma, Somnath Kotur

From: Devesh Sharma <devesh.sharma-laKkSmNT4hbQT0dZR+AlfA@public.gmane.org>

To support AH creation from user space, this patch uses the ahid field
to convey the l3-type to the user-space library. The library is
responsible for decoding the l3-type out of ahid, as sketched below.
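
A sketch of the decode the library is expected to perform on the 32-bit
word written at ah_tbl.va; masks and shifts are as defined in
ocrdma_ah.h after this patch, the variable names are illustrative:

	u32 ahid     = *ahid_addr;
	u16 ah_id    = ahid & OCRDMA_AH_ID_MASK;
	u8  l3_type  = (ahid >> OCRDMA_AH_L3_TYPE_SHIFT) &
		       OCRDMA_AH_L3_TYPE_MASK;
	int has_vlan = (ahid >> OCRDMA_AH_VLAN_VALID_SHIFT) &
		       OCRDMA_AH_VLAN_VALID_MASK;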

Signed-off-by: Somnath Kotur <somnath.kotur-laKkSmNT4hbQT0dZR+AlfA@public.gmane.org>
Signed-off-by: Devesh Sharma <devesh.sharma-laKkSmNT4hbQT0dZR+AlfA@public.gmane.org>
---
 drivers/infiniband/hw/ocrdma/ocrdma_ah.c | 5 +++++
 drivers/infiniband/hw/ocrdma/ocrdma_ah.h | 5 +++--
 2 files changed, 8 insertions(+), 2 deletions(-)

diff --git a/drivers/infiniband/hw/ocrdma/ocrdma_ah.c b/drivers/infiniband/hw/ocrdma/ocrdma_ah.c
index 1bb72a0..65a39cc 100644
--- a/drivers/infiniband/hw/ocrdma/ocrdma_ah.c
+++ b/drivers/infiniband/hw/ocrdma/ocrdma_ah.c
@@ -191,6 +191,11 @@ struct ib_ah *ocrdma_create_ah(struct ib_pd *ibpd, struct ib_ah_attr *attr)
 		ahid_addr = pd->uctx->ah_tbl.va + attr->dlid;
 		*ahid_addr = 0;
 		*ahid_addr |= ah->id & OCRDMA_AH_ID_MASK;
+		if (ocrdma_is_rocev2_supported(dev)) {
+			*ahid_addr |= ((u32)ah->hdr_type &
+				       OCRDMA_AH_L3_TYPE_MASK) <<
+				       OCRDMA_AH_L3_TYPE_SHIFT;
+		}
 		if (isvlan)
 			*ahid_addr |= (OCRDMA_AH_VLAN_VALID_MASK <<
 				       OCRDMA_AH_VLAN_VALID_SHIFT);
diff --git a/drivers/infiniband/hw/ocrdma/ocrdma_ah.h b/drivers/infiniband/hw/ocrdma/ocrdma_ah.h
index 726a87c..ed45ecd 100644
--- a/drivers/infiniband/hw/ocrdma/ocrdma_ah.h
+++ b/drivers/infiniband/hw/ocrdma/ocrdma_ah.h
@@ -31,9 +31,10 @@
 enum {
 	OCRDMA_AH_ID_MASK		= 0x3FF,
 	OCRDMA_AH_VLAN_VALID_MASK	= 0x01,
-	OCRDMA_AH_VLAN_VALID_SHIFT	= 0x1F
+	OCRDMA_AH_VLAN_VALID_SHIFT	= 0x1F,
+	OCRDMA_AH_L3_TYPE_MASK		= 0x03,
+	OCRDMA_AH_L3_TYPE_SHIFT		= 0x1D /* shift of 29 bits */
 };
-
 struct ib_ah *ocrdma_create_ah(struct ib_pd *, struct ib_ah_attr *);
 int ocrdma_destroy_ah(struct ib_ah *);
 int ocrdma_query_ah(struct ib_ah *, struct ib_ah_attr *);
-- 
2.1.0


* [PATCH v3 for-next 20/33] IB/mlx4: Remove gid table management for RoCE
       [not found] ` <1427318422-12004-1-git-send-email-somnath.kotur-laKkSmNT4hbQT0dZR+AlfA@public.gmane.org>
                     ` (18 preceding siblings ...)
  2015-03-25 21:20   ` [PATCH v3 for-next 19/33] RDMA/ocrdma: changes to support user AH creation Somnath Kotur
@ 2015-03-25 21:20   ` Somnath Kotur
  2015-03-25 21:20   ` [PATCH v3 for-next 21/33] IB/mlx4: Replace spin_lock with rw_semaphore Somnath Kotur
                     ` (12 subsequent siblings)
  32 siblings, 0 replies; 82+ messages in thread
From: Somnath Kotur @ 2015-03-25 21:20 UTC (permalink / raw)
  To: roland-DgEjT+Ai2ygdnm+yROfE0A
  Cc: linux-rdma-u79uwXL29TY76Z2rM5mHXA, Moni Shoua, Somnath Kotur

From: Moni Shoua <monis-VPRAkNaXOzVWk0Htik3J/w@public.gmane.org>

RoCE GID table management has moved to the InfiniBand core driver. The
core driver is now responsible for populating the GID table and for
supplying query and lookup functions for GIDs; HW drivers only have to
write GID entries to the network adapter. The query_gid hook should now
return the answer from the cache when the link layer is Ethernet.
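
For Ethernet ports the hook reduces to the pattern below (mirroring the
mlx4 hunk in this patch): -EAGAIN from ib_get_cached_gid() means the
cache entry is currently being modified, and is reported to the caller
as the zero GID rather than as an error:

	ret = ib_get_cached_gid(ibdev, port, index, gid, NULL);
	if (ret == -EAGAIN) {
		/* entry in flux: report the zero GID, not an error */
		memcpy(gid, &zgid, sizeof(*gid));
		return 0;
	}
	return ret;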

Signed-off-by: Moni Shoua <monis-VPRAkNaXOzVWk0Htik3J/w@public.gmane.org>
Signed-off-by: Somnath Kotur <somnath.kotur-laKkSmNT4hbQT0dZR+AlfA@public.gmane.org>
---
 drivers/infiniband/hw/mlx4/main.c    | 495 +----------------------------------
 drivers/infiniband/hw/mlx4/mlx4_ib.h |   4 -
 2 files changed, 14 insertions(+), 485 deletions(-)

diff --git a/drivers/infiniband/hw/mlx4/main.c b/drivers/infiniband/hw/mlx4/main.c
index 6fa5e49..91caffc 100644
--- a/drivers/infiniband/hw/mlx4/main.c
+++ b/drivers/infiniband/hw/mlx4/main.c
@@ -45,6 +45,7 @@
 #include <rdma/ib_smi.h>
 #include <rdma/ib_user_verbs.h>
 #include <rdma/ib_addr.h>
+#include <rdma/ib_cache.h>
 
 #include <linux/mlx4/driver.h>
 #include <linux/mlx4/cmd.h>
@@ -74,13 +75,6 @@ static const char mlx4_ib_version[] =
 	DRV_NAME ": Mellanox ConnectX InfiniBand driver v"
 	DRV_VERSION " (" DRV_RELDATE ")\n";
 
-struct update_gid_work {
-	struct work_struct	work;
-	union ib_gid		gids[128];
-	struct mlx4_ib_dev     *dev;
-	int			port;
-};
-
 static void do_slave_init(struct mlx4_ib_dev *ibdev, int slave, int do_init);
 
 static struct workqueue_struct *wq;
@@ -474,23 +468,21 @@ out:
 	return err;
 }
 
-static int iboe_query_gid(struct ib_device *ibdev, u8 port, int index,
-			  union ib_gid *gid)
-{
-	struct mlx4_ib_dev *dev = to_mdev(ibdev);
-
-	*gid = dev->iboe.gid_table[port - 1][index];
-
-	return 0;
-}
-
 static int mlx4_ib_query_gid(struct ib_device *ibdev, u8 port, int index,
 			     union ib_gid *gid)
 {
-	if (rdma_port_get_link_layer(ibdev, port) == IB_LINK_LAYER_INFINIBAND)
+	int ret;
+
+	if (ib_cache_use_roce_gid_cache(ibdev, port))
 		return __mlx4_ib_query_gid(ibdev, port, index, gid, 0);
-	else
-		return iboe_query_gid(ibdev, port, index, gid);
+
+	ret = ib_get_cached_gid(ibdev, port, index, gid, NULL);
+	if (ret == -EAGAIN) {
+		memcpy(gid, &zgid, sizeof(*gid));
+		return 0;
+	}
+
+	return ret;
 }
 
 int __mlx4_ib_query_pkey(struct ib_device *ibdev, u8 port, u16 index,
@@ -1480,273 +1472,6 @@ static struct device_attribute *mlx4_class_attributes[] = {
 	&dev_attr_board_id
 };
 
-static void mlx4_addrconf_ifid_eui48(u8 *eui, u16 vlan_id,
-				     struct net_device *dev)
-{
-	memcpy(eui, dev->dev_addr, 3);
-	memcpy(eui + 5, dev->dev_addr + 3, 3);
-	if (vlan_id < 0x1000) {
-		eui[3] = vlan_id >> 8;
-		eui[4] = vlan_id & 0xff;
-	} else {
-		eui[3] = 0xff;
-		eui[4] = 0xfe;
-	}
-	eui[0] ^= 2;
-}
-
-static void update_gids_task(struct work_struct *work)
-{
-	struct update_gid_work *gw = container_of(work, struct update_gid_work, work);
-	struct mlx4_cmd_mailbox *mailbox;
-	union ib_gid *gids;
-	int err;
-	struct mlx4_dev	*dev = gw->dev->dev;
-	int is_bonded = mlx4_is_bonded(dev);
-
-	if (!gw->dev->ib_active)
-		return;
-
-	mailbox = mlx4_alloc_cmd_mailbox(dev);
-	if (IS_ERR(mailbox)) {
-		pr_warn("update gid table failed %ld\n", PTR_ERR(mailbox));
-		return;
-	}
-
-	gids = mailbox->buf;
-	memcpy(gids, gw->gids, sizeof gw->gids);
-
-	err = mlx4_cmd(dev, mailbox->dma, MLX4_SET_PORT_GID_TABLE << 8 | gw->port,
-		       1, MLX4_CMD_SET_PORT, MLX4_CMD_TIME_CLASS_B,
-		       MLX4_CMD_WRAPPED);
-	if (err)
-		pr_warn("set port command failed\n");
-	else
-		if ((gw->port == 1) || !is_bonded)
-			mlx4_ib_dispatch_event(gw->dev,
-					       is_bonded ? 1 : gw->port,
-					       IB_EVENT_GID_CHANGE);
-
-	mlx4_free_cmd_mailbox(dev, mailbox);
-	kfree(gw);
-}
-
-static void reset_gids_task(struct work_struct *work)
-{
-	struct update_gid_work *gw =
-			container_of(work, struct update_gid_work, work);
-	struct mlx4_cmd_mailbox *mailbox;
-	union ib_gid *gids;
-	int err;
-	struct mlx4_dev	*dev = gw->dev->dev;
-
-	if (!gw->dev->ib_active)
-		return;
-
-	mailbox = mlx4_alloc_cmd_mailbox(dev);
-	if (IS_ERR(mailbox)) {
-		pr_warn("reset gid table failed\n");
-		goto free;
-	}
-
-	gids = mailbox->buf;
-	memcpy(gids, gw->gids, sizeof(gw->gids));
-
-	if (mlx4_ib_port_link_layer(&gw->dev->ib_dev, gw->port) ==
-				    IB_LINK_LAYER_ETHERNET) {
-		err = mlx4_cmd(dev, mailbox->dma,
-			       MLX4_SET_PORT_GID_TABLE << 8 | gw->port,
-			       1, MLX4_CMD_SET_PORT,
-			       MLX4_CMD_TIME_CLASS_B,
-			       MLX4_CMD_WRAPPED);
-		if (err)
-			pr_warn(KERN_WARNING
-				"set port %d command failed\n", gw->port);
-	}
-
-	mlx4_free_cmd_mailbox(dev, mailbox);
-free:
-	kfree(gw);
-}
-
-static int update_gid_table(struct mlx4_ib_dev *dev, int port,
-			    union ib_gid *gid, int clear,
-			    int default_gid)
-{
-	struct update_gid_work *work;
-	int i;
-	int need_update = 0;
-	int free = -1;
-	int found = -1;
-	int max_gids;
-
-	if (default_gid) {
-		free = 0;
-	} else {
-		max_gids = dev->dev->caps.gid_table_len[port];
-		for (i = 1; i < max_gids; ++i) {
-			if (!memcmp(&dev->iboe.gid_table[port - 1][i], gid,
-				    sizeof(*gid)))
-				found = i;
-
-			if (clear) {
-				if (found >= 0) {
-					need_update = 1;
-					dev->iboe.gid_table[port - 1][found] =
-						zgid;
-					break;
-				}
-			} else {
-				if (found >= 0)
-					break;
-
-				if (free < 0 &&
-				    !memcmp(&dev->iboe.gid_table[port - 1][i],
-					    &zgid, sizeof(*gid)))
-					free = i;
-			}
-		}
-	}
-
-	if (found == -1 && !clear && free >= 0) {
-		dev->iboe.gid_table[port - 1][free] = *gid;
-		need_update = 1;
-	}
-
-	if (!need_update)
-		return 0;
-
-	work = kzalloc(sizeof(*work), GFP_ATOMIC);
-	if (!work)
-		return -ENOMEM;
-
-	memcpy(work->gids, dev->iboe.gid_table[port - 1], sizeof(work->gids));
-	INIT_WORK(&work->work, update_gids_task);
-	work->port = port;
-	work->dev = dev;
-	queue_work(wq, &work->work);
-
-	return 0;
-}
-
-static void mlx4_make_default_gid(struct  net_device *dev, union ib_gid *gid)
-{
-	gid->global.subnet_prefix = cpu_to_be64(0xfe80000000000000LL);
-	mlx4_addrconf_ifid_eui48(&gid->raw[8], 0xffff, dev);
-}
-
-
-static int reset_gid_table(struct mlx4_ib_dev *dev, u8 port)
-{
-	struct update_gid_work *work;
-
-	work = kzalloc(sizeof(*work), GFP_ATOMIC);
-	if (!work)
-		return -ENOMEM;
-
-	memset(dev->iboe.gid_table[port - 1], 0, sizeof(work->gids));
-	memset(work->gids, 0, sizeof(work->gids));
-	INIT_WORK(&work->work, reset_gids_task);
-	work->dev = dev;
-	work->port = port;
-	queue_work(wq, &work->work);
-	return 0;
-}
-
-static int mlx4_ib_addr_event(int event, struct net_device *event_netdev,
-			      struct mlx4_ib_dev *ibdev, union ib_gid *gid)
-{
-	struct mlx4_ib_iboe *iboe;
-	int port = 0;
-	struct net_device *real_dev = rdma_vlan_dev_real_dev(event_netdev) ?
-				rdma_vlan_dev_real_dev(event_netdev) :
-				event_netdev;
-	union ib_gid default_gid;
-
-	mlx4_make_default_gid(real_dev, &default_gid);
-
-	if (!memcmp(gid, &default_gid, sizeof(*gid)))
-		return 0;
-
-	if (event != NETDEV_DOWN && event != NETDEV_UP)
-		return 0;
-
-	if ((real_dev != event_netdev) &&
-	    (event == NETDEV_DOWN) &&
-	    rdma_link_local_addr((struct in6_addr *)gid))
-		return 0;
-
-	iboe = &ibdev->iboe;
-	spin_lock_bh(&iboe->lock);
-
-	for (port = 1; port <= ibdev->dev->caps.num_ports; ++port)
-		if ((netif_is_bond_master(real_dev) &&
-		     (real_dev == iboe->masters[port - 1])) ||
-		     (!netif_is_bond_master(real_dev) &&
-		     (real_dev == iboe->netdevs[port - 1])))
-			update_gid_table(ibdev, port, gid,
-					 event == NETDEV_DOWN, 0);
-
-	spin_unlock_bh(&iboe->lock);
-	return 0;
-
-}
-
-static u8 mlx4_ib_get_dev_port(struct net_device *dev,
-			       struct mlx4_ib_dev *ibdev)
-{
-	u8 port = 0;
-	struct mlx4_ib_iboe *iboe;
-	struct net_device *real_dev = rdma_vlan_dev_real_dev(dev) ?
-				rdma_vlan_dev_real_dev(dev) : dev;
-
-	iboe = &ibdev->iboe;
-
-	for (port = 1; port <= ibdev->dev->caps.num_ports; ++port)
-		if ((netif_is_bond_master(real_dev) &&
-		     (real_dev == iboe->masters[port - 1])) ||
-		     (!netif_is_bond_master(real_dev) &&
-		     (real_dev == iboe->netdevs[port - 1])))
-			break;
-
-	if ((port == 0) || (port > ibdev->dev->caps.num_ports))
-		return 0;
-	else
-		return port;
-}
-
-static int mlx4_ib_inet_event(struct notifier_block *this, unsigned long event,
-				void *ptr)
-{
-	struct mlx4_ib_dev *ibdev;
-	struct in_ifaddr *ifa = ptr;
-	union ib_gid gid;
-	struct net_device *event_netdev = ifa->ifa_dev->dev;
-
-	ipv6_addr_set_v4mapped(ifa->ifa_address, (struct in6_addr *)&gid);
-
-	ibdev = container_of(this, struct mlx4_ib_dev, iboe.nb_inet);
-
-	mlx4_ib_addr_event(event, event_netdev, ibdev, &gid);
-	return NOTIFY_DONE;
-}
-
-#if IS_ENABLED(CONFIG_IPV6)
-static int mlx4_ib_inet6_event(struct notifier_block *this, unsigned long event,
-				void *ptr)
-{
-	struct mlx4_ib_dev *ibdev;
-	struct inet6_ifaddr *ifa = ptr;
-	union  ib_gid *gid = (union ib_gid *)&ifa->addr;
-	struct net_device *event_netdev = ifa->idev->dev;
-
-	ibdev = container_of(this, struct mlx4_ib_dev, iboe.nb_inet6);
-
-	mlx4_ib_addr_event(event, event_netdev, ibdev, gid);
-	return NOTIFY_DONE;
-}
-#endif
-
 #define MLX4_IB_INVALID_MAC	((u64)-1)
 static void mlx4_ib_update_qps(struct mlx4_ib_dev *ibdev,
 			       struct net_device *dev,
@@ -1805,94 +1530,6 @@ unlock:
 	mutex_unlock(&ibdev->qp1_proxy_lock[port - 1]);
 }
 
-static void mlx4_ib_get_dev_addr(struct net_device *dev,
-				 struct mlx4_ib_dev *ibdev, u8 port)
-{
-	struct in_device *in_dev;
-#if IS_ENABLED(CONFIG_IPV6)
-	struct inet6_dev *in6_dev;
-	union ib_gid  *pgid;
-	struct inet6_ifaddr *ifp;
-	union ib_gid default_gid;
-#endif
-	union ib_gid gid;
-
-
-	if ((port == 0) || (port > ibdev->dev->caps.num_ports))
-		return;
-
-	/* IPv4 gids */
-	in_dev = in_dev_get(dev);
-	if (in_dev) {
-		for_ifa(in_dev) {
-			/*ifa->ifa_address;*/
-			ipv6_addr_set_v4mapped(ifa->ifa_address,
-					       (struct in6_addr *)&gid);
-			update_gid_table(ibdev, port, &gid, 0, 0);
-		}
-		endfor_ifa(in_dev);
-		in_dev_put(in_dev);
-	}
-#if IS_ENABLED(CONFIG_IPV6)
-	mlx4_make_default_gid(dev, &default_gid);
-	/* IPv6 gids */
-	in6_dev = in6_dev_get(dev);
-	if (in6_dev) {
-		read_lock_bh(&in6_dev->lock);
-		list_for_each_entry(ifp, &in6_dev->addr_list, if_list) {
-			pgid = (union ib_gid *)&ifp->addr;
-			if (!memcmp(pgid, &default_gid, sizeof(*pgid)))
-				continue;
-			update_gid_table(ibdev, port, pgid, 0, 0);
-		}
-		read_unlock_bh(&in6_dev->lock);
-		in6_dev_put(in6_dev);
-	}
-#endif
-}
-
-static void mlx4_ib_set_default_gid(struct mlx4_ib_dev *ibdev,
-				 struct  net_device *dev, u8 port)
-{
-	union ib_gid gid;
-	mlx4_make_default_gid(dev, &gid);
-	update_gid_table(ibdev, port, &gid, 0, 1);
-}
-
-static int mlx4_ib_init_gid_table(struct mlx4_ib_dev *ibdev)
-{
-	struct	net_device *dev;
-	struct mlx4_ib_iboe *iboe = &ibdev->iboe;
-	int i;
-	int err = 0;
-
-	for (i = 1; i <= ibdev->num_ports; ++i) {
-		if (rdma_port_get_link_layer(&ibdev->ib_dev, i) ==
-		    IB_LINK_LAYER_ETHERNET) {
-			err = reset_gid_table(ibdev, i);
-			if (err)
-				goto out;
-		}
-	}
-
-	read_lock(&dev_base_lock);
-	spin_lock_bh(&iboe->lock);
-
-	for_each_netdev(&init_net, dev) {
-		u8 port = mlx4_ib_get_dev_port(dev, ibdev);
-		/* port will be non-zero only for ETH ports */
-		if (port) {
-			mlx4_ib_set_default_gid(ibdev, dev, port);
-			mlx4_ib_get_dev_addr(dev, ibdev, port);
-		}
-	}
-
-	spin_unlock_bh(&iboe->lock);
-	read_unlock(&dev_base_lock);
-out:
-	return err;
-}
-
 static void mlx4_ib_scan_netdevs(struct mlx4_ib_dev *ibdev,
 				 struct net_device *dev,
 				 unsigned long event)
@@ -1902,81 +1539,22 @@ static void mlx4_ib_scan_netdevs(struct mlx4_ib_dev *ibdev,
 	int update_qps_port = -1;
 	int port;
 
+	ASSERT_RTNL();
+
 	iboe = &ibdev->iboe;
 
 	spin_lock_bh(&iboe->lock);
 	mlx4_foreach_ib_transport_port(port, ibdev->dev) {
-		enum ib_port_state	port_state = IB_PORT_NOP;
-		struct net_device *old_master = iboe->masters[port - 1];
-		struct net_device *curr_netdev;
-		struct net_device *curr_master;
 
 		iboe->netdevs[port - 1] =
 			mlx4_get_protocol_dev(ibdev->dev, MLX4_PROT_ETH, port);
-		if (iboe->netdevs[port - 1])
-			mlx4_ib_set_default_gid(ibdev,
-						iboe->netdevs[port - 1], port);
-		curr_netdev = iboe->netdevs[port - 1];
-
-		if (iboe->netdevs[port - 1] &&
-		    netif_is_bond_slave(iboe->netdevs[port - 1])) {
-			iboe->masters[port - 1] = netdev_master_upper_dev_get(
-				iboe->netdevs[port - 1]);
-		} else {
-			iboe->masters[port - 1] = NULL;
-		}
-		curr_master = iboe->masters[port - 1];
 
 		if (dev == iboe->netdevs[port - 1] &&
 		    (event == NETDEV_CHANGEADDR || event == NETDEV_REGISTER ||
 		     event == NETDEV_UP || event == NETDEV_CHANGE))
 			update_qps_port = port;
 
-		if (curr_netdev) {
-			port_state = (netif_running(curr_netdev) && netif_carrier_ok(curr_netdev)) ?
-						IB_PORT_ACTIVE : IB_PORT_DOWN;
-			mlx4_ib_set_default_gid(ibdev, curr_netdev, port);
-			if (curr_master) {
-				/* if using bonding/team and a slave port is down, we
-				 * don't want the bond IP based gids in the table since
-				 * flows that select port by gid may get the down port.
-				*/
-				if (port_state == IB_PORT_DOWN &&
-				    !mlx4_is_bonded(ibdev->dev)) {
-					reset_gid_table(ibdev, port);
-					mlx4_ib_set_default_gid(ibdev,
-								curr_netdev,
-								port);
-				} else {
-					/* gids from the upper dev (bond/team)
-					 * should appear in port's gid table
-					*/
-					mlx4_ib_get_dev_addr(curr_master,
-							     ibdev, port);
-				}
-			}
-			/* if bonding is used it is possible that we add it to
-			 * masters only after IP address is assigned to the
-			 * net bonding interface.
-			*/
-			if (curr_master && (old_master != curr_master)) {
-				reset_gid_table(ibdev, port);
-				mlx4_ib_set_default_gid(ibdev,
-							curr_netdev, port);
-				mlx4_ib_get_dev_addr(curr_master, ibdev, port);
-			}
-
-			if (!curr_master && (old_master != curr_master)) {
-				reset_gid_table(ibdev, port);
-				mlx4_ib_set_default_gid(ibdev,
-							curr_netdev, port);
-				mlx4_ib_get_dev_addr(curr_netdev, ibdev, port);
-			}
-		} else {
-			reset_gid_table(ibdev, port);
-		}
 	}
-
 	spin_unlock_bh(&iboe->lock);
 
 	if (update_qps_port > 0)
@@ -2347,26 +1925,6 @@ static void *mlx4_ib_add(struct mlx4_dev *dev)
 				goto err_notif;
 			}
 		}
-		if (!iboe->nb_inet.notifier_call) {
-			iboe->nb_inet.notifier_call = mlx4_ib_inet_event;
-			err = register_inetaddr_notifier(&iboe->nb_inet);
-			if (err) {
-				iboe->nb_inet.notifier_call = NULL;
-				goto err_notif;
-			}
-		}
-#if IS_ENABLED(CONFIG_IPV6)
-		if (!iboe->nb_inet6.notifier_call) {
-			iboe->nb_inet6.notifier_call = mlx4_ib_inet6_event;
-			err = register_inet6addr_notifier(&iboe->nb_inet6);
-			if (err) {
-				iboe->nb_inet6.notifier_call = NULL;
-				goto err_notif;
-			}
-		}
-#endif
-		if (mlx4_ib_init_gid_table(ibdev))
-			goto err_notif;
 	}
 
 	for (j = 0; j < ARRAY_SIZE(mlx4_class_attributes); ++j) {
@@ -2397,18 +1955,6 @@ err_notif:
 			pr_warn("failure unregistering notifier\n");
 		ibdev->iboe.nb.notifier_call = NULL;
 	}
-	if (ibdev->iboe.nb_inet.notifier_call) {
-		if (unregister_inetaddr_notifier(&ibdev->iboe.nb_inet))
-			pr_warn("failure unregistering notifier\n");
-		ibdev->iboe.nb_inet.notifier_call = NULL;
-	}
-#if IS_ENABLED(CONFIG_IPV6)
-	if (ibdev->iboe.nb_inet6.notifier_call) {
-		if (unregister_inet6addr_notifier(&ibdev->iboe.nb_inet6))
-			pr_warn("failure unregistering notifier\n");
-		ibdev->iboe.nb_inet6.notifier_call = NULL;
-	}
-#endif
 	flush_workqueue(wq);
 
 	mlx4_ib_close_sriov(ibdev);
@@ -2532,19 +2078,6 @@ static void mlx4_ib_remove(struct mlx4_dev *dev, void *ibdev_ptr)
 		kfree(ibdev->ib_uc_qpns_bitmap);
 	}
 
-	if (ibdev->iboe.nb_inet.notifier_call) {
-		if (unregister_inetaddr_notifier(&ibdev->iboe.nb_inet))
-			pr_warn("failure unregistering notifier\n");
-		ibdev->iboe.nb_inet.notifier_call = NULL;
-	}
-#if IS_ENABLED(CONFIG_IPV6)
-	if (ibdev->iboe.nb_inet6.notifier_call) {
-		if (unregister_inet6addr_notifier(&ibdev->iboe.nb_inet6))
-			pr_warn("failure unregistering notifier\n");
-		ibdev->iboe.nb_inet6.notifier_call = NULL;
-	}
-#endif
-
 	iounmap(ibdev->uar_map);
 	for (p = 0; p < ibdev->num_ports; ++p)
 		if (ibdev->counters[p] != -1)
diff --git a/drivers/infiniband/hw/mlx4/mlx4_ib.h b/drivers/infiniband/hw/mlx4/mlx4_ib.h
index 42fe035..e3805a4 100644
--- a/drivers/infiniband/hw/mlx4/mlx4_ib.h
+++ b/drivers/infiniband/hw/mlx4/mlx4_ib.h
@@ -457,12 +457,8 @@ struct mlx4_ib_sriov {
 struct mlx4_ib_iboe {
 	spinlock_t		lock;
 	struct net_device      *netdevs[MLX4_MAX_PORTS];
-	struct net_device      *masters[MLX4_MAX_PORTS];
 	atomic64_t		mac[MLX4_MAX_PORTS];
 	struct notifier_block 	nb;
-	struct notifier_block	nb_inet;
-	struct notifier_block	nb_inet6;
-	union ib_gid		gid_table[MLX4_MAX_PORTS][128];
 };
 
 struct pkey_mgt {
-- 
2.1.0


* [PATCH v3 for-next 21/33] IB/mlx4: Replace spin_lock with rw_semaphore
       [not found] ` <1427318422-12004-1-git-send-email-somnath.kotur-laKkSmNT4hbQT0dZR+AlfA@public.gmane.org>
                     ` (19 preceding siblings ...)
  2015-03-25 21:20   ` [PATCH v3 for-next 20/33] IB/mlx4: Remove gid table management for RoCE Somnath Kotur
@ 2015-03-25 21:20   ` Somnath Kotur
  2015-03-25 21:20   ` [PATCH v3 for-next 22/33] IB/mlx4: Lock with RCU instead of RTNL Somnath Kotur
                     ` (11 subsequent siblings)
  32 siblings, 0 replies; 82+ messages in thread
From: Somnath Kotur @ 2015-03-25 21:20 UTC (permalink / raw)
  To: roland-DgEjT+Ai2ygdnm+yROfE0A
  Cc: linux-rdma-u79uwXL29TY76Z2rM5mHXA, Moni Shoua, Somnath Kotur

From: Moni Shoua <monis-VPRAkNaXOzVWk0Htik3J/w@public.gmane.org>

Protection of iboe->netdevs no longer needs to come from an atomic
context, so replacing the spin_lock with an rw_semaphore is allowed and
makes more sense: readers may sleep and do not serialize against each
other.
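
The resulting reader pattern, sketched (this mirrors mlx4_ib_add_mc()
below); lookups take the semaphore shared, the netdev rescan takes it
exclusively:

	down_read(&mdev->iboe.sem);
	ndev = mdev->iboe.netdevs[mqp->port - 1];
	if (ndev)
		dev_hold(ndev);	/* keep the netdev alive past the unlock */
	up_read(&mdev->iboe.sem);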

Signed-off-by: Moni Shoua <monis-VPRAkNaXOzVWk0Htik3J/w@public.gmane.org>
Signed-off-by: Somnath Kotur <somnath.kotur-laKkSmNT4hbQT0dZR+AlfA@public.gmane.org>
---
 drivers/infiniband/hw/mlx4/main.c    | 27 ++++++++++-----------------
 drivers/infiniband/hw/mlx4/mlx4_ib.h |  2 +-
 2 files changed, 11 insertions(+), 18 deletions(-)

diff --git a/drivers/infiniband/hw/mlx4/main.c b/drivers/infiniband/hw/mlx4/main.c
index 91caffc..d8b227e 100644
--- a/drivers/infiniband/hw/mlx4/main.c
+++ b/drivers/infiniband/hw/mlx4/main.c
@@ -369,7 +369,7 @@ static int eth_link_query_port(struct ib_device *ibdev, u8 port,
 	props->active_mtu	= IB_MTU_256;
 	if (is_bonded)
 		rtnl_lock(); /* required to get upper dev */
-	spin_lock_bh(&iboe->lock);
+	down_read(&iboe->sem);
 	ndev = iboe->netdevs[port - 1];
 	if (ndev && is_bonded)
 		ndev = netdev_master_upper_dev_get(ndev);
@@ -383,7 +383,7 @@ static int eth_link_query_port(struct ib_device *ibdev, u8 port,
 					IB_PORT_ACTIVE : IB_PORT_DOWN;
 	props->phys_state	= state_to_phys_state(props->state);
 out_unlock:
-	spin_unlock_bh(&iboe->lock);
+	up_read(&iboe->sem);
 	if (is_bonded)
 		rtnl_unlock();
 out:
@@ -825,11 +825,11 @@ int mlx4_ib_add_mc(struct mlx4_ib_dev *mdev, struct mlx4_ib_qp *mqp,
 	if (!mqp->port)
 		return 0;
 
-	spin_lock_bh(&mdev->iboe.lock);
+	down_read(&mdev->iboe.sem);
 	ndev = mdev->iboe.netdevs[mqp->port - 1];
 	if (ndev)
 		dev_hold(ndev);
-	spin_unlock_bh(&mdev->iboe.lock);
+	up_read(&mdev->iboe.sem);
 
 	if (ndev) {
 		ret = 1;
@@ -1330,7 +1330,6 @@ static int mlx4_ib_mcg_detach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid)
 	struct mlx4_ib_dev *mdev = to_mdev(ibqp->device);
 	struct mlx4_dev *dev = mdev->dev;
 	struct mlx4_ib_qp *mqp = to_mqp(ibqp);
-	struct net_device *ndev;
 	struct mlx4_ib_gid_entry *ge;
 	enum mlx4_protocol prot =  MLX4_PROT_IB_IPV6;
 	struct mlx4_flow_reg_id reg_id = {0, 0};
@@ -1370,13 +1369,6 @@ static int mlx4_ib_mcg_detach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid)
 	mutex_lock(&mqp->mutex);
 	ge = find_gid_entry(mqp, gid->raw);
 	if (ge) {
-		spin_lock_bh(&mdev->iboe.lock);
-		ndev = ge->added ? mdev->iboe.netdevs[ge->port - 1] : NULL;
-		if (ndev)
-			dev_hold(ndev);
-		spin_unlock_bh(&mdev->iboe.lock);
-		if (ndev)
-			dev_put(ndev);
 		list_del(&ge->list);
 		kfree(ge);
 	} else
@@ -1543,7 +1535,7 @@ static void mlx4_ib_scan_netdevs(struct mlx4_ib_dev *ibdev,
 
 	iboe = &ibdev->iboe;
 
-	spin_lock_bh(&iboe->lock);
+	down_write(&iboe->sem);
 	mlx4_foreach_ib_transport_port(port, ibdev->dev) {
 
 		iboe->netdevs[port - 1] =
@@ -1555,7 +1547,7 @@ static void mlx4_ib_scan_netdevs(struct mlx4_ib_dev *ibdev,
 			update_qps_port = port;
 
 	}
-	spin_unlock_bh(&iboe->lock);
+	up_write(&iboe->sem);
 
 	if (update_qps_port > 0)
 		mlx4_ib_update_qps(ibdev, dev, update_qps_port);
@@ -1848,7 +1840,7 @@ static void *mlx4_ib_add(struct mlx4_dev *dev)
 
 	mlx4_ib_alloc_eqs(dev, ibdev);
 
-	spin_lock_init(&iboe->lock);
+	init_rwsem(&iboe->sem);
 
 	if (init_node_data(ibdev))
 		goto err_map;
@@ -2153,7 +2145,8 @@ static void handle_bonded_port_state_event(struct work_struct *work)
 	struct ib_event ibev;
 
 	kfree(ew);
-	spin_lock_bh(&ibdev->iboe.lock);
+
+	down_read(&ibdev->iboe.sem);
 	for (i = 0; i < MLX4_MAX_PORTS; ++i) {
 		struct net_device *curr_netdev = ibdev->iboe.netdevs[i];
 
@@ -2165,7 +2158,7 @@ static void handle_bonded_port_state_event(struct work_struct *work)
 		bonded_port_state = (bonded_port_state != IB_PORT_ACTIVE) ?
 			curr_port_state : IB_PORT_ACTIVE;
 	}
-	spin_unlock_bh(&ibdev->iboe.lock);
+	up_read(&ibdev->iboe.sem);
 
 	ibev.device = &ibdev->ib_dev;
 	ibev.element.port_num = 1;
diff --git a/drivers/infiniband/hw/mlx4/mlx4_ib.h b/drivers/infiniband/hw/mlx4/mlx4_ib.h
index e3805a4..166ebf9 100644
--- a/drivers/infiniband/hw/mlx4/mlx4_ib.h
+++ b/drivers/infiniband/hw/mlx4/mlx4_ib.h
@@ -455,7 +455,7 @@ struct mlx4_ib_sriov {
 };
 
 struct mlx4_ib_iboe {
-	spinlock_t		lock;
+	struct rw_semaphore	sem; /* guard from concurrent access to data in this struct */
 	struct net_device      *netdevs[MLX4_MAX_PORTS];
 	atomic64_t		mac[MLX4_MAX_PORTS];
 	struct notifier_block 	nb;
-- 
2.1.0


* [PATCH v3 for-next 22/33] IB/mlx4: Lock with RCU instead of RTNL
       [not found] ` <1427318422-12004-1-git-send-email-somnath.kotur-laKkSmNT4hbQT0dZR+AlfA@public.gmane.org>
                     ` (20 preceding siblings ...)
  2015-03-25 21:20   ` [PATCH v3 for-next 21/33] IB/mlx4: Replace spin_lock with rw_semaphore Somnath Kotur
@ 2015-03-25 21:20   ` Somnath Kotur
  2015-03-25 21:20   ` [PATCH v3 for-next 23/33] net/mlx4: Postpone the registration of net_device Somnath Kotur
                     ` (10 subsequent siblings)
  32 siblings, 0 replies; 82+ messages in thread
From: Somnath Kotur @ 2015-03-25 21:20 UTC (permalink / raw)
  To: roland-DgEjT+Ai2ygdnm+yROfE0A
  Cc: linux-rdma-u79uwXL29TY76Z2rM5mHXA, Moni Shoua, Somnath Kotur

From: Moni Shoua <monis-VPRAkNaXOzVWk0Htik3J/w@public.gmane.org>

The function eth_link_query_port() used to take the RTNL lock when a
call to netdev_master_upper_dev_get() was necessary. This made it
impossible to call the function while the RTNL lock was already held.
Calling netdev_master_upper_dev_get_rcu() under an RCU read-side lock
instead solves this problem.
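
The two locking contracts, sketched with the standard netdev helpers;
in both cases the returned pointer is only stable while the respective
lock is held:

	ASSERT_RTNL();
	upper = netdev_master_upper_dev_get(ndev);	/* needs RTNL */

	rcu_read_lock();
	upper = netdev_master_upper_dev_get_rcu(ndev);	/* needs RCU read side */
	rcu_read_unlock();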

Signed-off-by: Moni Shoua <monis-VPRAkNaXOzVWk0Htik3J/w@public.gmane.org>
Signed-off-by: Somnath Kotur <somnath.kotur-laKkSmNT4hbQT0dZR+AlfA@public.gmane.org>
---
 drivers/infiniband/hw/mlx4/main.c | 15 +++++++--------
 1 file changed, 7 insertions(+), 8 deletions(-)

diff --git a/drivers/infiniband/hw/mlx4/main.c b/drivers/infiniband/hw/mlx4/main.c
index d8b227e..32cd009 100644
--- a/drivers/infiniband/hw/mlx4/main.c
+++ b/drivers/infiniband/hw/mlx4/main.c
@@ -367,14 +367,15 @@ static int eth_link_query_port(struct ib_device *ibdev, u8 port,
 	props->state		= IB_PORT_DOWN;
 	props->phys_state	= state_to_phys_state(props->state);
 	props->active_mtu	= IB_MTU_256;
-	if (is_bonded)
-		rtnl_lock(); /* required to get upper dev */
 	down_read(&iboe->sem);
 	ndev = iboe->netdevs[port - 1];
-	if (ndev && is_bonded)
-		ndev = netdev_master_upper_dev_get(ndev);
+	if (ndev && is_bonded) {
+		rcu_read_lock(); /* required to get upper dev */
+		ndev = netdev_master_upper_dev_get_rcu(ndev);
+		rcu_read_unlock();
+	}
 	if (!ndev)
-		goto out_unlock;
+		goto unlock;
 
 	tmp = iboe_get_mtu(ndev->mtu);
 	props->active_mtu = tmp ? min(props->max_mtu, tmp) : IB_MTU_256;
@@ -382,10 +383,8 @@ static int eth_link_query_port(struct ib_device *ibdev, u8 port,
 	props->state		= (netif_running(ndev) && netif_carrier_ok(ndev)) ?
 					IB_PORT_ACTIVE : IB_PORT_DOWN;
 	props->phys_state	= state_to_phys_state(props->state);
-out_unlock:
+unlock:
 	up_read(&iboe->sem);
-	if (is_bonded)
-		rtnl_unlock();
 out:
 	mlx4_free_cmd_mailbox(mdev->dev, mailbox);
 	return err;
-- 
2.1.0


* [PATCH v3 for-next 23/33] net/mlx4: Postpone the registration of net_device
       [not found] ` <1427318422-12004-1-git-send-email-somnath.kotur-laKkSmNT4hbQT0dZR+AlfA@public.gmane.org>
                     ` (21 preceding siblings ...)
  2015-03-25 21:20   ` [PATCH v3 for-next 22/33] IB/mlx4: Lock with RCU instead of RTNL Somnath Kotur
@ 2015-03-25 21:20   ` Somnath Kotur
  2015-03-25 21:20   ` [PATCH v3 for-next 24/33] IB/mlx4: Advertise RoCE support in port capabilities Somnath Kotur
                     ` (9 subsequent siblings)
  32 siblings, 0 replies; 82+ messages in thread
From: Somnath Kotur @ 2015-03-25 21:20 UTC (permalink / raw)
  To: roland-DgEjT+Ai2ygdnm+yROfE0A
  Cc: linux-rdma-u79uwXL29TY76Z2rM5mHXA, Moni Shoua, Somnath Kotur

From: Moni Shoua <monis-VPRAkNaXOzVWk0Htik3J/w@public.gmane.org>

The mlx4 network driver used to register its netdevs in the context of
the core driver's 'add' function (called when the HW should be
registered). This caused the NETDEV_REGISTER netdev event to be sent in
a context where the get_protocol_dev() callback still returns NULL,
which may confuse listeners of netdev events.
This patch is a preparation for the patch that implements the
get_netdev() callback in the IB/mlx4 driver.
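
The resulting call order in the core driver, sketched from the intf.c
hunk below (error handling elided):

	context = intf->add(&priv->dev);	/* no netdevs registered yet */
	if (context) {
		/* from here on get_protocol_dev() can answer */
		list_add_tail(&dev_ctx->list, &priv->ctx_list);
		if (intf->activate)
			intf->activate(&priv->dev, context); /* netdevs register here */
	}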

Signed-off-by: Moni Shoua <monis-VPRAkNaXOzVWk0Htik3J/w@public.gmane.org>
Signed-off-by: Somnath Kotur <somnath.kotur-laKkSmNT4hbQT0dZR+AlfA@public.gmane.org>
---
 drivers/net/ethernet/mellanox/mlx4/en_main.c | 36 ++++++++++++++++------------
 drivers/net/ethernet/mellanox/mlx4/intf.c    |  3 +++
 include/linux/mlx4/driver.h                  |  1 +
 3 files changed, 25 insertions(+), 15 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx4/en_main.c b/drivers/net/ethernet/mellanox/mlx4/en_main.c
index 2859ac6..64b4f8d2 100644
--- a/drivers/net/ethernet/mellanox/mlx4/en_main.c
+++ b/drivers/net/ethernet/mellanox/mlx4/en_main.c
@@ -219,6 +219,26 @@ static void mlx4_en_remove(struct mlx4_dev *dev, void *endev_ptr)
 	kfree(mdev);
 }
 
+static void mlx4_en_activate(struct mlx4_dev *dev, void *ctx)
+{
+	int i;
+	struct mlx4_en_dev *mdev = ctx;
+
+	/* Create a netdev for each port */
+	mlx4_foreach_port(i, dev, MLX4_PORT_TYPE_ETH) {
+		mlx4_info(mdev, "Activating port:%d\n", i);
+		if (mlx4_en_init_netdev(mdev, i, &mdev->profile.prof[i]))
+			mdev->pndev[i] = NULL;
+	}
+
+	/* register notifier */
+	mdev->nb.notifier_call = mlx4_en_netdev_event;
+	if (register_netdevice_notifier(&mdev->nb)) {
+		mdev->nb.notifier_call = NULL;
+		mlx4_err(mdev, "Failed to create notifier\n");
+	}
+}
+
 static void *mlx4_en_add(struct mlx4_dev *dev)
 {
 	struct mlx4_en_dev *mdev;
@@ -292,21 +312,6 @@ static void *mlx4_en_add(struct mlx4_dev *dev)
 	mutex_init(&mdev->state_lock);
 	mdev->device_up = true;
 
-	/* Setup ports */
-
-	/* Create a netdev for each port */
-	mlx4_foreach_port(i, dev, MLX4_PORT_TYPE_ETH) {
-		mlx4_info(mdev, "Activating port:%d\n", i);
-		if (mlx4_en_init_netdev(mdev, i, &mdev->profile.prof[i]))
-			mdev->pndev[i] = NULL;
-	}
-	/* register notifier */
-	mdev->nb.notifier_call = mlx4_en_netdev_event;
-	if (register_netdevice_notifier(&mdev->nb)) {
-		mdev->nb.notifier_call = NULL;
-		mlx4_err(mdev, "Failed to create notifier\n");
-	}
-
 	return mdev;
 
 err_mr:
@@ -330,6 +335,7 @@ static struct mlx4_interface mlx4_en_interface = {
 	.event		= mlx4_en_event,
 	.get_dev	= mlx4_en_get_netdev,
 	.protocol	= MLX4_PROT_ETH,
+	.activate	= mlx4_en_activate,
 };
 
 static void mlx4_en_verify_params(void)
diff --git a/drivers/net/ethernet/mellanox/mlx4/intf.c b/drivers/net/ethernet/mellanox/mlx4/intf.c
index a1a5985..ccd4030 100644
--- a/drivers/net/ethernet/mellanox/mlx4/intf.c
+++ b/drivers/net/ethernet/mellanox/mlx4/intf.c
@@ -63,8 +63,11 @@ static void mlx4_add_device(struct mlx4_interface *intf, struct mlx4_priv *priv)
 		spin_lock_irq(&priv->ctx_lock);
 		list_add_tail(&dev_ctx->list, &priv->ctx_list);
 		spin_unlock_irq(&priv->ctx_lock);
+		if (intf->activate)
+			intf->activate(&priv->dev, dev_ctx->context);
 	} else
 		kfree(dev_ctx);
+
 }
 
 static void mlx4_remove_device(struct mlx4_interface *intf, struct mlx4_priv *priv)
diff --git a/include/linux/mlx4/driver.h b/include/linux/mlx4/driver.h
index 9553a73..5a06d96 100644
--- a/include/linux/mlx4/driver.h
+++ b/include/linux/mlx4/driver.h
@@ -59,6 +59,7 @@ struct mlx4_interface {
 	void			(*event) (struct mlx4_dev *dev, void *context,
 					  enum mlx4_dev_event event, unsigned long param);
 	void *			(*get_dev)(struct mlx4_dev *dev, void *context, u8 port);
+	void			(*activate)(struct mlx4_dev *dev, void *context);
 	struct list_head	list;
 	enum mlx4_protocol	protocol;
 	int			flags;
-- 
2.1.0


* [PATCH v3 for-next 24/33] IB/mlx4: Advertise RoCE support in port capabilities
       [not found] ` <1427318422-12004-1-git-send-email-somnath.kotur-laKkSmNT4hbQT0dZR+AlfA@public.gmane.org>
                     ` (22 preceding siblings ...)
  2015-03-25 21:20   ` [PATCH v3 for-next 23/33] net/mlx4: Postpone the registration of net_device Somnath Kotur
@ 2015-03-25 21:20   ` Somnath Kotur
  2015-03-25 21:20   ` [PATCH v3 for-next 25/33] IB/mlx4: Implement ib_device callback - get_netdev Somnath Kotur
                     ` (8 subsequent siblings)
  32 siblings, 0 replies; 82+ messages in thread
From: Somnath Kotur @ 2015-03-25 21:20 UTC (permalink / raw)
  To: roland-DgEjT+Ai2ygdnm+yROfE0A
  Cc: linux-rdma-u79uwXL29TY76Z2rM5mHXA, Moni Shoua, Somnath Kotur

From: Moni Shoua <monis-VPRAkNaXOzVWk0Htik3J/w@public.gmane.org>

The port capability flags should indicate which RoCE modes (V1 and/or
V2) the port supports. The mlx4 driver sets these flags according to
the capabilities reported by the HW.
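
A consumer-side sketch of the capability check, assuming the
IB_PORT_ROCE and IB_PORT_ROCE_V2 capability bits introduced earlier in
this series:

	struct ib_port_attr attr;

	if (!ib_query_port(ibdev, port, &attr)) {
		if (attr.port_cap_flags & IB_PORT_ROCE_V2)
			pr_info("port %d supports RoCE v2\n", port);
		else if (attr.port_cap_flags & IB_PORT_ROCE)
			pr_info("port %d supports RoCE v1 only\n", port);
	}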

Signed-off-by: Moni Shoua <monis-VPRAkNaXOzVWk0Htik3J/w@public.gmane.org>
Signed-off-by: Somnath Kotur <somnath.kotur-laKkSmNT4hbQT0dZR+AlfA@public.gmane.org>
---
 drivers/infiniband/hw/mlx4/main.c         |  6 ++++++
 drivers/net/ethernet/mellanox/mlx4/fw.c   |  5 ++++-
 drivers/net/ethernet/mellanox/mlx4/main.c |  6 +++++-
 include/linux/mlx4/device.h               | 13 ++++++++++---
 4 files changed, 25 insertions(+), 5 deletions(-)

diff --git a/drivers/infiniband/hw/mlx4/main.c b/drivers/infiniband/hw/mlx4/main.c
index 32cd009..bf87a95 100644
--- a/drivers/infiniband/hw/mlx4/main.c
+++ b/drivers/infiniband/hw/mlx4/main.c
@@ -359,6 +359,12 @@ static int eth_link_query_port(struct ib_device *ibdev, u8 port,
 						IB_WIDTH_4X : IB_WIDTH_1X;
 	props->active_speed	= IB_SPEED_QDR;
 	props->port_cap_flags	= IB_PORT_CM_SUP | IB_PORT_IP_BASED_GIDS;
+
+	if (mdev->dev->caps.flags & MLX4_DEV_CAP_FLAG_IBOE)
+		props->port_cap_flags	|= IB_PORT_ROCE;
+	if (mdev->dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_ROCE_V1_V2)
+		props->port_cap_flags	|= IB_PORT_ROCE_V2 | IB_PORT_ROCE;
+
 	props->gid_tbl_len	= mdev->dev->caps.gid_table_len[port];
 	props->max_msg_sz	= mdev->dev->caps.max_msg_sz;
 	props->pkey_tbl_len	= 1;
diff --git a/drivers/net/ethernet/mellanox/mlx4/fw.c b/drivers/net/ethernet/mellanox/mlx4/fw.c
index 3702fd1..d573e73 100644
--- a/drivers/net/ethernet/mellanox/mlx4/fw.c
+++ b/drivers/net/ethernet/mellanox/mlx4/fw.c
@@ -146,7 +146,8 @@ static void dump_dev_cap_flags2(struct mlx4_dev *dev, u64 flags)
 		[17] = "Asymmetric EQs support",
 		[18] = "More than 80 VFs support",
 		[19] = "Performance optimized for limited rule configuration flow steering support",
-		[21] = "Port Remap support"
+		[21] = "Port Remap support",
+		[22] = "RoCEv2 support"
 	};
 	int i;
 
@@ -852,6 +853,8 @@ int mlx4_QUERY_DEV_CAP(struct mlx4_dev *dev, struct mlx4_dev_cap *dev_cap)
 		dev_cap->flags2 |= MLX4_DEV_CAP_FLAG2_EQE_STRIDE;
 	MLX4_GET(dev_cap->bmme_flags, outbox,
 		 QUERY_DEV_CAP_BMME_FLAGS_OFFSET);
+	if (dev_cap->bmme_flags & MLX4_FLAG_ROCE_V1_V2)
+		dev_cap->flags2 |= MLX4_DEV_CAP_FLAG2_ROCE_V1_V2;
 	if (dev_cap->bmme_flags & MLX4_FLAG_PORT_REMAP)
 		dev_cap->flags2 |= MLX4_DEV_CAP_FLAG2_PORT_REMAP;
 	MLX4_GET(field, outbox, QUERY_DEV_CAP_CONFIG_DEV_OFFSET);
diff --git a/drivers/net/ethernet/mellanox/mlx4/main.c b/drivers/net/ethernet/mellanox/mlx4/main.c
index 1893a57..29c60fd 100644
--- a/drivers/net/ethernet/mellanox/mlx4/main.c
+++ b/drivers/net/ethernet/mellanox/mlx4/main.c
@@ -386,8 +386,12 @@ static int mlx4_dev_cap(struct mlx4_dev *dev, struct mlx4_dev_cap *dev_cap)
 	if (mlx4_priv(dev)->pci_dev_data & MLX4_PCI_DEV_FORCE_SENSE_PORT)
 		dev->caps.flags |= MLX4_DEV_CAP_FLAG_SENSE_SUPPORT;
 	/* Don't do sense port on multifunction devices (for now at least) */
-	if (mlx4_is_mfunc(dev))
+	/* Don't do enable RoCE V2 on multifunction devices */
+	if (mlx4_is_mfunc(dev)) {
 		dev->caps.flags &= ~MLX4_DEV_CAP_FLAG_SENSE_SUPPORT;
+		dev_cap->flags2 &= ~MLX4_DEV_CAP_FLAG2_ROCE_V1_V2;
+		mlx4_dbg(dev, "RoCE V2 is not supported when SR-IOV is enabled\n");
+	}
 
 	if (mlx4_low_memory_profile()) {
 		dev->caps.log_num_macs  = MLX4_MIN_LOG_NUM_MAC;
diff --git a/include/linux/mlx4/device.h b/include/linux/mlx4/device.h
index 9a05e73..9bdf157 100644
--- a/include/linux/mlx4/device.h
+++ b/include/linux/mlx4/device.h
@@ -202,7 +202,8 @@ enum {
 	MLX4_DEV_CAP_FLAG2_SYS_EQS		= 1LL <<  17,
 	MLX4_DEV_CAP_FLAG2_80_VFS		= 1LL <<  18,
 	MLX4_DEV_CAP_FLAG2_FS_A0		= 1LL <<  19,
-	MLX4_DEV_CAP_FLAG2_PORT_REMAP		= 1LL <<  21
+	MLX4_DEV_CAP_FLAG2_PORT_REMAP		= 1LL <<  21,
+	MLX4_DEV_CAP_FLAG2_ROCE_V1_V2		= 1LL <<  22
 };
 
 enum {
@@ -250,6 +251,7 @@ enum {
 	MLX4_BMME_FLAG_TYPE_2_WIN	= 1 <<  9,
 	MLX4_BMME_FLAG_RESERVED_LKEY	= 1 << 10,
 	MLX4_BMME_FLAG_FAST_REG_WR	= 1 << 11,
+	MLX4_BMME_FLAG_ROCE_V1_V2	= 1 << 19,
 	MLX4_BMME_FLAG_PORT_REMAP	= 1 << 24,
 	MLX4_BMME_FLAG_VSD_INIT2RTR	= 1 << 28,
 };
@@ -258,6 +260,10 @@ enum {
 	MLX4_FLAG_PORT_REMAP		= MLX4_BMME_FLAG_PORT_REMAP
 };
 
+enum {
+	MLX4_FLAG_ROCE_V1_V2		= MLX4_BMME_FLAG_ROCE_V1_V2
+};
+
 enum mlx4_event {
 	MLX4_EVENT_TYPE_COMP		   = 0x00,
 	MLX4_EVENT_TYPE_PATH_MIG	   = 0x01,
@@ -888,9 +894,10 @@ struct mlx4_mad_ifc {
 		if (((dev)->caps.port_mask[port] != MLX4_PORT_TYPE_IB))
 
 #define mlx4_foreach_ib_transport_port(port, dev)                         \
-	for ((port) = 1; (port) <= (dev)->caps.num_ports; (port)++)	  \
+	for ((port) = 1; (port) <= (dev)->caps.num_ports; (port)++)       \
 		if (((dev)->caps.port_mask[port] == MLX4_PORT_TYPE_IB) || \
-			((dev)->caps.flags & MLX4_DEV_CAP_FLAG_IBOE))
+			((dev)->caps.flags & MLX4_DEV_CAP_FLAG_IBOE) || \
+			((dev)->caps.flags2 & MLX4_DEV_CAP_FLAG2_ROCE_V1_V2))
 
 #define MLX4_INVALID_SLAVE_ID	0xFF
 
-- 
2.1.0


* [PATCH v3 for-next 25/33] IB/mlx4: Implement ib_device callback - get_netdev
       [not found] ` <1427318422-12004-1-git-send-email-somnath.kotur-laKkSmNT4hbQT0dZR+AlfA@public.gmane.org>
                     ` (23 preceding siblings ...)
  2015-03-25 21:20   ` [PATCH v3 for-next 24/33] IB/mlx4: Advertise RoCE support in port capabilities Somnath Kotur
@ 2015-03-25 21:20   ` Somnath Kotur
  2015-03-25 21:20   ` [PATCH v3 for-next 26/33] IB/mlx4: Implement ib_device callback - modify_gid Somnath Kotur
                     ` (7 subsequent siblings)
  32 siblings, 0 replies; 82+ messages in thread
From: Somnath Kotur @ 2015-03-25 21:20 UTC (permalink / raw)
  To: roland-DgEjT+Ai2ygdnm+yROfE0A
  Cc: linux-rdma-u79uwXL29TY76Z2rM5mHXA, Moni Shoua, Somnath Kotur

From: Moni Shoua <monis-VPRAkNaXOzVWk0Htik3J/w@public.gmane.org>

This is a new callback that is required for RoCEv2 support.
In port aggregation mode the netdev of the active port must be
returned, so support in the mlx4 core driver for figuring out that
port's identity is required.
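
A minimal sketch of how core code might use this callback, assuming a
hypothetical helper that resolves a (device, port) pair to a netdev
(the sketch_* name is not part of this patch):

/* Hypothetical ib_core-side helper: the callback may return NULL,
 * e.g. when no Ethernet netdev is currently associated with the port.
 */
static struct net_device *sketch_get_port_netdev(struct ib_device *ib_dev,
						 u8 port)
{
	if (!ib_dev->get_netdev)
		return NULL;
	return ib_dev->get_netdev(ib_dev, port);
}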

Signed-off-by: Moni Shoua <monis-VPRAkNaXOzVWk0Htik3J/w@public.gmane.org>
Signed-off-by: Somnath Kotur <somnath.kotur-laKkSmNT4hbQT0dZR+AlfA@public.gmane.org>
---
 drivers/infiniband/hw/mlx4/main.c         | 29 +++++++++++++++++++++++++++++
 drivers/net/ethernet/mellanox/mlx4/main.c | 18 ++++++++++++++++++
 include/linux/mlx4/driver.h               |  1 +
 3 files changed, 48 insertions(+)

diff --git a/drivers/infiniband/hw/mlx4/main.c b/drivers/infiniband/hw/mlx4/main.c
index bf87a95..04e6603 100644
--- a/drivers/infiniband/hw/mlx4/main.c
+++ b/drivers/infiniband/hw/mlx4/main.c
@@ -47,6 +47,8 @@
 #include <rdma/ib_addr.h>
 #include <rdma/ib_cache.h>
 
+#include <net/bonding.h>
+
 #include <linux/mlx4/driver.h>
 #include <linux/mlx4/cmd.h>
 #include <linux/mlx4/qp.h>
@@ -1527,6 +1529,32 @@ unlock:
 	mutex_unlock(&ibdev->qp1_proxy_lock[port - 1]);
 }
 
+static struct net_device *mlx4_ib_get_netdev(struct ib_device *device, u8 port_num)
+{
+	struct mlx4_ib_dev *ibdev = to_mdev(device);
+
+	if (mlx4_is_bonded(ibdev->dev)) {
+		struct net_device *dev;
+		struct net_device *upper = NULL;
+
+		rcu_read_lock();
+
+		dev = mlx4_get_protocol_dev(ibdev->dev, MLX4_PROT_ETH, port_num);
+		if (dev)
+			upper = netdev_master_upper_dev_get_rcu(dev);
+		else
+			goto unlock;
+		if (upper)
+			dev = bond_option_active_slave_get_rcu(netdev_priv(upper));
+unlock:
+		rcu_read_unlock();
+
+		return dev;
+	}
+
+	return mlx4_get_protocol_dev(ibdev->dev, MLX4_PROT_ETH, port_num);
+}
+
 static void mlx4_ib_scan_netdevs(struct mlx4_ib_dev *ibdev,
 				 struct net_device *dev,
 				 unsigned long event)
@@ -1806,6 +1834,7 @@ static void *mlx4_ib_add(struct mlx4_dev *dev)
 	ibdev->ib_dev.attach_mcast	= mlx4_ib_mcg_attach;
 	ibdev->ib_dev.detach_mcast	= mlx4_ib_mcg_detach;
 	ibdev->ib_dev.process_mad	= mlx4_ib_process_mad;
+	ibdev->ib_dev.get_netdev	= mlx4_ib_get_netdev;
 
 	if (!mlx4_is_slave(ibdev->dev)) {
 		ibdev->ib_dev.alloc_fmr		= mlx4_ib_fmr_alloc;
diff --git a/drivers/net/ethernet/mellanox/mlx4/main.c b/drivers/net/ethernet/mellanox/mlx4/main.c
index 29c60fd..3f469d3 100644
--- a/drivers/net/ethernet/mellanox/mlx4/main.c
+++ b/drivers/net/ethernet/mellanox/mlx4/main.c
@@ -1241,6 +1241,24 @@ int mlx4_port_map_set(struct mlx4_dev *dev, struct mlx4_port_map *v2p)
 }
 EXPORT_SYMBOL_GPL(mlx4_port_map_set);
 
+int mlx4_port_map_get(struct mlx4_dev *dev, u8 vport, u8 *pport)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+
+	if (!pport)
+		return -EINVAL;
+	*pport = 0;
+
+	if (vport == 1)
+		*pport = priv->v2p.port1;
+	else if (vport == 2)
+		*pport = priv->v2p.port2;
+	if (!*pport)
+		return -EINVAL;
+	return 0;
+}
+EXPORT_SYMBOL_GPL(mlx4_port_map_get);
+
 static int mlx4_load_fw(struct mlx4_dev *dev)
 {
 	struct mlx4_priv *priv = mlx4_priv(dev);
diff --git a/include/linux/mlx4/driver.h b/include/linux/mlx4/driver.h
index 5a06d96..a992971 100644
--- a/include/linux/mlx4/driver.h
+++ b/include/linux/mlx4/driver.h
@@ -81,6 +81,7 @@ struct mlx4_port_map {
 };
 
 int mlx4_port_map_set(struct mlx4_dev *dev, struct mlx4_port_map *v2p);
+int mlx4_port_map_get(struct mlx4_dev *dev, u8 vport, u8 *pport);
 
 void *mlx4_get_protocol_dev(struct mlx4_dev *dev, enum mlx4_protocol proto, int port);
 
-- 
2.1.0


* [PATCH v3 for-next 26/33] IB/mlx4: Implement ib_device callback - modify_gid
       [not found] ` <1427318422-12004-1-git-send-email-somnath.kotur-laKkSmNT4hbQT0dZR+AlfA@public.gmane.org>
                     ` (24 preceding siblings ...)
  2015-03-25 21:20   ` [PATCH v3 for-next 25/33] IB/mlx4: Implement ib_device callback - get_netdev Somnath Kotur
@ 2015-03-25 21:20   ` Somnath Kotur
  2015-03-25 21:20   ` [PATCH v3 for-next 27/33] IB/mlx4: Configure device to work in RoCEv2 Somnath Kotur
                     ` (6 subsequent siblings)
  32 siblings, 0 replies; 82+ messages in thread
From: Somnath Kotur @ 2015-03-25 21:20 UTC (permalink / raw)
  To: roland-DgEjT+Ai2ygdnm+yROfE0A
  Cc: linux-rdma-u79uwXL29TY76Z2rM5mHXA, Moni Shoua, Somnath Kotur

From: Moni Shoua <monis-VPRAkNaXOzVWk0Htik3J/w@public.gmane.org>

This is a new callback that is required for RoCEv2 support.
In RoCE, the GID table is managed in the IB core driver. The role of
the mlx4 driver is to synchronize the HW with the entries in the GID
table. Since it is possible that the same GID value will appear more
than once in the GID table (though with different attributes), the
mlx4 driver must maintain a reference counting mechanism and populate
the HW with a single value per GID.
Since an index into the GID table is not necessarily the same as the
index of the matching entry in the HW GID table, a translation between
the indexes is required.
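
For illustration, a minimal sketch of this reference counting scheme,
simplified from the mlx4_ib_modify_gid() implementation below (the
sketch_* names and the flat table are assumptions for brevity):

/* A GID already present with the same type only gets its refcount
 * bumped; the HW is rewritten only when a new slot is consumed.
 */
struct sketch_gid_slot {
	union ib_gid		gid;
	enum ib_gid_type	gid_type;
	int			refcount;	/* 0 means the slot is free */
};

static int sketch_add_gid(struct sketch_gid_slot *tbl, int len,
			  const union ib_gid *gid, enum ib_gid_type type,
			  bool *hw_update)
{
	int i, free = -1;

	for (i = 0; i < len; i++) {
		if (tbl[i].refcount && tbl[i].gid_type == type &&
		    !memcmp(&tbl[i].gid, gid, sizeof(*gid))) {
			tbl[i].refcount++;	/* duplicate: no HW write */
			return i;
		}
		if (free < 0 && !tbl[i].refcount)
			free = i;
	}
	if (free < 0)
		return -ENOSPC;			/* HW table exhausted */
	tbl[free].gid = *gid;
	tbl[free].gid_type = type;
	tbl[free].refcount = 1;
	*hw_update = true;			/* new entry: push table to HW */
	return free;				/* this is the "real" HW index */
}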

Signed-off-by: Moni Shoua <monis-VPRAkNaXOzVWk0Htik3J/w@public.gmane.org>
Signed-off-by: Somnath Kotur <somnath.kotur-laKkSmNT4hbQT0dZR+AlfA@public.gmane.org>
---
 drivers/infiniband/hw/mlx4/main.c    | 226 +++++++++++++++++++++++++++++++++++
 drivers/infiniband/hw/mlx4/mlx4_ib.h |  18 +++
 include/linux/mlx4/cmd.h             |   3 +-
 include/linux/mlx4/device.h          |   3 +-
 4 files changed, 248 insertions(+), 2 deletions(-)

diff --git a/drivers/infiniband/hw/mlx4/main.c b/drivers/infiniband/hw/mlx4/main.c
index 04e6603..96a6ec0 100644
--- a/drivers/infiniband/hw/mlx4/main.c
+++ b/drivers/infiniband/hw/mlx4/main.c
@@ -1555,6 +1555,230 @@ unlock:
 	return mlx4_get_protocol_dev(ibdev->dev, MLX4_PROT_ETH, port_num);
 }
 
+static int mlx4_ib_update_gids_v1(struct gid_entry *gids,
+				  struct mlx4_ib_dev *ibdev,
+				  u8 port_num)
+{
+	struct mlx4_cmd_mailbox *mailbox;
+	int err;
+	struct mlx4_dev *dev = ibdev->dev;
+	int i;
+	union ib_gid *gid_tbl;
+
+	mailbox = mlx4_alloc_cmd_mailbox(dev);
+	if (IS_ERR(mailbox))
+		return -ENOMEM;
+
+	gid_tbl = mailbox->buf;
+
+	for (i = 0; i < MLX4_MAX_PORT_GIDS; ++i)
+		memcpy(&gid_tbl[i], &gids[i].gid, sizeof(union ib_gid));
+
+	err = mlx4_cmd(dev, mailbox->dma,
+		       MLX4_SET_PORT_GID_TABLE << 8 | port_num,
+		       1, MLX4_CMD_SET_PORT, MLX4_CMD_TIME_CLASS_B,
+		       MLX4_CMD_WRAPPED);
+	if (mlx4_is_bonded(dev))
+		err += mlx4_cmd(dev, mailbox->dma,
+				MLX4_SET_PORT_GID_TABLE << 8 | 2,
+				1, MLX4_CMD_SET_PORT, MLX4_CMD_TIME_CLASS_B,
+				MLX4_CMD_WRAPPED);
+
+	mlx4_free_cmd_mailbox(dev, mailbox);
+	return err;
+}
+
+static int mlx4_ib_update_gids_v1_v2(struct gid_entry *gids,
+				     struct mlx4_ib_dev *ibdev,
+				     u8 port_num)
+{
+	struct mlx4_cmd_mailbox *mailbox;
+	int err;
+	struct mlx4_dev *dev = ibdev->dev;
+	int i;
+	struct {
+		union ib_gid	gid;
+		__be32		rsrvd1[2];
+		__be16		rsrvd2;
+		u8		type;
+		u8		version;
+		__be32		rsrvd3;
+	} *gid_tbl;
+
+	mailbox = mlx4_alloc_cmd_mailbox(dev);
+	if (IS_ERR(mailbox))
+		return -ENOMEM;
+
+	gid_tbl = mailbox->buf;
+	for (i = 0; i < MLX4_MAX_PORT_GIDS; ++i) {
+		memcpy(&gid_tbl[i].gid, &gids[i].gid, sizeof(union ib_gid));
+		if (gids[i].gid_type == IB_GID_TYPE_ROCE_V2) {
+			gid_tbl[i].version = 2;
+			if (!ipv6_addr_v4mapped((struct in6_addr *)&gids[i].gid))
+				gid_tbl[i].type = 1;
+		}
+	}
+
+	err = mlx4_cmd(dev, mailbox->dma,
+		       MLX4_SET_PORT_ROCE_ADDR << 8 | port_num,
+		       1, MLX4_CMD_SET_PORT, MLX4_CMD_TIME_CLASS_B,
+		       MLX4_CMD_WRAPPED);
+	if (mlx4_is_bonded(dev))
+		err += mlx4_cmd(dev, mailbox->dma,
+				MLX4_SET_PORT_ROCE_ADDR << 8 | 2,
+				1, MLX4_CMD_SET_PORT, MLX4_CMD_TIME_CLASS_B,
+				MLX4_CMD_WRAPPED);
+
+	mlx4_free_cmd_mailbox(dev, mailbox);
+	return err;
+}
+
+static int mlx4_ib_update_gids(struct gid_entry *gids,
+			       struct mlx4_ib_dev *ibdev,
+			       u8 port_num)
+{
+	if (ibdev->dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_ROCE_V1_V2)
+		return mlx4_ib_update_gids_v1_v2(gids, ibdev, port_num);
+
+	return mlx4_ib_update_gids_v1(gids, ibdev, port_num);
+}
+
+static int mlx4_ib_modify_gid(struct ib_device *device,
+			      u8 port_num, unsigned int index,
+			      const union ib_gid *gid,
+			      const struct ib_gid_attr *attr,
+			      void **context)
+{
+	struct mlx4_ib_dev *ibdev = to_mdev(device);
+	struct mlx4_ib_iboe *iboe = &ibdev->iboe;
+	struct mlx4_port_gid_table   *port_gid_table;
+	int free = -1, found = -1;
+	int ret = 0;
+	int clear = !memcmp(&zgid, gid, sizeof(*gid));
+	int hw_update = 0;
+	int i;
+	struct gid_entry *gids = NULL;
+
+	if (ib_cache_use_roce_gid_cache(device, port_num))
+		return -EINVAL;
+
+	if (port_num > MLX4_MAX_PORTS)
+		return -EINVAL;
+
+	if (!context)
+		return -EINVAL;
+
+	down_write(&iboe->sem);
+	port_gid_table = &iboe->gid_table[port_num - 1];
+
+	if (clear) {
+		struct gid_cache_context *ctx = *context;
+
+		if (ctx) {
+			ctx->refcount--;
+			if (!ctx->refcount) {
+				unsigned int index = ctx->real_index;
+
+				memcpy(&port_gid_table->gids[index].gid, &zgid, sizeof(*gid));
+				kfree(port_gid_table->gids[index].ctx);
+				port_gid_table->gids[index].ctx = NULL;
+				hw_update = 1;
+			}
+		}
+	} else {
+		for (i = 0; i < MLX4_MAX_PORT_GIDS; ++i) {
+			if (!memcmp(&port_gid_table->gids[i].gid, gid, sizeof(*gid))) {
+				found = (port_gid_table->gids[i].gid_type == attr->gid_type) ? i : -1;
+				if (found >= 0)
+					break;
+			}
+			if (free < 0 && !memcmp(&port_gid_table->gids[i].gid, &zgid, sizeof(*gid)))
+				free = i; /* HW has space */
+		}
+
+		if (found < 0) {
+			if (free < 0) {
+				ret = -ENOSPC;
+			} else {
+				port_gid_table->gids[free].ctx = kmalloc(sizeof(*port_gid_table->gids[free].ctx), GFP_KERNEL);
+				if (!port_gid_table->gids[free].ctx) {
+					ret = -ENOMEM;
+				} else {
+					*context = port_gid_table->gids[free].ctx;
+					memcpy(&port_gid_table->gids[free].gid, gid, sizeof(*gid));
+					port_gid_table->gids[free].gid_type = attr->gid_type;
+					port_gid_table->gids[free].ctx->real_index = free;
+					port_gid_table->gids[free].ctx->refcount = 1;
+					hw_update = 1;
+				}
+			}
+		} else {
+			struct gid_cache_context *ctx = port_gid_table->gids[found].ctx;
+			*context = ctx;
+			ctx->refcount++;
+		}
+	}
+	if (!ret && hw_update) {
+		gids = kmalloc(sizeof(*gids) * MLX4_MAX_PORT_GIDS, GFP_KERNEL);
+		if (!gids) {
+			ret = -ENOMEM;
+		} else {
+			for (i = 0; i < MLX4_MAX_PORT_GIDS; i++) {
+				memcpy(&gids[i].gid, &port_gid_table->gids[i].gid, sizeof(union ib_gid));
+				gids[i].gid_type = port_gid_table->gids[i].gid_type;
+			}
+		}
+	}
+	up_write(&iboe->sem);
+
+	if (!ret && hw_update) {
+		ret = mlx4_ib_update_gids(gids, ibdev, port_num);
+		kfree(gids);
+	}
+
+	return ret;
+}
+
+int mlx4_ib_gid_index_to_real_index(struct mlx4_ib_dev *ibdev,
+				    u8 port_num, int index)
+{
+	struct mlx4_ib_iboe *iboe = &ibdev->iboe;
+	struct gid_cache_context *ctx = NULL;
+	union ib_gid gid;
+	struct mlx4_port_gid_table   *port_gid_table;
+	int real_index = -EINVAL;
+	int i;
+	int ret;
+	struct ib_gid_attr attr;
+
+	if (port_num > MLX4_MAX_PORTS)
+		return -EINVAL;
+
+	if (ib_cache_use_roce_gid_cache(&ibdev->ib_dev, port_num))
+		return index;
+
+	ret = ib_get_cached_gid(&ibdev->ib_dev, port_num, index, &gid, &attr);
+	if (ret)
+		return ret;
+
+	if (!memcmp(&gid, &zgid, sizeof(gid)))
+		return -EINVAL;
+
+	down_read(&iboe->sem);
+	port_gid_table = &iboe->gid_table[port_num - 1];
+
+	for (i = 0; i < MLX4_MAX_PORT_GIDS; ++i)
+		if (!memcmp(&port_gid_table->gids[i].gid, &gid, sizeof(gid)) &&
+		    (attr.gid_type == port_gid_table->gids[i].gid_type)) {
+			ctx = port_gid_table->gids[i].ctx;
+			break;
+		}
+	if (ctx)
+		real_index = ctx->real_index;
+	up_read(&iboe->sem);
+	return real_index;
+}
+
 static void mlx4_ib_scan_netdevs(struct mlx4_ib_dev *ibdev,
 				 struct net_device *dev,
 				 unsigned long event)
@@ -1835,6 +2059,7 @@ static void *mlx4_ib_add(struct mlx4_dev *dev)
 	ibdev->ib_dev.detach_mcast	= mlx4_ib_mcg_detach;
 	ibdev->ib_dev.process_mad	= mlx4_ib_process_mad;
 	ibdev->ib_dev.get_netdev	= mlx4_ib_get_netdev;
+	ibdev->ib_dev.modify_gid	= mlx4_ib_modify_gid;
 
 	if (!mlx4_is_slave(ibdev->dev)) {
 		ibdev->ib_dev.alloc_fmr		= mlx4_ib_fmr_alloc;
@@ -1930,6 +2155,7 @@ static void *mlx4_ib_add(struct mlx4_dev *dev)
 			goto err_steer_free_bitmap;
 	}
 
+	memset(iboe->gid_table, 0, sizeof(struct mlx4_port_gid_table) * MLX4_MAX_PORTS);
 	for (j = 1; j <= ibdev->dev->caps.num_ports; j++)
 		atomic64_set(&iboe->mac[j - 1], ibdev->dev->caps.def_mac[j]);
 
diff --git a/drivers/infiniband/hw/mlx4/mlx4_ib.h b/drivers/infiniband/hw/mlx4/mlx4_ib.h
index 166ebf9..018bda6 100644
--- a/drivers/infiniband/hw/mlx4/mlx4_ib.h
+++ b/drivers/infiniband/hw/mlx4/mlx4_ib.h
@@ -454,11 +454,27 @@ struct mlx4_ib_sriov {
 	struct idr pv_id_table;
 };
 
+struct gid_cache_context {
+	int real_index;
+	int refcount;
+};
+
+struct gid_entry {
+	union ib_gid	gid;
+	enum ib_gid_type gid_type;
+	struct gid_cache_context *ctx;
+};
+
+struct mlx4_port_gid_table {
+	struct gid_entry gids[MLX4_MAX_PORT_GIDS];
+};
+
 struct mlx4_ib_iboe {
 	struct rw_semaphore	sem; /* guard from concurrent access to data in this struct */
 	struct net_device      *netdevs[MLX4_MAX_PORTS];
 	atomic64_t		mac[MLX4_MAX_PORTS];
 	struct notifier_block 	nb;
+	struct mlx4_port_gid_table gid_table[MLX4_MAX_PORTS];
 };
 
 struct pkey_mgt {
@@ -804,5 +820,7 @@ int mlx4_ib_rereg_user_mr(struct ib_mr *mr, int flags,
 			  u64 start, u64 length, u64 virt_addr,
 			  int mr_access_flags, struct ib_pd *pd,
 			  struct ib_udata *udata);
+int mlx4_ib_gid_index_to_real_index(struct mlx4_ib_dev *ibdev,
+				    u8 port_num, int index);
 
 #endif /* MLX4_IB_H */
diff --git a/include/linux/mlx4/cmd.h b/include/linux/mlx4/cmd.h
index d764350..8cec202 100644
--- a/include/linux/mlx4/cmd.h
+++ b/include/linux/mlx4/cmd.h
@@ -192,7 +192,8 @@ enum {
 	MLX4_SET_PORT_GID_TABLE = 0x5,
 	MLX4_SET_PORT_PRIO2TC	= 0x8,
 	MLX4_SET_PORT_SCHEDULER = 0x9,
-	MLX4_SET_PORT_VXLAN	= 0xB
+	MLX4_SET_PORT_VXLAN	= 0xB,
+	MLX4_SET_PORT_ROCE_ADDR	= 0xD
 };
 
 enum {
diff --git a/include/linux/mlx4/device.h b/include/linux/mlx4/device.h
index 9bdf157..dfc4a86 100644
--- a/include/linux/mlx4/device.h
+++ b/include/linux/mlx4/device.h
@@ -80,7 +80,8 @@ enum {
 
 enum {
 	MLX4_MAX_PORTS		= 2,
-	MLX4_MAX_PORT_PKEYS	= 128
+	MLX4_MAX_PORT_PKEYS	= 128,
+	MLX4_MAX_PORT_GIDS	= 128
 };
 
 /* base qkey for use in sriov tunnel-qp/proxy-qp communication.
-- 
2.1.0


* [PATCH v3 for-next 27/33] IB/mlx4: Configure device to work in RoCEv2
       [not found] ` <1427318422-12004-1-git-send-email-somnath.kotur-laKkSmNT4hbQT0dZR+AlfA@public.gmane.org>
                     ` (25 preceding siblings ...)
  2015-03-25 21:20   ` [PATCH v3 for-next 26/33] IB/mlx4: Implement ib_device callback - modify_gid Somnath Kotur
@ 2015-03-25 21:20   ` Somnath Kotur
  2015-03-25 21:20   ` [PATCH v3 for-next 28/33] IB/mlx4: Translate cache gid index to real index Somnath Kotur
                     ` (5 subsequent siblings)
  32 siblings, 0 replies; 82+ messages in thread
From: Somnath Kotur @ 2015-03-25 21:20 UTC (permalink / raw)
  To: roland-DgEjT+Ai2ygdnm+yROfE0A
  Cc: linux-rdma-u79uwXL29TY76Z2rM5mHXA, Moni Shoua, Somnath Kotur

From: Moni Shoua <monis-VPRAkNaXOzVWk0Htik3J/w@public.gmane.org>

Some mlx4 adapters are RoCEv2 capable. To enable this feature, some
hardware configuration is required, namely:

1. Set the port general parameters
2. Configure the outgoing UDP destination port
3. Configure the QPs that work with RoCEv2
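
For example, step 2 boils down to a single CONFIG_DEV call on the PF;
a minimal sketch using the helper exported below (the IANA-assigned
RoCEv2 UDP destination port is 4791):

	/* PF only: slaves must not issue CONFIG_DEV. */
	if (!mlx4_is_slave(dev) &&
	    (dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_ROCE_V1_V2)) {
		err = mlx4_config_roce_v2_port(dev, 4791 /* ROCE_V2_UDP_DPORT */);
		if (err)
			mlx4_err(dev, "failed to set RoCEv2 UDP port (%d)\n", err);
	}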

Signed-off-by: Moni Shoua <monis-VPRAkNaXOzVWk0Htik3J/w@public.gmane.org>
Signed-off-by: Somnath Kotur <somnath.kotur-laKkSmNT4hbQT0dZR+AlfA@public.gmane.org>
---
 drivers/infiniband/hw/mlx4/main.c         | 10 +++++++-
 drivers/infiniband/hw/mlx4/qp.c           | 40 +++++++++++++++++++++++++++----
 drivers/net/ethernet/mellanox/mlx4/fw.c   | 16 ++++++++++++-
 drivers/net/ethernet/mellanox/mlx4/mlx4.h |  3 ++-
 drivers/net/ethernet/mellanox/mlx4/port.c |  9 ++++++-
 drivers/net/ethernet/mellanox/mlx4/qp.c   | 27 +++++++++++++++++++++
 include/linux/mlx4/device.h               |  1 +
 include/linux/mlx4/qp.h                   | 15 ++++++++++--
 8 files changed, 111 insertions(+), 10 deletions(-)

diff --git a/drivers/infiniband/hw/mlx4/main.c b/drivers/infiniband/hw/mlx4/main.c
index 96a6ec0..ee99f62 100644
--- a/drivers/infiniband/hw/mlx4/main.c
+++ b/drivers/infiniband/hw/mlx4/main.c
@@ -2168,7 +2168,8 @@ static void *mlx4_ib_add(struct mlx4_dev *dev)
 	if (mlx4_ib_init_sriov(ibdev))
 		goto err_mad;
 
-	if (dev->caps.flags & MLX4_DEV_CAP_FLAG_IBOE) {
+	if (dev->caps.flags & MLX4_DEV_CAP_FLAG_IBOE ||
+	    dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_ROCE_V1_V2) {
 		if (!iboe->nb.notifier_call) {
 			iboe->nb.notifier_call = mlx4_ib_netdev_event;
 			err = register_netdevice_notifier(&iboe->nb);
@@ -2177,6 +2178,13 @@ static void *mlx4_ib_add(struct mlx4_dev *dev)
 				goto err_notif;
 			}
 		}
+		if (!mlx4_is_slave(dev) &&
+		    dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_ROCE_V1_V2) {
+			err = mlx4_config_roce_v2_port(dev, ROCE_V2_UDP_DPORT);
+			if (err) {
+				goto err_notif;
+			}
+		}
 	}
 
 	for (j = 0; j < ARRAY_SIZE(mlx4_class_attributes); ++j) {
diff --git a/drivers/infiniband/hw/mlx4/qp.c b/drivers/infiniband/hw/mlx4/qp.c
index 6f6d0db..847f9ec 100644
--- a/drivers/infiniband/hw/mlx4/qp.c
+++ b/drivers/infiniband/hw/mlx4/qp.c
@@ -1408,6 +1408,24 @@ static int handle_eth_ud_smac_index(struct mlx4_ib_dev *dev,
 	return 0;
 }
 
+enum {
+	MLX4_QPC_ROCE_MODE_1 = 0,
+	MLX4_QPC_ROCE_MODE_2 = 2,
+	MLX4_QPC_ROCE_MODE_MAX = 0xff
+};
+
+static u8 gid_type_to_qpc(enum ib_gid_type gid_type)
+{
+	switch (gid_type) {
+	case IB_GID_TYPE_IB:
+		return MLX4_QPC_ROCE_MODE_1;
+	case IB_GID_TYPE_ROCE_V2:
+		return MLX4_QPC_ROCE_MODE_2;
+	default:
+		return MLX4_QPC_ROCE_MODE_MAX;
+	}
+}
+
 static int __mlx4_ib_modify_qp(struct ib_qp *ibqp,
 			       const struct ib_qp_attr *attr, int attr_mask,
 			       enum ib_qp_state cur_state, enum ib_qp_state new_state)
@@ -1531,12 +1549,14 @@ static int __mlx4_ib_modify_qp(struct ib_qp *ibqp,
 		u16 vlan = 0xffff;
 		u8 smac[ETH_ALEN];
 		int status = 0;
+		int is_eth = rdma_port_get_link_layer(&dev->ib_dev, qp->port) ==
+				IB_LINK_LAYER_ETHERNET;
 
-		if (rdma_port_get_link_layer(&dev->ib_dev, qp->port) ==
-		    IB_LINK_LAYER_ETHERNET &&
-		    attr->ah_attr.ah_flags & IB_AH_GRH) {
+		if (is_eth && attr->ah_attr.ah_flags & IB_AH_GRH) {
 			int index = attr->ah_attr.grh.sgid_index;
 
+			if (mlx4_is_bonded(dev->dev))
+				port_num  = 1;
 			rcu_read_lock();
 			status = ib_get_cached_gid(ibqp->device, port_num,
 						   index, &gid, &gid_attr);
@@ -1555,8 +1575,20 @@ static int __mlx4_ib_modify_qp(struct ib_qp *ibqp,
 				  port_num, vlan, smac))
 			goto out;
 
+		if (is_eth && gid_attr.gid_type == IB_GID_TYPE_ROCE_V2)
+			context->pri_path.hop_limit = IPV6_DEFAULT_HOPLIMIT;
+
 		optpar |= (MLX4_QP_OPTPAR_PRIMARY_ADDR_PATH |
 			   MLX4_QP_OPTPAR_SCHED_QUEUE);
+
+		if (is_eth && (cur_state == IB_QPS_INIT && new_state == IB_QPS_RTR)) {
+			u8 qpc_roce_mode = gid_type_to_qpc(gid_attr.gid_type);
+
+			if (qpc_roce_mode == MLX4_QPC_ROCE_MODE_MAX)
+				goto out;
+			context->rlkey_roce_mode |= (qpc_roce_mode << 6);
+		}
+
 	}
 
 	if (attr_mask & IB_QP_TIMEOUT) {
@@ -1728,7 +1760,7 @@ static int __mlx4_ib_modify_qp(struct ib_qp *ibqp,
 		sqd_event = 0;
 
 	if (!ibqp->uobject && cur_state == IB_QPS_RESET && new_state == IB_QPS_INIT)
-		context->rlkey |= (1 << 4);
+		context->rlkey_roce_mode |= (1 << 4);
 
 	/*
 	 * Before passing a kernel QP to the HW, make sure that the
diff --git a/drivers/net/ethernet/mellanox/mlx4/fw.c b/drivers/net/ethernet/mellanox/mlx4/fw.c
index d573e73..0086aab 100644
--- a/drivers/net/ethernet/mellanox/mlx4/fw.c
+++ b/drivers/net/ethernet/mellanox/mlx4/fw.c
@@ -2056,7 +2056,8 @@ struct mlx4_config_dev {
 	__be32	rsvd1[3];
 	__be16	vxlan_udp_dport;
 	__be16	rsvd2;
-	__be32	rsvd3;
+	__be16  roce_v2_entropy;
+	__be16  roce_v2_udp_dport;
 	__be32	roce_flags;
 	__be32	rsvd4[25];
 	__be16	rsvd5;
@@ -2065,6 +2066,7 @@ struct mlx4_config_dev {
 };
 
 #define MLX4_VXLAN_UDP_DPORT (1 << 0)
+#define MLX4_ROCE_V2_UDP_DPORT BIT(3)
 #define MLX4_DISABLE_RX_PORT BIT(18)
 
 static int mlx4_CONFIG_DEV_set(struct mlx4_dev *dev, struct mlx4_config_dev *config_dev)
@@ -2182,6 +2184,18 @@ int mlx4_disable_rx_port_check(struct mlx4_dev *dev, bool dis)
 	return mlx4_CONFIG_DEV_set(dev, &config_dev);
 }
 
+int mlx4_config_roce_v2_port(struct mlx4_dev *dev, u16 udp_port)
+{
+	struct mlx4_config_dev config_dev;
+
+	memset(&config_dev, 0, sizeof(config_dev));
+	config_dev.update_flags    = cpu_to_be32(MLX4_ROCE_V2_UDP_DPORT);
+	config_dev.roce_v2_udp_dport = cpu_to_be16(udp_port);
+
+	return mlx4_CONFIG_DEV_set(dev, &config_dev);
+}
+EXPORT_SYMBOL_GPL(mlx4_config_roce_v2_port);
+
 int mlx4_virt2phy_port_map(struct mlx4_dev *dev, u32 port1, u32 port2)
 {
 	struct mlx4_cmd_mailbox *mailbox;
diff --git a/drivers/net/ethernet/mellanox/mlx4/mlx4.h b/drivers/net/ethernet/mellanox/mlx4/mlx4.h
index beb1c08..f4160a8 100644
--- a/drivers/net/ethernet/mellanox/mlx4/mlx4.h
+++ b/drivers/net/ethernet/mellanox/mlx4/mlx4.h
@@ -763,7 +763,8 @@ enum {
 struct mlx4_set_port_general_context {
 	u8 reserved[3];
 	u8 flags;
-	u16 reserved2;
+	u8 roce_mode;
+	u8 rr_proto;
 	__be16 mtu;
 	u8 pptx;
 	u8 pfctx;
diff --git a/drivers/net/ethernet/mellanox/mlx4/port.c b/drivers/net/ethernet/mellanox/mlx4/port.c
index 30eb1ea..3757b98 100644
--- a/drivers/net/ethernet/mellanox/mlx4/port.c
+++ b/drivers/net/ethernet/mellanox/mlx4/port.c
@@ -955,6 +955,8 @@ int mlx4_SET_PORT(struct mlx4_dev *dev, u8 port, int pkey_tbl_sz)
 	return err;
 }
 
+#define SET_PORT_ROCE_2_FLAGS          0x10
+#define MLX4_SET_PORT_ROCE_V1_V2       0x2
 int mlx4_SET_PORT_general(struct mlx4_dev *dev, u8 port, int mtu,
 			  u8 pptx, u8 pfctx, u8 pprx, u8 pfcrx)
 {
@@ -973,7 +975,12 @@ int mlx4_SET_PORT_general(struct mlx4_dev *dev, u8 port, int mtu,
 	context->pfctx = pfctx;
 	context->pprx = (pprx * (!pfcrx)) << 7;
 	context->pfcrx = pfcrx;
-
+	if (dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_ROCE_V1_V2) {
+		context->flags |= SET_PORT_ROCE_2_FLAGS;
+		context->roce_mode |=
+			(MLX4_SET_PORT_ROCE_V1_V2 & 7)
+			<< 4;
+	}
 	in_mod = MLX4_SET_PORT_GENERAL << 8 | port;
 	err = mlx4_cmd(dev, mailbox->dma, in_mod, 1, MLX4_CMD_SET_PORT,
 		       MLX4_CMD_TIME_CLASS_B,  MLX4_CMD_WRAPPED);
diff --git a/drivers/net/ethernet/mellanox/mlx4/qp.c b/drivers/net/ethernet/mellanox/mlx4/qp.c
index 2bb8553..03917dd 100644
--- a/drivers/net/ethernet/mellanox/mlx4/qp.c
+++ b/drivers/net/ethernet/mellanox/mlx4/qp.c
@@ -167,6 +167,13 @@ static int __mlx4_qp_modify(struct mlx4_dev *dev, struct mlx4_mtt *mtt,
 		context->log_page_size   = mtt->page_shift - MLX4_ICM_PAGE_SHIFT;
 	}
 
+	if ((cur_state == MLX4_QP_STATE_RTR) &&
+	    (new_state == MLX4_QP_STATE_RTS) &&
+	    dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_ROCE_V1_V2 &&
+	    !mlx4_is_mfunc(dev)) {
+		context->roce_entropy = cpu_to_be16(mlx4_qp_roce_entropy(dev, qp->qpn));
+	}
+
 	*(__be32 *) mailbox->buf = cpu_to_be32(optpar);
 	memcpy(mailbox->buf + 8, context, sizeof *context);
 
@@ -898,3 +905,23 @@ int mlx4_qp_to_ready(struct mlx4_dev *dev, struct mlx4_mtt *mtt,
 	return 0;
 }
 EXPORT_SYMBOL_GPL(mlx4_qp_to_ready);
+
+u32 mlx4_qp_roce_entropy(struct mlx4_dev *dev, u32 qpn)
+{
+	struct mlx4_qp_context context;
+	struct mlx4_qp qp;
+	int err;
+
+	qp.qpn = qpn;
+	err = mlx4_qp_query(dev, &qp, &context);
+	if (!err) {
+		u32 dest_qpn = be32_to_cpu(context.remote_qpn) & 0xffffff;
+		u16 folded_dst = folded_qp(dest_qpn);
+		u16 folded_src = folded_qp(qpn);
+
+		return (dest_qpn != qpn) ? ((folded_dst ^ folded_src) | 0xC000) :
+			folded_src | 0xC000;
+	}
+	return 0xdead;
+}
+EXPORT_SYMBOL_GPL(mlx4_qp_roce_entropy);
diff --git a/include/linux/mlx4/device.h b/include/linux/mlx4/device.h
index dfc4a86..dd1488c 100644
--- a/include/linux/mlx4/device.h
+++ b/include/linux/mlx4/device.h
@@ -1354,6 +1354,7 @@ int mlx4_get_base_gid_ix(struct mlx4_dev *dev, int slave, int port);
 
 int mlx4_config_vxlan_port(struct mlx4_dev *dev, __be16 udp_port);
 int mlx4_disable_rx_port_check(struct mlx4_dev *dev, bool dis);
+int mlx4_config_roce_v2_port(struct mlx4_dev *dev, u16 udp_port);
 int mlx4_virt2phy_port_map(struct mlx4_dev *dev, u32 port1, u32 port2);
 int mlx4_vf_smi_enabled(struct mlx4_dev *dev, int slave, int port);
 int mlx4_vf_get_enable_smi_admin(struct mlx4_dev *dev, int slave, int port);
diff --git a/include/linux/mlx4/qp.h b/include/linux/mlx4/qp.h
index 2bbc62a..b7497b9 100644
--- a/include/linux/mlx4/qp.h
+++ b/include/linux/mlx4/qp.h
@@ -184,7 +184,7 @@ struct mlx4_qp_context {
 	u8			mtu_msgmax;
 	u8			rq_size_stride;
 	u8			sq_size_stride;
-	u8			rlkey;
+	u8			rlkey_roce_mode;
 	__be32			usr_page;
 	__be32			local_qpn;
 	__be32			remote_qpn;
@@ -194,7 +194,8 @@ struct mlx4_qp_context {
 	u32			reserved1;
 	__be32			next_send_psn;
 	__be32			cqn_send;
-	u32			reserved2[2];
+	__be16                  roce_entropy;
+	__be16                  reserved2[3];
 	__be32			last_acked_psn;
 	__be32			ssn;
 	__be32			params2;
@@ -462,4 +463,14 @@ static inline struct mlx4_qp *__mlx4_qp_lookup(struct mlx4_dev *dev, u32 qpn)
 
 void mlx4_qp_remove(struct mlx4_dev *dev, struct mlx4_qp *qp);
 
+static inline u16 folded_qp(u32 q)
+{
+	u16 res;
+
+	res = ((q & 0xff) ^ ((q & 0xff0000) >> 16)) | (q & 0xff00);
+	return res;
+}
+
+u32 mlx4_qp_roce_entropy(struct mlx4_dev *dev, u32 qpn);
+
 #endif /* MLX4_QP_H */
-- 
2.1.0


* [PATCH v3 for-next 28/33] IB/mlx4: Translate cache gid index to real index
       [not found] ` <1427318422-12004-1-git-send-email-somnath.kotur-laKkSmNT4hbQT0dZR+AlfA@public.gmane.org>
                     ` (26 preceding siblings ...)
  2015-03-25 21:20   ` [PATCH v3 for-next 27/33] IB/mlx4: Configure device to work in RoCEv2 Somnath Kotur
@ 2015-03-25 21:20   ` Somnath Kotur
  2015-03-25 21:20   ` [PATCH v3 for-next 29/33] net/mlx4_core: Add handling of R-RoCE over IPV4 in qp attach flow Somnath Kotur
                     ` (4 subsequent siblings)
  32 siblings, 0 replies; 82+ messages in thread
From: Somnath Kotur @ 2015-03-25 21:20 UTC (permalink / raw)
  To: roland-DgEjT+Ai2ygdnm+yROfE0A
  Cc: linux-rdma-u79uwXL29TY76Z2rM5mHXA, Moni Shoua, Somnath Kotur

From: Moni Shoua <monis-VPRAkNaXOzVWk0Htik3J/w@public.gmane.org>

When a QP is modified with a path, the given sgid_index is not
necessarily the index that the HW knows. This is due to optimizations
that can save space in the HW table. Therefore, a translation is
required.
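
For example (indexes are hypothetical): the core cache may hold the
same GID value at indexes 2 and 5 with different attributes, while the
HW stores it once, so both cache indexes translate to the same real
index. A minimal usage sketch; note that the negative-error check is
an extra precaution in this sketch, while the hunk below range-checks
the result against the port's GID table length instead:

	/* Resolve the core cache index to the HW index before
	 * building the address path.
	 */
	int real_sgid_index = mlx4_ib_gid_index_to_real_index(dev, port,
							      ah->grh.sgid_index);

	if (real_sgid_index < 0)
		return real_sgid_index;
	path->mgid_index = real_sgid_index;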

Signed-off-by: Moni Shoua <monis-VPRAkNaXOzVWk0Htik3J/w@public.gmane.org>
Signed-off-by: Somnath Kotur <somnath.kotur-laKkSmNT4hbQT0dZR+AlfA@public.gmane.org>
---
 drivers/infiniband/hw/mlx4/qp.c | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/drivers/infiniband/hw/mlx4/qp.c b/drivers/infiniband/hw/mlx4/qp.c
index 847f9ec..d7d7c5a 100644
--- a/drivers/infiniband/hw/mlx4/qp.c
+++ b/drivers/infiniband/hw/mlx4/qp.c
@@ -1256,14 +1256,18 @@ static int _mlx4_set_path(struct mlx4_ib_dev *dev, const struct ib_ah_attr *ah,
 		path->static_rate = 0;
 
 	if (ah->ah_flags & IB_AH_GRH) {
-		if (ah->grh.sgid_index >= dev->dev->caps.gid_table_len[port]) {
+		int real_sgid_index = mlx4_ib_gid_index_to_real_index(dev,
+								      port,
+								      ah->grh.sgid_index);
+
+		if (real_sgid_index >= dev->dev->caps.gid_table_len[port]) {
 			pr_err("sgid_index (%u) too large. max is %d\n",
-			       ah->grh.sgid_index, dev->dev->caps.gid_table_len[port] - 1);
+			       real_sgid_index, dev->dev->caps.gid_table_len[port] - 1);
 			return -1;
 		}
 
 		path->grh_mylmc |= 1 << 7;
-		path->mgid_index = ah->grh.sgid_index;
+		path->mgid_index = real_sgid_index;
 		path->hop_limit  = ah->grh.hop_limit;
 		path->tclass_flowlabel =
 			cpu_to_be32((ah->grh.traffic_class << 20) |
-- 
2.1.0


* [PATCH v3 for-next 29/33] net/mlx4_core: Add handling of R-RoCE over IPV4 in qp attach flow
       [not found] ` <1427318422-12004-1-git-send-email-somnath.kotur-laKkSmNT4hbQT0dZR+AlfA@public.gmane.org>
                     ` (27 preceding siblings ...)
  2015-03-25 21:20   ` [PATCH v3 for-next 28/33] IB/mlx4: Translate cache gid index to real index Somnath Kotur
@ 2015-03-25 21:20   ` Somnath Kotur
  2015-03-25 21:20   ` [PATCH v3 for-next 30/33] IB/core: Initialize UD header structure with IP and UDP headers Somnath Kotur
                     ` (3 subsequent siblings)
  32 siblings, 0 replies; 82+ messages in thread
From: Somnath Kotur @ 2015-03-25 21:20 UTC (permalink / raw)
  To: roland-DgEjT+Ai2ygdnm+yROfE0A
  Cc: linux-rdma-u79uwXL29TY76Z2rM5mHXA, Maor Gottlieb, Somnath Kotur

From: Maor Gottlieb <maorg-VPRAkNaXOzVWk0Htik3J/w@public.gmane.org>

When a QP is attached for Routable RoCE over IPv4, the IPv4 bit should
be enabled in the IB flow spec.
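
A minimal sketch of the resulting flow spec for an IPv4 destination,
matching the attach-flow hunk below (gid points at an IPv4-mapped GID,
whose last four bytes carry the address):

	struct mlx4_spec_list spec = { .id = MLX4_NET_TRANS_RULE_ID_IB };

	/* Match only the IPv4 address in bytes 12..15 of the GID. */
	memcpy(spec.ib.dst_gid + 12, gid + 12, 4);
	memset(spec.ib.dst_gid_msk + 12, 0xff, 4);
	spec.ib.roce_type = MLX4_FLOW_SPEC_IB_ROCE_TYPE_IPV4;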

Signed-off-by: Maor Gottlieb <maorg-VPRAkNaXOzVWk0Htik3J/w@public.gmane.org>
Signed-off-by: Somnath Kotur <somnath.kotur-laKkSmNT4hbQT0dZR+AlfA@public.gmane.org>
---
 drivers/net/ethernet/mellanox/mlx4/mcg.c | 14 ++++++++++++--
 include/linux/mlx4/device.h              |  6 ++++++
 2 files changed, 18 insertions(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx4/mcg.c b/drivers/net/ethernet/mellanox/mlx4/mcg.c
index a3867e7..cdf07b9 100644
--- a/drivers/net/ethernet/mellanox/mlx4/mcg.c
+++ b/drivers/net/ethernet/mellanox/mlx4/mcg.c
@@ -858,7 +858,9 @@ static int parse_trans_rule(struct mlx4_dev *dev, struct mlx4_spec_list *spec,
 		break;
 
 	case MLX4_NET_TRANS_RULE_ID_IB:
-		rule_hw->ib.l3_qpn = spec->ib.l3_qpn;
+		rule_hw->ib.l3_qpn = spec->ib.l3_qpn |
+			(spec->ib.roce_type == MLX4_FLOW_SPEC_IB_ROCE_TYPE_IPV4 ?
+			 0x80 : 0);
 		rule_hw->ib.qpn_mask = spec->ib.qpn_msk;
 		memcpy(&rule_hw->ib.dst_gid, &spec->ib.dst_gid, 16);
 		memcpy(&rule_hw->ib.dst_gid_msk, &spec->ib.dst_gid_msk, 16);
@@ -1377,10 +1379,18 @@ int mlx4_trans_to_dmfs_attach(struct mlx4_dev *dev, struct mlx4_qp *qp,
 			memcpy(spec.eth.dst_mac_msk, &mac_mask, ETH_ALEN);
 			break;
 
+		case MLX4_PROT_IB_IPV4:
+			spec.id = MLX4_NET_TRANS_RULE_ID_IB;
+			memcpy(spec.ib.dst_gid + 12, gid + 12, 4);
+			memset(spec.ib.dst_gid_msk + 12, 0xff, 4);
+			spec.ib.roce_type = MLX4_FLOW_SPEC_IB_ROCE_TYPE_IPV4;
+
+			break;
 		case MLX4_PROT_IB_IPV6:
 			spec.id = MLX4_NET_TRANS_RULE_ID_IB;
 			memcpy(spec.ib.dst_gid, gid, 16);
-			memset(&spec.ib.dst_gid_msk, 0xff, 16);
+			memset(spec.ib.dst_gid_msk, 0xff, 16);
+			spec.ib.roce_type = MLX4_FLOW_SPEC_IB_ROCE_TYPE_IPV6;
 			break;
 		default:
 			return -EINVAL;
diff --git a/include/linux/mlx4/device.h b/include/linux/mlx4/device.h
index dd1488c..58b0b8c 100644
--- a/include/linux/mlx4/device.h
+++ b/include/linux/mlx4/device.h
@@ -369,6 +369,11 @@ enum mlx4_protocol {
 	MLX4_PROT_FCOE
 };
 
+enum mlx4_flow_roce_type {
+	MLX4_FLOW_SPEC_IB_ROCE_TYPE_IPV6 = 0,
+	MLX4_FLOW_SPEC_IB_ROCE_TYPE_IPV4
+};
+
 enum {
 	MLX4_MTT_FLAG_PRESENT		= 1
 };
@@ -1096,6 +1101,7 @@ struct mlx4_spec_ipv4 {
 struct mlx4_spec_ib {
 	__be32  l3_qpn;
 	__be32	qpn_msk;
+	enum    mlx4_flow_roce_type roce_type;
 	u8	dst_gid[16];
 	u8	dst_gid_msk[16];
 };
-- 
2.1.0


* [PATCH v3 for-next 30/33] IB/core: Initialize UD header structure with IP and UDP headers
       [not found] ` <1427318422-12004-1-git-send-email-somnath.kotur-laKkSmNT4hbQT0dZR+AlfA@public.gmane.org>
                     ` (28 preceding siblings ...)
  2015-03-25 21:20   ` [PATCH v3 for-next 29/33] net/mlx4_core: Add handling of R-RoCE over IPV4 in qp attach flow Somnath Kotur
@ 2015-03-25 21:20   ` Somnath Kotur
  2015-03-25 21:20   ` [PATCH v3 for-next 31/33] IB/mlx4: Enable send of RoCE QP1 packets with IP/UDP headers Somnath Kotur
                     ` (2 subsequent siblings)
  32 siblings, 0 replies; 82+ messages in thread
From: Somnath Kotur @ 2015-03-25 21:20 UTC (permalink / raw)
  To: roland-DgEjT+Ai2ygdnm+yROfE0A
  Cc: linux-rdma-u79uwXL29TY76Z2rM5mHXA, Moni Shoua, Somnath Kotur

From: Moni Shoua <monis-VPRAkNaXOzVWk0Htik3J/w@public.gmane.org>

ib_ud_header_init() is used to format InfiniBand headers in a buffer,
up to (but not including) the BTH. For RoCEv2 it is required that this
function also be able to build IP and UDP headers.
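
A minimal usage sketch of the extended interface (payload_len and buf
are illustrative; the ip_version and udp_present arguments are the
ones added by this patch):

	struct ib_ud_header hdr;
	int err, len;

	/* RoCEv2 over IPv4: no LRH, Eth present, no VLAN, no GRH,
	 * ip_version = 4, UDP present, no immediate data.
	 */
	err = ib_ud_header_init(payload_len, 0, 1, 0, 0, 4, 1, 0, &hdr);
	if (err)
		return err;

	/* ... fill hdr.eth, the hdr.ip4 addresses, hdr.udp ports ... */
	hdr.ip4.check = ib_ud_ip4_csum(&hdr);	/* once ip4 fields are set */
	len = ib_ud_header_pack(&hdr, buf);	/* returns the packed length */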

Signed-off-by: Moni Shoua <monis-VPRAkNaXOzVWk0Htik3J/w@public.gmane.org>
Signed-off-by: Somnath Kotur <somnath.kotur-laKkSmNT4hbQT0dZR+AlfA@public.gmane.org>
---
 drivers/infiniband/core/ud_header.c    | 153 ++++++++++++++++++++++++++++++---
 drivers/infiniband/hw/mlx4/qp.c        |   7 +-
 drivers/infiniband/hw/mthca/mthca_qp.c |   2 +-
 include/rdma/ib_pack.h                 |  44 ++++++++--
 4 files changed, 186 insertions(+), 20 deletions(-)

diff --git a/drivers/infiniband/core/ud_header.c b/drivers/infiniband/core/ud_header.c
index 72feee6..a4d4072 100644
--- a/drivers/infiniband/core/ud_header.c
+++ b/drivers/infiniband/core/ud_header.c
@@ -35,6 +35,7 @@
 #include <linux/string.h>
 #include <linux/export.h>
 #include <linux/if_ether.h>
+#include <linux/ip.h>
 
 #include <rdma/ib_pack.h>
 
@@ -116,6 +117,68 @@ static const struct ib_field vlan_table[]  = {
 	  .size_bits    = 16 }
 };
 
+static const struct ib_field ip4_table[]  = {
+	{ STRUCT_FIELD(ip4, ver_len),
+	  .offset_words = 0,
+	  .offset_bits  = 0,
+	  .size_bits    = 8 },
+	{ STRUCT_FIELD(ip4, tos),
+	  .offset_words = 0,
+	  .offset_bits  = 8,
+	  .size_bits    = 8 },
+	{ STRUCT_FIELD(ip4, tot_len),
+	  .offset_words = 0,
+	  .offset_bits  = 16,
+	  .size_bits    = 16 },
+	{ STRUCT_FIELD(ip4, id),
+	  .offset_words = 1,
+	  .offset_bits  = 0,
+	  .size_bits    = 16 },
+	{ STRUCT_FIELD(ip4, frag_off),
+	  .offset_words = 1,
+	  .offset_bits  = 16,
+	  .size_bits    = 16 },
+	{ STRUCT_FIELD(ip4, ttl),
+	  .offset_words = 2,
+	  .offset_bits  = 0,
+	  .size_bits    = 8 },
+	{ STRUCT_FIELD(ip4, protocol),
+	  .offset_words = 2,
+	  .offset_bits  = 8,
+	  .size_bits    = 8 },
+	{ STRUCT_FIELD(ip4, check),
+	  .offset_words = 2,
+	  .offset_bits  = 16,
+	  .size_bits    = 16 },
+	{ STRUCT_FIELD(ip4, saddr),
+	  .offset_words = 3,
+	  .offset_bits  = 0,
+	  .size_bits    = 32 },
+	{ STRUCT_FIELD(ip4, daddr),
+	  .offset_words = 4,
+	  .offset_bits  = 0,
+	  .size_bits    = 32 }
+};
+
+static const struct ib_field udp_table[]  = {
+	{ STRUCT_FIELD(udp, sport),
+	  .offset_words = 0,
+	  .offset_bits  = 0,
+	  .size_bits    = 16 },
+	{ STRUCT_FIELD(udp, dport),
+	  .offset_words = 0,
+	  .offset_bits  = 16,
+	  .size_bits    = 16 },
+	{ STRUCT_FIELD(udp, length),
+	  .offset_words = 1,
+	  .offset_bits  = 0,
+	  .size_bits    = 16 },
+	{ STRUCT_FIELD(udp, csum),
+	  .offset_words = 1,
+	  .offset_bits  = 16,
+	  .size_bits    = 16 }
+};
+
 static const struct ib_field grh_table[]  = {
 	{ STRUCT_FIELD(grh, ip_version),
 	  .offset_words = 0,
@@ -213,6 +276,26 @@ static const struct ib_field deth_table[] = {
 	  .size_bits    = 24 }
 };
 
+__be16 ib_ud_ip4_csum(struct ib_ud_header *header)
+{
+	struct iphdr iph;
+
+	iph.ihl		= 5;
+	iph.version	= 4;
+	iph.tos		= header->ip4.tos;
+	iph.tot_len	= header->ip4.tot_len;
+	iph.id		= header->ip4.id;
+	iph.frag_off	= header->ip4.frag_off;
+	iph.ttl		= header->ip4.ttl;
+	iph.protocol	= header->ip4.protocol;
+	iph.check	= 0;
+	iph.saddr	= header->ip4.saddr;
+	iph.daddr	= header->ip4.daddr;
+
+	return ip_fast_csum((u8 *)&iph, iph.ihl);
+}
+EXPORT_SYMBOL(ib_ud_ip4_csum);
+
 /**
  * ib_ud_header_init - Initialize UD header structure
  * @payload_bytes:Length of packet payload
@@ -220,19 +303,35 @@ static const struct ib_field deth_table[] = {
  * @eth_present: specify if Eth header is present
  * @vlan_present: packet is tagged vlan
  * @grh_present:GRH flag (if non-zero, GRH will be included)
+ * @ip_version:IP version flag (if non-zero, IP header, V4 or V6, will be included)
+ * @udp_present:UDP flag (if non-zero, UDP header will be included)
  * @immediate_present: specify if immediate data is present
  * @header:Structure to initialize
  */
-void ib_ud_header_init(int     		    payload_bytes,
-		       int		    lrh_present,
-		       int		    eth_present,
-		       int		    vlan_present,
-		       int    		    grh_present,
-		       int		    immediate_present,
-		       struct ib_ud_header *header)
+int ib_ud_header_init(int     payload_bytes,
+		      int    lrh_present,
+		      int    eth_present,
+		      int    vlan_present,
+		      int    grh_present,
+		      int    ip_version,
+		      int    udp_present,
+		      int    immediate_present,
+		      struct ib_ud_header *header)
 {
+	int ipv4_present;
+	int ipv6_present;
+
+	grh_present = grh_present && !ip_version;
 	memset(header, 0, sizeof *header);
 
+	/*
+	 * UDP header without IP header doesn't make sense
+	 */
+	if (udp_present && ip_version != 4 && ip_version != 6)
+		return -EINVAL;
+
+	ipv4_present = (ip_version == 4);
+	ipv6_present = (ip_version == 6);
 	if (lrh_present) {
 		u16 packet_length;
 
@@ -252,7 +351,7 @@ void ib_ud_header_init(int     		    payload_bytes,
 	if (vlan_present)
 		header->eth.type = cpu_to_be16(ETH_P_8021Q);
 
-	if (grh_present) {
+	if (ipv6_present || grh_present) {
 		header->grh.ip_version      = 6;
 		header->grh.payload_length  =
 			cpu_to_be16((IB_BTH_BYTES     +
@@ -260,8 +359,29 @@ void ib_ud_header_init(int     		    payload_bytes,
 				     payload_bytes    +
 				     4                + /* ICRC     */
 				     3) & ~3);          /* round up */
-		header->grh.next_header     = 0x1b;
+		header->grh.next_header     = udp_present ? IPPROTO_UDP : 0x1b;
+	}
+
+	if (ipv4_present) {
+		int udp_bytes = udp_present ? IB_UDP_BYTES : 0;
+
+		header->ip4.ver_len = 0x45; /* version 4, 5 words */
+		header->ip4.tot_len =
+			cpu_to_be16(IB_IP4_BYTES   +
+				     udp_bytes     +
+				     IB_BTH_BYTES  +
+				     IB_DETH_BYTES +
+				     payload_bytes +
+				     4);     /* ICRC     */
+		header->ip4.protocol = IPPROTO_UDP;
 	}
+	if (udp_present && ip_version)
+		header->udp.length =
+			cpu_to_be16(IB_UDP_BYTES   +
+				     IB_BTH_BYTES  +
+				     IB_DETH_BYTES +
+				     payload_bytes +
+				     4);     /* ICRC     */
 
 	if (immediate_present)
 		header->bth.opcode           = IB_OPCODE_UD_SEND_ONLY_WITH_IMMEDIATE;
@@ -273,8 +393,11 @@ void ib_ud_header_init(int     		    payload_bytes,
 	header->lrh_present = lrh_present;
 	header->eth_present = eth_present;
 	header->vlan_present = vlan_present;
-	header->grh_present = grh_present;
+	header->grh_present = grh_present || ipv6_present;
+	header->ipv4_present = ipv4_present;
+	header->udp_present = udp_present;
 	header->immediate_present = immediate_present;
+	return 0;
 }
 EXPORT_SYMBOL(ib_ud_header_init);
 
@@ -311,6 +434,16 @@ int ib_ud_header_pack(struct ib_ud_header *header,
 			&header->grh, buf + len);
 		len += IB_GRH_BYTES;
 	}
+	if (header->ipv4_present) {
+		ib_pack(ip4_table, ARRAY_SIZE(ip4_table),
+			&header->ip4, buf + len);
+		len += IB_IP4_BYTES;
+	}
+	if (header->udp_present) {
+		ib_pack(udp_table, ARRAY_SIZE(udp_table),
+			&header->udp, buf + len);
+		len += IB_UDP_BYTES;
+	}
 
 	ib_pack(bth_table, ARRAY_SIZE(bth_table),
 		&header->bth, buf + len);
diff --git a/drivers/infiniband/hw/mlx4/qp.c b/drivers/infiniband/hw/mlx4/qp.c
index d7d7c5a..1141cf0 100644
--- a/drivers/infiniband/hw/mlx4/qp.c
+++ b/drivers/infiniband/hw/mlx4/qp.c
@@ -2085,7 +2085,7 @@ static int build_sriov_qp0_header(struct mlx4_ib_sqp *sqp,
 	if (sqp->qp.mlx4_ib_qp_type == MLX4_IB_QPT_PROXY_SMI_OWNER)
 		send_size += sizeof (struct mlx4_ib_tunnel_header);
 
-	ib_ud_header_init(send_size, 1, 0, 0, 0, 0, &sqp->ud_header);
+	ib_ud_header_init(send_size, 1, 0, 0, 0, 0, 0, 0, &sqp->ud_header);
 
 	if (sqp->qp.mlx4_ib_qp_type == MLX4_IB_QPT_PROXY_SMI_OWNER) {
 		sqp->ud_header.lrh.service_level =
@@ -2231,7 +2231,10 @@ static int build_mlx_header(struct mlx4_ib_sqp *sqp, struct ib_send_wr *wr,
 			is_vlan = 1;
 		}
 	}
-	ib_ud_header_init(send_size, !is_eth, is_eth, is_vlan, is_grh, 0, &sqp->ud_header);
+	err = ib_ud_header_init(send_size, !is_eth, is_eth, is_vlan, is_grh,
+				0, 0, 0, &sqp->ud_header);
+	if (err)
+		return err;
 
 	if (!is_eth) {
 		sqp->ud_header.lrh.service_level =
diff --git a/drivers/infiniband/hw/mthca/mthca_qp.c b/drivers/infiniband/hw/mthca/mthca_qp.c
index e354b2f..22a04fd 100644
--- a/drivers/infiniband/hw/mthca/mthca_qp.c
+++ b/drivers/infiniband/hw/mthca/mthca_qp.c
@@ -1485,7 +1485,7 @@ static int build_mlx_header(struct mthca_dev *dev, struct mthca_sqp *sqp,
 	u16 pkey;
 
 	ib_ud_header_init(256, /* assume a MAD */ 1, 0, 0,
-			  mthca_ah_grh_present(to_mah(wr->wr.ud.ah)), 0,
+			  mthca_ah_grh_present(to_mah(wr->wr.ud.ah)), 0, 0, 0,
 			  &sqp->ud_header);
 
 	err = mthca_read_ah(dev, to_mah(wr->wr.ud.ah), &sqp->ud_header);
diff --git a/include/rdma/ib_pack.h b/include/rdma/ib_pack.h
index b1f7592..acbc1af 100644
--- a/include/rdma/ib_pack.h
+++ b/include/rdma/ib_pack.h
@@ -41,6 +41,8 @@ enum {
 	IB_ETH_BYTES  = 14,
 	IB_VLAN_BYTES = 4,
 	IB_GRH_BYTES  = 40,
+	IB_IP4_BYTES  = 20,
+	IB_UDP_BYTES  = 8,
 	IB_BTH_BYTES  = 12,
 	IB_DETH_BYTES = 8
 };
@@ -221,6 +223,26 @@ struct ib_unpacked_eth {
 	__be16	type;
 };
 
+struct ib_unpacked_ip4 {
+	u8	ver_len;
+	u8	tos;
+	__be16	tot_len;
+	__be16	id;
+	__be16	frag_off;
+	u8	ttl;
+	u8	protocol;
+	__be16	check;
+	__be32	saddr;
+	__be32	daddr;
+};
+
+struct ib_unpacked_udp {
+	__be16	sport;
+	__be16	dport;
+	__be16	length;
+	__be16	csum;
+};
+
 struct ib_unpacked_vlan {
 	__be16  tag;
 	__be16  type;
@@ -235,6 +257,10 @@ struct ib_ud_header {
 	struct ib_unpacked_vlan vlan;
 	int			grh_present;
 	struct ib_unpacked_grh	grh;
+	int			ipv4_present;
+	struct ib_unpacked_ip4	ip4;
+	int			udp_present;
+	struct ib_unpacked_udp	udp;
 	struct ib_unpacked_bth	bth;
 	struct ib_unpacked_deth deth;
 	int			immediate_present;
@@ -251,13 +277,17 @@ void ib_unpack(const struct ib_field        *desc,
 	       void                         *buf,
 	       void                         *structure);
 
-void ib_ud_header_init(int		    payload_bytes,
-		       int		    lrh_present,
-		       int		    eth_present,
-		       int		    vlan_present,
-		       int		    grh_present,
-		       int		    immediate_present,
-		       struct ib_ud_header *header);
+__be16 ib_ud_ip4_csum(struct ib_ud_header *header);
+
+int ib_ud_header_init(int		    payload_bytes,
+		      int		    lrh_present,
+		      int		    eth_present,
+		      int		    vlan_present,
+		      int		    grh_present,
+		      int		    ip_version,
+		      int		    udp_present,
+		      int		    immediate_present,
+		      struct ib_ud_header *header);
 
 int ib_ud_header_pack(struct ib_ud_header *header,
 		      void                *buf);
-- 
2.1.0


* [PATCH v3 for-next 31/33] IB/mlx4: Enable send of RoCE QP1 packets with IP/UDP headers
       [not found] ` <1427318422-12004-1-git-send-email-somnath.kotur-laKkSmNT4hbQT0dZR+AlfA@public.gmane.org>
                     ` (29 preceding siblings ...)
  2015-03-25 21:20   ` [PATCH v3 for-next 30/33] IB/core: Initialize UD header structure with IP and UDP headers Somnath Kotur
@ 2015-03-25 21:20   ` Somnath Kotur
  2015-03-25 21:20   ` [PATCH v3 for-next 32/33] IB/mlx4: Create and use another QP1 for RoCEv2 Somnath Kotur
  2015-03-25 21:20   ` [PATCH v3 for-next 33/33] IB/cma: Join and leave multicast groups with IGMP Somnath Kotur
  32 siblings, 0 replies; 82+ messages in thread
From: Somnath Kotur @ 2015-03-25 21:20 UTC (permalink / raw)
  To: roland-DgEjT+Ai2ygdnm+yROfE0A
  Cc: linux-rdma-u79uwXL29TY76Z2rM5mHXA, Moni Shoua, Somnath Kotur

From: Moni Shoua <monis-VPRAkNaXOzVWk0Htik3J/w@public.gmane.org>

RoCEv2 packets are sent over IP/UDP protocols.
The mlx4 driver uses a type of RAW QP to send packets for QP1 and
therefore needs to build the network headers below the BTH in software.

This patch adds the option to build QP1 packets with IP and UDP headers
if RoCEv2 is requested.
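
The on-wire IP version is derived from the source GID, as in the hunk
below; a minimal sketch of that decision (gid_attr, sgid and the flags
come from the surrounding build_mlx_header() context):

	/* RoCEv2 encodes IPv4 addresses as IPv4-mapped IPv6 GIDs, so
	 * the mapping test selects the on-wire IP version.
	 */
	if (gid_attr.gid_type == IB_GID_TYPE_ROCE_V2) {
		is_udp = true;
		ip_version = ipv6_addr_v4mapped((struct in6_addr *)&sgid) ? 4 : 6;
		is_grh = false;		/* the IP header takes the GRH's place */
	}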

Signed-off-by: Moni Shoua <monis-VPRAkNaXOzVWk0Htik3J/w@public.gmane.org>
Signed-off-by: Somnath Kotur <somnath.kotur-laKkSmNT4hbQT0dZR+AlfA@public.gmane.org>
---
 drivers/infiniband/hw/mlx4/qp.c | 84 +++++++++++++++++++++++++----------------
 1 file changed, 52 insertions(+), 32 deletions(-)

diff --git a/drivers/infiniband/hw/mlx4/qp.c b/drivers/infiniband/hw/mlx4/qp.c
index 1141cf0..fb37415 100644
--- a/drivers/infiniband/hw/mlx4/qp.c
+++ b/drivers/infiniband/hw/mlx4/qp.c
@@ -32,6 +32,8 @@
  */
 
 #include <linux/log2.h>
+#include <linux/if_ether.h>
+#include <net/ip.h>
 #include <linux/slab.h>
 #include <linux/netdevice.h>
 
@@ -2169,16 +2171,7 @@ static int build_sriov_qp0_header(struct mlx4_ib_sqp *sqp,
 	return 0;
 }
 
-static void mlx4_u64_to_smac(u8 *dst_mac, u64 src_mac)
-{
-	int i;
-
-	for (i = ETH_ALEN; i; i--) {
-		dst_mac[i - 1] = src_mac & 0xff;
-		src_mac >>= 8;
-	}
-}
-
+#define MLX4_ROCEV2_QP1_SPORT 0xC000
 static int build_mlx_header(struct mlx4_ib_sqp *sqp, struct ib_send_wr *wr,
 			    void *wqe, unsigned *mlx_seg_len)
 {
@@ -2198,6 +2191,8 @@ static int build_mlx_header(struct mlx4_ib_sqp *sqp, struct ib_send_wr *wr,
 	bool is_eth;
 	bool is_vlan = false;
 	bool is_grh;
+	bool is_udp = false;
+	int ip_version = 0;
 
 	send_size = 0;
 	for (i = 0; i < wr->num_sge; ++i)
@@ -2206,6 +2201,8 @@ static int build_mlx_header(struct mlx4_ib_sqp *sqp, struct ib_send_wr *wr,
 	is_eth = rdma_port_get_link_layer(sqp->qp.ibqp.device, sqp->qp.port) == IB_LINK_LAYER_ETHERNET;
 	is_grh = mlx4_ib_ah_grh_present(ah);
 	if (is_eth) {
+		struct ib_gid_attr gid_attr;
+
 		if (mlx4_is_mfunc(to_mdev(ib_dev)->dev)) {
 			/* When multi-function is enabled, the ib_core gid
 			 * indexes don't necessarily match the hw ones, so
@@ -2216,23 +2213,31 @@ static int build_mlx_header(struct mlx4_ib_sqp *sqp, struct ib_send_wr *wr,
 			if (err)
 				return err;
 		} else  {
-			err = ib_get_cached_gid(ib_dev,
+			err = ib_get_cached_gid(sqp->qp.ibqp.device,
 						be32_to_cpu(ah->av.ib.port_pd) >> 24,
-						ah->av.ib.gid_index, &sgid,
-						NULL);
+						ah->av.ib.gid_index, &sgid, &gid_attr);
 			if (!err && !memcmp(&sgid, &zgid, sizeof(sgid)))
 				err = -ENOENT;
-			if (err)
+			if (!err) {
+				is_udp = (gid_attr.gid_type == IB_GID_TYPE_ROCE_V2) ? true : false;
+				if (is_udp) {
+					if (ipv6_addr_v4mapped((struct in6_addr *)&sgid))
+						ip_version = 4;
+					else
+						ip_version = 6;
+					is_grh = false;
+				}
+			} else {
 				return err;
+			}
 		}
-
 		if (ah->av.eth.vlan != cpu_to_be16(0xffff)) {
 			vlan = be16_to_cpu(ah->av.eth.vlan) & 0x0fff;
 			is_vlan = 1;
 		}
 	}
 	err = ib_ud_header_init(send_size, !is_eth, is_eth, is_vlan, is_grh,
-				0, 0, 0, &sqp->ud_header);
+			  ip_version, is_udp, 0, &sqp->ud_header);
 	if (err)
 		return err;
 
@@ -2243,12 +2248,14 @@ static int build_mlx_header(struct mlx4_ib_sqp *sqp, struct ib_send_wr *wr,
 		sqp->ud_header.lrh.source_lid = cpu_to_be16(ah->av.ib.g_slid & 0x7f);
 	}
 
-	if (is_grh) {
+	if (is_grh || (ip_version == 6)) {
 		sqp->ud_header.grh.traffic_class =
 			(be32_to_cpu(ah->av.ib.sl_tclass_flowlabel) >> 20) & 0xff;
 		sqp->ud_header.grh.flow_label    =
 			ah->av.ib.sl_tclass_flowlabel & cpu_to_be32(0xfffff);
-		sqp->ud_header.grh.hop_limit     = ah->av.ib.hop_limit;
+
+		sqp->ud_header.grh.hop_limit     = (is_udp) ?
+			IPV6_DEFAULT_HOPLIMIT : ah->av.ib.hop_limit;
 		if (is_eth)
 			memcpy(sqp->ud_header.grh.source_gid.raw, sgid.raw, 16);
 		else {
@@ -2272,6 +2279,26 @@ static int build_mlx_header(struct mlx4_ib_sqp *sqp, struct ib_send_wr *wr,
 		       ah->av.ib.dgid, 16);
 	}
 
+	if (ip_version == 4) {
+		sqp->ud_header.ip4.tos =
+			(be32_to_cpu(ah->av.ib.sl_tclass_flowlabel) >> 20) & 0xff;
+		sqp->ud_header.ip4.id = 0;
+		sqp->ud_header.ip4.frag_off = htons(IP_DF);
+		sqp->ud_header.ip4.ttl = (is_udp) ?
+			IPV6_DEFAULT_HOPLIMIT : ah->av.eth.hop_limit;
+
+		memcpy(&sqp->ud_header.ip4.saddr,
+		       sgid.raw + 12, 4);
+		memcpy(&sqp->ud_header.ip4.daddr, ah->av.ib.dgid + 12, 4);
+		sqp->ud_header.ip4.check = ib_ud_ip4_csum(&sqp->ud_header);
+	}
+
+	if (is_udp) {
+		sqp->ud_header.udp.dport = htons(ROCE_V2_UDP_DPORT);
+		sqp->ud_header.udp.sport = htons(MLX4_ROCEV2_QP1_SPORT);
+		sqp->ud_header.udp.csum = 0;
+	}
+
 	mlx->flags &= cpu_to_be32(MLX4_WQE_CTRL_CQ_UPDATE);
 
 	if (!is_eth) {
@@ -2300,34 +2327,27 @@ static int build_mlx_header(struct mlx4_ib_sqp *sqp, struct ib_send_wr *wr,
 
 	if (is_eth) {
 		struct in6_addr in6;
-
+		u16 ether_type;
 		u16 pcp = (be32_to_cpu(ah->av.ib.sl_tclass_flowlabel) >> 29) << 13;
 
+		ether_type = (!is_udp) ? MLX4_IB_IBOE_ETHERTYPE :
+			(ip_version == 4 ? ETH_P_IP : ETH_P_IPV6);
+
 		mlx->sched_prio = cpu_to_be16(pcp);
 
+		ether_addr_copy(sqp->ud_header.eth.smac_h, ah->av.eth.s_mac);
 		memcpy(sqp->ud_header.eth.dmac_h, ah->av.eth.mac, 6);
-		/* FIXME: cache smac value? */
 		memcpy(&ctrl->srcrb_flags16[0], ah->av.eth.mac, 2);
 		memcpy(&ctrl->imm, ah->av.eth.mac + 2, 4);
 		memcpy(&in6, sgid.raw, sizeof(in6));
 
-		if (!mlx4_is_mfunc(to_mdev(ib_dev)->dev)) {
-			u64 mac = atomic64_read(&to_mdev(ib_dev)->iboe.mac[sqp->qp.port - 1]);
-			u8 smac[ETH_ALEN];
-
-			mlx4_u64_to_smac(smac, mac);
-			memcpy(sqp->ud_header.eth.smac_h, smac, ETH_ALEN);
-		} else {
-			/* use the src mac of the tunnel */
-			memcpy(sqp->ud_header.eth.smac_h, ah->av.eth.s_mac, ETH_ALEN);
-		}
 
 		if (!memcmp(sqp->ud_header.eth.smac_h, sqp->ud_header.eth.dmac_h, 6))
 			mlx->flags |= cpu_to_be32(MLX4_WQE_CTRL_FORCE_LOOPBACK);
 		if (!is_vlan) {
-			sqp->ud_header.eth.type = cpu_to_be16(MLX4_IB_IBOE_ETHERTYPE);
+			sqp->ud_header.eth.type = cpu_to_be16(ether_type);
 		} else {
-			sqp->ud_header.vlan.type = cpu_to_be16(MLX4_IB_IBOE_ETHERTYPE);
+			sqp->ud_header.vlan.type = cpu_to_be16(ether_type);
 			sqp->ud_header.vlan.tag = cpu_to_be16(vlan | pcp);
 		}
 	} else {
-- 
2.1.0


* [PATCH v3 for-next 32/33] IB/mlx4: Create and use another QP1 for RoCEv2
       [not found] ` <1427318422-12004-1-git-send-email-somnath.kotur-laKkSmNT4hbQT0dZR+AlfA@public.gmane.org>
                     ` (30 preceding siblings ...)
  2015-03-25 21:20   ` [PATCH v3 for-next 31/33] IB/mlx4: Enable send of RoCE QP1 packets with IP/UDP headers Somnath Kotur
@ 2015-03-25 21:20   ` Somnath Kotur
  2015-03-25 21:20   ` [PATCH v3 for-next 33/33] IB/cma: Join and leave multicast groups with IGMP Somnath Kotur
  32 siblings, 0 replies; 82+ messages in thread
From: Somnath Kotur @ 2015-03-25 21:20 UTC (permalink / raw)
  To: roland-DgEjT+Ai2ygdnm+yROfE0A
  Cc: linux-rdma-u79uwXL29TY76Z2rM5mHXA, Moni Shoua, Somnath Kotur

From: Moni Shoua <monis-VPRAkNaXOzVWk0Htik3J/w@public.gmane.org>

The mlx4 driver uses a special QP to implement the GSI QP. This kind
of QP allows building the InfiniBand headers in SW and placing them
before the payload that comes in with the WR. The mlx4 HW builds the
packet, calculates the ICRC and puts it at the end of the payload. This
ICRC calculation, however, depends on the QP configuration, which is
determined when the QP is modified (roce_mode during INIT->RTR). On the
other hand, ICRC verification when a packet is received does not depend
on this configuration.
Therefore, using two GSI QPs for send (one for each RoCE version) and
one GSI QP for receive is required.
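
A minimal sketch of the resulting arrangement, as wired up in
mlx4_ib_create_qp() below (the shadow QP is created with the internal
MLX4_IB_QP_CREATE_ROCE_V2_GSI flag and is used only for sending):

	/* One QP receives and sends RoCEv1 GSI traffic ... */
	ibqp = _mlx4_ib_create_qp(pd, init_attr, udata);

	/* ... and a hidden second QP sends RoCEv2 GSI traffic. */
	init_attr->create_flags |= MLX4_IB_QP_CREATE_ROCE_V2_GSI;
	sqp->roce_v2_gsi = ib_create_qp(pd, init_attr);
	init_attr->create_flags &= ~MLX4_IB_QP_CREATE_ROCE_V2_GSI;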

Signed-off-by: Moni Shoua <monis-VPRAkNaXOzVWk0Htik3J/w@public.gmane.org>
Signed-off-by: Somnath Kotur <somnath.kotur-laKkSmNT4hbQT0dZR+AlfA@public.gmane.org>
---
 drivers/infiniband/hw/mlx4/mlx4_ib.h |   7 ++
 drivers/infiniband/hw/mlx4/qp.c      | 155 +++++++++++++++++++++++++++++++----
 2 files changed, 144 insertions(+), 18 deletions(-)

diff --git a/drivers/infiniband/hw/mlx4/mlx4_ib.h b/drivers/infiniband/hw/mlx4/mlx4_ib.h
index 018bda6..a853330 100644
--- a/drivers/infiniband/hw/mlx4/mlx4_ib.h
+++ b/drivers/infiniband/hw/mlx4/mlx4_ib.h
@@ -159,11 +159,18 @@ struct mlx4_ib_wq {
 	unsigned		tail;
 };
 
+enum {
+	MLX4_IB_QP_CREATE_ROCE_V2_GSI = IB_QP_CREATE_RESERVED_START
+};
+
 enum mlx4_ib_qp_flags {
 	MLX4_IB_QP_LSO = IB_QP_CREATE_IPOIB_UD_LSO,
 	MLX4_IB_QP_BLOCK_MULTICAST_LOOPBACK = IB_QP_CREATE_BLOCK_MULTICAST_LOOPBACK,
 	MLX4_IB_QP_NETIF = IB_QP_CREATE_NETIF_QP,
 	MLX4_IB_QP_CREATE_USE_GFP_NOIO = IB_QP_CREATE_USE_GFP_NOIO,
+
+	/* Mellanox specific flags start from IB_QP_CREATE_RESERVED_START */
+	MLX4_IB_ROCE_V2_GSI_QP = MLX4_IB_QP_CREATE_ROCE_V2_GSI,
 	MLX4_IB_SRIOV_TUNNEL_QP = 1 << 30,
 	MLX4_IB_SRIOV_SQP = 1 << 31,
 };
diff --git a/drivers/infiniband/hw/mlx4/qp.c b/drivers/infiniband/hw/mlx4/qp.c
index fb37415..b54f315 100644
--- a/drivers/infiniband/hw/mlx4/qp.c
+++ b/drivers/infiniband/hw/mlx4/qp.c
@@ -81,6 +81,7 @@ struct mlx4_ib_sqp {
 	u32			send_psn;
 	struct ib_ud_header	ud_header;
 	u8			header_buf[MLX4_IB_UD_HEADER_SIZE];
+	struct ib_qp		*roce_v2_gsi;
 };
 
 enum {
@@ -150,7 +151,10 @@ static int is_sqp(struct mlx4_ib_dev *dev, struct mlx4_ib_qp *qp)
 			}
 		}
 	}
-	return proxy_sqp;
+	if (proxy_sqp)
+		return 1;
+
+	return !!(qp->flags & MLX4_IB_ROCE_V2_GSI_QP);
 }
 
 /* used for INIT/CLOSE port logic */
@@ -672,6 +676,7 @@ static int create_qp_common(struct mlx4_ib_dev *dev, struct ib_pd *pd,
 			qp = &sqp->qp;
 			qp->pri.vid = 0xFFFF;
 			qp->alt.vid = 0xFFFF;
+			sqp->roce_v2_gsi = NULL;
 		} else {
 			qp = kzalloc(sizeof (struct mlx4_ib_qp), gfp);
 			if (!qp)
@@ -1029,9 +1034,17 @@ static void destroy_qp_common(struct mlx4_ib_dev *dev, struct mlx4_ib_qp *qp,
 	del_gid_entries(qp);
 }
 
-static u32 get_sqp_num(struct mlx4_ib_dev *dev, struct ib_qp_init_attr *attr)
+static int get_sqp_num(struct mlx4_ib_dev *dev, struct ib_qp_init_attr *attr)
 {
 	/* Native or PPF */
+	if ((!mlx4_is_mfunc(dev->dev) || mlx4_is_master(dev->dev)) &&
+	    attr->create_flags & MLX4_IB_QP_CREATE_ROCE_V2_GSI) {
+		int sqpn;
+		int res = mlx4_qp_reserve_range(dev->dev, 1, 1, &sqpn, 0);
+
+		return res ? -abs(res) : sqpn;
+	}
+
 	if (!mlx4_is_mfunc(dev->dev) ||
 	    (mlx4_is_master(dev->dev) &&
 	     attr->create_flags & MLX4_IB_SRIOV_SQP)) {
@@ -1039,6 +1052,7 @@ static u32 get_sqp_num(struct mlx4_ib_dev *dev, struct ib_qp_init_attr *attr)
 			(attr->qp_type == IB_QPT_SMI ? 0 : 2) +
 			attr->port_num - 1;
 	}
+
 	/* PF or VF -- creating proxies */
 	if (attr->qp_type == IB_QPT_SMI)
 		return dev->dev->caps.qp0_proxy[attr->port_num - 1];
@@ -1046,9 +1060,9 @@ static u32 get_sqp_num(struct mlx4_ib_dev *dev, struct ib_qp_init_attr *attr)
 		return dev->dev->caps.qp1_proxy[attr->port_num - 1];
 }
 
-struct ib_qp *mlx4_ib_create_qp(struct ib_pd *pd,
-				struct ib_qp_init_attr *init_attr,
-				struct ib_udata *udata)
+static struct ib_qp *_mlx4_ib_create_qp(struct ib_pd *pd,
+					struct ib_qp_init_attr *init_attr,
+					struct ib_udata *udata)
 {
 	struct mlx4_ib_qp *qp = NULL;
 	int err;
@@ -1066,6 +1080,7 @@ struct ib_qp *mlx4_ib_create_qp(struct ib_pd *pd,
 					MLX4_IB_SRIOV_TUNNEL_QP |
 					MLX4_IB_SRIOV_SQP |
 					MLX4_IB_QP_NETIF |
+					MLX4_IB_QP_CREATE_ROCE_V2_GSI |
 					MLX4_IB_QP_CREATE_USE_GFP_NOIO))
 		return ERR_PTR(-EINVAL);
 
@@ -1074,13 +1089,19 @@ struct ib_qp *mlx4_ib_create_qp(struct ib_pd *pd,
 			return ERR_PTR(-EINVAL);
 	}
 
-	if (init_attr->create_flags &&
-	    (udata ||
-	     ((init_attr->create_flags & ~(MLX4_IB_SRIOV_SQP | MLX4_IB_QP_CREATE_USE_GFP_NOIO)) &&
-	      init_attr->qp_type != IB_QPT_UD) ||
-	     ((init_attr->create_flags & MLX4_IB_SRIOV_SQP) &&
-	      init_attr->qp_type > IB_QPT_GSI)))
-		return ERR_PTR(-EINVAL);
+	if (init_attr->create_flags) {
+		/* userspace is not allowed to set create flags */
+		if (udata)
+			return ERR_PTR(-EINVAL);
+
+		if ((init_attr->create_flags & ~(MLX4_IB_SRIOV_SQP | MLX4_IB_QP_CREATE_USE_GFP_NOIO | MLX4_IB_QP_CREATE_ROCE_V2_GSI) &&
+		     init_attr->qp_type != IB_QPT_UD) ||
+		    (init_attr->create_flags & MLX4_IB_SRIOV_SQP &&
+		     init_attr->qp_type > IB_QPT_GSI) ||
+		    (init_attr->create_flags & MLX4_IB_QP_CREATE_ROCE_V2_GSI &&
+		     init_attr->qp_type != IB_QPT_GSI))
+			return ERR_PTR(-EINVAL);
+	}
 
 	switch (init_attr->qp_type) {
 	case IB_QPT_XRC_TGT:
@@ -1117,19 +1138,25 @@ struct ib_qp *mlx4_ib_create_qp(struct ib_pd *pd,
 	case IB_QPT_SMI:
 	case IB_QPT_GSI:
 	{
+		int sqpn;
+
 		/* Userspace is not allowed to create special QPs: */
 		if (udata)
 			return ERR_PTR(-EINVAL);
+		sqpn = get_sqp_num(to_mdev(pd->device), init_attr);
+
+		if (sqpn < 0)
+			return ERR_PTR(sqpn);
 
 		err = create_qp_common(to_mdev(pd->device), pd, init_attr, udata,
-				       get_sqp_num(to_mdev(pd->device), init_attr),
+				       sqpn,
 				       &qp, gfp);
 		if (err)
 			return ERR_PTR(err);
 
 		qp->port	= init_attr->port_num;
-		qp->ibqp.qp_num = init_attr->qp_type == IB_QPT_SMI ? 0 : 1;
-
+		qp->ibqp.qp_num = init_attr->qp_type == IB_QPT_SMI ? 0 :
+			init_attr->create_flags & MLX4_IB_QP_CREATE_ROCE_V2_GSI ? sqpn : 1;
 		break;
 	}
 	default:
@@ -1140,7 +1167,43 @@ struct ib_qp *mlx4_ib_create_qp(struct ib_pd *pd,
 	return &qp->ibqp;
 }
 
-int mlx4_ib_destroy_qp(struct ib_qp *qp)
+struct ib_qp *mlx4_ib_create_qp(struct ib_pd *pd,
+				struct ib_qp_init_attr *init_attr,
+				struct ib_udata *udata)
+{
+	struct ib_device *device = pd ? pd->device : init_attr->xrcd->device;
+	struct ib_qp *ibqp;
+	struct mlx4_ib_dev *dev = to_mdev(device);
+
+	ibqp = _mlx4_ib_create_qp(pd, init_attr, udata);
+
+	if (!IS_ERR_OR_NULL(ibqp) &&
+	    (init_attr->qp_type == IB_QPT_GSI) &&
+	    !(init_attr->create_flags & MLX4_IB_QP_CREATE_ROCE_V2_GSI)) {
+		struct mlx4_ib_sqp *sqp = to_msqp((to_mqp(ibqp)));
+		int is_eth = rdma_port_get_link_layer(pd->device, init_attr->port_num) ==
+			IB_LINK_LAYER_ETHERNET;
+
+		if (is_eth &&
+		    dev->dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_ROCE_V1_V2) {
+			init_attr->create_flags |= MLX4_IB_QP_CREATE_ROCE_V2_GSI;
+			sqp->roce_v2_gsi = ib_create_qp(pd, init_attr);
+
+			if (IS_ERR_OR_NULL(sqp->roce_v2_gsi)) {
+				pr_err("Failed to create GSI QP for RoCEv2 (%ld)\n", PTR_ERR(sqp->roce_v2_gsi));
+				sqp->roce_v2_gsi = NULL;
+			} else {
+				sqp = to_msqp(to_mqp(sqp->roce_v2_gsi));
+				sqp->qp.flags |= MLX4_IB_ROCE_V2_GSI_QP;
+			}
+
+			init_attr->create_flags &= ~MLX4_IB_QP_CREATE_ROCE_V2_GSI;
+		}
+	}
+	return ibqp;
+}
+
+static int _mlx4_ib_destroy_qp(struct ib_qp *qp)
 {
 	struct mlx4_ib_dev *dev = to_mdev(qp->device);
 	struct mlx4_ib_qp *mqp = to_mqp(qp);
@@ -1166,6 +1228,20 @@ int mlx4_ib_destroy_qp(struct ib_qp *qp)
 	return 0;
 }
 
+int mlx4_ib_destroy_qp(struct ib_qp *qp)
+{
+	struct mlx4_ib_qp *mqp = to_mqp(qp);
+
+	if (mqp->mlx4_ib_qp_type == MLX4_IB_QPT_GSI) {
+		struct mlx4_ib_sqp *sqp = to_msqp(mqp);
+
+		if (sqp->roce_v2_gsi)
+			ib_destroy_qp(sqp->roce_v2_gsi);
+	}
+
+	return _mlx4_ib_destroy_qp(qp);
+}
+
 static int to_mlx4_st(struct mlx4_ib_dev *dev, enum mlx4_ib_qp_type type)
 {
 	switch (type) {
@@ -1539,6 +1615,14 @@ static int __mlx4_ib_modify_qp(struct ib_qp *ibqp,
 			mlx4_ib_steer_qp_reg(dev, qp, 1);
 			steer_qp = 1;
 		}
+
+		if (ibqp->qp_type == IB_QPT_GSI) {
+			enum ib_gid_type gid_type = qp->flags & MLX4_IB_ROCE_V2_GSI_QP ?
+				IB_GID_TYPE_ROCE_V2 : IB_GID_TYPE_IB;
+			u8 qpc_roce_mode = gid_type_to_qpc(gid_type);
+
+			context->rlkey_roce_mode |= (qpc_roce_mode << 6);
+		}
 	}
 
 	if (attr_mask & IB_QP_PKEY_INDEX) {
@@ -1941,8 +2025,8 @@ out:
 	return err;
 }
 
-int mlx4_ib_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr,
-		      int attr_mask, struct ib_udata *udata)
+static int _mlx4_ib_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr,
+			      int attr_mask, struct ib_udata *udata)
 {
 	struct mlx4_ib_dev *dev = to_mdev(ibqp->device);
 	struct mlx4_ib_qp *qp = to_mqp(ibqp);
@@ -2045,6 +2129,26 @@ out:
 	return err;
 }
 
+int mlx4_ib_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr,
+		      int attr_mask, struct ib_udata *udata)
+{
+	struct mlx4_ib_qp *mqp = to_mqp(ibqp);
+	int ret;
+
+	ret = _mlx4_ib_modify_qp(ibqp, attr, attr_mask, udata);
+
+	if (mqp->mlx4_ib_qp_type == MLX4_IB_QPT_GSI) {
+		struct mlx4_ib_sqp *sqp = to_msqp(mqp);
+		int err = 0;
+
+		if (sqp->roce_v2_gsi)
+			err = ib_modify_qp(sqp->roce_v2_gsi, attr, attr_mask);
+		if (err)
+			pr_err("Failed to modify GSI QP for RoCEv2 (%d)\n", err);
+	}
+	return ret;
+}
+
 static int vf_get_qp0_qkey(struct mlx4_dev *dev, int qpn, u32 *qkey)
 {
 	int i;
@@ -2709,6 +2812,22 @@ int mlx4_ib_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr,
 	__be32 blh;
 	int i;
 
+	if (qp->mlx4_ib_qp_type == MLX4_IB_QPT_GSI) {
+		struct mlx4_ib_sqp *sqp = to_msqp(qp);
+
+		if (sqp->roce_v2_gsi) {
+			struct mlx4_ib_ah *ah = to_mah(wr->wr.ud.ah);
+			struct ib_gid_attr gid_attr;
+			union ib_gid gid;
+
+			if (!ib_get_cached_gid(ibqp->device,
+					       be32_to_cpu(ah->av.ib.port_pd) >> 24,
+					       ah->av.ib.gid_index, &gid, &gid_attr))
+				qp = (gid_attr.gid_type == IB_GID_TYPE_ROCE_V2) ?
+					to_mqp(sqp->roce_v2_gsi) : qp;
+		}
+	}
+
 	spin_lock_irqsave(&qp->sq.lock, flags);
 
 	ind = qp->sq_next_wqe;
-- 
2.1.0

--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply related	[flat|nested] 82+ messages in thread

* [PATCH v3 for-next 33/33] IB/cma: Join and leave multicast groups with IGMP
       [not found] ` <1427318422-12004-1-git-send-email-somnath.kotur-laKkSmNT4hbQT0dZR+AlfA@public.gmane.org>
                     ` (31 preceding siblings ...)
  2015-03-25 21:20   ` [PATCH v3 for-next 32/33] IB/mlx4: Create and use another QP1 for RoCEv2 Somnath Kotur
@ 2015-03-25 21:20   ` Somnath Kotur
  32 siblings, 0 replies; 82+ messages in thread
From: Somnath Kotur @ 2015-03-25 21:20 UTC (permalink / raw)
  To: roland-DgEjT+Ai2ygdnm+yROfE0A
  Cc: linux-rdma-u79uwXL29TY76Z2rM5mHXA, Moni Shoua, Somnath Kotur

From: Moni Shoua <monis-VPRAkNaXOzVWk0Htik3J/w@public.gmane.org>

Since RoCEv2 is a protocol that runs over an IP header, IGMP join and
leave requests must be sent to the network when joining and leaving
multicast groups.

Signed-off-by: Moni Shoua <monis-VPRAkNaXOzVWk0Htik3J/w@public.gmane.org>
Signed-off-by: Somnath Kotur <somnath.kotur-laKkSmNT4hbQT0dZR+AlfA@public.gmane.org>
---
 drivers/infiniband/core/cma.c       | 78 ++++++++++++++++++++++++++++++++++---
 drivers/infiniband/core/multicast.c | 18 ++++++++-
 include/rdma/ib_sa.h                |  3 ++
 3 files changed, 92 insertions(+), 7 deletions(-)

diff --git a/drivers/infiniband/core/cma.c b/drivers/infiniband/core/cma.c
index 6f345e2..8f997d7 100644
--- a/drivers/infiniband/core/cma.c
+++ b/drivers/infiniband/core/cma.c
@@ -38,6 +38,7 @@
 #include <linux/in6.h>
 #include <linux/mutex.h>
 #include <linux/random.h>
+#include <linux/igmp.h>
 #include <linux/idr.h>
 #include <linux/inetdevice.h>
 #include <linux/slab.h>
@@ -196,6 +197,7 @@ struct cma_multicast {
 	void			*context;
 	struct sockaddr_storage	addr;
 	struct kref		mcref;
+	bool			igmp_joined;
 };
 
 struct cma_work {
@@ -283,6 +285,26 @@ static inline void cma_set_ip_ver(struct cma_hdr *hdr, u8 ip_ver)
 	hdr->ip_version = (ip_ver << 4) | (hdr->ip_version & 0xF);
 }
 
+static int cma_igmp_send(struct net_device *ndev, union ib_gid *mgid, bool join)
+{
+	struct in_device *in_dev = NULL;
+
+	if (ndev) {
+		rtnl_lock();
+		in_dev = __in_dev_get_rtnl(ndev);
+		if (in_dev) {
+			if (join)
+				ip_mc_inc_group(in_dev,
+						*(__be32 *)(mgid->raw+12));
+			else
+				ip_mc_dec_group(in_dev,
+						*(__be32 *)(mgid->raw+12));
+		}
+		rtnl_unlock();
+	}
+	return (in_dev) ? 0 : -ENODEV;
+}
+
 static void cma_attach_to_dev(struct rdma_id_private *id_priv,
 			      struct cma_device *cma_dev)
 {
@@ -1076,6 +1098,20 @@ static void cma_leave_mc_groups(struct rdma_id_private *id_priv)
 			kfree(mc);
 			break;
 		case IB_LINK_LAYER_ETHERNET:
+			if (mc->igmp_joined) {
+				struct rdma_dev_addr *dev_addr = &id_priv->id.route.addr.dev_addr;
+				struct net_device *ndev = NULL;
+
+				if (dev_addr->bound_dev_if)
+					ndev = dev_get_by_index(&init_net,
+								dev_addr->bound_dev_if);
+				if (ndev) {
+					cma_igmp_send(ndev,
+						      &mc->multicast.ib->rec.mgid,
+						      false);
+					dev_put(ndev);
+				}
+			}
 			kref_put(&mc->mcref, release_mc);
 			break;
 		default:
@@ -3356,7 +3392,7 @@ static int cma_iboe_join_multicast(struct rdma_id_private *id_priv,
 {
 	struct iboe_mcast_work *work;
 	struct rdma_dev_addr *dev_addr = &id_priv->id.route.addr.dev_addr;
-	int err;
+	int err = 0;
 	struct sockaddr *addr = (struct sockaddr *)&mc->addr;
 	struct net_device *ndev = NULL;
 
@@ -3388,13 +3424,30 @@ static int cma_iboe_join_multicast(struct rdma_id_private *id_priv,
 	mc->multicast.ib->rec.rate = iboe_get_rate(ndev);
 	mc->multicast.ib->rec.hop_limit = 1;
 	mc->multicast.ib->rec.mtu = iboe_get_mtu(ndev->mtu);
+	mc->multicast.ib->rec.ifindex = dev_addr->bound_dev_if;
+	mc->multicast.ib->rec.net = &init_net;
+	rdma_ip2gid((struct sockaddr *)&id_priv->id.route.addr.src_addr,
+		    &mc->multicast.ib->rec.port_gid);
+
+	if (addr->sa_family == AF_INET) {
+		mc->multicast.ib->rec.gid_type =
+			id_priv->cma_dev->default_gid_type;
+		if (mc->multicast.ib->rec.gid_type == IB_GID_TYPE_ROCE_V2)
+			err = cma_igmp_send(ndev, &mc->multicast.ib->rec.mgid,
+					    true);
+		if (!err) {
+			mc->igmp_joined = true;
+			mc->multicast.ib->rec.hop_limit = IPV6_DEFAULT_HOPLIMIT;
+		}
+	} else {
+		mc->multicast.ib->rec.gid_type = IB_GID_TYPE_IB;
+	}
 	dev_put(ndev);
-	if (!mc->multicast.ib->rec.mtu) {
+	if (err || !mc->multicast.ib->rec.mtu) {
 		err = -EINVAL;
 		goto out2;
 	}
-	rdma_ip2gid((struct sockaddr *)&id_priv->id.route.addr.src_addr,
-		    &mc->multicast.ib->rec.port_gid);
+
 	work->id = id_priv;
 	work->mc = mc;
 	INIT_WORK(&work->work, iboe_mcast_work_handler);
@@ -3429,7 +3482,7 @@ int rdma_join_multicast(struct rdma_cm_id *id, struct sockaddr *addr,
 	memcpy(&mc->addr, addr, rdma_addr_size(addr));
 	mc->context = context;
 	mc->id_priv = id_priv;
-
+	mc->igmp_joined = false;
 	spin_lock(&id_priv->lock);
 	list_add(&mc->list, &id_priv->mc_list);
 	spin_unlock(&id_priv->lock);
@@ -3486,6 +3539,21 @@ void rdma_leave_multicast(struct rdma_cm_id *id, struct sockaddr *addr)
 					kfree(mc);
 					break;
 				case IB_LINK_LAYER_ETHERNET:
+					if (mc->igmp_joined) {
+						struct rdma_dev_addr *dev_addr = &id->route.addr.dev_addr;
+						struct net_device *ndev = NULL;
+
+						if (dev_addr->bound_dev_if)
+							ndev = dev_get_by_index(&init_net,
+										dev_addr->bound_dev_if);
+						if (ndev) {
+							cma_igmp_send(ndev,
+								      &mc->multicast.ib->rec.mgid,
+								      false);
+							dev_put(ndev);
+						}
+						mc->igmp_joined = false;
+					}
 					kref_put(&mc->mcref, release_mc);
 					break;
 				default:
diff --git a/drivers/infiniband/core/multicast.c b/drivers/infiniband/core/multicast.c
index f1927f1..9cbee6c 100644
--- a/drivers/infiniband/core/multicast.c
+++ b/drivers/infiniband/core/multicast.c
@@ -729,8 +729,22 @@ int ib_init_ah_from_mcmember(struct ib_device *device, u8 port_num,
 	u16 gid_index;
 	u8 p;
 
-	ret = ib_find_cached_gid(device, &rec->port_gid, IB_GID_TYPE_IB,
-				 NULL, 0, &p, &gid_index);
+	switch (rdma_port_get_link_layer(device, port_num)) {
+	case IB_LINK_LAYER_ETHERNET:
+		ret = ib_find_cached_gid_by_port(device, &rec->port_gid,
+						 rec->gid_type, port_num,
+						 rec->net, rec->ifindex,
+						 &gid_index);
+		break;
+	case IB_LINK_LAYER_INFINIBAND:
+		ret = ib_find_cached_gid(device, &rec->port_gid,
+					 IB_GID_TYPE_IB, NULL, 0, &p,
+					 &gid_index);
+		break;
+	default:
+		ret = -EINVAL;
+	}
+
 	if (ret)
 		return ret;
 
diff --git a/include/rdma/ib_sa.h b/include/rdma/ib_sa.h
index 61bc231..653d538 100644
--- a/include/rdma/ib_sa.h
+++ b/include/rdma/ib_sa.h
@@ -198,6 +198,9 @@ struct ib_sa_mcmember_rec {
 	u8           scope;
 	u8           join_state;
 	int          proxy_join;
+	int	     ifindex;
+	struct net  *net;
+	enum ib_gid_type gid_type;
 };
 
 /* Service Record Component Mask Sec 15.2.5.14 Ver 1.1	*/
-- 
2.1.0

--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply related	[flat|nested] 82+ messages in thread

* Re: [PATCH v3 for-next 01/33] IB/core: Add RoCE GID cache
       [not found]     ` <44ab0dce-c7c9-400b-af24-10b8981358a7-3RiH6ntJJkOPfaB/Gd0HpljyZtpTMMwT@public.gmane.org>
@ 2015-03-25 23:42       ` Bart Van Assche
       [not found]         ` <551347E9.5090503-XdAiOPVOjttBDgjK7y7TUQ@public.gmane.org>
  2015-04-08  0:30       ` Hefty, Sean
  2015-04-26 17:20       ` Or Gerlitz
  2 siblings, 1 reply; 82+ messages in thread
From: Bart Van Assche @ 2015-03-25 23:42 UTC (permalink / raw)
  To: Somnath Kotur, roland-DgEjT+Ai2ygdnm+yROfE0A
  Cc: linux-rdma-u79uwXL29TY76Z2rM5mHXA, Matan Barak

On 03/25/2015 02:19 PM, Somnath Kotur wrote:
> +	if (cache->data_vec[ix].attr.ndev &&
> +	    cache->data_vec[ix].attr.ndev != old_net_dev)

A few lines earlier the memory old_net_dev points at was freed. If two 
instances of this function run concurrently, what prevents that the 
old_net_dev memory has been reallocated and hence that attr.ndev == 
old_net_dev although both pointers refer(red) to different network devices ?

> +	ACCESS_ONCE(cache->data_vec[ix].seq) = orig_seq;

Invoking write_gid() is only safe if the caller serializes write_gid() 
calls. Apparently the cache->lock mutex is used for that purpose. So why 
is it necessary to use ACCESS_ONCE() here ? Why is it needed to prevent 
that the compiler coalesces this write with another write into the same 
structure ?

> +		/* Make sure the sequence number we remeber was read

This looks like a typo - shouldn't the above read "remember" ?

BTW, the style of that comment is recommended only for networking code 
and not for IB code. Have you verified this patch with checkpatch ?

> +	mutex_lock(&cache->lock);
> +
> +	for (ix = 0; ix < cache->sz; ix++)
> +		if (cache->data_vec[ix].attr.ndev == ndev)
> +			write_gid(ib_dev, port, cache, ix, &zgid, &zattr);
> +
> +	mutex_unlock(&cache->lock);
> +	return 0;

The traditional Linux kernel coding style is one blank line before 
mutex_lock() and after mutex_unlock() but not after mutex_lock() nor 
before mutex_unlock().

> +	orig_seq = ACCESS_ONCE(cache->data_vec[index].seq);
> +	/* Make sure we read the sequence number before copying the
> +	 * gid to local storage. */
> +	smp_rmb();

Please use READ_ONCE() instead of ACCESS_ONCE() as recommended in 
<linux/compiler.h>.

> +static void free_roce_gid_cache(struct ib_device *ib_dev, u8 port)
> +{
> +	int i;
> +	struct ib_roce_gid_cache *cache =
> +		ib_dev->cache.roce_gid_cache[port - 1];
> +
> +	if (!cache)
> +		return;
> +
> +	for (i = 0; i < cache->sz; ++i) {
> +		if (memcmp(&cache->data_vec[i].gid, &zgid,
> +			   sizeof(cache->data_vec[i].gid)))
> +		    write_gid(ib_dev, port, cache, i, &zgid, &zattr);
> +	}
 > +	kfree(cache->data_vec);
 > +	kfree(cache);
 > +}

Overwriting data just before it is freed is not useful. Please use 
CONFIG_SLUB_DEBUG=y to debug use-after-free issues instead of such code.

Bart.
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 82+ messages in thread

* Re: [PATCH v3 for-next 02/33] IB/core: Add kref to IB devices
       [not found]     ` <9f65de5e-ed5f-48d2-bff2-03ffbe4f4876-3RiH6ntJJkOPfaB/Gd0HpljyZtpTMMwT@public.gmane.org>
@ 2015-03-25 23:46       ` Bart Van Assche
       [not found]         ` <551348BD.9080200-XdAiOPVOjttBDgjK7y7TUQ@public.gmane.org>
  2015-04-26 20:10       ` Or Gerlitz
  1 sibling, 1 reply; 82+ messages in thread
From: Bart Van Assche @ 2015-03-25 23:46 UTC (permalink / raw)
  To: Somnath Kotur, roland-DgEjT+Ai2ygdnm+yROfE0A
  Cc: linux-rdma-u79uwXL29TY76Z2rM5mHXA, Matan Barak

On 03/25/2015 02:19 PM, Somnath Kotur wrote:
> +static void ib_device_complete_cb(struct kref *kref)
> +{
> +	struct ib_device *device = container_of(kref, struct ib_device,
> +						refcount);
> +
> +	if (device->reg_state >= IB_DEV_UNREGISTERING)
> +		complete(&device->free);
> +}

> @@ -355,6 +393,9 @@ void ib_unregister_device(struct ib_device *device)
>
>   	ib_device_unregister_sysfs(device);
>
> +	ib_device_put(device);
> +	wait_for_completion(&device->free);

Why is it necessary here to wait until the last reference is gone ? Why 
doesn't ib_device_complete_cb() free any memory ?

Bart.

--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 82+ messages in thread

* RE: [PATCH v3 for-next 01/33] IB/core: Add RoCE GID cache
       [not found]         ` <551347E9.5090503-XdAiOPVOjttBDgjK7y7TUQ@public.gmane.org>
@ 2015-03-26 14:05           ` Somnath Kotur
  2015-04-14 13:23           ` Matan Barak
  1 sibling, 0 replies; 82+ messages in thread
From: Somnath Kotur @ 2015-03-26 14:05 UTC (permalink / raw)
  To: Bart Van Assche, roland-DgEjT+Ai2ygdnm+yROfE0A
  Cc: linux-rdma-u79uwXL29TY76Z2rM5mHXA, Matan Barak

Hi Matan/Moni,
                    Could either of you please respond to both of Bart's queries?

Thanks
Somnath

> -----Original Message-----
> From: Bart Van Assche [mailto:bart.vanassche-XdAiOPVOjttBDgjK7y7TUQ@public.gmane.org]
> Sent: Thursday, March 26, 2015 5:13 AM
> To: Somnath Kotur; roland-DgEjT+Ai2ygdnm+yROfE0A@public.gmane.org
> Cc: linux-rdma-u79uwXL29TY76Z2rM5mHXA@public.gmane.org; Matan Barak
> Subject: Re: [PATCH v3 for-next 01/33] IB/core: Add RoCE GID cache
> 
> On 03/25/2015 02:19 PM, Somnath Kotur wrote:
> > +	if (cache->data_vec[ix].attr.ndev &&
> > +	    cache->data_vec[ix].attr.ndev != old_net_dev)
> 
> A few lines earlier the memory old_net_dev points at was freed. If two
> instances of this function run concurrently, what prevents that the
> old_net_dev memory has been reallocated and hence that attr.ndev ==
> old_net_dev although both pointers refer(red) to different network devices
> ?
> 
> > +	ACCESS_ONCE(cache->data_vec[ix].seq) = orig_seq;
> 
> Invoking write_gid() is only safe if the caller serializes write_gid() calls.
> Apparently the cache->lock mutex is used for that purpose. So why is it
> necessary to use ACCESS_ONCE() here ? Why is it needed to prevent that
> the compiler coalesces this write with another write into the same structure
> ?
> 
> > +		/* Make sure the sequence number we remeber was read
> 
> This looks like a typo - shouldn't the above read "remember" ?
> 
> BTW, the style of that comment is recommended only for networking code
> and not for IB code. Have you verified this patch with checkpatch ?
> 
> > +	mutex_lock(&cache->lock);
> > +
> > +	for (ix = 0; ix < cache->sz; ix++)
> > +		if (cache->data_vec[ix].attr.ndev == ndev)
> > +			write_gid(ib_dev, port, cache, ix, &zgid, &zattr);
> > +
> > +	mutex_unlock(&cache->lock);
> > +	return 0;
> 
> The traditional Linux kernel coding style is one blank line before
> mutex_lock() and after mutex_unlock() but not after mutex_lock() nor
> before mutex_unlock().
> 
> > +	orig_seq = ACCESS_ONCE(cache->data_vec[index].seq);
> > +	/* Make sure we read the sequence number before copying the
> > +	 * gid to local storage. */
> > +	smp_rmb();
> 
> Please use READ_ONCE() instead of ACCESS_ONCE() as recommended in
> <linux/compiler.h>.
> 
> > +static void free_roce_gid_cache(struct ib_device *ib_dev, u8 port) {
> > +	int i;
> > +	struct ib_roce_gid_cache *cache =
> > +		ib_dev->cache.roce_gid_cache[port - 1];
> > +
> > +	if (!cache)
> > +		return;
> > +
> > +	for (i = 0; i < cache->sz; ++i) {
> > +		if (memcmp(&cache->data_vec[i].gid, &zgid,
> > +			   sizeof(cache->data_vec[i].gid)))
> > +		    write_gid(ib_dev, port, cache, i, &zgid, &zattr);
> > +	}
>  > +	kfree(cache->data_vec);
>  > +	kfree(cache);
>  > +}
> 
> Overwriting data just before it is freed is not useful. Please use
> CONFIG_SLUB_DEBUG=y to debug use-after-free issues instead of such
> code.
> 
> Bart.
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 82+ messages in thread

* RE: [PATCH v3 for-next 01/33] IB/core: Add RoCE GID cache
       [not found]     ` <44ab0dce-c7c9-400b-af24-10b8981358a7-3RiH6ntJJkOPfaB/Gd0HpljyZtpTMMwT@public.gmane.org>
  2015-03-25 23:42       ` Bart Van Assche
@ 2015-04-08  0:30       ` Hefty, Sean
       [not found]         ` <1828884A29C6694DAF28B7E6B8A82373A8FBE792-P5GAC/sN6hkd3b2yrw5b5LfspsVTdybXVpNB7YpNyf8@public.gmane.org>
  2015-04-26 17:20       ` Or Gerlitz
  2 siblings, 1 reply; 82+ messages in thread
From: Hefty, Sean @ 2015-04-08  0:30 UTC (permalink / raw)
  To: Somnath Kotur, roland-DgEjT+Ai2ygdnm+yROfE0A
  Cc: linux-rdma-u79uwXL29TY76Z2rM5mHXA, Matan Barak

> In order to manage multiple types, vlans and MACs per GID, we
> need to store them along the GID itself. We store the net device
> as well, as sometimes GIDs should be handled according to the
> net device they came from. Since populating the GID table should
> be identical for every RoCE provider, the GIDs table should be
> handled in ib_core.
> 
> Adding a GID cache table that supports a lockless find, add and
> delete gids. The lockless nature comes from using a unique
> sequence number per table entry and detecting that while reading/
> writing this sequence wasn't changed.
> 
> By using this RoCE GID cache table, providers must implement a
> modify_gid callback. The table is managed exclusively by
> this roce_gid_cache and the provider just need to write
> the data to the hardware.
> 
> Signed-off-by: Matan Barak <matanb-VPRAkNaXOzVWk0Htik3J/w@public.gmane.org>
> Signed-off-by: Somnath Kotur <somnath.kotur-laKkSmNT4hbQT0dZR+AlfA@public.gmane.org>
> ---
>  drivers/infiniband/core/Makefile         |   3 +-
>  drivers/infiniband/core/core_priv.h      |  24 ++
>  drivers/infiniband/core/roce_gid_cache.c | 518

Why does RoCE need such a complex gid cache?  If a gid cache is needed at all, why should it be restricted to RoCE only?  And why is such a complex synchronization scheme needed?  Seriously, how many times will GIDs change and how many readers at once do you expect to have?


> diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h
> index 65994a1..1866595 100644
> --- a/include/rdma/ib_verbs.h
> +++ b/include/rdma/ib_verbs.h
> @@ -64,6 +64,36 @@ union ib_gid {
>  	} global;
>  };
> 
> +extern union ib_gid zgid;
> +
> +enum ib_gid_type {
> +	/* If link layer is Ethernet, this is RoCE V1 */

I don't understand this comment.  Does RoCE v2 not run on Ethernet?

> +	IB_GID_TYPE_IB        = 0,
> +	IB_GID_TYPE_ROCE_V2   = 1,
> +	IB_GID_TYPE_SIZE
> +};

Can you explain the purpose of defining a 'GID type'?  A GID is just a global address.  Why does it matter to anyone using it how it was constructed?

> +
> +struct ib_gid_attr {
> +	enum ib_gid_type	gid_type;
> +	struct net_device	*ndev;
> +};
> +
> +struct ib_roce_gid_cache_entry {
> +	/* seq number of 0 indicates entry being changed. */
> +	unsigned int        seq;
> +	union ib_gid        gid;
> +	struct ib_gid_attr  attr;
> +	void		   *context;
> +};
> +
> +struct ib_roce_gid_cache {
> +	int		     active;
> +	int                  sz;
> +	/* locking against multiple writes in data_vec */
> +	struct mutex         lock;
> +	struct ib_roce_gid_cache_entry *data_vec;
> +};
> +
>  enum rdma_node_type {
>  	/* IB values map to NodeInfo:NodeType. */
>  	RDMA_NODE_IB_CA 	= 1,
> @@ -265,7 +295,9 @@ enum ib_port_cap_flags {
>  	IB_PORT_BOOT_MGMT_SUP			= 1 << 23,
>  	IB_PORT_LINK_LATENCY_SUP		= 1 << 24,
>  	IB_PORT_CLIENT_REG_SUP			= 1 << 25,
> -	IB_PORT_IP_BASED_GIDS			= 1 << 26
> +	IB_PORT_IP_BASED_GIDS			= 1 << 26,
> +	IB_PORT_ROCE				= 1 << 27,
> +	IB_PORT_ROCE_V2				= 1 << 28,

Why does RoCE suddenly require a port capability bit?  RoCE runs today without setting any bit.

--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 82+ messages in thread

* RE: [PATCH v3 for-next 01/33] IB/core: Add RoCE GID cache
       [not found]         ` <1828884A29C6694DAF28B7E6B8A82373A8FBE792-P5GAC/sN6hkd3b2yrw5b5LfspsVTdybXVpNB7YpNyf8@public.gmane.org>
@ 2015-04-08  4:10           ` Somnath Kotur
       [not found]             ` <7F44EA5110810A40B7DAFB605C41975D58F98121-DWYeeINJQrxExQ8dmkPuX0M9+F4ksjoh@public.gmane.org>
  2015-04-08  8:49           ` Moni Shoua
  1 sibling, 1 reply; 82+ messages in thread
From: Somnath Kotur @ 2015-04-08  4:10 UTC (permalink / raw)
  To: Hefty, Sean, roland-DgEjT+Ai2ygdnm+yROfE0A
  Cc: linux-rdma-u79uwXL29TY76Z2rM5mHXA, Matan Barak

Hi Sean,

> -----Original Message-----
> From: Hefty, Sean [mailto:sean.hefty-ral2JQCrhuEAvxtiuMwx3w@public.gmane.org]
> Sent: Wednesday, April 08, 2015 6:00 AM
> To: Somnath Kotur; roland-DgEjT+Ai2ygdnm+yROfE0A@public.gmane.org
> Cc: linux-rdma-u79uwXL29TY76Z2rM5mHXA@public.gmane.org; Matan Barak
> Subject: RE: [PATCH v3 for-next 01/33] IB/core: Add RoCE GID cache
> 
> > In order to manage multiple types, vlans and MACs per GID, we need to
> > store them along the GID itself. We store the net device as well, as
> > sometimes GIDs should be handled according to the net device they came
> > from. Since populating the GID table should be identical for every
> > RoCE provider, the GIDs table should be handled in ib_core.
> >
> > Adding a GID cache table that supports a lockless find, add and delete
> > gids. The lockless nature comes from using a unique sequence number
> > per table entry and detecting that while reading/ writing this
> > sequence wasn't changed.
> >
> > By using this RoCE GID cache table, providers must implement a
> > modify_gid callback. The table is managed exclusively by this
> > roce_gid_cache and the provider just need to write the data to the
> > hardware.
> >
> > Signed-off-by: Matan Barak <matanb-VPRAkNaXOzVWk0Htik3J/w@public.gmane.org>
> > Signed-off-by: Somnath Kotur <somnath.kotur-laKkSmNT4hbQT0dZR+AlfA@public.gmane.org>
> > ---
> >  drivers/infiniband/core/Makefile         |   3 +-
> >  drivers/infiniband/core/core_priv.h      |  24 ++
> >  drivers/infiniband/core/roce_gid_cache.c | 518
> 
> Why does RoCE need such a complex gid cache?  If a gid cache is needed at
> all, why should it be restricted to RoCE only?  And why is such a complex
> synchronization scheme needed?  Seriously, how many times will GIDs
> change and how many readers at once do you expect to have?
> 
> 
> > diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index
> > 65994a1..1866595 100644
> > --- a/include/rdma/ib_verbs.h
> > +++ b/include/rdma/ib_verbs.h
> > @@ -64,6 +64,36 @@ union ib_gid {
> >  	} global;
> >  };
> >
> > +extern union ib_gid zgid;
> > +
> > +enum ib_gid_type {
> > +	/* If link layer is Ethernet, this is RoCE V1 */
> 
> I don't understand this comment.  Does RoCE v2 not run on Ethernet?
> 
Yes, this comment probably could use a reword.
> > +	IB_GID_TYPE_IB        = 0,
> > +	IB_GID_TYPE_ROCE_V2   = 1,
> > +	IB_GID_TYPE_SIZE
> > +};
> 
> Can you explain the purpose of defining a 'GID type'.  A GID is just a global
> address.  Why does it matter to anyone using it how it was constructed?

This is part of the RoCE v2 specification; please refer to Section A17.8.
The GID type determines the protocol used for outbound packet generation, i.e. RoCE v1 (Ethertype 0x8915) or RoCE v2 (over IPv4 or IPv6).
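
Concretely, the GID type selects the wire encapsulation when the packet
is built. A minimal sketch of that mapping (the enum and helper below are
illustrative only, not from the patches; the constants come from the specs):

/* RoCE v1: GRH directly after Ethertype 0x8915.
 * RoCE v2: BTH inside UDP (destination port 4791) over IPv4/IPv6. */
enum wire_format {
	WIRE_ROCE_V1_GRH,
	WIRE_ROCE_V2_UDP,
};

static enum wire_format wire_format_for(enum ib_gid_type gid_type)
{
	return gid_type == IB_GID_TYPE_ROCE_V2 ?
		WIRE_ROCE_V2_UDP : WIRE_ROCE_V1_GRH;
}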
> 
> > +
> > +struct ib_gid_attr {
> > +	enum ib_gid_type	gid_type;
> > +	struct net_device	*ndev;
> > +};
> > +
> > +struct ib_roce_gid_cache_entry {
> > +	/* seq number of 0 indicates entry being changed. */
> > +	unsigned int        seq;
> > +	union ib_gid        gid;
> > +	struct ib_gid_attr  attr;
> > +	void		   *context;
> > +};
> > +
> > +struct ib_roce_gid_cache {
> > +	int		     active;
> > +	int                  sz;
> > +	/* locking against multiple writes in data_vec */
> > +	struct mutex         lock;
> > +	struct ib_roce_gid_cache_entry *data_vec; };
> > +
> >  enum rdma_node_type {
> >  	/* IB values map to NodeInfo:NodeType. */
> >  	RDMA_NODE_IB_CA 	= 1,
> > @@ -265,7 +295,9 @@ enum ib_port_cap_flags {
> >  	IB_PORT_BOOT_MGMT_SUP			= 1 << 23,
> >  	IB_PORT_LINK_LATENCY_SUP		= 1 << 24,
> >  	IB_PORT_CLIENT_REG_SUP			= 1 << 25,
> > -	IB_PORT_IP_BASED_GIDS			= 1 << 26
> > +	IB_PORT_IP_BASED_GIDS			= 1 << 26,
> > +	IB_PORT_ROCE				= 1 << 27,
> > +	IB_PORT_ROCE_V2				= 1 << 28,
> 
> Why does RoCE suddenly require a port capability bit?  RoCE runs today
> without setting any bit.
Again, this is part of the RoCE v2 spec; please refer to Section A17.5.1, Query HCA (snippet pasted below):
A new "RoCE Supported" capability bit shall be added to the Port Attributes
list. This capability bit applies exclusively to ports of the new
"RoCEv2" type


Thanks
Som
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 82+ messages in thread

* Re: [PATCH v3 for-next 01/33] IB/core: Add RoCE GID cache
       [not found]         ` <1828884A29C6694DAF28B7E6B8A82373A8FBE792-P5GAC/sN6hkd3b2yrw5b5LfspsVTdybXVpNB7YpNyf8@public.gmane.org>
  2015-04-08  4:10           ` Somnath Kotur
@ 2015-04-08  8:49           ` Moni Shoua
  1 sibling, 0 replies; 82+ messages in thread
From: Moni Shoua @ 2015-04-08  8:49 UTC (permalink / raw)
  To: Hefty, Sean
  Cc: Somnath Kotur, roland-DgEjT+Ai2ygdnm+yROfE0A,
	linux-rdma-u79uwXL29TY76Z2rM5mHXA, Matan Barak

On Wed, Apr 8, 2015 at 2:30 AM, Hefty, Sean <sean.hefty-ral2JQCrhuEAvxtiuMwx3w@public.gmane.org> wrote:
>> In order to manage multiple types, vlans and MACs per GID, we
>> need to store them along the GID itself. We store the net device
>> as well, as sometimes GIDs should be handled according to the
>> net device they came from. Since populating the GID table should
>> be identical for every RoCE provider, the GIDs table should be
>> handled in ib_core.
>>
>> Adding a GID cache table that supports a lockless find, add and
>> delete gids. The lockless nature comes from using a unique
>> sequence number per table entry and detecting that while reading/
>> writing this sequence wasn't changed.
>>
>> By using this RoCE GID cache table, providers must implement a
>> modify_gid callback. The table is managed exclusively by
>> this roce_gid_cache and the provider just need to write
>> the data to the hardware.
>>
>> Signed-off-by: Matan Barak <matanb-VPRAkNaXOzVWk0Htik3J/w@public.gmane.org>
>> Signed-off-by: Somnath Kotur <somnath.kotur-laKkSmNT4hbQT0dZR+AlfA@public.gmane.org>
>> ---
>>  drivers/infiniband/core/Makefile         |   3 +-
>>  drivers/infiniband/core/core_priv.h      |  24 ++
>>  drivers/infiniband/core/roce_gid_cache.c | 518
>
> Why does RoCE need such a complex gid cache?  If a gid cache is needed at all, why should it be restricted to RoCE only?  And why is such a complex synchronization scheme needed?  Seriously, how many times will GIDs change and how many readers at once do you expect to have?
>
A GID cache is also implemented for link layer IB. However, for RoCE the
GID cache is also the manager of the table. This means that adding or
removing entries from the GID table is the responsibility of the
cache and not the HW/device driver. This is a new scheme that frees
each vendor's driver from having to deal with net and inet events.
The content of the GID table is much more dynamic for RoCE than for IB,
and so is access to the table, so I think the extra mechanism is required.
The fact that a GID entry is associated with net_device and inet_addr
objects that can be modified/deleted at any time is one example.
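
To illustrate the division of labor, the driver-side callback can be as
thin as this (a hedged sketch: the callback follows the description
above, see patch 01/33 for the actual signature; the names and body here
are assumptions):

/* Sketch: the core roce_gid_cache owns the table and handles the
 * net/inet events; the driver only mirrors slot <index> to hardware. */
static int example_modify_gid(struct ib_device *device, u8 port_num,
			      unsigned int index, const union ib_gid *gid,
			      const struct ib_gid_attr *attr, void **context)
{
	/* program the HW GID table entry, or clear it when *gid is the
	 * zero GID; no table bookkeeping or event handling needed here */
	return 0;
}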
>
>> diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h
>> index 65994a1..1866595 100644
>> --- a/include/rdma/ib_verbs.h
>> +++ b/include/rdma/ib_verbs.h
>> @@ -64,6 +64,36 @@ union ib_gid {
>>       } global;
>>  };
>>
>> +extern union ib_gid zgid;
>> +
>> +enum ib_gid_type {
>> +     /* If link layer is Ethernet, this is RoCE V1 */
>
> I don't understand this comment.  Does RoCE v2 not run on Ethernet?
>
>> +     IB_GID_TYPE_IB        = 0,
>> +     IB_GID_TYPE_ROCE_V2   = 1,
>> +     IB_GID_TYPE_SIZE
>> +};
>
> Can you explain the purpose of defining a 'GID type'.  A GID is just a global address.  Why does it matter to anyone using it how it was constructed?
>
>> +
>> +struct ib_gid_attr {
>> +     enum ib_gid_type        gid_type;
>> +     struct net_device       *ndev;
>> +};
>> +
>> +struct ib_roce_gid_cache_entry {
>> +     /* seq number of 0 indicates entry being changed. */
>> +     unsigned int        seq;
>> +     union ib_gid        gid;
>> +     struct ib_gid_attr  attr;
>> +     void               *context;
>> +};
>> +
>> +struct ib_roce_gid_cache {
>> +     int                  active;
>> +     int                  sz;
>> +     /* locking against multiple writes in data_vec */
>> +     struct mutex         lock;
>> +     struct ib_roce_gid_cache_entry *data_vec;
>> +};
>> +
>>  enum rdma_node_type {
>>       /* IB values map to NodeInfo:NodeType. */
>>       RDMA_NODE_IB_CA         = 1,
>> @@ -265,7 +295,9 @@ enum ib_port_cap_flags {
>>       IB_PORT_BOOT_MGMT_SUP                   = 1 << 23,
>>       IB_PORT_LINK_LATENCY_SUP                = 1 << 24,
>>       IB_PORT_CLIENT_REG_SUP                  = 1 << 25,
>> -     IB_PORT_IP_BASED_GIDS                   = 1 << 26
>> +     IB_PORT_IP_BASED_GIDS                   = 1 << 26,
>> +     IB_PORT_ROCE                            = 1 << 27,
>> +     IB_PORT_ROCE_V2                         = 1 << 28,
>
> Why does RoCE suddenly require a port capability bit?  RoCE runs today without setting any bit.
>
> --
> To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
> the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 82+ messages in thread

* RE: [PATCH v3 for-next 01/33] IB/core: Add RoCE GID cache
       [not found]             ` <7F44EA5110810A40B7DAFB605C41975D58F98121-DWYeeINJQrxExQ8dmkPuX0M9+F4ksjoh@public.gmane.org>
@ 2015-04-13 23:50               ` Hefty, Sean
       [not found]                 ` <1828884A29C6694DAF28B7E6B8A82373A8FC0C00-P5GAC/sN6hkd3b2yrw5b5LfspsVTdybXVpNB7YpNyf8@public.gmane.org>
  0 siblings, 1 reply; 82+ messages in thread
From: Hefty, Sean @ 2015-04-13 23:50 UTC (permalink / raw)
  To: Somnath Kotur, roland-DgEjT+Ai2ygdnm+yROfE0A
  Cc: linux-rdma-u79uwXL29TY76Z2rM5mHXA, Matan Barak

> Yes, this comment probably could use a reword..
> > > +	IB_GID_TYPE_IB        = 0,
> > > +	IB_GID_TYPE_ROCE_V2   = 1,
> > > +	IB_GID_TYPE_SIZE
> > > +};
> >
> > Can you explain the purpose of defining a 'GID type'.  A GID is just a
> global
> > address.  Why does it matter to anyone using it how it was constructed?
> 
> This is part of RoCE V2 Specification.  Please refer to Section A 17.8 .
> The GID Type determines the protocol for outbound packet generation i.e
> RoCE V1 (0x8915 Ether Type) or RoCEV2 (IPv4 or IPv6)

This isn't an interface for the RoCE specification.  Why does this need to be added to the verbs interface?  It hasn't been needed by apps yet, and I don't see why the apps should be made to care now how the GID is formatted.

> > > @@ -265,7 +295,9 @@ enum ib_port_cap_flags {
> > >  	IB_PORT_BOOT_MGMT_SUP			= 1 << 23,
> > >  	IB_PORT_LINK_LATENCY_SUP		= 1 << 24,
> > >  	IB_PORT_CLIENT_REG_SUP			= 1 << 25,
> > > -	IB_PORT_IP_BASED_GIDS			= 1 << 26
> > > +	IB_PORT_IP_BASED_GIDS			= 1 << 26,
> > > +	IB_PORT_ROCE				= 1 << 27,
> > > +	IB_PORT_ROCE_V2				= 1 << 28,
> >
> > Why does RoCE suddenly require a port capability bit?  RoCE runs today
> > without setting any bit.
> Again, this is part of RoCE V2 SPEC, please refer to Section A17.5.1-
> Query HCA(Pasting snippet below)
> A new "RoCE Supported" capability bit shall be added to the Port
> Attributes
> list. This capability bit applies exclusively to ports of the new
> "RoCEv2" type

Same comment as above.
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 82+ messages in thread

* Re: [PATCH v3 for-next 01/33] IB/core: Add RoCE GID cache
       [not found]                 ` <1828884A29C6694DAF28B7E6B8A82373A8FC0C00-P5GAC/sN6hkd3b2yrw5b5LfspsVTdybXVpNB7YpNyf8@public.gmane.org>
@ 2015-04-14  9:32                   ` Matan Barak
       [not found]                     ` <552CDEA5.6020709-VPRAkNaXOzVWk0Htik3J/w@public.gmane.org>
  0 siblings, 1 reply; 82+ messages in thread
From: Matan Barak @ 2015-04-14  9:32 UTC (permalink / raw)
  To: Hefty, Sean, Somnath Kotur, roland-DgEjT+Ai2ygdnm+yROfE0A
  Cc: linux-rdma-u79uwXL29TY76Z2rM5mHXA



On 4/14/2015 2:50 AM, Hefty, Sean wrote:
>> Yes, this comment probably could use a reword..
>>>> +	IB_GID_TYPE_IB        = 0,
>>>> +	IB_GID_TYPE_ROCE_V2   = 1,
>>>> +	IB_GID_TYPE_SIZE
>>>> +};
>>>
>>> Can you explain the purpose of defining a 'GID type'.  A GID is just a
>> global
>>> address.  Why does it matter to anyone using it how it was constructed?
>>
>> This is part of RoCE V2 Specification.  Please refer to Section A 17.8 .
>> The GID Type determines the protocol for outbound packet generation i.e
>> RoCE V1 (0x8915 Ether Type) or RoCEV2 (IPv4 or IPv6)
>
> This isn't an interface for the RoCE specification.  Why does this need to be added to the verbs interface?  It hasn't been needed by apps yet, and I don't see why the apps should be made to care now how the GID is formatted.
>

This is a part of the GID meta info. The user should be able to choose 
between RoCE V1 (which is represented here by IB_GID_TYPE_IB) and RoCE 
V2 - just as a user could choose between IPv6 and IPv4.

>>>> @@ -265,7 +295,9 @@ enum ib_port_cap_flags {
>>>>   	IB_PORT_BOOT_MGMT_SUP			= 1 << 23,
>>>>   	IB_PORT_LINK_LATENCY_SUP		= 1 << 24,
>>>>   	IB_PORT_CLIENT_REG_SUP			= 1 << 25,
>>>> -	IB_PORT_IP_BASED_GIDS			= 1 << 26
>>>> +	IB_PORT_IP_BASED_GIDS			= 1 << 26,
>>>> +	IB_PORT_ROCE				= 1 << 27,
>>>> +	IB_PORT_ROCE_V2				= 1 << 28,
>>>
>>> Why does RoCE suddenly require a port capability bit?  RoCE runs today
>>> without setting any bit.
>> Again, this is part of RoCE V2 SPEC, please refer to Section A17.5.1-
>> Query HCA(Pasting snippet below)
>> A new "RoCE Supported" capability bit shall be added to the Port
>> Attributes
>> list. This capability bit applies exclusively to ports of the new
>> "RoCEv2" type
>
> Same comment as above.
>
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 82+ messages in thread

* Re: [PATCH v3 for-next 01/33] IB/core: Add RoCE GID cache
       [not found]         ` <551347E9.5090503-XdAiOPVOjttBDgjK7y7TUQ@public.gmane.org>
  2015-03-26 14:05           ` Somnath Kotur
@ 2015-04-14 13:23           ` Matan Barak
       [not found]             ` <552D14C6.50000-VPRAkNaXOzVWk0Htik3J/w@public.gmane.org>
  1 sibling, 1 reply; 82+ messages in thread
From: Matan Barak @ 2015-04-14 13:23 UTC (permalink / raw)
  To: Bart Van Assche, Somnath Kotur, roland-DgEjT+Ai2ygdnm+yROfE0A,
	Moni Shoua, Or Gerlitz
  Cc: linux-rdma-u79uwXL29TY76Z2rM5mHXA



On 3/26/2015 1:42 AM, Bart Van Assche wrote:
> On 03/25/2015 02:19 PM, Somnath Kotur wrote:
>> +    if (cache->data_vec[ix].attr.ndev &&
>> +        cache->data_vec[ix].attr.ndev != old_net_dev)
>
> A few lines earlier the memory old_net_dev points at was freed. If two
> instances of this function run concurrently, what prevents that the
> old_net_dev memory has been reallocated and hence that attr.ndev ==
> old_net_dev although both pointers refer(red) to different network
> devices ?
>

write_gid is *almost* always called under a mutex. The only case where
it's not protected is in free_roce_gid_cache. free_roce_gid_cache is
called only in the error flow of roce_gid_cache_setup_one, when no
concurrent write_gid can happen (as the cache isn't set up yet), and in
roce_gid_cache_cleanup_one. roce_gid_cache_cleanup_one is in turn called
in the error flow of roce_gid_cache_client_setup_one (where no other
write_gid calls are expected, for the same reason as above) and in
roce_gid_cache_client_cleanup_work_handler, where it runs after
flush_workqueue(roce_gid_mgmt_wq). Since all write_gid calls go through
roce_gid_mgmt_wq, and we set the cache to inactive mode before flushing
the wq and freeing the cache, I think we can conclude that no concurrent
write_gid calls on the same cache are possible.
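
In short, the teardown ordering is (names taken from the paragraph above;
a sketch of the sequence, not the exact code):

	cache->active = 0;			/* stop accepting new writes */
	flush_workqueue(roce_gid_mgmt_wq);	/* drain in-flight write_gid work */
	free_roce_gid_cache(ib_dev, port);	/* safe: no concurrent write_gid */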

>> +    ACCESS_ONCE(cache->data_vec[ix].seq) = orig_seq;
>
> Invoking write_gid() is only safe if the caller serializes write_gid()
> calls. Apparently the cache->lock mutex is used for that purpose. So why
> is it necessary to use ACCESS_ONCE() here ? Why is it needed to prevent
> that the compiler coalesces this write with another write into the same
> structure ?
>

The mutex only serializes cache writes. Cache reads can run concurrently
with writes and are protected by the ACCESS_ONCE.
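
The read side then looks roughly like this (a minimal sketch of the
scheme, using the entry layout from the patch; the helper name is made
up):

static void read_gid_entry(struct ib_roce_gid_cache_entry *entry,
			   union ib_gid *gid)
{
	unsigned int seq;

	do {
		seq = READ_ONCE(entry->seq);	/* 0 means a write is in flight */
		smp_rmb();	/* read seq before copying the payload */
		memcpy(gid, &entry->gid, sizeof(*gid));
		smp_rmb();	/* finish the copy before re-checking seq */
	} while (!seq || READ_ONCE(entry->seq) != seq);
}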

>> +        /* Make sure the sequence number we remeber was read
>
> This looks like a typo - shouldn't the above read "remember" ?
>

Will be fixed in V4, thanks.

> BTW, the style of that comment is recommended only for networking code
> and not for IB code. Have you verified this patch with checkpatch ?
>

Of course, and I've just re-run checkpatch on this patch. It doesn't
catch this.

>> +    mutex_lock(&cache->lock);
>> +
>> +    for (ix = 0; ix < cache->sz; ix++)
>> +        if (cache->data_vec[ix].attr.ndev == ndev)
>> +            write_gid(ib_dev, port, cache, ix, &zgid, &zattr);
>> +
>> +    mutex_unlock(&cache->lock);
>> +    return 0;
>
> The traditional Linux kernel coding style is one blank line before
> mutex_lock() and after mutex_unlock() but not after mutex_lock() nor
> before mutex_unlock().
>

I didn't find this in the CodingStyle doc. Could you please quote or 
post a link?

>> +    orig_seq = ACCESS_ONCE(cache->data_vec[index].seq);
>> +    /* Make sure we read the sequence number before copying the
>> +     * gid to local storage. */
>> +    smp_rmb();
>
> Please use READ_ONCE() instead of ACCESS_ONCE() as recommended in
> <linux/compiler.h>.
>

Ok, I'll change that in V4. I see that READ_ONCE and WRITE_ONCE are
different from ACCESS_ONCE only for aggregated data types (which isn't
our case), but it won't hurt to change that.

>> +static void free_roce_gid_cache(struct ib_device *ib_dev, u8 port)
>> +{
>> +    int i;
>> +    struct ib_roce_gid_cache *cache =
>> +        ib_dev->cache.roce_gid_cache[port - 1];
>> +
>> +    if (!cache)
>> +        return;
>> +
>> +    for (i = 0; i < cache->sz; ++i) {
>> +        if (memcmp(&cache->data_vec[i].gid, &zgid,
>> +               sizeof(cache->data_vec[i].gid)))
>> +            write_gid(ib_dev, port, cache, i, &zgid, &zattr);
>> +    }
>  > +    kfree(cache->data_vec);
>  > +    kfree(cache);
>  > +}
>
> Overwriting data just before it is freed is not useful. Please use
> CONFIG_SLUB_DEBUG=y to debug use-after-free issues instead of such code.
>

It's mandatory as write_gid with zgid might cause the vendor driver to 
free memory it allocated for this GID entry (like in the mlx4 case).

> Bart.

Thanks for the review :)

Matan
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 82+ messages in thread

* Re: [PATCH v3 for-next 02/33] IB/core: Add kref to IB devices
       [not found]         ` <551348BD.9080200-XdAiOPVOjttBDgjK7y7TUQ@public.gmane.org>
@ 2015-04-14 13:27           ` Matan Barak
  0 siblings, 0 replies; 82+ messages in thread
From: Matan Barak @ 2015-04-14 13:27 UTC (permalink / raw)
  To: Bart Van Assche, Somnath Kotur, roland-DgEjT+Ai2ygdnm+yROfE0A,
	Moni Shoua, Or Gerlitz
  Cc: linux-rdma-u79uwXL29TY76Z2rM5mHXA



On 3/26/2015 1:46 AM, Bart Van Assche wrote:
> On 03/25/2015 02:19 PM, Somnath Kotur wrote:
>> +static void ib_device_complete_cb(struct kref *kref)
>> +{
>> +    struct ib_device *device = container_of(kref, struct ib_device,
>> +                        refcount);
>> +
>> +    if (device->reg_state >= IB_DEV_UNREGISTERING)
>> +        complete(&device->free);
>> +}
>
>> @@ -355,6 +393,9 @@ void ib_unregister_device(struct ib_device *device)
>>
>>       ib_device_unregister_sysfs(device);
>>
>> +    ib_device_put(device);
>> +    wait_for_completion(&device->free);
>
> Why is it necessary here to wait until the last reference is gone ? Why
> doesn't ib_device_complete_cb() free any memory ?
>

IMHO, ib_unregister_device should be a blocking call. The caller would 
like to be certain that any usage of the IB device is completed before 
it frees other resources/memory it allocated.
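
The pattern, simplified (a sketch of the patch's scheme, not a verbatim
copy):

static void ib_device_complete_cb(struct kref *kref)
{
	struct ib_device *device = container_of(kref, struct ib_device,
						refcount);

	complete(&device->free);
}

/* in ib_unregister_device(): drop the initial reference, then block
 * until the last holder drops its reference and fires the completion */
ib_device_put(device);
wait_for_completion(&device->free);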

> Bart.
>

Matan
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 82+ messages in thread

* Re: [PATCH v3 for-next 01/33] IB/core: Add RoCE GID cache
       [not found]             ` <552D14C6.50000-VPRAkNaXOzVWk0Htik3J/w@public.gmane.org>
@ 2015-04-14 15:31               ` Bart Van Assche
  0 siblings, 0 replies; 82+ messages in thread
From: Bart Van Assche @ 2015-04-14 15:31 UTC (permalink / raw)
  To: Matan Barak, Somnath Kotur, roland-DgEjT+Ai2ygdnm+yROfE0A,
	Moni Shoua, Or Gerlitz
  Cc: linux-rdma-u79uwXL29TY76Z2rM5mHXA

On 04/14/15 15:23, Matan Barak wrote:
>>> +    mutex_lock(&cache->lock);
>>> +
>>> +    for (ix = 0; ix < cache->sz; ix++)
>>> +        if (cache->data_vec[ix].attr.ndev == ndev)
>>> +            write_gid(ib_dev, port, cache, ix, &zgid, &zattr);
>>> +
>>> +    mutex_unlock(&cache->lock);
>>> +    return 0;
>>
>> The traditional Linux kernel coding style is one blank line before
>> mutex_lock() and after mutex_unlock() but not after mutex_lock() nor
>> before mutex_unlock().
>>
>
> I didn't find this in the CodingStyle doc. Could you please quote or
> post a link?

Hello Matan,

I'm not aware of any formal documentation of this style guideline. But 
if you look around in the Linux kernel tree you will see that most 
kernel code follows this style.

Bart.


--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 82+ messages in thread

* RE: [PATCH v3 for-next 01/33] IB/core: Add RoCE GID cache
       [not found]                     ` <552CDEA5.6020709-VPRAkNaXOzVWk0Htik3J/w@public.gmane.org>
@ 2015-04-14 17:32                       ` Hefty, Sean
       [not found]                         ` <1828884A29C6694DAF28B7E6B8A82373A8FC11F3-P5GAC/sN6hkd3b2yrw5b5LfspsVTdybXVpNB7YpNyf8@public.gmane.org>
  0 siblings, 1 reply; 82+ messages in thread
From: Hefty, Sean @ 2015-04-14 17:32 UTC (permalink / raw)
  To: Matan Barak, Somnath Kotur, roland-DgEjT+Ai2ygdnm+yROfE0A
  Cc: linux-rdma-u79uwXL29TY76Z2rM5mHXA

> This is a part of the GID meta info. The user should be able to choose
> between RoCE V1 (which is represented here by IB_GID_TYPE_IB) and RoCE
> V2 - just as a user could choose between IPv6 and IPv4.

IPv4 and IPv6 are different protocols, not different formats for the same address.  How does RoCE v2 not break every app?  This isn't like asking the user to choose between IPv4 versus IPv6, it's asking them to choose between IPv4 assigned by DHCP versus IPv4 assigned statically.
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 82+ messages in thread

* RE: [PATCH v3 for-next 01/33] IB/core: Add RoCE GID cache
       [not found]                         ` <1828884A29C6694DAF28B7E6B8A82373A8FC11F3-P5GAC/sN6hkd3b2yrw5b5LfspsVTdybXVpNB7YpNyf8@public.gmane.org>
@ 2015-04-15  5:35                           ` Somnath Kotur
       [not found]                             ` <7F44EA5110810A40B7DAFB605C41975D58FA0B05-DWYeeINJQrxExQ8dmkPuX0M9+F4ksjoh@public.gmane.org>
  0 siblings, 1 reply; 82+ messages in thread
From: Somnath Kotur @ 2015-04-15  5:35 UTC (permalink / raw)
  To: Hefty, Sean, Matan Barak, roland-DgEjT+Ai2ygdnm+yROfE0A
  Cc: linux-rdma-u79uwXL29TY76Z2rM5mHXA



> -----Original Message-----
> From: Hefty, Sean [mailto:sean.hefty-ral2JQCrhuEAvxtiuMwx3w@public.gmane.org]
> Sent: Tuesday, April 14, 2015 11:02 PM
> To: Matan Barak; Somnath Kotur; roland-DgEjT+Ai2ygdnm+yROfE0A@public.gmane.org
> Cc: linux-rdma-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
> Subject: RE: [PATCH v3 for-next 01/33] IB/core: Add RoCE GID cache
> 
> > This is a part of the GID meta info. The user should be able to choose
> > between RoCE V1 (which is represented here by IB_GID_TYPE_IB) and
> RoCE
> > V2 - just as a user could choose between IPv6 and IPv4.
> 
> IPv4 and IPv6 are different protocols, not different formats for the same
> address.  How does RoCE v2 not break every app? 
It does not break every app; the choice of which GID type to use is made by the RDMA-CM based on a network topology hint obtained from the IP stack.
Please refer to patch 15/33: IB/Core: Changes to the IB Core infrastructure for RoCEv2 support.
Of course, if the user does not want to go with the choice made by the RDMA-CM, there is the option of overriding it using the configfs patch (PATCH 14/33).
Hope that clarifies?

Thanks
Som
 
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 82+ messages in thread

* RE: [PATCH v3 for-next 01/33] IB/core: Add RoCE GID cache
       [not found]                             ` <7F44EA5110810A40B7DAFB605C41975D58FA0B05-DWYeeINJQrxExQ8dmkPuX0M9+F4ksjoh@public.gmane.org>
@ 2015-04-15 16:08                               ` Hefty, Sean
       [not found]                                 ` <1828884A29C6694DAF28B7E6B8A82373A8FC19D9-P5GAC/sN6hkd3b2yrw5b5LfspsVTdybXVpNB7YpNyf8@public.gmane.org>
  0 siblings, 1 reply; 82+ messages in thread
From: Hefty, Sean @ 2015-04-15 16:08 UTC (permalink / raw)
  To: Somnath Kotur, Matan Barak, roland-DgEjT+Ai2ygdnm+yROfE0A
  Cc: linux-rdma-u79uwXL29TY76Z2rM5mHXA

> It does not  break every app, the choice of which GID type to use is made
> by the RDMA-CM based on network topology hint obtained from the IP stack.
> Please refer to patch 15/33: IB/Core: Changes to the IB Core
> infrastructure for RoCEv2 support.
> Of course, if the user does not want to go with this choice made by the
> RDMA-CM, then there is the option of overriding it using the configfs
> patch (PATCH 14/33)
> Hope that clarifies?

RoCE v2 is really InfiniBand over UDP over IP.  Why don't we just call it IBoUDP like it is?

IBoUDP changes the Ethertype, replaces the network header, adds a new transport protocol header, and layers IB over that.  This change should be exposed properly and not as just a new GID type.
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 82+ messages in thread

* RE: [PATCH v3 for-next 01/33] IB/core: Add RoCE GID cache
       [not found]                                 ` <1828884A29C6694DAF28B7E6B8A82373A8FC19D9-P5GAC/sN6hkd3b2yrw5b5LfspsVTdybXVpNB7YpNyf8@public.gmane.org>
@ 2015-04-15 16:21                                   ` Suri Shelvapille
       [not found]                                     ` <CY1PR03MB1440108D65F18916AF9B2425DEE50-DUcFgbLRNhB/HYnSB+xpdWP7xZHs9kq/vxpqHgZTriW3zl9H0oFU5g@public.gmane.org>
  2015-04-16 10:43                                   ` Moni Shoua
  1 sibling, 1 reply; 82+ messages in thread
From: Suri Shelvapille @ 2015-04-15 16:21 UTC (permalink / raw)
  To: Hefty, Sean, Somnath Kotur, Matan Barak, roland-DgEjT+Ai2ygdnm+yROfE0A
  Cc: linux-rdma-u79uwXL29TY76Z2rM5mHXA

IMHO, it would be good to have a physical layer representation in the naming convention.

-----Original Message-----
From: linux-rdma-owner-u79uwXL29TY76Z2rM5mHXA@public.gmane.org [mailto:linux-rdma-owner-u79uwXL29TY76Z2rM5mHXA@public.gmane.org] On Behalf Of Hefty, Sean
Sent: Wednesday, April 15, 2015 12:08 PM
To: Somnath Kotur; Matan Barak; roland-DgEjT+Ai2ygdnm+yROfE0A@public.gmane.org
Cc: linux-rdma-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
Subject: RE: [PATCH v3 for-next 01/33] IB/core: Add RoCE GID cache

> It does not  break every app, the choice of which GID type to use is
> made by the RDMA-CM based on network topology hint obtained from the IP stack.
> Please refer to patch 15/33: IB/Core: Changes to the IB Core
> infrastructure for RoCEv2 support.
> Of course, if the user does not want to go with this choice made by
> the RDMA-CM, then there is the option of overriding it using the
> configfs patch (PATCH 14/33) Hope that clarifies?

RoCE v2 is really Infiniband over UDP over IP.  Why don't we just call it IBoUDP like it is?

IBoUDP changes the Ethertype, replaces the network header, adds a new transport protocol header, and layers IB over that.  This change should be exposed properly and not as just a new GID type.
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org More majordomo info at  http://vger.kernel.org/majordomo-info.html

This correspondence, and any attachments or files transmitted with this correspondence, contains information which may be confidential and privileged and is intended solely for the use of the addressee. Unless you are the addressee or are authorized to receive messages for the addressee, you may not use, copy, disseminate, or disclose this correspondence or any information contained in this correspondence to any third party. If you have received this correspondence in error, please notify the sender immediately and delete this correspondence and any attachments or files transmitted with this correspondence from your system, and destroy any and all copies thereof, electronic or otherwise. Your cooperation and understanding are greatly appreciated.
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 82+ messages in thread

* Re: [PATCH v3 for-next 01/33] IB/core: Add RoCE GID cache
       [not found]                                     ` <CY1PR03MB1440108D65F18916AF9B2425DEE50-DUcFgbLRNhB/HYnSB+xpdWP7xZHs9kq/vxpqHgZTriW3zl9H0oFU5g@public.gmane.org>
@ 2015-04-16 10:42                                       ` Matan Barak
  0 siblings, 0 replies; 82+ messages in thread
From: Matan Barak @ 2015-04-16 10:42 UTC (permalink / raw)
  To: Suri Shelvapille, Hefty, Sean, Somnath Kotur,
	roland-DgEjT+Ai2ygdnm+yROfE0A
  Cc: linux-rdma-u79uwXL29TY76Z2rM5mHXA

AFAIK, RoCE v2 is the known and official name. Why would we want to come 
up with a customized name?

These are indeed two different protocols, thus the comparison to 
DHCP-assigned addresses and static addresses is (to say the least) a bit off.

Even when comparing IPv4 and IPv6, the most significant user-visible change 
is the sockaddr_in and sockaddr_in6 addresses. IMHO, since the GID format 
is identical, the changes could be encoded in a gid_type argument. The 
gid_type is an inherent part of the address, making identical GIDs with 
different gid_types two different addresses, as expected.
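
As a minimal sketch of what that means for a lookup (the entry layout
here is illustrative, not the exact one from this series; enum
ib_gid_type is the type this series introduces):

#include <linux/string.h>
#include <linux/types.h>
#include <rdma/ib_verbs.h>

struct gid_entry {
        union ib_gid     gid;
        enum ib_gid_type gid_type;
};

/* Identical GIDs with different gid_types are two different addresses. */
static bool gid_entry_match(const struct gid_entry *entry,
                            const union ib_gid *gid,
                            enum ib_gid_type gid_type)
{
        return entry->gid_type == gid_type &&
               !memcmp(&entry->gid, gid, sizeof(*gid));
}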

On 4/15/2015 7:21 PM, Suri Shelvapille wrote:
> IMHO, it would be good to have a physical layer representation in the naming convention.
>
> -----Original Message-----
> From: linux-rdma-owner-u79uwXL29TY76Z2rM5mHXA@public.gmane.org [mailto:linux-rdma-owner-u79uwXL29TY76Z2rM5mHXA@public.gmane.org] On Behalf Of Hefty, Sean
> Sent: Wednesday, April 15, 2015 12:08 PM
> To: Somnath Kotur; Matan Barak; roland-DgEjT+Ai2ygdnm+yROfE0A@public.gmane.org
> Cc: linux-rdma-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
> Subject: RE: [PATCH v3 for-next 01/33] IB/core: Add RoCE GID cache
>
>> It does not  break every app, the choice of which GID type to use is
>> made by the RDMA-CM based on network topology hint obtained from the IP stack.
>> Please refer to patch 15/33: IB/Core: Changes to the IB Core
>> infrastructure for RoCEv2 support.
>> Of course, if the user does not want to go with this choice made by
>> the RDMA-CM, then there is the option of overriding it using the
>> configfs patch (PATCH 14/33) Hope that clarifies?
>
> RoCE v2 is really Infiniband over UDP over IP.  Why don't we just call it IBoUDP like it is?
>
> IBoUDP changes the Ethertype, replaces the network header, adds a new transport protocol header, and layers IB over that.  This change should be exposed properly and not as just a new GID type.

^ permalink raw reply	[flat|nested] 82+ messages in thread

* Re: [PATCH v3 for-next 01/33] IB/core: Add RoCE GID cache
       [not found]                                 ` <1828884A29C6694DAF28B7E6B8A82373A8FC19D9-P5GAC/sN6hkd3b2yrw5b5LfspsVTdybXVpNB7YpNyf8@public.gmane.org>
  2015-04-15 16:21                                   ` Suri Shelvapille
@ 2015-04-16 10:43                                   ` Moni Shoua
       [not found]                                     ` <CAG9sBKPQ7r2j4Awd3=CtRzekWPVe6hcO1+S+kspMEr4n=kDnkw-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
  1 sibling, 1 reply; 82+ messages in thread
From: Moni Shoua @ 2015-04-16 10:43 UTC (permalink / raw)
  To: Hefty, Sean
  Cc: Somnath Kotur, Matan Barak, roland-DgEjT+Ai2ygdnm+yROfE0A,
	linux-rdma-u79uwXL29TY76Z2rM5mHXA

On Wed, Apr 15, 2015 at 7:08 PM, Hefty, Sean <sean.hefty-ral2JQCrhuEAvxtiuMwx3w@public.gmane.org> wrote:
>> It does not  break every app, the choice of which GID type to use is made
>> by the RDMA-CM based on network topology hint obtained from the IP stack.
>> Please refer to patch 15/33: IB/Core: Changes to the IB Core
>> infrastructure for RoCEv2 support.
>> Of course, if the user does not want to go with this choice made by the
>> RDMA-CM, then there is the option of overriding it using the configfs
>> patch (PATCH 14/33)
>> Hope that clarifies?
>
> RoCE v2 is really Infiniband over UDP over IP.  Why don't we just call it IBoUDP like it is?
RoCEv2 is the name in the IBTA spec (Annex 17)
>
> IBoUDP changes the Ethertype, replaces the network header, adds a new transport protocol header, and layers IB over that.  This change should be exposed properly and not as just a new GID type.
I don't understand what you are suggesting here. Can you give an example?

^ permalink raw reply	[flat|nested] 82+ messages in thread

* RE: [PATCH v3 for-next 01/33] IB/core: Add RoCE GID cache
       [not found]                                     ` <CAG9sBKPQ7r2j4Awd3=CtRzekWPVe6hcO1+S+kspMEr4n=kDnkw-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
@ 2015-04-16 14:58                                       ` Hefty, Sean
  0 siblings, 0 replies; 82+ messages in thread
From: Hefty, Sean @ 2015-04-16 14:58 UTC (permalink / raw)
  To: Moni Shoua
  Cc: Somnath Kotur, Matan Barak, roland-DgEjT+Ai2ygdnm+yROfE0A,
	linux-rdma-u79uwXL29TY76Z2rM5mHXA

> > RoCE v2 is really Infiniband over UDP over IP.  Why don't we just call
> it IBoUDP like it is?
> RoCEv2 is the name in the IBTA spec (Annex 17)

We call RoCE IBoE in the kernel, because that's what it is.  RoCE is an IBTA marketing name.

Looking through the Annex, I don't see where Ethernet is even a requirement for this technology to work.  The IB transport is layered over a standard UDP header.  I do see where the spec calls out updating the IP header, but that's it.

Regardless of what it's called, it replaces the underlying network and transport protocols, versus IB-classic or IBoE/RoCE.  That should be captured properly, not by saying there's a new GID type.  RoCE v2 doesn't even use GIDs as part of its protocol.  It uses UDP/IP addresses.
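
To spell out the layering being compared (the UDP destination port
RoCE v2 uses, 4791, is the IANA-assigned value):

  RoCE (IBoE): Ethernet | GRH | BTH + IB payload | ICRC
  RoCE v2:     Ethernet | IPv4/IPv6 | UDP (dport 4791) | BTH + IB payload | ICRC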


> > IBoUDP changes the Ethertype, replaces the network header, adds a new
> transport protocol header, and layers IB over that.  This change should be
> exposed properly and not as just a new GID type.
> I don't understand what do you suggest here. Can you give an example?

I don't have a solution here.  Please look at Michael Wang's patch series and see how this would fit into that model.  The introduction of iWarp required defining a new 'transport' type.  IBoE added a new link layer.  Based on those changes, this would warrant introducing a new network layer, so that it can be distinguished properly from the other options.  Maybe that's the right approach?

Cisco's NIC reports a transport layer of 'USNIC_UDP', which should really just be 'UDP'.  That NIC supports UDP/IP/Ethernet, based on the rdma stack's model.  RoCE v2 is also UDP/IP/Ethernet; it only layers IB over that.  (This makes the use of the term 'transport' confusing.  Maybe there should also be a 'session' protocol?)  It seems completely reasonable for a device which does IB/UDP/IP/Ethernet to someday expose the UDP/IP/Ethernet portion (if it doesn't already), and from the same port at the same time.

Rather than continuing to try to make everything look like an IB-classic device because it's convenient, the stack needs to start exposing things properly.  I don't know what the right solution should be, but trying to capture this level of detail as a different GID type definitely looks like a step in the wrong direction.

- Sean


^ permalink raw reply	[flat|nested] 82+ messages in thread

* Re: [PATCH v3 for-next 01/33] IB/core: Add RoCE GID cache
       [not found]     ` <44ab0dce-c7c9-400b-af24-10b8981358a7-3RiH6ntJJkOPfaB/Gd0HpljyZtpTMMwT@public.gmane.org>
  2015-03-25 23:42       ` Bart Van Assche
  2015-04-08  0:30       ` Hefty, Sean
@ 2015-04-26 17:20       ` Or Gerlitz
       [not found]         ` <CAJ3xEMgepRUQs+GiMWxzV_QFaRnfbX7TPOdB_sKgRhHj7x7NDg-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
  2 siblings, 1 reply; 82+ messages in thread
From: Or Gerlitz @ 2015-04-26 17:20 UTC (permalink / raw)
  To: Somnath Kotur
  Cc: Roland Dreier, linux-rdma-u79uwXL29TY76Z2rM5mHXA, Matan Barak

On Thu, Mar 26, 2015 at 12:19 AM, Somnath Kotur
<somnath.kotur-laKkSmNT4hbQT0dZR+AlfA@public.gmane.org> wrote:
> From: Matan Barak <matanb-VPRAkNaXOzVWk0Htik3J/w@public.gmane.org>
>
> In order to manage multiple types, vlans and MACs per GID, we
> need to store them along the GID itself. We store the net device
> as well, as sometimes GIDs should be handled according to the
> net device they came from. Since populating the GID table should
> be identical for every RoCE provider, the GIDs table should be
> handled in ib_core.
>
> Adding a GID cache table that supports a lockless find, add and
> delete gids. The lockless nature comes from using a unique
> sequence number per table entry and detecting that while reading/
> writing this sequence wasn't changed.

Matan, please use existing mechanism which fits the problem you are
trying to solve, I guess one of RCU or seqlock should do the job.

Or.

^ permalink raw reply	[flat|nested] 82+ messages in thread

* Re: [PATCH v3 for-next 02/33] IB/core: Add kref to IB devices
       [not found]     ` <9f65de5e-ed5f-48d2-bff2-03ffbe4f4876-3RiH6ntJJkOPfaB/Gd0HpljyZtpTMMwT@public.gmane.org>
  2015-03-25 23:46       ` Bart Van Assche
@ 2015-04-26 20:10       ` Or Gerlitz
       [not found]         ` <CAJ3xEMhBNt-VNNds37sXnJi3nP9ZTMd6mC3s+qZWh0XsO1n_Nw-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
  1 sibling, 1 reply; 82+ messages in thread
From: Or Gerlitz @ 2015-04-26 20:10 UTC (permalink / raw)
  To: Somnath Kotur
  Cc: Roland Dreier, linux-rdma-u79uwXL29TY76Z2rM5mHXA, Matan Barak

On Thu, Mar 26, 2015 at 12:19 AM, Somnath Kotur
<somnath.kotur-laKkSmNT4hbQT0dZR+AlfA@public.gmane.org> wrote:
> From: Matan Barak <matanb-VPRAkNaXOzVWk0Htik3J/w@public.gmane.org>
>
> Previously. we used device_mutex lock in order to protect
> the device's list. That means that in order to guarantee a
> device isn't freed while we use it, we had to lock all
> devices.

Matan, looking on the cover letter, it says: "[...] Patch 0002 adds a
reference count mechanism to IB devices. This mechanism is similar to
dev_hold and dev_put available for net devices. This is mandatory for
later patches [...]"

So in that respect, saying here "Previously. we used device_mutex
lock" is a bit cryptic, @ least one typo must exist in this sentence,
right? did you want to say "Currently we use device_mutex lock for XXX
[...] and this should be replaced as of a YYY change which is to be
introduced [...]" please clarify

> Adding a kref per IB device. Before an IB device
> is unregistered, we wait before its not held anymore.

^ permalink raw reply	[flat|nested] 82+ messages in thread

* Re: [PATCH v3 for-next 01/33] IB/core: Add RoCE GID cache
       [not found]         ` <CAJ3xEMgepRUQs+GiMWxzV_QFaRnfbX7TPOdB_sKgRhHj7x7NDg-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
@ 2015-04-27  7:32           ` Matan Barak
       [not found]             ` <553DE614.7050508-VPRAkNaXOzVWk0Htik3J/w@public.gmane.org>
  0 siblings, 1 reply; 82+ messages in thread
From: Matan Barak @ 2015-04-27  7:32 UTC (permalink / raw)
  To: Or Gerlitz, Somnath Kotur
  Cc: Roland Dreier, linux-rdma-u79uwXL29TY76Z2rM5mHXA



On 4/26/2015 8:20 PM, Or Gerlitz wrote:
> On Thu, Mar 26, 2015 at 12:19 AM, Somnath Kotur
> <somnath.kotur-laKkSmNT4hbQT0dZR+AlfA@public.gmane.org> wrote:
>> From: Matan Barak <matanb-VPRAkNaXOzVWk0Htik3J/w@public.gmane.org>
>>
>> In order to manage multiple types, vlans and MACs per GID, we
>> need to store them along the GID itself. We store the net device
>> as well, as sometimes GIDs should be handled according to the
>> net device they came from. Since populating the GID table should
>> be identical for every RoCE provider, the GIDs table should be
>> handled in ib_core.
>>
>> Adding a GID cache table that supports a lockless find, add and
>> delete gids. The lockless nature comes from using a unique
>> sequence number per table entry and detecting that while reading/
>> writing this sequence wasn't changed.
>
> Matan, please use existing mechanism which fits the problem you are
> trying to solve, I guess one of RCU or seqlock should do the job.
>

seqcount fits this problem better. If a write and a read are done in 
parallel, there's a good chance we read an out-of-date entry and will 
use a GID entry that's going to change at T+epsilon, so RCU 
doesn't really have an advantage here.

The current implementation is a bit more efficient than seqcount, as it 
allows early termination of read-while-write (because the write puts a 
known "currently updating" value that the read knows to ignore). AFAIK, 
this doesn't exist in the current seqcount implementation. However, 
since this isn't a crucial data-path, I'll change that to seqcount.

seqcount is preferred over seqlock, as I don't need the spinlock in seqlock.
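
For illustration, a minimal sketch of the seqcount variant (entry layout
and helper names are hypothetical; writers are assumed to be serialized
externally, as seqcount requires):

#include <linux/seqlock.h>
#include <rdma/ib_verbs.h>

struct cache_entry {
        seqcount_t   seq;       /* seqcount_init() at table setup */
        union ib_gid gid;
};

/* Writer: callers must already be serialized (e.g. by a table mutex). */
static void entry_write(struct cache_entry *entry, const union ib_gid *gid)
{
        write_seqcount_begin(&entry->seq);
        entry->gid = *gid;
        write_seqcount_end(&entry->seq);
}

/* Reader: lockless; retries if it raced with a writer. */
static void entry_read(struct cache_entry *entry, union ib_gid *gid)
{
        unsigned int start;

        do {
                start = read_seqcount_begin(&entry->seq);
                *gid = entry->gid;
        } while (read_seqcount_retry(&entry->seq, start));
}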

> Or.
>

Matan

^ permalink raw reply	[flat|nested] 82+ messages in thread

* Re: [PATCH v3 for-next 02/33] IB/core: Add kref to IB devices
       [not found]         ` <CAJ3xEMhBNt-VNNds37sXnJi3nP9ZTMd6mC3s+qZWh0XsO1n_Nw-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
@ 2015-04-27  8:25           ` Matan Barak
       [not found]             ` <553DF294.4070507-VPRAkNaXOzVWk0Htik3J/w@public.gmane.org>
  0 siblings, 1 reply; 82+ messages in thread
From: Matan Barak @ 2015-04-27  8:25 UTC (permalink / raw)
  To: Or Gerlitz, Somnath Kotur
  Cc: Roland Dreier, linux-rdma-u79uwXL29TY76Z2rM5mHXA



On 4/26/2015 11:10 PM, Or Gerlitz wrote:
> On Thu, Mar 26, 2015 at 12:19 AM, Somnath Kotur
> <somnath.kotur-laKkSmNT4hbQT0dZR+AlfA@public.gmane.org> wrote:
>> From: Matan Barak <matanb-VPRAkNaXOzVWk0Htik3J/w@public.gmane.org>
>>
>> Previously. we used device_mutex lock in order to protect
>> the device's list. That means that in order to guarantee a
>> device isn't freed while we use it, we had to lock all
>> devices.
>
> Matan, looking on the cover letter, it says: "[...] Patch 0002 adds a
> reference count mechanism to IB devices. This mechanism is similar to
> dev_hold and dev_put available for net devices. This is mandatory for
> later patches [...]"
>
> So in that respect, saying here "Previously. we used device_mutex
> lock" is a bit cryptic, @ least one typo must exist in this sentence,
> right? did you want to say "Currently we use device_mutex lock for XXX
> [...] and this should be replaced as of a YYY change which is to be
> introduced [...]" please clarify

Correct, I'll change that into:

Currently we use device_mutex lock for protecting the device's list. In 
the current approach, in order to guarantee a device isn't freed we have 
to lock all devices.

Adding a kref per IB device. Before an IB device is unregistered, we 
wait until it's not held anymore.

>
>> Adding a kref per IB device. Before an IB device
>> is unregistered, we wait before its not held anymore.

^ permalink raw reply	[flat|nested] 82+ messages in thread

* Re: [PATCH v3 for-next 02/33] IB/core: Add kref to IB devices
       [not found]             ` <553DF294.4070507-VPRAkNaXOzVWk0Htik3J/w@public.gmane.org>
@ 2015-04-27 16:22               ` Jason Gunthorpe
       [not found]                 ` <20150427162256.GA24316-ePGOBjL8dl3ta4EC/59zMFaTQe2KTcn/@public.gmane.org>
  2015-04-28 11:51               ` Or Gerlitz
  1 sibling, 1 reply; 82+ messages in thread
From: Jason Gunthorpe @ 2015-04-27 16:22 UTC (permalink / raw)
  To: Matan Barak
  Cc: Or Gerlitz, Somnath Kotur, Roland Dreier,
	linux-rdma-u79uwXL29TY76Z2rM5mHXA

On Mon, Apr 27, 2015 at 11:25:56AM +0300, Matan Barak wrote:
> 
> 
> On 4/26/2015 11:10 PM, Or Gerlitz wrote:
> >On Thu, Mar 26, 2015 at 12:19 AM, Somnath Kotur
> ><somnath.kotur-laKkSmNT4hbQT0dZR+AlfA@public.gmane.org> wrote:
> >>From: Matan Barak <matanb-VPRAkNaXOzVWk0Htik3J/w@public.gmane.org>
> >>
> >>Previously. we used device_mutex lock in order to protect
> >>the device's list. That means that in order to guarantee a
> >>device isn't freed while we use it, we had to lock all
> >>devices.
> >
> >Matan, looking on the cover letter, it says: "[...] Patch 0002 adds a
> >reference count mechanism to IB devices. This mechanism is similar to
> >dev_hold and dev_put available for net devices. This is mandatory for
> >later patches [...]"
> >
> >So in that respect, saying here "Previously. we used device_mutex
> >lock" is a bit cryptic, @ least one typo must exist in this sentence,
> >right? did you want to say "Currently we use device_mutex lock for XXX
> >[...] and this should be replaced as of a YYY change which is to be
> >introduced [...]" please clarify
> 
> Correct, I'll change that into:
> 
> Currently we use device_mutex lock for protecting the device's list.
> In the current approach, in order to guarantee a device isn't freed
> we have to lock all devices.
> 
> Adding a kref per IB device. Before an IB device is unregistered, we
> wait until it's not held anymore.

Why do we need two krefs for this structure? There is already a kref
inside the embedded 'struct device dev'

Sounds wrong to me without a lot more explanation.

Jason

^ permalink raw reply	[flat|nested] 82+ messages in thread

* Re: [PATCH v3 for-next 01/33] IB/core: Add RoCE GID cache
       [not found]             ` <553DE614.7050508-VPRAkNaXOzVWk0Htik3J/w@public.gmane.org>
@ 2015-04-27 18:22               ` Or Gerlitz
       [not found]                 ` <CAJ3xEMjEhv3Nm_EfFcBWLk1ChQXBM5KvPxh5DstCqxeMo0MGwA-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
  0 siblings, 1 reply; 82+ messages in thread
From: Or Gerlitz @ 2015-04-27 18:22 UTC (permalink / raw)
  To: Matan Barak; +Cc: Somnath Kotur, linux-rdma-u79uwXL29TY76Z2rM5mHXA, Haggai Eran

On Mon, Apr 27, 2015 at 10:32 AM, Matan Barak <matanb-VPRAkNaXOzVWk0Htik3J/w@public.gmane.org> wrote:
> On 4/26/2015 8:20 PM, Or Gerlitz wrote:
>> On Thu, Mar 26, 2015 at 12:19 AM, Somnath Kotur
>> <somnath.kotur-laKkSmNT4hbQT0dZR+AlfA@public.gmane.org> wrote:
>>> From: Matan Barak <matanb-VPRAkNaXOzVWk0Htik3J/w@public.gmane.org>

>>> In order to manage multiple types, vlans and MACs per GID, we
>>> need to store them along the GID itself. We store the net device
>>> as well, as sometimes GIDs should be handled according to the
>>> net device they came from. Since populating the GID table should
>>> be identical for every RoCE provider, the GIDs table should be
>>> handled in ib_core.
>>>
>>> Adding a GID cache table that supports a lockless find, add and
>>> delete gids. The lockless nature comes from using a unique
>>> sequence number per table entry and detecting that while reading/
>>> writing this sequence wasn't changed.

>> Matan, please use existing mechanism which fits the problem you are
>> trying to solve, I guess one of RCU or seqlock should do the job.

> seqcount fits this problem better. If a write and a read are done in
> parallel, there's a good chance we read an out-of-date entry and will
> use a GID entry that's going to change at T+epsilon, so RCU
> doesn't really have an advantage here.

So going back to the problem... we are talking about reads by
applications/drivers that attempt to establish new connections, and
writes done on behalf of IP stack changes - both very much not critical
path. So this is kind of similar to the neighbour table maintained by
the ND subsystem, which is used by all IP based networking applications,
and that code uses RCU. I don't see what's wrong with RCU for our sort
of smaller-scale subsystem, or what is even wrong with the simple rwlock
which is the mechanism used today by the IB core GID cache; this goes
too complex and for no reason that I can think of.


> The current implementation is a bit more efficient than seqcount, as it
> allows early termination of read-while-write (because the write puts a known
> "currently updating" value that the read knows to ignore). AFAIK, this
> doesn't exist in the current seqcount implementation. However, since this
> isn't a crucial data-path, I'll change that to seqcount.
>
> seqcount is preferred over seqlock, as I don't need the spinlock in seqlock.

^ permalink raw reply	[flat|nested] 82+ messages in thread

* Re: [PATCH v3 for-next 01/33] IB/core: Add RoCE GID cache
       [not found]                 ` <CAJ3xEMjEhv3Nm_EfFcBWLk1ChQXBM5KvPxh5DstCqxeMo0MGwA-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
@ 2015-04-28  7:17                   ` Matan Barak
       [not found]                     ` <553F341D.8000907-VPRAkNaXOzVWk0Htik3J/w@public.gmane.org>
  0 siblings, 1 reply; 82+ messages in thread
From: Matan Barak @ 2015-04-28  7:17 UTC (permalink / raw)
  To: Or Gerlitz; +Cc: Somnath Kotur, linux-rdma-u79uwXL29TY76Z2rM5mHXA, Haggai Eran



On 4/27/2015 9:22 PM, Or Gerlitz wrote:
> On Mon, Apr 27, 2015 at 10:32 AM, Matan Barak <matanb-VPRAkNaXOzVWk0Htik3J/w@public.gmane.org> wrote:
>> On 4/26/2015 8:20 PM, Or Gerlitz wrote:
>>> On Thu, Mar 26, 2015 at 12:19 AM, Somnath Kotur
>>> <somnath.kotur-laKkSmNT4hbQT0dZR+AlfA@public.gmane.org> wrote:
>>>> From: Matan Barak <matanb-VPRAkNaXOzVWk0Htik3J/w@public.gmane.org>
>
>>>> In order to manage multiple types, vlans and MACs per GID, we
>>>> need to store them along the GID itself. We store the net device
>>>> as well, as sometimes GIDs should be handled according to the
>>>> net device they came from. Since populating the GID table should
>>>> be identical for every RoCE provider, the GIDs table should be
>>>> handled in ib_core.
>>>>
>>>> Adding a GID cache table that supports a lockless find, add and
>>>> delete gids. The lockless nature comes from using a unique
>>>> sequence number per table entry and detecting that while reading/
>>>> writing this sequence wasn't changed.
>
>>> Matan, please use existing mechanism which fits the problem you are
>>> trying to solve, I guess one of RCU or seqlock should do the job.
>
>> seqcount fits this problem better. If a write and a read are done in
>> parallel, there's a good chance we read an out-of-date entry and will
>> use a GID entry that's going to change at T+epsilon, so RCU
>> doesn't really have an advantage here.
>
> So going back to the problem... we are talking about reads by
> applications/drivers that attempt to establish new connections, and
> writes done on behalf of IP stack changes - both very much not critical
> path. So this is kind of similar to the neighbour table maintained by
> the ND subsystem, which is used by all IP based networking applications,
> and that code uses RCU. I don't see what's wrong with RCU for our sort
> of smaller-scale subsystem, or what is even wrong with the simple rwlock
> which is the mechanism used today by the IB core GID cache; this goes
> too complex and for no reason that I can think of.
>

I think the real question is why we should deal with RCU, which would 
require re-allocation of entries when that isn't necessary, or why we 
would want a rwlock, if the kernel provides a mechanism (called 
seqcount) that fits this problem better.
I disagree about seqcount being complex - if you look at its API you'll 
find it's a lot simpler than RCU.
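
To make the re-allocation point concrete, here is roughly what an
RCU-based update of a hypothetical entry would involve:

#include <linux/slab.h>
#include <linux/rcupdate.h>
#include <rdma/ib_verbs.h>

struct gid_entry {
        union ib_gid    gid;
        struct rcu_head rcu;
};

/* RCU: every update copies into a fresh allocation and frees the old
 * entry after a grace period; readers run under rcu_read_lock(). */
static int update_entry_rcu(struct gid_entry __rcu **slot,
                            const union ib_gid *gid)
{
        struct gid_entry *new, *old;

        new = kmalloc(sizeof(*new), GFP_KERNEL);
        if (!new)
                return -ENOMEM;
        new->gid = *gid;
        old = rcu_dereference_protected(*slot, 1); /* writers serialized */
        rcu_assign_pointer(*slot, new);
        if (old)
                kfree_rcu(old, rcu);
        return 0;
}

With seqcount, the same update is just a write_seqcount_begin/end pair
around an in-place copy, with no allocation at all.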

>
>> The current implementation is a bit more efficient than seqcount, as it
>> allows early termination of read-while-write (because the write puts a known
>> "currently updating" value that the read knows to ignore). AFAIK, this
>> doesn't exist in the current seqcount implementation. However, since this
>> isn't a crucial data-path, I'll change that to seqcount.
>>
>> seqcount is preferred over seqlock, as I don't need the spinlock in seqlock.

^ permalink raw reply	[flat|nested] 82+ messages in thread

* Re: [PATCH v3 for-next 02/33] IB/core: Add kref to IB devices
       [not found]                 ` <20150427162256.GA24316-ePGOBjL8dl3ta4EC/59zMFaTQe2KTcn/@public.gmane.org>
@ 2015-04-28  8:32                   ` Matan Barak
       [not found]                     ` <553F4588.80301-VPRAkNaXOzVWk0Htik3J/w@public.gmane.org>
  0 siblings, 1 reply; 82+ messages in thread
From: Matan Barak @ 2015-04-28  8:32 UTC (permalink / raw)
  To: Jason Gunthorpe
  Cc: Or Gerlitz, Somnath Kotur, Roland Dreier,
	linux-rdma-u79uwXL29TY76Z2rM5mHXA



On 4/27/2015 7:22 PM, Jason Gunthorpe wrote:
> On Mon, Apr 27, 2015 at 11:25:56AM +0300, Matan Barak wrote:
>>
>>
>> On 4/26/2015 11:10 PM, Or Gerlitz wrote:
>>> On Thu, Mar 26, 2015 at 12:19 AM, Somnath Kotur
>>> <somnath.kotur-laKkSmNT4hbQT0dZR+AlfA@public.gmane.org> wrote:
>>>> From: Matan Barak <matanb-VPRAkNaXOzVWk0Htik3J/w@public.gmane.org>
>>>>
>>>> Previously. we used device_mutex lock in order to protect
>>>> the device's list. That means that in order to guarantee a
>>>> device isn't freed while we use it, we had to lock all
>>>> devices.
>>>
>>> Matan, looking on the cover letter, it says: "[...] Patch 0002 adds a
>>> reference count mechanism to IB devices. This mechanism is similar to
>>> dev_hold and dev_put available for net devices. This is mandatory for
>>> later patches [...]"
>>>
>>> So in that respect, saying here "Previously. we used device_mutex
>>> lock" is a bit cryptic, @ least one typo must exist in this sentence,
>>> right? did you want to say "Currently we use device_mutex lock for XXX
>>> [...] and this should be replaced as of a YYY change which is to be
>>> introduced [...]" please clarify
>>
>> Correct, I'll change that into:
>>
>> Currently we use device_mutex lock for protecting the device's list.
>> In the current approach, in order to guarantee a device isn't freed
>> we have to lock all devices.
>>
>> Adding a kref per IB device. Before an IB device is unregistered, we
>> wait until it's not held anymore.
>
> Why do we need two krefs for this structure? There is already a kref
> inside the embedded 'struct device dev'
>
> Sounds wrong to me without a lot more explanation.
>

This was already asked by Haggai Eran a while ago and was answered. 
Anyway, in ib_unregister_device we delete all client-related data.
We would like to ensure that all references have been released before 
this data is deleted. Meaning, we would like to ensure the device is 
still functioning but is no longer referenced, rather than just avoid 
freeing the IB device's memory.
ib_device_hold and ib_device_put are APIs for the clients, similar to
dev_hold and dev_put.
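
A minimal sketch of that pattern (the struct is a stand-in and the
helper names are my reading of the series' intent, not necessarily its
exact code):

#include <linux/kref.h>
#include <linux/completion.h>
#include <linux/kernel.h>

/* Stand-in for the fields this patch adds to struct ib_device. */
struct ib_device_stub {
        struct kref       refcount;
        struct completion free;
};

static void ib_device_release(struct kref *kref)
{
        struct ib_device_stub *dev =
                container_of(kref, struct ib_device_stub, refcount);

        complete(&dev->free);
}

/* Clients hold/put the device, like dev_hold()/dev_put() on a netdev. */
void ib_device_hold(struct ib_device_stub *dev)
{
        kref_get(&dev->refcount);
}

void ib_device_put(struct ib_device_stub *dev)
{
        kref_put(&dev->refcount, ib_device_release);
}

/* ib_unregister_device() then drops its own reference and does
 * wait_for_completion(&dev->free) before deleting client data. */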

> Jason
>

Regards,
Matan

^ permalink raw reply	[flat|nested] 82+ messages in thread

* Re: [PATCH v3 for-next 02/33] IB/core: Add kref to IB devices
       [not found]             ` <553DF294.4070507-VPRAkNaXOzVWk0Htik3J/w@public.gmane.org>
  2015-04-27 16:22               ` Jason Gunthorpe
@ 2015-04-28 11:51               ` Or Gerlitz
       [not found]                 ` <CAJ3xEMjzgS_uR1VaeGzW+jcfG2oiVo4=fCctX6o4OVbKRX2n0Q-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
  1 sibling, 1 reply; 82+ messages in thread
From: Or Gerlitz @ 2015-04-28 11:51 UTC (permalink / raw)
  To: Matan Barak
  Cc: Somnath Kotur, Roland Dreier, linux-rdma-u79uwXL29TY76Z2rM5mHXA

On Mon, Apr 27, 2015 at 11:25 AM, Matan Barak <matanb-VPRAkNaXOzVWk0Htik3J/w@public.gmane.org> wrote:
> On 4/26/2015 11:10 PM, Or Gerlitz wrote:
>> On Thu, Mar 26, 2015 at 12:19 AM, Somnath Kotur
>> <somnath.kotur-laKkSmNT4hbQT0dZR+AlfA@public.gmane.org> wrote:
>>>
>>> From: Matan Barak <matanb-VPRAkNaXOzVWk0Htik3J/w@public.gmane.org>
>>>
>>> Previously. we used device_mutex lock in order to protect
>>> the device's list. That means that in order to guarantee a
>>> device isn't freed while we use it, we had to lock all
>>> devices.
>>
>>
>> Matan, looking on the cover letter, it says: "[...] Patch 0002 adds a
>> reference count mechanism to IB devices. This mechanism is similar to
>> dev_hold and dev_put available for net devices. This is mandatory for
>> later patches [...]"

> Correct, I'll change that into:
> Currently we use device_mutex lock for protecting the device's list. In the
> current approach, in order to guarantee a device isn't freed we have to lock
> all devices.
> Adding a kref per IB device. Before an IB device is unregistered, we wait
> until it's not held anymore.

Why is this change mandatory for the proposed design?

^ permalink raw reply	[flat|nested] 82+ messages in thread

* Re: [PATCH v3 for-next 01/33] IB/core: Add RoCE GID cache
       [not found]                     ` <553F341D.8000907-VPRAkNaXOzVWk0Htik3J/w@public.gmane.org>
@ 2015-04-28 12:57                       ` Or Gerlitz
  0 siblings, 0 replies; 82+ messages in thread
From: Or Gerlitz @ 2015-04-28 12:57 UTC (permalink / raw)
  To: Matan Barak; +Cc: Somnath Kotur, linux-rdma-u79uwXL29TY76Z2rM5mHXA, Haggai Eran

On Tue, Apr 28, 2015 at 10:17 AM, Matan Barak <matanb-VPRAkNaXOzVWk0Htik3J/w@public.gmane.org> wrote:
> On 4/27/2015 9:22 PM, Or Gerlitz wrote:
> I think the real question is why we should deal with RCU, which would
> require re-allocation of entries when that isn't necessary, or why we
> would want a rwlock, if the kernel provides a mechanism (called
> seqcount) that fits this problem better.
> I disagree about seqcount being complex - if you look at its API you'll
> find it's a lot simpler than RCU.

I took a 2nd look; seqcount is indeed way simpler than RCU, and by
itself is simple to use. If you feel this provides a better solution
vs. a simple rwlock, I'm good with that.

Or.

^ permalink raw reply	[flat|nested] 82+ messages in thread

* Re: [PATCH v3 for-next 02/33] IB/core: Add kref to IB devices
       [not found]                 ` <CAJ3xEMjzgS_uR1VaeGzW+jcfG2oiVo4=fCctX6o4OVbKRX2n0Q-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
@ 2015-04-28 14:03                   ` Matan Barak
       [not found]                     ` <553F931F.6000302-VPRAkNaXOzVWk0Htik3J/w@public.gmane.org>
  0 siblings, 1 reply; 82+ messages in thread
From: Matan Barak @ 2015-04-28 14:03 UTC (permalink / raw)
  To: Or Gerlitz
  Cc: Somnath Kotur, Roland Dreier, linux-rdma-u79uwXL29TY76Z2rM5mHXA



On 4/28/2015 2:51 PM, Or Gerlitz wrote:
> On Mon, Apr 27, 2015 at 11:25 AM, Matan Barak <matanb-VPRAkNaXOzVWk0Htik3J/w@public.gmane.org> wrote:
>> On 4/26/2015 11:10 PM, Or Gerlitz wrote:
>>> On Thu, Mar 26, 2015 at 12:19 AM, Somnath Kotur
>>> <somnath.kotur-laKkSmNT4hbQT0dZR+AlfA@public.gmane.org> wrote:
>>>>
>>>> From: Matan Barak <matanb-VPRAkNaXOzVWk0Htik3J/w@public.gmane.org>
>>>>
>>>> Previously. we used device_mutex lock in order to protect
>>>> the device's list. That means that in order to guarantee a
>>>> device isn't freed while we use it, we had to lock all
>>>> devices.
>>>
>>>
>>> Matan, looking on the cover letter, it says: "[...] Patch 0002 adds a
>>> reference count mechanism to IB devices. This mechanism is similar to
>>> dev_hold and dev_put available for net devices. This is mandatory for
>>> later patches [...]"
>
>> Correct, I'll change that into:
>> Currently we use device_mutex lock for protecting the device's list. In the
>> current approach, in order to guarantee a device isn't freed we have to lock
>> all devices.
>> Adding a kref per IB device. Before an IB device is unregistered, we wait
>> until it's not held anymore.
>
> Why is this change mandatory for the proposed design?
>

The cleanup of roce_gid_cache is done in a different context, so we need 
to make sure the device is still alive while doing so. In addition, we 
don't want the unregistration process of ib_core to free our context 
data.
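
Schematically (with assumed declarations for the helpers involved), the
cleanup takes a reference and defers the real work, so the device must
outlive the queued work item:

#include <linux/kernel.h>
#include <linux/slab.h>
#include <linux/workqueue.h>

struct ib_device;

/* Assumed to exist elsewhere in this series. */
extern struct workqueue_struct *roce_gid_mgmt_wq;
void roce_gid_cache_cleanup_one(struct ib_device *ib_dev);
void ib_device_hold(struct ib_device *ib_dev);
void ib_device_put(struct ib_device *ib_dev);

struct cleanup_ctx {
        struct work_struct work;
        struct ib_device  *ib_dev;
};

static void cleanup_work_handler(struct work_struct *work)
{
        struct cleanup_ctx *ctx =
                container_of(work, struct cleanup_ctx, work);

        roce_gid_cache_cleanup_one(ctx->ib_dev);
        ib_device_put(ctx->ib_dev);     /* last ref may free the device */
        kfree(ctx);
}

static void client_cleanup(struct ib_device *ib_dev)
{
        struct cleanup_ctx *ctx = kmalloc(sizeof(*ctx), GFP_KERNEL);

        if (!ctx)
                return;
        ib_device_hold(ib_dev);         /* keep ib_dev alive for the work */
        ctx->ib_dev = ib_dev;
        INIT_WORK(&ctx->work, cleanup_work_handler);
        queue_work(roce_gid_mgmt_wq, &ctx->work);
}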

^ permalink raw reply	[flat|nested] 82+ messages in thread

* Re: [PATCH v3 for-next 02/33] IB/core: Add kref to IB devices
       [not found]                     ` <553F4588.80301-VPRAkNaXOzVWk0Htik3J/w@public.gmane.org>
@ 2015-04-28 16:03                       ` Jason Gunthorpe
       [not found]                         ` <20150428160315.GA5497-ePGOBjL8dl3ta4EC/59zMFaTQe2KTcn/@public.gmane.org>
  0 siblings, 1 reply; 82+ messages in thread
From: Jason Gunthorpe @ 2015-04-28 16:03 UTC (permalink / raw)
  To: Matan Barak
  Cc: Or Gerlitz, Somnath Kotur, Roland Dreier,
	linux-rdma-u79uwXL29TY76Z2rM5mHXA

On Tue, Apr 28, 2015 at 11:32:08AM +0300, Matan Barak wrote:

> This was already asked by Haggai Eran a while ago and was answered.
> Anyway, in ib_unregister_device we delete all client-related data.
> We would like to ensure that all references have been released before
> this data is deleted. Meaning, we would like to ensure the device is
> still functioning but is no longer referenced, rather than just avoid
> freeing the IB device's memory.

A kref is the wrong data structure for that purpose.

Jason

^ permalink raw reply	[flat|nested] 82+ messages in thread

* Re: [PATCH v3 for-next 02/33] IB/core: Add kref to IB devices
       [not found]                         ` <20150428160315.GA5497-ePGOBjL8dl3ta4EC/59zMFaTQe2KTcn/@public.gmane.org>
@ 2015-04-28 16:17                           ` Matan Barak
  0 siblings, 0 replies; 82+ messages in thread
From: Matan Barak @ 2015-04-28 16:17 UTC (permalink / raw)
  To: Jason Gunthorpe
  Cc: Or Gerlitz, Somnath Kotur, Roland Dreier,
	linux-rdma-u79uwXL29TY76Z2rM5mHXA



On 4/28/2015 7:03 PM, Jason Gunthorpe wrote:
> On Tue, Apr 28, 2015 at 11:32:08AM +0300, Matan Barak wrote:
>
>> This was already asked by Haggai Eran a while ago and was answered.
>> Anyway, in ib_unregister_device we delete all client-related data.
>> We would like to ensure that all references have been released before
>> this data is deleted. Meaning, we would like to ensure the device is
>> still functioning but is no longer referenced, rather than just avoid
>> freeing the IB device's memory.
>
> A kref is the wrong data structure for that purpose.

What is the right data structure in your opinion?

>
> Jason
>

Matan

^ permalink raw reply	[flat|nested] 82+ messages in thread

* Re: [PATCH v3 for-next 02/33] IB/core: Add kref to IB devices
       [not found]                     ` <553F931F.6000302-VPRAkNaXOzVWk0Htik3J/w@public.gmane.org>
@ 2015-04-28 17:43                       ` Jason Gunthorpe
       [not found]                         ` <20150428174312.GB5497-ePGOBjL8dl3ta4EC/59zMFaTQe2KTcn/@public.gmane.org>
  0 siblings, 1 reply; 82+ messages in thread
From: Jason Gunthorpe @ 2015-04-28 17:43 UTC (permalink / raw)
  To: Matan Barak
  Cc: Or Gerlitz, Somnath Kotur, Roland Dreier,
	linux-rdma-u79uwXL29TY76Z2rM5mHXA

On Tue, Apr 28, 2015 at 05:03:11PM +0300, Matan Barak wrote:
 
> The cleanup of roce_gid_cache is done in a different context, so we
> need to make sure the device is still alive while doing so. 

This explanation doesn't look right to me. I don't see anything like
that under roce_gid_cache_cleanup_one ?

Although, there must be a call to the driver's modify_gid to free
context before freeing, and I don't see that obviously happening..

However, all the other async work launched doesn't look safe at all.

So, did you mean that the device must still be alive while all the
other work is running? And the point of this scheme is to guarantee
all the work is flushed? (at least I hope it is, otherwise there are
bigger problems here)

It is just fundamentally wrong to return from ib_client.remove while
async work is still outstanding, the client is expected to deal with
this internally and synchronously.

You don't need IB core help to do this.

Or: This should have been fixed after Haggai brought it up...

Jason

^ permalink raw reply	[flat|nested] 82+ messages in thread

* Re: [PATCH v3 for-next 02/33] IB/core: Add kref to IB devices
       [not found]                         ` <20150428174312.GB5497-ePGOBjL8dl3ta4EC/59zMFaTQe2KTcn/@public.gmane.org>
@ 2015-04-28 19:04                           ` Or Gerlitz
  2015-04-29  9:16                           ` Matan Barak
  2015-04-29 15:29                           ` Matan Barak
  2 siblings, 0 replies; 82+ messages in thread
From: Or Gerlitz @ 2015-04-28 19:04 UTC (permalink / raw)
  To: Jason Gunthorpe
  Cc: Matan Barak, Somnath Kotur, linux-rdma-u79uwXL29TY76Z2rM5mHXA,
	Haggai Eran

On Tue, Apr 28, 2015 at 8:43 PM, Jason Gunthorpe
<jgunthorpe-ePGOBjL8dl3ta4EC/59zMFaTQe2KTcn/@public.gmane.org> wrote:
> On Tue, Apr 28, 2015 at 05:03:11PM +0300, Matan Barak wrote:

[...]
> Or: This should have been fixed after Haggai brought it up...

Jason, looking again at the correspondence between Matan and Haggai, I
think this one was sort of left in the air (or actually fell on the
floor); it happens, and indeed we should strive to do better and avoid
that. Thanks for the 2nd eye.

Or.

^ permalink raw reply	[flat|nested] 82+ messages in thread

* Re: [PATCH v3 for-next 02/33] IB/core: Add kref to IB devices
       [not found]                         ` <20150428174312.GB5497-ePGOBjL8dl3ta4EC/59zMFaTQe2KTcn/@public.gmane.org>
  2015-04-28 19:04                           ` Or Gerlitz
  2015-04-29  9:16                           ` Matan Barak
@ 2015-04-29 15:29                           ` Matan Barak
       [not found]                             ` <5540F8F4.5010906-LDSdmyG8hGV8YrgS2mwiifqBs+8SCbDb@public.gmane.org>
  2 siblings, 1 reply; 82+ messages in thread
From: Matan Barak @ 2015-04-29 15:29 UTC (permalink / raw)
  To: Jason Gunthorpe
  Cc: Or Gerlitz, Somnath Kotur, Roland Dreier,
	linux-rdma-u79uwXL29TY76Z2rM5mHXA



On 4/28/2015 8:43 PM, Jason Gunthorpe wrote:
> On Tue, Apr 28, 2015 at 05:03:11PM +0300, Matan Barak wrote:
>
>> The cleanup of roce_gid_cache is done in a different context, so we
>> need to make sure the device is still alive while doing so.
>
> This explanation doesn't look right to me. I don't see anything like
> that under roce_gid_cache_cleanup_one ?
>
> Although, there must be a call to the driver's modify_gid to free
> context before freeing, and I don't see that obviously happening..
>

Of course there is:
roce_gid_cache_client_cleanup_one -> 
roce_gid_cache_client_cleanup_one_work -> *work* -> 
roce_gid_cache_client_cleanup_work_handler -> roce_gid_cache_cleanup_one 
-> free_roce_gid_cache -> write_gid -> modify_gid

> However, all the other async work launched doesn't look safe at all.
>

I think it's safe - roce_gid_cache_client_cleanup_one deactivates the 
cache (no new events are handled on the cache). Ongoing events are 
flushed in roce_gid_cache_client_cleanup_one_work 
(flush_workqueue(roce_gid_mgmt_wq)). By the time roce_gid_cache_cleanup_one 
is called, no work is queued or still in progress on this cache.

> So, did you mean that the device must still be alive while all the
> other work is running? And the point of this scheme is to guarantee
> all the work is flushed? (at least I hope it is, otherwise there are
> bigger problems here)
>

It's not safe to free the client's context in ib_unregister_device while 
the client isn't done. The obvious solution is to wait in client->remove 
(like you suggested) until the client has finished cleaning up things. 
This doesn't fit our case - since client->remove is called under 
device_lock, but it's possible (for example) that 
roce_rescan_device_work_handler is currently running and waits to grab 
this exact mutex - DEAD LOCK.
Freeing things asynchronously is a possible solution - we don't lock all 
IB devices, just our device. Moreover, it's also more future-proof, as 
other clients might want to run other things concurrently.

> It is just fundamentally wrong to return from ib_client.remove while
> async work is still outstanding, the client is expected to deal with
> this internally and synchronously.
>
> You don't need IB core help to do this.
>

netdev is taking a similar approach - please take a look at 
netdev_wait_allrefs

> Or: This should have been fixed after Haggai brought it up...
>
> Jason
>

Matan

^ permalink raw reply	[flat|nested] 82+ messages in thread

* Re: [PATCH v3 for-next 02/33] IB/core: Add kref to IB devices
       [not found]                             ` <5540F8F4.5010906-LDSdmyG8hGV8YrgS2mwiifqBs+8SCbDb@public.gmane.org>
@ 2015-04-29 16:48                               ` Jason Gunthorpe
       [not found]                                 ` <20150429164847.GA12781-ePGOBjL8dl3ta4EC/59zMFaTQe2KTcn/@public.gmane.org>
  0 siblings, 1 reply; 82+ messages in thread
From: Jason Gunthorpe @ 2015-04-29 16:48 UTC (permalink / raw)
  To: matanb-VPRAkNaXOzVWk0Htik3J/w
  Cc: Or Gerlitz, Somnath Kotur, Roland Dreier,
	linux-rdma-u79uwXL29TY76Z2rM5mHXA


> >Although, there must be a call to the driver's modify_gid to free
> >context before freeing, and I don't see that obviously happening..
 
> Of course there is:
> roce_gid_cache_client_cleanup_one ->
> roce_gid_cache_client_cleanup_one_work -> *work* ->
> roce_gid_cache_client_cleanup_work_handler ->
> roce_gid_cache_cleanup_one -> free_roce_gid_cache -> write_gid ->
> modify_gid

Ah, this wasn't there in the earlier versions. Good.
 
> >However, all the other async work launched doesn't look safe at all.

> I think it's safe - roce_gid_cache_client_cleanup_one deactivates
> the cache (no new events are handled on the cache). Ongoing events
> are flushed in roce_gid_cache_client_cleanup_one_work

Do you mean roce_gid_cache_client_cleanup_work_handler? 

> It's not safe to free the client's context in ib_unregister_device
> while the client isn't done. The obvious solution is to wait in
> client->remove (like you suggested) until the client has finished
> cleaning up things.

This isn't just the obvious solution, it is the *expected* solution.
In the kernel the add/remove style idiom always works like this.

> This doesn't fit our case - since client->remove is called under
> device_lock, but it's possible (for example) that
> roce_rescan_device_work_handler is currently running and waits to
> grab this exact mutex - DEAD LOCK.

Uh, client->remove is most obviously called with
drivers/infiniband/core/device.c:device_mutex held, is that what you
mean? But that can't be right because only four functions hold that
lock and none of them are obviously called from this patch?

If these patches have a locking problem then breaking the add/remove
idiom is not the way to solve it.

Look, four people have asked about this patch, and I have yet to
see an accurate and convincing answer from you about what is actually
going on here. Please spend some time to properly research and
describe why the remove callback can't be synchronous.

> >It is just fundamentally wrong to return from ib_client.remove while
> >async work is still outstanding, the client is expected to deal with
> >this internally and synchronously.
> >
> >You don't need IB core help to do this.
> 
> netdev is taking a similar approach - please take a look at
> netdev_wait_allrefs

No, it really isn't: from the attached driver's perspective, after
unregister_netdevice returns, the driver is not allowed to operate the
net device anymore. netdev_wait_allrefs is part of making
unregister_netdevice synchronous.
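
Roughly, the idiom netdev_wait_allrefs implements (heavily simplified):

#include <linux/delay.h>
#include <linux/netdevice.h>

/* Heavily simplified: unregister does not complete until every
 * outstanding reference to the net_device has been dropped. */
static void wait_allrefs(struct net_device *dev)
{
        while (netdev_refcnt_read(dev) != 0)
                msleep(250);
}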

The same is true of our IB client attaches: after a client returns
from remove it is not allowed to operate the IB device anymore.

That is a standard idiom, and we'd need a huge compelling reason to go
away from that.

Jason

^ permalink raw reply	[flat|nested] 82+ messages in thread

* Re: [PATCH v3 for-next 02/33] IB/core: Add kref to IB devices
       [not found]                                 ` <20150429164847.GA12781-ePGOBjL8dl3ta4EC/59zMFaTQe2KTcn/@public.gmane.org>
@ 2015-04-30  8:21                                   ` Matan Barak
       [not found]                                     ` <5541E5ED.7000606-VPRAkNaXOzVWk0Htik3J/w@public.gmane.org>
  0 siblings, 1 reply; 82+ messages in thread
From: Matan Barak @ 2015-04-30  8:21 UTC (permalink / raw)
  To: Jason Gunthorpe
  Cc: Or Gerlitz, Somnath Kotur, Roland Dreier,
	linux-rdma-u79uwXL29TY76Z2rM5mHXA



On 4/29/2015 7:48 PM, Jason Gunthorpe wrote:
>
>>> Although, there must be a call to the driver's modify_gid to free
>>> context before freeing, and I don't see that obviously happening..
>
>> Of course there is:
>> roce_gid_cache_client_cleanup_one ->
>> roce_gid_cache_client_cleanup_one_work -> *work* ->
>> roce_gid_cache_client_cleanup_work_handler ->
>> roce_gid_cache_cleanup_one -> free_roce_gid_cache -> write_gid ->
>> modify_gid
>
> Ah, this wasn't there in the earlier versions. Good.
>
>>> However, all the other async work launched doesn't look safe at all.
>
>> I think it's safe - roce_gid_cache_client_cleanup_one deactivates
>> the cache (no new events are handled on the cache). Ongoing events
>> are flushed in roce_gid_cache_client_cleanup_one_work
>
> Do you mean roce_gid_cache_client_cleanup_work_handler?
>

Yeah, the flush_workqueue is done in 
roce_gid_cache_client_cleanup_work_handler

>> It's not safe to free the client's context in ib_unregister_device
>> while the client isn't done. The obvious solution is to wait in
>> client->remove (like you suggested) until the client has finished
>> cleaning up things.
>
> This isn't just the obvious solution, it is the *expected* solution.
> In the kernel the add/remove style idiom always works like this.
>
>> This doesn't fit our case - since client->remove is called under
>> device_lock, but it's possible (for example) that
>> roce_rescan_device_work_handler is currently running and waits to
>> grab this exact mutex - DEAD LOCK.
>
> Uh, client->remove is most obviously called with
> drivers/infiniband/core/device.c:device_mutex held, is that what you
> mean? But that can't be right because only four functions hold that
> lock and none of them are obviously called from this patch?
>
> If these patches have a locking problem then breaking the add/remove
> idiom is not the way to solve it.
>
> Look, four people have asked about this patch, and I still have yet to
> see an accurate and convincing answer from you what is actually going
> on here. Please actually spend some time to properly research and
> describe why the remove callback can't be synchronous.
>

ib_unregister_device calls the remove function with device_mutex held. 
In addition, ib_enum_roce_ports_of_netdev does the same. Every 
interesting netdev/inet/inet6 event that's handled in roce_gid_mgmt 
triggers ib_enum_roce_ports_of_netdev by using the workqueue (for 
example, 
netdevice_event->*work*->netdevice_event_work_handler->ib_enum_roce_ports_of_netdev).

When a device is being unregistered, the remove function is called under 
device_mutex:
ib_unregister_device->roce_gid_cache_client_cleanup_one->*work*->roce_gid_cache_client_cleanup_work_handler->flush_workqueue

This flush_workqueue could wait for ib_enum_roce_ports_of_netdev. Had we 
called it straight from the remove function, we would be waiting for a 
work which waits for a mutex that will be unlocked only after the 
iteration over all remove functions completes -> *DEADLOCK*
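
In code form, the cycle is roughly this (an outline only; the mutex and
workqueue declarations stand in for the real ones in device.c and
roce_gid_mgmt):

#include <linux/mutex.h>
#include <linux/workqueue.h>

struct ib_device;

static DEFINE_MUTEX(device_mutex);
static struct workqueue_struct *roce_gid_mgmt_wq;

/* Thread A: remove callback, invoked with device_mutex already held. */
static void client_remove_sync(struct ib_device *ib_dev)
{
        /* Would block until every queued work item finishes... */
        flush_workqueue(roce_gid_mgmt_wq);
}

/* Thread B: a pending netdev event handler on roce_gid_mgmt_wq. */
static void netdev_event_work(struct work_struct *work)
{
        /*
         * ...but this work item is blocked here, waiting for the very
         * mutex thread A holds: A waits for B, B waits for A.
         */
        mutex_lock(&device_mutex);
        /* ib_enum_roce_ports_of_netdev(...); */
        mutex_unlock(&device_mutex);
}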

You could argue that flush_workqueue isn't needed, but then look at 
the following flow:

roce_gid_cache_client_setup_one->roce_rescan_device->*work (with the 
exact ib_dev)*->....
We need to make sure the ib_dev isn't freed before this work is done.

There might be some ways around it - for example, introduce another 
workqueue for roce_rescan_device and flush this workqueue only. Every 
way has its advantages and disadvantages. I think it's problematic that 
device_mutex can't be held in a work, as *most* client works are 
synchronized when a device is being unregistered. It could affect 
future clients as well.

I'll be happy to explain/fix any issue you have regarding this code, but 
of course it needs to be concrete.

>>> It is just fundamentally wrong to return from ib_client.remove while
>>> async work is still outstanding, the client is expected to deal with
>>> this internally and synchronously.
>>>
>>> You don't need IB core help to do this.
>>
>> netdev is taking a similar approach - please take a look at
>> netdev_wait_allrefs
>
> No, it really isn't: from the attached driver's perspective, after
> unregister_netdevice returns, the driver is not allowed to operate the
> net device anymore. netdev_wait_allrefs is part of making
> unregister_netdevice synchronous.
>
> The same is true of our IB client attaches: after a client returns
> from remove it is not allowed to operate the IB device anymore.
>

ib_unregister_device is synchronous in the exact same manner - after it 
returns, no client operates on the IB device. 
wait_for_completion(&device->free) was added for this exact reason.

> That is a standard idiom, and we'd need a huge compelling reason to go
> away from that.

A device can be removed if and only if its reference count is zero. 
That's the only point where we guarantee nobody uses it anymore. That's 
a standard idiom as well.

>
> Jason
>

I really appreciate the review and thanks for looking at this patch,
Matan

^ permalink raw reply	[flat|nested] 82+ messages in thread

* RE: [PATCH v3 for-next 02/33] IB/core: Add kref to IB devices
       [not found]                                     ` <5541E5ED.7000606-VPRAkNaXOzVWk0Htik3J/w@public.gmane.org>
@ 2015-04-30 16:56                                       ` Hefty, Sean
       [not found]                                         ` <1828884A29C6694DAF28B7E6B8A82373A8FC929B-P5GAC/sN6hkd3b2yrw5b5LfspsVTdybXVpNB7YpNyf8@public.gmane.org>
  2015-04-30 17:26                                       ` Jason Gunthorpe
  1 sibling, 1 reply; 82+ messages in thread
From: Hefty, Sean @ 2015-04-30 16:56 UTC (permalink / raw)
  To: Matan Barak, Jason Gunthorpe
  Cc: Or Gerlitz, Somnath Kotur, Roland Dreier,
	linux-rdma-u79uwXL29TY76Z2rM5mHXA

> >> roce_gid_cache_client_cleanup_one ->
> >> roce_gid_cache_client_cleanup_one_work -> *work* ->
> >> roce_gid_cache_client_cleanup_work_handler ->
> >> roce_gid_cache_cleanup_one -> free_roce_gid_cache -> write_gid ->
> >> modify_gid
> >
> > Ah, this wasn't there in the earlier versions. Good.
> >
> >>> However, all the other async work launched doesn't look safe at all.
> >
> >> I think it's safe - roce_gid_cache_client_cleanup_one deactivates
> >> the cache (no new events are handled on the cache). Ongoing events
> >> are flushed in roce_gid_cache_client_cleanup_one_work
> >
> > Do you mean roce_gid_cache_client_cleanup_work_handler?
> >
> 
> Yeah, the flush_workqueue is done in
> roce_gid_cache_client_cleanup_work_handler

This entire cache seems completely over-architected.  Why is this complexity needed?  How frequently do we really expect these "GIDs" to change?

I still don't even buy that a cache is needed at all.  The RDMA CM can accept as input UDP/IP addresses.  RoCEv2 puts UDP/IP addresses on the wire.  Why should there be a conversion from an IP address to a GID to get back to the same IP address?  Why aren't IP addresses passed directly to the drivers?  Just change how the RDMA CM associates an IP address with the RDMA device.
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 82+ messages in thread

* Re: [PATCH v3 for-next 02/33] IB/core: Add kref to IB devices
       [not found]                                     ` <5541E5ED.7000606-VPRAkNaXOzVWk0Htik3J/w@public.gmane.org>
  2015-04-30 16:56                                       ` Hefty, Sean
@ 2015-04-30 17:26                                       ` Jason Gunthorpe
       [not found]                                         ` <20150430172606.GA32666-ePGOBjL8dl3ta4EC/59zMFaTQe2KTcn/@public.gmane.org>
  1 sibling, 1 reply; 82+ messages in thread
From: Jason Gunthorpe @ 2015-04-30 17:26 UTC (permalink / raw)
  To: Matan Barak
  Cc: Or Gerlitz, Somnath Kotur, Roland Dreier,
	linux-rdma-u79uwXL29TY76Z2rM5mHXA

On Thu, Apr 30, 2015 at 11:21:01AM +0300, Matan Barak wrote:

> ib_unregister_device calls the remove function with device_mutex
> held. In addition, ib_enum_roce_ports_of_netdev does the same. Every
> interesting netdev/inet/inet6 event that's handled in roce_gid_mgmt
> triggers ib_enum_roce_ports_of_netdev by using the workqueue (for
> example, netdevice_event->*work*->netdevice_event_work_handler->ib_enum_roce_ports_of_netdev).

So, okay, now it is very clear. This should have been described
explicitly in the kref commit message, for instance:

 ... 

 Later commits in this series are going to extend the use of
 device_mutex via ib_enum_roce_ports_of_netdev resulting in ...
 .. To solve this deadlock introduce ....

Part of the job of the patch author is to make review work better by
highlighting the most troublesome areas; remember we have more patch
authors than reviewers, so work must be pushed onto the author side.

Let's look at the original commit message provided:

 Previously. we used device_mutex lock in order to protect
 the device's list. 

  ** 'previously' is wrong, this patch does nothing to change what
      device_mutex covers, it still protects the device_list and still
      protects the client_list

 That means that in order to guarantee a
 device isn't freed while we use it, we had to lock all
 devices.

  ** No, locking the device_mutex does nothing to protect a device
     from being freed. The existing kref does that. The device_mutex
     protects the device_list from mutation, and
     ib_enum_roce_ports_of_netdev must hold it when it iterates over
     that list.

     It prevents a device from being unregistered.
     Accurate specificity is important in these commit messages.
     Otherwise nobody understands what is being described.

 Adding a kref per IB device. Before an IB device
 is unregistered, we wait before its not held anymore.

  ** Well, that is what the patch did, but the commit message is
     supposed to explain *why* too

Do you understand why this is so confusing?

> You could argue that flush_workqueue isn't needed, but then let's
> look at the following flow:

No, I wouldn't argue that, all the async work obviously needs to
cancel or complete during remove(), that's what I've been saying.

> There might be some ways around it - for example, introduce another
> workqueue for roce_rescan_device and flush this workqueue only.
> Every way has its advantages and disadvantages.

I don't see that either, like I keep saying, all work must complete,
more work queues don't really seem to help that.

> I think it's problematic that device_mutex can't be held in a work
> as *most* client works are synchronized when a device is being
> unregistered. It could affect future clients as well.

But until this patch set added ib_enum_roce_ports_of_netdev the
device_mutex would never conflict with anything a client could do.

So, ultimately, this is really a self-created problem, and in fact, the
problem lies with the introduction of ib_enum_roce_ports_of_netdev -
adding a new use of the device_mutex that is not register/unregister
related exposes the design limitations of that mutex *that Roland
clearly explained in a giant comment!*

So we need to fix that problem, not add a new mechanism. Off the top
of my head:
 - Split device_mutex into client_mutex and device_mutex,
   hold only the client_mutex when working with the client_list.
 - Convert device_mutex into a r/w sem
 - Use a different scheme to match netdevs for
   ib_enum_roce_ports_of_netdev, that doesn't rely on holding
   device_mutex during query

The first two seem really simple to me. I'd do that.
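
For the first option, a rough sketch of the split (the list heads,
field names and exact ordering are assumptions, not the actual ib_core
code):

#include <linux/list.h>
#include <linux/mutex.h>
#include <rdma/ib_verbs.h>

static LIST_HEAD(device_list);
static LIST_HEAD(client_list);
static DEFINE_MUTEX(device_mutex); /* protects device_list only */
static DEFINE_MUTEX(client_mutex); /* protects client_list only */

/* Client-facing enumeration now only contends on device_mutex. */
static void example_enum_roce_ports_of_netdev(void (*cb)(struct ib_device *))
{
	struct ib_device *dev;

	mutex_lock(&device_mutex);
	list_for_each_entry(dev, &device_list, core_list)
		cb(dev);
	mutex_unlock(&device_mutex);
}

static void example_unregister_device(struct ib_device *device)
{
	struct ib_client *client;

	/* Unlink first so enumeration stops finding the device ... */
	mutex_lock(&device_mutex);
	list_del(&device->core_list);
	mutex_unlock(&device_mutex);

	/* ... then call remove() without holding device_mutex, so a
	 * remove() that flushes device_mutex-taking works cannot deadlock. */
	mutex_lock(&client_mutex);
	list_for_each_entry(client, &client_list, list)
		if (client->remove)
			client->remove(device);
	mutex_unlock(&client_mutex);
}

Unlinking before calling remove() also sidesteps the remove()-versus-
enumeration race raised later in this thread.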
 
> >No, it really isn't, from the attached drivers perspective after
> >unregister_netdevice returns the driver is not allowed to operate the
> >net device anymore. netdev_wait_allrefs is part of making
> >unregister_netdevice synchronous.
> >
> >The same is true of our IB client attaches, after a client returns
> >from remove it is not allowed to operate the IB device anymore.
> >
> 
> ib_unregister_device is synchronous in the exact same manner - after
> it returns, no client operates on the IB device.
> wait_for_completion(&device->free) was added for this exact reason.

No, you are missing the layering here.

In this context ib_client is the *driver*, and like all drivers, once
its remove method returns the driver can no longer expect the
attached device is operable, and must guarantee the driver's .text is
freeable.

> >That is a standard idiom, and we'd need a huge compelling reason to go
> >away from that.
> 
> A device can be removed if and only if its reference count is zero.
> That's the only point where we guarantee nobody uses it anymore.
> That's a standard idiom as well.

No actually. The kref is tied to the memory lifetime, and it is pretty
much universal that the memory lifetime and device operable lifetime
(ie still registered) are very different things.

This is why using a kref to describe anything other than memory
lifetime is not correct, and two krefs in a structure is obviously
nonsense.

Some places use an 'active count + completion', like netdev, kernfs,
etc to track active users and use that to block an unregister path,
but that isn't a kref.
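
The 'active count + completion' idiom being contrasted with kref can be
sketched generically as follows (an illustration of the pattern, not
code from netdev or kernfs):

#include <linux/atomic.h>
#include <linux/completion.h>
#include <linux/types.h>

struct example_obj {
	atomic_t active;            /* active users; starts at 1 */
	struct completion released; /* fires when the last user drops */
};

static void example_init(struct example_obj *obj)
{
	atomic_set(&obj->active, 1);
	init_completion(&obj->released);
}

/* Fails once unregister has started - new users are refused. */
static bool example_get(struct example_obj *obj)
{
	return atomic_inc_not_zero(&obj->active);
}

static void example_put(struct example_obj *obj)
{
	if (atomic_dec_and_test(&obj->active))
		complete(&obj->released);
}

/* Unregister: drop the initial count, then block until all active
 * users are gone.  The memory lifetime (kref/kfree) is a separate,
 * independent mechanism layered underneath. */
static void example_unregister(struct example_obj *obj)
{
	example_put(obj);
	wait_for_completion(&obj->released);
}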

Okay, let's break down and understand why this is an important standard
guarantee, for our specific case. Let's generalize this scenario a
bit. roce_gid_cache isn't modular, but let's assume it is (after all
the intent of this patch is to weaken the remove() invariant for
everyone.)

On module remove it will call ib_unregister_client, and when
ib_unregister_client returns the module will become unloaded.

The invariant for module unload requires no module code to be
running and no module code to be runnable in future -- ie all work
must be complete or canceled.

module unload always calls the driver's remove function, that is a
standard idiom.

So, the first thing to notice, the kref patch didn't actually change
ib_unregister_client - it doesn't
'wait_for_completion(&device->free)'.  Immediately we have an
architectural bug: modules can exit while they have outstanding code
runnable in their .text. Oops.

Next, we realize we cannot fix this by waiting on device->free, it is
ostensibly a general lock that all clients use on the ib_device, we
need something isolated to this client.

Finally, we realize if we can isolate something to one client, then
the client can handle it during its own remove() function, we don't
need core support.

Thus, the way out is to make ib_client.remove() completely synchronous
and *guarantee* that the module's .text will never be used again after
remove() returns. Obviously this means all work is complete or
canceled.

*THIS* is why driver remove is idiomatically always synchronous in the
kernel.
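
In code, the invariant amounts to something like this hedged sketch of
a client's remove method (the names are hypothetical):

#include <linux/workqueue.h>
#include <rdma/ib_verbs.h>

static struct workqueue_struct *client_wq;

/*
 * ib_client.remove must be fully synchronous: once it returns, no
 * work, timer or callback belonging to this client may still be
 * runnable, because the module's .text may be freed next.
 */
static void example_client_remove(struct ib_device *device)
{
	/* Stop producing new work for this device first, then wait for
	 * everything already queued to finish. */
	flush_workqueue(client_wq);
	/* Only now is it safe to free the per-device state. */
}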

Please appreciate how much time it takes to explain all of this :(
Just fix it already :(

Jason
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 82+ messages in thread

* Re: [PATCH v3 for-next 02/33] IB/core: Add kref to IB devices
       [not found]                                         ` <1828884A29C6694DAF28B7E6B8A82373A8FC929B-P5GAC/sN6hkd3b2yrw5b5LfspsVTdybXVpNB7YpNyf8@public.gmane.org>
@ 2015-04-30 17:52                                           ` Jason Gunthorpe
       [not found]                                             ` <20150430175252.GB32666-ePGOBjL8dl3ta4EC/59zMFaTQe2KTcn/@public.gmane.org>
  0 siblings, 1 reply; 82+ messages in thread
From: Jason Gunthorpe @ 2015-04-30 17:52 UTC (permalink / raw)
  To: Hefty, Sean
  Cc: Matan Barak, Or Gerlitz, Somnath Kotur, Roland Dreier,
	linux-rdma-u79uwXL29TY76Z2rM5mHXA

On Thu, Apr 30, 2015 at 04:56:13PM +0000, Hefty, Sean wrote:

> I still don't even buy that a cache is needed at all.  The RDMA CM
> can accept as input UDP/IP addresses.  RoCEv2 puts UDP/IP addresses
> on the wire.  Why should there be a conversion from an IP address to
> a GID to get back to the same IP address?  Why aren't IP addresses
> passed directly to the drivers?  Just change how the RDMA CM
> associates an IP address with the RDMA device.

I feel it looks overdesigned too..

But, how could RDMA CM work? Inbound CM messages will be filtered if
the IP is not in the HW GID table? How could UD work?

This current scheme is just so ugly, there are so many wonky
possibilities. What happens if I remove an IP and then add a new one?
The GID index will eventually be re-used, and QPs bound to that gid
index will silently change source IPs. Horrible.

Jason
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 82+ messages in thread

* RE: [PATCH v3 for-next 02/33] IB/core: Add kref to IB devices
       [not found]                                             ` <20150430175252.GB32666-ePGOBjL8dl3ta4EC/59zMFaTQe2KTcn/@public.gmane.org>
@ 2015-04-30 19:21                                               ` Hefty, Sean
       [not found]                                                 ` <1828884A29C6694DAF28B7E6B8A82373A8FC9419-P5GAC/sN6hkd3b2yrw5b5LfspsVTdybXVpNB7YpNyf8@public.gmane.org>
  2015-05-01  6:34                                               ` Matan Barak
  1 sibling, 1 reply; 82+ messages in thread
From: Hefty, Sean @ 2015-04-30 19:21 UTC (permalink / raw)
  To: Jason Gunthorpe
  Cc: Matan Barak, Or Gerlitz, Somnath Kotur, Roland Dreier,
	linux-rdma-u79uwXL29TY76Z2rM5mHXA

> But, how could RDMA CM work? Inbound CM messages will be filtered if
> the IP is not in the HW GID table?

I'm not understanding the issue.

If a device has some requirement to program its assigned IP addresses into some HW table, I don't see why upper layers should be made to care.  The IB CM is essentially handling two different types of CM messages -- one for IB and one for RoCE.  This just adds a third type.  All three types are similar, with some fields ignored and others formatted using different data.  The CM interface can be updated to better reflect reality, rather than pretending that RoCE has path records or anything other than IB-classic has LIDs.  The CM message definitions themselves could also be updated to indicate which fields matter.

> How could UD work?

I haven't given much thought to UD, but since AV creation goes directly to the driver, I still don't see why GIDs need to be used.  The driver can encode whatever it needs to (e.g. GID/IP index) into the AV.
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 82+ messages in thread

* Re: [PATCH v3 for-next 02/33] IB/core: Add kref to IB devices
       [not found]                                                 ` <1828884A29C6694DAF28B7E6B8A82373A8FC9419-P5GAC/sN6hkd3b2yrw5b5LfspsVTdybXVpNB7YpNyf8@public.gmane.org>
@ 2015-04-30 21:28                                                   ` Jason Gunthorpe
       [not found]                                                     ` <20150430212842.GB7709-ePGOBjL8dl3ta4EC/59zMFaTQe2KTcn/@public.gmane.org>
  0 siblings, 1 reply; 82+ messages in thread
From: Jason Gunthorpe @ 2015-04-30 21:28 UTC (permalink / raw)
  To: Hefty, Sean
  Cc: Matan Barak, Or Gerlitz, Somnath Kotur, Roland Dreier,
	linux-rdma-u79uwXL29TY76Z2rM5mHXA

On Thu, Apr 30, 2015 at 07:21:08PM +0000, Hefty, Sean wrote:
> > But, how could RDMA CM work? Inbound CM messages will be filtered if
> > the IP is not in the HW GID table?
> 
> I'm not understanding the issue.
> 
> If a device has some requirement to program its assigned IP
> addresses into some HW table, I don't see why upper layers should be
> made to care.

Okay, I was only thinking about the first couple patches in this
series.  Three drivers will have this HW gid table, so having a driver
helper library in the common code makes sense to me.

But then the series just seems to go crazy - what is with all these
net dev patches? Now we are doing something with bonding? And a lot of
screwy looking rocev2 gid type stuff? And driver updates? And 33 patches worth?

You are totally right, this GID index and GID type stuff is getting
*everywhere*, and it is hard to follow *why* all the changes
are really needed.

Matan, if you want to progress this, then split it up.  Make a patch
to turn the existing roce GID table driver code into a library and
update the drivers. Just that bit alone has already spawned a huge
discussion.

Work the other ancillary unrelated patches in small chunks through
their trees. 

Then come back to rocev2 with an actual core API proposal patch set
that can be discussed.

Honestly, I'm not willing to even look at a patch set this big and
rambly.

Jason
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 82+ messages in thread

* Re: [PATCH v3 for-next 02/33] IB/core: Add kref to IB devices
       [not found]                                         ` <20150430172606.GA32666-ePGOBjL8dl3ta4EC/59zMFaTQe2KTcn/@public.gmane.org>
@ 2015-05-01  6:28                                           ` Matan Barak
       [not found]                                             ` <CAAKD3BBGQwZ_Ainm6MSQjSkaXsJd9M5Vo4oarTLyFiVMQVS5_Q-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
  0 siblings, 1 reply; 82+ messages in thread
From: Matan Barak @ 2015-05-01  6:28 UTC (permalink / raw)
  To: Jason Gunthorpe
  Cc: Matan Barak, Or Gerlitz, Somnath Kotur, Roland Dreier,
	linux-rdma-u79uwXL29TY76Z2rM5mHXA

On Thu, Apr 30, 2015 at 8:26 PM, Jason Gunthorpe
<jgunthorpe-ePGOBjL8dl3ta4EC/59zMFaTQe2KTcn/@public.gmane.org> wrote:
> On Thu, Apr 30, 2015 at 11:21:01AM +0300, Matan Barak wrote:
>
>> ib_unregister_device calls the remove function with device_mutex
>> held. In addition, ib_enum_roce_ports_of_netdev does the same. Every
>> interesting netdev/inet/inet6 event that's handled in roce_gid_mgmt
>> triggers ib_enum_roce_ports_of_netdev by using the workqueue (for
>> example, netdevice_event->*work*->netdevice_event_work_handler->ib_enum_roce_ports_of_netdev).
>
> So, okay, now it is very clear. This should have been described
> explicitly in the kref commit message, for instance:
>
>  ...
>
>  Later commits in this series are going to extend the use of
>  device_mutex via ib_enum_roce_ports_of_netdev resulting in ...
>  .. To solve this deadlock introduce ....
>
> Part of the job of the patch author is to make review work better by
> highlighting the most troublesome areas, remember we have more patch
> authors than reviewers so work must be pushed onto the author side.
>
> Let's look at the original commit message provided:
>
>  Previously. we used device_mutex lock in order to protect
>  the device's list.
>
>   ** 'previously' is wrong, this patch does nothing to change what
>       device_mutex covers, it still protects the device_list and still
>       protects the client_list
>
>  That means that in order to guarantee a
>  device isn't freed while we use it, we had to lock all
>  devices.
>
>   ** No, locking the device_mutex does nothing to protect a device
>      from being freed. The existing kref does that. The device_mutex
>      protects the device_list from mutation, and
>      ib_enum_roce_ports_of_netdev must hold it when it iterates over
>      that list.
>
>      It prevents a device from being unregistered.
>      Accurate specificity is important in these commit messages.
>      Otherwise nobody understands what is being described.
>
>  Adding a kref per IB device. Before an IB device
>  is unregistered, we wait before its not held anymore.
>
>   ** Well, that is what the patch did, but the commit message is
>      supposed to explain *why* too
>
> Do you understand why this is so confusing?
>

I agree, it should have been clarified better. The "why" is as important as the
"what", and an accurate reason could have made the reviewer's life easier.
I'll fix that. Thanks.

>> You could argue that flush_workqueue isn't needed, but then let's
>> look at the following flow:
>
> No, I wouldn't argue that, all the async work obviously needs to
> cancel or complete during remove(), that's what I've been saying.
>
>> There might be some ways around it - for example, introduce another
>> workqueue for roce_rescan_device and flush this workqueue only.
>> Every way has its advantages and disadvantages.
>
> I don't see that either, like I keep saying, all work must complete,
> more work queues don't really seem to help that.
>

I think this is a possible solution, as all works but the "rescan" work
aren't device specific. That means that after we move the rescan work
to another workqueue and synchronize it in the client->remove function,
we could be sure the device won't be used by this client anymore.

The rest of the works only iterate over the *existing* device list. Since
ib_unregister_device does
(a) lock (b) iterate over client->remove (c) remove from list (d) free
(e) unlock,
all of roce_gid_cache's works won't find the device in the list and we'll be safe.

Regarding ib_unregister_client - if we guarantee that after each
client->remove(dev)
dev isn't used by this client, we actually guarantee that after removing all IB
devices no IB device will be used (induction).

What do you think?
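
For concreteness, a sketch of the proposed split (with hypothetical
names): the device-specific rescan gets its own workqueue, whose
handler never takes device_mutex, so remove() can flush it even while
device_mutex is held.

#include <linux/slab.h>
#include <linux/workqueue.h>
#include <rdma/ib_verbs.h>

static struct workqueue_struct *rescan_wq; /* per-device rescans only */

struct rescan_work {
	struct work_struct work;
	struct ib_device *dev;
};

static void rescan_handler(struct work_struct *work)
{
	struct rescan_work *rw = container_of(work, struct rescan_work, work);

	/* ... enumerate netdevs and repopulate rw->dev's GID table,
	 * without ever taking device_mutex ... */
	kfree(rw);
}

static void example_client_remove(struct ib_device *device)
{
	/*
	 * Called with device_mutex held, but safe: rescan_handler never
	 * takes device_mutex, so the flush cannot deadlock, and after it
	 * returns no queued work references the device anymore.
	 */
	flush_workqueue(rescan_wq);
}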

>> I think it's problematic that device_mutex can't be held in a work
>> as *most* client works are synchronized when a device is being
>> unregistered. It could affect future clients as well.
>
> But until this patch set added ib_enum_roce_ports_of_netdev the
> device_mutex would never conflict with anything a client could do.
>
> So, ultimately, this is really a self-created problem, and in fact, the
> problem lies with the introduction of ib_enum_roce_ports_of_netdev -
> adding a new use of the device_mutex that is not register/unregister
> related exposes the design limitations of that mutex *that Roland
> clearly explained in a giant comment!*
>

I agree, while it's a general problem - it was first introduced by using
device_mutex in an asynchronous context (that should be flushed in
the remove function).

> So we need to fix that problem, not add a new mechanism. Off the top
> of my head:
>  - Split device_mutex into client_mutex and device_mutex,
>    hold only the client_mutex when working with the client_list.

Seems like a possible nice solution. I'll look into that.

>  - Convert device_mutex into a r/w sem

I'm not sure this will solve the problem, as besides the new enumerate
devices function, all existing functions update the list - so we'll have
a read-while-write scenario and we'll be in the exact same condition.

>  - Use a different scheme to match netdevs for
>    ib_enum_roce_ports_of_netdev, that doesn't rely on holding
>    device_mutex during query
>

We could switch to an RCU-protected list or something similar,
but I honestly don't think it's worth the complexity.

> The first two seem really simple to me. I'd do that.
>

Agree, but please consider also the addition of another
workqueue as a possible solution. It *does* (seem) to answer
all of your concerns and could be safer and cause fewer code changes.

>> >No, it really isn't, from the attached drivers perspective after
>> >unregister_netdevice returns the driver is not allowed to operate the
>> >net device anymore. netdev_wait_allrefs is part of making
>> >unregister_netdevice synchronous.
>> >
>> >The same is true of our IB client attaches, after a client returns
>> >from remove it is not allowed to operate the IB device anymore.
>> >
>>
>> ib_unregister_device is synchronous in the exact same manner - after
>> it returns, no client operates on the IB device.
>> wait_for_completion(&device->free) was added for this exact reason.
>
> No, you are missing the layering here.
>
> In this context ib_client is the *driver*, and like all drivers, once
> its remove method returns the driver can no longer expect the
> attached device is operable, and must guarantee the driver's .text is
> freeable.
>
>> >That is a standard idiom, and we'd need a huge compelling reason to go
>> >away from that.
>>
>> A device can be removed if and only if its reference count is zero.
>> That's the only point where we guarantee nobody uses it anymore.
>> That's a standard idiom as well.
>
> No actually. The kref is tied to the memory lifetime, and is pretty
> much universal that the memory lifetime and device operable lifetime
> (ie still registered) are very different things.
>
> This is why using a kref to describe anything other than memory
> lifetime is not correct, and two krefs in a structure is obviously
> nonsense.
>
> Some places use an 'active count + completion', like netdev, kernfs,
> etc to track active users and use that to block an unregister path,
> but that isn't a kref.
>

kref just hides an atomic refcount - so you're actually saying using
the abstraction contradicts the kernel object's lifetime strategy?
I get that, I could have used an atomic refcount - but I agree that it's
worth exploring other solutions that will preserve the invariant of
client->remove being synchronous.

> Okay, let's break down and understand why this is an important standard
> guarantee, for our specific case. Let's generalize this scenario a
> bit. roce_gid_cache isn't modular, but let's assume it is (after all
> the intent of this patch is to weaken the remove() invariant for
> everyone.)
>
> On module remove it will call ib_unregister_client, and when
> ib_unregister_client returns the module will become unloaded.
>
> The invariant for module unload requires no module code to be
> running and no module code to be runnable in future -- ie all work
> must be complete or canceled.
>
> module unload always calls the driver's remove function, that is a
> standard idiom.
>
> So, the first thing to notice, the kref patch didn't actually change
> ib_unregister_client - it doesn't
> 'wait_for_completion(&device->free)'.  Immediately we have an
> architectural bug: modules can exit while they have outstanding code
> runnable in their .text. Oops.
>

I tend to agree. The __exit function flushes the workqueue (so eventually
we guarantee that no module code will execute after the module is unloaded).
IMHO, after ib_unregister_client returns, the module could still run code
(this is why the __exit handler exists), but it's not allowed to use any
data of the module which unregistered it, meaning - we can't allow it to
use any IB device.

> Next, we realize we cannot fix this by waiting on device->free, it is
> ostensibly a general lock that all clients use on the ib_device, we
> need something isolated to this client.
>
> Finally, we realize if we can isolate something to one client, then
> the client can handle it during it's own remove() function, we don't
> need core support.
>
> Thus, the way out is to make ib_client.remove() completely synchronous
> and *guarantee* that the module's .text will never be used again after
> remove() returns. Obviously this means all work is complete or
> canceled.
>
> *THIS* is why driver remove is idiomatically always synchronous in the
> kernel.
>

Ok, I get your point, thanks. I'll keep that call synchronous. The two
options which are on the table right now are:
(a) Split the device_mutex into device_mutex and client_mutex.
      ib_unregister_device first grabs only client_mutex, and after it calls
      client->remove it grabs device_mutex for the rest of the work.
(b) Introduce another workqueue for all device-specific events
(actually, only rescan).
      This allows us to flush only this workqueue without waiting for any
      work which needs device_mutex.

> Please appreciate how much time it takes to explain all of this :(
> Just fix it already :(
>

I really appreciate it, again - thanks for looking at this and writing
this *long*
explanation.

> Jason
> --

Thanks a lot,
Matan

> To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
> the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 82+ messages in thread

* Re: [PATCH v3 for-next 02/33] IB/core: Add kref to IB devices
       [not found]                                             ` <20150430175252.GB32666-ePGOBjL8dl3ta4EC/59zMFaTQe2KTcn/@public.gmane.org>
  2015-04-30 19:21                                               ` Hefty, Sean
@ 2015-05-01  6:34                                               ` Matan Barak
       [not found]                                                 ` <CAAKD3BCJbUAMYhBzwuQFct=cRSXnGC=ELzNkvw2X04a4UipQwQ-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
  1 sibling, 1 reply; 82+ messages in thread
From: Matan Barak @ 2015-05-01  6:34 UTC (permalink / raw)
  To: Jason Gunthorpe
  Cc: Hefty, Sean, Matan Barak, Or Gerlitz, Somnath Kotur,
	Roland Dreier, linux-rdma-u79uwXL29TY76Z2rM5mHXA

On Thu, Apr 30, 2015 at 8:52 PM, Jason Gunthorpe
<jgunthorpe-ePGOBjL8dl3ta4EC/59zMFaTQe2KTcn/@public.gmane.org> wrote:
> On Thu, Apr 30, 2015 at 04:56:13PM +0000, Hefty, Sean wrote:
>
>> I still don't even buy that a cache is needed at all.  The RDMA CM
>> can accept as input UDP/IP addresses.  RoCEv2 puts UDP/IP addresses
>> on the wire.  Why should there be a conversion from an IP address to
>> a GID to get back to the same IP address?  Why aren't IP addresses
>> passed directly to the drivers?  Just change how the RDMA CM
>> associates an IP address with the RDMA device.
>
> I feel it looks overdesigned too..
>
> But, how could RDMA CM work? Inbound CM messages will be filtered if
> the IP is not in the HW GID table? How could UD work?
>
> This current scheme is just so ugly, there are so many wonky
> possibilities. What happens if I remove an IP and then add a new one?
> The GID index will eventually be re-used, and QPs bound to that gid
> index will silently change source IPs. Horrible.

This should be handled by the vendor's driver/other future ib_core part.
This patchset introduces roce_gid_cache, which manages the GID table and
notifies vendors about GID changes.

The vendor needs to do one of the following:
(a) Move all QPs that use GID x to error state when GID x is deleted from
      the table.
(b) Change all QPs that use GID x to use a special invalid GID entry.
(c) Don't delete GIDs that are being used by a QP.
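
Purely as an illustration, option (c) could look like the following
inside a vendor's modify_gid implementation; the usage counter and
every name here are hypothetical, not part of the proposed API:

#include <linux/atomic.h>
#include <linux/errno.h>
#include <linux/string.h>
#include <rdma/ib_verbs.h>

/* Hypothetical per-entry state a vendor driver might keep. */
struct vendor_gid_entry {
	union ib_gid gid;
	atomic_t     qp_refs; /* QPs currently bound to this index */
};

static int vendor_modify_gid(struct vendor_gid_entry *entry,
			     const union ib_gid *new_gid)
{
	static const union ib_gid zero_gid;
	bool deleting = !memcmp(new_gid, &zero_gid, sizeof(zero_gid));

	/* Option (c): refuse to drop a GID while any QP still uses it. */
	if (deleting && atomic_read(&entry->qp_refs))
		return -EBUSY;

	entry->gid = *new_gid;
	/* ... program the new value into the HW GID table ... */
	return 0;
}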

>
> Jason

Matan

> --
> To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
> the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 82+ messages in thread

* Re: [PATCH v3 for-next 02/33] IB/core: Add kref to IB devices
       [not found]                                                     ` <20150430212842.GB7709-ePGOBjL8dl3ta4EC/59zMFaTQe2KTcn/@public.gmane.org>
@ 2015-05-01  6:41                                                       ` Matan Barak
  0 siblings, 0 replies; 82+ messages in thread
From: Matan Barak @ 2015-05-01  6:41 UTC (permalink / raw)
  To: Jason Gunthorpe
  Cc: Hefty, Sean, Matan Barak, Or Gerlitz, Somnath Kotur,
	Roland Dreier, linux-rdma-u79uwXL29TY76Z2rM5mHXA

On Fri, May 1, 2015 at 12:28 AM, Jason Gunthorpe
<jgunthorpe-ePGOBjL8dl3ta4EC/59zMFaTQe2KTcn/@public.gmane.org> wrote:
> On Thu, Apr 30, 2015 at 07:21:08PM +0000, Hefty, Sean wrote:
>> > But, how could RDMA CM work? Inbound CM messages will be filtered if
>> > the IP is not in the HW GID table?
>>
>> I'm not understanding the issue.
>>
>> If a device has some requirement to program its assigned IP
>> addresses into some HW table, I don't see why upper layers should be
>> made to care.
>
> Okay, I was only thinking about the first couple patches in this
> series.  Three drivers will have this HW gid table, so having a driver
> helper library in the common code makes sense to me.
>
> But then the series just seems to go crazy - what is with all these
> net dev patches? Now we are doing something with bonding? And a lot of
> screwy looking rocev2 gid type stuff? And driver updates? And 33 patches worth?
>

Actually, we're in the middle of splitting this series into two series
- the first introduces
the roce_gid_cache management and the second will introduce RoCE V2.
The bonding changes are crucial - the mlx4 driver currently supports bonding,
and changing to a new mechanism without supporting bonding will cause
a regression.

> You are totally right, this GID index and GID type stuff is getting
> *everywhere*, and it is hard to follow *why* all the changes
> are really needed.
>
> Matan, if you want to progress this, then split it up.  Make a patch
> to turn the existing roce GID table driver code into a library and
> update the drivers. Just that bit alone has already spawned a huge discussion.
>

It's already in progress. The new series is expected to be a lot smaller.

> Work the other ancillary unrelated patches in small chunks through
> their trees.
>

The DRV patch was already sent to net. Regarding the bonding and ipv6
patches - we can't change them without a good reason, and that reason lies
in this patchset.

> Then come back to rocev2 with an actual core API proposal patch set
> that can be discussed.
>
> Honestly, I'm not willing to even look at a patch set this big and
> rambly.

Ok, hopefully we'll have something soon enough.

>
> Jason

Thanks a lot for the review and your comments.

Matan

> --
> To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
> the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 82+ messages in thread

* Re: [PATCH v3 for-next 02/33] IB/core: Add kref to IB devices
       [not found]                                             ` <CAAKD3BBGQwZ_Ainm6MSQjSkaXsJd9M5Vo4oarTLyFiVMQVS5_Q-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
@ 2015-05-01 17:31                                               ` Jason Gunthorpe
       [not found]                                                 ` <20150501173133.GB17940-ePGOBjL8dl3ta4EC/59zMFaTQe2KTcn/@public.gmane.org>
  0 siblings, 1 reply; 82+ messages in thread
From: Jason Gunthorpe @ 2015-05-01 17:31 UTC (permalink / raw)
  To: Matan Barak
  Cc: Matan Barak, Or Gerlitz, Somnath Kotur, Roland Dreier,
	linux-rdma-u79uwXL29TY76Z2rM5mHXA

On Fri, May 01, 2015 at 09:28:40AM +0300, Matan Barak wrote:

> >> There might be some ways around it - for example, introduce another
> >> workqueue for roce_rescan_device and flush this workqueue only.
> >> Every way has its advantages and disadvantages.
> >
> > I don't see that either, like I keep saying, all work must complete,
> > more work queues don't really seem to help that.
> >
> 
> I think this is a possible solution, as all works but the "rescan" work
> aren't device specific. That means that after we move the rescan work
> to another workqueue and synchronize it in the client->remove function,
> we could be sure the device won't be used by this client anymore.
> 
> The rest of the works only iterate on the *existing* device list. Since the
> ib_unregister_device
> (a) lock (b) iterate over client->remove (c) remove from list (d) free
> (e) unlock
> all roce_gid_cache's works won't find it in the list and we'll be safe.

Well.. be careful with the locking here: if device_mutex no longer
covers remove(), then remove() and ib_enum_roce_ports_of_netdev() can
run concurrently and they will need locking against
free_roce_gid_cache(), which appears to implicitly rely on the
device_mutex (too subtle and fragile).

If there is a 'global' part and a 'device' specific part, it would be
good to make this more obvious and clear: group the functions together,
annotate the difference with naming, a comment, or something. There
are *a lot* of little functions in this module; it is hard to
understand the call graph without careful study.

> Regarding ib_unregister_client - if we guarantee that after each
> client->remove(dev)
> dev isn't used in this client, we actually guarantee that after removing all IB
> devices no IB device will be used (induction).

Yes, that is the idea..

> > So, ultimately, this is really a self-created problem, and in fact, the
> > problem lies with the introduction of ib_enum_roce_ports_of_netdev -
> > adding a new use of the device_mutex that is not register/unregister
> > related exposes the design limitations of that mutex *that Roland
> > clearly explained in a giant comment!*

> I agree, while it's a general problem - it was first introduced by using
> device_mutex in an asynchronous context (that should be flushed in
> the remove function).

Well, no, generally speaking you can't use device_mutex in any client
facing API like ib_enum_roce_ports_of_netdev - it isn't just async
contexts, but it means the add() and remove() functions instantly
deadlock if they call a client facing API - that is not sane.

> >  - Convert device_mutex into a r/w sem
> 
> I'm not sure this will solve the problem, as besides the new enumerate
> devices function, all existing functions update the list - so we'll have
> read-while-write scenario and we'll be in the exact same condition.

You'd do something like hold the write sem and mutate the device list
to unlink the target device, then switch to a read sem to traverse the
client list. Two locks are clearer if contention is not an issue.
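
A hedged sketch of that shape, using the kernel's rwsem downgrade
primitive (the list heads and field names are placeholders):

#include <linux/list.h>
#include <linux/rwsem.h>
#include <rdma/ib_verbs.h>

static LIST_HEAD(device_list);
static LIST_HEAD(client_list);
static DECLARE_RWSEM(lists_rwsem);

static void example_unregister_device(struct ib_device *device)
{
	struct ib_client *client;

	/* Mutate the device list exclusively ... */
	down_write(&lists_rwsem);
	list_del(&device->core_list);

	/* ... then drop to shared mode for the client traversal, so
	 * readers such as the netdev enumeration can make progress. */
	downgrade_write(&lists_rwsem);
	list_for_each_entry(client, &client_list, list)
		if (client->remove)
			client->remove(device);
	up_read(&lists_rwsem);
}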

> Agree, but please consider also the addition of another
> workqueue as a possible solution. It *does* (seem) to answer
> all of your concerns and could be safer and cause less code changes.

As above, fundamentally, introducing a client API that holds
device_mutex is just wrong, so we need to address this.

I would also recommend splitting your per-device/global stuff anyhow,
global stuff needs a clear unwind during remove, and so on. It isn't
really optional to have this clarity.

> kref just hides an atomic refcount - so you're actually saying using
> the abstraction contradicts the kernel object's lifetime strategy?

kref has specific semantic meaning, it isn't a general use data
structure.

> I tend to agree. The __exit function flushes the workqueue (so eventually
> we guarantee that no module code will execute after the module is unloaded).
> IMHO, after ib_unregister_client returns, the module could still run code
> (this is why the __exit handler exists), but it's not allowed to use any
> data of the module which unregistered it, meaning - we can't allow it to
> use any IB device.

Yes, I was being terse, of course the .text is still going to be used,
but the general idea of lifetime holds: the .text cannot remain in
service of the ib_device because the __exit handler cannot clean that
up.

> Ok, I get your point, thanks. I'll keep that call synchronous. The two
> options which are on the table right now are:
> (a) Split the device_mutex into device_mutex and client_mutex.
>       ib_unregister_device first grabs only client_mutex, and after it calls
>       client->remove it grabs device_mutex for the rest of the work.
> (b) Introduce another workqueue for all device-specific events
> (actually, only rescan).
>       This allows us to flush only this workqueue without waiting for any
>       work which needs device_mutex.

The other option is to make rescan sense that the ib_device has been
freed and ignore it, and then flush the work queue that runs that on
module __exit. This might be needed anyhow due to the free race
discussed above. Be sure to add the necessary kref get/puts to hold
the ib device though.
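
Roughly, with hypothetical lifetime helpers standing in for the kref
get/put and for whatever 'cache torn down' test would be used:

#include <linux/slab.h>
#include <linux/workqueue.h>
#include <rdma/ib_verbs.h>

static struct workqueue_struct *rescan_wq;

/* Hypothetical helpers wrapping the per-device kref. */
void example_ib_device_hold(struct ib_device *dev);
void example_ib_device_put(struct ib_device *dev);
bool example_cache_is_active(struct ib_device *dev);

struct rescan_work {
	struct work_struct work;
	struct ib_device *dev;
};

static void rescan_handler(struct work_struct *work)
{
	struct rescan_work *rw = container_of(work, struct rescan_work, work);

	/* Skip devices whose cache has already been torn down. */
	if (example_cache_is_active(rw->dev)) {
		/* ... repopulate the GID table ... */
	}
	example_ib_device_put(rw->dev); /* drop the ref taken at queue time */
	kfree(rw);
}

static void queue_rescan(struct ib_device *dev)
{
	struct rescan_work *rw = kmalloc(sizeof(*rw), GFP_KERNEL);

	if (!rw)
		return;
	example_ib_device_hold(dev); /* keep dev alive until the work runs */
	rw->dev = dev;
	INIT_WORK(&rw->work, rescan_handler);
	queue_work(rescan_wq, &rw->work);
}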

Jason
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 82+ messages in thread

* Re: [PATCH v3 for-next 02/33] IB/core: Add kref to IB devices
       [not found]                                                 ` <CAAKD3BCJbUAMYhBzwuQFct=cRSXnGC=ELzNkvw2X04a4UipQwQ-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
@ 2015-05-01 17:36                                                   ` Jason Gunthorpe
       [not found]                                                     ` <20150501173643.GC17940-ePGOBjL8dl3ta4EC/59zMFaTQe2KTcn/@public.gmane.org>
  0 siblings, 1 reply; 82+ messages in thread
From: Jason Gunthorpe @ 2015-05-01 17:36 UTC (permalink / raw)
  To: Matan Barak
  Cc: Hefty, Sean, Matan Barak, Or Gerlitz, Somnath Kotur,
	Roland Dreier, linux-rdma-u79uwXL29TY76Z2rM5mHXA

On Fri, May 01, 2015 at 09:34:24AM +0300, Matan Barak wrote:

> > This current scheme is just so ugly, there are so many wonky
> > possibilities. What happens if I remove an IP and then add a new one?
> > The GID index will eventually be re-used, and QPs bound to that gid
> > index will silently change source IPs. Horrible.
> 
> This should be handled by the vendor's driver/other future ib_core part.
> This patchset introduces roce_gid_cache that manages the GID table and
> notify vendors about GID changes.
> 
> The vendor needs to:
> (a) Move all QPs that use GID x to error state when GID x is deleted from
>       the table.
> (b) Change all QPs that use GID x to use a special invalid GID entry.
> (c) Don't delete GIDs that are being used by a QP.

What about AH's for UD?

What about clients that discover and then hold the GID index
internally?

What about the impossible-to-fix race of returning the GID index in the
work completion and translating that back to an IP?

It is a terrible scheme, Sean is right, the clients should work with
the actual sock addr, somehow, at least kernel side. Converting from a
sockaddr to a gid index cannot really be done without some kind of
lock and ref count scheme.

At the very least, that should be the starting point, if we can't get
there then patch on a case by case basis why.

Jason
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 82+ messages in thread

* Re: [PATCH v3 for-next 02/33] IB/core: Add kref to IB devices
       [not found]                                                     ` <20150501173643.GC17940-ePGOBjL8dl3ta4EC/59zMFaTQe2KTcn/@public.gmane.org>
@ 2015-05-03  9:05                                                       ` Matan Barak
  0 siblings, 0 replies; 82+ messages in thread
From: Matan Barak @ 2015-05-03  9:05 UTC (permalink / raw)
  To: Jason Gunthorpe
  Cc: Hefty, Sean, Matan Barak, Or Gerlitz, Somnath Kotur,
	Roland Dreier, linux-rdma-u79uwXL29TY76Z2rM5mHXA

On 5/1/2015 8:36 PM, Jason Gunthorpe wrote:
> On Fri, May 01, 2015 at 09:34:24AM +0300, Matan Barak wrote:
>
>>> This current scheme is just so ugly, there are so many wonky
>>> possibilities. What happens if I remove an IP and then add a new one?
>>> The GID index will eventually be re-used, and QPs bound to that gid
>>> index will silently change source IPs. Horrible.
>>
>> This should be handled by the vendor's driver/other future ib_core part.
>> This patchset introduces roce_gid_cache that manages the GID table and
>> notify vendors about GID changes.
>>
>> The vendor needs to:
>> (a) Move all QPs that use GID x to error state when GID x is deleted from
>>        the table.
>> (b) Change all QPs that use GID x to use a special invalid GID entry.
>> (c) Don't delete GIDs that are being used by a QP.
>
> What about AH's for UD?
>

The plan is to have read-only memory-mapped AHs for UD. The kernel will
create an AH with a sequence counter. This AH will be mapped as read-only
memory to the user-space. When sending, the user-space will atomically 
use this AH. If a GID is changed, the kernel will update the GID index 
internally. That's a long-term goal.
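
The scheme resembles the kernel's seqcount pattern; a hedged sketch of
what the writer and the retry-loop reader could look like, with a
hypothetical mapped structure:

#include <linux/seqlock.h>
#include <linux/types.h>

/* Hypothetical AH record, mapped read-only into user space. */
struct mapped_ah {
	seqcount_t seq;
	u16 gid_index; /* rewritten by the kernel when the GID moves */
};

/* Kernel-side update when a GID changes index. */
static void update_mapped_ah(struct mapped_ah *ah, u16 new_index)
{
	write_seqcount_begin(&ah->seq);
	ah->gid_index = new_index;
	write_seqcount_end(&ah->seq);
}

/* Reader side (conceptually, the user-space send path): retry until a
 * consistent snapshot is observed. */
static u16 read_mapped_ah(const struct mapped_ah *ah)
{
	unsigned int start;
	u16 idx;

	do {
		start = read_seqcount_begin(&ah->seq);
		idx = ah->gid_index;
	} while (read_seqcount_retry(&ah->seq, start));

	return idx;
}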

> What about clients that discover and then hold the GID index
> internally?
>
> What about the impossible-to-fix race of returning the GID index in the
> work completion and translating that back to an IP?
>
> It is a terrible scheme, Sean is right, the clients should work with
> the actual sock addr, somehow, at least kernel side. Converting from a
> sockaddr to a gid index cannot really be done without some kind of
> lock and ref count scheme.

This is the current behavior as well. The current patch-set doesn't make
it any worse or better. We don't expect to fix all the world's problems.
We could add reference counting in a later patchset. Working with sockaddr
has its own (similar) problems - if the net-device's IP is changed,
using a sockaddr will just use an old incorrect IP address (which by
now could be prohibited by the administrator).

>
> At the very least, that should be the starting point, if we can't get
> there then patch on a case by case basis why.
>

I agree - we don't want to regress, but we only add the roce_gid_cache 
in this patchset (the next version will postpone adding RoCE V2 to a later 
patchset).

> Jason
>

Matan

--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 82+ messages in thread

* Re: [PATCH v3 for-next 02/33] IB/core: Add kref to IB devices
       [not found]                                                 ` <20150501173133.GB17940-ePGOBjL8dl3ta4EC/59zMFaTQe2KTcn/@public.gmane.org>
@ 2015-05-05 10:58                                                   ` Matan Barak
  0 siblings, 0 replies; 82+ messages in thread
From: Matan Barak @ 2015-05-05 10:58 UTC (permalink / raw)
  To: Jason Gunthorpe
  Cc: Matan Barak, Or Gerlitz, Somnath Kotur, Roland Dreier,
	linux-rdma-u79uwXL29TY76Z2rM5mHXA

On Fri, May 1, 2015 at 8:31 PM, Jason Gunthorpe
<jgunthorpe-ePGOBjL8dl3ta4EC/59zMFaTQe2KTcn/@public.gmane.org> wrote:
> On Fri, May 01, 2015 at 09:28:40AM +0300, Matan Barak wrote:
>
>> >> There might be some ways around it - for example, introduce another
>> >> workqueue for roce_rescan_device and flush this workqueue only.
>> >> Every way has its advantages and disadvantages.
>> >
>> > I don't see that either, like I keep saying, all work must complete,
>> > more work queues don't really seem to help that.
>> >
>>
>> I think this is a possible solution, as all works but the "rescan" work
>> aren't device specific. That means that after we move the rescan work
>> to another workqueue and synchronize it in the client->remove function,
>> we could be sure the device won't be used by this client anymore.
>>
>> The rest of the works only iterate on the *existing* device list. Since the
>> ib_unregister_device
>> (a) lock (b) iterate over client->remove (c) remove from list (d) free
>> (e) unlock
>> all roce_gid_cache's works won't find it in the list and we'll be safe.
>
> Well.. be careful with the locking here, if device_mutex no longer
> covers remove() then remove() and ib_enum_roce_ports_of_netdev() can
> run concurrently and they will need locking against
> free_roce_gid_cache(), which appears to implicitly rely on the
> device_mutex (too subtle and fragile)
>
> If there is a 'global' part and a 'device' specific part, it would be
> good to make this more obvious and clear: group the functions together,
> annotate the difference with naming, a comment, or something. There
> are *a lot* of little functions in this module, it is hard to
> understand the call graph without careful study.
>

I'll try to rearrange the code a bit better.

>> Regarding ib_unregister_client - if we guarantee that after each
>> client->remove(dev)
>> dev isn't used in this client, we actually guarantee that after removing all IB
>> devices no IB device will be used (induction).
>
> Yes, that is the idea..
>
>> > So, ultimately, this is really a self-created problem, and in fact, the
>> > problem lies with the introduction of ib_enum_roce_ports_of_netdev -
>> > adding a new use of the device_mutex that is not register/unregister
>> > related exposes the design limitations of that mutex *that Roland
>> > clearly explained in a giant comment!*
>
>> I agree, while it's a general problem - it was first introduced by using
>> device_mutex in an asynchronous context (that should be flushed in
>> the remove function).
>
> Well, no, generally speaking you can't use device_mutex in any client
> facing API like ib_enum_roce_ports_of_netdev - it isn't just async
> contexts, but it means the add() and remove() functions instantly
> deadlock if they call a client facing API - that is not sane.
>

I still haven't looked at this thoroughly, but we might be able to use an rwsem
(as you suggested) or an RCU-protected device list. I think that changing to
one of these will make ib_enum_roce_ports_of_netdev safe in an async context.

>> >  - Convert device_mutex into a r/w sem
>>
>> I'm not sure this will solve the problem, as besides the new enumerate
>> devices function, all existing functions update the list - so we'll have
>> read-while-write scenario and we'll be in the exact same condition.
>
> You'd do something like hold the write sem and mutate the device list
> to unlink the target device, then switch to a read sem to traverse the
> client list. Two locks are clearer if contention is not an issue.
>
>> Agree, but please consider also the addition of another
>> workqueue as a possible solution. It *does* (seem) to answer
>> all of your concerns and could be safer and cause less code changes.
>
> As above, fundamentally, introducing a client API that holds
> device_mutex is just wrong, so we need to address this.
>
> I would also recommend splitting your per-device/global stuff anyhow,
> global stuff needs a clear unwind during remove, and so on. It isn't
> really optional to have this clarity.
>

Ok, I'll look into that for the next version of this patchset -
probably switching to an
rwsem/RCU with a clear separation of per-device and global functions.

>> kref just hides an atomic refcount - so you're actually saying using
>> the abstraction contradicts the kernel object's lifetime strategy?
>
> kref has specific semantic meaning, it isn't a general use data
> structure.
>
>> I tend to agree. The __exit function flushes the workqueue (so eventually
>> we guarantee that no module code will execute after the module is unloaded).
>> IMHO, after ib_unregister_client returns, the module could still run code
>> (this is why the __exit handler exists), but it's not allowed to use any
>> data of the module which unregistered it, meaning - we can't allow it to
>> use any IB device.
>
> Yes, I was being terse, of course the .text is still going to be used,
> but the general idea of lifetime holds: the .text cannot remain in
> service of the ib_device because the __exit handler cannot clean that
> up.
>
>> Ok, I get your point, thanks. I'll keep that call synchronous. The two
>> options which are on the table right now are:
>> (a) Split the device_mutex into device_mutex and client_mutex.
>>       ib_unregister_device first grabs only client_mutex, and after it calls
>>       client->remove it grabs device_mutex for the rest of the work.
>> (b) Introduce another workqueue for all device-specific events
>> (actually, only rescan).
>>       This allows us to flush only this workqueue without waiting for any
>>       work which needs device_mutex.
>
> The other option is to make rescan sense that the ib_device has been
> freed and ignore it, and then flush the work queue that runs that on
> module __exit. This might be needed anyhow due to the free race
> discussed above. Be sure to add the necessary kref get/puts to hold
> the ib device though.
>
> Jason

Thanks for the review. I'll look thoroughly at all the options you mentioned.

Matan
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 82+ messages in thread

end of thread, other threads:[~2015-05-05 10:58 UTC | newest]

Thread overview: 82+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
     [not found] <1427318422-12004-1-git-send-email-somnath.kotur@emulex.com>
     [not found] ` <1427318422-12004-1-git-send-email-somnath.kotur-laKkSmNT4hbQT0dZR+AlfA@public.gmane.org>
2015-03-25 21:19   ` [PATCH v3 for-next 01/33] IB/core: Add RoCE GID cache Somnath Kotur
     [not found]     ` <44ab0dce-c7c9-400b-af24-10b8981358a7-3RiH6ntJJkOPfaB/Gd0HpljyZtpTMMwT@public.gmane.org>
2015-03-25 23:42       ` Bart Van Assche
     [not found]         ` <551347E9.5090503-XdAiOPVOjttBDgjK7y7TUQ@public.gmane.org>
2015-03-26 14:05           ` Somnath Kotur
2015-04-14 13:23           ` Matan Barak
     [not found]             ` <552D14C6.50000-VPRAkNaXOzVWk0Htik3J/w@public.gmane.org>
2015-04-14 15:31               ` Bart Van Assche
2015-04-08  0:30       ` Hefty, Sean
     [not found]         ` <1828884A29C6694DAF28B7E6B8A82373A8FBE792-P5GAC/sN6hkd3b2yrw5b5LfspsVTdybXVpNB7YpNyf8@public.gmane.org>
2015-04-08  4:10           ` Somnath Kotur
     [not found]             ` <7F44EA5110810A40B7DAFB605C41975D58F98121-DWYeeINJQrxExQ8dmkPuX0M9+F4ksjoh@public.gmane.org>
2015-04-13 23:50               ` Hefty, Sean
     [not found]                 ` <1828884A29C6694DAF28B7E6B8A82373A8FC0C00-P5GAC/sN6hkd3b2yrw5b5LfspsVTdybXVpNB7YpNyf8@public.gmane.org>
2015-04-14  9:32                   ` Matan Barak
     [not found]                     ` <552CDEA5.6020709-VPRAkNaXOzVWk0Htik3J/w@public.gmane.org>
2015-04-14 17:32                       ` Hefty, Sean
     [not found]                         ` <1828884A29C6694DAF28B7E6B8A82373A8FC11F3-P5GAC/sN6hkd3b2yrw5b5LfspsVTdybXVpNB7YpNyf8@public.gmane.org>
2015-04-15  5:35                           ` Somnath Kotur
     [not found]                             ` <7F44EA5110810A40B7DAFB605C41975D58FA0B05-DWYeeINJQrxExQ8dmkPuX0M9+F4ksjoh@public.gmane.org>
2015-04-15 16:08                               ` Hefty, Sean
     [not found]                                 ` <1828884A29C6694DAF28B7E6B8A82373A8FC19D9-P5GAC/sN6hkd3b2yrw5b5LfspsVTdybXVpNB7YpNyf8@public.gmane.org>
2015-04-15 16:21                                   ` Suri Shelvapille
     [not found]                                     ` <CY1PR03MB1440108D65F18916AF9B2425DEE50-DUcFgbLRNhB/HYnSB+xpdWP7xZHs9kq/vxpqHgZTriW3zl9H0oFU5g@public.gmane.org>
2015-04-16 10:42                                       ` Matan Barak
2015-04-16 10:43                                   ` Moni Shoua
     [not found]                                     ` <CAG9sBKPQ7r2j4Awd3=CtRzekWPVe6hcO1+S+kspMEr4n=kDnkw-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
2015-04-16 14:58                                       ` Hefty, Sean
2015-04-08  8:49           ` Moni Shoua
2015-04-26 17:20       ` Or Gerlitz
     [not found]         ` <CAJ3xEMgepRUQs+GiMWxzV_QFaRnfbX7TPOdB_sKgRhHj7x7NDg-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
2015-04-27  7:32           ` Matan Barak
     [not found]             ` <553DE614.7050508-VPRAkNaXOzVWk0Htik3J/w@public.gmane.org>
2015-04-27 18:22               ` Or Gerlitz
     [not found]                 ` <CAJ3xEMjEhv3Nm_EfFcBWLk1ChQXBM5KvPxh5DstCqxeMo0MGwA-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
2015-04-28  7:17                   ` Matan Barak
     [not found]                     ` <553F341D.8000907-VPRAkNaXOzVWk0Htik3J/w@public.gmane.org>
2015-04-28 12:57                       ` Or Gerlitz
2015-03-25 21:19   ` [PATCH v3 for-next 02/33] IB/core: Add kref to IB devices Somnath Kotur
     [not found]     ` <9f65de5e-ed5f-48d2-bff2-03ffbe4f4876-3RiH6ntJJkOPfaB/Gd0HpljyZtpTMMwT@public.gmane.org>
2015-03-25 23:46       ` Bart Van Assche
     [not found]         ` <551348BD.9080200-XdAiOPVOjttBDgjK7y7TUQ@public.gmane.org>
2015-04-14 13:27           ` Matan Barak
2015-04-26 20:10       ` Or Gerlitz
     [not found]         ` <CAJ3xEMhBNt-VNNds37sXnJi3nP9ZTMd6mC3s+qZWh0XsO1n_Nw-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
2015-04-27  8:25           ` Matan Barak
     [not found]             ` <553DF294.4070507-VPRAkNaXOzVWk0Htik3J/w@public.gmane.org>
2015-04-27 16:22               ` Jason Gunthorpe
     [not found]                 ` <20150427162256.GA24316-ePGOBjL8dl3ta4EC/59zMFaTQe2KTcn/@public.gmane.org>
2015-04-28  8:32                   ` Matan Barak
     [not found]                     ` <553F4588.80301-VPRAkNaXOzVWk0Htik3J/w@public.gmane.org>
2015-04-28 16:03                       ` Jason Gunthorpe
     [not found]                         ` <20150428160315.GA5497-ePGOBjL8dl3ta4EC/59zMFaTQe2KTcn/@public.gmane.org>
2015-04-28 16:17                           ` Matan Barak
2015-04-28 11:51               ` Or Gerlitz
     [not found]                 ` <CAJ3xEMjzgS_uR1VaeGzW+jcfG2oiVo4=fCctX6o4OVbKRX2n0Q-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
2015-04-28 14:03                   ` Matan Barak
     [not found]                     ` <553F931F.6000302-VPRAkNaXOzVWk0Htik3J/w@public.gmane.org>
2015-04-28 17:43                       ` Jason Gunthorpe
2015-04-28 19:04                           ` Or Gerlitz
2015-04-29  9:16                           ` Matan Barak
2015-04-29 15:29                           ` Matan Barak
2015-04-29 16:48                               ` Jason Gunthorpe
2015-04-30  8:21                                   ` Matan Barak
2015-04-30 16:56                                       ` Hefty, Sean
2015-04-30 17:52                                           ` Jason Gunthorpe
2015-04-30 19:21                                               ` Hefty, Sean
2015-04-30 21:28                                                   ` Jason Gunthorpe
2015-05-01  6:41                                                       ` Matan Barak
2015-05-01  6:34                                               ` Matan Barak
2015-05-01 17:36                                                   ` Jason Gunthorpe
2015-05-03  9:05                                                       ` Matan Barak
2015-04-30 17:26                                       ` Jason Gunthorpe
2015-05-01  6:28                                           ` Matan Barak
2015-05-01 17:31                                               ` Jason Gunthorpe
2015-05-05 10:58                                                   ` Matan Barak
2015-03-25 21:19   ` [PATCH v3 for-next 03/33] IB/core: Add RoCE GID population Somnath Kotur
2015-03-25 21:19   ` [PATCH v3 for-next 04/33] IB/core: Add default GID for RoCE GID Cache Somnath Kotur
2015-03-25 21:19   ` [PATCH v3 for-next 05/33] net/bonding: make DRV macros private Somnath Kotur
2015-03-25 21:19   ` [PATCH v3 for-next 06/33] net: Add info for NETDEV_CHANGEUPPER event Somnath Kotur
2015-03-25 21:19   ` [PATCH v3 for-next 07/33] IB/core: Add RoCE cache bonding support Somnath Kotur
2015-03-25 21:19   ` [PATCH v3 for-next 08/33] IB/core: GID attribute should be returned from verbs API and cache API Somnath Kotur
2015-03-25 21:19   ` [PATCH v3 for-next 09/33] IB/core: Report gid_type and gid_ndev through sysfs Somnath Kotur
2015-03-25 21:19   ` [PATCH v3 for-next 10/33] IB/core: Support find sgid index using a filter function Somnath Kotur
2015-03-25 21:20   ` [PATCH v3 for-next 11/33] IB/core: Modify ib_verbs and cma in order to use roce_gid_cache Somnath Kotur
2015-03-25 21:20   ` [PATCH v3 for-next 12/33] IB/core: Add gid_type to path and rdma_id_private Somnath Kotur
2015-03-25 21:20   ` [PATCH v3 for-next 13/33] IB/core: Add rdma_network_type to wc Somnath Kotur
2015-03-25 21:20   ` [PATCH v3 for-next 14/33] IB/cma: Add configfs for rdma_cm Somnath Kotur
2015-03-25 21:20   ` [PATCH v3 for-next 15/33] IB/Core: Changes to the IB Core infrastructure for RoCEv2 support Somnath Kotur
2015-03-25 21:20   ` [PATCH v3 for-next 16/33] RDMA/ocrdma: Changes in driver to incorporate the moving of GID Table mgmt to IB/Core Somnath Kotur
2015-03-25 21:20   ` [PATCH v3 for-next 17/33] RDMA/ocrdma: changes to support RoCE-v2 in UD path Somnath Kotur
2015-03-25 21:20   ` [PATCH v3 for-next 18/33] RDMA/ocrdma: changes to support RoCE-v2 in RC path Somnath Kotur
2015-03-25 21:20   ` [PATCH v3 for-next 19/33] RDMA/ocrdma: changes to support user AH creation Somnath Kotur
2015-03-25 21:20   ` [PATCH v3 for-next 20/33] IB/mlx4: Remove gid table management for RoCE Somnath Kotur
2015-03-25 21:20   ` [PATCH v3 for-next 21/33] IB/mlx4: Replace spin_lock with rw_semaphore Somnath Kotur
2015-03-25 21:20   ` [PATCH v3 for-next 22/33] IB/mlx4: Lock with RCU instead of RTNL Somnath Kotur
2015-03-25 21:20   ` [PATCH v3 for-next 23/33] net/mlx4: Postpone the registration of net_device Somnath Kotur
2015-03-25 21:20   ` [PATCH v3 for-next 24/33] IB/mlx4: Advertise RoCE support in port capabilities Somnath Kotur
2015-03-25 21:20   ` [PATCH v3 for-next 25/33] IB/mlx4: Implement ib_device callback - get_netdev Somnath Kotur
2015-03-25 21:20   ` [PATCH v3 for-next 26/33] IB/mlx4: Implement ib_device callback - modify_gid Somnath Kotur
2015-03-25 21:20   ` [PATCH v3 for-next 27/33] IB/mlx4: Configure device to work in RoCEv2 Somnath Kotur
2015-03-25 21:20   ` [PATCH v3 for-next 28/33] IB/mlx4: Translate cache gid index to real index Somnath Kotur
2015-03-25 21:20   ` [PATCH v3 for-next 29/33] net/mlx4_core: Add handling of R-RoCE over IPV4 in qp attach flow Somnath Kotur
2015-03-25 21:20   ` [PATCH v3 for-next 30/33] IB/core: Initialize UD header structure with IP and UDP headers Somnath Kotur
2015-03-25 21:20   ` [PATCH v3 for-next 31/33] IB/mlx4: Enable send of RoCE QP1 packets with IP/UDP headers Somnath Kotur
2015-03-25 21:20   ` [PATCH v3 for-next 32/33] IB/mlx4: Create and use another QP1 for RoCEv2 Somnath Kotur
2015-03-25 21:20   ` [PATCH v3 for-next 33/33] IB/cma: Join and leave multicast groups with IGMP Somnath Kotur