* [PATCH v3 for-next 01/13] IB/core: Use SRCU when reading client_list or device_list
2015-05-10 10:26 [PATCH v3 for-next 00/13] Add network namespace support in the RDMA-CM Haggai Eran
@ 2015-05-10 10:26 ` Haggai Eran
2015-05-11 18:18 ` Jason Gunthorpe
2015-05-10 10:26 ` [PATCH v3 for-next 02/13] IB/addr: Pass network namespace as a parameter Haggai Eran
` (9 subsequent siblings)
10 siblings, 1 reply; 46+ messages in thread
From: Haggai Eran @ 2015-05-10 10:26 UTC (permalink / raw)
To: Doug Ledford
Cc: linux-rdma, netdev, Liran Liss, Guy Shapiro, Shachar Raindel,
Yotam Kenneth, Haggai Eran, Matan Barak, Jason Gunthorpe
Currently the RDMA subsystem's device list and client list are protected by
a single mutex. This prevents adding user-facing APIs that iterate these
lists, since using them may cause a deadlock. The patch attempts to solve
this problem by adding an SRCU to protect the lists. Readers now don't need
the mutex, and are safe just by using srcu_read_lock/unlock.
The ib_register_device, ib_register_client, and ib_unregister_client
functions are modified to only lock the device_mutex during their
respective list modification, and use the SRCU for iteration on the other
list. In ib_unregister_device, the client list iteration remains in the
mutex critical section as it is done in reverse order.
This patch attempts to solve a similar need [1] that was seen in the RoCE
v2 patch series.
[1] http://www.spinics.net/lists/linux-rdma/msg24733.html
Cc: Matan Barak <matanb@mellanox.com>
Cc: Jason Gunthorpe <jgunthorpe@obsidianresearch.com>
Signed-off-by: Haggai Eran <haggaie@mellanox.com>
---
drivers/infiniband/core/device.c | 75 ++++++++++++++++++++++++++++++----------
1 file changed, 56 insertions(+), 19 deletions(-)
diff --git a/drivers/infiniband/core/device.c b/drivers/infiniband/core/device.c
index b360350a0b20..7d90b2ca2eba 100644
--- a/drivers/infiniband/core/device.c
+++ b/drivers/infiniband/core/device.c
@@ -58,12 +58,11 @@ EXPORT_SYMBOL_GPL(ib_wq);
static LIST_HEAD(device_list);
static LIST_HEAD(client_list);
+/* device_srcu protects access to both device_list and client_list. */
+static struct srcu_struct device_srcu;
+
/*
- * device_mutex protects access to both device_list and client_list.
- * There's no real point to using multiple locks or something fancier
- * like an rwsem: we always access both lists, and we're always
- * modifying one list or the other list. In any case this is not a
- * hot path so there's no point in trying to optimize.
+ * device_mutex protects writer access to both device_list and client_list.
*/
static DEFINE_MUTEX(device_mutex);
@@ -276,6 +275,7 @@ int ib_register_device(struct ib_device *device,
u8, struct kobject *))
{
int ret;
+ int id;
mutex_lock(&device_mutex);
@@ -315,13 +315,19 @@ int ib_register_device(struct ib_device *device,
device->reg_state = IB_DEV_REGISTERED;
+ mutex_unlock(&device_mutex);
+
+ id = srcu_read_lock(&device_srcu);
{
struct ib_client *client;
- list_for_each_entry(client, &client_list, list)
+ list_for_each_entry_rcu(client, &client_list, list)
if (client->add && !add_client_context(device, client))
client->add(device);
}
+ srcu_read_unlock(&device_srcu, id);
+
+ return 0;
out:
mutex_unlock(&device_mutex);
@@ -338,6 +344,7 @@ EXPORT_SYMBOL(ib_register_device);
void ib_unregister_device(struct ib_device *device)
{
struct ib_client *client;
+ LIST_HEAD(contexts);
struct ib_client_data *context, *tmp;
unsigned long flags;
@@ -347,21 +354,26 @@ void ib_unregister_device(struct ib_device *device)
if (client->remove)
client->remove(device);
- list_del(&device->core_list);
+ list_del_rcu(&device->core_list);
+
+ mutex_unlock(&device_mutex);
+
+ synchronize_srcu(&device_srcu);
kfree(device->gid_tbl_len);
kfree(device->pkey_tbl_len);
- mutex_unlock(&device_mutex);
-
ib_device_unregister_sysfs(device);
spin_lock_irqsave(&device->client_data_lock, flags);
- list_for_each_entry_safe(context, tmp, &device->client_data_list, list)
- kfree(context);
+ list_cut_position(&contexts, &device->client_data_list,
+ device->client_data_list.prev);
spin_unlock_irqrestore(&device->client_data_lock, flags);
device->reg_state = IB_DEV_UNREGISTERED;
+
+ list_for_each_entry_safe(context, tmp, &contexts, list)
+ kfree(context);
}
EXPORT_SYMBOL(ib_unregister_device);
@@ -381,15 +393,19 @@ EXPORT_SYMBOL(ib_unregister_device);
int ib_register_client(struct ib_client *client)
{
struct ib_device *device;
+ int id;
mutex_lock(&device_mutex);
+ list_add_tail_rcu(&client->list, &client_list);
+ mutex_unlock(&device_mutex);
- list_add_tail(&client->list, &client_list);
- list_for_each_entry(device, &device_list, core_list)
+ id = srcu_read_lock(&device_srcu);
+
+ list_for_each_entry_rcu(device, &device_list, core_list)
if (client->add && !add_client_context(device, client))
client->add(device);
- mutex_unlock(&device_mutex);
+ srcu_read_unlock(&device_srcu, id);
return 0;
}
@@ -407,11 +423,13 @@ void ib_unregister_client(struct ib_client *client)
{
struct ib_client_data *context, *tmp;
struct ib_device *device;
+ LIST_HEAD(contexts);
unsigned long flags;
+ int id;
- mutex_lock(&device_mutex);
+ id = srcu_read_lock(&device_srcu);
- list_for_each_entry(device, &device_list, core_list) {
+ list_for_each_entry_rcu(device, &device_list, core_list) {
if (client->remove)
client->remove(device);
@@ -419,13 +437,21 @@ void ib_unregister_client(struct ib_client *client)
list_for_each_entry_safe(context, tmp, &device->client_data_list, list)
if (context->client == client) {
list_del(&context->list);
- kfree(context);
+ list_add(&context->list, &contexts);
}
spin_unlock_irqrestore(&device->client_data_lock, flags);
}
- list_del(&client->list);
+ srcu_read_unlock(&device_srcu, id);
+
+ mutex_lock(&device_mutex);
+ list_del_rcu(&client->list);
mutex_unlock(&device_mutex);
+
+ synchronize_srcu(&device_srcu);
+
+ list_for_each_entry_safe(context, tmp, &contexts, list)
+ kfree(context);
}
EXPORT_SYMBOL(ib_unregister_client);
@@ -738,9 +764,15 @@ static int __init ib_core_init(void)
{
int ret;
+ ret = init_srcu_struct(&device_srcu);
+ if (ret) {
+ pr_warn("Couldn't initialize SRCU\n");
+ return ret;
+ }
+
ib_wq = alloc_workqueue("infiniband", 0, 0);
if (!ib_wq)
- return -ENOMEM;
+ goto err_srcu;
ret = ib_sysfs_setup();
if (ret) {
@@ -770,6 +802,9 @@ err_sysfs:
err:
destroy_workqueue(ib_wq);
+err_srcu:
+ cleanup_srcu_struct(&device_srcu);
+
return ret;
}
@@ -780,6 +815,8 @@ static void __exit ib_core_cleanup(void)
ib_sysfs_cleanup();
/* Make sure that any pending umem accounting work is done. */
destroy_workqueue(ib_wq);
+ srcu_barrier(&device_srcu);
+ cleanup_srcu_struct(&device_srcu);
}
module_init(ib_core_init);
--
1.7.11.2
^ permalink raw reply related [flat|nested] 46+ messages in thread
* Re: [PATCH v3 for-next 01/13] IB/core: Use SRCU when reading client_list or device_list
2015-05-10 10:26 ` [PATCH v3 for-next 01/13] IB/core: Use SRCU when reading client_list or device_list Haggai Eran
@ 2015-05-11 18:18 ` Jason Gunthorpe
2015-05-12 6:07 ` Haggai Eran
0 siblings, 1 reply; 46+ messages in thread
From: Jason Gunthorpe @ 2015-05-11 18:18 UTC (permalink / raw)
To: Haggai Eran
Cc: Doug Ledford, linux-rdma, netdev, Liran Liss, Guy Shapiro,
Shachar Raindel, Yotam Kenneth, Matan Barak
On Sun, May 10, 2015 at 01:26:32PM +0300, Haggai Eran wrote:
> Currently the RDMA subsystem's device list and client list are protected by
> a single mutex. This prevents adding user-facing APIs that iterate these
> lists, since using them may cause a deadlock. The patch attempts to solve
> this problem by adding an SRCU to protect the lists. Readers now don't need
> the mutex, and are safe just by using srcu_read_lock/unlock.
>
> The ib_register_device, ib_register_client, and ib_unregister_client
> functions are modified to only lock the device_mutex during their
> respective list modification, and use the SRCU for iteration on the other
> list. In ib_unregister_device, the client list iteration remains in the
> mutex critical section as it is done in reverse order.
>
> This patch attempts to solve a similar need [1] that was seen in the RoCE
> v2 patch series.
>
> [1] http://www.spinics.net/lists/linux-rdma/msg24733.html
So at first blush this looked reasonable, but, no it is racy:
ib_register_client:
mutex_lock(&device_mutex);
list_add_tail_rcu(&client->list, &client_list);
mutex_unlock(&device_mutex);
id = srcu_read_lock(&device_srcu);
list_for_each_entry_rcu(device, &device_list, core_list)
client->add(device);
ib_register_device:
mutex_lock(&device_mutex);
list_add_tail(&device->core_list, &device_list);
mutex_unlock(&device_mutex);
id = srcu_read_lock(&device_srcu);
list_for_each_entry_rcu(client, &client_list, list)
client->add(device);
So, if two threads call the two registers then the new client will
have its add called twice on the same device.
There are similar problems with removal.
I'm not sure RCU is the right way to approach this. The driver core
has the same basic task to perform, maybe review its locking
arrangement between the device list and driver list.
[Actually we probably should be using the driver core here, with IB
clients as device drivers, but that is way beyond scope of this..]
I had a bunch of other comments, but they are not relevant,
considering the above.
Jason
^ permalink raw reply [flat|nested] 46+ messages in thread
* Re: [PATCH v3 for-next 01/13] IB/core: Use SRCU when reading client_list or device_list
2015-05-11 18:18 ` Jason Gunthorpe
@ 2015-05-12 6:07 ` Haggai Eran
0 siblings, 0 replies; 46+ messages in thread
From: Haggai Eran @ 2015-05-12 6:07 UTC (permalink / raw)
To: Jason Gunthorpe
Cc: Doug Ledford, linux-rdma, netdev, Liran Liss, Guy Shapiro,
Shachar Raindel, Yotam Kenneth, Matan Barak
On 11/05/2015 21:18, Jason Gunthorpe wrote:
> So at first blush this looked reasonable, but, no it is racy:
>
> ib_register_client:
> mutex_lock(&device_mutex);
> list_add_tail_rcu(&client->list, &client_list);
> mutex_unlock(&device_mutex);
>
> id = srcu_read_lock(&device_srcu);
> list_for_each_entry_rcu(device, &device_list, core_list)
> client->add(device);
>
> ib_register_device:
> mutex_lock(&device_mutex);
> list_add_tail(&device->core_list, &device_list);
> mutex_unlock(&device_mutex);
>
> id = srcu_read_lock(&device_srcu);
> list_for_each_entry_rcu(client, &client_list, list)
> client->add(device);
>
> So, if two threads call the two registers then the new client will
> have it's add called twice on the same device.
>
> There are similar problems with removal.
Right. Sorry I missed that. Our first draft just kept the addition and
removal of elements to the device or client list under the mutex,
thinking that only the new code in this patchset that does traversal of
the lists would use the SRCU read lock. We then changed it thinking that
it would be better to make some use of this SRCU in this patch as well.
>
> I'm not sure RCU is the right way to approach this. The driver core
> has the same basic task to perform, maybe review it's locking
> arrangment between the device list and driver list.
>
> [Actually we probably should be using the driver core here, with IB
> clients as device drivers, but that is way beyond scope of this..]
So, I'm not very familiar with that code, but it seems that the main
difference is that in the core a single driver can be attached to a
device. The device_add function calls bus_add_device to add it to the
bus's list, and later calls bus_probe_device to find a driver,
without any lock between these calls. And bus_add_driver adds a new
driver to the bus's driver list and then calls driver_attach without any
lock between them. The only thing I see that makes sure a driver won't
be attached twice to a device is the dev->driver field which is updated
under the device lock during the probing process.
I guess a similar thing we can do is to rely on the context we associate
with a pair of a client and a device. If such a context exist, we don't
need to call client->add again. What do you think?
Haggai
^ permalink raw reply [flat|nested] 46+ messages in thread
* Re: [PATCH v3 for-next 01/13] IB/core: Use SRCU when reading client_list or device_list
@ 2015-05-12 6:07 ` Haggai Eran
0 siblings, 0 replies; 46+ messages in thread
From: Haggai Eran @ 2015-05-12 6:07 UTC (permalink / raw)
To: Jason Gunthorpe
Cc: Doug Ledford, linux-rdma, netdev, Liran Liss, Guy Shapiro,
Shachar Raindel, Yotam Kenneth, Matan Barak
On 11/05/2015 21:18, Jason Gunthorpe wrote:
> So at first blush this looked reasonable, but, no it is racy:
>
> ib_register_client:
> mutex_lock(&device_mutex);
> list_add_tail_rcu(&client->list, &client_list);
> mutex_unlock(&device_mutex);
>
> id = srcu_read_lock(&device_srcu);
> list_for_each_entry_rcu(device, &device_list, core_list)
> client->add(device);
>
> ib_register_device:
> mutex_lock(&device_mutex);
> list_add_tail(&device->core_list, &device_list);
> mutex_unlock(&device_mutex);
>
> id = srcu_read_lock(&device_srcu);
> list_for_each_entry_rcu(client, &client_list, list)
> client->add(device);
>
> So, if two threads call the two registers then the new client will
> have it's add called twice on the same device.
>
> There are similar problems with removal.
Right. Sorry I missed that. Our first draft just kept the addition and
removal of elements to the device or client list under the mutex,
thinking that only the new code in this patchset that does traversal of
the lists would use the SRCU read lock. We then changed it thinking that
it would be better to make some use of this SRCU in this patch as well.
>
> I'm not sure RCU is the right way to approach this. The driver core
> has the same basic task to perform, maybe review it's locking
> arrangment between the device list and driver list.
>
> [Actually we probably should be using the driver core here, with IB
> clients as device drivers, but that is way beyond scope of this..]
So, I'm not very familiar with that code, but it seems that the main
difference is that in the core a single driver can be attached to a
device. The device_add function calls bus_add_device to add it to the
bus's list, and later calls bus_probe_device to find a driver,
without any lock between these calls. And bus_add_driver adds a new
driver to the bus's driver list and then calls driver_attach without any
lock between them. The only thing I see that makes sure a driver won't
be attached twice to a device is the dev->driver field which is updated
under the device lock during the probing process.
I guess a similar thing we can do is to rely on the context we associate
with a pair of a client and a device. If such a context exist, we don't
need to call client->add again. What do you think?
Haggai
^ permalink raw reply [flat|nested] 46+ messages in thread
* Re: [PATCH v3 for-next 01/13] IB/core: Use SRCU when reading client_list or device_list
2015-05-12 6:07 ` Haggai Eran
(?)
@ 2015-05-12 18:23 ` Jason Gunthorpe
2015-05-13 8:10 ` Haggai Eran
-1 siblings, 1 reply; 46+ messages in thread
From: Jason Gunthorpe @ 2015-05-12 18:23 UTC (permalink / raw)
To: Haggai Eran
Cc: Doug Ledford, linux-rdma, netdev, Liran Liss, Guy Shapiro,
Shachar Raindel, Yotam Kenneth, Matan Barak
On Tue, May 12, 2015 at 09:07:51AM +0300, Haggai Eran wrote:
> > I'm not sure RCU is the right way to approach this. The driver core
> > has the same basic task to perform, maybe review it's locking
> > arrangment between the device list and driver list.
> >
> > [Actually we probably should be using the driver core here, with IB
> > clients as device drivers, but that is way beyond scope of this..]
>
> So, I'm not very familiar with that code, but it seems that the main
> difference is that in the core a single driver can be attached to a
> device.
Roughly, a bus (IB would be a bus) has devices attached to it, and
devices have drivers attached to them. Bus:Device is 1:N,
Device:Driver is 1:1.
There are a couple of reasons to be interested in re-using the driver
core, perhaps the best is that it has all the infrastructure to let us
auto-load client modules...
> I guess a similar thing we can do is to rely on the context we associate
> with a pair of a client and a device. If such a context exist, we don't
> need to call client->add again. What do you think?
I didn't look closely, isn't this enough?
device_register:
mutex_lock(client_mutex);
down_write(devices_rwsem);
list_add(device_list,dev,..);
up_write(devices_rwsem);
/* Caller must prevent device_register/unregister concurrency on the
same dev */
foreach(client_list)
.. client->add(dev,...) ..
mutex_unlock(client_mutex)
client_register:
mutex_lock(client_mutex)
list_add(client_list,new_client..)
down_read(devices_rwsem);
foreach(device_list)
.. client->add(dev,new_client,..) ..
up_read(devices_rwsem);
mutex_unlock(client_mutex)
[Note, I didn't check this carefully, just intuitively seems like a
sane starting point]
Jason
^ permalink raw reply [flat|nested] 46+ messages in thread
* Re: [PATCH v3 for-next 01/13] IB/core: Use SRCU when reading client_list or device_list
2015-05-12 18:23 ` Jason Gunthorpe
@ 2015-05-13 8:10 ` Haggai Eran
0 siblings, 0 replies; 46+ messages in thread
From: Haggai Eran @ 2015-05-13 8:10 UTC (permalink / raw)
To: Jason Gunthorpe
Cc: Doug Ledford, linux-rdma, netdev, Liran Liss, Guy Shapiro,
Shachar Raindel, Yotam Kenneth, Matan Barak
On 12/05/2015 21:23, Jason Gunthorpe wrote:
> On Tue, May 12, 2015 at 09:07:51AM +0300, Haggai Eran wrote:
>>> I'm not sure RCU is the right way to approach this. The driver core
>>> has the same basic task to perform, maybe review it's locking
>>> arrangment between the device list and driver list.
>>>
>>> [Actually we probably should be using the driver core here, with IB
>>> clients as device drivers, but that is way beyond scope of this..]
>>
>> So, I'm not very familiar with that code, but it seems that the main
>> difference is that in the core a single driver can be attached to a
>> device.
>
> Roughly, a bus (IB would be a bus) has devices attached to it, and
> devices have drivers attached to them. Bus:Device is 1:N,
> Device:Drvier is 1:1.
>
> There a a couple of reasons to be interested in re-using the driver
> core, perhaps the best is that it has all the infrastructure to let us
> auto-load client modules...
So you want an ib_core bus and each client is a "device" or a driver on
that bus? But this doesn't get you the relation between clients and ib
devices.
>
>> I guess a similar thing we can do is to rely on the context we associate
>> with a pair of a client and a device. If such a context exist, we don't
>> need to call client->add again. What do you think?
>
> I didn't look closely, isn't this enough?
>
> device_register:
> mutex_lock(client_mutex);
> down_write(devices_rwsem);
> list_add(device_list,dev,..);
> up_write(devices_rwsem);
>
> /* Caller must prevent device_register/unregister concurrancy on the
> same dev */
>
> foreach(client_list)
> .. client->add(dev,...) ..
>
> mutex_unlock(client_mutex)
>
> client_register:
> mutex_lock(client_mutex)
> list_add(client_list,new_client..)
> down_read(devices_rwsem);
> foreach(device_list)
> .. client->add(dev,new_client,..) ..
> up_read(devices_rwsem);
> mutex_unlock(client_mutex)
>
> [Note, I didn't check this carefully, just intuitively seems like a
> sane starting point]
That could perhaps work for the RoCEv2 patch-set, as their deadlock
relates to iterating the device list. This patch set however does an
iteration on the client list (patch 3). Because a client remove could
block on this iteration, you can still get a deadlock.
I think I prefer keeping the single device_mutex and the SRCU, but
keeping the device_mutex locked as it is today, covering all of the
registration and unregistration code. Only the new code that reads the
client list or the device list without modification to the other list
will use the SRCU read lock.
Haggai
^ permalink raw reply [flat|nested] 46+ messages in thread
* Re: [PATCH v3 for-next 01/13] IB/core: Use SRCU when reading client_list or device_list
@ 2015-05-13 8:10 ` Haggai Eran
0 siblings, 0 replies; 46+ messages in thread
From: Haggai Eran @ 2015-05-13 8:10 UTC (permalink / raw)
To: Jason Gunthorpe
Cc: Doug Ledford, linux-rdma, netdev, Liran Liss, Guy Shapiro,
Shachar Raindel, Yotam Kenneth, Matan Barak
On 12/05/2015 21:23, Jason Gunthorpe wrote:
> On Tue, May 12, 2015 at 09:07:51AM +0300, Haggai Eran wrote:
>>> I'm not sure RCU is the right way to approach this. The driver core
>>> has the same basic task to perform, maybe review it's locking
>>> arrangment between the device list and driver list.
>>>
>>> [Actually we probably should be using the driver core here, with IB
>>> clients as device drivers, but that is way beyond scope of this..]
>>
>> So, I'm not very familiar with that code, but it seems that the main
>> difference is that in the core a single driver can be attached to a
>> device.
>
> Roughly, a bus (IB would be a bus) has devices attached to it, and
> devices have drivers attached to them. Bus:Device is 1:N,
> Device:Drvier is 1:1.
>
> There a a couple of reasons to be interested in re-using the driver
> core, perhaps the best is that it has all the infrastructure to let us
> auto-load client modules...
So you want an ib_core bus and each client is a "device" or a driver on
that bus? But this doesn't get you the relation between clients and ib
devices.
>
>> I guess a similar thing we can do is to rely on the context we associate
>> with a pair of a client and a device. If such a context exist, we don't
>> need to call client->add again. What do you think?
>
> I didn't look closely, isn't this enough?
>
> device_register:
> mutex_lock(client_mutex);
> down_write(devices_rwsem);
> list_add(device_list,dev,..);
> up_write(devices_rwsem);
>
> /* Caller must prevent device_register/unregister concurrancy on the
> same dev */
>
> foreach(client_list)
> .. client->add(dev,...) ..
>
> mutex_unlock(client_mutex)
>
> client_register:
> mutex_lock(client_mutex)
> list_add(client_list,new_client..)
> down_read(devices_rwsem);
> foreach(device_list)
> .. client->add(dev,new_client,..) ..
> up_read(devices_rwsem);
> mutex_unlock(client_mutex)
>
> [Note, I didn't check this carefully, just intuitively seems like a
> sane starting point]
That could perhaps work for the RoCEv2 patch-set, as their deadlock
relates to iterating the device list. This patch set however does an
iteration on the client list (patch 3). Because a client remove could
block on this iteration, you can still get a deadlock.
I think I prefer keeping the single device_mutex and the SRCU, but
keeping the device_mutex locked as it is today, covering all of the
registration and unregistration code. Only the new code that reads the
client list or the device list without modification to the other list
will use the SRCU read lock.
Haggai
^ permalink raw reply [flat|nested] 46+ messages in thread
[parent not found: <555306E7.9020000-VPRAkNaXOzVWk0Htik3J/w@public.gmane.org>]
* Re: [PATCH v3 for-next 01/13] IB/core: Use SRCU when reading client_list or device_list
[not found] ` <555306E7.9020000-VPRAkNaXOzVWk0Htik3J/w@public.gmane.org>
@ 2015-05-13 15:57 ` Jason Gunthorpe
2015-05-14 10:35 ` Haggai Eran
0 siblings, 1 reply; 46+ messages in thread
From: Jason Gunthorpe @ 2015-05-13 15:57 UTC (permalink / raw)
To: Haggai Eran
Cc: Doug Ledford, linux-rdma-u79uwXL29TY76Z2rM5mHXA,
netdev-u79uwXL29TY76Z2rM5mHXA, Liran Liss, Guy Shapiro,
Shachar Raindel, Yotam Kenneth, Matan Barak
On Wed, May 13, 2015 at 11:10:15AM +0300, Haggai Eran wrote:
> >> I guess a similar thing we can do is to rely on the context we associate
> >> with a pair of a client and a device. If such a context exist, we don't
> >> need to call client->add again. What do you think?
> >
> > I didn't look closely, isn't this enough?
> >
> > device_register:
> > mutex_lock(client_mutex);
> > down_write(devices_rwsem);
> > list_add(device_list,dev,..);
> > up_write(devices_rwsem);
> >
> > /* Caller must prevent device_register/unregister concurrancy on the
> > same dev */
> >
> > foreach(client_list)
> > .. client->add(dev,...) ..
> >
> > mutex_unlock(client_mutex)
> >
> > client_register:
> > mutex_lock(client_mutex)
> > list_add(client_list,new_client..)
> > down_read(devices_rwsem);
> > foreach(device_list)
> > .. client->add(dev,new_client,..) ..
> > up_read(devices_rwsem);
> > mutex_unlock(client_mutex)
> >
> > [Note, I didn't check this carefully, just intuitively seems like a
> > sane starting point]
>
> That could perhaps work for the RoCEv2 patch-set, as their deadlock
> relates to iterating the device list. This patch set however does an
> iteration on the client list (patch 3). Because a client remove could
> block on this iteration, you can still get a deadlock.
Really? Gross.
Still, I think you got it right in your analysis, but we don't need RCU:
device_register:
mutex_lock(modify_mutex);
down_write(lists_rwsem);
list_add(device_list,dev,..);
up_write(lists_rwsem);
// implied by modify_mutex: down_read(lists_rwsem)
foreach(client_list)
.. client->add(dev,...) ..
mutex_unlock(modify_mutex)
client_register:
mutex_lock(modify_mutex);
// implied by modify_mutex: down_read(lists_rwsem)
foreach(device_list)
.. client->add(dev,new_client...) ..
down_write(lists_rwsem);
list_add(client_list,new_client..);
up_write(lists_rwsem);
mutex_unlock(modify_mutex)
client_unregister:
mutex_lock(modify_mutex);
down_write(lists_rwsem);
list_cut(client_list,..rm_client..);
up_write(lists_rwsem);
// implied by modify_mutex: down_read(lists_rwsem)
foreach(device_list)
.. client->remove(dev,rm_client...) ..
mutex_unlock(modify_mutex)
etc. Notice the ordering.
> I think I prefer keeping the single device_mutex and the SRCU, but
> keeping the device_mutex locked as it is today, covering all of the
> registration and unregistration code. Only the new code that reads the
> client list or the device list without modification to the other list
> will use the SRCU read lock.
In this case, I don't see a justification to use RCU, we'd need a
high read load before optimizing the rwsem into RCU would make
sense.
I'm not sure you should ever use RCU until you've designed the locking
using traditional means.
Jason
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
^ permalink raw reply [flat|nested] 46+ messages in thread
* Re: [PATCH v3 for-next 01/13] IB/core: Use SRCU when reading client_list or device_list
2015-05-13 15:57 ` Jason Gunthorpe
@ 2015-05-14 10:35 ` Haggai Eran
0 siblings, 0 replies; 46+ messages in thread
From: Haggai Eran @ 2015-05-14 10:35 UTC (permalink / raw)
To: Jason Gunthorpe
Cc: Doug Ledford, linux-rdma, netdev, Liran Liss, Guy Shapiro,
Shachar Raindel, Yotam Kenneth, Matan Barak
On 13/05/2015 18:57, Jason Gunthorpe wrote:
> On Wed, May 13, 2015 at 11:10:15AM +0300, Haggai Eran wrote:
>
>>>> I guess a similar thing we can do is to rely on the context we associate
>>>> with a pair of a client and a device. If such a context exist, we don't
>>>> need to call client->add again. What do you think?
>>>
>>> I didn't look closely, isn't this enough?
>>>
>>> device_register:
>>> mutex_lock(client_mutex);
>>> down_write(devices_rwsem);
>>> list_add(device_list,dev,..);
>>> up_write(devices_rwsem);
>>>
>>> /* Caller must prevent device_register/unregister concurrancy on the
>>> same dev */
>>>
>>> foreach(client_list)
>>> .. client->add(dev,...) ..
>>>
>>> mutex_unlock(client_mutex)
>>>
>>> client_register:
>>> mutex_lock(client_mutex)
>>> list_add(client_list,new_client..)
>>> down_read(devices_rwsem);
>>> foreach(device_list)
>>> .. client->add(dev,new_client,..) ..
>>> up_read(devices_rwsem);
>>> mutex_unlock(client_mutex)
>>>
>>> [Note, I didn't check this carefully, just intuitively seems like a
>>> sane starting point]
>>
>> That could perhaps work for the RoCEv2 patch-set, as their deadlock
>> relates to iterating the device list. This patch set however does an
>> iteration on the client list (patch 3). Because a client remove could
>> block on this iteration, you can still get a deadlock.
>
> Really? Gross.
>
> Still, I think you got it right in your analysis, but we don't need RCU:
>
> device_register:
> mutex_lock(modify_mutex);
> down_write(lists_rwsem);
> list_add(device_list,dev,..);
> up_write(lists_rwsem);
>
> // implied by modify_mutex: down_read(lists_rwsem)
> foreach(client_list)
> .. client->add(dev,...) ..
> mutex_unlock(modify_mutex)
>
> client_register:
> mutex_lock(modify_mutex);
> // implied by modify_mutex: down_read(lists_rwsem)
> foreach(device_list)
> .. client->add(dev,new_client...) ..
>
> down_write(lists_rwsem);
> list_add(client_list,new_client..);
> up_write(lists_rwsem);
>
> mutex_unlock(modify_mutex)
>
> client_unregister:
> mutex_lock(modify_mutex);
> down_write(lists_rwsem);
> list_cut(client_list,..rm_client..);
> up_write(lists_rwsem);
>
> // implied by modify_mutex: down_read(lists_rwsem)
> foreach(device_list)
> .. client->remove(dev,rm_client...) ..
>
> mutex_unlock(modify_mutex)
>
> etc. Notice the ordering.
>
Looks good. I'll use it in the next version of the patch.
^ permalink raw reply [flat|nested] 46+ messages in thread
* Re: [PATCH v3 for-next 01/13] IB/core: Use SRCU when reading client_list or device_list
@ 2015-05-14 10:35 ` Haggai Eran
0 siblings, 0 replies; 46+ messages in thread
From: Haggai Eran @ 2015-05-14 10:35 UTC (permalink / raw)
To: Jason Gunthorpe
Cc: Doug Ledford, linux-rdma, netdev, Liran Liss, Guy Shapiro,
Shachar Raindel, Yotam Kenneth, Matan Barak
On 13/05/2015 18:57, Jason Gunthorpe wrote:
> On Wed, May 13, 2015 at 11:10:15AM +0300, Haggai Eran wrote:
>
>>>> I guess a similar thing we can do is to rely on the context we associate
>>>> with a pair of a client and a device. If such a context exist, we don't
>>>> need to call client->add again. What do you think?
>>>
>>> I didn't look closely, isn't this enough?
>>>
>>> device_register:
>>> mutex_lock(client_mutex);
>>> down_write(devices_rwsem);
>>> list_add(device_list,dev,..);
>>> up_write(devices_rwsem);
>>>
>>> /* Caller must prevent device_register/unregister concurrancy on the
>>> same dev */
>>>
>>> foreach(client_list)
>>> .. client->add(dev,...) ..
>>>
>>> mutex_unlock(client_mutex)
>>>
>>> client_register:
>>> mutex_lock(client_mutex)
>>> list_add(client_list,new_client..)
>>> down_read(devices_rwsem);
>>> foreach(device_list)
>>> .. client->add(dev,new_client,..) ..
>>> up_read(devices_rwsem);
>>> mutex_unlock(client_mutex)
>>>
>>> [Note, I didn't check this carefully, just intuitively seems like a
>>> sane starting point]
>>
>> That could perhaps work for the RoCEv2 patch-set, as their deadlock
>> relates to iterating the device list. This patch set however does an
>> iteration on the client list (patch 3). Because a client remove could
>> block on this iteration, you can still get a deadlock.
>
> Really? Gross.
>
> Still, I think you got it right in your analysis, but we don't need RCU:
>
> device_register:
> mutex_lock(modify_mutex);
> down_write(lists_rwsem);
> list_add(device_list,dev,..);
> up_write(lists_rwsem);
>
> // implied by modify_mutex: down_read(lists_rwsem)
> foreach(client_list)
> .. client->add(dev,...) ..
> mutex_unlock(modify_mutex)
>
> client_register:
> mutex_lock(modify_mutex);
> // implied by modify_mutex: down_read(lists_rwsem)
> foreach(device_list)
> .. client->add(dev,new_client...) ..
>
> down_write(lists_rwsem);
> list_add(client_list,new_client..);
> up_write(lists_rwsem);
>
> mutex_unlock(modify_mutex)
>
> client_unregister:
> mutex_lock(modify_mutex);
> down_write(lists_rwsem);
> list_cut(client_list,..rm_client..);
> up_write(lists_rwsem);
>
> // implied by modify_mutex: down_read(lists_rwsem)
> foreach(device_list)
> .. client->remove(dev,rm_client...) ..
>
> mutex_unlock(modify_mutex)
>
> etc. Notice the ordering.
>
Looks good. I'll use it in the next version of the patch.
^ permalink raw reply [flat|nested] 46+ messages in thread
* [PATCH v3 for-next 02/13] IB/addr: Pass network namespace as a parameter
2015-05-10 10:26 [PATCH v3 for-next 00/13] Add network namespace support in the RDMA-CM Haggai Eran
2015-05-10 10:26 ` [PATCH v3 for-next 01/13] IB/core: Use SRCU when reading client_list or device_list Haggai Eran
@ 2015-05-10 10:26 ` Haggai Eran
2015-05-10 10:26 ` [PATCH v3 for-next 05/13] IB/cm: Reference count ib_cm_ids Haggai Eran
` (8 subsequent siblings)
10 siblings, 0 replies; 46+ messages in thread
From: Haggai Eran @ 2015-05-10 10:26 UTC (permalink / raw)
To: Doug Ledford
Cc: linux-rdma, netdev, Liran Liss, Guy Shapiro, Shachar Raindel,
Yotam Kenneth, Haggai Eran
From: Guy Shapiro <guysh@mellanox.com>
Add network namespace support to the ib_addr module. For that, all the
address resolution and matching should be done using the appropriate
namespace instead of init_net.
This is achieved by:
1. Adding an explicit network namespace argument to exported functions that
require a namespace.
2. Saving the namespace in the rdma_addr_client structure.
3. Using it when calling networking functions.
In order to preserve the behavior of calling modules, &init_net is
passed as the parameter in calls from other modules. This is modified as
namespace support is added on more levels.
Signed-off-by: Haggai Eran <haggaie@mellanox.com>
Signed-off-by: Yotam Kenneth <yotamke@mellanox.com>
Signed-off-by: Shachar Raindel <raindel@mellanox.com>
Signed-off-by: Guy Shapiro <guysh@mellanox.com>
---
drivers/infiniband/core/addr.c | 18 ++++++++++--------
drivers/infiniband/core/cma.c | 1 +
include/rdma/ib_addr.h | 16 +++++++++++++++-
3 files changed, 26 insertions(+), 9 deletions(-)
diff --git a/drivers/infiniband/core/addr.c b/drivers/infiniband/core/addr.c
index f80da50d84a5..5998dd11256d 100644
--- a/drivers/infiniband/core/addr.c
+++ b/drivers/infiniband/core/addr.c
@@ -128,7 +128,7 @@ int rdma_translate_ip(struct sockaddr *addr, struct rdma_dev_addr *dev_addr,
int ret = -EADDRNOTAVAIL;
if (dev_addr->bound_dev_if) {
- dev = dev_get_by_index(&init_net, dev_addr->bound_dev_if);
+ dev = dev_get_by_index(dev_addr->net, dev_addr->bound_dev_if);
if (!dev)
return -ENODEV;
ret = rdma_copy_addr(dev_addr, dev, NULL);
@@ -138,7 +138,7 @@ int rdma_translate_ip(struct sockaddr *addr, struct rdma_dev_addr *dev_addr,
switch (addr->sa_family) {
case AF_INET:
- dev = ip_dev_find(&init_net,
+ dev = ip_dev_find(dev_addr->net,
((struct sockaddr_in *) addr)->sin_addr.s_addr);
if (!dev)
@@ -149,12 +149,11 @@ int rdma_translate_ip(struct sockaddr *addr, struct rdma_dev_addr *dev_addr,
*vlan_id = rdma_vlan_dev_vlan_id(dev);
dev_put(dev);
break;
-
#if IS_ENABLED(CONFIG_IPV6)
case AF_INET6:
rcu_read_lock();
- for_each_netdev_rcu(&init_net, dev) {
- if (ipv6_chk_addr(&init_net,
+ for_each_netdev_rcu(dev_addr->net, dev) {
+ if (ipv6_chk_addr(dev_addr->net,
&((struct sockaddr_in6 *) addr)->sin6_addr,
dev, 1)) {
ret = rdma_copy_addr(dev_addr, dev, NULL);
@@ -236,7 +235,7 @@ static int addr4_resolve(struct sockaddr_in *src_in,
fl4.daddr = dst_ip;
fl4.saddr = src_ip;
fl4.flowi4_oif = addr->bound_dev_if;
- rt = ip_route_output_key(&init_net, &fl4);
+ rt = ip_route_output_key(addr->net, &fl4);
if (IS_ERR(rt)) {
ret = PTR_ERR(rt);
goto out;
@@ -278,12 +277,13 @@ static int addr6_resolve(struct sockaddr_in6 *src_in,
fl6.saddr = src_in->sin6_addr;
fl6.flowi6_oif = addr->bound_dev_if;
- dst = ip6_route_output(&init_net, NULL, &fl6);
+ dst = ip6_route_output(addr->net, NULL, &fl6);
if ((ret = dst->error))
goto put;
if (ipv6_addr_any(&fl6.saddr)) {
- ret = ipv6_dev_get_saddr(&init_net, ip6_dst_idev(dst)->dev,
+ ret = ipv6_dev_get_saddr(addr->net,
+ ip6_dst_idev(dst)->dev,
&fl6.daddr, 0, &fl6.saddr);
if (ret)
goto put;
@@ -481,6 +481,7 @@ int rdma_addr_find_dmac_by_grh(union ib_gid *sgid, union ib_gid *dgid, u8 *dmac,
return ret;
memset(&dev_addr, 0, sizeof(dev_addr));
+ dev_addr.net = &init_net;
ctx.addr = &dev_addr;
init_completion(&ctx.comp);
@@ -517,6 +518,7 @@ int rdma_addr_find_smac_by_sgid(union ib_gid *sgid, u8 *smac, u16 *vlan_id)
if (ret)
return ret;
memset(&dev_addr, 0, sizeof(dev_addr));
+ dev_addr.net = &init_net;
ret = rdma_translate_ip(&gid_addr._sockaddr, &dev_addr, vlan_id);
if (ret)
return ret;
diff --git a/drivers/infiniband/core/cma.c b/drivers/infiniband/core/cma.c
index 5dcc61833557..c06d181d69c4 100644
--- a/drivers/infiniband/core/cma.c
+++ b/drivers/infiniband/core/cma.c
@@ -524,6 +524,7 @@ struct rdma_cm_id *rdma_create_id(rdma_cm_event_handler event_handler,
INIT_LIST_HEAD(&id_priv->listen_list);
INIT_LIST_HEAD(&id_priv->mc_list);
get_random_bytes(&id_priv->seq_num, sizeof id_priv->seq_num);
+ id_priv->id.route.addr.dev_addr.net = &init_net;
return &id_priv->id;
}
diff --git a/include/rdma/ib_addr.h b/include/rdma/ib_addr.h
index ce55906b54a0..7d7831de2040 100644
--- a/include/rdma/ib_addr.h
+++ b/include/rdma/ib_addr.h
@@ -47,6 +47,7 @@
#include <rdma/ib_verbs.h>
#include <rdma/ib_pack.h>
#include <net/ipv6.h>
+#include <net/net_namespace.h>
struct rdma_addr_client {
atomic_t refcount;
@@ -64,6 +65,16 @@ void rdma_addr_register_client(struct rdma_addr_client *client);
*/
void rdma_addr_unregister_client(struct rdma_addr_client *client);
+/**
+ * struct rdma_dev_addr - Contains resolved RDMA hardware addresses
+ * @src_dev_addr: Source MAC address.
+ * @dst_dev_addr: Destination MAC address.
+ * @broadcast: Broadcast address of the device.
+ * @dev_type: The interface hardware type of the device.
+ * @bound_dev_if: An optional device interface index.
+ * @transport: The transport type used.
+ * @net: Network namespace containing the bound_dev_if net_dev.
+ */
struct rdma_dev_addr {
unsigned char src_dev_addr[MAX_ADDR_LEN];
unsigned char dst_dev_addr[MAX_ADDR_LEN];
@@ -71,11 +82,14 @@ struct rdma_dev_addr {
unsigned short dev_type;
int bound_dev_if;
enum rdma_transport_type transport;
+ struct net *net;
};
/**
* rdma_translate_ip - Translate a local IP address to an RDMA hardware
* address.
+ *
+ * The dev_addr->net field must be initialized.
*/
int rdma_translate_ip(struct sockaddr *addr, struct rdma_dev_addr *dev_addr,
u16 *vlan_id);
@@ -90,7 +104,7 @@ int rdma_translate_ip(struct sockaddr *addr, struct rdma_dev_addr *dev_addr,
* @dst_addr: The destination address to resolve.
* @addr: A reference to a data location that will receive the resolved
* addresses. The data location must remain valid until the callback has
- * been invoked.
+ * been invoked. The net field of the addr struct must be valid.
* @timeout_ms: Amount of time to wait for the address resolution to complete.
* @callback: Call invoked once address resolution has completed, timed out,
* or been canceled. A status of 0 indicates success.
--
1.7.11.2
^ permalink raw reply related [flat|nested] 46+ messages in thread
* [PATCH v3 for-next 05/13] IB/cm: Reference count ib_cm_ids
2015-05-10 10:26 [PATCH v3 for-next 00/13] Add network namespace support in the RDMA-CM Haggai Eran
2015-05-10 10:26 ` [PATCH v3 for-next 01/13] IB/core: Use SRCU when reading client_list or device_list Haggai Eran
2015-05-10 10:26 ` [PATCH v3 for-next 02/13] IB/addr: Pass network namespace as a parameter Haggai Eran
@ 2015-05-10 10:26 ` Haggai Eran
[not found] ` <1431253604-9214-6-git-send-email-haggaie-VPRAkNaXOzVWk0Htik3J/w@public.gmane.org>
[not found] ` <1431253604-9214-1-git-send-email-haggaie-VPRAkNaXOzVWk0Htik3J/w@public.gmane.org>
` (7 subsequent siblings)
10 siblings, 1 reply; 46+ messages in thread
From: Haggai Eran @ 2015-05-10 10:26 UTC (permalink / raw)
To: Doug Ledford
Cc: linux-rdma, netdev, Liran Liss, Guy Shapiro, Shachar Raindel,
Yotam Kenneth, Haggai Eran
Add reference count (kref) to the ib_cm_id to allow automatic destruction
of an ib_cm_id. This will allow multiple RDMA CM IDs to use a single
ib_cm_id when they are on different network namespaces.
Signed-off-by: Haggai Eran <haggaie@mellanox.com>
---
drivers/infiniband/core/cm.c | 41 +++++++++++++++++++++++++++++++++++++----
include/rdma/ib_cm.h | 10 +++++++---
2 files changed, 44 insertions(+), 7 deletions(-)
diff --git a/drivers/infiniband/core/cm.c b/drivers/infiniband/core/cm.c
index 08b18044552a..6b68402fd6df 100644
--- a/drivers/infiniband/core/cm.c
+++ b/drivers/infiniband/core/cm.c
@@ -711,6 +711,7 @@ struct ib_cm_id *ib_create_cm_id(struct ib_device *device,
cm_id_priv->id.cm_handler = cm_handler;
cm_id_priv->id.context = context;
cm_id_priv->id.remote_cm_qpn = 1;
+ kref_init(&cm_id_priv->id.ref);
ret = cm_alloc_id(cm_id_priv);
if (ret)
goto error;
@@ -921,10 +922,42 @@ retest:
kfree(cm_id_priv);
}
-void ib_destroy_cm_id(struct ib_cm_id *cm_id)
+static void __ib_destroy_cm_id(struct kref *ref)
{
+ struct ib_cm_id *cm_id = container_of(ref, struct ib_cm_id, ref);
+
cm_destroy_id(cm_id, 0);
}
+
+/**
+ * ib_cm_id_get - Increase the reference count on an existing ib_cm_id.
+ * @cm_id: Connection identifier to take reference of
+ */
+void ib_cm_id_get(struct ib_cm_id *cm_id)
+{
+ kref_get(&cm_id->ref);
+}
+EXPORT_SYMBOL(ib_cm_id_get);
+
+/**
+ * ib_cm_id_put - Release a connection identifier.
+ * @cm_id: Connection identifier to release
+ *
+ * This call may block until the connection identifier is destroyed.
+ * Returns 1 if the ib_cm_id has been destroyed
+ */
+int ib_cm_id_put(struct ib_cm_id *cm_id)
+{
+ return kref_put(&cm_id->ref, __ib_destroy_cm_id);
+}
+EXPORT_SYMBOL(ib_cm_id_put);
+
+void ib_destroy_cm_id(struct ib_cm_id *cm_id)
+{
+ int destroyed = ib_cm_id_put(cm_id);
+
+ WARN_ON_ONCE(!destroyed);
+}
EXPORT_SYMBOL(ib_destroy_cm_id);
int ib_cm_listen(struct ib_cm_id *cm_id, __be64 service_id, __be64 service_mask,
@@ -1603,7 +1636,7 @@ rejected:
atomic_dec(&cm_id_priv->refcount);
cm_deref_id(listen_cm_id_priv);
destroy:
- ib_destroy_cm_id(cm_id);
+ ib_cm_id_put(cm_id);
return ret;
}
@@ -3038,7 +3071,7 @@ static int cm_sidr_req_handler(struct cm_work *work)
cm_deref_id(cur_cm_id_priv);
return 0;
out:
- ib_destroy_cm_id(&cm_id_priv->id);
+ ib_cm_id_put(&cm_id_priv->id);
return -EINVAL;
}
@@ -3197,7 +3230,7 @@ static void cm_process_send_error(struct ib_mad_send_buf *msg,
ret = cm_id_priv->id.cm_handler(&cm_id_priv->id, &cm_event);
cm_free_msg(msg);
if (ret)
- ib_destroy_cm_id(&cm_id_priv->id);
+ ib_cm_id_put(&cm_id_priv->id);
return;
discard:
spin_unlock_irq(&cm_id_priv->lock);
diff --git a/include/rdma/ib_cm.h b/include/rdma/ib_cm.h
index 39ed2d2fbd51..22da34de3bc7 100644
--- a/include/rdma/ib_cm.h
+++ b/include/rdma/ib_cm.h
@@ -287,9 +287,9 @@ struct ib_cm_event {
* IB_CM_REQ_RECEIVED and all other events, the returned @cm_id corresponds
* to a user's existing communication identifier.
*
- * Users may not call ib_destroy_cm_id while in the context of this callback;
- * however, returning a non-zero value instructs the communication manager to
- * destroy the @cm_id after the callback completes.
+ * Users may not call ib_destroy_cm_id or ib_cm_id_put while in the context of
+ * this callback; however, returning a non-zero value instructs the
+ * communication manager to destroy the @cm_id after the callback completes.
*/
typedef int (*ib_cm_handler)(struct ib_cm_id *cm_id,
struct ib_cm_event *event);
@@ -305,6 +305,7 @@ struct ib_cm_id {
__be32 local_id;
__be32 remote_id;
u32 remote_cm_qpn; /* 1 unless redirected */
+ struct kref ref;
};
/**
@@ -330,6 +331,9 @@ struct ib_cm_id *ib_create_cm_id(struct ib_device *device,
*/
void ib_destroy_cm_id(struct ib_cm_id *cm_id);
+void ib_cm_id_get(struct ib_cm_id *cm_id);
+int ib_cm_id_put(struct ib_cm_id *cm_id);
+
#define IB_SERVICE_ID_AGN_MASK cpu_to_be64(0xFF00000000000000ULL)
#define IB_CM_ASSIGN_SERVICE_ID cpu_to_be64(0x0200000000000000ULL)
#define IB_CMA_SERVICE_ID cpu_to_be64(0x0000000001000000ULL)
--
1.7.11.2
^ permalink raw reply related [flat|nested] 46+ messages in thread
[parent not found: <1431253604-9214-1-git-send-email-haggaie-VPRAkNaXOzVWk0Htik3J/w@public.gmane.org>]
* [PATCH v3 for-next 03/13] IB/core: Find the network namespace matching connection parameters
[not found] ` <1431253604-9214-1-git-send-email-haggaie-VPRAkNaXOzVWk0Htik3J/w@public.gmane.org>
@ 2015-05-10 10:26 ` Haggai Eran
2015-05-10 10:26 ` [PATCH v3 for-next 04/13] IB/ipoib: Return IPoIB devices " Haggai Eran
` (2 subsequent siblings)
3 siblings, 0 replies; 46+ messages in thread
From: Haggai Eran @ 2015-05-10 10:26 UTC (permalink / raw)
To: Doug Ledford
Cc: linux-rdma-u79uwXL29TY76Z2rM5mHXA, netdev-u79uwXL29TY76Z2rM5mHXA,
Liran Liss, Guy Shapiro, Shachar Raindel, Yotam Kenneth,
Haggai Eran
From: Yotam Kenneth <yotamke-VPRAkNaXOzVWk0Htik3J/w@public.gmane.org>
In the case of IPoIB, and maybe in other cases, the network device is
managed by an upper-layer protocol (ULP). In order to expose this
network device to other users of the IB device, let ULPs implement
a callback that returns a network device according to connection parameters.
The IB device and port, together with the P_Key and the IP address should
be enough to uniquely identify the ULP net device.
This function is passed to ib_core as part of the ib_client
registration.
Using this functionality, add a way to get the network namespace
corresponding to a work completion. This is needed so that responses to CM
requests can be sent from the same network namespace as the request.
Signed-off-by: Haggai Eran <haggaie-VPRAkNaXOzVWk0Htik3J/w@public.gmane.org>
Signed-off-by: Yotam Kenneth <yotamke-VPRAkNaXOzVWk0Htik3J/w@public.gmane.org>
Signed-off-by: Shachar Raindel <raindel-VPRAkNaXOzVWk0Htik3J/w@public.gmane.org>
Signed-off-by: Guy Shapiro <guysh-VPRAkNaXOzVWk0Htik3J/w@public.gmane.org>
---
drivers/infiniband/core/device.c | 53 ++++++++++++++++++++++++++++++++++++++++
include/rdma/ib_verbs.h | 33 +++++++++++++++++++++++++
2 files changed, 86 insertions(+)
diff --git a/drivers/infiniband/core/device.c b/drivers/infiniband/core/device.c
index 7d90b2ca2eba..3b9e80ce0b42 100644
--- a/drivers/infiniband/core/device.c
+++ b/drivers/infiniband/core/device.c
@@ -38,6 +38,7 @@
#include <linux/slab.h>
#include <linux/init.h>
#include <linux/mutex.h>
+#include <linux/netdevice.h>
#include <rdma/rdma_netlink.h>
#include "core_priv.h"
@@ -760,6 +761,58 @@ int ib_find_pkey(struct ib_device *device,
}
EXPORT_SYMBOL(ib_find_pkey);
+static struct net_device *ib_get_net_dev_by_port_pkey_ip(struct ib_device *dev,
+ u8 port,
+ u16 pkey,
+ struct sockaddr *addr)
+{
+ struct net_device *ret = NULL;
+ struct ib_client *client;
+ int id;
+
+ id = srcu_read_lock(&device_srcu);
+
+ list_for_each_entry_rcu(client, &client_list, list)
+ if (client->get_net_device_by_port_pkey_ip) {
+ ret = client->get_net_device_by_port_pkey_ip(dev, port,
+ pkey,
+ addr);
+ if (ret)
+ break;
+ }
+
+ srcu_read_unlock(&device_srcu, id);
+
+ return ret;
+}
+
+struct net *ib_get_net_ns_by_port_pkey_ip(struct ib_device *dev,
+ u8 port,
+ u16 pkey,
+ struct sockaddr *addr)
+{
+ struct net_device *ndev = NULL;
+ struct net *ns;
+
+ if (rdma_protocol_ib(dev, port))
+ ndev = ib_get_net_dev_by_port_pkey_ip(dev, port, pkey, addr);
+
+ if (!ndev)
+ goto not_found;
+
+ rcu_read_lock();
+ ns = maybe_get_net(dev_net(ndev));
+ dev_put(ndev);
+ rcu_read_unlock();
+ if (!ns)
+ goto not_found;
+ return ns;
+
+not_found:
+ return get_net(&init_net);
+}
+EXPORT_SYMBOL(ib_get_net_ns_by_port_pkey_ip);
+
static int __init ib_core_init(void)
{
int ret;
diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h
index c7241149f142..7baf3c40ef97 100644
--- a/include/rdma/ib_verbs.h
+++ b/include/rdma/ib_verbs.h
@@ -48,6 +48,7 @@
#include <linux/rwsem.h>
#include <linux/scatterlist.h>
#include <linux/workqueue.h>
+#include <linux/socket.h>
#include <uapi/linux/if_ether.h>
#include <linux/atomic.h>
@@ -1691,6 +1692,24 @@ struct ib_client {
void (*add) (struct ib_device *);
void (*remove)(struct ib_device *);
+ /* Returns the net_dev belonging to this ib_client and matching the
+ * given parameters.
+ * @dev: An RDMA device that the net_dev use for communication.
+ * @port: A physical port number on the RDMA device.
+ * @pkey: P_Key that the net_dev uses if applicable.
+ * @addr: An IP address the net_dev is configured with.
+ *
+ * An ib_client that implements a net_dev on top of RDMA devices
+ * (such as IP over IB) should implement this callback, allowing the
+ * rdma_cm module to find the right net_dev for a given request.
+ *
+ * The caller is responsible for calling dev_put on the returned
+ * netdev. */
+ struct net_device *(*get_net_device_by_port_pkey_ip)(
+ struct ib_device *dev,
+ u8 port,
+ u16 pkey,
+ struct sockaddr *addr);
struct list_head list;
};
@@ -2834,4 +2853,18 @@ static inline int ib_check_mr_access(int flags)
int ib_check_mr_status(struct ib_mr *mr, u32 check_mask,
struct ib_mr_status *mr_status);
+/**
+ * ib_get_net_ns_by_port_pkey_ip() - Return the appropriate net namespace
+ * for a received CM request
+ * @dev: An RDMA device on which the request has been received.
+ * @port: Port number on the RDMA device.
+ * @pkey: The Pkey the request came on.
+ * @addr: Contains the IP address that the request specified as its
+ * destination.
+ */
+struct net *ib_get_net_ns_by_port_pkey_ip(struct ib_device *dev,
+ u8 port,
+ u16 pkey,
+ struct sockaddr *addr);
+
#endif /* IB_VERBS_H */
--
1.7.11.2
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
^ permalink raw reply related [flat|nested] 46+ messages in thread
* [PATCH v3 for-next 04/13] IB/ipoib: Return IPoIB devices matching connection parameters
[not found] ` <1431253604-9214-1-git-send-email-haggaie-VPRAkNaXOzVWk0Htik3J/w@public.gmane.org>
2015-05-10 10:26 ` [PATCH v3 for-next 03/13] IB/core: Find the network namespace matching connection parameters Haggai Eran
@ 2015-05-10 10:26 ` Haggai Eran
2015-05-10 10:26 ` [PATCH v3 for-next 06/13] IB/cm: API to retrieve existing listening CM IDs Haggai Eran
2015-05-10 10:26 ` [PATCH v3 for-next 07/13] IB/cm: Expose service ID in request events Haggai Eran
3 siblings, 0 replies; 46+ messages in thread
From: Haggai Eran @ 2015-05-10 10:26 UTC (permalink / raw)
To: Doug Ledford
Cc: linux-rdma-u79uwXL29TY76Z2rM5mHXA, netdev-u79uwXL29TY76Z2rM5mHXA,
Liran Liss, Guy Shapiro, Shachar Raindel, Yotam Kenneth,
Haggai Eran
From: Guy Shapiro <guysh-VPRAkNaXOzVWk0Htik3J/w@public.gmane.org>
Implement the get_net_device_by_port_pkey_ip callback that returns a network
device to ib_core according to connection parameters. Check the ipoib
device and iterate over all child devices to look for a match.
For each ipoib device we iterate through all upper devices when searching
for a matching IP, in order to support bonding.
Signed-off-by: Guy Shapiro <guysh-VPRAkNaXOzVWk0Htik3J/w@public.gmane.org>
Signed-off-by: Haggai Eran <haggaie-VPRAkNaXOzVWk0Htik3J/w@public.gmane.org>
Signed-off-by: Yotam Kenneth <yotamke-VPRAkNaXOzVWk0Htik3J/w@public.gmane.org>
Signed-off-by: Shachar Raindel <raindel-VPRAkNaXOzVWk0Htik3J/w@public.gmane.org>
---
drivers/infiniband/ulp/ipoib/ipoib_main.c | 139 +++++++++++++++++++++++++++++-
1 file changed, 138 insertions(+), 1 deletion(-)
diff --git a/drivers/infiniband/ulp/ipoib/ipoib_main.c b/drivers/infiniband/ulp/ipoib/ipoib_main.c
index 3421e42870c3..75def39a4271 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib_main.c
+++ b/drivers/infiniband/ulp/ipoib/ipoib_main.c
@@ -48,6 +48,9 @@
#include <linux/jhash.h>
#include <net/arp.h>
+#include <net/addrconf.h>
+#include <linux/inetdevice.h>
+#include <rdma/ib_cache.h>
#define DRV_VERSION "1.0.0"
@@ -91,11 +94,15 @@ struct ib_sa_client ipoib_sa_client;
static void ipoib_add_one(struct ib_device *device);
static void ipoib_remove_one(struct ib_device *device);
static void ipoib_neigh_reclaim(struct rcu_head *rp);
+static struct net_device *ipoib_get_net_device_by_port_pkey_ip(
+ struct ib_device *dev, u8 port, u16 pkey,
+ struct sockaddr *addr);
static struct ib_client ipoib_client = {
.name = "ipoib",
.add = ipoib_add_one,
- .remove = ipoib_remove_one
+ .remove = ipoib_remove_one,
+ .get_net_device_by_port_pkey_ip = ipoib_get_net_device_by_port_pkey_ip,
};
int ipoib_open(struct net_device *dev)
@@ -222,6 +229,136 @@ static int ipoib_change_mtu(struct net_device *dev, int new_mtu)
return 0;
}
+/* Called with an RCU read lock taken */
+static bool ipoib_is_dev_match_addr(struct sockaddr *addr,
+ struct net_device *dev)
+{
+ struct net *net = dev_net(dev);
+ struct in_device *in_dev;
+ struct sockaddr_in *addr_in = (struct sockaddr_in *)addr;
+#if IS_ENABLED(CONFIG_IPV6)
+ struct sockaddr_in6 *addr_in6 = (struct sockaddr_in6 *)addr;
+#endif
+ __be32 ret_addr;
+
+ switch (addr->sa_family) {
+ case AF_INET:
+ in_dev = in_dev_get(dev);
+ if (!in_dev)
+ return false;
+
+ ret_addr = inet_confirm_addr(net, in_dev, 0,
+ addr_in->sin_addr.s_addr,
+ RT_SCOPE_HOST);
+ in_dev_put(in_dev);
+ if (ret_addr)
+ return true;
+
+ break;
+#if IS_ENABLED(CONFIG_IPV6)
+ case AF_INET6:
+ if (ipv6_chk_addr(net, &addr_in6->sin6_addr, dev, 1))
+ return true;
+
+ break;
+#endif
+ }
+ return false;
+}
+
+/**
+ * Find a net_device matching the given address, which is an upper device of
+ * the given net_device.
+ * @addr: IP address to look for.
+ * @dev: base IPoIB net_device
+ *
+ * If found, returns the net_device with a reference held. Otherwise return
+ * NULL.
+ */
+static struct net_device *ipoib_get_net_dev_match_addr(struct sockaddr *addr,
+ struct net_device *dev)
+{
+ struct net_device *upper,
+ *result = NULL;
+ struct list_head *iter;
+
+ rcu_read_lock();
+ if (ipoib_is_dev_match_addr(addr, dev)) {
+ dev_hold(dev);
+ result = dev;
+ goto out;
+ }
+
+ netdev_for_each_all_upper_dev_rcu(dev, upper, iter) {
+ if (ipoib_is_dev_match_addr(addr, upper)) {
+ dev_hold(upper);
+ result = upper;
+ break;
+ }
+ }
+out:
+ rcu_read_unlock();
+ return result;
+}
+
+/* returns an IPoIB netdev on top a given ipoib device matching a pkey_index
+ * and address, if one exists. */
+static struct net_device *ipoib_match_pkey_addr(struct ipoib_dev_priv *priv,
+ u16 pkey_index,
+ struct sockaddr *addr)
+{
+ struct ipoib_dev_priv *child_priv;
+ struct net_device *net_dev = NULL;
+
+ if (priv->pkey_index == pkey_index) {
+ net_dev = ipoib_get_net_dev_match_addr(addr, priv->dev);
+ if (net_dev)
+ return net_dev;
+ }
+
+ /* Check child interfaces */
+ down_read(&priv->vlan_rwsem);
+ list_for_each_entry(child_priv, &priv->child_intfs, list) {
+ net_dev = ipoib_match_pkey_addr(child_priv, pkey_index, addr);
+ if (net_dev)
+ break;
+ }
+ up_read(&priv->vlan_rwsem);
+
+ return net_dev;
+}
+
+static struct net_device *ipoib_get_net_device_by_port_pkey_ip(
+ struct ib_device *dev, u8 port, u16 pkey, struct sockaddr *addr)
+{
+ struct ipoib_dev_priv *priv;
+ struct list_head *dev_list;
+ struct net_device *net_dev;
+ u16 pkey_index;
+ int ret;
+
+ ret = ib_find_cached_pkey(dev, port, pkey, &pkey_index);
+ if (ret)
+ return NULL;
+
+ if (!rdma_protocol_ib(dev, port))
+ return NULL;
+
+ dev_list = ib_get_client_data(dev, &ipoib_client);
+ if (!dev_list)
+ return NULL;
+
+ list_for_each_entry(priv, dev_list, list) {
+ if (priv->port != port)
+ continue;
+
+ net_dev = ipoib_match_pkey_addr(priv, pkey_index, addr);
+ if (net_dev)
+ return net_dev;
+ }
+ return NULL;
+}
+
int ipoib_set_mode(struct net_device *dev, const char *buf)
{
struct ipoib_dev_priv *priv = netdev_priv(dev);
--
1.7.11.2
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
^ permalink raw reply related [flat|nested] 46+ messages in thread
* [PATCH v3 for-next 06/13] IB/cm: API to retrieve existing listening CM IDs
[not found] ` <1431253604-9214-1-git-send-email-haggaie-VPRAkNaXOzVWk0Htik3J/w@public.gmane.org>
2015-05-10 10:26 ` [PATCH v3 for-next 03/13] IB/core: Find the network namespace matching connection parameters Haggai Eran
2015-05-10 10:26 ` [PATCH v3 for-next 04/13] IB/ipoib: Return IPoIB devices " Haggai Eran
@ 2015-05-10 10:26 ` Haggai Eran
2015-05-10 10:26 ` [PATCH v3 for-next 07/13] IB/cm: Expose service ID in request events Haggai Eran
3 siblings, 0 replies; 46+ messages in thread
From: Haggai Eran @ 2015-05-10 10:26 UTC (permalink / raw)
To: Doug Ledford
Cc: linux-rdma-u79uwXL29TY76Z2rM5mHXA, netdev-u79uwXL29TY76Z2rM5mHXA,
Liran Liss, Guy Shapiro, Shachar Raindel, Yotam Kenneth,
Haggai Eran
Enabling network namespaces for RDMA CM will allow processes on different
namespaces to listen on the same port. In order to leave namespace support
out of the CM layer, this requires that multiple RDMA CM IDs will be able
to share a single CM ID.
This patch adds infrastructure to retrieve an existing listening ib_cm_id,
based on its device and service ID, or create a new one if one does not
already exist.
Signed-off-by: Haggai Eran <haggaie-VPRAkNaXOzVWk0Htik3J/w@public.gmane.org>
---
drivers/infiniband/core/cm.c | 104 ++++++++++++++++++++++++++++++++++++++++---
include/rdma/ib_cm.h | 4 ++
2 files changed, 103 insertions(+), 5 deletions(-)
diff --git a/drivers/infiniband/core/cm.c b/drivers/infiniband/core/cm.c
index 6b68402fd6df..0be96a19734d 100644
--- a/drivers/infiniband/core/cm.c
+++ b/drivers/infiniband/core/cm.c
@@ -960,11 +960,32 @@ void ib_destroy_cm_id(struct ib_cm_id *cm_id)
}
EXPORT_SYMBOL(ib_destroy_cm_id);
-int ib_cm_listen(struct ib_cm_id *cm_id, __be64 service_id, __be64 service_mask,
- struct ib_cm_compare_data *compare_data)
+/**
+ * __ib_cm_listen - Initiates listening on the specified service ID for
+ * connection and service ID resolution requests.
+ * @cm_id: Connection identifier associated with the listen request.
+ * @service_id: Service identifier matched against incoming connection
+ * and service ID resolution requests. The service ID should be specified
+ * network-byte order. If set to IB_CM_ASSIGN_SERVICE_ID, the CM will
+ * assign a service ID to the caller.
+ * @service_mask: Mask applied to service ID used to listen across a
+ * range of service IDs. If set to 0, the service ID is matched
+ * exactly. This parameter is ignored if %service_id is set to
+ * IB_CM_ASSIGN_SERVICE_ID.
+ * @compare_data: This parameter is optional. It specifies data that must
+ * appear in the private data of a connection request for the specified
+ * listen request.
+ * @lock: If set, lock the cm.lock spin-lock when adding the id to the
+ * listener tree. When false, the caller must already hold the spin-lock,
+ * and compare_data must be NULL.
+ */
+static int __ib_cm_listen(struct ib_cm_id *cm_id, __be64 service_id,
+ __be64 service_mask,
+ struct ib_cm_compare_data *compare_data,
+ bool lock)
{
struct cm_id_private *cm_id_priv, *cur_cm_id_priv;
- unsigned long flags;
+ unsigned long flags = 0;
int ret = 0;
service_mask = service_mask ? service_mask : ~cpu_to_be64(0);
@@ -990,7 +1011,8 @@ int ib_cm_listen(struct ib_cm_id *cm_id, __be64 service_id, __be64 service_mask,
cm_id->state = IB_CM_LISTEN;
- spin_lock_irqsave(&cm.lock, flags);
+ if (lock)
+ spin_lock_irqsave(&cm.lock, flags);
if (service_id == IB_CM_ASSIGN_SERVICE_ID) {
cm_id->service_id = cpu_to_be64(cm.listen_service_id++);
cm_id->service_mask = ~cpu_to_be64(0);
@@ -999,7 +1021,8 @@ int ib_cm_listen(struct ib_cm_id *cm_id, __be64 service_id, __be64 service_mask,
cm_id->service_mask = service_mask;
}
cur_cm_id_priv = cm_insert_listen(cm_id_priv);
- spin_unlock_irqrestore(&cm.lock, flags);
+ if (lock)
+ spin_unlock_irqrestore(&cm.lock, flags);
if (cur_cm_id_priv) {
cm_id->state = IB_CM_IDLE;
@@ -1009,8 +1032,79 @@ int ib_cm_listen(struct ib_cm_id *cm_id, __be64 service_id, __be64 service_mask,
}
return ret;
}
+
+int ib_cm_listen(struct ib_cm_id *cm_id, __be64 service_id, __be64 service_mask,
+ struct ib_cm_compare_data *compare_data)
+{
+ return __ib_cm_listen(cm_id, service_id, service_mask, compare_data,
+ true);
+}
EXPORT_SYMBOL(ib_cm_listen);
+/**
+ * Create a new listening ib_cm_id and listen on the given service ID.
+ *
+ * If there's an existing ID listening on that same device and service ID,
+ * return it.
+ *
+ * @device: Device associated with the cm_id. All related communication will
+ * be associated with the specified device.
+ * @cm_handler: Callback invoked to notify the user of CM events.
+ * @service_id: Service identifier matched against incoming connection
+ * and service ID resolution requests. The service ID should be specified
+ * network-byte order. If set to IB_CM_ASSIGN_SERVICE_ID, the CM will
+ * assign a service ID to the caller.
+ * @service_mask: Mask applied to service ID used to listen across a
+ * range of service IDs. If set to 0, the service ID is matched
+ * exactly. This parameter is ignored if %service_id is set to
+ * IB_CM_ASSIGN_SERVICE_ID.
+ */
+struct ib_cm_id *ib_cm_id_create_and_listen(
+ struct ib_device *device,
+ ib_cm_handler cm_handler,
+ __be64 service_id,
+ __be64 service_mask)
+{
+ struct cm_id_private *cm_id_priv;
+ struct ib_cm_id *cm_id;
+ unsigned long flags;
+ int err = 0;
+
+ /* Create an ID in advance, since the creation may sleep */
+ cm_id = ib_create_cm_id(device, cm_handler, NULL);
+ if (IS_ERR(cm_id))
+ return cm_id;
+
+ spin_lock_irqsave(&cm.lock, flags);
+
+ if (service_id != IB_CM_ASSIGN_SERVICE_ID) {
+ /* Find an existing ID */
+ cm_id_priv = cm_find_listen(device, service_id, NULL);
+ if (cm_id_priv) {
+ spin_unlock_irqrestore(&cm.lock, flags);
+ ib_cm_id_put(cm_id);
+ cm_id = &cm_id_priv->id;
+ if (cm_id->cm_handler != cm_handler ||
+ cm_id->context)
+ return ERR_PTR(-EINVAL);
+ ib_cm_id_get(cm_id);
+ return cm_id;
+ }
+ }
+
+ /* Use newly created ID */
+ err = __ib_cm_listen(cm_id, service_id, service_mask, NULL, false);
+
+ spin_unlock_irqrestore(&cm.lock, flags);
+
+ if (err) {
+ ib_cm_id_put(cm_id);
+ return ERR_PTR(err);
+ }
+ return cm_id;
+}
+EXPORT_SYMBOL(ib_cm_id_create_and_listen);
+
static __be64 cm_form_tid(struct cm_id_private *cm_id_priv,
enum cm_msg_sequence msg_seq)
{
diff --git a/include/rdma/ib_cm.h b/include/rdma/ib_cm.h
index 22da34de3bc7..4a17394bb556 100644
--- a/include/rdma/ib_cm.h
+++ b/include/rdma/ib_cm.h
@@ -365,6 +365,10 @@ struct ib_cm_compare_data {
int ib_cm_listen(struct ib_cm_id *cm_id, __be64 service_id, __be64 service_mask,
struct ib_cm_compare_data *compare_data);
+struct ib_cm_id *ib_cm_id_create_and_listen(
+ struct ib_device *device, ib_cm_handler cm_handler,
+ __be64 service_id, __be64 service_mask);
+
struct ib_cm_req_param {
struct ib_sa_path_rec *primary_path;
struct ib_sa_path_rec *alternate_path;
--
1.7.11.2
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
^ permalink raw reply related [flat|nested] 46+ messages in thread
* [PATCH v3 for-next 07/13] IB/cm: Expose service ID in request events
[not found] ` <1431253604-9214-1-git-send-email-haggaie-VPRAkNaXOzVWk0Htik3J/w@public.gmane.org>
` (2 preceding siblings ...)
2015-05-10 10:26 ` [PATCH v3 for-next 06/13] IB/cm: API to retrieve existing listening CM IDs Haggai Eran
@ 2015-05-10 10:26 ` Haggai Eran
3 siblings, 0 replies; 46+ messages in thread
From: Haggai Eran @ 2015-05-10 10:26 UTC (permalink / raw)
To: Doug Ledford
Cc: linux-rdma-u79uwXL29TY76Z2rM5mHXA, netdev-u79uwXL29TY76Z2rM5mHXA,
Liran Liss, Guy Shapiro, Shachar Raindel, Yotam Kenneth,
Haggai Eran
Expose the service ID on an incoming CM or SIDR request to the event
handler. This will allow the RDMA CM module to de-multiplex connection
requests based on the information encoded in the service ID.
Signed-off-by: Haggai Eran <haggaie-VPRAkNaXOzVWk0Htik3J/w@public.gmane.org>
---
drivers/infiniband/core/cm.c | 3 +++
include/rdma/ib_cm.h | 1 +
2 files changed, 4 insertions(+)
diff --git a/drivers/infiniband/core/cm.c b/drivers/infiniband/core/cm.c
index 0be96a19734d..fc33fb215e55 100644
--- a/drivers/infiniband/core/cm.c
+++ b/drivers/infiniband/core/cm.c
@@ -1381,6 +1381,7 @@ static void cm_format_paths_from_req(struct cm_req_msg *req_msg,
primary_path->packet_life_time =
cm_req_get_primary_local_ack_timeout(req_msg);
primary_path->packet_life_time -= (primary_path->packet_life_time > 0);
+ primary_path->service_id = req_msg->service_id;
if (req_msg->alt_local_lid) {
memset(alt_path, 0, sizeof *alt_path);
@@ -1402,6 +1403,7 @@ static void cm_format_paths_from_req(struct cm_req_msg *req_msg,
alt_path->packet_life_time =
cm_req_get_alt_local_ack_timeout(req_msg);
alt_path->packet_life_time -= (alt_path->packet_life_time > 0);
+ alt_path->service_id = req_msg->service_id;
}
}
@@ -3105,6 +3107,7 @@ static void cm_format_sidr_req_event(struct cm_work *work,
param = &work->cm_event.param.sidr_req_rcvd;
param->pkey = __be16_to_cpu(sidr_req_msg->pkey);
param->listen_id = listen_id;
+ param->service_id = sidr_req_msg->service_id;
param->port = work->port->port_num;
work->cm_event.private_data = &sidr_req_msg->private_data;
}
diff --git a/include/rdma/ib_cm.h b/include/rdma/ib_cm.h
index 4a17394bb556..17e072d1677c 100644
--- a/include/rdma/ib_cm.h
+++ b/include/rdma/ib_cm.h
@@ -223,6 +223,7 @@ struct ib_cm_apr_event_param {
struct ib_cm_sidr_req_event_param {
struct ib_cm_id *listen_id;
+ __be64 service_id;
u8 port;
u16 pkey;
};
--
1.7.11.2
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
^ permalink raw reply related [flat|nested] 46+ messages in thread
* [PATCH v3 for-next 08/13] IB/cma: Refactor RDMA IP CM private-data parsing code
2015-05-10 10:26 [PATCH v3 for-next 00/13] Add network namespace support in the RDMA-CM Haggai Eran
` (3 preceding siblings ...)
[not found] ` <1431253604-9214-1-git-send-email-haggaie-VPRAkNaXOzVWk0Htik3J/w@public.gmane.org>
@ 2015-05-10 10:26 ` Haggai Eran
2015-05-10 10:26 ` [PATCH v3 for-next 09/13] IB/cma: Add compare_data checks to the RDMA CM module Haggai Eran
` (5 subsequent siblings)
10 siblings, 0 replies; 46+ messages in thread
From: Haggai Eran @ 2015-05-10 10:26 UTC (permalink / raw)
To: Doug Ledford
Cc: linux-rdma, netdev, Liran Liss, Guy Shapiro, Shachar Raindel,
Yotam Kenneth, Haggai Eran
When receiving a connection request, rdma_cm needs to associate the request
with a network namespace. To do this, it needs to know the request's
destination IP. For this the module needs to allow getting this information
from the private data in the request packet, instead of relying on the
information already being in the listening RDMA CM ID.
When creating a new incoming connection ID, the code in
cma_save_ip{4,6}_info can no longer rely on the listener's private data to
find the port number, so it reads it from the requested service ID.
Signed-off-by: Guy Shapiro <guysh@mellanox.com>
Signed-off-by: Haggai Eran <haggaie@mellanox.com>
Signed-off-by: Yotam Kenneth <yotamke@mellanox.com>
Signed-off-by: Shachar Raindel <raindel@mellanox.com>
---
drivers/infiniband/core/cma.c | 150 ++++++++++++++++++++++++++----------------
1 file changed, 92 insertions(+), 58 deletions(-)
diff --git a/drivers/infiniband/core/cma.c b/drivers/infiniband/core/cma.c
index c06d181d69c4..b5d3321e613e 100644
--- a/drivers/infiniband/core/cma.c
+++ b/drivers/infiniband/core/cma.c
@@ -843,97 +843,122 @@ static inline int cma_any_port(struct sockaddr *addr)
return !cma_port(addr);
}
-static void cma_save_ib_info(struct rdma_cm_id *id, struct rdma_cm_id *listen_id,
+static void cma_save_ib_info(struct sockaddr *src_addr,
+ struct sockaddr *dst_addr,
struct ib_sa_path_rec *path)
{
- struct sockaddr_ib *listen_ib, *ib;
+ struct sockaddr_ib *ib;
- listen_ib = (struct sockaddr_ib *) &listen_id->route.addr.src_addr;
- ib = (struct sockaddr_ib *) &id->route.addr.src_addr;
- ib->sib_family = listen_ib->sib_family;
- ib->sib_pkey = path->pkey;
- ib->sib_flowinfo = path->flow_label;
- memcpy(&ib->sib_addr, &path->sgid, 16);
- ib->sib_sid = listen_ib->sib_sid;
- ib->sib_sid_mask = cpu_to_be64(0xffffffffffffffffULL);
- ib->sib_scope_id = listen_ib->sib_scope_id;
-
- ib = (struct sockaddr_ib *) &id->route.addr.dst_addr;
- ib->sib_family = listen_ib->sib_family;
- ib->sib_pkey = path->pkey;
- ib->sib_flowinfo = path->flow_label;
- memcpy(&ib->sib_addr, &path->dgid, 16);
-}
-
-static __be16 ss_get_port(const struct sockaddr_storage *ss)
-{
- if (ss->ss_family == AF_INET)
- return ((struct sockaddr_in *)ss)->sin_port;
- else if (ss->ss_family == AF_INET6)
- return ((struct sockaddr_in6 *)ss)->sin6_port;
- BUG();
+ if (src_addr) {
+ ib = (struct sockaddr_ib *)src_addr;
+ ib->sib_family = AF_IB;
+ ib->sib_pkey = path->pkey;
+ ib->sib_flowinfo = path->flow_label;
+ memcpy(&ib->sib_addr, &path->sgid, 16);
+ ib->sib_sid = path->service_id;
+ ib->sib_sid_mask = cpu_to_be64(0xffffffffffffffffULL);
+ ib->sib_scope_id = 0;
+ }
+ if (dst_addr) {
+ ib = (struct sockaddr_ib *)dst_addr;
+ ib->sib_family = AF_IB;
+ ib->sib_pkey = path->pkey;
+ ib->sib_flowinfo = path->flow_label;
+ memcpy(&ib->sib_addr, &path->dgid, 16);
+ }
}
-static void cma_save_ip4_info(struct rdma_cm_id *id, struct rdma_cm_id *listen_id,
- struct cma_hdr *hdr)
+static void cma_save_ip4_info(struct sockaddr *src_addr,
+ struct sockaddr *dst_addr,
+ struct cma_hdr *hdr,
+ __be16 local_port)
{
struct sockaddr_in *ip4;
- ip4 = (struct sockaddr_in *) &id->route.addr.src_addr;
- ip4->sin_family = AF_INET;
- ip4->sin_addr.s_addr = hdr->dst_addr.ip4.addr;
- ip4->sin_port = ss_get_port(&listen_id->route.addr.src_addr);
+ if (src_addr) {
+ ip4 = (struct sockaddr_in *)src_addr;
+ ip4->sin_family = AF_INET;
+ ip4->sin_addr.s_addr = hdr->dst_addr.ip4.addr;
+ ip4->sin_port = local_port;
+ }
- ip4 = (struct sockaddr_in *) &id->route.addr.dst_addr;
- ip4->sin_family = AF_INET;
- ip4->sin_addr.s_addr = hdr->src_addr.ip4.addr;
- ip4->sin_port = hdr->port;
+ if (dst_addr) {
+ ip4 = (struct sockaddr_in *)dst_addr;
+ ip4->sin_family = AF_INET;
+ ip4->sin_addr.s_addr = hdr->src_addr.ip4.addr;
+ ip4->sin_port = hdr->port;
+ }
}
-static void cma_save_ip6_info(struct rdma_cm_id *id, struct rdma_cm_id *listen_id,
- struct cma_hdr *hdr)
+static void cma_save_ip6_info(struct sockaddr *src_addr,
+ struct sockaddr *dst_addr,
+ struct cma_hdr *hdr,
+ __be16 local_port)
{
struct sockaddr_in6 *ip6;
- ip6 = (struct sockaddr_in6 *) &id->route.addr.src_addr;
- ip6->sin6_family = AF_INET6;
- ip6->sin6_addr = hdr->dst_addr.ip6;
- ip6->sin6_port = ss_get_port(&listen_id->route.addr.src_addr);
+ if (src_addr) {
+ ip6 = (struct sockaddr_in6 *)src_addr;
+ ip6->sin6_family = AF_INET6;
+ ip6->sin6_addr = hdr->dst_addr.ip6;
+ ip6->sin6_port = local_port;
+ }
- ip6 = (struct sockaddr_in6 *) &id->route.addr.dst_addr;
- ip6->sin6_family = AF_INET6;
- ip6->sin6_addr = hdr->src_addr.ip6;
- ip6->sin6_port = hdr->port;
+ if (dst_addr) {
+ ip6 = (struct sockaddr_in6 *)dst_addr;
+ ip6->sin6_family = AF_INET6;
+ ip6->sin6_addr = hdr->src_addr.ip6;
+ ip6->sin6_port = hdr->port;
+ }
}
-static int cma_save_net_info(struct rdma_cm_id *id, struct rdma_cm_id *listen_id,
- struct ib_cm_event *ib_event)
+static u16 cma_port_from_service_id(__be64 service_id)
{
- struct cma_hdr *hdr;
+ return be64_to_cpu(service_id);
+}
- if ((listen_id->route.addr.src_addr.ss_family == AF_IB) &&
- (ib_event->event == IB_CM_REQ_RECEIVED)) {
- cma_save_ib_info(id, listen_id, ib_event->param.req_rcvd.primary_path);
- return 0;
- }
+static int cma_save_ip_info(struct sockaddr *src_addr,
+ struct sockaddr *dst_addr,
+ struct ib_cm_event *ib_event,
+ __be64 service_id)
+{
+ struct cma_hdr *hdr;
+ __be16 port;
hdr = ib_event->private_data;
if (hdr->cma_version != CMA_VERSION)
return -EINVAL;
+ port = htons(cma_port_from_service_id(service_id));
+
switch (cma_get_ip_ver(hdr)) {
case 4:
- cma_save_ip4_info(id, listen_id, hdr);
+ cma_save_ip4_info(src_addr, dst_addr, hdr, port);
break;
case 6:
- cma_save_ip6_info(id, listen_id, hdr);
+ cma_save_ip6_info(src_addr, dst_addr, hdr, port);
break;
default:
return -EINVAL;
}
+
return 0;
}
+static int cma_save_net_info(struct sockaddr *src_addr,
+ struct sockaddr *dst_addr,
+ struct ib_cm_event *ib_event,
+ sa_family_t sa_family, __be64 service_id)
+{
+ if (sa_family == AF_IB && ib_event->event == IB_CM_REQ_RECEIVED) {
+ cma_save_ib_info(src_addr, dst_addr,
+ ib_event->param.req_rcvd.primary_path);
+ return 0;
+ }
+
+ return cma_save_ip_info(src_addr, dst_addr, ib_event, service_id);
+}
+
static inline int cma_user_data_offset(struct rdma_id_private *id_priv)
{
return cma_family(id_priv) == AF_IB ? 0 : sizeof(struct cma_hdr);
@@ -1184,6 +1209,9 @@ static struct rdma_id_private *cma_new_conn_id(struct rdma_cm_id *listen_id,
struct rdma_id_private *id_priv;
struct rdma_cm_id *id;
struct rdma_route *rt;
+ const sa_family_t ss_family = listen_id->route.addr.src_addr.ss_family;
+ const __be64 service_id =
+ ib_event->param.req_rcvd.primary_path->service_id;
int ret;
id = rdma_create_id(listen_id->event_handler, listen_id->context,
@@ -1192,7 +1220,9 @@ static struct rdma_id_private *cma_new_conn_id(struct rdma_cm_id *listen_id,
return NULL;
id_priv = container_of(id, struct rdma_id_private, id);
- if (cma_save_net_info(id, listen_id, ib_event))
+ if (cma_save_net_info((struct sockaddr *)&id->route.addr.src_addr,
+ (struct sockaddr *)&id->route.addr.dst_addr,
+ ib_event, ss_family, service_id))
goto err;
rt = &id->route;
@@ -1238,7 +1268,11 @@ static struct rdma_id_private *cma_new_udp_id(struct rdma_cm_id *listen_id,
return NULL;
id_priv = container_of(id, struct rdma_id_private, id);
- if (cma_save_net_info(id, listen_id, ib_event))
+ if (cma_save_net_info((struct sockaddr *)&id->route.addr.src_addr,
+ (struct sockaddr *)&id->route.addr.dst_addr,
+ ib_event,
+ listen_id->route.addr.src_addr.ss_family,
+ ib_event->param.sidr_req_rcvd.service_id))
goto err;
if (!cma_any_addr((struct sockaddr *) &id->route.addr.src_addr)) {
--
1.7.11.2
^ permalink raw reply related [flat|nested] 46+ messages in thread
* [PATCH v3 for-next 09/13] IB/cma: Add compare_data checks to the RDMA CM module
2015-05-10 10:26 [PATCH v3 for-next 00/13] Add network namespace support in the RDMA-CM Haggai Eran
` (4 preceding siblings ...)
2015-05-10 10:26 ` [PATCH v3 for-next 08/13] IB/cma: Refactor RDMA IP CM private-data parsing code Haggai Eran
@ 2015-05-10 10:26 ` Haggai Eran
2015-05-10 10:26 ` [PATCH v3 for-next 10/13] IB/cma: Separate port allocation to network namespaces Haggai Eran
` (4 subsequent siblings)
10 siblings, 0 replies; 46+ messages in thread
From: Haggai Eran @ 2015-05-10 10:26 UTC (permalink / raw)
To: Doug Ledford
Cc: linux-rdma, netdev, Liran Liss, Guy Shapiro, Shachar Raindel,
Yotam Kenneth, Haggai Eran
Previously RDMA CM relied on the CM module to check the incoming requests
against a "compare data" struct to dispatch events for different RDMA CM IDs
based on the request parameters (IP address, address family, etc.). With
namespace support, multiple namespaces in RDMA CM will need to share a
single CM ID. Such an ID cannot be associated with a specific compare data,
because that could create conflicts with other namespaces.
The patch adds checks to verify that incoming requests match their RDMA CM
ID destination inside the RDMA CM module itself.
Signed-off-by: Haggai Eran <haggaie@mellanox.com>
---
drivers/infiniband/core/cm.c | 5 +++--
drivers/infiniband/core/cma.c | 12 +++++++++---
include/rdma/ib_cm.h | 3 +++
3 files changed, 15 insertions(+), 5 deletions(-)
diff --git a/drivers/infiniband/core/cm.c b/drivers/infiniband/core/cm.c
index fc33fb215e55..d023639a9f75 100644
--- a/drivers/infiniband/core/cm.c
+++ b/drivers/infiniband/core/cm.c
@@ -459,8 +459,8 @@ static int cm_compare_data(struct ib_cm_compare_data *src_data,
return memcmp(src, dst, sizeof(src));
}
-static int cm_compare_private_data(u32 *private_data,
- struct ib_cm_compare_data *dst_data)
+int cm_compare_private_data(u32 *private_data,
+ struct ib_cm_compare_data *dst_data)
{
u32 src[IB_CM_COMPARE_SIZE];
@@ -470,6 +470,7 @@ static int cm_compare_private_data(u32 *private_data,
cm_mask_copy(src, private_data, dst_data->mask);
return memcmp(src, dst_data->data, sizeof(src));
}
+EXPORT_SYMBOL(cm_compare_private_data);
/*
* Trivial helpers to strip endian annotation and compare; the
diff --git a/drivers/infiniband/core/cma.c b/drivers/infiniband/core/cma.c
index b5d3321e613e..1f591458b4de 100644
--- a/drivers/infiniband/core/cma.c
+++ b/drivers/infiniband/core/cma.c
@@ -146,6 +146,7 @@ struct rdma_id_private {
u8 tos;
u8 reuseaddr;
u8 afonly;
+ struct ib_cm_compare_data compare_data;
};
struct cma_multicast {
@@ -1319,6 +1320,10 @@ static int cma_req_handler(struct ib_cm_id *cm_id, struct ib_cm_event *ib_event)
int offset, ret;
listen_id = cm_id->context;
+ if (cm_compare_private_data(ib_event->private_data,
+ &listen_id->compare_data))
+ return -EINVAL;
+
if (!cma_check_req_qp_type(&listen_id->id, ib_event))
return -EINVAL;
@@ -1586,7 +1591,6 @@ out:
static int cma_ib_listen(struct rdma_id_private *id_priv)
{
- struct ib_cm_compare_data compare_data;
struct sockaddr *addr;
struct ib_cm_id *id;
__be64 svc_id;
@@ -1603,8 +1607,10 @@ static int cma_ib_listen(struct rdma_id_private *id_priv)
if (cma_any_addr(addr) && !id_priv->afonly)
ret = ib_cm_listen(id_priv->cm_id.ib, svc_id, 0, NULL);
else {
- cma_set_compare_data(id_priv->id.ps, addr, &compare_data);
- ret = ib_cm_listen(id_priv->cm_id.ib, svc_id, 0, &compare_data);
+ cma_set_compare_data(id_priv->id.ps, addr,
+ &id_priv->compare_data);
+ ret = ib_cm_listen(id_priv->cm_id.ib, svc_id, 0,
+ &id_priv->compare_data);
}
if (ret) {
diff --git a/include/rdma/ib_cm.h b/include/rdma/ib_cm.h
index 17e072d1677c..4c767b5daf50 100644
--- a/include/rdma/ib_cm.h
+++ b/include/rdma/ib_cm.h
@@ -347,6 +347,9 @@ struct ib_cm_compare_data {
u32 mask[IB_CM_COMPARE_SIZE];
};
+int cm_compare_private_data(u32 *private_data,
+ struct ib_cm_compare_data *dst_data);
+
/**
* ib_cm_listen - Initiates listening on the specified service ID for
* connection and service ID resolution requests.
--
1.7.11.2
^ permalink raw reply related [flat|nested] 46+ messages in thread
* [PATCH v3 for-next 10/13] IB/cma: Separate port allocation to network namespaces
2015-05-10 10:26 [PATCH v3 for-next 00/13] Add network namespace support in the RDMA-CM Haggai Eran
` (5 preceding siblings ...)
2015-05-10 10:26 ` [PATCH v3 for-next 09/13] IB/cma: Add compare_data checks to the RDMA CM module Haggai Eran
@ 2015-05-10 10:26 ` Haggai Eran
2015-05-10 10:26 ` [PATCH v3 for-next 11/13] IB/cma: Share CM IDs between namespaces Haggai Eran
` (3 subsequent siblings)
10 siblings, 0 replies; 46+ messages in thread
From: Haggai Eran @ 2015-05-10 10:26 UTC (permalink / raw)
To: Doug Ledford
Cc: linux-rdma, netdev, Liran Liss, Guy Shapiro, Shachar Raindel,
Yotam Kenneth, Haggai Eran
Keep a struct for each network namespace containing the IDRs for the RDMA
CM port spaces. The struct is created dynamically using the generic_net
mechanism.
This patch is internal infrastructure work for the following patches. In
this patch, init_net is statically used as the network namespace for
the new port-space API.
Signed-off-by: Haggai Eran <haggaie@mellanox.com>
Signed-off-by: Yotam Kenneth <yotamke@mellanox.com>
Signed-off-by: Shachar Raindel <raindel@mellanox.com>
Signed-off-by: Guy Shapiro <guysh@mellanox.com>
---
drivers/infiniband/core/cma.c | 142 +++++++++++++++++++++++++++++++++---------
1 file changed, 113 insertions(+), 29 deletions(-)
diff --git a/drivers/infiniband/core/cma.c b/drivers/infiniband/core/cma.c
index 1f591458b4de..6538d208a97d 100644
--- a/drivers/infiniband/core/cma.c
+++ b/drivers/infiniband/core/cma.c
@@ -44,6 +44,8 @@
#include <linux/module.h>
#include <net/route.h>
+#include <net/net_namespace.h>
+#include <net/netns/generic.h>
#include <net/tcp.h>
#include <net/ipv6.h>
@@ -80,10 +82,37 @@ static LIST_HEAD(dev_list);
static LIST_HEAD(listen_any_list);
static DEFINE_MUTEX(lock);
static struct workqueue_struct *cma_wq;
-static DEFINE_IDR(tcp_ps);
-static DEFINE_IDR(udp_ps);
-static DEFINE_IDR(ipoib_ps);
-static DEFINE_IDR(ib_ps);
+static int cma_pernet_id;
+
+struct cma_pernet {
+ struct idr tcp_ps;
+ struct idr udp_ps;
+ struct idr ipoib_ps;
+ struct idr ib_ps;
+};
+
+static struct cma_pernet *cma_pernet(struct net *net)
+{
+ return net_generic(net, cma_pernet_id);
+}
+
+static struct idr *cma_pernet_idr(struct net *net, enum rdma_port_space ps)
+{
+ struct cma_pernet *pernet = cma_pernet(net);
+
+ switch (ps) {
+ case RDMA_PS_TCP:
+ return &pernet->tcp_ps;
+ case RDMA_PS_UDP:
+ return &pernet->udp_ps;
+ case RDMA_PS_IPOIB:
+ return &pernet->ipoib_ps;
+ case RDMA_PS_IB:
+ return &pernet->ib_ps;
+ default:
+ return NULL;
+ }
+}
struct cma_device {
struct list_head list;
@@ -94,11 +123,34 @@ struct cma_device {
};
struct rdma_bind_list {
- struct idr *ps;
+ enum rdma_port_space ps;
struct hlist_head owners;
unsigned short port;
};
+static int cma_ps_alloc(struct net *net, enum rdma_port_space ps,
+ struct rdma_bind_list *bind_list, int snum)
+{
+ struct idr *idr = cma_pernet_idr(net, ps);
+
+ return idr_alloc(idr, bind_list, snum, snum + 1, GFP_KERNEL);
+}
+
+static struct rdma_bind_list *cma_ps_find(struct net *net,
+ enum rdma_port_space ps, int snum)
+{
+ struct idr *idr = cma_pernet_idr(net, ps);
+
+ return idr_find(idr, snum);
+}
+
+static void cma_ps_remove(struct net *net, enum rdma_port_space ps, int snum)
+{
+ struct idr *idr = cma_pernet_idr(net, ps);
+
+ idr_remove(idr, snum);
+}
+
enum {
CMA_OPTION_AFONLY,
};
@@ -1027,7 +1079,7 @@ static void cma_release_port(struct rdma_id_private *id_priv)
mutex_lock(&lock);
hlist_del(&id_priv->node);
if (hlist_empty(&bind_list->owners)) {
- idr_remove(bind_list->ps, bind_list->port);
+ cma_ps_remove(&init_net, bind_list->ps, bind_list->port);
kfree(bind_list);
}
mutex_unlock(&lock);
@@ -2327,8 +2379,8 @@ static void cma_bind_port(struct rdma_bind_list *bind_list,
hlist_add_head(&id_priv->node, &bind_list->owners);
}
-static int cma_alloc_port(struct idr *ps, struct rdma_id_private *id_priv,
- unsigned short snum)
+static int cma_alloc_port(enum rdma_port_space ps,
+ struct rdma_id_private *id_priv, unsigned short snum)
{
struct rdma_bind_list *bind_list;
int ret;
@@ -2337,7 +2389,7 @@ static int cma_alloc_port(struct idr *ps, struct rdma_id_private *id_priv,
if (!bind_list)
return -ENOMEM;
- ret = idr_alloc(ps, bind_list, snum, snum + 1, GFP_KERNEL);
+ ret = cma_ps_alloc(&init_net, ps, bind_list, snum);
if (ret < 0)
goto err;
@@ -2350,7 +2402,8 @@ err:
return ret == -ENOSPC ? -EADDRNOTAVAIL : ret;
}
-static int cma_alloc_any_port(struct idr *ps, struct rdma_id_private *id_priv)
+static int cma_alloc_any_port(enum rdma_port_space ps,
+ struct rdma_id_private *id_priv)
{
static unsigned int last_used_port;
int low, high, remaining;
@@ -2361,7 +2414,7 @@ static int cma_alloc_any_port(struct idr *ps, struct rdma_id_private *id_priv)
rover = prandom_u32() % remaining + low;
retry:
if (last_used_port != rover &&
- !idr_find(ps, (unsigned short) rover)) {
+ !cma_ps_find(&init_net, ps, (unsigned short)rover)) {
int ret = cma_alloc_port(ps, id_priv, rover);
/*
* Remember previously used port number in order to avoid
@@ -2416,7 +2469,8 @@ static int cma_check_port(struct rdma_bind_list *bind_list,
return 0;
}
-static int cma_use_port(struct idr *ps, struct rdma_id_private *id_priv)
+static int cma_use_port(enum rdma_port_space ps,
+ struct rdma_id_private *id_priv)
{
struct rdma_bind_list *bind_list;
unsigned short snum;
@@ -2426,7 +2480,7 @@ static int cma_use_port(struct idr *ps, struct rdma_id_private *id_priv)
if (snum < PROT_SOCK && !capable(CAP_NET_BIND_SERVICE))
return -EACCES;
- bind_list = idr_find(ps, snum);
+ bind_list = cma_ps_find(&init_net, ps, snum);
if (!bind_list) {
ret = cma_alloc_port(ps, id_priv, snum);
} else {
@@ -2449,25 +2503,24 @@ static int cma_bind_listen(struct rdma_id_private *id_priv)
return ret;
}
-static struct idr *cma_select_inet_ps(struct rdma_id_private *id_priv)
+static enum rdma_port_space cma_select_inet_ps(
+ struct rdma_id_private *id_priv)
{
switch (id_priv->id.ps) {
case RDMA_PS_TCP:
- return &tcp_ps;
case RDMA_PS_UDP:
- return &udp_ps;
case RDMA_PS_IPOIB:
- return &ipoib_ps;
case RDMA_PS_IB:
- return &ib_ps;
+ return id_priv->id.ps;
default:
- return NULL;
+
+ return 0;
}
}
-static struct idr *cma_select_ib_ps(struct rdma_id_private *id_priv)
+static enum rdma_port_space cma_select_ib_ps(struct rdma_id_private *id_priv)
{
- struct idr *ps = NULL;
+ enum rdma_port_space ps = 0;
struct sockaddr_ib *sib;
u64 sid_ps, mask, sid;
@@ -2477,15 +2530,15 @@ static struct idr *cma_select_ib_ps(struct rdma_id_private *id_priv)
if ((id_priv->id.ps == RDMA_PS_IB) && (sid == (RDMA_IB_IP_PS_IB & mask))) {
sid_ps = RDMA_IB_IP_PS_IB;
- ps = &ib_ps;
+ ps = RDMA_PS_IB;
} else if (((id_priv->id.ps == RDMA_PS_IB) || (id_priv->id.ps == RDMA_PS_TCP)) &&
(sid == (RDMA_IB_IP_PS_TCP & mask))) {
sid_ps = RDMA_IB_IP_PS_TCP;
- ps = &tcp_ps;
+ ps = RDMA_PS_TCP;
} else if (((id_priv->id.ps == RDMA_PS_IB) || (id_priv->id.ps == RDMA_PS_UDP)) &&
(sid == (RDMA_IB_IP_PS_UDP & mask))) {
sid_ps = RDMA_IB_IP_PS_UDP;
- ps = &udp_ps;
+ ps = RDMA_PS_UDP;
}
if (ps) {
@@ -2498,7 +2551,7 @@ static struct idr *cma_select_ib_ps(struct rdma_id_private *id_priv)
static int cma_get_port(struct rdma_id_private *id_priv)
{
- struct idr *ps;
+ enum rdma_port_space ps;
int ret;
if (cma_family(id_priv) != AF_IB)
@@ -3647,6 +3700,35 @@ static const struct ibnl_client_cbs cma_cb_table[] = {
.module = THIS_MODULE },
};
+static int cma_init_net(struct net *net)
+{
+ struct cma_pernet *pernet = cma_pernet(net);
+
+ idr_init(&pernet->tcp_ps);
+ idr_init(&pernet->udp_ps);
+ idr_init(&pernet->ipoib_ps);
+ idr_init(&pernet->ib_ps);
+
+ return 0;
+}
+
+static void cma_exit_net(struct net *net)
+{
+ struct cma_pernet *pernet = cma_pernet(net);
+
+ idr_destroy(&pernet->tcp_ps);
+ idr_destroy(&pernet->udp_ps);
+ idr_destroy(&pernet->ipoib_ps);
+ idr_destroy(&pernet->ib_ps);
+}
+
+static struct pernet_operations cma_pernet_operations = {
+ .init = cma_init_net,
+ .exit = cma_exit_net,
+ .id = &cma_pernet_id,
+ .size = sizeof(struct cma_pernet),
+};
+
static int __init cma_init(void)
{
int ret;
@@ -3655,6 +3737,10 @@ static int __init cma_init(void)
if (!cma_wq)
return -ENOMEM;
+ ret = register_pernet_subsys(&cma_pernet_operations);
+ if (ret)
+ goto err_wq;
+
ib_sa_register_client(&sa_client);
rdma_addr_register_client(&addr_client);
register_netdevice_notifier(&cma_nb);
@@ -3672,6 +3758,7 @@ err:
unregister_netdevice_notifier(&cma_nb);
rdma_addr_unregister_client(&addr_client);
ib_sa_unregister_client(&sa_client);
+err_wq:
destroy_workqueue(cma_wq);
return ret;
}
@@ -3683,11 +3770,8 @@ static void __exit cma_cleanup(void)
unregister_netdevice_notifier(&cma_nb);
rdma_addr_unregister_client(&addr_client);
ib_sa_unregister_client(&sa_client);
+ unregister_pernet_subsys(&cma_pernet_operations);
destroy_workqueue(cma_wq);
- idr_destroy(&tcp_ps);
- idr_destroy(&udp_ps);
- idr_destroy(&ipoib_ps);
- idr_destroy(&ib_ps);
}
module_init(cma_init);
--
1.7.11.2
^ permalink raw reply related [flat|nested] 46+ messages in thread
* [PATCH v3 for-next 11/13] IB/cma: Share CM IDs between namespaces
2015-05-10 10:26 [PATCH v3 for-next 00/13] Add network namespace support in the RDMA-CM Haggai Eran
` (6 preceding siblings ...)
2015-05-10 10:26 ` [PATCH v3 for-next 10/13] IB/cma: Separate port allocation to network namespaces Haggai Eran
@ 2015-05-10 10:26 ` Haggai Eran
2015-05-10 10:26 ` [PATCH v3 for-next 12/13] IB/cma: Add support for network namespaces Haggai Eran
` (2 subsequent siblings)
10 siblings, 0 replies; 46+ messages in thread
From: Haggai Eran @ 2015-05-10 10:26 UTC (permalink / raw)
To: Doug Ledford
Cc: linux-rdma, netdev, Liran Liss, Guy Shapiro, Shachar Raindel,
Yotam Kenneth, Haggai Eran
Use ib_cm_id_create_and_listen to create listening IB CM IDs or share
existing ones if needed. When given a request on a specific CM ID, the code
now needs to find the namespace matching the request, and find the RDMA CM
ID based on the namespace and the request parameters, instead of using the
context field of ib_cm_id as was previously done.
Signed-off-by: Haggai Eran <haggaie@mellanox.com>
Signed-off-by: Guy Shapiro <guysh@mellanox.com>
Signed-off-by: Yotam Kenneth <yotamke@mellanox.com>
Signed-off-by: Shachar Raindel <raindel@mellanox.com>
---
drivers/infiniband/core/cma.c | 142 +++++++++++++++++++++++++++++++++++-------
1 file changed, 118 insertions(+), 24 deletions(-)
diff --git a/drivers/infiniband/core/cma.c b/drivers/infiniband/core/cma.c
index 6538d208a97d..9c72cdde44b5 100644
--- a/drivers/infiniband/core/cma.c
+++ b/drivers/infiniband/core/cma.c
@@ -1012,6 +1012,112 @@ static int cma_save_net_info(struct sockaddr *src_addr,
return cma_save_ip_info(src_addr, dst_addr, ib_event, service_id);
}
+struct cma_req_info {
+ struct ib_device *device;
+ int port;
+ __be64 service_id;
+ u16 pkey;
+};
+
+static int cma_save_req_info(struct ib_cm_event *ib_event,
+ struct cma_req_info *req)
+{
+ struct ib_cm_req_event_param *req_param = &ib_event->param.req_rcvd;
+ struct ib_cm_sidr_req_event_param *sidr_param =
+ &ib_event->param.sidr_req_rcvd;
+
+ switch (ib_event->event) {
+ case IB_CM_REQ_RECEIVED:
+ req->device = req_param->listen_id->device;
+ req->port = req_param->port;
+ req->service_id = req_param->primary_path->service_id;
+ req->pkey = be16_to_cpu(req_param->primary_path->pkey);
+ break;
+ case IB_CM_SIDR_REQ_RECEIVED:
+ req->device = sidr_param->listen_id->device;
+ req->port = sidr_param->port;
+ req->service_id = sidr_param->service_id;
+ req->pkey = sidr_param->pkey;
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+static struct net *cma_get_net_ns(struct ib_cm_event *ib_event,
+ struct cma_req_info *req)
+{
+ struct sockaddr_storage addr_storage;
+ struct sockaddr *listen_addr;
+ int err = 0;
+
+ listen_addr = (struct sockaddr *)&addr_storage;
+ err = cma_save_ip_info(listen_addr, NULL, ib_event, req->service_id);
+ if (err)
+ return ERR_PTR(err);
+
+ return ib_get_net_ns_by_port_pkey_ip(req->device, req->port,
+ req->pkey, listen_addr);
+}
+
+static enum rdma_port_space rdma_ps_from_service_id(__be64 service_id)
+{
+ return (be64_to_cpu(service_id) >> 16) & 0xffff;
+}
+
+static struct rdma_id_private *cma_find_listener(
+ struct rdma_bind_list *bind_list,
+ struct ib_cm_id *cm_id,
+ struct ib_cm_event *ib_event)
+{
+ struct rdma_id_private *id_priv, *id_priv_dev;
+
+ if (!bind_list)
+ return ERR_PTR(-EINVAL);
+
+ hlist_for_each_entry(id_priv, &bind_list->owners, node) {
+ if (!cm_compare_private_data(ib_event->private_data,
+ &id_priv->compare_data)) {
+ if (id_priv->id.device == cm_id->device)
+ return id_priv;
+ list_for_each_entry(id_priv_dev,
+ &id_priv->listen_list,
+ listen_list) {
+ if (id_priv_dev->id.device == cm_id->device)
+ return id_priv_dev;
+ }
+ }
+ }
+
+ return ERR_PTR(-EINVAL);
+}
+
+static struct rdma_id_private *cma_id_from_event(struct ib_cm_id *cm_id,
+ struct ib_cm_event *ib_event)
+{
+ struct cma_req_info req;
+ struct net *net;
+ struct rdma_bind_list *bind_list;
+ struct rdma_id_private *id_priv;
+ int err;
+
+ err = cma_save_req_info(ib_event, &req);
+ if (err)
+ return ERR_PTR(err);
+
+ net = cma_get_net_ns(ib_event, &req);
+ if (IS_ERR(net))
+ return ERR_PTR(PTR_ERR(net));
+
+ bind_list = cma_ps_find(net, rdma_ps_from_service_id(req.service_id),
+ cma_port_from_service_id(req.service_id));
+ id_priv = cma_find_listener(bind_list, cm_id, ib_event);
+ put_net(net);
+ return id_priv;
+}
+
static inline int cma_user_data_offset(struct rdma_id_private *id_priv)
{
return cma_family(id_priv) == AF_IB ? 0 : sizeof(struct cma_hdr);
@@ -1121,7 +1227,7 @@ void rdma_destroy_id(struct rdma_cm_id *id)
if (id_priv->cma_dev) {
if (cap_ib_cm(id_priv->id.device, 1)) {
if (id_priv->cm_id.ib)
- ib_destroy_cm_id(id_priv->cm_id.ib);
+ ib_cm_id_put(id_priv->cm_id.ib);
} else if (cap_iw_cm(id_priv->id.device, 1)) {
if (id_priv->cm_id.iw)
iw_destroy_cm_id(id_priv->cm_id.iw);
@@ -1371,10 +1477,9 @@ static int cma_req_handler(struct ib_cm_id *cm_id, struct ib_cm_event *ib_event)
struct rdma_cm_event event;
int offset, ret;
- listen_id = cm_id->context;
- if (cm_compare_private_data(ib_event->private_data,
- &listen_id->compare_data))
- return -EINVAL;
+ listen_id = cma_id_from_event(cm_id, ib_event);
+ if (IS_ERR(listen_id))
+ return PTR_ERR(listen_id);
if (!cma_check_req_qp_type(&listen_id->id, ib_event))
return -EINVAL;
@@ -1648,27 +1753,16 @@ static int cma_ib_listen(struct rdma_id_private *id_priv)
__be64 svc_id;
int ret;
- id = ib_create_cm_id(id_priv->id.device, cma_req_handler, id_priv);
- if (IS_ERR(id))
- return PTR_ERR(id);
-
- id_priv->cm_id.ib = id;
-
addr = cma_src_addr(id_priv);
svc_id = rdma_get_service_id(&id_priv->id, addr);
- if (cma_any_addr(addr) && !id_priv->afonly)
- ret = ib_cm_listen(id_priv->cm_id.ib, svc_id, 0, NULL);
- else {
+ if (!cma_any_addr(addr) || id_priv->afonly)
cma_set_compare_data(id_priv->id.ps, addr,
&id_priv->compare_data);
- ret = ib_cm_listen(id_priv->cm_id.ib, svc_id, 0,
- &id_priv->compare_data);
- }
-
- if (ret) {
- ib_destroy_cm_id(id_priv->cm_id.ib);
- id_priv->cm_id.ib = NULL;
- }
+ id = ib_cm_id_create_and_listen(id_priv->id.device, cma_req_handler,
+ svc_id, 0);
+ if (IS_ERR(id))
+ return PTR_ERR(id);
+ id_priv->cm_id.ib = id;
return ret;
}
@@ -2827,7 +2921,7 @@ static int cma_resolve_ib_udp(struct rdma_id_private *id_priv,
ret = ib_send_cm_sidr_req(id_priv->cm_id.ib, &req);
if (ret) {
- ib_destroy_cm_id(id_priv->cm_id.ib);
+ ib_cm_id_put(id_priv->cm_id.ib);
id_priv->cm_id.ib = NULL;
}
out:
@@ -2898,7 +2992,7 @@ static int cma_connect_ib(struct rdma_id_private *id_priv,
ret = ib_send_cm_req(id_priv->cm_id.ib, &req);
out:
if (ret && !IS_ERR(id)) {
- ib_destroy_cm_id(id);
+ ib_cm_id_put(id);
id_priv->cm_id.ib = NULL;
}
--
1.7.11.2
^ permalink raw reply related [flat|nested] 46+ messages in thread
* [PATCH v3 for-next 12/13] IB/cma: Add support for network namespaces
2015-05-10 10:26 [PATCH v3 for-next 00/13] Add network namespace support in the RDMA-CM Haggai Eran
` (7 preceding siblings ...)
2015-05-10 10:26 ` [PATCH v3 for-next 11/13] IB/cma: Share CM IDs between namespaces Haggai Eran
@ 2015-05-10 10:26 ` Haggai Eran
2015-05-10 10:26 ` [PATCH v3 for-next 13/13] IB/ucma: Take the network namespace from the process Haggai Eran
2015-05-12 17:52 ` [PATCH v3 for-next 00/13] Add network namespace support in the RDMA-CM Hefty, Sean
10 siblings, 0 replies; 46+ messages in thread
From: Haggai Eran @ 2015-05-10 10:26 UTC (permalink / raw)
To: Doug Ledford
Cc: linux-rdma, netdev, Liran Liss, Guy Shapiro, Shachar Raindel,
Yotam Kenneth, Haggai Eran
From: Guy Shapiro <guysh@mellanox.com>
Add support for network namespaces in the ib_cma module. This is
accomplished by:
1. Adding network namespace parameter for rdma_create_id. This parameter is
used to populate the network namespace field in rdma_id_private.
rdma_create_id keeps a reference on the network namespace.
2. Using the network namespace from the rdma_id instead of init_net inside
of ib_cma, when listening on an ID and when looking for an ID for an
incoming request.
3. Decrementing the reference count for the appropriate network namespace
when calling rdma_destroy_id.
In order to preserve the current behavior init_net is passed when calling
from other modules.
Signed-off-by: Guy Shapiro <guysh@mellanox.com>
Signed-off-by: Haggai Eran <haggaie@mellanox.com>
Signed-off-by: Yotam Kenneth <yotamke@mellanox.com>
Signed-off-by: Shachar Raindel <raindel@mellanox.com>
---
drivers/infiniband/core/cma.c | 42 +++++++++++++---------
drivers/infiniband/core/ucma.c | 3 +-
drivers/infiniband/ulp/iser/iser_verbs.c | 2 +-
drivers/infiniband/ulp/isert/ib_isert.c | 2 +-
.../staging/lustre/lnet/klnds/o2iblnd/o2iblnd.h | 4 ++-
include/rdma/rdma_cm.h | 6 +++-
net/9p/trans_rdma.c | 4 +--
net/rds/ib.c | 2 +-
net/rds/ib_cm.c | 2 +-
net/rds/iw.c | 2 +-
net/rds/iw_cm.c | 2 +-
net/rds/rdma_transport.c | 4 +--
net/sunrpc/xprtrdma/svc_rdma_transport.c | 4 +--
net/sunrpc/xprtrdma/verbs.c | 3 +-
14 files changed, 50 insertions(+), 32 deletions(-)
diff --git a/drivers/infiniband/core/cma.c b/drivers/infiniband/core/cma.c
index 9c72cdde44b5..b55ad0a06479 100644
--- a/drivers/infiniband/core/cma.c
+++ b/drivers/infiniband/core/cma.c
@@ -553,7 +553,8 @@ static int cma_disable_callback(struct rdma_id_private *id_priv,
return 0;
}
-struct rdma_cm_id *rdma_create_id(rdma_cm_event_handler event_handler,
+struct rdma_cm_id *rdma_create_id(struct net *net,
+ rdma_cm_event_handler event_handler,
void *context, enum rdma_port_space ps,
enum ib_qp_type qp_type)
{
@@ -577,7 +578,7 @@ struct rdma_cm_id *rdma_create_id(rdma_cm_event_handler event_handler,
INIT_LIST_HEAD(&id_priv->listen_list);
INIT_LIST_HEAD(&id_priv->mc_list);
get_random_bytes(&id_priv->seq_num, sizeof id_priv->seq_num);
- id_priv->id.route.addr.dev_addr.net = &init_net;
+ id_priv->id.route.addr.dev_addr.net = get_net(net);
return &id_priv->id;
}
@@ -1178,6 +1179,7 @@ static void cma_cancel_operation(struct rdma_id_private *id_priv,
static void cma_release_port(struct rdma_id_private *id_priv)
{
struct rdma_bind_list *bind_list = id_priv->bind_list;
+ struct net *net = id_priv->id.route.addr.dev_addr.net;
if (!bind_list)
return;
@@ -1185,7 +1187,7 @@ static void cma_release_port(struct rdma_id_private *id_priv)
mutex_lock(&lock);
hlist_del(&id_priv->node);
if (hlist_empty(&bind_list->owners)) {
- cma_ps_remove(&init_net, bind_list->ps, bind_list->port);
+ cma_ps_remove(net, bind_list->ps, bind_list->port);
kfree(bind_list);
}
mutex_unlock(&lock);
@@ -1244,6 +1246,7 @@ void rdma_destroy_id(struct rdma_cm_id *id)
cma_deref_id(id_priv->id.context);
kfree(id_priv->id.route.path_rec);
+ put_net(id_priv->id.route.addr.dev_addr.net);
kfree(id_priv);
}
EXPORT_SYMBOL(rdma_destroy_id);
@@ -1373,7 +1376,8 @@ static struct rdma_id_private *cma_new_conn_id(struct rdma_cm_id *listen_id,
ib_event->param.req_rcvd.primary_path->service_id;
int ret;
- id = rdma_create_id(listen_id->event_handler, listen_id->context,
+ id = rdma_create_id(listen_id->route.addr.dev_addr.net,
+ listen_id->event_handler, listen_id->context,
listen_id->ps, ib_event->param.req_rcvd.qp_type);
if (IS_ERR(id))
return NULL;
@@ -1419,9 +1423,10 @@ static struct rdma_id_private *cma_new_udp_id(struct rdma_cm_id *listen_id,
{
struct rdma_id_private *id_priv;
struct rdma_cm_id *id;
+ struct net *net = listen_id->route.addr.dev_addr.net;
int ret;
- id = rdma_create_id(listen_id->event_handler, listen_id->context,
+ id = rdma_create_id(net, listen_id->event_handler, listen_id->context,
listen_id->ps, IB_QPT_UD);
if (IS_ERR(id))
return NULL;
@@ -1676,7 +1681,8 @@ static int iw_conn_req_handler(struct iw_cm_id *cm_id,
return -ECONNABORTED;
/* Create a new RDMA id for the new IW CM ID */
- new_cm_id = rdma_create_id(listen_id->id.event_handler,
+ new_cm_id = rdma_create_id(listen_id->id.route.addr.dev_addr.net,
+ listen_id->id.event_handler,
listen_id->id.context,
RDMA_PS_TCP, IB_QPT_RC);
if (IS_ERR(new_cm_id)) {
@@ -1808,12 +1814,13 @@ static void cma_listen_on_dev(struct rdma_id_private *id_priv,
{
struct rdma_id_private *dev_id_priv;
struct rdma_cm_id *id;
+ struct net *net = id_priv->id.route.addr.dev_addr.net;
int ret;
if (cma_family(id_priv) == AF_IB && !cap_ib_cm(cma_dev->device, 1))
return;
- id = rdma_create_id(cma_listen_handler, id_priv, id_priv->id.ps,
+ id = rdma_create_id(net, cma_listen_handler, id_priv, id_priv->id.ps,
id_priv->id.qp_type);
if (IS_ERR(id))
return;
@@ -2483,7 +2490,8 @@ static int cma_alloc_port(enum rdma_port_space ps,
if (!bind_list)
return -ENOMEM;
- ret = cma_ps_alloc(&init_net, ps, bind_list, snum);
+ ret = cma_ps_alloc(id_priv->id.route.addr.dev_addr.net, ps, bind_list,
+ snum);
if (ret < 0)
goto err;
@@ -2502,13 +2510,14 @@ static int cma_alloc_any_port(enum rdma_port_space ps,
static unsigned int last_used_port;
int low, high, remaining;
unsigned int rover;
+ struct net *net = id_priv->id.route.addr.dev_addr.net;
- inet_get_local_port_range(&init_net, &low, &high);
+ inet_get_local_port_range(net, &low, &high);
remaining = (high - low) + 1;
rover = prandom_u32() % remaining + low;
retry:
if (last_used_port != rover &&
- !cma_ps_find(&init_net, ps, (unsigned short)rover)) {
+ !cma_ps_find(net, ps, (unsigned short)rover)) {
int ret = cma_alloc_port(ps, id_priv, rover);
/*
* Remember previously used port number in order to avoid
@@ -2574,7 +2583,7 @@ static int cma_use_port(enum rdma_port_space ps,
if (snum < PROT_SOCK && !capable(CAP_NET_BIND_SERVICE))
return -EACCES;
- bind_list = cma_ps_find(&init_net, ps, snum);
+ bind_list = cma_ps_find(id_priv->id.route.addr.dev_addr.net, ps, snum);
if (!bind_list) {
ret = cma_alloc_port(ps, id_priv, snum);
} else {
@@ -2766,8 +2775,11 @@ int rdma_bind_addr(struct rdma_cm_id *id, struct sockaddr *addr)
if (addr->sa_family == AF_INET)
id_priv->afonly = 1;
#if IS_ENABLED(CONFIG_IPV6)
- else if (addr->sa_family == AF_INET6)
- id_priv->afonly = init_net.ipv6.sysctl.bindv6only;
+ else if (addr->sa_family == AF_INET6) {
+ struct net *net = id_priv->id.route.addr.dev_addr.net;
+
+ id_priv->afonly = net->ipv6.sysctl.bindv6only;
+ }
#endif
}
ret = cma_get_port(id_priv);
@@ -3571,6 +3583,7 @@ static int cma_netdev_change(struct net_device *ndev, struct rdma_id_private *id
dev_addr = &id_priv->id.route.addr.dev_addr;
if ((dev_addr->bound_dev_if == ndev->ifindex) &&
+ (net_eq(dev_net(ndev), dev_addr->net)) &&
memcmp(dev_addr->src_dev_addr, ndev->dev_addr, ndev->addr_len)) {
printk(KERN_INFO "RDMA CM addr change for ndev %s used by id %p\n",
ndev->name, &id_priv->id);
@@ -3596,9 +3609,6 @@ static int cma_netdev_callback(struct notifier_block *self, unsigned long event,
struct rdma_id_private *id_priv;
int ret = NOTIFY_DONE;
- if (dev_net(ndev) != &init_net)
- return NOTIFY_DONE;
-
if (event != NETDEV_BONDING_FAILOVER)
return NOTIFY_DONE;
diff --git a/drivers/infiniband/core/ucma.c b/drivers/infiniband/core/ucma.c
index 6204065f40c4..b595d78b8ab0 100644
--- a/drivers/infiniband/core/ucma.c
+++ b/drivers/infiniband/core/ucma.c
@@ -391,7 +391,8 @@ static ssize_t ucma_create_id(struct ucma_file *file, const char __user *inbuf,
return -ENOMEM;
ctx->uid = cmd.uid;
- ctx->cm_id = rdma_create_id(ucma_event_handler, ctx, cmd.ps, qp_type);
+ ctx->cm_id = rdma_create_id(&init_net, ucma_event_handler, ctx, cmd.ps,
+ qp_type);
if (IS_ERR(ctx->cm_id)) {
ret = PTR_ERR(ctx->cm_id);
goto err1;
diff --git a/drivers/infiniband/ulp/iser/iser_verbs.c b/drivers/infiniband/ulp/iser/iser_verbs.c
index cc2dd35ffbc0..5517333738a8 100644
--- a/drivers/infiniband/ulp/iser/iser_verbs.c
+++ b/drivers/infiniband/ulp/iser/iser_verbs.c
@@ -960,7 +960,7 @@ int iser_connect(struct iser_conn *iser_conn,
ib_conn->beacon.wr_id = ISER_BEACON_WRID;
ib_conn->beacon.opcode = IB_WR_SEND;
- ib_conn->cma_id = rdma_create_id(iser_cma_handler,
+ ib_conn->cma_id = rdma_create_id(&init_net, iser_cma_handler,
(void *)iser_conn,
RDMA_PS_TCP, IB_QPT_RC);
if (IS_ERR(ib_conn->cma_id)) {
diff --git a/drivers/infiniband/ulp/isert/ib_isert.c b/drivers/infiniband/ulp/isert/ib_isert.c
index 327529ee85eb..746d05ef2cea 100644
--- a/drivers/infiniband/ulp/isert/ib_isert.c
+++ b/drivers/infiniband/ulp/isert/ib_isert.c
@@ -3043,7 +3043,7 @@ isert_setup_id(struct isert_np *isert_np)
sa = (struct sockaddr *)&np->np_sockaddr;
isert_dbg("ksockaddr: %p, sa: %p\n", &np->np_sockaddr, sa);
- id = rdma_create_id(isert_cma_handler, isert_np,
+ id = rdma_create_id(&init_net, isert_cma_handler, isert_np,
RDMA_PS_TCP, IB_QPT_RC);
if (IS_ERR(id)) {
isert_err("rdma_create_id() failed: %ld\n", PTR_ERR(id));
diff --git a/drivers/staging/lustre/lnet/klnds/o2iblnd/o2iblnd.h b/drivers/staging/lustre/lnet/klnds/o2iblnd/o2iblnd.h
index cd664d025f41..f4aab7485a84 100644
--- a/drivers/staging/lustre/lnet/klnds/o2iblnd/o2iblnd.h
+++ b/drivers/staging/lustre/lnet/klnds/o2iblnd/o2iblnd.h
@@ -125,7 +125,9 @@ extern kib_tunables_t kiblnd_tunables;
IBLND_CREDIT_HIGHWATER_V1 : \
*kiblnd_tunables.kib_peercredits_hiw) /* when eagerly to return credits */
-#define kiblnd_rdma_create_id(cb, dev, ps, qpt) rdma_create_id(cb, dev, ps, qpt)
+#define kiblnd_rdma_create_id(cb, dev, ps, qpt) rdma_create_id(&init_net, \
+ cb, dev, \
+ ps, qpt)
static inline int
kiblnd_concurrent_sends_v1(void)
diff --git a/include/rdma/rdma_cm.h b/include/rdma/rdma_cm.h
index 1ed2088dc9f5..f45566f8dc1a 100644
--- a/include/rdma/rdma_cm.h
+++ b/include/rdma/rdma_cm.h
@@ -158,13 +158,17 @@ struct rdma_cm_id {
/**
* rdma_create_id - Create an RDMA identifier.
*
+ * @net: The network namespace in which to create the new id.
* @event_handler: User callback invoked to report events associated with the
* returned rdma_id.
* @context: User specified context associated with the id.
* @ps: RDMA port space.
* @qp_type: type of queue pair associated with the id.
+ *
+ * The id holds a reference on the network namespace until it is destroyed.
*/
-struct rdma_cm_id *rdma_create_id(rdma_cm_event_handler event_handler,
+struct rdma_cm_id *rdma_create_id(struct net *net,
+ rdma_cm_event_handler event_handler,
void *context, enum rdma_port_space ps,
enum ib_qp_type qp_type);
diff --git a/net/9p/trans_rdma.c b/net/9p/trans_rdma.c
index 3533d2a53ab6..8bc3b5cbedf1 100644
--- a/net/9p/trans_rdma.c
+++ b/net/9p/trans_rdma.c
@@ -660,8 +660,8 @@ rdma_create_trans(struct p9_client *client, const char *addr, char *args)
return -ENOMEM;
/* Create the RDMA CM ID */
- rdma->cm_id = rdma_create_id(p9_cm_event_handler, client, RDMA_PS_TCP,
- IB_QPT_RC);
+ rdma->cm_id = rdma_create_id(&init_net, p9_cm_event_handler, client,
+ RDMA_PS_TCP, IB_QPT_RC);
if (IS_ERR(rdma->cm_id))
goto error;
diff --git a/net/rds/ib.c b/net/rds/ib.c
index ba2dffeff608..c68e7b606975 100644
--- a/net/rds/ib.c
+++ b/net/rds/ib.c
@@ -326,7 +326,7 @@ static int rds_ib_laddr_check(__be32 addr)
/* Create a CMA ID and try to bind it. This catches both
* IB and iWARP capable NICs.
*/
- cm_id = rdma_create_id(NULL, NULL, RDMA_PS_TCP, IB_QPT_RC);
+ cm_id = rdma_create_id(&init_net, NULL, NULL, RDMA_PS_TCP, IB_QPT_RC);
if (IS_ERR(cm_id))
return PTR_ERR(cm_id);
diff --git a/net/rds/ib_cm.c b/net/rds/ib_cm.c
index 31b74f5e61ad..0640f66d5d9e 100644
--- a/net/rds/ib_cm.c
+++ b/net/rds/ib_cm.c
@@ -583,7 +583,7 @@ int rds_ib_conn_connect(struct rds_connection *conn)
/* XXX I wonder what affect the port space has */
/* delegate cm event handler to rdma_transport */
- ic->i_cm_id = rdma_create_id(rds_rdma_cm_event_handler, conn,
+ ic->i_cm_id = rdma_create_id(&init_net, rds_rdma_cm_event_handler, conn,
RDMA_PS_TCP, IB_QPT_RC);
if (IS_ERR(ic->i_cm_id)) {
ret = PTR_ERR(ic->i_cm_id);
diff --git a/net/rds/iw.c b/net/rds/iw.c
index 589935661d66..230da79e62a1 100644
--- a/net/rds/iw.c
+++ b/net/rds/iw.c
@@ -227,7 +227,7 @@ static int rds_iw_laddr_check(__be32 addr)
/* Create a CMA ID and try to bind it. This catches both
* IB and iWARP capable NICs.
*/
- cm_id = rdma_create_id(NULL, NULL, RDMA_PS_TCP, IB_QPT_RC);
+ cm_id = rdma_create_id(&init_net, NULL, NULL, RDMA_PS_TCP, IB_QPT_RC);
if (IS_ERR(cm_id))
return PTR_ERR(cm_id);
diff --git a/net/rds/iw_cm.c b/net/rds/iw_cm.c
index a6c2bea9f8f9..5c18819f13fc 100644
--- a/net/rds/iw_cm.c
+++ b/net/rds/iw_cm.c
@@ -520,7 +520,7 @@ int rds_iw_conn_connect(struct rds_connection *conn)
/* XXX I wonder what affect the port space has */
/* delegate cm event handler to rdma_transport */
- ic->i_cm_id = rdma_create_id(rds_rdma_cm_event_handler, conn,
+ ic->i_cm_id = rdma_create_id(&init_net, rds_rdma_cm_event_handler, conn,
RDMA_PS_TCP, IB_QPT_RC);
if (IS_ERR(ic->i_cm_id)) {
ret = PTR_ERR(ic->i_cm_id);
diff --git a/net/rds/rdma_transport.c b/net/rds/rdma_transport.c
index 6cd9d1deafc3..0c88f24aed85 100644
--- a/net/rds/rdma_transport.c
+++ b/net/rds/rdma_transport.c
@@ -159,8 +159,8 @@ static int rds_rdma_listen_init(void)
struct rdma_cm_id *cm_id;
int ret;
- cm_id = rdma_create_id(rds_rdma_cm_event_handler, NULL, RDMA_PS_TCP,
- IB_QPT_RC);
+ cm_id = rdma_create_id(&init_net, rds_rdma_cm_event_handler, NULL,
+ RDMA_PS_TCP, IB_QPT_RC);
if (IS_ERR(cm_id)) {
ret = PTR_ERR(cm_id);
printk(KERN_ERR "RDS/RDMA: failed to setup listener, "
diff --git a/net/sunrpc/xprtrdma/svc_rdma_transport.c b/net/sunrpc/xprtrdma/svc_rdma_transport.c
index 3df8320c6efe..6d5c61c2b381 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_transport.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_transport.c
@@ -704,8 +704,8 @@ static struct svc_xprt *svc_rdma_create(struct svc_serv *serv,
if (!cma_xprt)
return ERR_PTR(-ENOMEM);
- listen_id = rdma_create_id(rdma_listen_handler, cma_xprt, RDMA_PS_TCP,
- IB_QPT_RC);
+ listen_id = rdma_create_id(&init_net, rdma_listen_handler, cma_xprt,
+ RDMA_PS_TCP, IB_QPT_RC);
if (IS_ERR(listen_id)) {
ret = PTR_ERR(listen_id);
dprintk("svcrdma: rdma_create_id failed = %d\n", ret);
diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c
index 4870d272e006..12d225359e8d 100644
--- a/net/sunrpc/xprtrdma/verbs.c
+++ b/net/sunrpc/xprtrdma/verbs.c
@@ -509,7 +509,8 @@ rpcrdma_create_id(struct rpcrdma_xprt *xprt,
init_completion(&ia->ri_done);
- id = rdma_create_id(rpcrdma_conn_upcall, xprt, RDMA_PS_TCP, IB_QPT_RC);
+ id = rdma_create_id(&init_net, rpcrdma_conn_upcall, xprt, RDMA_PS_TCP,
+ IB_QPT_RC);
if (IS_ERR(id)) {
rc = PTR_ERR(id);
dprintk("RPC: %s: rdma_create_id() failed %i\n",
--
1.7.11.2
^ permalink raw reply related [flat|nested] 46+ messages in thread
* [PATCH v3 for-next 13/13] IB/ucma: Take the network namespace from the process
2015-05-10 10:26 [PATCH v3 for-next 00/13] Add network namespace support in the RDMA-CM Haggai Eran
` (8 preceding siblings ...)
2015-05-10 10:26 ` [PATCH v3 for-next 12/13] IB/cma: Add support for network namespaces Haggai Eran
@ 2015-05-10 10:26 ` Haggai Eran
2015-05-12 17:52 ` [PATCH v3 for-next 00/13] Add network namespace support in the RDMA-CM Hefty, Sean
10 siblings, 0 replies; 46+ messages in thread
From: Haggai Eran @ 2015-05-10 10:26 UTC (permalink / raw)
To: Doug Ledford
Cc: linux-rdma, netdev, Liran Liss, Guy Shapiro, Shachar Raindel,
Yotam Kenneth, Haggai Eran
From: Guy Shapiro <guysh@mellanox.com>
Add support for network namespaces from user space. This is done by passing
the network namespace of the process instead of init_net.
Signed-off-by: Haggai Eran <haggaie@mellanox.com>
Signed-off-by: Yotam Kenneth <yotamke@mellanox.com>
Signed-off-by: Shachar Raindel <raindel@mellanox.com>
Signed-off-by: Guy Shapiro <guysh@mellanox.com>
---
drivers/infiniband/core/ucma.c | 5 +++--
1 file changed, 3 insertions(+), 2 deletions(-)
diff --git a/drivers/infiniband/core/ucma.c b/drivers/infiniband/core/ucma.c
index b595d78b8ab0..0289d64d717c 100644
--- a/drivers/infiniband/core/ucma.c
+++ b/drivers/infiniband/core/ucma.c
@@ -42,6 +42,7 @@
#include <linux/slab.h>
#include <linux/sysctl.h>
#include <linux/module.h>
+#include <linux/nsproxy.h>
#include <rdma/rdma_user_cm.h>
#include <rdma/ib_marshall.h>
@@ -391,8 +392,8 @@ static ssize_t ucma_create_id(struct ucma_file *file, const char __user *inbuf,
return -ENOMEM;
ctx->uid = cmd.uid;
- ctx->cm_id = rdma_create_id(&init_net, ucma_event_handler, ctx, cmd.ps,
- qp_type);
+ ctx->cm_id = rdma_create_id(current->nsproxy->net_ns,
+ ucma_event_handler, ctx, cmd.ps, qp_type);
if (IS_ERR(ctx->cm_id)) {
ret = PTR_ERR(ctx->cm_id);
goto err1;
--
1.7.11.2
^ permalink raw reply related [flat|nested] 46+ messages in thread
* RE: [PATCH v3 for-next 00/13] Add network namespace support in the RDMA-CM
2015-05-10 10:26 [PATCH v3 for-next 00/13] Add network namespace support in the RDMA-CM Haggai Eran
` (9 preceding siblings ...)
2015-05-10 10:26 ` [PATCH v3 for-next 13/13] IB/ucma: Take the network namespace from the process Haggai Eran
@ 2015-05-12 17:52 ` Hefty, Sean
[not found] ` <1828884A29C6694DAF28B7E6B8A82373A8FD7B85-P5GAC/sN6hkd3b2yrw5b5LfspsVTdybXVpNB7YpNyf8@public.gmane.org>
10 siblings, 1 reply; 46+ messages in thread
From: Hefty, Sean @ 2015-05-12 17:52 UTC (permalink / raw)
To: Haggai Eran, Doug Ledford
Cc: linux-rdma, netdev, Liran Liss, Guy Shapiro, Shachar Raindel,
Yotam Kenneth
> Guy Shapiro (4):
> IB/addr: Pass network namespace as a parameter
> IB/ipoib: Return IPoIB devices matching connection parameters
> IB/cma: Add support for network namespaces
> IB/ucma: Take the network namespace from the process
>
> Haggai Eran (8):
> IB/core: Use SRCU when reading client_list or device_list
> IB/cm: Reference count ib_cm_ids
> IB/cm: API to retrieve existing listening CM IDs
> IB/cm: Expose service ID in request events
> IB/cma: Refactor RDMA IP CM private-data parsing code
> IB/cma: Add compare_data checks to the RDMA CM module
> IB/cma: Separate port allocation to network namespaces
> IB/cma: Share CM IDs between namespaces
>
> Yotam Kenneth (1):
> IB/core: Find the network namespace matching connection parameters
How does network namespace support work with iWARP?
^ permalink raw reply [flat|nested] 46+ messages in thread