* [PATCH for-next 0/6] RDMA/rxe: Fix potential races
@ 2021-10-10 23:59 Bob Pearson
  2021-10-10 23:59 ` [PATCH for-next 1/6] RDMA/rxe: Make rxe_alloc() take pool lock Bob Pearson
                   ` (6 more replies)
  0 siblings, 7 replies; 21+ messages in thread
From: Bob Pearson @ 2021-10-10 23:59 UTC (permalink / raw)
  To: jgg, zyjzyj2000, linux-rdma; +Cc: Bob Pearson

There are possible race conditions related to attempting to access
rxe pool objects at the same time as the pools or elements are being
freed. This series of patches addresses these races.

Bob Pearson (6):
  RDMA/rxe: Make rxe_alloc() take pool lock
  RDMA/rxe: Copy setup parameters into rxe_pool
  RDMA/rxe: Save object pointer in pool element
  RDMA/rxe: Combine rxe_add_index with rxe_alloc
  RDMA/rxe: Combine rxe_add_key with rxe_alloc
  RDMA/rxe: Fix potential race condition in rxe_pool

 drivers/infiniband/sw/rxe/rxe_mcast.c |   5 +-
 drivers/infiniband/sw/rxe/rxe_mr.c    |   1 -
 drivers/infiniband/sw/rxe/rxe_mw.c    |   5 +-
 drivers/infiniband/sw/rxe/rxe_pool.c  | 235 +++++++++++++-------------
 drivers/infiniband/sw/rxe/rxe_pool.h  |  67 +++-----
 drivers/infiniband/sw/rxe/rxe_verbs.c |  10 --
 6 files changed, 140 insertions(+), 183 deletions(-)

-- 
2.30.2



* [PATCH for-next 1/6] RDMA/rxe: Make rxe_alloc() take pool lock
  2021-10-10 23:59 [PATCH for-next 0/6] RDMA/rxe: Fix potential races Bob Pearson
@ 2021-10-10 23:59 ` Bob Pearson
  2021-10-20 23:16   ` Jason Gunthorpe
  2021-10-10 23:59 ` [PATCH for-next 2/6] RDMA/rxe: Copy setup parameters into rxe_pool Bob Pearson
                   ` (5 subsequent siblings)
  6 siblings, 1 reply; 21+ messages in thread
From: Bob Pearson @ 2021-10-10 23:59 UTC (permalink / raw)
  To: jgg, zyjzyj2000, linux-rdma; +Cc: Bob Pearson

In rxe there are two separate pool APIs for creating a new object:
rxe_alloc() and rxe_alloc_locked(). Currently they are identical.
Make rxe_alloc() take the pool lock, which is in line with the other
APIs in the library.

Signed-off-by: Bob Pearson <rpearsonhpe@gmail.com>
---
 drivers/infiniband/sw/rxe/rxe_pool.c | 21 ++++-----------------
 1 file changed, 4 insertions(+), 17 deletions(-)

diff --git a/drivers/infiniband/sw/rxe/rxe_pool.c b/drivers/infiniband/sw/rxe/rxe_pool.c
index ffa8420b4765..7a288ebacceb 100644
--- a/drivers/infiniband/sw/rxe/rxe_pool.c
+++ b/drivers/infiniband/sw/rxe/rxe_pool.c
@@ -352,27 +352,14 @@ void *rxe_alloc_locked(struct rxe_pool *pool)
 
 void *rxe_alloc(struct rxe_pool *pool)
 {
-	struct rxe_type_info *info = &rxe_type_info[pool->type];
-	struct rxe_pool_entry *elem;
+	unsigned long flags;
 	u8 *obj;
 
-	if (atomic_inc_return(&pool->num_elem) > pool->max_elem)
-		goto out_cnt;
-
-	obj = kzalloc(info->size, GFP_KERNEL);
-	if (!obj)
-		goto out_cnt;
-
-	elem = (struct rxe_pool_entry *)(obj + info->elem_offset);
-
-	elem->pool = pool;
-	kref_init(&elem->ref_cnt);
+	write_lock_irqsave(&pool->pool_lock, flags);
+	obj = rxe_alloc_locked(pool);
+	write_unlock_irqrestore(&pool->pool_lock, flags);
 
 	return obj;
-
-out_cnt:
-	atomic_dec(&pool->num_elem);
-	return NULL;
 }
 
 int __rxe_add_to_pool(struct rxe_pool *pool, struct rxe_pool_entry *elem)
-- 
2.30.2



* [PATCH for-next 2/6] RDMA/rxe: Copy setup parameters into rxe_pool
  2021-10-10 23:59 [PATCH for-next 0/6] RDMA/rxe: Fix potential races Bob Pearson
  2021-10-10 23:59 ` [PATCH for-next 1/6] RDMA/rxe: Make rxe_alloc() take pool lock Bob Pearson
@ 2021-10-10 23:59 ` Bob Pearson
  2021-10-10 23:59 ` [PATCH for-next 3/6] RDMA/rxe: Save object pointer in pool element Bob Pearson
                   ` (4 subsequent siblings)
  6 siblings, 0 replies; 21+ messages in thread
From: Bob Pearson @ 2021-10-10 23:59 UTC (permalink / raw)
  To: jgg, zyjzyj2000, linux-rdma; +Cc: Bob Pearson

In rxe_pool.c, copy the remaining pool setup parameters from
rxe_type_info into rxe_pool. This saves looking up rxe_type_info in
the performance path.

Signed-off-by: Bob Pearson <rpearsonhpe@gmail.com>
---
 drivers/infiniband/sw/rxe/rxe_pool.c | 47 ++++++++++++----------------
 drivers/infiniband/sw/rxe/rxe_pool.h |  5 +--
 2 files changed, 23 insertions(+), 29 deletions(-)

diff --git a/drivers/infiniband/sw/rxe/rxe_pool.c b/drivers/infiniband/sw/rxe/rxe_pool.c
index 7a288ebacceb..e9d74ad3f0b7 100644
--- a/drivers/infiniband/sw/rxe/rxe_pool.c
+++ b/drivers/infiniband/sw/rxe/rxe_pool.c
@@ -9,7 +9,7 @@
 
 /* info about object pools
  */
-struct rxe_type_info rxe_type_info[RXE_NUM_TYPES] = {
+static const struct rxe_type_info rxe_type_info[RXE_NUM_TYPES] = {
 	[RXE_TYPE_UC] = {
 		.name		= "rxe-uc",
 		.size		= sizeof(struct rxe_ucontext),
@@ -86,11 +86,6 @@ struct rxe_type_info rxe_type_info[RXE_NUM_TYPES] = {
 	},
 };
 
-static inline const char *pool_name(struct rxe_pool *pool)
-{
-	return rxe_type_info[pool->type].name;
-}
-
 static int rxe_pool_init_index(struct rxe_pool *pool, u32 max, u32 min)
 {
 	int err = 0;
@@ -125,35 +120,37 @@ int rxe_pool_init(
 	enum rxe_elem_type	type,
 	unsigned int		max_elem)
 {
+	const struct rxe_type_info *info = &rxe_type_info[type];
 	int			err = 0;
-	size_t			size = rxe_type_info[type].size;
 
 	memset(pool, 0, sizeof(*pool));
 
 	pool->rxe		= rxe;
+	pool->name		= info->name;
 	pool->type		= type;
 	pool->max_elem		= max_elem;
-	pool->elem_size		= ALIGN(size, RXE_POOL_ALIGN);
-	pool->flags		= rxe_type_info[type].flags;
+	pool->elem_size		= ALIGN(info->size, RXE_POOL_ALIGN);
+	pool->flags		= info->flags;
 	pool->index.tree	= RB_ROOT;
 	pool->key.tree		= RB_ROOT;
-	pool->cleanup		= rxe_type_info[type].cleanup;
+	pool->cleanup		= info->cleanup;
+	pool->size		= info->size;
+	pool->elem_offset	= info->elem_offset;
 
 	atomic_set(&pool->num_elem, 0);
 
 	rwlock_init(&pool->pool_lock);
 
-	if (rxe_type_info[type].flags & RXE_POOL_INDEX) {
-		err = rxe_pool_init_index(pool,
-					  rxe_type_info[type].max_index,
-					  rxe_type_info[type].min_index);
+	if (info->flags & RXE_POOL_INDEX) {
+		err = rxe_pool_init_index(pool, info->max_index,
+					  info->min_index);
 		if (err)
 			goto out;
 	}
 
-	if (rxe_type_info[type].flags & RXE_POOL_KEY) {
-		pool->key.key_offset = rxe_type_info[type].key_offset;
-		pool->key.key_size = rxe_type_info[type].key_size;
+	if (info->flags & RXE_POOL_KEY) {
+		pool->key.key_offset = info->key_offset;
+		pool->key.key_size = info->key_size;
 	}
 
 out:
@@ -164,7 +161,7 @@ void rxe_pool_cleanup(struct rxe_pool *pool)
 {
 	if (atomic_read(&pool->num_elem) > 0)
 		pr_warn("%s pool destroyed with unfree'd elem\n",
-			pool_name(pool));
+			pool->name);
 
 	kfree(pool->index.table);
 }
@@ -327,18 +324,17 @@ void __rxe_drop_index(struct rxe_pool_entry *elem)
 
 void *rxe_alloc_locked(struct rxe_pool *pool)
 {
-	struct rxe_type_info *info = &rxe_type_info[pool->type];
 	struct rxe_pool_entry *elem;
 	u8 *obj;
 
 	if (atomic_inc_return(&pool->num_elem) > pool->max_elem)
 		goto out_cnt;
 
-	obj = kzalloc(info->size, GFP_ATOMIC);
+	obj = kzalloc(pool->size, GFP_ATOMIC);
 	if (!obj)
 		goto out_cnt;
 
-	elem = (struct rxe_pool_entry *)(obj + info->elem_offset);
+	elem = (struct rxe_pool_entry *)(obj + pool->elem_offset);
 
 	elem->pool = pool;
 	kref_init(&elem->ref_cnt);
@@ -382,14 +378,13 @@ void rxe_elem_release(struct kref *kref)
 	struct rxe_pool_entry *elem =
 		container_of(kref, struct rxe_pool_entry, ref_cnt);
 	struct rxe_pool *pool = elem->pool;
-	struct rxe_type_info *info = &rxe_type_info[pool->type];
 	u8 *obj;
 
 	if (pool->cleanup)
 		pool->cleanup(elem);
 
 	if (!(pool->flags & RXE_POOL_NO_ALLOC)) {
-		obj = (u8 *)elem - info->elem_offset;
+		obj = (u8 *)elem - pool->elem_offset;
 		kfree(obj);
 	}
 
@@ -398,7 +393,6 @@ void rxe_elem_release(struct kref *kref)
 
 void *rxe_pool_get_index_locked(struct rxe_pool *pool, u32 index)
 {
-	struct rxe_type_info *info = &rxe_type_info[pool->type];
 	struct rb_node *node;
 	struct rxe_pool_entry *elem;
 	u8 *obj;
@@ -418,7 +412,7 @@ void *rxe_pool_get_index_locked(struct rxe_pool *pool, u32 index)
 
 	if (node) {
 		kref_get(&elem->ref_cnt);
-		obj = (u8 *)elem - info->elem_offset;
+		obj = (u8 *)elem - pool->elem_offset;
 	} else {
 		obj = NULL;
 	}
@@ -440,7 +434,6 @@ void *rxe_pool_get_index(struct rxe_pool *pool, u32 index)
 
 void *rxe_pool_get_key_locked(struct rxe_pool *pool, void *key)
 {
-	struct rxe_type_info *info = &rxe_type_info[pool->type];
 	struct rb_node *node;
 	struct rxe_pool_entry *elem;
 	u8 *obj;
@@ -464,7 +457,7 @@ void *rxe_pool_get_key_locked(struct rxe_pool *pool, void *key)
 
 	if (node) {
 		kref_get(&elem->ref_cnt);
-		obj = (u8 *)elem - info->elem_offset;
+		obj = (u8 *)elem - pool->elem_offset;
 	} else {
 		obj = NULL;
 	}
diff --git a/drivers/infiniband/sw/rxe/rxe_pool.h b/drivers/infiniband/sw/rxe/rxe_pool.h
index 1feca1bffced..cd962dc5de9d 100644
--- a/drivers/infiniband/sw/rxe/rxe_pool.h
+++ b/drivers/infiniband/sw/rxe/rxe_pool.h
@@ -44,8 +44,6 @@ struct rxe_type_info {
 	size_t			key_size;
 };
 
-extern struct rxe_type_info rxe_type_info[];
-
 struct rxe_pool_entry {
 	struct rxe_pool		*pool;
 	struct kref		ref_cnt;
@@ -61,6 +59,7 @@ struct rxe_pool_entry {
 
 struct rxe_pool {
 	struct rxe_dev		*rxe;
+	const char		*name;
 	rwlock_t		pool_lock; /* protects pool add/del/search */
 	size_t			elem_size;
 	void			(*cleanup)(struct rxe_pool_entry *obj);
@@ -69,6 +68,8 @@ struct rxe_pool {
 
 	unsigned int		max_elem;
 	atomic_t		num_elem;
+	size_t			size;
+	size_t			elem_offset;
 
 	/* only used if indexed */
 	struct {
-- 
2.30.2



* [PATCH for-next 3/6] RDMA/rxe: Save object pointer in pool element
  2021-10-10 23:59 [PATCH for-next 0/6] RDMA/rxe: Fix potential races Bob Pearson
  2021-10-10 23:59 ` [PATCH for-next 1/6] RDMA/rxe: Make rxe_alloc() take pool lock Bob Pearson
  2021-10-10 23:59 ` [PATCH for-next 2/6] RDMA/rxe: Copy setup parameters into rxe_pool Bob Pearson
@ 2021-10-10 23:59 ` Bob Pearson
  2021-10-20 23:20   ` Jason Gunthorpe
  2021-10-10 23:59 ` [PATCH for-next 4/6] RDMA/rxe: Combine rxe_add_index with rxe_alloc Bob Pearson
                   ` (3 subsequent siblings)
  6 siblings, 1 reply; 21+ messages in thread
From: Bob Pearson @ 2021-10-10 23:59 UTC (permalink / raw)
  To: jgg, zyjzyj2000, linux-rdma; +Cc: Bob Pearson

In rxe_pool.c there are currently many cases where it is necessary to
compute the offset from a pool element struct to the object containing
it in a type-independent way. By saving a pointer to the object in the
element when it is created, this extra work can be avoided.

Signed-off-by: Bob Pearson <rpearsonhpe@gmail.com>
---
 drivers/infiniband/sw/rxe/rxe_pool.c | 16 +++++++++-------
 drivers/infiniband/sw/rxe/rxe_pool.h |  1 +
 2 files changed, 10 insertions(+), 7 deletions(-)

diff --git a/drivers/infiniband/sw/rxe/rxe_pool.c b/drivers/infiniband/sw/rxe/rxe_pool.c
index e9d74ad3f0b7..4caaecdb0d68 100644
--- a/drivers/infiniband/sw/rxe/rxe_pool.c
+++ b/drivers/infiniband/sw/rxe/rxe_pool.c
@@ -337,6 +337,7 @@ void *rxe_alloc_locked(struct rxe_pool *pool)
 	elem = (struct rxe_pool_entry *)(obj + pool->elem_offset);
 
 	elem->pool = pool;
+	elem->obj = obj;
 	kref_init(&elem->ref_cnt);
 
 	return obj;
@@ -349,7 +350,7 @@ void *rxe_alloc_locked(struct rxe_pool *pool)
 void *rxe_alloc(struct rxe_pool *pool)
 {
 	unsigned long flags;
-	u8 *obj;
+	void *obj;
 
 	write_lock_irqsave(&pool->pool_lock, flags);
 	obj = rxe_alloc_locked(pool);
@@ -364,6 +365,7 @@ int __rxe_add_to_pool(struct rxe_pool *pool, struct rxe_pool_entry *elem)
 		goto out_cnt;
 
 	elem->pool = pool;
+	elem->obj = (u8 *)elem - pool->elem_offset;
 	kref_init(&elem->ref_cnt);
 
 	return 0;
@@ -378,13 +380,13 @@ void rxe_elem_release(struct kref *kref)
 	struct rxe_pool_entry *elem =
 		container_of(kref, struct rxe_pool_entry, ref_cnt);
 	struct rxe_pool *pool = elem->pool;
-	u8 *obj;
+	void *obj;
 
 	if (pool->cleanup)
 		pool->cleanup(elem);
 
 	if (!(pool->flags & RXE_POOL_NO_ALLOC)) {
-		obj = (u8 *)elem - pool->elem_offset;
+		obj = elem->obj;
 		kfree(obj);
 	}
 
@@ -395,7 +397,7 @@ void *rxe_pool_get_index_locked(struct rxe_pool *pool, u32 index)
 {
 	struct rb_node *node;
 	struct rxe_pool_entry *elem;
-	u8 *obj;
+	void *obj;
 
 	node = pool->index.tree.rb_node;
 
@@ -412,7 +414,7 @@ void *rxe_pool_get_index_locked(struct rxe_pool *pool, u32 index)
 
 	if (node) {
 		kref_get(&elem->ref_cnt);
-		obj = (u8 *)elem - pool->elem_offset;
+		obj = elem->obj;
 	} else {
 		obj = NULL;
 	}
@@ -436,7 +438,7 @@ void *rxe_pool_get_key_locked(struct rxe_pool *pool, void *key)
 {
 	struct rb_node *node;
 	struct rxe_pool_entry *elem;
-	u8 *obj;
+	void *obj;
 	int cmp;
 
 	node = pool->key.tree.rb_node;
@@ -457,7 +459,7 @@ void *rxe_pool_get_key_locked(struct rxe_pool *pool, void *key)
 
 	if (node) {
 		kref_get(&elem->ref_cnt);
-		obj = (u8 *)elem - pool->elem_offset;
+		obj = elem->obj;
 	} else {
 		obj = NULL;
 	}
diff --git a/drivers/infiniband/sw/rxe/rxe_pool.h b/drivers/infiniband/sw/rxe/rxe_pool.h
index cd962dc5de9d..570bda77f4a6 100644
--- a/drivers/infiniband/sw/rxe/rxe_pool.h
+++ b/drivers/infiniband/sw/rxe/rxe_pool.h
@@ -46,6 +46,7 @@ struct rxe_type_info {
 
 struct rxe_pool_entry {
 	struct rxe_pool		*pool;
+	void			*obj;
 	struct kref		ref_cnt;
 	struct list_head	list;
 
-- 
2.30.2



* [PATCH for-next 4/6] RDMA/rxe: Combine rxe_add_index with rxe_alloc
  2021-10-10 23:59 [PATCH for-next 0/6] RDMA/rxe: Fix potential races Bob Pearson
                   ` (2 preceding siblings ...)
  2021-10-10 23:59 ` [PATCH for-next 3/6] RDMA/rxe: Save object pointer in pool element Bob Pearson
@ 2021-10-10 23:59 ` Bob Pearson
  2021-10-10 23:59 ` [PATCH for-next 5/6] RDMA/rxe: Combine rxe_add_key " Bob Pearson
                   ` (2 subsequent siblings)
  6 siblings, 0 replies; 21+ messages in thread
From: Bob Pearson @ 2021-10-10 23:59 UTC (permalink / raw)
  To: jgg, zyjzyj2000, linux-rdma; +Cc: Bob Pearson

Currently rxe objects which have an index require adding and dropping
the index in separate API calls from allocating and freeing the
object. These operations are always performed together, so this patch
combines them into a single operation.

By taking a single pool lock around allocating the object and adding
the index metadata, or dropping the index metadata and releasing the
object, the possibility of a race condition in which the metadata is
not consistent with the state of the object is removed.
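
In outline, the effect at a call site (a sketch based on the rxe_verbs.c
hunks below; illustrative only, not part of the patch text):

	/* before: allocation and indexing are separate steps, leaving a
	 * window where the object exists without its index entry (or the
	 * index entry outlives the object)
	 */
	mr = rxe_alloc(&rxe->mr_pool);
	rxe_add_index(mr);
	...
	rxe_drop_index(mr);
	rxe_drop_ref(mr);

	/* after: the index is assigned inside rxe_alloc() and dropped in
	 * rxe_elem_release(), both under the pool lock
	 */
	mr = rxe_alloc(&rxe->mr_pool);
	...
	rxe_drop_ref(mr);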

Signed-off-by: Bob Pearson <rpearsonhpe@gmail.com>
---
 drivers/infiniband/sw/rxe/rxe_mr.c    |  1 -
 drivers/infiniband/sw/rxe/rxe_mw.c    |  5 +--
 drivers/infiniband/sw/rxe/rxe_pool.c  | 59 +++++++++++++++------------
 drivers/infiniband/sw/rxe/rxe_pool.h  | 22 ----------
 drivers/infiniband/sw/rxe/rxe_verbs.c | 10 -----
 5 files changed, 33 insertions(+), 64 deletions(-)

diff --git a/drivers/infiniband/sw/rxe/rxe_mr.c b/drivers/infiniband/sw/rxe/rxe_mr.c
index 53271df10e47..6e71f67ccfe9 100644
--- a/drivers/infiniband/sw/rxe/rxe_mr.c
+++ b/drivers/infiniband/sw/rxe/rxe_mr.c
@@ -693,7 +693,6 @@ int rxe_dereg_mr(struct ib_mr *ibmr, struct ib_udata *udata)
 
 	mr->state = RXE_MR_STATE_INVALID;
 	rxe_drop_ref(mr_pd(mr));
-	rxe_drop_index(mr);
 	rxe_drop_ref(mr);
 
 	return 0;
diff --git a/drivers/infiniband/sw/rxe/rxe_mw.c b/drivers/infiniband/sw/rxe/rxe_mw.c
index 9534a7fe1a98..854d0c283521 100644
--- a/drivers/infiniband/sw/rxe/rxe_mw.c
+++ b/drivers/infiniband/sw/rxe/rxe_mw.c
@@ -20,7 +20,6 @@ int rxe_alloc_mw(struct ib_mw *ibmw, struct ib_udata *udata)
 		return ret;
 	}
 
-	rxe_add_index(mw);
 	mw->rkey = ibmw->rkey = (mw->pelem.index << 8) | rxe_get_next_key(-1);
 	mw->state = (mw->ibmw.type == IB_MW_TYPE_2) ?
 			RXE_MW_STATE_FREE : RXE_MW_STATE_VALID;
@@ -335,7 +334,5 @@ struct rxe_mw *rxe_lookup_mw(struct rxe_qp *qp, int access, u32 rkey)
 
 void rxe_mw_cleanup(struct rxe_pool_entry *elem)
 {
-	struct rxe_mw *mw = container_of(elem, typeof(*mw), pelem);
-
-	rxe_drop_index(mw);
+	/* nothing to do currently */
 }
diff --git a/drivers/infiniband/sw/rxe/rxe_pool.c b/drivers/infiniband/sw/rxe/rxe_pool.c
index 4caaecdb0d68..d55a40291692 100644
--- a/drivers/infiniband/sw/rxe/rxe_pool.c
+++ b/drivers/infiniband/sw/rxe/rxe_pool.c
@@ -166,12 +166,16 @@ void rxe_pool_cleanup(struct rxe_pool *pool)
 	kfree(pool->index.table);
 }
 
+/* should never fail because there are at least as many indices as
+ * max objects
+ */
 static u32 alloc_index(struct rxe_pool *pool)
 {
 	u32 index;
 	u32 range = pool->index.max_index - pool->index.min_index + 1;
 
-	index = find_next_zero_bit(pool->index.table, range, pool->index.last);
+	index = find_next_zero_bit(pool->index.table, range,
+				   pool->index.last);
 	if (index >= range)
 		index = find_first_zero_bit(pool->index.table, range);
 
@@ -192,7 +196,8 @@ static int rxe_insert_index(struct rxe_pool *pool, struct rxe_pool_entry *new)
 		elem = rb_entry(parent, struct rxe_pool_entry, index_node);
 
 		if (elem->index == new->index) {
-			pr_warn("element already exists!\n");
+			pr_warn("element with index = 0x%x already exists!\n",
+					new->index);
 			return -EINVAL;
 		}
 
@@ -280,31 +285,21 @@ void __rxe_drop_key(struct rxe_pool_entry *elem)
 	write_unlock_irqrestore(&pool->pool_lock, flags);
 }
 
-int __rxe_add_index_locked(struct rxe_pool_entry *elem)
+static int rxe_add_index(struct rxe_pool_entry *elem)
 {
 	struct rxe_pool *pool = elem->pool;
 	int err;
 
 	elem->index = alloc_index(pool);
 	err = rxe_insert_index(pool, elem);
+	if (err)
+		clear_bit(elem->index - pool->index.min_index,
+			  pool->index.table);
 
 	return err;
 }
 
-int __rxe_add_index(struct rxe_pool_entry *elem)
-{
-	struct rxe_pool *pool = elem->pool;
-	unsigned long flags;
-	int err;
-
-	write_lock_irqsave(&pool->pool_lock, flags);
-	err = __rxe_add_index_locked(elem);
-	write_unlock_irqrestore(&pool->pool_lock, flags);
-
-	return err;
-}
-
-void __rxe_drop_index_locked(struct rxe_pool_entry *elem)
+static void rxe_drop_index(struct rxe_pool_entry *elem)
 {
 	struct rxe_pool *pool = elem->pool;
 
@@ -312,20 +307,11 @@ void __rxe_drop_index_locked(struct rxe_pool_entry *elem)
 	rb_erase(&elem->index_node, &pool->index.tree);
 }
 
-void __rxe_drop_index(struct rxe_pool_entry *elem)
-{
-	struct rxe_pool *pool = elem->pool;
-	unsigned long flags;
-
-	write_lock_irqsave(&pool->pool_lock, flags);
-	__rxe_drop_index_locked(elem);
-	write_unlock_irqrestore(&pool->pool_lock, flags);
-}
-
 void *rxe_alloc_locked(struct rxe_pool *pool)
 {
 	struct rxe_pool_entry *elem;
 	u8 *obj;
+	int err;
 
 	if (atomic_inc_return(&pool->num_elem) > pool->max_elem)
 		goto out_cnt;
@@ -340,6 +326,14 @@ void *rxe_alloc_locked(struct rxe_pool *pool)
 	elem->obj = obj;
 	kref_init(&elem->ref_cnt);
 
+	if (pool->flags & RXE_POOL_INDEX) {
+		err = rxe_add_index(elem);
+		if (err) {
+			kfree(obj);
+			goto out_cnt;
+		}
+	}
+
 	return obj;
 
 out_cnt:
@@ -361,6 +355,8 @@ void *rxe_alloc(struct rxe_pool *pool)
 
 int __rxe_add_to_pool(struct rxe_pool *pool, struct rxe_pool_entry *elem)
 {
+	int err;
+
 	if (atomic_inc_return(&pool->num_elem) > pool->max_elem)
 		goto out_cnt;
 
@@ -368,6 +364,12 @@ int __rxe_add_to_pool(struct rxe_pool *pool, struct rxe_pool_entry *elem)
 	elem->obj = (u8 *)elem - pool->elem_offset;
 	kref_init(&elem->ref_cnt);
 
+	if (pool->flags & RXE_POOL_INDEX) {
+		err = rxe_add_index(elem);
+		if (err)
+			goto out_cnt;
+	}
+
 	return 0;
 
 out_cnt:
@@ -385,6 +387,9 @@ void rxe_elem_release(struct kref *kref)
 	if (pool->cleanup)
 		pool->cleanup(elem);
 
+	if (pool->flags & RXE_POOL_INDEX)
+		rxe_drop_index(elem);
+
 	if (!(pool->flags & RXE_POOL_NO_ALLOC)) {
 		obj = elem->obj;
 		kfree(obj);
diff --git a/drivers/infiniband/sw/rxe/rxe_pool.h b/drivers/infiniband/sw/rxe/rxe_pool.h
index 570bda77f4a6..41eaf47a64a3 100644
--- a/drivers/infiniband/sw/rxe/rxe_pool.h
+++ b/drivers/infiniband/sw/rxe/rxe_pool.h
@@ -110,28 +110,6 @@ int __rxe_add_to_pool(struct rxe_pool *pool, struct rxe_pool_entry *elem);
 
 #define rxe_add_to_pool(pool, obj) __rxe_add_to_pool(pool, &(obj)->pelem)
 
-/* assign an index to an indexed object and insert object into
- *  pool's rb tree holding and not holding the pool_lock
- */
-int __rxe_add_index_locked(struct rxe_pool_entry *elem);
-
-#define rxe_add_index_locked(obj) __rxe_add_index_locked(&(obj)->pelem)
-
-int __rxe_add_index(struct rxe_pool_entry *elem);
-
-#define rxe_add_index(obj) __rxe_add_index(&(obj)->pelem)
-
-/* drop an index and remove object from rb tree
- * holding and not holding the pool_lock
- */
-void __rxe_drop_index_locked(struct rxe_pool_entry *elem);
-
-#define rxe_drop_index_locked(obj) __rxe_drop_index_locked(&(obj)->pelem)
-
-void __rxe_drop_index(struct rxe_pool_entry *elem);
-
-#define rxe_drop_index(obj) __rxe_drop_index(&(obj)->pelem)
-
 /* assign a key to a keyed object and insert object into
  * pool's rb tree holding and not holding pool_lock
  */
diff --git a/drivers/infiniband/sw/rxe/rxe_verbs.c b/drivers/infiniband/sw/rxe/rxe_verbs.c
index c49ba0381964..bc40200436f0 100644
--- a/drivers/infiniband/sw/rxe/rxe_verbs.c
+++ b/drivers/infiniband/sw/rxe/rxe_verbs.c
@@ -409,7 +409,6 @@ static int rxe_create_qp(struct ib_qp *ibqp, struct ib_qp_init_attr *init,
 	if (err)
 		return err;
 
-	rxe_add_index(qp);
 	err = rxe_qp_from_init(rxe, qp, pd, init, uresp, ibqp->pd, udata);
 	if (err)
 		goto qp_init;
@@ -417,7 +416,6 @@ static int rxe_create_qp(struct ib_qp *ibqp, struct ib_qp_init_attr *init,
 	return 0;
 
 qp_init:
-	rxe_drop_index(qp);
 	rxe_drop_ref(qp);
 	return err;
 }
@@ -462,7 +460,6 @@ static int rxe_destroy_qp(struct ib_qp *ibqp, struct ib_udata *udata)
 	struct rxe_qp *qp = to_rqp(ibqp);
 
 	rxe_qp_destroy(qp);
-	rxe_drop_index(qp);
 	rxe_drop_ref(qp);
 	return 0;
 }
@@ -871,7 +868,6 @@ static struct ib_mr *rxe_get_dma_mr(struct ib_pd *ibpd, int access)
 	if (!mr)
 		return ERR_PTR(-ENOMEM);
 
-	rxe_add_index(mr);
 	rxe_add_ref(pd);
 	rxe_mr_init_dma(pd, access, mr);
 
@@ -895,8 +891,6 @@ static struct ib_mr *rxe_reg_user_mr(struct ib_pd *ibpd,
 		goto err2;
 	}
 
-	rxe_add_index(mr);
-
 	rxe_add_ref(pd);
 
 	err = rxe_mr_init_user(pd, start, length, iova, access, mr);
@@ -907,7 +901,6 @@ static struct ib_mr *rxe_reg_user_mr(struct ib_pd *ibpd,
 
 err3:
 	rxe_drop_ref(pd);
-	rxe_drop_index(mr);
 	rxe_drop_ref(mr);
 err2:
 	return ERR_PTR(err);
@@ -930,8 +923,6 @@ static struct ib_mr *rxe_alloc_mr(struct ib_pd *ibpd, enum ib_mr_type mr_type,
 		goto err1;
 	}
 
-	rxe_add_index(mr);
-
 	rxe_add_ref(pd);
 
 	err = rxe_mr_init_fast(pd, max_num_sg, mr);
@@ -942,7 +933,6 @@ static struct ib_mr *rxe_alloc_mr(struct ib_pd *ibpd, enum ib_mr_type mr_type,
 
 err2:
 	rxe_drop_ref(pd);
-	rxe_drop_index(mr);
 	rxe_drop_ref(mr);
 err1:
 	return ERR_PTR(err);
-- 
2.30.2



* [PATCH for-next 5/6] RDMA/rxe: Combine rxe_add_key with rxe_alloc
  2021-10-10 23:59 [PATCH for-next 0/6] RDMA/rxe: Fix potential races Bob Pearson
                   ` (3 preceding siblings ...)
  2021-10-10 23:59 ` [PATCH for-next 4/6] RDMA/rxe: Combine rxe_add_index with rxe_alloc Bob Pearson
@ 2021-10-10 23:59 ` Bob Pearson
  2021-10-10 23:59 ` [PATCH for-next 6/6] RDMA/rxe: Fix potential race condition in rxe_pool Bob Pearson
  2021-10-12  6:34 ` [PATCH for-next 0/6] RDMA/rxe: Fix potential races Leon Romanovsky
  6 siblings, 0 replies; 21+ messages in thread
From: Bob Pearson @ 2021-10-10 23:59 UTC (permalink / raw)
  To: jgg, zyjzyj2000, linux-rdma; +Cc: Bob Pearson

Currently adding a key to and dropping a key from an rxe object
require API calls separate from allocating and freeing the object,
but these operations are always performed together. This patch
combines them into single APIs. This requires adding the new
rxe_alloc_with_key(_locked) APIs.

By combining allocating an object and adding its key metadata inside
a single locked sequence, and likewise dropping the key metadata when
releasing the object, the possibility of a race condition in which the
object state and key metadata state are inconsistent is removed.

Signed-off-by: Bob Pearson <rpearsonhpe@gmail.com>
---
 drivers/infiniband/sw/rxe/rxe_mcast.c |  5 +-
 drivers/infiniband/sw/rxe/rxe_pool.c  | 81 +++++++++++++--------------
 drivers/infiniband/sw/rxe/rxe_pool.h  | 24 ++------
 3 files changed, 45 insertions(+), 65 deletions(-)

diff --git a/drivers/infiniband/sw/rxe/rxe_mcast.c b/drivers/infiniband/sw/rxe/rxe_mcast.c
index 1c1d1b53312d..337dc2c68051 100644
--- a/drivers/infiniband/sw/rxe/rxe_mcast.c
+++ b/drivers/infiniband/sw/rxe/rxe_mcast.c
@@ -15,18 +15,16 @@ static struct rxe_mc_grp *create_grp(struct rxe_dev *rxe,
 	int err;
 	struct rxe_mc_grp *grp;
 
-	grp = rxe_alloc_locked(&rxe->mc_grp_pool);
+	grp = rxe_alloc_with_key_locked(&rxe->mc_grp_pool, mgid);
 	if (!grp)
 		return ERR_PTR(-ENOMEM);
 
 	INIT_LIST_HEAD(&grp->qp_list);
 	spin_lock_init(&grp->mcg_lock);
 	grp->rxe = rxe;
-	rxe_add_key_locked(grp, mgid);
 
 	err = rxe_mcast_add(rxe, mgid);
 	if (unlikely(err)) {
-		rxe_drop_key_locked(grp);
 		rxe_drop_ref(grp);
 		return ERR_PTR(err);
 	}
@@ -174,6 +172,5 @@ void rxe_mc_cleanup(struct rxe_pool_entry *arg)
 	struct rxe_mc_grp *grp = container_of(arg, typeof(*grp), pelem);
 	struct rxe_dev *rxe = grp->rxe;
 
-	rxe_drop_key(grp);
 	rxe_mcast_delete(rxe, &grp->mgid);
 }
diff --git a/drivers/infiniband/sw/rxe/rxe_pool.c b/drivers/infiniband/sw/rxe/rxe_pool.c
index d55a40291692..70f407108b92 100644
--- a/drivers/infiniband/sw/rxe/rxe_pool.c
+++ b/drivers/infiniband/sw/rxe/rxe_pool.c
@@ -244,47 +244,6 @@ static int rxe_insert_key(struct rxe_pool *pool, struct rxe_pool_entry *new)
 	return 0;
 }
 
-int __rxe_add_key_locked(struct rxe_pool_entry *elem, void *key)
-{
-	struct rxe_pool *pool = elem->pool;
-	int err;
-
-	memcpy((u8 *)elem + pool->key.key_offset, key, pool->key.key_size);
-	err = rxe_insert_key(pool, elem);
-
-	return err;
-}
-
-int __rxe_add_key(struct rxe_pool_entry *elem, void *key)
-{
-	struct rxe_pool *pool = elem->pool;
-	unsigned long flags;
-	int err;
-
-	write_lock_irqsave(&pool->pool_lock, flags);
-	err = __rxe_add_key_locked(elem, key);
-	write_unlock_irqrestore(&pool->pool_lock, flags);
-
-	return err;
-}
-
-void __rxe_drop_key_locked(struct rxe_pool_entry *elem)
-{
-	struct rxe_pool *pool = elem->pool;
-
-	rb_erase(&elem->key_node, &pool->key.tree);
-}
-
-void __rxe_drop_key(struct rxe_pool_entry *elem)
-{
-	struct rxe_pool *pool = elem->pool;
-	unsigned long flags;
-
-	write_lock_irqsave(&pool->pool_lock, flags);
-	__rxe_drop_key_locked(elem);
-	write_unlock_irqrestore(&pool->pool_lock, flags);
-}
-
 static int rxe_add_index(struct rxe_pool_entry *elem)
 {
 	struct rxe_pool *pool = elem->pool;
@@ -341,6 +300,31 @@ void *rxe_alloc_locked(struct rxe_pool *pool)
 	return NULL;
 }
 
+void *rxe_alloc_with_key_locked(struct rxe_pool *pool, void *key)
+{
+	struct rxe_pool_entry *elem;
+	u8 *obj;
+	int err;
+
+	obj = rxe_alloc_locked(pool);
+	if (!obj)
+		return NULL;
+
+	elem = (struct rxe_pool_entry *)(obj + pool->elem_offset);
+	memcpy((u8 *)elem + pool->key.key_offset, key, pool->key.key_size);
+	err = rxe_insert_key(pool, elem);
+	if (err) {
+		kfree(obj);
+		goto out_cnt;
+	}
+
+	return obj;
+
+out_cnt:
+	atomic_dec(&pool->num_elem);
+	return NULL;
+}
+
 void *rxe_alloc(struct rxe_pool *pool)
 {
 	unsigned long flags;
@@ -353,6 +337,18 @@ void *rxe_alloc(struct rxe_pool *pool)
 	return obj;
 }
 
+void *rxe_alloc_with_key(struct rxe_pool *pool, void *key)
+{
+	unsigned long flags;
+	void *obj;
+
+	write_lock_irqsave(&pool->pool_lock, flags);
+	obj = rxe_alloc_with_key_locked(pool, key);
+	write_unlock_irqrestore(&pool->pool_lock, flags);
+
+	return obj;
+}
+
 int __rxe_add_to_pool(struct rxe_pool *pool, struct rxe_pool_entry *elem)
 {
 	int err;
@@ -390,6 +386,9 @@ void rxe_elem_release(struct kref *kref)
 	if (pool->flags & RXE_POOL_INDEX)
 		rxe_drop_index(elem);
 
+	if (pool->flags & RXE_POOL_KEY)
+		rb_erase(&elem->key_node, &pool->key.tree);
+
 	if (!(pool->flags & RXE_POOL_NO_ALLOC)) {
 		obj = elem->obj;
 		kfree(obj);
diff --git a/drivers/infiniband/sw/rxe/rxe_pool.h b/drivers/infiniband/sw/rxe/rxe_pool.h
index 41eaf47a64a3..ad287c4ddc1a 100644
--- a/drivers/infiniband/sw/rxe/rxe_pool.h
+++ b/drivers/infiniband/sw/rxe/rxe_pool.h
@@ -105,31 +105,15 @@ void *rxe_alloc_locked(struct rxe_pool *pool);
 
 void *rxe_alloc(struct rxe_pool *pool);
 
+void *rxe_alloc_with_key_locked(struct rxe_pool *pool, void *key);
+
+void *rxe_alloc_with_key(struct rxe_pool *pool, void *key);
+
 /* connect already allocated object to pool */
 int __rxe_add_to_pool(struct rxe_pool *pool, struct rxe_pool_entry *elem);
 
 #define rxe_add_to_pool(pool, obj) __rxe_add_to_pool(pool, &(obj)->pelem)
 
-/* assign a key to a keyed object and insert object into
- * pool's rb tree holding and not holding pool_lock
- */
-int __rxe_add_key_locked(struct rxe_pool_entry *elem, void *key);
-
-#define rxe_add_key_locked(obj, key) __rxe_add_key_locked(&(obj)->pelem, key)
-
-int __rxe_add_key(struct rxe_pool_entry *elem, void *key);
-
-#define rxe_add_key(obj, key) __rxe_add_key(&(obj)->pelem, key)
-
-/* remove elem from rb tree holding and not holding the pool_lock */
-void __rxe_drop_key_locked(struct rxe_pool_entry *elem);
-
-#define rxe_drop_key_locked(obj) __rxe_drop_key_locked(&(obj)->pelem)
-
-void __rxe_drop_key(struct rxe_pool_entry *elem);
-
-#define rxe_drop_key(obj) __rxe_drop_key(&(obj)->pelem)
-
 /* lookup an indexed object from index holding and not holding the pool_lock.
  * takes a reference on object
  */
-- 
2.30.2



* [PATCH for-next 6/6] RDMA/rxe: Fix potential race condition in rxe_pool
  2021-10-10 23:59 [PATCH for-next 0/6] RDMA/rxe: Fix potential races Bob Pearson
                   ` (4 preceding siblings ...)
  2021-10-10 23:59 ` [PATCH for-next 5/6] RDMA/rxe: Combine rxe_add_key " Bob Pearson
@ 2021-10-10 23:59 ` Bob Pearson
  2021-10-20 23:23   ` Jason Gunthorpe
  2021-10-12  6:34 ` [PATCH for-next 0/6] RDMA/rxe: Fix potential races Leon Romanovsky
  6 siblings, 1 reply; 21+ messages in thread
From: Bob Pearson @ 2021-10-10 23:59 UTC (permalink / raw)
  To: jgg, zyjzyj2000, linux-rdma; +Cc: Bob Pearson

Currently there is a possible race condition related to rxe indexed
or keyed objects, where one thread, holding the last reference to an
object, drops that reference, triggering a call to rxe_elem_release(),
while at the same time another thread looks up the object from its
index or key by calling rxe_pool_get_index(/_key). This can happen if
an unexpected packet arrives as a result of a retry attempt and looks
up its rkey, or if a multicast packet arrives just as the verbs
consumer drops the mcast group.

Add locking to prevent looking up an object from its index or key
while another thread is trying to destroy the object.
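
In outline, the lookup-side fix (a minimal sketch; the demo_* names are
invented, the pattern matches the diff below):

	#include <linux/kref.h>

	struct demo_obj {
		struct kref ref;
	};

	/* unsafe under this race: kref_get() will happily "resurrect" an
	 * object whose refcount has already reached zero and is about to
	 * be freed by the release side
	 */
	static struct demo_obj *demo_get_unsafe(struct demo_obj *obj)
	{
		kref_get(&obj->ref);
		return obj;
	}

	/* safe: kref_get_unless_zero() refuses the reference once the
	 * count has hit zero, so a lookup racing with rxe_elem_release()
	 * returns NULL instead of a dying object
	 */
	static struct demo_obj *demo_get_safe(struct demo_obj *obj)
	{
		return kref_get_unless_zero(&obj->ref) ? obj : NULL;
	}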

Signed-off-by: Bob Pearson <rpearsonhpe@gmail.com>
---
 drivers/infiniband/sw/rxe/rxe_pool.c | 53 +++++++++++++++++-----------
 drivers/infiniband/sw/rxe/rxe_pool.h | 15 ++++++--
 2 files changed, 46 insertions(+), 22 deletions(-)

diff --git a/drivers/infiniband/sw/rxe/rxe_pool.c b/drivers/infiniband/sw/rxe/rxe_pool.c
index 70f407108b92..c6a583894956 100644
--- a/drivers/infiniband/sw/rxe/rxe_pool.c
+++ b/drivers/infiniband/sw/rxe/rxe_pool.c
@@ -266,10 +266,10 @@ static void rxe_drop_index(struct rxe_pool_entry *elem)
 	rb_erase(&elem->index_node, &pool->index.tree);
 }
 
-void *rxe_alloc_locked(struct rxe_pool *pool)
+static void *__rxe_alloc_locked(struct rxe_pool *pool)
 {
 	struct rxe_pool_entry *elem;
-	u8 *obj;
+	void *obj;
 	int err;
 
 	if (atomic_inc_return(&pool->num_elem) > pool->max_elem)
@@ -279,11 +279,10 @@ void *rxe_alloc_locked(struct rxe_pool *pool)
 	if (!obj)
 		goto out_cnt;
 
-	elem = (struct rxe_pool_entry *)(obj + pool->elem_offset);
+	elem = (struct rxe_pool_entry *)((u8 *)obj + pool->elem_offset);
 
 	elem->pool = pool;
 	elem->obj = obj;
-	kref_init(&elem->ref_cnt);
 
 	if (pool->flags & RXE_POOL_INDEX) {
 		err = rxe_add_index(elem);
@@ -300,17 +299,32 @@ void *rxe_alloc_locked(struct rxe_pool *pool)
 	return NULL;
 }
 
+void *rxe_alloc_locked(struct rxe_pool *pool)
+{
+	struct rxe_pool_entry *elem;
+	void *obj;
+
+	obj = __rxe_alloc_locked(pool);
+	if (!obj)
+		return NULL;
+
+	elem = (struct rxe_pool_entry *)(obj + pool->elem_offset);
+	kref_init(&elem->ref_cnt);
+
+	return obj;
+}
+
 void *rxe_alloc_with_key_locked(struct rxe_pool *pool, void *key)
 {
 	struct rxe_pool_entry *elem;
-	u8 *obj;
+	void *obj;
 	int err;
 
-	obj = rxe_alloc_locked(pool);
+	obj = __rxe_alloc_locked(pool);
 	if (!obj)
 		return NULL;
 
-	elem = (struct rxe_pool_entry *)(obj + pool->elem_offset);
+	elem = (struct rxe_pool_entry *)((u8 *)obj + pool->elem_offset);
 	memcpy((u8 *)elem + pool->key.key_offset, key, pool->key.key_size);
 	err = rxe_insert_key(pool, elem);
 	if (err) {
@@ -318,6 +332,8 @@ void *rxe_alloc_with_key_locked(struct rxe_pool *pool, void *key)
 		goto out_cnt;
 	}
 
+	kref_init(&elem->ref_cnt);
+
 	return obj;
 
 out_cnt:
@@ -351,14 +367,15 @@ void *rxe_alloc_with_key(struct rxe_pool *pool, void *key)
 
 int __rxe_add_to_pool(struct rxe_pool *pool, struct rxe_pool_entry *elem)
 {
+	unsigned long flags;
 	int err;
 
+	write_lock_irqsave(&pool->pool_lock, flags);
 	if (atomic_inc_return(&pool->num_elem) > pool->max_elem)
 		goto out_cnt;
 
 	elem->pool = pool;
 	elem->obj = (u8 *)elem - pool->elem_offset;
-	kref_init(&elem->ref_cnt);
 
 	if (pool->flags & RXE_POOL_INDEX) {
 		err = rxe_add_index(elem);
@@ -366,10 +383,14 @@ int __rxe_add_to_pool(struct rxe_pool *pool, struct rxe_pool_entry *elem)
 			goto out_cnt;
 	}
 
+	kref_init(&elem->ref_cnt);
+	write_unlock_irqrestore(&pool->pool_lock, flags);
+
 	return 0;
 
 out_cnt:
 	atomic_dec(&pool->num_elem);
+	write_unlock_irqrestore(&pool->pool_lock, flags);
 	return -EINVAL;
 }
 
@@ -401,7 +422,7 @@ void *rxe_pool_get_index_locked(struct rxe_pool *pool, u32 index)
 {
 	struct rb_node *node;
 	struct rxe_pool_entry *elem;
-	void *obj;
+	void *obj = NULL;
 
 	node = pool->index.tree.rb_node;
 
@@ -416,12 +437,8 @@ void *rxe_pool_get_index_locked(struct rxe_pool *pool, u32 index)
 			break;
 	}
 
-	if (node) {
-		kref_get(&elem->ref_cnt);
+	if (node && kref_get_unless_zero(&elem->ref_cnt))
 		obj = elem->obj;
-	} else {
-		obj = NULL;
-	}
 
 	return obj;
 }
@@ -442,7 +459,7 @@ void *rxe_pool_get_key_locked(struct rxe_pool *pool, void *key)
 {
 	struct rb_node *node;
 	struct rxe_pool_entry *elem;
-	void *obj;
+	void *obj = NULL;
 	int cmp;
 
 	node = pool->key.tree.rb_node;
@@ -461,12 +478,8 @@ void *rxe_pool_get_key_locked(struct rxe_pool *pool, void *key)
 			break;
 	}
 
-	if (node) {
-		kref_get(&elem->ref_cnt);
+	if (node && kref_get_unless_zero(&elem->ref_cnt))
 		obj = elem->obj;
-	} else {
-		obj = NULL;
-	}
 
 	return obj;
 }
diff --git a/drivers/infiniband/sw/rxe/rxe_pool.h b/drivers/infiniband/sw/rxe/rxe_pool.h
index ad287c4ddc1a..43dac03ad82e 100644
--- a/drivers/infiniband/sw/rxe/rxe_pool.h
+++ b/drivers/infiniband/sw/rxe/rxe_pool.h
@@ -132,9 +132,20 @@ void *rxe_pool_get_key(struct rxe_pool *pool, void *key);
 void rxe_elem_release(struct kref *kref);
 
 /* take a reference on an object */
-#define rxe_add_ref(elem) kref_get(&(elem)->pelem.ref_cnt)
+static inline int __rxe_add_ref(struct rxe_pool_entry *elem)
+{
+	int ret = kref_get_unless_zero(&elem->ref_cnt);
+
+	if (unlikely(!ret))
+		pr_warn("Taking a reference on a %s object that is already destroyed\n",
+			elem->pool->name);
+
+	return (ret) ? 0 : -EINVAL;
+}
+
+#define rxe_add_ref(obj) __rxe_add_ref(&(obj)->pelem)
 
 /* drop a reference on an object */
-#define rxe_drop_ref(elem) kref_put(&(elem)->pelem.ref_cnt, rxe_elem_release)
+#define rxe_drop_ref(obj) kref_put(&(obj)->pelem.ref_cnt, rxe_elem_release)
 
 #endif /* RXE_POOL_H */
-- 
2.30.2



* Re: [PATCH for-next 0/6] RDMA/rxe: Fix potential races
  2021-10-10 23:59 [PATCH for-next 0/6] RDMA/rxe: Fix potential races Bob Pearson
                   ` (5 preceding siblings ...)
  2021-10-10 23:59 ` [PATCH for-next 6/6] RDMA/rxe: Fix potential race condition in rxe_pool Bob Pearson
@ 2021-10-12  6:34 ` Leon Romanovsky
  2021-10-12 20:19   ` Bob Pearson
  6 siblings, 1 reply; 21+ messages in thread
From: Leon Romanovsky @ 2021-10-12  6:34 UTC (permalink / raw)
  To: Bob Pearson; +Cc: jgg, zyjzyj2000, linux-rdma

On Sun, Oct 10, 2021 at 06:59:25PM -0500, Bob Pearson wrote:
> There are possible race conditions related to attempting to access
> rxe pool objects at the same time as the pools or elements are being
> freed. This series of patches addresses these races.

Can we get rid of this pool?

Thanks

> 
> Bob Pearson (6):
>   RDMA/rxe: Make rxe_alloc() take pool lock
>   RDMA/rxe: Copy setup parameters into rxe_pool
>   RDMA/rxe: Save object pointer in pool element
>   RDMA/rxe: Combine rxe_add_index with rxe_alloc
>   RDMA/rxe: Combine rxe_add_key with rxe_alloc
>   RDMA/rxe: Fix potential race condition in rxe_pool
> 
>  drivers/infiniband/sw/rxe/rxe_mcast.c |   5 +-
>  drivers/infiniband/sw/rxe/rxe_mr.c    |   1 -
>  drivers/infiniband/sw/rxe/rxe_mw.c    |   5 +-
>  drivers/infiniband/sw/rxe/rxe_pool.c  | 235 +++++++++++++-------------
>  drivers/infiniband/sw/rxe/rxe_pool.h  |  67 +++-----
>  drivers/infiniband/sw/rxe/rxe_verbs.c |  10 --
>  6 files changed, 140 insertions(+), 183 deletions(-)
> 
> -- 
> 2.30.2
> 


* Re: [PATCH for-next 0/6] RDMA/rxe: Fix potential races
  2021-10-12  6:34 ` [PATCH for-next 0/6] RDMA/rxe: Fix potential races Leon Romanovsky
@ 2021-10-12 20:19   ` Bob Pearson
  2021-10-19 13:07     ` Leon Romanovsky
  0 siblings, 1 reply; 21+ messages in thread
From: Bob Pearson @ 2021-10-12 20:19 UTC (permalink / raw)
  To: Leon Romanovsky; +Cc: jgg, zyjzyj2000, linux-rdma, Jason Gunthorpe

On 10/12/21 1:34 AM, Leon Romanovsky wrote:
> On Sun, Oct 10, 2021 at 06:59:25PM -0500, Bob Pearson wrote:
>> There are possible race conditions related to attempting to access
>> rxe pool objects at the same time as the pools or elements are being
>> freed. This series of patches addresses these races.
> 
> Can we get rid of this pool?
> 
> Thanks
> 
>>
>> Bob Pearson (6):
>>   RDMA/rxe: Make rxe_alloc() take pool lock
>>   RDMA/rxe: Copy setup parameters into rxe_pool
>>   RDMA/rxe: Save object pointer in pool element
>>   RDMA/rxe: Combine rxe_add_index with rxe_alloc
>>   RDMA/rxe: Combine rxe_add_key with rxe_alloc
>>   RDMA/rxe: Fix potential race condition in rxe_pool
>>
>>  drivers/infiniband/sw/rxe/rxe_mcast.c |   5 +-
>>  drivers/infiniband/sw/rxe/rxe_mr.c    |   1 -
>>  drivers/infiniband/sw/rxe/rxe_mw.c    |   5 +-
>>  drivers/infiniband/sw/rxe/rxe_pool.c  | 235 +++++++++++++-------------
>>  drivers/infiniband/sw/rxe/rxe_pool.h  |  67 +++-----
>>  drivers/infiniband/sw/rxe/rxe_verbs.c |  10 --
>>  6 files changed, 140 insertions(+), 183 deletions(-)
>>
>> -- 
>> 2.30.2
>>

Not sure which 'this' you mean? This set of patches is motivated by someone at HPE
running into infrequent seg faults caused by rdma packets trying to copy data to or
from an MR. This can only happen (other than from a dumb bug, which doesn't seem to
be the case) when a late packet arrives after the MR is de-registered. The root cause
of that is the way rxe currently defers cleaning up objects with krefs, and the
potential races between cleanup and new packets looking up rkeys. I found a lot of
potential race conditions and tried to close them off. There are another couple of
patches coming as well.

This is an attempt to fix up the code the way it is now. Later I would like to use
xarrays to handle rkey indices, qpns, etc., which looks cleaner.

'Pools' is mostly a misnomer since you moved all the allocations into rdma-core except
for a couple. Really they are a way to add indices or keys to the objects that are
already there.

Bob


* Re: [PATCH for-next 0/6] RDMA/rxe: Fix potential races
  2021-10-12 20:19   ` Bob Pearson
@ 2021-10-19 13:07     ` Leon Romanovsky
  2021-10-19 16:35       ` Bob Pearson
  0 siblings, 1 reply; 21+ messages in thread
From: Leon Romanovsky @ 2021-10-19 13:07 UTC (permalink / raw)
  To: Bob Pearson; +Cc: jgg, zyjzyj2000, linux-rdma

On Tue, Oct 12, 2021 at 03:19:46PM -0500, Bob Pearson wrote:
> On 10/12/21 1:34 AM, Leon Romanovsky wrote:
> > On Sun, Oct 10, 2021 at 06:59:25PM -0500, Bob Pearson wrote:
> >> There are possible race conditions related to attempting to access
> >> rxe pool objects at the same time as the pools or elements are being
> >> freed. This series of patches addresses these races.
> > 
> > Can we get rid of this pool?
> > 
> > Thanks
> > 
> >>
> >> Bob Pearson (6):
> >>   RDMA/rxe: Make rxe_alloc() take pool lock
> >>   RDMA/rxe: Copy setup parameters into rxe_pool
> >>   RDMA/rxe: Save object pointer in pool element
> >>   RDMA/rxe: Combine rxe_add_index with rxe_alloc
> >>   RDMA/rxe: Combine rxe_add_key with rxe_alloc
> >>   RDMA/rxe: Fix potential race condition in rxe_pool
> >>
> >>  drivers/infiniband/sw/rxe/rxe_mcast.c |   5 +-
> >>  drivers/infiniband/sw/rxe/rxe_mr.c    |   1 -
> >>  drivers/infiniband/sw/rxe/rxe_mw.c    |   5 +-
> >>  drivers/infiniband/sw/rxe/rxe_pool.c  | 235 +++++++++++++-------------
> >>  drivers/infiniband/sw/rxe/rxe_pool.h  |  67 +++-----
> >>  drivers/infiniband/sw/rxe/rxe_verbs.c |  10 --
> >>  6 files changed, 140 insertions(+), 183 deletions(-)
> >>
> >> -- 
> >> 2.30.2
> >>
> 
> Not sure which 'this' you mean? This set of patches is motivated by someone at HPE
> running into infrequent seg faults caused by rdma packets trying to copy data to or
> from an MR. This can only happen (other than from a dumb bug, which doesn't seem to
> be the case) when a late packet arrives after the MR is de-registered. The root cause
> of that is the way rxe currently defers cleaning up objects with krefs, and the
> potential races between cleanup and new packets looking up rkeys. I found a lot of
> potential race conditions and tried to close them off. There are another couple of
> patches coming as well.

I have no doubts that this series fixes RXE, but my request was more general.
Is there way/path to remove everything declared in rxe_pool.c|h?

Thanks


* Re: [PATCH for-next 0/6] RDMA/rxe: Fix potential races
  2021-10-19 13:07     ` Leon Romanovsky
@ 2021-10-19 16:35       ` Bob Pearson
  2021-10-19 18:43         ` Jason Gunthorpe
  0 siblings, 1 reply; 21+ messages in thread
From: Bob Pearson @ 2021-10-19 16:35 UTC (permalink / raw)
  To: Leon Romanovsky; +Cc: jgg, zyjzyj2000, linux-rdma

On 10/19/21 8:07 AM, Leon Romanovsky wrote:
> On Tue, Oct 12, 2021 at 03:19:46PM -0500, Bob Pearson wrote:
>> On 10/12/21 1:34 AM, Leon Romanovsky wrote:
>>> On Sun, Oct 10, 2021 at 06:59:25PM -0500, Bob Pearson wrote:
>>>> There are possible race conditions related to attempting to access
>>>> rxe pool objects at the same time as the pools or elements are being
>>>> freed. This series of patches addresses these races.
>>>
>>> Can we get rid of this pool?
>>>
>>> Thanks
>>>
>>>>
>>>> Bob Pearson (6):
>>>>   RDMA/rxe: Make rxe_alloc() take pool lock
>>>>   RDMA/rxe: Copy setup parameters into rxe_pool
>>>>   RDMA/rxe: Save object pointer in pool element
>>>>   RDMA/rxe: Combine rxe_add_index with rxe_alloc
>>>>   RDMA/rxe: Combine rxe_add_key with rxe_alloc
>>>>   RDMA/rxe: Fix potential race condition in rxe_pool
>>>>
>>>>  drivers/infiniband/sw/rxe/rxe_mcast.c |   5 +-
>>>>  drivers/infiniband/sw/rxe/rxe_mr.c    |   1 -
>>>>  drivers/infiniband/sw/rxe/rxe_mw.c    |   5 +-
>>>>  drivers/infiniband/sw/rxe/rxe_pool.c  | 235 +++++++++++++-------------
>>>>  drivers/infiniband/sw/rxe/rxe_pool.h  |  67 +++-----
>>>>  drivers/infiniband/sw/rxe/rxe_verbs.c |  10 --
>>>>  6 files changed, 140 insertions(+), 183 deletions(-)
>>>>
>>>> -- 
>>>> 2.30.2
>>>>
>>
>> Not sure which 'this' you mean? This set of patches is motivated by someone at HPE
>> running into infrequent seg faults caused by rdma packets trying to copy data to or
>> from an MR. This can only happen (other than from a dumb bug, which doesn't seem to
>> be the case) when a late packet arrives after the MR is de-registered. The root cause
>> of that is the way rxe currently defers cleaning up objects with krefs, and the
>> potential races between cleanup and new packets looking up rkeys. I found a lot of
>> potential race conditions and tried to close them off. There are another couple of
>> patches coming as well.
> 
> I have no doubts that this series fixes RXE, but my request was more general.
> Is there way/path to remove everything declared in rxe_pool.c|h?
> 
> Thanks
> 

Take a look at the note I copied you on more recently. There is some progress but not
complete elimination of rxe_pool. There is another project suggested by Jason which is
replacing red-black trees with xarrays as an alternative approach to indexing rdma objects.
This would still duplicate the indexing done by rdma-core. A while back I looked at
trying to reuse the rdma-core indexing but no effort was made to make that easy. All
of the APIs are private to rdma-core. These indices are managed by the rxe driver for
use as lkeys/rkeys, qpns, srqns, and more recently address handles. xarrays seem to be
more efficient when the indices are fairly compact. There is a suggestion that IB and RoCE
should attempt to make indices that are visible on the network more sparse. Nothing
will make them secure but they could be a lot more secure than they are currently. I
believe mlx5 is now using random keys for this reason.

Bob 


* Re: [PATCH for-next 0/6] RDMA/rxe: Fix potential races
  2021-10-19 16:35       ` Bob Pearson
@ 2021-10-19 18:43         ` Jason Gunthorpe
  2021-10-19 22:51           ` Bob Pearson
  0 siblings, 1 reply; 21+ messages in thread
From: Jason Gunthorpe @ 2021-10-19 18:43 UTC (permalink / raw)
  To: Bob Pearson; +Cc: Leon Romanovsky, zyjzyj2000, linux-rdma

On Tue, Oct 19, 2021 at 11:35:30AM -0500, Bob Pearson wrote:

> Take a look at the note I copied you on more recently. There is some
> progress but not complete elimination of rxe_pool. There is another
> project suggested by Jason which is replacing red-black trees with
> xarrays as an alternative approach to indexing rdma objects.  This
> would still duplicate the indexing done by rdma-core. A while back I
> looked at trying to reuse the rdma-core indexing but no effort was
> made to make that easy.

I have no expectation that a driver can re-use the various rdma-core
indexes. That is not what they are for, and they have a different
lifetime semantic from what the driver needs.

> of the APIs are private to rdma-core. These indices are managed by
> the rxe driver for use as lkeys/rkeys, qpns, srqns, and more
> recently address handles. xarrays seem to be more efficient when the
> indices are fairly compact. There is a suggestion that IB and RoCE
> should attempt to make indices that are visible on the network more
> sparse. Nothing will make them secure but they could be a lot more
> secure than they are currently. I believe mlx5 is now using random
> keys for this reason.

Only qpn really benefits from something like this, and it is more
about maximum lifetime before qpn re-use, which is what a cyclically
allocating xarray provides.

Jason
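
[For reference, a minimal sketch of the cyclic allocation Jason describes.
xa_alloc_cyclic() is the real kernel API; the demo_* names and the QPN
range are assumptions.]

	#include <linux/xarray.h>

	static DEFINE_XARRAY_ALLOC(demo_qpns);
	static u32 demo_next_qpn;

	/* hands out QPNs in increasing order and only wraps around once
	 * the range is exhausted, maximizing the time before any given
	 * QPN is re-used
	 */
	static int demo_alloc_qpn(void *qp, u32 *qpn)
	{
		return xa_alloc_cyclic(&demo_qpns, qpn, qp,
				       XA_LIMIT(16, 0x00ffffff),
				       &demo_next_qpn, GFP_KERNEL);
	}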


* Re: [PATCH for-next 0/6] RDMA/rxe: Fix potential races
  2021-10-19 18:43         ` Jason Gunthorpe
@ 2021-10-19 22:51           ` Bob Pearson
  0 siblings, 0 replies; 21+ messages in thread
From: Bob Pearson @ 2021-10-19 22:51 UTC (permalink / raw)
  To: Jason Gunthorpe; +Cc: Leon Romanovsky, zyjzyj2000, linux-rdma

On 10/19/21 1:43 PM, Jason Gunthorpe wrote:
> On Tue, Oct 19, 2021 at 11:35:30AM -0500, Bob Pearson wrote:
> 
>> Take a look at the note I copied you on more recently. There is some
>> progress but not complete elimination of rxe_pool. There is another
>> project suggested by Jason which is replacing red-black trees with
>> xarrays as an alternative approach to indexing rdma objects.  This
>> would still duplicate the indexing done by rdma-core. A while back I
>> looked at trying to reuse the rdma-core indexing but no effort was
>> made to make that easy.
> 
> I have no expectation that a driver can re-use the various rdma-core
> indexes. That is not what they are for, and they have a different
> lifetime semantic from what the driver needs.
> 
>> of the APIs are private to rdma-core. These indices are managed by
>> the rxe driver for use as lkeys/rkeys, qpns, srqns, and more
>> recently address handles. xarrays seem to be more efficient when the
>> indices are fairly compact. There is a suggestion that IB and RoCE
>> should attempt to make indices that are visible on the network more
>> sparse. Nothing will make them secure but they could be a lot more
>> secure than they are currently. I believe mlx5 is now using random
>> keys for this reason.
> 
> Only qpn really benefits from something like this, and it is more
> about maximum lifetime before qpn re-use, which is what a cyclically
> allocating xarray provides.
> 
> Jason
> 

Thanks. I had given up long ago on using the rdma-core indices. Leon would like
to get rid of rxe pools. Actually (I just checked) there is only one remaining
object type that is not already allocated in rdma-core, and that is MR. (The multicast
groups are a special case and not really in the same league as PD, QP, CQ, etc. They
probably should just be replaced with open-coded kzalloc/kfree. They are not shared
with or visible to rdma-core.) So with this exception the pools are really just a way
to add indices to objects, which can be done cleanly with xarrays. (I have a version
that already does this and it works fine; no performance difference from the red-black
trees though. I am still looking to get rid of the spinlocks and use the RCU locking
in xarrays.)

Bob
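
[A sketch of that last step, assuming the xarray-based pool described
above. Illustrative only; the object would also have to be freed with
kfree_rcu() for this to be safe.]

	#include <linux/kref.h>
	#include <linux/rcupdate.h>
	#include <linux/xarray.h>

	/* lockless index lookup: xarray reads are RCU-safe, and
	 * kref_get_unless_zero() rejects an object that is already being
	 * torn down, so no pool spinlock is needed on the fast path
	 */
	static void *pool_get_index_rcu(struct xarray *xa, u32 index)
	{
		struct rxe_pool_entry *elem;
		void *obj = NULL;

		rcu_read_lock();
		elem = xa_load(xa, index);
		if (elem && kref_get_unless_zero(&elem->ref_cnt))
			obj = elem->obj;
		rcu_read_unlock();

		return obj;
	}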


* Re: [PATCH for-next 1/6] RDMA/rxe: Make rxe_alloc() take pool lock
  2021-10-10 23:59 ` [PATCH for-next 1/6] RDMA/rxe: Make rxe_alloc() take pool lock Bob Pearson
@ 2021-10-20 23:16   ` Jason Gunthorpe
  2021-10-21 17:46     ` Bob Pearson
  0 siblings, 1 reply; 21+ messages in thread
From: Jason Gunthorpe @ 2021-10-20 23:16 UTC (permalink / raw)
  To: Bob Pearson; +Cc: zyjzyj2000, linux-rdma

On Sun, Oct 10, 2021 at 06:59:26PM -0500, Bob Pearson wrote:
> In rxe there are two separate pool APIs for creating a new object:
> rxe_alloc() and rxe_alloc_locked(). Currently they are identical.
> Make rxe_alloc() take the pool lock, which is in line with the other
> APIs in the library.
> 
> Signed-off-by: Bob Pearson <rpearsonhpe@gmail.com>
>  drivers/infiniband/sw/rxe/rxe_pool.c | 21 ++++-----------------
>  1 file changed, 4 insertions(+), 17 deletions(-)
> 
> diff --git a/drivers/infiniband/sw/rxe/rxe_pool.c b/drivers/infiniband/sw/rxe/rxe_pool.c
> index ffa8420b4765..7a288ebacceb 100644
> +++ b/drivers/infiniband/sw/rxe/rxe_pool.c
> @@ -352,27 +352,14 @@ void *rxe_alloc_locked(struct rxe_pool *pool)
>  
>  void *rxe_alloc(struct rxe_pool *pool)
>  {
> -	struct rxe_type_info *info = &rxe_type_info[pool->type];
> -	struct rxe_pool_entry *elem;
> +	unsigned long flags;
>  	u8 *obj;
>  
> -	if (atomic_inc_return(&pool->num_elem) > pool->max_elem)
> -		goto out_cnt;
> -
> -	obj = kzalloc(info->size, GFP_KERNEL);
> -	if (!obj)
> -		goto out_cnt;
> -
> -	elem = (struct rxe_pool_entry *)(obj + info->elem_offset);
> -
> -	elem->pool = pool;
> -	kref_init(&elem->ref_cnt);
> +	write_lock_irqsave(&pool->pool_lock, flags);
> +	obj = rxe_alloc_locked(pool);
> +	write_unlock_irqrestore(&pool->pool_lock, flags);

But why? This just makes a GFP_KERNEL allocation into a GFP_ATOMIC
allocation, which is bad.

Jason


* Re: [PATCH for-next 3/6] RDMA/rxe: Save object pointer in pool element
  2021-10-10 23:59 ` [PATCH for-next 3/6] RDMA/rxe: Save object pointer in pool element Bob Pearson
@ 2021-10-20 23:20   ` Jason Gunthorpe
  2021-10-21 17:21     ` Bob Pearson
  0 siblings, 1 reply; 21+ messages in thread
From: Jason Gunthorpe @ 2021-10-20 23:20 UTC (permalink / raw)
  To: Bob Pearson; +Cc: zyjzyj2000, linux-rdma

On Sun, Oct 10, 2021 at 06:59:28PM -0500, Bob Pearson wrote:
> In rxe_pool.c there are currently many cases where it is necessary to
> compute the offset from a pool element struct to the object containing
> it in a type-independent way. By saving a pointer to the object in the
> element when it is created, this extra work can be avoided.
> 
> Signed-off-by: Bob Pearson <rpearsonhpe@gmail.com>
> ---
>  drivers/infiniband/sw/rxe/rxe_pool.c | 16 +++++++++-------
>  drivers/infiniband/sw/rxe/rxe_pool.h |  1 +
>  2 files changed, 10 insertions(+), 7 deletions(-)

This would be better to just add a static_assert or BUILD_BUG_ON that
the offsetof() of the rxe_pool_entry == 0. There is no reason for
these to be sprinkled at every offset.

Then you don't need to do anything at all.

Jason
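
[What that would look like, roughly; an illustrative sketch, with the
struct and member names taken from the driver. As Bob points out in his
reply, these offsets are not actually zero today, so the asserts would
fire until the layout is changed.]

	#include <linux/build_bug.h>
	#include <linux/stddef.h>
	/* struct definitions come from rxe_verbs.h */

	/* if the pool entry is always the first member, elem and obj
	 * share an address and no offset bookkeeping is needed
	 */
	static_assert(offsetof(struct rxe_ucontext, pelem) == 0);
	static_assert(offsetof(struct rxe_mr, pelem) == 0);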


* Re: [PATCH for-next 6/6] RDMA/rxe: Fix potential race condition in rxe_pool
  2021-10-10 23:59 ` [PATCH for-next 6/6] RDMA/rxe: Fix potential race condition in rxe_pool Bob Pearson
@ 2021-10-20 23:23   ` Jason Gunthorpe
  0 siblings, 0 replies; 21+ messages in thread
From: Jason Gunthorpe @ 2021-10-20 23:23 UTC (permalink / raw)
  To: Bob Pearson; +Cc: zyjzyj2000, linux-rdma

On Sun, Oct 10, 2021 at 06:59:31PM -0500, Bob Pearson wrote:
> diff --git a/drivers/infiniband/sw/rxe/rxe_pool.h b/drivers/infiniband/sw/rxe/rxe_pool.h
> index ad287c4ddc1a..43dac03ad82e 100644
> +++ b/drivers/infiniband/sw/rxe/rxe_pool.h
> @@ -132,9 +132,20 @@ void *rxe_pool_get_key(struct rxe_pool *pool, void *key);
>  void rxe_elem_release(struct kref *kref);
>  
>  /* take a reference on an object */
> -#define rxe_add_ref(elem) kref_get(&(elem)->pelem.ref_cnt)
> +static inline int __rxe_add_ref(struct rxe_pool_entry *elem)
> +{
> +	int ret = kref_get_unless_zero(&elem->ref_cnt);
> +
> +	if (unlikely(!ret))
> +		pr_warn("Taking a reference on a %s object that is already destroyed\n",
> +			elem->pool->name);

There are already debugging prints for this built into the kref; you
don't need to open-code them.

Jason


* Re: [PATCH for-next 3/6] RDMA/rxe: Save object pointer in pool element
  2021-10-20 23:20   ` Jason Gunthorpe
@ 2021-10-21 17:21     ` Bob Pearson
  2021-10-25 15:40       ` Jason Gunthorpe
  0 siblings, 1 reply; 21+ messages in thread
From: Bob Pearson @ 2021-10-21 17:21 UTC (permalink / raw)
  To: Jason Gunthorpe, Zhu Yanjun, linux-rdma

On 10/20/21 6:20 PM, Jason Gunthorpe wrote:
> On Sun, Oct 10, 2021 at 06:59:28PM -0500, Bob Pearson wrote:
>> In rxe_pool.c there are currently many cases where it is necessary to
>> compute the offset from a pool element struct to the object containing
>> it in a type-independent way. By saving a pointer to the object in the
>> element when it is created, this extra work can be avoided.
>>
>> Signed-off-by: Bob Pearson <rpearsonhpe@gmail.com>
>> ---
>>  drivers/infiniband/sw/rxe/rxe_pool.c | 16 +++++++++-------
>>  drivers/infiniband/sw/rxe/rxe_pool.h |  1 +
>>  2 files changed, 10 insertions(+), 7 deletions(-)
> 
> This would be better to just add a static_assert or BUILD_BUG_ON that
> the offsetof() of the rxe_pool_entry == 0. There is no reason for
> these to be sprinkled at every offset.
> 
> Then you don't need to do anything at all.
> 
> Jason
> 
I think you missed something. Once upon a time all the rxe objects had the local
pool entry struct followed by the ib_xx struct and then anything else needed locally.
As Leon has been moving the allocations to rdma-core he had to make the ib_xx struct
first, followed by the pool element. So there is a mishmash of offsets. Some
are/were 0, and the rest are all different depending on the size of the struct from
rdma-core, e.g. sizeof(struct ib_mr) != sizeof(struct ib_qp), etc. In order to make
the API simple and consistent I use a macro to convert from object to pool elem, e.g.

	#define some_function(obj) __some_function(&(obj)->pelem)

but to convert back from elem to obj we would need to subtract an arbitrary,
type-dependent offset. With the saved pointer that is just

	obj = elem->obj;

without knowing the type of obj, which is simpler than what we had before, roughly

	struct rxe_type_info *info = &rxe_type_info[elem->pool->type];

	obj = (u8 *)elem - info->elem_offset;


* Re: [PATCH for-next 1/6] RDMA/rxe: Make rxe_alloc() take pool lock
  2021-10-20 23:16   ` Jason Gunthorpe
@ 2021-10-21 17:46     ` Bob Pearson
  2021-10-25 12:43       ` Jason Gunthorpe
  0 siblings, 1 reply; 21+ messages in thread
From: Bob Pearson @ 2021-10-21 17:46 UTC (permalink / raw)
  To: Jason Gunthorpe; +Cc: zyjzyj2000, linux-rdma

On 10/20/21 6:16 PM, Jason Gunthorpe wrote:
> On Sun, Oct 10, 2021 at 06:59:26PM -0500, Bob Pearson wrote:
>> In rxe there are two separate pool APIs for creating a new object
>> rxe_alloc() and rxe_alloc_locked(). Currently they are identical.
>> Make rxe_alloc() take the pool lock which is in line with the other
>> APIs in the library.
>>
>> Signed-off-by: Bob Pearson <rpearsonhpe@gmail.com>
>>  drivers/infiniband/sw/rxe/rxe_pool.c | 21 ++++-----------------
>>  1 file changed, 4 insertions(+), 17 deletions(-)
>>
>> diff --git a/drivers/infiniband/sw/rxe/rxe_pool.c b/drivers/infiniband/sw/rxe/rxe_pool.c
>> index ffa8420b4765..7a288ebacceb 100644
>> +++ b/drivers/infiniband/sw/rxe/rxe_pool.c
>> @@ -352,27 +352,14 @@ void *rxe_alloc_locked(struct rxe_pool *pool)
>>  
>>  void *rxe_alloc(struct rxe_pool *pool)
>>  {
>> -	struct rxe_type_info *info = &rxe_type_info[pool->type];
>> -	struct rxe_pool_entry *elem;
>> +	unsigned long flags;
>>  	u8 *obj;
>>  
>> -	if (atomic_inc_return(&pool->num_elem) > pool->max_elem)
>> -		goto out_cnt;
>> -
>> -	obj = kzalloc(info->size, GFP_KERNEL);
>> -	if (!obj)
>> -		goto out_cnt;
>> -
>> -	elem = (struct rxe_pool_entry *)(obj + info->elem_offset);
>> -
>> -	elem->pool = pool;
>> -	kref_init(&elem->ref_cnt);
>> +	write_lock_irqsave(&pool->pool_lock, flags);
>> +	obj = rxe_alloc_locked(pool);
>> +	write_unlock_irqrestore(&pool->pool_lock, flags);
> 
> But why? This just makes a GFP_KERNEL allocation into a GFP_ATOMIC
> allocation, which is bad.
> 
> Jason
> 
how bad? It only has to happen once in the driver, for mcast group elements,
where currently I have (to avoid the race when two QPs try to join the same
mcast grp on different CPUs at the same time):

	spin_lock()
	grp = rxe_get_key_locked(pool, mgid);
	if (!grp)
		grp = rxe_alloc_locked(pool);
	spin_unlock()

Here the kzalloc has to be GFP_ATOMIC. But after fixing rxe_alloc() to move
the kzalloc out of the lock, I could write:

	newgrp = rxe_alloc(pool);	/* uses GFP_KERNEL */
	spin_lock()
	grp = rxe_get_key_locked(pool, mgid);
	if (grp) {
		kfree(newgrp);
	} else {
		grp = newgrp;
		/* set key in grp */
	}
	spin_unlock()

A typical use case is for a bunch of QPs to join the same mcast group, and
most of the time the key lookup succeeds. The trade-off is an extra
malloc/free in the rare miss case versus occasional bad behavior from
GFP_ATOMIC.

The majority of uses for rxe_alloc() do not have these issues, and I can
move the kzalloc outside of the lock and use GFP_KERNEL.
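
A sketch of that rework, reusing the pre-patch code quoted above (only the
kzalloc placement changes; any index or key setup that still needs the pool
lock is omitted here):

	void *rxe_alloc(struct rxe_pool *pool)
	{
		struct rxe_type_info *info = &rxe_type_info[pool->type];
		struct rxe_pool_entry *elem;
		u8 *obj;

		/* no lock held yet, so GFP_KERNEL is fine */
		obj = kzalloc(info->size, GFP_KERNEL);
		if (!obj)
			return NULL;

		if (atomic_inc_return(&pool->num_elem) > pool->max_elem) {
			atomic_dec(&pool->num_elem);
			kfree(obj);
			return NULL;
		}

		elem = (struct rxe_pool_entry *)(obj + info->elem_offset);
		elem->pool = pool;
		kref_init(&elem->ref_cnt);

		return obj;
	}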

Bob

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH for-next 1/6] RDMA/rxe: Make rxe_alloc() take pool lock
  2021-10-21 17:46     ` Bob Pearson
@ 2021-10-25 12:43       ` Jason Gunthorpe
  2021-10-25 18:48         ` Robert Pearson
  0 siblings, 1 reply; 21+ messages in thread
From: Jason Gunthorpe @ 2021-10-25 12:43 UTC (permalink / raw)
  To: Bob Pearson; +Cc: zyjzyj2000, linux-rdma

On Thu, Oct 21, 2021 at 12:46:50PM -0500, Bob Pearson wrote:
> On 10/20/21 6:16 PM, Jason Gunthorpe wrote:
> > On Sun, Oct 10, 2021 at 06:59:26PM -0500, Bob Pearson wrote:
> >> In rxe there are two separate pool APIs for creating a new object
> >> rxe_alloc() and rxe_alloc_locked(). Currently they are identical.
> >> Make rxe_alloc() take the pool lock which is in line with the other
> >> APIs in the library.
> >>
> >> Signed-off-by: Bob Pearson <rpearsonhpe@gmail.com>
> >>  drivers/infiniband/sw/rxe/rxe_pool.c | 21 ++++-----------------
> >>  1 file changed, 4 insertions(+), 17 deletions(-)
> >>
> >> diff --git a/drivers/infiniband/sw/rxe/rxe_pool.c b/drivers/infiniband/sw/rxe/rxe_pool.c
> >> index ffa8420b4765..7a288ebacceb 100644
> >> +++ b/drivers/infiniband/sw/rxe/rxe_pool.c
> >> @@ -352,27 +352,14 @@ void *rxe_alloc_locked(struct rxe_pool *pool)
> >>  
> >>  void *rxe_alloc(struct rxe_pool *pool)
> >>  {
> >> -	struct rxe_type_info *info = &rxe_type_info[pool->type];
> >> -	struct rxe_pool_entry *elem;
> >> +	unsigned long flags;
> >>  	u8 *obj;
> >>  
> >> -	if (atomic_inc_return(&pool->num_elem) > pool->max_elem)
> >> -		goto out_cnt;
> >> -
> >> -	obj = kzalloc(info->size, GFP_KERNEL);
> >> -	if (!obj)
> >> -		goto out_cnt;
> >> -
> >> -	elem = (struct rxe_pool_entry *)(obj + info->elem_offset);
> >> -
> >> -	elem->pool = pool;
> >> -	kref_init(&elem->ref_cnt);
> >> +	write_lock_irqsave(&pool->pool_lock, flags);
> >> +	obj = rxe_alloc_locked(pool);
> >> +	write_unlock_irqrestore(&pool->pool_lock, flags);
> > 
> > But why? This just makes a GFP_KERNEL allocation into a GFP_ATOMIC
> > allocation, which is bad.
> > 
> > Jason
> > 
> how bad?

Quite bad..

> It only has to happen once in the driver, for mcast group elements,
> where currently I have (to avoid the race when two QPs try to join the
> same mcast grp on different CPUs at the same time):
> 
> 	spin_lock()
> 	grp = rxe_get_key_locked(pool, mgid);
> 	if (!grp)
> 		grp = rxe_alloc_locked(pool);
> 	spin_unlock()

When you have xarray the general pattern is:

old = xa_load(mgid);
if (old)
	return old;

new = kzalloc();
old = xa_cmpxchg(mgid, NULL, new);
if (old) {
	kfree(new);
	return old;
}
return new;
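
Fleshed out with the real xarray signatures (and assuming both that the
pool carries a struct xarray and that the 128-bit MGID has already been
reduced to an unsigned long key, which is the hard part), that pattern
could look roughly like:

	struct rxe_mc_grp *grp, *new, *old;

	grp = xa_load(&pool->xa, key);
	if (grp)
		return grp;

	new = kzalloc(sizeof(*new), GFP_KERNEL);
	if (!new)
		return NULL;

	/* xa_cmpxchg() returns NULL on success, the entry that beat us
	 * to the slot, or an xa_err()-encoded pointer on failure
	 */
	old = xa_cmpxchg(&pool->xa, key, NULL, new, GFP_KERNEL);
	if (xa_is_err(old)) {
		kfree(new);
		return NULL;
	}
	if (old) {
		kfree(new);
		return old;
	}
	return new;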

There are several examples of this with various locking patterns

Jason

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH for-next 3/6] RDMA/rxe: Save object pointer in pool element
  2021-10-21 17:21     ` Bob Pearson
@ 2021-10-25 15:40       ` Jason Gunthorpe
  0 siblings, 0 replies; 21+ messages in thread
From: Jason Gunthorpe @ 2021-10-25 15:40 UTC (permalink / raw)
  To: Bob Pearson; +Cc: Zhu Yanjun, linux-rdma

On Thu, Oct 21, 2021 at 12:21:15PM -0500, Bob Pearson wrote:
> On 10/20/21 6:20 PM, Jason Gunthorpe wrote:
> > On Sun, Oct 10, 2021 at 06:59:28PM -0500, Bob Pearson wrote:
> >> In rxe_pool.c currently there are many cases where it is necessary to
> >> compute the offset from a pool element struct to the object containing
> >> the pool element in a type-independent way. By saving a pointer to the
> >> object when it is created, extra work can be saved.
> >>
> >> Signed-off-by: Bob Pearson <rpearsonhpe@gmail.com>
> >>  drivers/infiniband/sw/rxe/rxe_pool.c | 16 +++++++++-------
> >>  drivers/infiniband/sw/rxe/rxe_pool.h |  1 +
> >>  2 files changed, 10 insertions(+), 7 deletions(-)
> > 
> > This would be better to just add a static_assert or build_bug_on that
> > the offsetof() of the rxe_pool_entry == 0. There is no reason for these
> > to be sprinkled at every offset
> > 
> > Then you don't need to do anything at all
> > 
> > Jason
> > 
> I think you missed something. Once upon a time all the rxe objects had the
> local pool entry struct first, followed by the ib_xx struct and then anything
> else needed locally. As Leon has been moving the allocations to rdma-core he
> had to make the ib_xx struct come first, followed by the pool element.

Oh, I forgot that we were forcing the ib_xx to be at the start already

Jason

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH for-next 1/6] RDMA/rxe: Make rxe_alloc() take pool lock
  2021-10-25 12:43       ` Jason Gunthorpe
@ 2021-10-25 18:48         ` Robert Pearson
  0 siblings, 0 replies; 21+ messages in thread
From: Robert Pearson @ 2021-10-25 18:48 UTC (permalink / raw)
  To: Jason Gunthorpe; +Cc: Zhu Yanjun, RDMA mailing list

This looks useful, but I had given up on using xarrays to hold multicast
groups because mgids are 128 bits. The ib_attach_mcast() verb just passes
in the QP and MGID, and I couldn't think of an efficient way to reduce the
MGID to a 32-bit non-sparse index. The xarray documentation advises against
hashing the object to get an index. If Linux hands out mcast group IDs
sequentially it may be OK to use the flag and scope bits concatenated with
the low-order 24 bits of the group number, but one would have to deal with
(perhaps never occurring) collisions. If this worked I could get rid of all
the RB trees and make everything an xarray.
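
For concreteness, a hypothetical sketch of that index (mgid_to_index is an
invented name, and a collision policy would still be needed):

	/* byte 1 of an IPv6-style multicast address carries the flag
	 * and scope nibbles; bytes 13..15 are the low 24 bits of the
	 * group ID
	 */
	static u32 mgid_to_index(const union ib_gid *mgid)
	{
		return ((u32)mgid->raw[1] << 24) |
		       ((u32)mgid->raw[13] << 16) |
		       ((u32)mgid->raw[14] << 8) |
			(u32)mgid->raw[15];
	}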

Bob

On Mon, Oct 25, 2021 at 7:43 AM Jason Gunthorpe <jgg@nvidia.com> wrote:
>
> On Thu, Oct 21, 2021 at 12:46:50PM -0500, Bob Pearson wrote:
> > On 10/20/21 6:16 PM, Jason Gunthorpe wrote:
> > > On Sun, Oct 10, 2021 at 06:59:26PM -0500, Bob Pearson wrote:
> > >> In rxe there are two separate pool APIs for creating a new object
> > >> rxe_alloc() and rxe_alloc_locked(). Currently they are identical.
> > >> Make rxe_alloc() take the pool lock which is in line with the other
> > >> APIs in the library.
> > >>
> > >> Signed-off-by: Bob Pearson <rpearsonhpe@gmail.com>
> > >>  drivers/infiniband/sw/rxe/rxe_pool.c | 21 ++++-----------------
> > >>  1 file changed, 4 insertions(+), 17 deletions(-)
> > >>
> > >> diff --git a/drivers/infiniband/sw/rxe/rxe_pool.c b/drivers/infiniband/sw/rxe/rxe_pool.c
> > >> index ffa8420b4765..7a288ebacceb 100644
> > >> +++ b/drivers/infiniband/sw/rxe/rxe_pool.c
> > >> @@ -352,27 +352,14 @@ void *rxe_alloc_locked(struct rxe_pool *pool)
> > >>
> > >>  void *rxe_alloc(struct rxe_pool *pool)
> > >>  {
> > >> -  struct rxe_type_info *info = &rxe_type_info[pool->type];
> > >> -  struct rxe_pool_entry *elem;
> > >> +  unsigned long flags;
> > >>    u8 *obj;
> > >>
> > >> -  if (atomic_inc_return(&pool->num_elem) > pool->max_elem)
> > >> -          goto out_cnt;
> > >> -
> > >> -  obj = kzalloc(info->size, GFP_KERNEL);
> > >> -  if (!obj)
> > >> -          goto out_cnt;
> > >> -
> > >> -  elem = (struct rxe_pool_entry *)(obj + info->elem_offset);
> > >> -
> > >> -  elem->pool = pool;
> > >> -  kref_init(&elem->ref_cnt);
> > >> +  write_lock_irqsave(&pool->pool_lock, flags);
> > >> +  obj = rxe_alloc_locked(pool);
> > >> +  write_unlock_irqrestore(&pool->pool_lock, flags);
> > >
> > > But why? This just makes a GFP_KERNEL allocation into a GFP_ATOMIC
> > > allocation, which is bad.
> > >
> > > Jason
> > >
> > how bad?
>
> Quite bad..
>
> > It only has to happen once in the driver, for mcast group elements,
> > where currently I have (to avoid the race when two QPs try to join the
> > same mcast grp on different CPUs at the same time):
> >
> >       spin_lock()
> >       grp = rxe_get_key_locked(pool, mgid);
> >       if (!grp)
> >               grp = rxe_alloc_locked(pool);
> >       spin_unlock()
>
> When you have xarray the general pattern is:
>
> old = xa_load(mgid);
> if (old)
> 	return old;
>
> new = kzalloc();
> old = xa_cmpxchg(mgid, NULL, new);
> if (old) {
> 	kfree(new);
> 	return old;
> }
> return new;
>
> There are several examples of this with various locking patterns
>
> Jason

^ permalink raw reply	[flat|nested] 21+ messages in thread

end of thread, other threads:[~2021-10-25 18:49 UTC | newest]

Thread overview: 21+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2021-10-10 23:59 [PATCH for-next 0/6] RDMA/rxe: Fix potential races Bob Pearson
2021-10-10 23:59 ` [PATCH for-next 1/6] RDMA/rxe: Make rxe_alloc() take pool lock Bob Pearson
2021-10-20 23:16   ` Jason Gunthorpe
2021-10-21 17:46     ` Bob Pearson
2021-10-25 12:43       ` Jason Gunthorpe
2021-10-25 18:48         ` Robert Pearson
2021-10-10 23:59 ` [PATCH for-next 2/6] RDMA/rxe: Copy setup parameters into rxe_pool Bob Pearson
2021-10-10 23:59 ` [PATCH for-next 3/6] RDMA/rxe: Save object pointer in pool element Bob Pearson
2021-10-20 23:20   ` Jason Gunthorpe
2021-10-21 17:21     ` Bob Pearson
2021-10-25 15:40       ` Jason Gunthorpe
2021-10-10 23:59 ` [PATCH for-next 4/6] RDMA/rxe: Combine rxe_add_index with rxe_alloc Bob Pearson
2021-10-10 23:59 ` [PATCH for-next 5/6] RDMA/rxe: Combine rxe_add_key " Bob Pearson
2021-10-10 23:59 ` [PATCH for-next 6/6] RDMA/rxe: Fix potential race condition in rxe_pool Bob Pearson
2021-10-20 23:23   ` Jason Gunthorpe
2021-10-12  6:34 ` [PATCH for-next 0/6] RDMA/rxe: Fix potential races Leon Romanovsky
2021-10-12 20:19   ` Bob Pearson
2021-10-19 13:07     ` Leon Romanovsky
2021-10-19 16:35       ` Bob Pearson
2021-10-19 18:43         ` Jason Gunthorpe
2021-10-19 22:51           ` Bob Pearson
