linux-rdma.vger.kernel.org archive mirror
* [PATCH rdma-next 0/7] MR cache enhancement
@ 2021-12-06  9:10 Leon Romanovsky
  2021-12-06  9:10 ` [PATCH rdma-next 1/7] RDMA/mlx5: Merge similar flows of allocating MR from the cache Leon Romanovsky
                   ` (7 more replies)
  0 siblings, 8 replies; 15+ messages in thread
From: Leon Romanovsky @ 2021-12-06  9:10 UTC (permalink / raw)
  To: Jason Gunthorpe; +Cc: Leon Romanovsky, Aharon Landau, linux-rdma

From: Leon Romanovsky <leonro@nvidia.com>

Hi,

This series from Aharon refactors the mlx5 MR cache management logic to
significantly speed up deregistration.

Thanks

Aharon Landau (7):
  RDMA/mlx5: Merge similar flows of allocating MR from the cache
  RDMA/mlx5: Replace cache list with Xarray
  RDMA/mlx5: Store in the cache mkeys instead of mrs
  RDMA/mlx5: Change the cache structure to an RB-tree
  RDMA/mlx5: Reorder calls to pcie_relaxed_ordering_enabled()
  RDMA/mlx5: Delay the deregistration of a non-cache mkey
  RDMA/mlx5: Rename the mkey cache variables and functions

 drivers/infiniband/hw/mlx5/main.c    |   4 +-
 drivers/infiniband/hw/mlx5/mlx5_ib.h |  78 +--
 drivers/infiniband/hw/mlx5/mr.c      | 930 +++++++++++++++++----------
 drivers/infiniband/hw/mlx5/odp.c     |  72 ++-
 include/linux/mlx5/driver.h          |   7 +-
 5 files changed, 673 insertions(+), 418 deletions(-)

-- 
2.33.1



* [PATCH rdma-next 1/7] RDMA/mlx5: Merge similar flows of allocating MR from the cache
  2021-12-06  9:10 [PATCH rdma-next 0/7] MR cache enhancement Leon Romanovsky
@ 2021-12-06  9:10 ` Leon Romanovsky
  2021-12-06  9:10 ` [PATCH rdma-next 2/7] RDMA/mlx5: Replace cache list with Xarray Leon Romanovsky
                   ` (6 subsequent siblings)
  7 siblings, 0 replies; 15+ messages in thread
From: Leon Romanovsky @ 2021-12-06  9:10 UTC (permalink / raw)
  To: Jason Gunthorpe; +Cc: Aharon Landau, linux-kernel, linux-rdma

From: Aharon Landau <aharonl@nvidia.com>

When allocating an MR from the cache, the driver calls get_cache_mr()
and, in case of failure, retries with create_cache_mr(). This is
exactly the flow of mlx5_mr_cache_alloc(), so use it instead.
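
For reference, the merged flow is roughly the following simplified
sketch of the resulting mlx5_mr_cache_alloc() (the UMR capability
check and error unwinding are trimmed):

struct mlx5_ib_mr *mlx5_mr_cache_alloc(struct mlx5_ib_dev *dev,
				       struct mlx5_cache_ent *ent,
				       int access_flags)
{
	struct mlx5_ib_mr *mr;

	spin_lock_irq(&ent->lock);
	if (list_empty(&ent->head)) {
		/*
		 * Cache miss: account for it and fall back to a
		 * synchronous mkey creation, exactly what the old
		 * get_cache_mr() + create_cache_mr() pair did.
		 */
		if (ent->limit) {
			queue_adjust_cache_locked(ent);
			ent->miss++;
		}
		spin_unlock_irq(&ent->lock);
		return create_cache_mr(ent);
	}

	/* Cache hit: pop the first available MR */
	mr = list_first_entry(&ent->head, struct mlx5_ib_mr, list);
	list_del(&mr->list);
	ent->available_mrs--;
	queue_adjust_cache_locked(ent);
	spin_unlock_irq(&ent->lock);
	mlx5_clear_mr(mr);
	return mr;
}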

Signed-off-by: Aharon Landau <aharonl@nvidia.com>
Signed-off-by: Leon Romanovsky <leonro@nvidia.com>
---
 drivers/infiniband/hw/mlx5/mlx5_ib.h |  3 +-
 drivers/infiniband/hw/mlx5/mr.c      | 51 +++++-----------------------
 drivers/infiniband/hw/mlx5/odp.c     | 11 ++++--
 3 files changed, 19 insertions(+), 46 deletions(-)

diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h b/drivers/infiniband/hw/mlx5/mlx5_ib.h
index e636e954f6bf..804748174752 100644
--- a/drivers/infiniband/hw/mlx5/mlx5_ib.h
+++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h
@@ -1343,7 +1343,8 @@ int mlx5_mr_cache_init(struct mlx5_ib_dev *dev);
 int mlx5_mr_cache_cleanup(struct mlx5_ib_dev *dev);
 
 struct mlx5_ib_mr *mlx5_mr_cache_alloc(struct mlx5_ib_dev *dev,
-				       unsigned int entry, int access_flags);
+				       struct mlx5_cache_ent *ent,
+				       int access_flags);
 
 int mlx5_ib_check_mr_status(struct ib_mr *ibmr, u32 check_mask,
 			    struct ib_mr_status *mr_status);
diff --git a/drivers/infiniband/hw/mlx5/mr.c b/drivers/infiniband/hw/mlx5/mr.c
index 157d862fb864..2cba55bb7825 100644
--- a/drivers/infiniband/hw/mlx5/mr.c
+++ b/drivers/infiniband/hw/mlx5/mr.c
@@ -566,25 +566,22 @@ static void cache_work_func(struct work_struct *work)
 	__cache_work_func(ent);
 }
 
-/* Allocate a special entry from the cache */
 struct mlx5_ib_mr *mlx5_mr_cache_alloc(struct mlx5_ib_dev *dev,
-				       unsigned int entry, int access_flags)
+				       struct mlx5_cache_ent *ent,
+				       int access_flags)
 {
-	struct mlx5_mr_cache *cache = &dev->cache;
-	struct mlx5_cache_ent *ent;
 	struct mlx5_ib_mr *mr;
 
-	if (WARN_ON(entry <= MR_CACHE_LAST_STD_ENTRY ||
-		    entry >= ARRAY_SIZE(cache->ent)))
-		return ERR_PTR(-EINVAL);
-
 	/* Matches access in alloc_cache_mr() */
 	if (!mlx5_ib_can_reconfig_with_umr(dev, 0, access_flags))
 		return ERR_PTR(-EOPNOTSUPP);
 
-	ent = &cache->ent[entry];
 	spin_lock_irq(&ent->lock);
 	if (list_empty(&ent->head)) {
+		if (ent->limit) {
+			queue_adjust_cache_locked(ent);
+			ent->miss++;
+		}
 		spin_unlock_irq(&ent->lock);
 		mr = create_cache_mr(ent);
 		if (IS_ERR(mr))
@@ -598,32 +595,9 @@ struct mlx5_ib_mr *mlx5_mr_cache_alloc(struct mlx5_ib_dev *dev,
 
 		mlx5_clear_mr(mr);
 	}
-	mr->access_flags = access_flags;
 	return mr;
 }
 
-/* Return a MR already available in the cache */
-static struct mlx5_ib_mr *get_cache_mr(struct mlx5_cache_ent *req_ent)
-{
-	struct mlx5_ib_mr *mr = NULL;
-	struct mlx5_cache_ent *ent = req_ent;
-
-	spin_lock_irq(&ent->lock);
-	if (!list_empty(&ent->head)) {
-		mr = list_first_entry(&ent->head, struct mlx5_ib_mr, list);
-		list_del(&mr->list);
-		ent->available_mrs--;
-		queue_adjust_cache_locked(ent);
-		spin_unlock_irq(&ent->lock);
-		mlx5_clear_mr(mr);
-		return mr;
-	}
-	queue_adjust_cache_locked(ent);
-	spin_unlock_irq(&ent->lock);
-	req_ent->miss++;
-	return NULL;
-}
-
 static void mlx5_mr_cache_free(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr)
 {
 	struct mlx5_cache_ent *ent = mr->cache_ent;
@@ -959,16 +933,9 @@ static struct mlx5_ib_mr *alloc_cacheable_mr(struct ib_pd *pd,
 		return mr;
 	}
 
-	mr = get_cache_mr(ent);
-	if (!mr) {
-		mr = create_cache_mr(ent);
-		/*
-		 * The above already tried to do the same stuff as reg_create(),
-		 * no reason to try it again.
-		 */
-		if (IS_ERR(mr))
-			return mr;
-	}
+	mr = mlx5_mr_cache_alloc(dev, ent, access_flags);
+	if (IS_ERR(mr))
+		return mr;
 
 	mr->ibmr.pd = pd;
 	mr->umem = umem;
diff --git a/drivers/infiniband/hw/mlx5/odp.c b/drivers/infiniband/hw/mlx5/odp.c
index 91eb615b89ee..0972afc3e952 100644
--- a/drivers/infiniband/hw/mlx5/odp.c
+++ b/drivers/infiniband/hw/mlx5/odp.c
@@ -407,6 +407,7 @@ static void mlx5_ib_page_fault_resume(struct mlx5_ib_dev *dev,
 static struct mlx5_ib_mr *implicit_get_child_mr(struct mlx5_ib_mr *imr,
 						unsigned long idx)
 {
+	struct mlx5_ib_dev *dev = mr_to_mdev(imr);
 	struct ib_umem_odp *odp;
 	struct mlx5_ib_mr *mr;
 	struct mlx5_ib_mr *ret;
@@ -418,13 +419,14 @@ static struct mlx5_ib_mr *implicit_get_child_mr(struct mlx5_ib_mr *imr,
 	if (IS_ERR(odp))
 		return ERR_CAST(odp);
 
-	mr = mlx5_mr_cache_alloc(
-		mr_to_mdev(imr), MLX5_IMR_MTT_CACHE_ENTRY, imr->access_flags);
+	mr = mlx5_mr_cache_alloc(dev, &dev->cache.ent[MLX5_IMR_MTT_CACHE_ENTRY],
+				 imr->access_flags);
 	if (IS_ERR(mr)) {
 		ib_umem_odp_release(odp);
 		return mr;
 	}
 
+	mr->access_flags = imr->access_flags;
 	mr->ibmr.pd = imr->ibmr.pd;
 	mr->ibmr.device = &mr_to_mdev(imr)->ib_dev;
 	mr->umem = &odp->umem;
@@ -493,12 +495,15 @@ struct mlx5_ib_mr *mlx5_ib_alloc_implicit_mr(struct mlx5_ib_pd *pd,
 	if (IS_ERR(umem_odp))
 		return ERR_CAST(umem_odp);
 
-	imr = mlx5_mr_cache_alloc(dev, MLX5_IMR_KSM_CACHE_ENTRY, access_flags);
+	imr = mlx5_mr_cache_alloc(dev,
+				  &dev->cache.ent[MLX5_IMR_KSM_CACHE_ENTRY],
+				  access_flags);
 	if (IS_ERR(imr)) {
 		ib_umem_odp_release(umem_odp);
 		return imr;
 	}
 
+	imr->access_flags = access_flags;
 	imr->ibmr.pd = &pd->ibpd;
 	imr->ibmr.iova = 0;
 	imr->umem = &umem_odp->umem;
-- 
2.33.1



* [PATCH rdma-next 2/7] RDMA/mlx5: Replace cache list with Xarray
  2021-12-06  9:10 [PATCH rdma-next 0/7] MR cache enhancement Leon Romanovsky
  2021-12-06  9:10 ` [PATCH rdma-next 1/7] RDMA/mlx5: Merge similar flows of allocating MR from the cache Leon Romanovsky
@ 2021-12-06  9:10 ` Leon Romanovsky
  2021-12-08 16:46   ` Jason Gunthorpe
  2021-12-06  9:10 ` [PATCH rdma-next 3/7] RDMA/mlx5: Store in the cache mkeys instead of mrs Leon Romanovsky
                   ` (5 subsequent siblings)
  7 siblings, 1 reply; 15+ messages in thread
From: Leon Romanovsky @ 2021-12-06  9:10 UTC (permalink / raw)
  To: Jason Gunthorpe; +Cc: Aharon Landau, linux-kernel, linux-rdma

From: Aharon Landau <aharonl@nvidia.com>

The Xarray allows us to store the cached mkeys in a memory-efficient
way, and its internal xa_lock protects the indexes, letting us get rid
of ent->lock, which is no longer required.

Entries are reserved in the Xarray with xa_cmpxchg before the
asynchronous creation callbacks run, to avoid allocations in interrupt
context. xa_cmpxchg can sleep when using GFP_KERNEL, so it is called in
a loop to ensure one reserved entry for each process trying to reserve.
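
For clarity, this is the reservation helper added below
(_push_reserve_mkey()), shown with extra comments; it must be called
with xa_lock_irq(&ent->mkeys) held:

static int _push_reserve_mkey(struct mlx5_cache_ent *ent)
{
	unsigned long to_reserve;
	int rc;

	while (true) {
		to_reserve = ent->reserved;
		/*
		 * Reserve the slot with a ZERO entry so that the
		 * create_mkey completion callback can later fill it
		 * without allocating memory.  With GFP_KERNEL,
		 * __xa_cmpxchg() may drop the xa_lock and sleep to
		 * allocate xarray nodes ...
		 */
		rc = xa_err(__xa_cmpxchg(&ent->mkeys, to_reserve, NULL,
					 XA_ZERO_ENTRY, GFP_KERNEL));
		if (rc)
			return rc;
		/*
		 * ... so ent->reserved may have changed while we
		 * slept.  If it did, retry with the updated index so
		 * that every caller ends up with exactly one reserved
		 * slot.
		 */
		if (to_reserve != ent->reserved)
			continue;
		ent->reserved++;
		break;
	}
	return 0;
}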

Signed-off-by: Aharon Landau <aharonl@nvidia.com>
Signed-off-by: Leon Romanovsky <leonro@nvidia.com>
---
 drivers/infiniband/hw/mlx5/mlx5_ib.h |  11 +-
 drivers/infiniband/hw/mlx5/mr.c      | 206 +++++++++++++++------------
 2 files changed, 119 insertions(+), 98 deletions(-)

diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h b/drivers/infiniband/hw/mlx5/mlx5_ib.h
index 804748174752..d0224f468169 100644
--- a/drivers/infiniband/hw/mlx5/mlx5_ib.h
+++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h
@@ -754,11 +754,9 @@ struct umr_common {
 };
 
 struct mlx5_cache_ent {
-	struct list_head	head;
-	/* sync access to the cahce entry
-	 */
-	spinlock_t		lock;
-
+	struct xarray		mkeys;
+	unsigned long		stored;
+	unsigned long		reserved;
 
 	char                    name[4];
 	u32                     order;
@@ -770,8 +768,6 @@ struct mlx5_cache_ent {
 	u8 fill_to_high_water:1;
 
 	/*
-	 * - available_mrs is the length of list head, ie the number of MRs
-	 *   available for immediate allocation.
 	 * - total_mrs is available_mrs plus all in use MRs that could be
 	 *   returned to the cache.
 	 * - limit is the low water mark for available_mrs, 2* limit is the
@@ -779,7 +775,6 @@ struct mlx5_cache_ent {
 	 * - pending is the number of MRs currently being created
 	 */
 	u32 total_mrs;
-	u32 available_mrs;
 	u32 limit;
 	u32 pending;
 
diff --git a/drivers/infiniband/hw/mlx5/mr.c b/drivers/infiniband/hw/mlx5/mr.c
index 2cba55bb7825..e62f8da8558d 100644
--- a/drivers/infiniband/hw/mlx5/mr.c
+++ b/drivers/infiniband/hw/mlx5/mr.c
@@ -147,14 +147,17 @@ static void create_mkey_callback(int status, struct mlx5_async_work *context)
 	struct mlx5_cache_ent *ent = mr->cache_ent;
 	struct mlx5_ib_dev *dev = ent->dev;
 	unsigned long flags;
+	void *xa_ent;
 
 	if (status) {
 		mlx5_ib_warn(dev, "async reg mr failed. status %d\n", status);
 		kfree(mr);
-		spin_lock_irqsave(&ent->lock, flags);
+		xa_lock_irqsave(&ent->mkeys, flags);
+		xa_ent = __xa_erase(&ent->mkeys, ent->reserved--);
+		WARN_ON(xa_ent != NULL);
 		ent->pending--;
 		WRITE_ONCE(dev->fill_delay, 1);
-		spin_unlock_irqrestore(&ent->lock, flags);
+		xa_unlock_irqrestore(&ent->mkeys, flags);
 		mod_timer(&dev->delay_timer, jiffies + HZ);
 		return;
 	}
@@ -166,14 +169,14 @@ static void create_mkey_callback(int status, struct mlx5_async_work *context)
 
 	WRITE_ONCE(dev->cache.last_add, jiffies);
 
-	spin_lock_irqsave(&ent->lock, flags);
-	list_add_tail(&mr->list, &ent->head);
-	ent->available_mrs++;
+	xa_lock_irqsave(&ent->mkeys, flags);
+	xa_ent = __xa_store(&ent->mkeys, ent->stored++, mr, GFP_ATOMIC);
+	WARN_ON(xa_ent != NULL);
+	ent->pending--;
 	ent->total_mrs++;
 	/* If we are doing fill_to_high_water then keep going. */
 	queue_adjust_cache_locked(ent);
-	ent->pending--;
-	spin_unlock_irqrestore(&ent->lock, flags);
+	xa_unlock_irqrestore(&ent->mkeys, flags);
 }
 
 static struct mlx5_ib_mr *alloc_cache_mr(struct mlx5_cache_ent *ent, void *mkc)
@@ -196,6 +199,25 @@ static struct mlx5_ib_mr *alloc_cache_mr(struct mlx5_cache_ent *ent, void *mkc)
 	return mr;
 }
 
+static int _push_reserve_mkey(struct mlx5_cache_ent *ent)
+{
+	unsigned long to_reserve;
+	int rc;
+
+	while (true) {
+		to_reserve = ent->reserved;
+		rc = xa_err(__xa_cmpxchg(&ent->mkeys, to_reserve, NULL,
+					 XA_ZERO_ENTRY, GFP_KERNEL));
+		if (rc)
+			return rc;
+		if (to_reserve != ent->reserved)
+			continue;
+		ent->reserved++;
+		break;
+	}
+	return 0;
+}
+
 /* Asynchronously schedule new MRs to be populated in the cache. */
 static int add_keys(struct mlx5_cache_ent *ent, unsigned int num)
 {
@@ -217,23 +239,32 @@ static int add_keys(struct mlx5_cache_ent *ent, unsigned int num)
 			err = -ENOMEM;
 			break;
 		}
-		spin_lock_irq(&ent->lock);
+		xa_lock_irq(&ent->mkeys);
 		if (ent->pending >= MAX_PENDING_REG_MR) {
+			xa_unlock_irq(&ent->mkeys);
 			err = -EAGAIN;
-			spin_unlock_irq(&ent->lock);
+			kfree(mr);
+			break;
+		}
+
+		err = _push_reserve_mkey(ent);
+		if (err) {
+			xa_unlock_irq(&ent->mkeys);
 			kfree(mr);
 			break;
 		}
 		ent->pending++;
-		spin_unlock_irq(&ent->lock);
+		xa_unlock_irq(&ent->mkeys);
 		err = mlx5_ib_create_mkey_cb(ent->dev, &mr->mmkey,
 					     &ent->dev->async_ctx, in, inlen,
 					     mr->out, sizeof(mr->out),
 					     &mr->cb_work);
 		if (err) {
-			spin_lock_irq(&ent->lock);
+			xa_lock_irq(&ent->mkeys);
+			WARN_ON(__xa_erase(&ent->mkeys, ent->reserved--) !=
+				NULL);
 			ent->pending--;
-			spin_unlock_irq(&ent->lock);
+			xa_unlock_irq(&ent->mkeys);
 			mlx5_ib_warn(ent->dev, "create mkey failed %d\n", err);
 			kfree(mr);
 			break;
@@ -271,9 +302,9 @@ static struct mlx5_ib_mr *create_cache_mr(struct mlx5_cache_ent *ent)
 	init_waitqueue_head(&mr->mmkey.wait);
 	mr->mmkey.type = MLX5_MKEY_MR;
 	WRITE_ONCE(ent->dev->cache.last_add, jiffies);
-	spin_lock_irq(&ent->lock);
+	xa_lock_irq(&ent->mkeys);
 	ent->total_mrs++;
-	spin_unlock_irq(&ent->lock);
+	xa_unlock_irq(&ent->mkeys);
 	kfree(in);
 	return mr;
 free_mr:
@@ -287,39 +318,37 @@ static void remove_cache_mr_locked(struct mlx5_cache_ent *ent)
 {
 	struct mlx5_ib_mr *mr;
 
-	lockdep_assert_held(&ent->lock);
-	if (list_empty(&ent->head))
+	if (!ent->stored)
 		return;
-	mr = list_first_entry(&ent->head, struct mlx5_ib_mr, list);
-	list_del(&mr->list);
-	ent->available_mrs--;
+	mr = __xa_store(&ent->mkeys, --ent->stored, NULL, GFP_KERNEL);
+	WARN_ON(mr == NULL || xa_is_err(mr));
+	WARN_ON(__xa_erase(&ent->mkeys, --ent->reserved) != NULL);
 	ent->total_mrs--;
-	spin_unlock_irq(&ent->lock);
+	xa_unlock_irq(&ent->mkeys);
 	mlx5_core_destroy_mkey(ent->dev->mdev, mr->mmkey.key);
 	kfree(mr);
-	spin_lock_irq(&ent->lock);
+	xa_lock_irq(&ent->mkeys);
 }
 
 static int resize_available_mrs(struct mlx5_cache_ent *ent, unsigned int target,
 				bool limit_fill)
+	 __acquires(&ent->lock) __releases(&ent->lock)
 {
 	int err;
 
-	lockdep_assert_held(&ent->lock);
-
 	while (true) {
 		if (limit_fill)
 			target = ent->limit * 2;
-		if (target == ent->available_mrs + ent->pending)
+		if (target == ent->reserved)
 			return 0;
-		if (target > ent->available_mrs + ent->pending) {
-			u32 todo = target - (ent->available_mrs + ent->pending);
+		if (target > ent->reserved) {
+			u32 todo = target - ent->reserved;
 
-			spin_unlock_irq(&ent->lock);
+			xa_unlock_irq(&ent->mkeys);
 			err = add_keys(ent, todo);
 			if (err == -EAGAIN)
 				usleep_range(3000, 5000);
-			spin_lock_irq(&ent->lock);
+			xa_lock_irq(&ent->mkeys);
 			if (err) {
 				if (err != -EAGAIN)
 					return err;
@@ -347,12 +376,13 @@ static ssize_t size_write(struct file *filp, const char __user *buf,
 	 * cannot free MRs that are in use. Compute the target value for
 	 * available_mrs.
 	 */
-	spin_lock_irq(&ent->lock);
-	if (target < ent->total_mrs - ent->available_mrs) {
+
+	xa_lock_irq(&ent->mkeys);
+	if (target < ent->total_mrs - ent->stored) {
 		err = -EINVAL;
 		goto err_unlock;
 	}
-	target = target - (ent->total_mrs - ent->available_mrs);
+	target = target - (ent->total_mrs - ent->stored);
 	if (target < ent->limit || target > ent->limit*2) {
 		err = -EINVAL;
 		goto err_unlock;
@@ -360,12 +390,12 @@ static ssize_t size_write(struct file *filp, const char __user *buf,
 	err = resize_available_mrs(ent, target, false);
 	if (err)
 		goto err_unlock;
-	spin_unlock_irq(&ent->lock);
+	xa_unlock_irq(&ent->mkeys);
 
 	return count;
 
 err_unlock:
-	spin_unlock_irq(&ent->lock);
+	xa_unlock_irq(&ent->mkeys);
 	return err;
 }
 
@@ -405,10 +435,10 @@ static ssize_t limit_write(struct file *filp, const char __user *buf,
 	 * Upon set we immediately fill the cache to high water mark implied by
 	 * the limit.
 	 */
-	spin_lock_irq(&ent->lock);
+	xa_lock_irq(&ent->mkeys);
 	ent->limit = var;
 	err = resize_available_mrs(ent, 0, true);
-	spin_unlock_irq(&ent->lock);
+	xa_unlock_irq(&ent->mkeys);
 	if (err)
 		return err;
 	return count;
@@ -443,9 +473,9 @@ static bool someone_adding(struct mlx5_mr_cache *cache)
 		struct mlx5_cache_ent *ent = &cache->ent[i];
 		bool ret;
 
-		spin_lock_irq(&ent->lock);
-		ret = ent->available_mrs < ent->limit;
-		spin_unlock_irq(&ent->lock);
+		xa_lock_irq(&ent->mkeys);
+		ret = ent->stored < ent->limit;
+		xa_unlock_irq(&ent->mkeys);
 		if (ret)
 			return true;
 	}
@@ -459,23 +489,21 @@ static bool someone_adding(struct mlx5_mr_cache *cache)
  */
 static void queue_adjust_cache_locked(struct mlx5_cache_ent *ent)
 {
-	lockdep_assert_held(&ent->lock);
-
 	if (ent->disabled || READ_ONCE(ent->dev->fill_delay))
 		return;
-	if (ent->available_mrs < ent->limit) {
+	if (ent->stored < ent->limit) {
 		ent->fill_to_high_water = true;
 		queue_work(ent->dev->cache.wq, &ent->work);
 	} else if (ent->fill_to_high_water &&
-		   ent->available_mrs + ent->pending < 2 * ent->limit) {
+		   ent->reserved < 2 * ent->limit) {
 		/*
 		 * Once we start populating due to hitting a low water mark
 		 * continue until we pass the high water mark.
 		 */
 		queue_work(ent->dev->cache.wq, &ent->work);
-	} else if (ent->available_mrs == 2 * ent->limit) {
+	} else if (ent->stored == 2 * ent->limit) {
 		ent->fill_to_high_water = false;
-	} else if (ent->available_mrs > 2 * ent->limit) {
+	} else if (ent->stored > 2 * ent->limit) {
 		/* Queue deletion of excess entries */
 		ent->fill_to_high_water = false;
 		if (ent->pending)
@@ -492,16 +520,15 @@ static void __cache_work_func(struct mlx5_cache_ent *ent)
 	struct mlx5_mr_cache *cache = &dev->cache;
 	int err;
 
-	spin_lock_irq(&ent->lock);
+	xa_lock_irq(&ent->mkeys);
 	if (ent->disabled)
 		goto out;
 
-	if (ent->fill_to_high_water &&
-	    ent->available_mrs + ent->pending < 2 * ent->limit &&
+	if (ent->fill_to_high_water && ent->reserved < 2 * ent->limit &&
 	    !READ_ONCE(dev->fill_delay)) {
-		spin_unlock_irq(&ent->lock);
+		xa_unlock_irq(&ent->mkeys);
 		err = add_keys(ent, 1);
-		spin_lock_irq(&ent->lock);
+		xa_lock_irq(&ent->mkeys);
 		if (ent->disabled)
 			goto out;
 		if (err) {
@@ -519,7 +546,7 @@ static void __cache_work_func(struct mlx5_cache_ent *ent)
 						   msecs_to_jiffies(1000));
 			}
 		}
-	} else if (ent->available_mrs > 2 * ent->limit) {
+	} else if (ent->stored > 2 * ent->limit) {
 		bool need_delay;
 
 		/*
@@ -534,11 +561,11 @@ static void __cache_work_func(struct mlx5_cache_ent *ent)
 		 * the garbage collection work to try to run in next cycle, in
 		 * order to free CPU resources to other tasks.
 		 */
-		spin_unlock_irq(&ent->lock);
+		xa_unlock_irq(&ent->mkeys);
 		need_delay = need_resched() || someone_adding(cache) ||
 			     !time_after(jiffies,
 					 READ_ONCE(cache->last_add) + 300 * HZ);
-		spin_lock_irq(&ent->lock);
+		xa_lock_irq(&ent->mkeys);
 		if (ent->disabled)
 			goto out;
 		if (need_delay)
@@ -547,7 +574,7 @@ static void __cache_work_func(struct mlx5_cache_ent *ent)
 		queue_adjust_cache_locked(ent);
 	}
 out:
-	spin_unlock_irq(&ent->lock);
+	xa_unlock_irq(&ent->mkeys);
 }
 
 static void delayed_cache_work_func(struct work_struct *work)
@@ -576,22 +603,23 @@ struct mlx5_ib_mr *mlx5_mr_cache_alloc(struct mlx5_ib_dev *dev,
 	if (!mlx5_ib_can_reconfig_with_umr(dev, 0, access_flags))
 		return ERR_PTR(-EOPNOTSUPP);
 
-	spin_lock_irq(&ent->lock);
-	if (list_empty(&ent->head)) {
+	xa_lock_irq(&ent->mkeys);
+	if (!ent->stored) {
 		if (ent->limit) {
 			queue_adjust_cache_locked(ent);
 			ent->miss++;
 		}
-		spin_unlock_irq(&ent->lock);
+		xa_unlock_irq(&ent->mkeys);
 		mr = create_cache_mr(ent);
 		if (IS_ERR(mr))
 			return mr;
 	} else {
-		mr = list_first_entry(&ent->head, struct mlx5_ib_mr, list);
-		list_del(&mr->list);
-		ent->available_mrs--;
+		mr = __xa_store(&ent->mkeys, --ent->stored, NULL,
+				GFP_KERNEL);
+		WARN_ON(mr == NULL || xa_is_err(mr));
+		WARN_ON(__xa_erase(&ent->mkeys, --ent->reserved) != NULL);
 		queue_adjust_cache_locked(ent);
-		spin_unlock_irq(&ent->lock);
+		xa_unlock_irq(&ent->mkeys);
 
 		mlx5_clear_mr(mr);
 	}
@@ -602,38 +630,26 @@ static void mlx5_mr_cache_free(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr)
 {
 	struct mlx5_cache_ent *ent = mr->cache_ent;
 
-	spin_lock_irq(&ent->lock);
-	list_add_tail(&mr->list, &ent->head);
-	ent->available_mrs++;
+	xa_lock_irq(&ent->mkeys);
+	WARN_ON(__xa_store(&ent->mkeys, ent->stored++, mr, 0) != NULL);
 	queue_adjust_cache_locked(ent);
-	spin_unlock_irq(&ent->lock);
+	xa_unlock_irq(&ent->mkeys);
 }
 
 static void clean_keys(struct mlx5_ib_dev *dev, int c)
 {
 	struct mlx5_mr_cache *cache = &dev->cache;
 	struct mlx5_cache_ent *ent = &cache->ent[c];
-	struct mlx5_ib_mr *tmp_mr;
 	struct mlx5_ib_mr *mr;
-	LIST_HEAD(del_list);
+	unsigned long index;
 
 	cancel_delayed_work(&ent->dwork);
-	while (1) {
-		spin_lock_irq(&ent->lock);
-		if (list_empty(&ent->head)) {
-			spin_unlock_irq(&ent->lock);
-			break;
-		}
-		mr = list_first_entry(&ent->head, struct mlx5_ib_mr, list);
-		list_move(&mr->list, &del_list);
-		ent->available_mrs--;
+	xa_for_each(&ent->mkeys, index, mr) {
+		xa_lock_irq(&ent->mkeys);
+		__xa_erase(&ent->mkeys, index);
 		ent->total_mrs--;
-		spin_unlock_irq(&ent->lock);
+		xa_unlock_irq(&ent->mkeys);
 		mlx5_core_destroy_mkey(dev->mdev, mr->mmkey.key);
-	}
-
-	list_for_each_entry_safe(mr, tmp_mr, &del_list, list) {
-		list_del(&mr->list);
 		kfree(mr);
 	}
 }
@@ -665,7 +681,7 @@ static void mlx5_mr_cache_debugfs_init(struct mlx5_ib_dev *dev)
 		dir = debugfs_create_dir(ent->name, cache->root);
 		debugfs_create_file("size", 0600, dir, ent, &size_fops);
 		debugfs_create_file("limit", 0600, dir, ent, &limit_fops);
-		debugfs_create_u32("cur", 0400, dir, &ent->available_mrs);
+		debugfs_create_ulong("cur", 0400, dir, &ent->stored);
 		debugfs_create_u32("miss", 0600, dir, &ent->miss);
 	}
 }
@@ -694,8 +710,7 @@ int mlx5_mr_cache_init(struct mlx5_ib_dev *dev)
 	timer_setup(&dev->delay_timer, delay_time_func, 0);
 	for (i = 0; i < MAX_MR_CACHE_ENTRIES; i++) {
 		ent = &cache->ent[i];
-		INIT_LIST_HEAD(&ent->head);
-		spin_lock_init(&ent->lock);
+		xa_init_flags(&ent->mkeys, XA_FLAGS_ALLOC | XA_FLAGS_LOCK_IRQ);
 		ent->order = i + 2;
 		ent->dev = dev;
 		ent->limit = 0;
@@ -721,9 +736,9 @@ int mlx5_mr_cache_init(struct mlx5_ib_dev *dev)
 			ent->limit = dev->mdev->profile.mr_cache[i].limit;
 		else
 			ent->limit = 0;
-		spin_lock_irq(&ent->lock);
+		xa_lock_irq(&ent->mkeys);
 		queue_adjust_cache_locked(ent);
-		spin_unlock_irq(&ent->lock);
+		xa_unlock_irq(&ent->mkeys);
 	}
 
 	mlx5_mr_cache_debugfs_init(dev);
@@ -741,9 +756,9 @@ int mlx5_mr_cache_cleanup(struct mlx5_ib_dev *dev)
 	for (i = 0; i < MAX_MR_CACHE_ENTRIES; i++) {
 		struct mlx5_cache_ent *ent = &dev->cache.ent[i];
 
-		spin_lock_irq(&ent->lock);
+		xa_lock_irq(&ent->mkeys);
 		ent->disabled = true;
-		spin_unlock_irq(&ent->lock);
+		xa_unlock_irq(&ent->mkeys);
 		cancel_work_sync(&ent->work);
 		cancel_delayed_work_sync(&ent->dwork);
 	}
@@ -1886,6 +1901,17 @@ mlx5_free_priv_descs(struct mlx5_ib_mr *mr)
 	}
 }
 
+static int push_reserve_mkey(struct mlx5_cache_ent *ent)
+{
+	int ret;
+
+	xa_lock_irq(&ent->mkeys);
+	ret = _push_reserve_mkey(ent);
+	xa_unlock_irq(&ent->mkeys);
+
+	return ret;
+}
+
 int mlx5_ib_dereg_mr(struct ib_mr *ibmr, struct ib_udata *udata)
 {
 	struct mlx5_ib_mr *mr = to_mmr(ibmr);
@@ -1932,10 +1958,10 @@ int mlx5_ib_dereg_mr(struct ib_mr *ibmr, struct ib_udata *udata)
 
 	/* Stop DMA */
 	if (mr->cache_ent) {
-		if (revoke_mr(mr)) {
-			spin_lock_irq(&mr->cache_ent->lock);
+		if (revoke_mr(mr) || push_reserve_mkey(mr->cache_ent)) {
+			xa_lock_irq(&mr->cache_ent->mkeys);
 			mr->cache_ent->total_mrs--;
-			spin_unlock_irq(&mr->cache_ent->lock);
+			xa_unlock_irq(&mr->cache_ent->mkeys);
 			mr->cache_ent = NULL;
 		}
 	}
-- 
2.33.1



* [PATCH rdma-next 3/7] RDMA/mlx5: Store in the cache mkeys instead of mrs
  2021-12-06  9:10 [PATCH rdma-next 0/7] MR cache enhancement Leon Romanovsky
  2021-12-06  9:10 ` [PATCH rdma-next 1/7] RDMA/mlx5: Merge similar flows of allocating MR from the cache Leon Romanovsky
  2021-12-06  9:10 ` [PATCH rdma-next 2/7] RDMA/mlx5: Replace cache list with Xarray Leon Romanovsky
@ 2021-12-06  9:10 ` Leon Romanovsky
  2021-12-08 20:44   ` Jason Gunthorpe
  2021-12-06  9:10 ` [PATCH rdma-next 4/7] RDMA/mlx5: Change the cache structure to an RB-tree Leon Romanovsky
                   ` (4 subsequent siblings)
  7 siblings, 1 reply; 15+ messages in thread
From: Leon Romanovsky @ 2021-12-06  9:10 UTC (permalink / raw)
  To: Jason Gunthorpe; +Cc: Aharon Landau, linux-kernel, linux-rdma

From: Aharon Landau <aharonl@nvidia.com>

Currently, the driver stores the entire mlx5_ib_mr struct in the cache,
although the only use of the cached MR is the mkey. Store only the mkey
in the cache.
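
As a small illustration of why no allocation is needed per cached
entry, a 32-bit mkey fits in an XArray value entry.  The helpers below
are hypothetical and only for illustration; the driver open-codes this
together with its stored/reserved bookkeeping:

#include <linux/xarray.h>

/* Pack a u32 mkey into a value entry: no per-entry kmalloc needed. */
static void demo_store_mkey(struct xarray *xa, unsigned long index, u32 mkey)
{
	xa_store(xa, index, xa_mk_value(mkey), GFP_KERNEL);
}

static u32 demo_load_mkey(struct xarray *xa, unsigned long index)
{
	void *entry = xa_load(xa, index);

	WARN_ON(!xa_is_value(entry));
	return (u32)xa_to_value(entry);
}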

Signed-off-by: Aharon Landau <aharonl@nvidia.com>
Signed-off-by: Leon Romanovsky <leonro@nvidia.com>
---
 drivers/infiniband/hw/mlx5/mlx5_ib.h |  21 ++---
 drivers/infiniband/hw/mlx5/mr.c      | 135 +++++++++++++--------------
 2 files changed, 71 insertions(+), 85 deletions(-)

diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h b/drivers/infiniband/hw/mlx5/mlx5_ib.h
index d0224f468169..9b12e970ca01 100644
--- a/drivers/infiniband/hw/mlx5/mlx5_ib.h
+++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h
@@ -668,14 +668,6 @@ struct mlx5_ib_mr {
 
 	/* This is zero'd when the MR is allocated */
 	union {
-		/* Used only while the MR is in the cache */
-		struct {
-			u32 out[MLX5_ST_SZ_DW(create_mkey_out)];
-			struct mlx5_async_work cb_work;
-			/* Cache list element */
-			struct list_head list;
-		};
-
 		/* Used only by kernel MRs (umem == NULL) */
 		struct {
 			void *descs;
@@ -715,12 +707,6 @@ struct mlx5_ib_mr {
 	};
 };
 
-/* Zero the fields in the mr that are variant depending on usage */
-static inline void mlx5_clear_mr(struct mlx5_ib_mr *mr)
-{
-	memset(mr->out, 0, sizeof(*mr) - offsetof(struct mlx5_ib_mr, out));
-}
-
 static inline bool is_odp_mr(struct mlx5_ib_mr *mr)
 {
 	return IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING) && mr->umem &&
@@ -786,6 +772,13 @@ struct mlx5_cache_ent {
 	struct delayed_work	dwork;
 };
 
+struct mlx5_async_create_mkey {
+	u32 out[MLX5_ST_SZ_DW(create_mkey_out)];
+	struct mlx5_async_work cb_work;
+	struct mlx5_cache_ent *ent;
+	u32 mkey;
+};
+
 struct mlx5_mr_cache {
 	struct workqueue_struct *wq;
 	struct mlx5_cache_ent	ent[MAX_MR_CACHE_ENTRIES];
diff --git a/drivers/infiniband/hw/mlx5/mr.c b/drivers/infiniband/hw/mlx5/mr.c
index e62f8da8558d..e64f6466f13d 100644
--- a/drivers/infiniband/hw/mlx5/mr.c
+++ b/drivers/infiniband/hw/mlx5/mr.c
@@ -88,15 +88,14 @@ static void set_mkc_access_pd_addr_fields(void *mkc, int acc, u64 start_addr,
 	MLX5_SET64(mkc, mkc, start_addr, start_addr);
 }
 
-static void assign_mkey_variant(struct mlx5_ib_dev *dev,
-				struct mlx5_ib_mkey *mkey, u32 *in)
+static void assign_mkey_variant(struct mlx5_ib_dev *dev, u32 *mkey, u32 *in)
 {
 	u8 key = atomic_inc_return(&dev->mkey_var);
 	void *mkc;
 
 	mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry);
 	MLX5_SET(mkc, mkc, mkey_7_0, key);
-	mkey->key = key;
+	*mkey = key;
 }
 
 static int mlx5_ib_create_mkey(struct mlx5_ib_dev *dev,
@@ -104,7 +103,7 @@ static int mlx5_ib_create_mkey(struct mlx5_ib_dev *dev,
 {
 	int ret;
 
-	assign_mkey_variant(dev, mkey, in);
+	assign_mkey_variant(dev, &mkey->key, in);
 	ret = mlx5_core_create_mkey(dev->mdev, &mkey->key, in, inlen);
 	if (!ret)
 		init_waitqueue_head(&mkey->wait);
@@ -113,8 +112,7 @@ static int mlx5_ib_create_mkey(struct mlx5_ib_dev *dev,
 }
 
 static int
-mlx5_ib_create_mkey_cb(struct mlx5_ib_dev *dev,
-		       struct mlx5_ib_mkey *mkey,
+mlx5_ib_create_mkey_cb(struct mlx5_ib_dev *dev, u32 *mkey,
 		       struct mlx5_async_ctx *async_ctx,
 		       u32 *in, int inlen, u32 *out, int outlen,
 		       struct mlx5_async_work *context)
@@ -142,16 +140,16 @@ static int destroy_mkey(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr)
 
 static void create_mkey_callback(int status, struct mlx5_async_work *context)
 {
-	struct mlx5_ib_mr *mr =
-		container_of(context, struct mlx5_ib_mr, cb_work);
-	struct mlx5_cache_ent *ent = mr->cache_ent;
+	struct mlx5_async_create_mkey *mkey_out =
+		container_of(context, struct mlx5_async_create_mkey, cb_work);
+	struct mlx5_cache_ent *ent = mkey_out->ent;
 	struct mlx5_ib_dev *dev = ent->dev;
 	unsigned long flags;
 	void *xa_ent;
 
 	if (status) {
 		mlx5_ib_warn(dev, "async reg mr failed. status %d\n", status);
-		kfree(mr);
+		kfree(mkey_out);
 		xa_lock_irqsave(&ent->mkeys, flags);
 		xa_ent = __xa_erase(&ent->mkeys, ent->reserved--);
 		WARN_ON(xa_ent != NULL);
@@ -162,32 +160,24 @@ static void create_mkey_callback(int status, struct mlx5_async_work *context)
 		return;
 	}
 
-	mr->mmkey.type = MLX5_MKEY_MR;
-	mr->mmkey.key |= mlx5_idx_to_mkey(
-		MLX5_GET(create_mkey_out, mr->out, mkey_index));
-	init_waitqueue_head(&mr->mmkey.wait);
-
+	mkey_out->mkey |= mlx5_idx_to_mkey(
+		MLX5_GET(create_mkey_out, mkey_out->out, mkey_index));
 	WRITE_ONCE(dev->cache.last_add, jiffies);
 
 	xa_lock_irqsave(&ent->mkeys, flags);
-	xa_ent = __xa_store(&ent->mkeys, ent->stored++, mr, GFP_ATOMIC);
+	xa_ent = __xa_store(&ent->mkeys, ent->stored++,
+			    xa_mk_value(mkey_out->mkey), GFP_ATOMIC);
 	WARN_ON(xa_ent != NULL);
 	ent->pending--;
 	ent->total_mrs++;
 	/* If we are doing fill_to_high_water then keep going. */
 	queue_adjust_cache_locked(ent);
 	xa_unlock_irqrestore(&ent->mkeys, flags);
+	kfree(mkey_out);
 }
 
-static struct mlx5_ib_mr *alloc_cache_mr(struct mlx5_cache_ent *ent, void *mkc)
+static void set_cache_mkc(struct mlx5_cache_ent *ent, void *mkc)
 {
-	struct mlx5_ib_mr *mr;
-
-	mr = kzalloc(sizeof(*mr), GFP_KERNEL);
-	if (!mr)
-		return NULL;
-	mr->cache_ent = ent;
-
 	set_mkc_access_pd_addr_fields(mkc, 0, 0, ent->dev->umrc.pd);
 	MLX5_SET(mkc, mkc, free, 1);
 	MLX5_SET(mkc, mkc, umr_en, 1);
@@ -196,7 +186,7 @@ static struct mlx5_ib_mr *alloc_cache_mr(struct mlx5_cache_ent *ent, void *mkc)
 
 	MLX5_SET(mkc, mkc, translations_octword_size, ent->xlt);
 	MLX5_SET(mkc, mkc, log_page_size, ent->page);
-	return mr;
+	return;
 }
 
 static int _push_reserve_mkey(struct mlx5_cache_ent *ent)
@@ -222,7 +212,7 @@ static int _push_reserve_mkey(struct mlx5_cache_ent *ent)
 static int add_keys(struct mlx5_cache_ent *ent, unsigned int num)
 {
 	size_t inlen = MLX5_ST_SZ_BYTES(create_mkey_in);
-	struct mlx5_ib_mr *mr;
+	struct mlx5_async_create_mkey *async_out;
 	void *mkc;
 	u32 *in;
 	int err = 0;
@@ -233,32 +223,37 @@ static int add_keys(struct mlx5_cache_ent *ent, unsigned int num)
 		return -ENOMEM;
 
 	mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry);
+	set_cache_mkc(ent, mkc);
 	for (i = 0; i < num; i++) {
-		mr = alloc_cache_mr(ent, mkc);
-		if (!mr) {
+		async_out = kzalloc(sizeof(struct mlx5_async_create_mkey),
+				    GFP_KERNEL);
+		if (!async_out) {
 			err = -ENOMEM;
 			break;
 		}
+		async_out->ent = ent;
+
 		xa_lock_irq(&ent->mkeys);
 		if (ent->pending >= MAX_PENDING_REG_MR) {
 			xa_unlock_irq(&ent->mkeys);
 			err = -EAGAIN;
-			kfree(mr);
+			kfree(async_out);
 			break;
 		}
 
 		err = _push_reserve_mkey(ent);
 		if (err) {
 			xa_unlock_irq(&ent->mkeys);
-			kfree(mr);
+			kfree(async_out);
 			break;
 		}
 		ent->pending++;
 		xa_unlock_irq(&ent->mkeys);
-		err = mlx5_ib_create_mkey_cb(ent->dev, &mr->mmkey,
+		err = mlx5_ib_create_mkey_cb(ent->dev, &async_out->mkey,
 					     &ent->dev->async_ctx, in, inlen,
-					     mr->out, sizeof(mr->out),
-					     &mr->cb_work);
+					     async_out->out,
+					     sizeof(async_out->out),
+					     &async_out->cb_work);
 		if (err) {
 			xa_lock_irq(&ent->mkeys);
 			WARN_ON(__xa_erase(&ent->mkeys, ent->reserved--) !=
@@ -266,7 +261,7 @@ static int add_keys(struct mlx5_cache_ent *ent, unsigned int num)
 			ent->pending--;
 			xa_unlock_irq(&ent->mkeys);
 			mlx5_ib_warn(ent->dev, "create mkey failed %d\n", err);
-			kfree(mr);
+			kfree(async_out);
 			break;
 		}
 	}
@@ -276,57 +271,44 @@ static int add_keys(struct mlx5_cache_ent *ent, unsigned int num)
 }
 
 /* Synchronously create a MR in the cache */
-static struct mlx5_ib_mr *create_cache_mr(struct mlx5_cache_ent *ent)
+static int create_cache_mkey(struct mlx5_cache_ent *ent, u32 *mkey)
 {
 	size_t inlen = MLX5_ST_SZ_BYTES(create_mkey_in);
-	struct mlx5_ib_mr *mr;
 	void *mkc;
 	u32 *in;
 	int err;
 
 	in = kzalloc(inlen, GFP_KERNEL);
 	if (!in)
-		return ERR_PTR(-ENOMEM);
+		return -ENOMEM;
 	mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry);
+	set_cache_mkc(ent, mkc);
 
-	mr = alloc_cache_mr(ent, mkc);
-	if (!mr) {
-		err = -ENOMEM;
-		goto free_in;
-	}
-
-	err = mlx5_core_create_mkey(ent->dev->mdev, &mr->mmkey.key, in, inlen);
+	err = mlx5_core_create_mkey(ent->dev->mdev, mkey, in, inlen);
 	if (err)
-		goto free_mr;
+		goto free_in;
 
-	init_waitqueue_head(&mr->mmkey.wait);
-	mr->mmkey.type = MLX5_MKEY_MR;
 	WRITE_ONCE(ent->dev->cache.last_add, jiffies);
 	xa_lock_irq(&ent->mkeys);
 	ent->total_mrs++;
 	xa_unlock_irq(&ent->mkeys);
-	kfree(in);
-	return mr;
-free_mr:
-	kfree(mr);
 free_in:
 	kfree(in);
-	return ERR_PTR(err);
+	return err;
 }
 
 static void remove_cache_mr_locked(struct mlx5_cache_ent *ent)
 {
-	struct mlx5_ib_mr *mr;
+	void *xa_ent;
 
 	if (!ent->stored)
 		return;
-	mr = __xa_store(&ent->mkeys, --ent->stored, NULL, GFP_KERNEL);
-	WARN_ON(mr == NULL || xa_is_err(mr));
+	xa_ent = __xa_store(&ent->mkeys, --ent->stored, NULL, GFP_KERNEL);
+	WARN_ON(xa_ent == NULL || xa_is_err(xa_ent));
 	WARN_ON(__xa_erase(&ent->mkeys, --ent->reserved) != NULL);
 	ent->total_mrs--;
 	xa_unlock_irq(&ent->mkeys);
-	mlx5_core_destroy_mkey(ent->dev->mdev, mr->mmkey.key);
-	kfree(mr);
+	mlx5_core_destroy_mkey(ent->dev->mdev, (u32)xa_to_value(xa_ent));
 	xa_lock_irq(&ent->mkeys);
 }
 
@@ -598,11 +580,16 @@ struct mlx5_ib_mr *mlx5_mr_cache_alloc(struct mlx5_ib_dev *dev,
 				       int access_flags)
 {
 	struct mlx5_ib_mr *mr;
+	void *xa_ent;
+	int err;
 
-	/* Matches access in alloc_cache_mr() */
 	if (!mlx5_ib_can_reconfig_with_umr(dev, 0, access_flags))
 		return ERR_PTR(-EOPNOTSUPP);
 
+	mr = kzalloc(sizeof(*mr), GFP_KERNEL);
+	if (!mr)
+		return ERR_PTR(-ENOMEM);
+
 	xa_lock_irq(&ent->mkeys);
 	if (!ent->stored) {
 		if (ent->limit) {
@@ -610,9 +597,11 @@ struct mlx5_ib_mr *mlx5_mr_cache_alloc(struct mlx5_ib_dev *dev,
 			ent->miss++;
 		}
 		xa_unlock_irq(&ent->mkeys);
-		mr = create_cache_mr(ent);
-		if (IS_ERR(mr))
-			return mr;
+		err = create_cache_mkey(ent, &mr->mmkey.key);
+		if (err) {
+			kfree(mr);
+			return ERR_PTR(err);
+		}
 	} else {
 		mr = __xa_store(&ent->mkeys, --ent->stored, NULL,
 				GFP_KERNEL);
@@ -621,9 +610,13 @@ struct mlx5_ib_mr *mlx5_mr_cache_alloc(struct mlx5_ib_dev *dev,
 		queue_adjust_cache_locked(ent);
 		xa_unlock_irq(&ent->mkeys);
 
-		mlx5_clear_mr(mr);
+		mr->mmkey.key = (u32)xa_to_value(xa_ent);
 	}
+	mr->cache_ent = ent;
+	mr->mmkey.type = MLX5_MKEY_MR;
+	init_waitqueue_head(&mr->mmkey.wait);
 	return mr;
+
 }
 
 static void mlx5_mr_cache_free(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr)
@@ -631,7 +624,8 @@ static void mlx5_mr_cache_free(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr)
 	struct mlx5_cache_ent *ent = mr->cache_ent;
 
 	xa_lock_irq(&ent->mkeys);
-	WARN_ON(__xa_store(&ent->mkeys, ent->stored++, mr, 0) != NULL);
+	WARN_ON(__xa_store(&ent->mkeys, ent->stored++,
+			   xa_mk_value(mr->mmkey.key), 0) != NULL);
 	queue_adjust_cache_locked(ent);
 	xa_unlock_irq(&ent->mkeys);
 }
@@ -640,17 +634,16 @@ static void clean_keys(struct mlx5_ib_dev *dev, int c)
 {
 	struct mlx5_mr_cache *cache = &dev->cache;
 	struct mlx5_cache_ent *ent = &cache->ent[c];
-	struct mlx5_ib_mr *mr;
 	unsigned long index;
+	void *entry;
 
 	cancel_delayed_work(&ent->dwork);
-	xa_for_each(&ent->mkeys, index, mr) {
+	xa_for_each(&ent->mkeys, index, entry) {
 		xa_lock_irq(&ent->mkeys);
 		__xa_erase(&ent->mkeys, index);
 		ent->total_mrs--;
 		xa_unlock_irq(&ent->mkeys);
-		mlx5_core_destroy_mkey(dev->mdev, mr->mmkey.key);
-		kfree(mr);
+		mlx5_core_destroy_mkey(dev->mdev, (u32)xa_to_value(entry));
 	}
 }
 
@@ -1982,12 +1975,12 @@ int mlx5_ib_dereg_mr(struct ib_mr *ibmr, struct ib_udata *udata)
 			mlx5_ib_free_odp_mr(mr);
 	}
 
-	if (mr->cache_ent) {
+	if (mr->cache_ent)
 		mlx5_mr_cache_free(dev, mr);
-	} else {
+	else
 		mlx5_free_priv_descs(mr);
-		kfree(mr);
-	}
+
+	kfree(mr);
 	return 0;
 }
 
-- 
2.33.1



* [PATCH rdma-next 4/7] RDMA/mlx5: Change the cache structure to an RB-tree
  2021-12-06  9:10 [PATCH rdma-next 0/7] MR cache enhancement Leon Romanovsky
                   ` (2 preceding siblings ...)
  2021-12-06  9:10 ` [PATCH rdma-next 3/7] RDMA/mlx5: Store in the cache mkeys instead of mrs Leon Romanovsky
@ 2021-12-06  9:10 ` Leon Romanovsky
  2021-12-08 20:58   ` Jason Gunthorpe
  2021-12-06  9:10 ` [PATCH rdma-next 5/7] RDMA/mlx5: Reorder calls to pcie_relaxed_ordering_enabled() Leon Romanovsky
                   ` (3 subsequent siblings)
  7 siblings, 1 reply; 15+ messages in thread
From: Leon Romanovsky @ 2021-12-06  9:10 UTC (permalink / raw)
  To: Jason Gunthorpe; +Cc: Aharon Landau, linux-kernel, linux-rdma

From: Aharon Landau <aharonl@nvidia.com>

Currently, the cache structure is a linear array held within
mlx5_ib_dev, so the number of entries is limited.

The existing entries are dedicated to mkeys of size 2^x with no
access_flags; later in the series we allow caching mkeys with
different attributes.

In this patch, change the cache structure to an RB-tree of Xarrays of
mkeys. The tree key is the mkc used to create the stored mkeys.
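
To make the lookup concrete, a cache query builds the mkc that would
be used to create the mkey and walks the tree comparing it
byte-for-byte.  lookup_ent() below is a hypothetical wrapper around
the helpers added in this patch (the driver does the equivalent split
between alloc_cacheable_mr() and mlx5_mr_cache_alloc()):

static struct mlx5_cache_ent *lookup_ent(struct mlx5_ib_dev *dev,
					 int access_flags,
					 unsigned int access_mode,
					 unsigned int ndescs)
{
	u8 mkc[MLX5_ST_SZ_BYTES(mkc)] = {};
	struct mlx5_cache_ent *ent;

	/* Cache entries hold power-of-two sizes, minimum 4 descriptors */
	ndescs = max_t(unsigned int, 4, roundup_pow_of_two(ndescs));

	/* Build the mkc exactly as it would be passed to create_mkey */
	mlx5_set_cache_mkc(dev, mkc,
			   mlx5_acc_flags_to_ent_flags(dev, access_flags),
			   access_mode, PAGE_SHIFT);
	MLX5_SET(mkc, mkc, translations_octword_size,
		 get_mkc_octo_size(access_mode, ndescs));

	/* The RB-tree is ordered by memcmp() over the whole mkc */
	mutex_lock(&dev->cache.cache_lock);
	ent = ent_search(&dev->cache, mkc);
	mutex_unlock(&dev->cache.cache_lock);

	/* NULL means no entry with these attributes is cached yet */
	return ent;
}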

Signed-off-by: Aharon Landau <aharonl@nvidia.com>
Signed-off-by: Leon Romanovsky <leonro@nvidia.com>
---
 drivers/infiniband/hw/mlx5/mlx5_ib.h |  28 +-
 drivers/infiniband/hw/mlx5/mr.c      | 435 +++++++++++++++++----------
 drivers/infiniband/hw/mlx5/odp.c     |  71 +++--
 include/linux/mlx5/driver.h          |   5 +-
 4 files changed, 340 insertions(+), 199 deletions(-)

diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h b/drivers/infiniband/hw/mlx5/mlx5_ib.h
index 9b12e970ca01..202d8fbc423d 100644
--- a/drivers/infiniband/hw/mlx5/mlx5_ib.h
+++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h
@@ -745,10 +745,7 @@ struct mlx5_cache_ent {
 	unsigned long		reserved;
 
 	char                    name[4];
-	u32                     order;
-	u32			xlt;
-	u32			access_mode;
-	u32			page;
+	int			ndescs;
 
 	u8 disabled:1;
 	u8 fill_to_high_water:1;
@@ -770,6 +767,9 @@ struct mlx5_cache_ent {
 	struct mlx5_ib_dev     *dev;
 	struct work_struct	work;
 	struct delayed_work	dwork;
+
+	struct rb_node		node;
+	void			*mkc;
 };
 
 struct mlx5_async_create_mkey {
@@ -781,9 +781,11 @@ struct mlx5_async_create_mkey {
 
 struct mlx5_mr_cache {
 	struct workqueue_struct *wq;
-	struct mlx5_cache_ent	ent[MAX_MR_CACHE_ENTRIES];
+	struct rb_root		cache_root;
+	struct mutex		cache_lock;
 	struct dentry		*root;
 	unsigned long		last_add;
+	bool			maintained_cache;
 };
 
 struct mlx5_ib_port_resources {
@@ -1330,9 +1332,12 @@ int mlx5_ib_get_cqe_size(struct ib_cq *ibcq);
 int mlx5_mr_cache_init(struct mlx5_ib_dev *dev);
 int mlx5_mr_cache_cleanup(struct mlx5_ib_dev *dev);
 
-struct mlx5_ib_mr *mlx5_mr_cache_alloc(struct mlx5_ib_dev *dev,
-				       struct mlx5_cache_ent *ent,
-				       int access_flags);
+int mlx5_acc_flags_to_ent_flags(struct mlx5_ib_dev *dev, int access_flags);
+void mlx5_set_cache_mkc(struct mlx5_ib_dev *dev, void *mkc, int access_flags,
+			unsigned int access_mode, unsigned int page_shift);
+struct mlx5_ib_mr *mlx5_mr_cache_alloc(struct mlx5_ib_dev *dev, int *in,
+				       int inlen, unsigned int ndescs,
+				       unsigned int access_mode, bool force);
 
 int mlx5_ib_check_mr_status(struct ib_mr *ibmr, u32 check_mask,
 			    struct ib_mr_status *mr_status);
@@ -1356,7 +1361,7 @@ int mlx5r_odp_create_eq(struct mlx5_ib_dev *dev, struct mlx5_ib_pf_eq *eq);
 void mlx5_ib_odp_cleanup_one(struct mlx5_ib_dev *ibdev);
 int __init mlx5_ib_odp_init(void);
 void mlx5_ib_odp_cleanup(void);
-void mlx5_odp_init_mr_cache_entry(struct mlx5_cache_ent *ent);
+int mlx5_odp_init_mr_cache_entry(struct mlx5_ib_dev *dev);
 void mlx5_odp_populate_xlt(void *xlt, size_t idx, size_t nentries,
 			   struct mlx5_ib_mr *mr, int flags);
 
@@ -1375,7 +1380,10 @@ static inline int mlx5r_odp_create_eq(struct mlx5_ib_dev *dev,
 static inline void mlx5_ib_odp_cleanup_one(struct mlx5_ib_dev *ibdev) {}
 static inline int mlx5_ib_odp_init(void) { return 0; }
 static inline void mlx5_ib_odp_cleanup(void)				    {}
-static inline void mlx5_odp_init_mr_cache_entry(struct mlx5_cache_ent *ent) {}
+static inline int mlx5_odp_init_mr_cache_entry(struct mlx5_ib_dev *dev)
+{
+	return 0;
+}
 static inline void mlx5_odp_populate_xlt(void *xlt, size_t idx, size_t nentries,
 					 struct mlx5_ib_mr *mr, int flags) {}
 
diff --git a/drivers/infiniband/hw/mlx5/mr.c b/drivers/infiniband/hw/mlx5/mr.c
index e64f6466f13d..6000acbedc73 100644
--- a/drivers/infiniband/hw/mlx5/mr.c
+++ b/drivers/infiniband/hw/mlx5/mr.c
@@ -176,16 +176,16 @@ static void create_mkey_callback(int status, struct mlx5_async_work *context)
 	kfree(mkey_out);
 }
 
-static void set_cache_mkc(struct mlx5_cache_ent *ent, void *mkc)
+void mlx5_set_cache_mkc(struct mlx5_ib_dev *dev, void *mkc, int access_flags,
+			unsigned int access_mode, unsigned int page_shift)
 {
-	set_mkc_access_pd_addr_fields(mkc, 0, 0, ent->dev->umrc.pd);
+	set_mkc_access_pd_addr_fields(mkc, access_flags, 0, dev->umrc.pd);
 	MLX5_SET(mkc, mkc, free, 1);
 	MLX5_SET(mkc, mkc, umr_en, 1);
-	MLX5_SET(mkc, mkc, access_mode_1_0, ent->access_mode & 0x3);
-	MLX5_SET(mkc, mkc, access_mode_4_2, (ent->access_mode >> 2) & 0x7);
+	MLX5_SET(mkc, mkc, access_mode_1_0, access_mode & 0x3);
+	MLX5_SET(mkc, mkc, access_mode_4_2, (access_mode >> 2) & 0x7);
 
-	MLX5_SET(mkc, mkc, translations_octword_size, ent->xlt);
-	MLX5_SET(mkc, mkc, log_page_size, ent->page);
+	MLX5_SET(mkc, mkc, log_page_size, page_shift);
 	return;
 }
 
@@ -223,7 +223,7 @@ static int add_keys(struct mlx5_cache_ent *ent, unsigned int num)
 		return -ENOMEM;
 
 	mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry);
-	set_cache_mkc(ent, mkc);
+	memcpy(mkc, ent->mkc, MLX5_ST_SZ_BYTES(mkc));
 	for (i = 0; i < num; i++) {
 		async_out = kzalloc(sizeof(struct mlx5_async_create_mkey),
 				    GFP_KERNEL);
@@ -270,33 +270,6 @@ static int add_keys(struct mlx5_cache_ent *ent, unsigned int num)
 	return err;
 }
 
-/* Synchronously create a MR in the cache */
-static int create_cache_mkey(struct mlx5_cache_ent *ent, u32 *mkey)
-{
-	size_t inlen = MLX5_ST_SZ_BYTES(create_mkey_in);
-	void *mkc;
-	u32 *in;
-	int err;
-
-	in = kzalloc(inlen, GFP_KERNEL);
-	if (!in)
-		return -ENOMEM;
-	mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry);
-	set_cache_mkc(ent, mkc);
-
-	err = mlx5_core_create_mkey(ent->dev->mdev, mkey, in, inlen);
-	if (err)
-		goto free_in;
-
-	WRITE_ONCE(ent->dev->cache.last_add, jiffies);
-	xa_lock_irq(&ent->mkeys);
-	ent->total_mrs++;
-	xa_unlock_irq(&ent->mkeys);
-free_in:
-	kfree(in);
-	return err;
-}
-
 static void remove_cache_mr_locked(struct mlx5_cache_ent *ent)
 {
 	void *xa_ent;
@@ -423,6 +396,7 @@ static ssize_t limit_write(struct file *filp, const char __user *buf,
 	xa_unlock_irq(&ent->mkeys);
 	if (err)
 		return err;
+	ent->dev->cache.maintained_cache = true;
 	return count;
 }
 
@@ -449,18 +423,22 @@ static const struct file_operations limit_fops = {
 
 static bool someone_adding(struct mlx5_mr_cache *cache)
 {
-	unsigned int i;
-
-	for (i = 0; i < MAX_MR_CACHE_ENTRIES; i++) {
-		struct mlx5_cache_ent *ent = &cache->ent[i];
-		bool ret;
+	struct mlx5_cache_ent *ent;
+	struct rb_node *node;
+	bool ret;
 
+	mutex_lock(&cache->cache_lock);
+	for (node = rb_first(&cache->cache_root); node; node = rb_next(node)) {
+		ent = container_of(node, struct mlx5_cache_ent, node);
 		xa_lock_irq(&ent->mkeys);
 		ret = ent->stored < ent->limit;
 		xa_unlock_irq(&ent->mkeys);
-		if (ret)
+		if (ret) {
+			mutex_unlock(&cache->cache_lock);
 			return true;
+		}
 	}
+	mutex_unlock(&cache->cache_lock);
 	return false;
 }
 
@@ -522,8 +500,8 @@ static void __cache_work_func(struct mlx5_cache_ent *ent)
 			if (err != -EAGAIN) {
 				mlx5_ib_warn(
 					dev,
-					"command failed order %d, err %d\n",
-					ent->order, err);
+					"command failed order %s, err %d\n",
+					ent->name, err);
 				queue_delayed_work(cache->wq, &ent->dwork,
 						   msecs_to_jiffies(1000));
 			}
@@ -575,48 +553,116 @@ static void cache_work_func(struct work_struct *work)
 	__cache_work_func(ent);
 }
 
-struct mlx5_ib_mr *mlx5_mr_cache_alloc(struct mlx5_ib_dev *dev,
-				       struct mlx5_cache_ent *ent,
-				       int access_flags)
+static struct mlx5_cache_ent *ent_search(struct mlx5_mr_cache *cache, void *mkc)
+{
+	struct rb_node *node = cache->cache_root.rb_node;
+	int size = MLX5_ST_SZ_BYTES(mkc);
+	struct mlx5_cache_ent *cur;
+	int cmp;
+
+	while (node) {
+		cur = container_of(node, struct mlx5_cache_ent, node);
+		cmp = memcmp(mkc, cur->mkc, size);
+
+		if (cmp < 0)
+			node = node->rb_left;
+		else if (cmp > 0)
+			node = node->rb_right;
+		else
+			return cur;
+	}
+	return NULL;
+}
+
+static int get_mkc_octo_size(unsigned int access_mode, unsigned int ndescs)
+{
+	if (access_mode == MLX5_MKC_ACCESS_MODE_MTT)
+		return DIV_ROUND_UP(ndescs, MLX5_IB_UMR_OCTOWORD /
+						    sizeof(struct mlx5_mtt));
+	else if (access_mode == MLX5_MKC_ACCESS_MODE_KSM)
+		return DIV_ROUND_UP(ndescs, MLX5_IB_UMR_OCTOWORD /
+						    sizeof(struct mlx5_klm));
+	else {
+		WARN_ON(1);
+		return 0;
+	}
+}
+
+struct mlx5_ib_mr *mlx5_mr_cache_alloc(struct mlx5_ib_dev *dev, int *in,
+				       int inlen, unsigned int ndescs,
+				       unsigned int access_mode, bool force)
 {
+	struct mlx5_cache_ent *ent = NULL;
 	struct mlx5_ib_mr *mr;
 	void *xa_ent;
+	void *mkc;
 	int err;
 
-	if (!mlx5_ib_can_reconfig_with_umr(dev, 0, access_flags))
-		return ERR_PTR(-EOPNOTSUPP);
-
 	mr = kzalloc(sizeof(*mr), GFP_KERNEL);
 	if (!mr)
 		return ERR_PTR(-ENOMEM);
 
-	xa_lock_irq(&ent->mkeys);
-	if (!ent->stored) {
-		if (ent->limit) {
+	mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry);
+	if (dev->cache.maintained_cache && !force) {
+		int order;
+
+		/*
+		 * Try to get an mkey from pool.
+		 */
+		order = order_base_2(ndescs) > 2 ? order_base_2(ndescs) : 2;
+		MLX5_SET(mkc, mkc, translations_octword_size,
+			 get_mkc_octo_size(access_mode, 1 << order));
+		mutex_lock(&dev->cache.cache_lock);
+		ent = ent_search(&dev->cache, mkc);
+		mutex_unlock(&dev->cache.cache_lock);
+	}
+
+	if (ent && (ent->limit || force)) {
+		xa_lock_irq(&ent->mkeys);
+		if (!ent->stored) {
+			if (ent->limit) {
+				queue_adjust_cache_locked(ent);
+				ent->miss++;
+			}
+			xa_unlock_irq(&ent->mkeys);
+
+			err = mlx5_ib_create_mkey(dev, &mr->mmkey, in, inlen);
+			if (err)
+				goto err;
+
+			WRITE_ONCE(ent->dev->cache.last_add, jiffies);
+			xa_lock_irq(&ent->mkeys);
+			ent->total_mrs++;
+			xa_unlock_irq(&ent->mkeys);
+		} else {
+			xa_ent = __xa_store(&ent->mkeys, --ent->stored,
+					  NULL, GFP_KERNEL);
+			WARN_ON(xa_ent == NULL || xa_is_err(xa_ent));
+			WARN_ON(__xa_erase(&ent->mkeys, --ent->reserved) !=
+				NULL);
 			queue_adjust_cache_locked(ent);
-			ent->miss++;
-		}
-		xa_unlock_irq(&ent->mkeys);
-		err = create_cache_mkey(ent, &mr->mmkey.key);
-		if (err) {
-			kfree(mr);
-			return ERR_PTR(err);
+			xa_unlock_irq(&ent->mkeys);
+			mr->mmkey.key = (u32)xa_to_value(xa_ent);
 		}
+		mr->cache_ent = ent;
 	} else {
-		mr = __xa_store(&ent->mkeys, --ent->stored, NULL,
-				GFP_KERNEL);
-		WARN_ON(mr == NULL || xa_is_err(mr));
-		WARN_ON(__xa_erase(&ent->mkeys, --ent->reserved) != NULL);
-		queue_adjust_cache_locked(ent);
-		xa_unlock_irq(&ent->mkeys);
-
-		mr->mmkey.key = (u32)xa_to_value(xa_ent);
+		/*
+		 * Can not use a cache mkey.
+		 * Create an mkey with the exact needed size.
+		 */
+		MLX5_SET(mkc, mkc, translations_octword_size,
+			 get_mkc_octo_size(access_mode, ndescs));
+		err = mlx5_ib_create_mkey(dev, &mr->mmkey, in, inlen);
+		if (err)
+			goto err;
 	}
-	mr->cache_ent = ent;
 	mr->mmkey.type = MLX5_MKEY_MR;
 	init_waitqueue_head(&mr->mmkey.wait);
 	return mr;
 
+err:
+	kfree(mr);
+	return ERR_PTR(err);
 }
 
 static void mlx5_mr_cache_free(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr)
@@ -630,10 +676,8 @@ static void mlx5_mr_cache_free(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr)
 	xa_unlock_irq(&ent->mkeys);
 }
 
-static void clean_keys(struct mlx5_ib_dev *dev, int c)
+static void clean_keys(struct mlx5_ib_dev *dev, struct mlx5_cache_ent *ent)
 {
-	struct mlx5_mr_cache *cache = &dev->cache;
-	struct mlx5_cache_ent *ent = &cache->ent[c];
 	unsigned long index;
 	void *entry;
 
@@ -656,27 +700,21 @@ static void mlx5_mr_cache_debugfs_cleanup(struct mlx5_ib_dev *dev)
 	dev->cache.root = NULL;
 }
 
-static void mlx5_mr_cache_debugfs_init(struct mlx5_ib_dev *dev)
+static void mlx5_cache_ent_debugfs_init(struct mlx5_ib_dev *dev,
+					struct mlx5_cache_ent *ent, int order)
 {
 	struct mlx5_mr_cache *cache = &dev->cache;
-	struct mlx5_cache_ent *ent;
 	struct dentry *dir;
-	int i;
 
 	if (!mlx5_debugfs_root || dev->is_rep)
 		return;
 
-	cache->root = debugfs_create_dir("mr_cache", dev->mdev->priv.dbg_root);
-
-	for (i = 0; i < MAX_MR_CACHE_ENTRIES; i++) {
-		ent = &cache->ent[i];
-		sprintf(ent->name, "%d", ent->order);
-		dir = debugfs_create_dir(ent->name, cache->root);
-		debugfs_create_file("size", 0600, dir, ent, &size_fops);
-		debugfs_create_file("limit", 0600, dir, ent, &limit_fops);
-		debugfs_create_ulong("cur", 0400, dir, &ent->stored);
-		debugfs_create_u32("miss", 0600, dir, &ent->miss);
-	}
+	sprintf(ent->name, "%d", order);
+	dir = debugfs_create_dir(ent->name, cache->root);
+	debugfs_create_file("size", 0600, dir, ent, &size_fops);
+	debugfs_create_file("limit", 0600, dir, ent, &limit_fops);
+	debugfs_create_ulong("cur", 0400, dir, &ent->stored);
+	debugfs_create_u32("miss", 0600, dir, &ent->miss);
 }
 
 static void delay_time_func(struct timer_list *t)
@@ -686,69 +724,135 @@ static void delay_time_func(struct timer_list *t)
 	WRITE_ONCE(dev->fill_delay, 0);
 }
 
+static int ent_insert(struct mlx5_mr_cache *cache, struct mlx5_cache_ent *ent)
+{
+	struct rb_node **new = &cache->cache_root.rb_node, *parent = NULL;
+	int size = MLX5_ST_SZ_BYTES(mkc);
+	struct mlx5_cache_ent *cur;
+	int cmp;
+
+	/* Figure out where to put new node */
+	while (*new) {
+		cur = container_of(*new, struct mlx5_cache_ent, node);
+		parent = *new;
+		cmp = memcmp(ent->mkc, cur->mkc, size);
+		if (cmp < 0)
+			new = &((*new)->rb_left);
+		else if (cmp > 0)
+			new = &((*new)->rb_right);
+		else
+			return -EEXIST;
+	}
+
+	/* Add new node and rebalance tree. */
+	rb_link_node(&ent->node, parent, new);
+	rb_insert_color(&ent->node, &cache->cache_root);
+
+	return 0;
+}
+
+static struct mlx5_cache_ent *mlx5_ib_create_cache_ent(struct mlx5_ib_dev *dev,
+						       int order)
+{
+	struct mlx5_cache_ent *ent;
+	int ret;
+
+	ent = kzalloc(sizeof(*ent), GFP_KERNEL);
+	if (!ent)
+		return ERR_PTR(-ENOMEM);
+
+	ent->mkc = kzalloc(MLX5_ST_SZ_BYTES(mkc), GFP_KERNEL);
+	if (!ent->mkc) {
+		kfree(ent);
+		return ERR_PTR(-ENOMEM);
+	}
+
+	ent->ndescs = 1 << order;
+	mlx5_set_cache_mkc(dev, ent->mkc, 0, MLX5_MKC_ACCESS_MODE_MTT,
+			   PAGE_SHIFT);
+	MLX5_SET(mkc, ent->mkc, translations_octword_size,
+		 get_mkc_octo_size(MLX5_MKC_ACCESS_MODE_MTT, ent->ndescs));
+	mutex_lock(&dev->cache.cache_lock);
+	ret = ent_insert(&dev->cache, ent);
+	mutex_unlock(&dev->cache.cache_lock);
+	if (ret) {
+		kfree(ent->mkc);
+		kfree(ent);
+		return ERR_PTR(ret);
+	}
+
+	xa_init_flags(&ent->mkeys, XA_FLAGS_ALLOC | XA_FLAGS_LOCK_IRQ);
+	ent->dev = dev;
+
+	INIT_WORK(&ent->work, cache_work_func);
+	INIT_DELAYED_WORK(&ent->dwork, delayed_cache_work_func);
+
+	mlx5_cache_ent_debugfs_init(dev, ent, order);
+	return ent;
+}
+
 int mlx5_mr_cache_init(struct mlx5_ib_dev *dev)
 {
 	struct mlx5_mr_cache *cache = &dev->cache;
 	struct mlx5_cache_ent *ent;
-	int i;
+	int order, err;
 
 	mutex_init(&dev->slow_path_mutex);
+	mutex_init(&dev->cache.cache_lock);
+	cache->cache_root = RB_ROOT;
 	cache->wq = alloc_ordered_workqueue("mkey_cache", WQ_MEM_RECLAIM);
 	if (!cache->wq) {
-		mlx5_ib_warn(dev, "failed to create work queue\n");
+		mlx5_ib_warn(dev, "failed to create work queue\n");
 		return -ENOMEM;
 	}
 
+	if (mlx5_debugfs_root && !dev->is_rep)
+		cache->root = debugfs_create_dir("mr_cache",
+						 dev->mdev->priv.dbg_root);
+
+	cache->maintained_cache =
+		(dev->mdev->profile.mask & MLX5_PROF_MASK_MR_CACHE) &&
+		!dev->is_rep && mlx5_core_is_pf(dev->mdev) &&
+		mlx5_ib_can_load_pas_with_umr(dev, 0);
+
 	mlx5_cmd_init_async_ctx(dev->mdev, &dev->async_ctx);
 	timer_setup(&dev->delay_timer, delay_time_func, 0);
-	for (i = 0; i < MAX_MR_CACHE_ENTRIES; i++) {
-		ent = &cache->ent[i];
-		xa_init_flags(&ent->mkeys, XA_FLAGS_ALLOC | XA_FLAGS_LOCK_IRQ);
-		ent->order = i + 2;
-		ent->dev = dev;
-		ent->limit = 0;
-
-		INIT_WORK(&ent->work, cache_work_func);
-		INIT_DELAYED_WORK(&ent->dwork, delayed_cache_work_func);
-
-		if (i > MR_CACHE_LAST_STD_ENTRY) {
-			mlx5_odp_init_mr_cache_entry(ent);
-			continue;
-		}
+	for (order = 2; order < MAX_MR_CACHE_ENTRIES + 2; order++) {
+		ent = mlx5_ib_create_cache_ent(dev, order);
 
-		if (ent->order > mr_cache_max_order(dev))
-			continue;
+		if (IS_ERR(ent)) {
+			err = PTR_ERR(ent);
+			goto err;
+		}
 
-		ent->page = PAGE_SHIFT;
-		ent->xlt = (1 << ent->order) * sizeof(struct mlx5_mtt) /
-			   MLX5_IB_UMR_OCTOWORD;
-		ent->access_mode = MLX5_MKC_ACCESS_MODE_MTT;
-		if ((dev->mdev->profile.mask & MLX5_PROF_MASK_MR_CACHE) &&
-		    !dev->is_rep && mlx5_core_is_pf(dev->mdev) &&
-		    mlx5_ib_can_load_pas_with_umr(dev, 0))
-			ent->limit = dev->mdev->profile.mr_cache[i].limit;
-		else
-			ent->limit = 0;
-		xa_lock_irq(&ent->mkeys);
-		queue_adjust_cache_locked(ent);
-		xa_unlock_irq(&ent->mkeys);
+		if (cache->maintained_cache &&
+		    order <= mr_cache_max_order(dev)) {
+			ent->limit =
+				dev->mdev->profile.mr_cache[order - 2].limit;
+			xa_lock_irq(&ent->mkeys);
+			queue_adjust_cache_locked(ent);
+			xa_unlock_irq(&ent->mkeys);
+		}
 	}
 
-	mlx5_mr_cache_debugfs_init(dev);
-
 	return 0;
+err:
+	mlx5_mr_cache_cleanup(dev);
+	return err;
 }
 
 int mlx5_mr_cache_cleanup(struct mlx5_ib_dev *dev)
 {
-	unsigned int i;
+	struct rb_root *root = &dev->cache.cache_root;
+	struct mlx5_cache_ent *ent;
+	struct rb_node *node;
 
 	if (!dev->cache.wq)
 		return 0;
 
-	for (i = 0; i < MAX_MR_CACHE_ENTRIES; i++) {
-		struct mlx5_cache_ent *ent = &dev->cache.ent[i];
-
+	mutex_lock(&dev->cache.cache_lock);
+	for (node = rb_first(root); node; node = rb_next(node)) {
+		ent = container_of(node, struct mlx5_cache_ent, node);
 		xa_lock_irq(&ent->mkeys);
 		ent->disabled = true;
 		xa_unlock_irq(&ent->mkeys);
@@ -759,8 +863,16 @@ int mlx5_mr_cache_cleanup(struct mlx5_ib_dev *dev)
 	mlx5_mr_cache_debugfs_cleanup(dev);
 	mlx5_cmd_cleanup_async_ctx(&dev->async_ctx);
 
-	for (i = 0; i < MAX_MR_CACHE_ENTRIES; i++)
-		clean_keys(dev, i);
+	node = rb_first(root);
+	while (node) {
+		ent = container_of(node, struct mlx5_cache_ent, node);
+		node = rb_next(node);
+		clean_keys(dev, ent);
+		rb_erase(&ent->node, root);
+		kfree(ent->mkc);
+		kfree(ent);
+	}
+	mutex_unlock(&dev->cache.cache_lock);
 
 	destroy_workqueue(dev->cache.wq);
 	del_timer_sync(&dev->delay_timer);
@@ -829,7 +941,7 @@ static int get_octo_len(u64 addr, u64 len, int page_shift)
 static int mr_cache_max_order(struct mlx5_ib_dev *dev)
 {
 	if (MLX5_CAP_GEN(dev->mdev, umr_extended_translation_offset))
-		return MR_CACHE_LAST_STD_ENTRY + 2;
+		return MAX_MR_CACHE_ENTRIES + 2;
 	return MLX5_MAX_UMR_SHIFT;
 }
 
@@ -876,18 +988,6 @@ static int mlx5_ib_post_send_wait(struct mlx5_ib_dev *dev,
 	return err;
 }
 
-static struct mlx5_cache_ent *mr_cache_ent_from_order(struct mlx5_ib_dev *dev,
-						      unsigned int order)
-{
-	struct mlx5_mr_cache *cache = &dev->cache;
-
-	if (order < cache->ent[0].order)
-		return &cache->ent[0];
-	order = order - cache->ent[0].order;
-	if (order > MR_CACHE_LAST_STD_ENTRY)
-		return NULL;
-	return &cache->ent[order];
-}
 
 static void set_mr_fields(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr,
 			  u64 length, int access_flags, u64 iova)
@@ -911,14 +1011,38 @@ static unsigned int mlx5_umem_dmabuf_default_pgsz(struct ib_umem *umem,
 	return PAGE_SIZE;
 }
 
+int mlx5_acc_flags_to_ent_flags(struct mlx5_ib_dev *dev, int access_flags)
+{
+	int ret = 0;
+
+	if ((access_flags & IB_ACCESS_REMOTE_ATOMIC) &&
+	    MLX5_CAP_GEN(dev->mdev, atomic) &&
+	    MLX5_CAP_GEN(dev->mdev, umr_modify_atomic_disabled))
+		ret |= IB_ACCESS_REMOTE_ATOMIC;
+
+	if ((access_flags & IB_ACCESS_RELAXED_ORDERING) &&
+	    MLX5_CAP_GEN(dev->mdev, relaxed_ordering_write) &&
+	    !MLX5_CAP_GEN(dev->mdev, relaxed_ordering_write_umr))
+		ret |= IB_ACCESS_RELAXED_ORDERING;
+
+	if ((access_flags & IB_ACCESS_RELAXED_ORDERING) &&
+	    MLX5_CAP_GEN(dev->mdev, relaxed_ordering_read) &&
+	    !MLX5_CAP_GEN(dev->mdev, relaxed_ordering_read_umr))
+		ret |= IB_ACCESS_RELAXED_ORDERING;
+
+	return ret;
+}
+
 static struct mlx5_ib_mr *alloc_cacheable_mr(struct ib_pd *pd,
 					     struct ib_umem *umem, u64 iova,
 					     int access_flags)
 {
 	struct mlx5_ib_dev *dev = to_mdev(pd->device);
-	struct mlx5_cache_ent *ent;
+	unsigned int page_size, ndescs;
 	struct mlx5_ib_mr *mr;
-	unsigned int page_size;
+	void *mkc;
+	int inlen;
+	int *in;
 
 	if (umem->is_dmabuf)
 		page_size = mlx5_umem_dmabuf_default_pgsz(umem, iova);
@@ -927,29 +1051,32 @@ static struct mlx5_ib_mr *alloc_cacheable_mr(struct ib_pd *pd,
 						     0, iova);
 	if (WARN_ON(!page_size))
 		return ERR_PTR(-EINVAL);
-	ent = mr_cache_ent_from_order(
-		dev, order_base_2(ib_umem_num_dma_blocks(umem, page_size)));
-	/*
-	 * Matches access in alloc_cache_mr(). If the MR can't come from the
-	 * cache then synchronously create an uncached one.
-	 */
-	if (!ent || ent->limit == 0 ||
-	    !mlx5_ib_can_reconfig_with_umr(dev, 0, access_flags)) {
-		mutex_lock(&dev->slow_path_mutex);
-		mr = reg_create(pd, umem, iova, access_flags, page_size, false);
-		mutex_unlock(&dev->slow_path_mutex);
-		return mr;
-	}
 
-	mr = mlx5_mr_cache_alloc(dev, ent, access_flags);
-	if (IS_ERR(mr))
+	ndescs = ib_umem_num_dma_blocks(umem, page_size);
+	inlen = MLX5_ST_SZ_BYTES(create_mkey_in);
+	in = kzalloc(inlen, GFP_KERNEL);
+	if (!in)
+		return ERR_PTR(-ENOMEM);
+
+	mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry);
+	mlx5_set_cache_mkc(dev, mkc,
+			   mlx5_acc_flags_to_ent_flags(dev, access_flags),
+			   MLX5_MKC_ACCESS_MODE_MTT, PAGE_SHIFT);
+
+	mr = mlx5_mr_cache_alloc(
+		dev, in, inlen, ndescs, MLX5_MKC_ACCESS_MODE_MTT,
+		!mlx5_ib_can_reconfig_with_umr(dev, access_flags, 0));
+	if (IS_ERR(mr)) {
+		kfree(in);
 		return mr;
+	}
 
 	mr->ibmr.pd = pd;
 	mr->umem = umem;
 	mr->page_shift = order_base_2(page_size);
 	set_mr_fields(dev, mr, umem->length, access_flags, iova);
 
+	kfree(in);
 	return mr;
 }
 
@@ -1699,7 +1826,7 @@ static bool can_use_umr_rereg_pas(struct mlx5_ib_mr *mr,
 		mlx5_umem_find_best_pgsz(new_umem, mkc, log_page_size, 0, iova);
 	if (WARN_ON(!*page_size))
 		return false;
-	return (1ULL << mr->cache_ent->order) >=
+	return (mr->cache_ent->ndescs) >=
 	       ib_umem_num_dma_blocks(new_umem, *page_size);
 }
 
diff --git a/drivers/infiniband/hw/mlx5/odp.c b/drivers/infiniband/hw/mlx5/odp.c
index 0972afc3e952..3d86a448ec97 100644
--- a/drivers/infiniband/hw/mlx5/odp.c
+++ b/drivers/infiniband/hw/mlx5/odp.c
@@ -411,6 +411,9 @@ static struct mlx5_ib_mr *implicit_get_child_mr(struct mlx5_ib_mr *imr,
 	struct ib_umem_odp *odp;
 	struct mlx5_ib_mr *mr;
 	struct mlx5_ib_mr *ret;
+	void *mkc;
+	int inlen;
+	int *in;
 	int err;
 
 	odp = ib_umem_odp_alloc_child(to_ib_umem_odp(imr->umem),
@@ -419,10 +422,23 @@ static struct mlx5_ib_mr *implicit_get_child_mr(struct mlx5_ib_mr *imr,
 	if (IS_ERR(odp))
 		return ERR_CAST(odp);
 
-	mr = mlx5_mr_cache_alloc(dev, &dev->cache.ent[MLX5_IMR_MTT_CACHE_ENTRY],
-				 imr->access_flags);
+	inlen = MLX5_ST_SZ_BYTES(create_mkey_in);
+	in = kzalloc(inlen, GFP_KERNEL);
+	if (!in) {
+		ib_umem_odp_release(odp);
+		return ERR_PTR(-ENOMEM);
+	}
+
+	mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry);
+	mlx5_set_cache_mkc(dev, mkc,
+			   mlx5_acc_flags_to_ent_flags(dev, imr->access_flags),
+			   MLX5_MKC_ACCESS_MODE_MTT, PAGE_SHIFT);
+
+	mr = mlx5_mr_cache_alloc(dev, in, inlen, MLX5_IMR_MTT_ENTRIES,
+				 MLX5_MKC_ACCESS_MODE_MTT, true);
 	if (IS_ERR(mr)) {
 		ib_umem_odp_release(odp);
+		kfree(in);
 		return mr;
 	}
 
@@ -470,12 +486,14 @@ static struct mlx5_ib_mr *implicit_get_child_mr(struct mlx5_ib_mr *imr,
 	xa_unlock(&imr->implicit_children);
 
 	mlx5_ib_dbg(mr_to_mdev(imr), "key %x mr %p\n", mr->mmkey.key, mr);
+	kfree(in);
 	return mr;
 
 out_lock:
 	xa_unlock(&imr->implicit_children);
 out_mr:
 	mlx5_ib_dereg_mr(&mr->ibmr, NULL);
+	kfree(in);
 	return ret;
 }
 
@@ -485,6 +503,9 @@ struct mlx5_ib_mr *mlx5_ib_alloc_implicit_mr(struct mlx5_ib_pd *pd,
 	struct mlx5_ib_dev *dev = to_mdev(pd->ibpd.device);
 	struct ib_umem_odp *umem_odp;
 	struct mlx5_ib_mr *imr;
+	void *mkc;
+	int inlen;
+	int *in;
 	int err;
 
 	if (!mlx5_ib_can_load_pas_with_umr(dev,
@@ -495,11 +516,23 @@ struct mlx5_ib_mr *mlx5_ib_alloc_implicit_mr(struct mlx5_ib_pd *pd,
 	if (IS_ERR(umem_odp))
 		return ERR_CAST(umem_odp);
 
-	imr = mlx5_mr_cache_alloc(dev,
-				  &dev->cache.ent[MLX5_IMR_KSM_CACHE_ENTRY],
-				  access_flags);
+	inlen = MLX5_ST_SZ_BYTES(create_mkey_in);
+	in = kzalloc(inlen, GFP_KERNEL);
+	if (!in) {
+		ib_umem_odp_release(umem_odp);
+		return ERR_PTR(-ENOMEM);
+	}
+
+	mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry);
+	mlx5_set_cache_mkc(dev, mkc,
+			   mlx5_acc_flags_to_ent_flags(dev, access_flags),
+			   MLX5_MKC_ACCESS_MODE_KSM, PAGE_SHIFT);
+
+	imr = mlx5_mr_cache_alloc(dev, in, inlen, mlx5_imr_ksm_entries,
+				  MLX5_MKC_ACCESS_MODE_KSM, true);
 	if (IS_ERR(imr)) {
 		ib_umem_odp_release(umem_odp);
+		kfree(in);
 		return imr;
 	}
 
@@ -528,10 +561,12 @@ struct mlx5_ib_mr *mlx5_ib_alloc_implicit_mr(struct mlx5_ib_pd *pd,
 		goto out_mr;
 
 	mlx5_ib_dbg(dev, "key %x mr %p\n", imr->mmkey.key, imr);
+	kfree(in);
 	return imr;
 out_mr:
 	mlx5_ib_err(dev, "Failed to register MKEY %d\n", err);
 	mlx5_ib_dereg_mr(&imr->ibmr, NULL);
+	kfree(in);
 	return ERR_PTR(err);
 }
 
@@ -1596,32 +1631,6 @@ mlx5_ib_odp_destroy_eq(struct mlx5_ib_dev *dev, struct mlx5_ib_pf_eq *eq)
 	return err;
 }
 
-void mlx5_odp_init_mr_cache_entry(struct mlx5_cache_ent *ent)
-{
-	if (!(ent->dev->odp_caps.general_caps & IB_ODP_SUPPORT_IMPLICIT))
-		return;
-
-	switch (ent->order - 2) {
-	case MLX5_IMR_MTT_CACHE_ENTRY:
-		ent->page = PAGE_SHIFT;
-		ent->xlt = MLX5_IMR_MTT_ENTRIES *
-			   sizeof(struct mlx5_mtt) /
-			   MLX5_IB_UMR_OCTOWORD;
-		ent->access_mode = MLX5_MKC_ACCESS_MODE_MTT;
-		ent->limit = 0;
-		break;
-
-	case MLX5_IMR_KSM_CACHE_ENTRY:
-		ent->page = MLX5_KSM_PAGE_SHIFT;
-		ent->xlt = mlx5_imr_ksm_entries *
-			   sizeof(struct mlx5_klm) /
-			   MLX5_IB_UMR_OCTOWORD;
-		ent->access_mode = MLX5_MKC_ACCESS_MODE_KSM;
-		ent->limit = 0;
-		break;
-	}
-}
-
 static const struct ib_device_ops mlx5_ib_dev_odp_ops = {
 	.advise_mr = mlx5_ib_advise_mr,
 };
diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h
index a623ec635947..c33f71134136 100644
--- a/include/linux/mlx5/driver.h
+++ b/include/linux/mlx5/driver.h
@@ -699,10 +699,7 @@ enum {
 };
 
 enum {
-	MR_CACHE_LAST_STD_ENTRY = 20,
-	MLX5_IMR_MTT_CACHE_ENTRY,
-	MLX5_IMR_KSM_CACHE_ENTRY,
-	MAX_MR_CACHE_ENTRIES
+	MAX_MR_CACHE_ENTRIES = 21,
 };
 
 struct mlx5_profile {
-- 
2.33.1


^ permalink raw reply related	[flat|nested] 15+ messages in thread

* [PATCH rdma-next 5/7] RDMA/mlx5: Reorder calls to pcie_relaxed_ordering_enabled()
  2021-12-06  9:10 [PATCH rdma-next 0/7] MR cache enhancement Leon Romanovsky
                   ` (3 preceding siblings ...)
  2021-12-06  9:10 ` [PATCH rdma-next 4/7] RDMA/mlx5: Change the cache structure to an RB-tree Leon Romanovsky
@ 2021-12-06  9:10 ` Leon Romanovsky
  2021-12-06  9:10 ` [PATCH rdma-next 6/7] RDMA/mlx5: Delay the deregistration of a non-cache mkey Leon Romanovsky
                   ` (2 subsequent siblings)
  7 siblings, 0 replies; 15+ messages in thread
From: Leon Romanovsky @ 2021-12-06  9:10 UTC (permalink / raw)
  To: Jason Gunthorpe; +Cc: Aharon Landau, linux-kernel, linux-rdma

From: Aharon Landau <aharonl@nvidia.com>

The mkc is the key for the mkey cache, so it is built on every attempt to
get a cache mkey. pcie_relaxed_ordering_enabled() is called while setting up
the mkc, but its result is needed only when IB_ACCESS_RELAXED_ORDERING is
set.

pcie_relaxed_ordering_enabled() is an expensive call (26 us). Reorder the
code so the driver will call it only when it is needed.

Signed-off-by: Aharon Landau <aharonl@nvidia.com>
Signed-off-by: Leon Romanovsky <leonro@nvidia.com>
---
 drivers/infiniband/hw/mlx5/mr.c | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/drivers/infiniband/hw/mlx5/mr.c b/drivers/infiniband/hw/mlx5/mr.c
index 6000acbedc73..ca6faf599cd3 100644
--- a/drivers/infiniband/hw/mlx5/mr.c
+++ b/drivers/infiniband/hw/mlx5/mr.c
@@ -68,7 +68,6 @@ static void set_mkc_access_pd_addr_fields(void *mkc, int acc, u64 start_addr,
 					  struct ib_pd *pd)
 {
 	struct mlx5_ib_dev *dev = to_mdev(pd->device);
-	bool ro_pci_enabled = pcie_relaxed_ordering_enabled(dev->mdev->pdev);
 
 	MLX5_SET(mkc, mkc, a, !!(acc & IB_ACCESS_REMOTE_ATOMIC));
 	MLX5_SET(mkc, mkc, rw, !!(acc & IB_ACCESS_REMOTE_WRITE));
@@ -76,12 +75,13 @@ static void set_mkc_access_pd_addr_fields(void *mkc, int acc, u64 start_addr,
 	MLX5_SET(mkc, mkc, lw, !!(acc & IB_ACCESS_LOCAL_WRITE));
 	MLX5_SET(mkc, mkc, lr, 1);
 
-	if (MLX5_CAP_GEN(dev->mdev, relaxed_ordering_write))
-		MLX5_SET(mkc, mkc, relaxed_ordering_write,
-			 (acc & IB_ACCESS_RELAXED_ORDERING) && ro_pci_enabled);
-	if (MLX5_CAP_GEN(dev->mdev, relaxed_ordering_read))
-		MLX5_SET(mkc, mkc, relaxed_ordering_read,
-			 (acc & IB_ACCESS_RELAXED_ORDERING) && ro_pci_enabled);
+	if ((acc & IB_ACCESS_RELAXED_ORDERING) &&
+	    pcie_relaxed_ordering_enabled(dev->mdev->pdev)) {
+		if (MLX5_CAP_GEN(dev->mdev, relaxed_ordering_write))
+			MLX5_SET(mkc, mkc, relaxed_ordering_write, 1);
+		if (MLX5_CAP_GEN(dev->mdev, relaxed_ordering_read))
+			MLX5_SET(mkc, mkc, relaxed_ordering_read, 1);
+	}
 
 	MLX5_SET(mkc, mkc, pd, to_mpd(pd)->pdn);
 	MLX5_SET(mkc, mkc, qpn, 0xffffff);
-- 
2.33.1


^ permalink raw reply related	[flat|nested] 15+ messages in thread

* [PATCH rdma-next 6/7] RDMA/mlx5: Delay the deregistration of a non-cache mkey
  2021-12-06  9:10 [PATCH rdma-next 0/7] MR cache enhancement Leon Romanovsky
                   ` (4 preceding siblings ...)
  2021-12-06  9:10 ` [PATCH rdma-next 5/7] RDMA/mlx5: Reorder calls to pcie_relaxed_ordering_enabled() Leon Romanovsky
@ 2021-12-06  9:10 ` Leon Romanovsky
  2021-12-06  9:10 ` [PATCH rdma-next 7/7] RDMA/mlx5: Rename the mkey cache variables and functions Leon Romanovsky
  2021-12-09  8:40 ` [PATCH rdma-next 0/7] MR cache enhancement Leon Romanovsky
  7 siblings, 0 replies; 15+ messages in thread
From: Leon Romanovsky @ 2021-12-06  9:10 UTC (permalink / raw)
  To: Jason Gunthorpe; +Cc: Aharon Landau, linux-kernel, linux-rdma

From: Aharon Landau <aharonl@nvidia.com>

When restarting an application with many non-cached mkeys, all the mkeys
will be destroyed and then recreated.

This process takes a long time (about 20 seconds for deregistration and
28 seconds for registration of 100,000 MRs).

To shorten the restart runtime, insert the mkeys temporarily into the
cache and schedule a delayed work to destroy them later. If there is no
entry that fits these mkeys, create a temporary entry that does.

If 30 seconds have passed and no user reclaimed the temporarily cached
mkeys, the scheduled work will destroy the mkeys and the temporary
entries.

When the application is restarted, the mkeys will still be in the cache
when it registers them again, so the whole cycle is much faster
(4 seconds for deregistration and 5 seconds for registration of
100,000 MRs).

Signed-off-by: Aharon Landau <aharonl@nvidia.com>
Signed-off-by: Leon Romanovsky <leonro@nvidia.com>
---
 drivers/infiniband/hw/mlx5/mlx5_ib.h |   3 +
 drivers/infiniband/hw/mlx5/mr.c      | 157 ++++++++++++++++++++++++---
 2 files changed, 146 insertions(+), 14 deletions(-)

diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h b/drivers/infiniband/hw/mlx5/mlx5_ib.h
index 202d8fbc423d..7dcc9e69a649 100644
--- a/drivers/infiniband/hw/mlx5/mlx5_ib.h
+++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h
@@ -638,6 +638,7 @@ struct mlx5_ib_mkey {
 	u32 key;
 	enum mlx5_mkey_type type;
 	unsigned int ndescs;
+	unsigned int access_mode;
 	struct wait_queue_head wait;
 	refcount_t usecount;
 };
@@ -747,6 +748,7 @@ struct mlx5_cache_ent {
 	char                    name[4];
 	int			ndescs;
 
+	u8 is_tmp:1;
 	u8 disabled:1;
 	u8 fill_to_high_water:1;
 
@@ -786,6 +788,7 @@ struct mlx5_mr_cache {
 	struct dentry		*root;
 	unsigned long		last_add;
 	bool			maintained_cache;
+	struct delayed_work	remove_ent_dwork;
 };
 
 struct mlx5_ib_port_resources {
diff --git a/drivers/infiniband/hw/mlx5/mr.c b/drivers/infiniband/hw/mlx5/mr.c
index ca6faf599cd3..29888a426b33 100644
--- a/drivers/infiniband/hw/mlx5/mr.c
+++ b/drivers/infiniband/hw/mlx5/mr.c
@@ -449,7 +449,7 @@ static bool someone_adding(struct mlx5_mr_cache *cache)
  */
 static void queue_adjust_cache_locked(struct mlx5_cache_ent *ent)
 {
-	if (ent->disabled || READ_ONCE(ent->dev->fill_delay))
+	if (ent->disabled || READ_ONCE(ent->dev->fill_delay) || ent->is_tmp)
 		return;
 	if (ent->stored < ent->limit) {
 		ent->fill_to_high_water = true;
@@ -603,6 +603,7 @@ struct mlx5_ib_mr *mlx5_mr_cache_alloc(struct mlx5_ib_dev *dev, int *in,
 		return ERR_PTR(-ENOMEM);
 
 	mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry);
+	mutex_lock(&dev->cache.cache_lock);
 	if (dev->cache.maintained_cache && !force) {
 		int order;
 
@@ -612,13 +613,25 @@ struct mlx5_ib_mr *mlx5_mr_cache_alloc(struct mlx5_ib_dev *dev, int *in,
 		order = order_base_2(ndescs) > 2 ? order_base_2(ndescs) : 2;
 		MLX5_SET(mkc, mkc, translations_octword_size,
 			 get_mkc_octo_size(access_mode, 1 << order));
-		mutex_lock(&dev->cache.cache_lock);
 		ent = ent_search(&dev->cache, mkc);
-		mutex_unlock(&dev->cache.cache_lock);
 	}
-
-	if (ent && (ent->limit || force)) {
+	if (!ent) {
+		/*
+		 * Can not use a cache mkey.
+		 * Create an mkey with the exact needed size.
+		 */
+		MLX5_SET(mkc, mkc, translations_octword_size,
+			 get_mkc_octo_size(access_mode, ndescs));
+		ent = ent_search(&dev->cache, mkc);
+	}
+	if (ent)
 		xa_lock_irq(&ent->mkeys);
+	else
+		__acquire(&ent->lock);
+	mutex_unlock(&dev->cache.cache_lock);
+
+	if (ent && !ent->disabled &&
+	    (ent->stored || ent->limit || force)) {
 		if (!ent->stored) {
 			if (ent->limit) {
 				queue_adjust_cache_locked(ent);
@@ -633,7 +646,6 @@ struct mlx5_ib_mr *mlx5_mr_cache_alloc(struct mlx5_ib_dev *dev, int *in,
 			WRITE_ONCE(ent->dev->cache.last_add, jiffies);
 			xa_lock_irq(&ent->mkeys);
 			ent->total_mrs++;
-			xa_unlock_irq(&ent->mkeys);
 		} else {
 			xa_ent = __xa_store(&ent->mkeys, --ent->stored,
 					  NULL, GFP_KERNEL);
@@ -641,23 +653,30 @@ struct mlx5_ib_mr *mlx5_mr_cache_alloc(struct mlx5_ib_dev *dev, int *in,
 			WARN_ON(__xa_erase(&ent->mkeys, --ent->reserved) !=
 				NULL);
 			queue_adjust_cache_locked(ent);
-			xa_unlock_irq(&ent->mkeys);
 			mr->mmkey.key = (u32)xa_to_value(xa_ent);
 		}
-		mr->cache_ent = ent;
+		if (ent->is_tmp) {
+			ent->total_mrs--;
+			queue_delayed_work(dev->cache.wq,
+					   &dev->cache.remove_ent_dwork,
+					   msecs_to_jiffies(30 * 1000));
+		} else
+			mr->cache_ent = ent;
+
+		xa_unlock_irq(&ent->mkeys);
 	} else {
-		/*
-		 * Can not use a cache mkey.
-		 * Create an mkey with the exact needed size.
-		 */
-		MLX5_SET(mkc, mkc, translations_octword_size,
-			 get_mkc_octo_size(access_mode, ndescs));
+		if (ent)
+			xa_unlock_irq(&ent->mkeys);
+		else
+			__release(&ent->lock);
 		err = mlx5_ib_create_mkey(dev, &mr->mmkey, in, inlen);
 		if (err)
 			goto err;
 	}
+	mr->mmkey.ndescs = ndescs;
 	mr->mmkey.type = MLX5_MKEY_MR;
 	init_waitqueue_head(&mr->mmkey.wait);
+	mr->mmkey.access_mode = access_mode;
 	return mr;
 
 err:
@@ -791,6 +810,38 @@ static struct mlx5_cache_ent *mlx5_ib_create_cache_ent(struct mlx5_ib_dev *dev,
 	return ent;
 }
 
+static void remove_ent_work_func(struct work_struct *work)
+{
+	struct mlx5_mr_cache *cache;
+	struct mlx5_cache_ent *ent;
+	struct rb_node *cur;
+
+	cache = container_of(work, struct mlx5_mr_cache, remove_ent_dwork.work);
+	mutex_lock(&cache->cache_lock);
+	cur = rb_last(&cache->cache_root);
+	while (cur) {
+		ent = container_of(cur, struct mlx5_cache_ent, node);
+		cur = rb_prev(cur);
+		mutex_unlock(&cache->cache_lock);
+
+		xa_lock_irq(&ent->mkeys);
+		if (!ent->is_tmp || ent->total_mrs != ent->stored) {
+			xa_unlock_irq(&ent->mkeys);
+			mutex_lock(&cache->cache_lock);
+			continue;
+		}
+		ent->disabled = true;
+		xa_unlock_irq(&ent->mkeys);
+
+		clean_keys(ent->dev, ent);
+		mutex_lock(&cache->cache_lock);
+		rb_erase(&ent->node, &cache->cache_root);
+		kfree(ent->mkc);
+		kfree(ent);
+	}
+	mutex_unlock(&cache->cache_lock);
+}
+
 int mlx5_mr_cache_init(struct mlx5_ib_dev *dev)
 {
 	struct mlx5_mr_cache *cache = &dev->cache;
@@ -800,6 +851,7 @@ int mlx5_mr_cache_init(struct mlx5_ib_dev *dev)
 	mutex_init(&dev->slow_path_mutex);
 	mutex_init(&dev->cache.cache_lock);
 	cache->cache_root = RB_ROOT;
+	INIT_DELAYED_WORK(&cache->remove_ent_dwork, remove_ent_work_func);
 	cache->wq = alloc_ordered_workqueue("mkey_cache", WQ_MEM_RECLAIM);
 	if (!cache->wq) {
 		mlx5_ib_warn(dev, "failed tocreate work queue\n");
@@ -850,6 +902,7 @@ int mlx5_mr_cache_cleanup(struct mlx5_ib_dev *dev)
 	if (!dev->cache.wq)
 		return 0;
 
+	cancel_delayed_work_sync(&dev->cache.remove_ent_dwork);
 	mutex_lock(&dev->cache.cache_lock);
 	for (node = rb_first(root); node; node = rb_next(node)) {
 		ent = container_of(node, struct mlx5_cache_ent, node);
@@ -2032,6 +2085,79 @@ static int push_reserve_mkey(struct mlx5_cache_ent *ent)
 	return ret;
 }
 
+static struct mlx5_cache_ent *
+create_tmp_cache_ent(struct mlx5_ib_dev *dev, void *mkc, unsigned int ndescs)
+{
+	struct mlx5_cache_ent *ent;
+	int ret;
+
+	ent = kzalloc(sizeof(*ent), GFP_KERNEL);
+	if (!ent)
+		return ERR_PTR(-ENOMEM);
+
+	ent->mkc = mkc;
+	ret = ent_insert(&dev->cache, ent);
+	if (ret) {
+		kfree(ent);
+		return ERR_PTR(ret);
+	}
+
+	xa_init_flags(&ent->mkeys, XA_FLAGS_ALLOC | XA_FLAGS_LOCK_IRQ);
+	ent->ndescs = ndescs;
+	ent->dev = dev;
+	ent->is_tmp = true;
+
+	INIT_WORK(&ent->work, cache_work_func);
+	INIT_DELAYED_WORK(&ent->dwork, delayed_cache_work_func);
+
+	return ent;
+}
+
+static void tmp_cache_mkey(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr)
+{
+	struct mlx5_mr_cache *cache = &dev->cache;
+	struct ib_umem *umem = mr->umem;
+	struct mlx5_cache_ent *ent;
+	void *mkc;
+
+	if (!umem || !mlx5_ib_can_load_pas_with_umr(dev, umem->length))
+		return;
+
+	mkc = kzalloc(MLX5_ST_SZ_BYTES(mkc), GFP_KERNEL);
+	if (!mkc)
+		return;
+
+	mlx5_set_cache_mkc(dev, mkc,
+			   mlx5_acc_flags_to_ent_flags(dev, mr->access_flags),
+			   mr->mmkey.access_mode, PAGE_SHIFT);
+	MLX5_SET(mkc, mkc, translations_octword_size,
+		 get_mkc_octo_size(mr->mmkey.access_mode, mr->mmkey.ndescs));
+	mutex_lock(&cache->cache_lock);
+	ent = ent_search(cache, mkc);
+	if (!ent) {
+		ent = create_tmp_cache_ent(dev, mkc, mr->mmkey.ndescs);
+		if (IS_ERR(ent)) {
+			mutex_unlock(&cache->cache_lock);
+			kfree(mkc);
+			return;
+		}
+	} else
+		kfree(mkc);
+
+	xa_lock_irq(&ent->mkeys);
+	if (ent->disabled) {
+		xa_unlock_irq(&ent->mkeys);
+		mutex_unlock(&cache->cache_lock);
+		return;
+	}
+	ent->total_mrs++;
+	xa_unlock_irq(&ent->mkeys);
+	queue_delayed_work(cache->wq, &cache->remove_ent_dwork,
+			   msecs_to_jiffies(30 * 1000));
+	mutex_unlock(&cache->cache_lock);
+	mr->cache_ent = ent;
+}
+
 int mlx5_ib_dereg_mr(struct ib_mr *ibmr, struct ib_udata *udata)
 {
 	struct mlx5_ib_mr *mr = to_mmr(ibmr);
@@ -2076,6 +2202,9 @@ int mlx5_ib_dereg_mr(struct ib_mr *ibmr, struct ib_udata *udata)
 		mr->sig = NULL;
 	}
 
+	if (!mr->cache_ent)
+		tmp_cache_mkey(dev, mr);
+
 	/* Stop DMA */
 	if (mr->cache_ent) {
 		if (revoke_mr(mr) || push_reserve_mkey(mr->cache_ent)) {
-- 
2.33.1


^ permalink raw reply related	[flat|nested] 15+ messages in thread

* [PATCH rdma-next 7/7] RDMA/mlx5: Rename the mkey cache variables and functions
  2021-12-06  9:10 [PATCH rdma-next 0/7] MR cache enhancement Leon Romanovsky
                   ` (5 preceding siblings ...)
  2021-12-06  9:10 ` [PATCH rdma-next 6/7] RDMA/mlx5: Delay the deregistration of a non-cache mkey Leon Romanovsky
@ 2021-12-06  9:10 ` Leon Romanovsky
  2021-12-09  8:40 ` [PATCH rdma-next 0/7] MR cache enhancement Leon Romanovsky
  7 siblings, 0 replies; 15+ messages in thread
From: Leon Romanovsky @ 2021-12-06  9:10 UTC (permalink / raw)
  To: Jason Gunthorpe; +Cc: Aharon Landau, linux-kernel, linux-rdma

From: Aharon Landau <aharonl@nvidia.com>

After replacing the MR cache with an Mkey cache, rename the variables
and functions to fit the new meaning.

Signed-off-by: Aharon Landau <aharonl@nvidia.com>
Signed-off-by: Leon Romanovsky <leonro@nvidia.com>
---
 drivers/infiniband/hw/mlx5/main.c    |  4 +-
 drivers/infiniband/hw/mlx5/mlx5_ib.h | 26 ++++----
 drivers/infiniband/hw/mlx5/mr.c      | 88 ++++++++++++++--------------
 drivers/infiniband/hw/mlx5/odp.c     |  8 +--
 include/linux/mlx5/driver.h          |  4 +-
 5 files changed, 66 insertions(+), 64 deletions(-)

diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c
index 5ec8bd2f0b2f..74f32b563109 100644
--- a/drivers/infiniband/hw/mlx5/main.c
+++ b/drivers/infiniband/hw/mlx5/main.c
@@ -4034,7 +4034,7 @@ static void mlx5_ib_stage_pre_ib_reg_umr_cleanup(struct mlx5_ib_dev *dev)
 {
 	int err;
 
-	err = mlx5_mr_cache_cleanup(dev);
+	err = mlx5_mkey_cache_cleanup(dev);
 	if (err)
 		mlx5_ib_warn(dev, "mr cache cleanup failed\n");
 
@@ -4131,7 +4131,7 @@ static int mlx5_ib_stage_post_ib_reg_umr_init(struct mlx5_ib_dev *dev)
 	dev->umrc.pd = pd;
 
 	sema_init(&dev->umrc.sem, MAX_UMR_WR);
-	ret = mlx5_mr_cache_init(dev);
+	ret = mlx5_mkey_cache_init(dev);
 	if (ret) {
 		mlx5_ib_warn(dev, "mr cache init failed %d\n", ret);
 		goto error_4;
diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h b/drivers/infiniband/hw/mlx5/mlx5_ib.h
index 7dcc9e69a649..54a5c4bc2919 100644
--- a/drivers/infiniband/hw/mlx5/mlx5_ib.h
+++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h
@@ -753,13 +753,13 @@ struct mlx5_cache_ent {
 	u8 fill_to_high_water:1;
 
 	/*
-	 * - total_mrs is available_mrs plus all in use MRs that could be
+	 * - total_mkeys is stored mkeys plus all in use mkeys that could be
 	 *   returned to the cache.
-	 * - limit is the low water mark for available_mrs, 2* limit is the
+	 * - limit is the low water mark for available_mkeys, 2 * limit is the
 	 *   upper water mark.
-	 * - pending is the number of MRs currently being created
+	 * - pending is the number of mkeys currently being created
 	 */
-	u32 total_mrs;
+	u32 total_mkeys;
 	u32 limit;
 	u32 pending;
 
@@ -781,7 +781,7 @@ struct mlx5_async_create_mkey {
 	u32 mkey;
 };
 
-struct mlx5_mr_cache {
+struct mlx5_mkey_cache {
 	struct workqueue_struct *wq;
 	struct rb_root		cache_root;
 	struct mutex		cache_lock;
@@ -1085,7 +1085,7 @@ struct mlx5_ib_dev {
 	struct mlx5_ib_resources	devr;
 
 	atomic_t			mkey_var;
-	struct mlx5_mr_cache		cache;
+	struct mlx5_mkey_cache		cache;
 	struct timer_list		delay_timer;
 	/* Prevents soft lock on massive reg MRs */
 	struct mutex			slow_path_mutex;
@@ -1332,15 +1332,15 @@ void mlx5_ib_populate_pas(struct ib_umem *umem, size_t page_size, __be64 *pas,
 			  u64 access_flags);
 void mlx5_ib_copy_pas(u64 *old, u64 *new, int step, int num);
 int mlx5_ib_get_cqe_size(struct ib_cq *ibcq);
-int mlx5_mr_cache_init(struct mlx5_ib_dev *dev);
-int mlx5_mr_cache_cleanup(struct mlx5_ib_dev *dev);
+int mlx5_mkey_cache_init(struct mlx5_ib_dev *dev);
+int mlx5_mkey_cache_cleanup(struct mlx5_ib_dev *dev);
 
 int mlx5_acc_flags_to_ent_flags(struct mlx5_ib_dev *dev, int access_flags);
 void mlx5_set_cache_mkc(struct mlx5_ib_dev *dev, void *mkc, int access_flags,
 			unsigned int access_mode, unsigned int page_shift);
-struct mlx5_ib_mr *mlx5_mr_cache_alloc(struct mlx5_ib_dev *dev, int *in,
-				       int inlen, unsigned int ndescs,
-				       unsigned int access_mode, bool force);
+struct mlx5_ib_mr *mlx5_mkey_cache_alloc(struct mlx5_ib_dev *dev, int *in,
+					 int inlen, unsigned int ndescs,
+					 unsigned int access_mode, bool force);
 
 int mlx5_ib_check_mr_status(struct ib_mr *ibmr, u32 check_mask,
 			    struct ib_mr_status *mr_status);
@@ -1364,7 +1364,7 @@ int mlx5r_odp_create_eq(struct mlx5_ib_dev *dev, struct mlx5_ib_pf_eq *eq);
 void mlx5_ib_odp_cleanup_one(struct mlx5_ib_dev *ibdev);
 int __init mlx5_ib_odp_init(void);
 void mlx5_ib_odp_cleanup(void);
-int mlx5_odp_init_mr_cache_entry(struct mlx5_ib_dev *dev);
+int mlx5_odp_init_mkey_cache_entry(struct mlx5_ib_dev *dev);
 void mlx5_odp_populate_xlt(void *xlt, size_t idx, size_t nentries,
 			   struct mlx5_ib_mr *mr, int flags);
 
@@ -1383,7 +1383,7 @@ static inline int mlx5r_odp_create_eq(struct mlx5_ib_dev *dev,
 static inline void mlx5_ib_odp_cleanup_one(struct mlx5_ib_dev *ibdev) {}
 static inline int mlx5_ib_odp_init(void) { return 0; }
 static inline void mlx5_ib_odp_cleanup(void)				    {}
-static inline int mlx5_odp_init_mr_cache_entry(struct mlx5_ib_dev *dev)
+static inline int mlx5_odp_init_mkey_cache_entry(struct mlx5_ib_dev *dev)
 {
 	return 0;
 }
diff --git a/drivers/infiniband/hw/mlx5/mr.c b/drivers/infiniband/hw/mlx5/mr.c
index 29888a426b33..11e797e27873 100644
--- a/drivers/infiniband/hw/mlx5/mr.c
+++ b/drivers/infiniband/hw/mlx5/mr.c
@@ -123,7 +123,7 @@ mlx5_ib_create_mkey_cb(struct mlx5_ib_dev *dev, u32 *mkey,
 				create_mkey_callback, context);
 }
 
-static int mr_cache_max_order(struct mlx5_ib_dev *dev);
+static int mkey_cache_max_order(struct mlx5_ib_dev *dev);
 static void queue_adjust_cache_locked(struct mlx5_cache_ent *ent);
 
 static bool umr_can_use_indirect_mkey(struct mlx5_ib_dev *dev)
@@ -169,7 +169,7 @@ static void create_mkey_callback(int status, struct mlx5_async_work *context)
 			    xa_mk_value(mkey_out->mkey), GFP_ATOMIC);
 	WARN_ON(xa_ent != NULL);
 	ent->pending--;
-	ent->total_mrs++;
+	ent->total_mkeys++;
 	/* If we are doing fill_to_high_water then keep going. */
 	queue_adjust_cache_locked(ent);
 	xa_unlock_irqrestore(&ent->mkeys, flags);
@@ -279,15 +279,15 @@ static void remove_cache_mr_locked(struct mlx5_cache_ent *ent)
 	xa_ent = __xa_store(&ent->mkeys, --ent->stored, NULL, GFP_KERNEL);
 	WARN_ON(xa_ent == NULL || xa_is_err(xa_ent));
 	WARN_ON(__xa_erase(&ent->mkeys, --ent->reserved) != NULL);
-	ent->total_mrs--;
+	ent->total_mkeys--;
 	xa_unlock_irq(&ent->mkeys);
 	mlx5_core_destroy_mkey(ent->dev->mdev, (u32)xa_to_value(xa_ent));
 	xa_lock_irq(&ent->mkeys);
 }
 
-static int resize_available_mrs(struct mlx5_cache_ent *ent, unsigned int target,
-				bool limit_fill)
-	 __acquires(&ent->lock) __releases(&ent->lock)
+static int resize_available_mkeys(struct mlx5_cache_ent *ent,
+				  unsigned int target, bool limit_fill)
+	__acquires(&ent->lock) __releases(&ent->lock)
 {
 	int err;
 
@@ -327,22 +327,22 @@ static ssize_t size_write(struct file *filp, const char __user *buf,
 		return err;
 
 	/*
-	 * Target is the new value of total_mrs the user requests, however we
+	 * Target is the new value of total_mkeys the user requests, however we
 	 * cannot free MRs that are in use. Compute the target value for
-	 * available_mrs.
+	 * available_mkeys.
 	 */
 
 	xa_lock_irq(&ent->mkeys);
-	if (target < ent->total_mrs - ent->stored) {
+	if (target < ent->total_mkeys - ent->stored) {
 		err = -EINVAL;
 		goto err_unlock;
 	}
-	target = target - (ent->total_mrs - ent->stored);
+	target = target - (ent->total_mkeys - ent->stored);
 	if (target < ent->limit || target > ent->limit*2) {
 		err = -EINVAL;
 		goto err_unlock;
 	}
-	err = resize_available_mrs(ent, target, false);
+	err = resize_available_mkeys(ent, target, false);
 	if (err)
 		goto err_unlock;
 	xa_unlock_irq(&ent->mkeys);
@@ -361,7 +361,7 @@ static ssize_t size_read(struct file *filp, char __user *buf, size_t count,
 	char lbuf[20];
 	int err;
 
-	err = snprintf(lbuf, sizeof(lbuf), "%d\n", ent->total_mrs);
+	err = snprintf(lbuf, sizeof(lbuf), "%d\n", ent->total_mkeys);
 	if (err < 0)
 		return err;
 
@@ -392,7 +392,7 @@ static ssize_t limit_write(struct file *filp, const char __user *buf,
 	 */
 	xa_lock_irq(&ent->mkeys);
 	ent->limit = var;
-	err = resize_available_mrs(ent, 0, true);
+	err = resize_available_mkeys(ent, 0, true);
 	xa_unlock_irq(&ent->mkeys);
 	if (err)
 		return err;
@@ -421,7 +421,7 @@ static const struct file_operations limit_fops = {
 	.read	= limit_read,
 };
 
-static bool someone_adding(struct mlx5_mr_cache *cache)
+static bool someone_adding(struct mlx5_mkey_cache *cache)
 {
 	struct mlx5_cache_ent *ent;
 	struct rb_node *node;
@@ -477,7 +477,7 @@ static void queue_adjust_cache_locked(struct mlx5_cache_ent *ent)
 static void __cache_work_func(struct mlx5_cache_ent *ent)
 {
 	struct mlx5_ib_dev *dev = ent->dev;
-	struct mlx5_mr_cache *cache = &dev->cache;
+	struct mlx5_mkey_cache *cache = &dev->cache;
 	int err;
 
 	xa_lock_irq(&ent->mkeys);
@@ -553,7 +553,8 @@ static void cache_work_func(struct work_struct *work)
 	__cache_work_func(ent);
 }
 
-static struct mlx5_cache_ent *ent_search(struct mlx5_mr_cache *cache, void *mkc)
+static struct mlx5_cache_ent *ent_search(struct mlx5_mkey_cache *cache,
+					 void *mkc)
 {
 	struct rb_node *node = cache->cache_root.rb_node;
 	int size = MLX5_ST_SZ_BYTES(mkc);
@@ -588,9 +589,9 @@ static int get_mkc_octo_size(unsigned int access_mode, unsigned int ndescs)
 	}
 }
 
-struct mlx5_ib_mr *mlx5_mr_cache_alloc(struct mlx5_ib_dev *dev, int *in,
-				       int inlen, unsigned int ndescs,
-				       unsigned int access_mode, bool force)
+struct mlx5_ib_mr *mlx5_mkey_cache_alloc(struct mlx5_ib_dev *dev, int *in,
+					 int inlen, unsigned int ndescs,
+					 unsigned int access_mode, bool force)
 {
 	struct mlx5_cache_ent *ent = NULL;
 	struct mlx5_ib_mr *mr;
@@ -645,7 +646,7 @@ struct mlx5_ib_mr *mlx5_mr_cache_alloc(struct mlx5_ib_dev *dev, int *in,
 
 			WRITE_ONCE(ent->dev->cache.last_add, jiffies);
 			xa_lock_irq(&ent->mkeys);
-			ent->total_mrs++;
+			ent->total_mkeys++;
 		} else {
 			xa_ent = __xa_store(&ent->mkeys, --ent->stored,
 					  NULL, GFP_KERNEL);
@@ -656,7 +657,7 @@ struct mlx5_ib_mr *mlx5_mr_cache_alloc(struct mlx5_ib_dev *dev, int *in,
 			mr->mmkey.key = (u32)xa_to_value(xa_ent);
 		}
 		if (ent->is_tmp) {
-			ent->total_mrs--;
+			ent->total_mkeys--;
 			queue_delayed_work(dev->cache.wq,
 					   &dev->cache.remove_ent_dwork,
 					   msecs_to_jiffies(30 * 1000));
@@ -684,7 +685,7 @@ struct mlx5_ib_mr *mlx5_mr_cache_alloc(struct mlx5_ib_dev *dev, int *in,
 	return ERR_PTR(err);
 }
 
-static void mlx5_mr_cache_free(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr)
+static void mlx5_mkey_cache_free(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr)
 {
 	struct mlx5_cache_ent *ent = mr->cache_ent;
 
@@ -704,13 +705,13 @@ static void clean_keys(struct mlx5_ib_dev *dev, struct mlx5_cache_ent *ent)
 	xa_for_each(&ent->mkeys, index, entry) {
 		xa_lock_irq(&ent->mkeys);
 		__xa_erase(&ent->mkeys, index);
-		ent->total_mrs--;
+		ent->total_mkeys--;
 		xa_unlock_irq(&ent->mkeys);
 		mlx5_core_destroy_mkey(dev->mdev, (u32)xa_to_value(entry));
 	}
 }
 
-static void mlx5_mr_cache_debugfs_cleanup(struct mlx5_ib_dev *dev)
+static void mlx5_mkey_cache_debugfs_cleanup(struct mlx5_ib_dev *dev)
 {
 	if (!mlx5_debugfs_root || dev->is_rep)
 		return;
@@ -722,7 +723,7 @@ static void mlx5_mr_cache_debugfs_cleanup(struct mlx5_ib_dev *dev)
 static void mlx5_cache_ent_debugfs_init(struct mlx5_ib_dev *dev,
 					struct mlx5_cache_ent *ent, int order)
 {
-	struct mlx5_mr_cache *cache = &dev->cache;
+	struct mlx5_mkey_cache *cache = &dev->cache;
 	struct dentry *dir;
 
 	if (!mlx5_debugfs_root || dev->is_rep)
@@ -743,7 +744,7 @@ static void delay_time_func(struct timer_list *t)
 	WRITE_ONCE(dev->fill_delay, 0);
 }
 
-static int ent_insert(struct mlx5_mr_cache *cache, struct mlx5_cache_ent *ent)
+static int ent_insert(struct mlx5_mkey_cache *cache, struct mlx5_cache_ent *ent)
 {
 	struct rb_node **new = &cache->cache_root.rb_node, *parent = NULL;
 	int size = MLX5_ST_SZ_BYTES(mkc);
@@ -812,11 +813,12 @@ static struct mlx5_cache_ent *mlx5_ib_create_cache_ent(struct mlx5_ib_dev *dev,
 
 static void remove_ent_work_func(struct work_struct *work)
 {
-	struct mlx5_mr_cache *cache;
+	struct mlx5_mkey_cache *cache;
 	struct mlx5_cache_ent *ent;
 	struct rb_node *cur;
 
-	cache = container_of(work, struct mlx5_mr_cache, remove_ent_dwork.work);
+	cache = container_of(work, struct mlx5_mkey_cache,
+			     remove_ent_dwork.work);
 	mutex_lock(&cache->cache_lock);
 	cur = rb_last(&cache->cache_root);
 	while (cur) {
@@ -825,7 +827,7 @@ static void remove_ent_work_func(struct work_struct *work)
 		mutex_unlock(&cache->cache_lock);
 
 		xa_lock_irq(&ent->mkeys);
-		if (!ent->is_tmp || ent->total_mrs != ent->stored) {
+		if (!ent->is_tmp || ent->total_mkeys != ent->stored) {
 			xa_unlock_irq(&ent->mkeys);
 			mutex_lock(&cache->cache_lock);
 			continue;
@@ -842,9 +844,9 @@ static void remove_ent_work_func(struct work_struct *work)
 	mutex_unlock(&cache->cache_lock);
 }
 
-int mlx5_mr_cache_init(struct mlx5_ib_dev *dev)
+int mlx5_mkey_cache_init(struct mlx5_ib_dev *dev)
 {
-	struct mlx5_mr_cache *cache = &dev->cache;
+	struct mlx5_mkey_cache *cache = &dev->cache;
 	struct mlx5_cache_ent *ent;
 	int order, err;
 
@@ -869,7 +871,7 @@ int mlx5_mr_cache_init(struct mlx5_ib_dev *dev)
 
 	mlx5_cmd_init_async_ctx(dev->mdev, &dev->async_ctx);
 	timer_setup(&dev->delay_timer, delay_time_func, 0);
-	for (order = 2; order < MAX_MR_CACHE_ENTRIES + 2; order++) {
+	for (order = 2; order < MAX_MKEY_CACHE_ENTRIES + 2; order++) {
 		ent = mlx5_ib_create_cache_ent(dev, order);
 
 		if (IS_ERR(ent)) {
@@ -878,7 +880,7 @@ int mlx5_mr_cache_init(struct mlx5_ib_dev *dev)
 		}
 
 		if (cache->maintained_cache &&
-		    order <= mr_cache_max_order(dev)) {
+		    order <= mkey_cache_max_order(dev)) {
 			ent->limit =
 				dev->mdev->profile.mr_cache[order - 2].limit;
 			xa_lock_irq(&ent->mkeys);
@@ -889,11 +891,11 @@ int mlx5_mr_cache_init(struct mlx5_ib_dev *dev)
 
 	return 0;
 err:
-	mlx5_mr_cache_cleanup(dev);
+	mlx5_mkey_cache_cleanup(dev);
 	return err;
 }
 
-int mlx5_mr_cache_cleanup(struct mlx5_ib_dev *dev)
+int mlx5_mkey_cache_cleanup(struct mlx5_ib_dev *dev)
 {
 	struct rb_root *root = &dev->cache.cache_root;
 	struct mlx5_cache_ent *ent;
@@ -913,7 +915,7 @@ int mlx5_mr_cache_cleanup(struct mlx5_ib_dev *dev)
 		cancel_delayed_work_sync(&ent->dwork);
 	}
 
-	mlx5_mr_cache_debugfs_cleanup(dev);
+	mlx5_mkey_cache_debugfs_cleanup(dev);
 	mlx5_cmd_cleanup_async_ctx(&dev->async_ctx);
 
 	node = rb_first(root);
@@ -991,10 +993,10 @@ static int get_octo_len(u64 addr, u64 len, int page_shift)
 	return (npages + 1) / 2;
 }
 
-static int mr_cache_max_order(struct mlx5_ib_dev *dev)
+static int mkey_cache_max_order(struct mlx5_ib_dev *dev)
 {
 	if (MLX5_CAP_GEN(dev->mdev, umr_extended_translation_offset))
-		return MAX_MR_CACHE_ENTRIES + 2;
+		return MAX_MKEY_CACHE_ENTRIES + 2;
 	return MLX5_MAX_UMR_SHIFT;
 }
 
@@ -1116,7 +1118,7 @@ static struct mlx5_ib_mr *alloc_cacheable_mr(struct ib_pd *pd,
 			   mlx5_acc_flags_to_ent_flags(dev, access_flags),
 			   MLX5_MKC_ACCESS_MODE_MTT, PAGE_SHIFT);
 
-	mr = mlx5_mr_cache_alloc(
+	mr = mlx5_mkey_cache_alloc(
 		dev, in, inlen, ndescs, MLX5_MKC_ACCESS_MODE_MTT,
 		!mlx5_ib_can_reconfig_with_umr(dev, access_flags, 0));
 	if (IS_ERR(mr)) {
@@ -2115,7 +2117,7 @@ create_tmp_cache_ent(struct mlx5_ib_dev *dev, void *mkc, unsigned int ndescs)
 
 static void tmp_cache_mkey(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr)
 {
-	struct mlx5_mr_cache *cache = &dev->cache;
+	struct mlx5_mkey_cache *cache = &dev->cache;
 	struct ib_umem *umem = mr->umem;
 	struct mlx5_cache_ent *ent;
 	void *mkc;
@@ -2150,7 +2152,7 @@ static void tmp_cache_mkey(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr)
 		mutex_unlock(&cache->cache_lock);
 		return;
 	}
-	ent->total_mrs++;
+	ent->total_mkeys++;
 	xa_unlock_irq(&ent->mkeys);
 	queue_delayed_work(cache->wq, &cache->remove_ent_dwork,
 			   msecs_to_jiffies(30 * 1000));
@@ -2209,7 +2211,7 @@ int mlx5_ib_dereg_mr(struct ib_mr *ibmr, struct ib_udata *udata)
 	if (mr->cache_ent) {
 		if (revoke_mr(mr) || push_reserve_mkey(mr->cache_ent)) {
 			xa_lock_irq(&mr->cache_ent->mkeys);
-			mr->cache_ent->total_mrs--;
+			mr->cache_ent->total_mkeys--;
 			xa_unlock_irq(&mr->cache_ent->mkeys);
 			mr->cache_ent = NULL;
 		}
@@ -2232,7 +2234,7 @@ int mlx5_ib_dereg_mr(struct ib_mr *ibmr, struct ib_udata *udata)
 	}
 
 	if (mr->cache_ent)
-		mlx5_mr_cache_free(dev, mr);
+		mlx5_mkey_cache_free(dev, mr);
 	else
 		mlx5_free_priv_descs(mr);
 
diff --git a/drivers/infiniband/hw/mlx5/odp.c b/drivers/infiniband/hw/mlx5/odp.c
index 3d86a448ec97..25328abaedc9 100644
--- a/drivers/infiniband/hw/mlx5/odp.c
+++ b/drivers/infiniband/hw/mlx5/odp.c
@@ -434,8 +434,8 @@ static struct mlx5_ib_mr *implicit_get_child_mr(struct mlx5_ib_mr *imr,
 			   mlx5_acc_flags_to_ent_flags(dev, imr->access_flags),
 			   MLX5_MKC_ACCESS_MODE_MTT, PAGE_SHIFT);
 
-	mr = mlx5_mr_cache_alloc(dev, in, inlen, MLX5_IMR_MTT_ENTRIES,
-				 MLX5_MKC_ACCESS_MODE_MTT, true);
+	mr = mlx5_mkey_cache_alloc(dev, in, inlen, MLX5_IMR_MTT_ENTRIES,
+				   MLX5_MKC_ACCESS_MODE_MTT, true);
 	if (IS_ERR(mr)) {
 		ib_umem_odp_release(odp);
 		kfree(in);
@@ -528,8 +528,8 @@ struct mlx5_ib_mr *mlx5_ib_alloc_implicit_mr(struct mlx5_ib_pd *pd,
 			   mlx5_acc_flags_to_ent_flags(dev, access_flags),
 			   MLX5_MKC_ACCESS_MODE_KSM, PAGE_SHIFT);
 
-	imr = mlx5_mr_cache_alloc(dev, in, inlen, mlx5_imr_ksm_entries,
-				  MLX5_MKC_ACCESS_MODE_KSM, true);
+	imr = mlx5_mkey_cache_alloc(dev, in, inlen, mlx5_imr_ksm_entries,
+				    MLX5_MKC_ACCESS_MODE_KSM, true);
 	if (IS_ERR(imr)) {
 		ib_umem_odp_release(umem_odp);
 		kfree(in);
diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h
index c33f71134136..51b30c11116e 100644
--- a/include/linux/mlx5/driver.h
+++ b/include/linux/mlx5/driver.h
@@ -699,7 +699,7 @@ enum {
 };
 
 enum {
-	MAX_MR_CACHE_ENTRIES = 21,
+	MAX_MKEY_CACHE_ENTRIES = 21,
 };
 
 struct mlx5_profile {
@@ -708,7 +708,7 @@ struct mlx5_profile {
 	struct {
 		int	size;
 		int	limit;
-	} mr_cache[MAX_MR_CACHE_ENTRIES];
+	} mr_cache[MAX_MKEY_CACHE_ENTRIES];
 };
 
 struct mlx5_hca_cap {
-- 
2.33.1


^ permalink raw reply related	[flat|nested] 15+ messages in thread

* Re: [PATCH rdma-next 2/7] RDMA/mlx5: Replace cache list with Xarray
  2021-12-06  9:10 ` [PATCH rdma-next 2/7] RDMA/mlx5: Replace cache list with Xarray Leon Romanovsky
@ 2021-12-08 16:46   ` Jason Gunthorpe
  2021-12-09  8:03     ` Leon Romanovsky
  2021-12-09  8:21     ` Aharon Landau
  0 siblings, 2 replies; 15+ messages in thread
From: Jason Gunthorpe @ 2021-12-08 16:46 UTC (permalink / raw)
  To: Leon Romanovsky; +Cc: Aharon Landau, linux-kernel, linux-rdma

> @@ -166,14 +169,14 @@ static void create_mkey_callback(int status, struct mlx5_async_work *context)
>  
>  	WRITE_ONCE(dev->cache.last_add, jiffies);
>  
> -	spin_lock_irqsave(&ent->lock, flags);
> -	list_add_tail(&mr->list, &ent->head);
> -	ent->available_mrs++;
> +	xa_lock_irqsave(&ent->mkeys, flags);
> +	xa_ent = __xa_store(&ent->mkeys, ent->stored++, mr, GFP_ATOMIC);
> +	WARN_ON(xa_ent != NULL);
> +	ent->pending--;
>  	ent->total_mrs++;
>  	/* If we are doing fill_to_high_water then keep going. */
>  	queue_adjust_cache_locked(ent);
> -	ent->pending--;
> -	spin_unlock_irqrestore(&ent->lock, flags);
> +	xa_unlock_irqrestore(&ent->mkeys, flags);
>  }
>  
>  static struct mlx5_ib_mr *alloc_cache_mr(struct mlx5_cache_ent *ent, void *mkc)
> @@ -196,6 +199,25 @@ static struct mlx5_ib_mr *alloc_cache_mr(struct mlx5_cache_ent *ent, void *mkc)
>  	return mr;
>  }
>  
> +static int _push_reserve_mkey(struct mlx5_cache_ent *ent)
> +{
> +	unsigned long to_reserve;
> +	int rc;
> +
> +	while (true) {
> +		to_reserve = ent->reserved;
> +		rc = xa_err(__xa_cmpxchg(&ent->mkeys, to_reserve, NULL,
> +					 XA_ZERO_ENTRY, GFP_KERNEL));
> +		if (rc)
> +			return rc;

What about when old != NULL ?

> +		if (to_reserve != ent->reserved)
> +			continue;

There is an edge case where reserved could have shrunk a lot
while the lock was released, and xa_cmpxchg could succeed. The above
if will protect things, however a ZERO_ENTRY will have been written to
some weird place in the XA. It needs a 

 if (old == NULL) // ie we stored something someplace weird
    __xa_erase(&ent->mkeys, to_reserve)

> +		ent->reserved++;
> +		break;
> +	}
> +	return 0;
> +}
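
Folding both of those points into the loop would give roughly the
following (a sketch only; it assumes __xa_cmpxchg() may drop and
re-take the mkeys lock to allocate, which is why ent->reserved can
move underneath us):

 static int _push_reserve_mkey(struct mlx5_cache_ent *ent)
 {
 	void *old;

 	while (true) {
 		unsigned long to_reserve = ent->reserved;

 		old = __xa_cmpxchg(&ent->mkeys, to_reserve, NULL,
 				   XA_ZERO_ENTRY, GFP_KERNEL);
 		if (xa_is_err(old))
 			return xa_err(old);
 		if (old) {
 			/* Slot was not empty; nothing was stored, retry. */
 			continue;
 		}
 		if (to_reserve != ent->reserved) {
 			/*
 			 * reserved moved while the lock was dropped, so the
 			 * ZERO_ENTRY landed in the wrong slot; undo it and
 			 * retry.
 			 */
 			__xa_erase(&ent->mkeys, to_reserve);
 			continue;
 		}
 		ent->reserved++;
 		return 0;
 	}
 }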
> +
>  /* Asynchronously schedule new MRs to be populated in the cache. */
>  static int add_keys(struct mlx5_cache_ent *ent, unsigned int num)
>  {
> @@ -217,23 +239,32 @@ static int add_keys(struct mlx5_cache_ent *ent, unsigned int num)
>  			err = -ENOMEM;
>  			break;
>  		}
> -		spin_lock_irq(&ent->lock);
> +		xa_lock_irq(&ent->mkeys);
>  		if (ent->pending >= MAX_PENDING_REG_MR) {
> +			xa_unlock_irq(&ent->mkeys);
>  			err = -EAGAIN;
> -			spin_unlock_irq(&ent->lock);
> +			kfree(mr);
> +			break;
> +		}
> +
> +		err = _push_reserve_mkey(ent);

The test of ent->pending is out of date now since this can drop the
lock

It feels like pending and (reserved - stored) are really the same
thing, so maybe just directly limit the number of reserved and test it
after
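
Something along those lines for add_keys(), re-checking after the
reservation, could look like this (a sketch; it assumes
reserved - stored is an accurate count of in-flight creations, which
would make the separate pending counter redundant):

 	xa_lock_irq(&ent->mkeys);
 	err = _push_reserve_mkey(ent);	/* may drop and re-take the lock */
 	if (!err && ent->reserved - ent->stored > MAX_PENDING_REG_MR) {
 		/* Too many creations already in flight; undo the reservation. */
 		__xa_erase(&ent->mkeys, --ent->reserved);
 		err = -EAGAIN;
 	}
 	xa_unlock_irq(&ent->mkeys);
 	if (err) {
 		kfree(mr);
 		break;
 	}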

> @@ -287,39 +318,37 @@ static void remove_cache_mr_locked(struct mlx5_cache_ent *ent)
>  {
>  	struct mlx5_ib_mr *mr;
>  
> -	lockdep_assert_held(&ent->lock);
> -	if (list_empty(&ent->head))
> +	if (!ent->stored)
>  		return;
> -	mr = list_first_entry(&ent->head, struct mlx5_ib_mr, list);
> -	list_del(&mr->list);
> -	ent->available_mrs--;

> +	mr = __xa_store(&ent->mkeys, --ent->stored, NULL, GFP_KERNEL);
> +	WARN_ON(mr == NULL || xa_is_err(mr));

Add an if (reserved != stored) before the below?

> +	WARN_ON(__xa_erase(&ent->mkeys, --ent->reserved) != NULL);

Also please avoid writing WARN_ON(something with side effects)

  old = __xa_erase(&ent->mkeys, --ent->reserved);
  WARN_ON(old != NULL);

Same for all places

>  static int resize_available_mrs(struct mlx5_cache_ent *ent, unsigned int target,
>  				bool limit_fill)
> +	 __acquires(&ent->lock) __releases(&ent->lock)

Why?

>  {
>  	int err;
>  
> -	lockdep_assert_held(&ent->lock);
> -

Why?

>  static void clean_keys(struct mlx5_ib_dev *dev, int c)
>  {
>  	struct mlx5_mr_cache *cache = &dev->cache;
>  	struct mlx5_cache_ent *ent = &cache->ent[c];
> -	struct mlx5_ib_mr *tmp_mr;
>  	struct mlx5_ib_mr *mr;
> -	LIST_HEAD(del_list);
> +	unsigned long index;
>  
>  	cancel_delayed_work(&ent->dwork);
> -	while (1) {
> -		spin_lock_irq(&ent->lock);
> -		if (list_empty(&ent->head)) {
> -			spin_unlock_irq(&ent->lock);
> -			break;
> -		}
> -		mr = list_first_entry(&ent->head, struct mlx5_ib_mr, list);
> -		list_move(&mr->list, &del_list);
> -		ent->available_mrs--;
> +	xa_for_each(&ent->mkeys, index, mr) {

This isn't quite the same thing; the above tolerates concurrent add,
this does not.

It should be more like

while (ent->stored) {
   mr = __xa_erase(stored--);
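
Spelled out a bit more, that shape would be roughly (a sketch; it
assumes the entries are still struct mlx5_ib_mr pointers at this point
in the series and keeps whatever destroy/free step the list-based code
used):

 	xa_lock_irq(&ent->mkeys);
 	while (ent->stored) {
 		mr = __xa_erase(&ent->mkeys, --ent->stored);
 		ent->total_mrs--;
 		xa_unlock_irq(&ent->mkeys);
 		/* destroy the mkey and free mr as the old list walk did */
 		xa_lock_irq(&ent->mkeys);
 	}
 	xa_unlock_irq(&ent->mkeys);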

> @@ -1886,6 +1901,17 @@ mlx5_free_priv_descs(struct mlx5_ib_mr *mr)
>  	}
>  }
>  
> +static int push_reserve_mkey(struct mlx5_cache_ent *ent)
> +{
> +	int ret;
> +
> +	xa_lock_irq(&ent->mkeys);
> +	ret = _push_reserve_mkey(ent);
> +	xa_unlock_irq(&ent->mkeys);
> +
> +	return ret;
> +}

Put this close to _push_reserve_mkey() please

Jason

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH rdma-next 3/7] RDMA/mlx5: Store in the cache mkeys instead of mrs
  2021-12-06  9:10 ` [PATCH rdma-next 3/7] RDMA/mlx5: Store in the cache mkeys instead of mrs Leon Romanovsky
@ 2021-12-08 20:44   ` Jason Gunthorpe
  0 siblings, 0 replies; 15+ messages in thread
From: Jason Gunthorpe @ 2021-12-08 20:44 UTC (permalink / raw)
  To: Leon Romanovsky; +Cc: Aharon Landau, linux-kernel, linux-rdma

On Mon, Dec 06, 2021 at 11:10:48AM +0200, Leon Romanovsky wrote:
> From: Aharon Landau <aharonl@nvidia.com>
> 
> Currently, the driver stores the entire mlx5_ib_mr struct in the cache,
> although the only use of the cached MR is the mkey. Store only the mkey
> in the cache.
> 
> Signed-off-by: Aharon Landau <aharonl@nvidia.com>
> Signed-off-by: Leon Romanovsky <leonro@nvidia.com>
>  drivers/infiniband/hw/mlx5/mlx5_ib.h |  21 ++---
>  drivers/infiniband/hw/mlx5/mr.c      | 135 +++++++++++++--------------
>  2 files changed, 71 insertions(+), 85 deletions(-)
> 
> diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h b/drivers/infiniband/hw/mlx5/mlx5_ib.h
> index d0224f468169..9b12e970ca01 100644
> +++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h
> @@ -668,14 +668,6 @@ struct mlx5_ib_mr {

Shouldn't this delete cache_ent too?

Jason

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH rdma-next 4/7] RDMA/mlx5: Change the cache structure to an RB-tree
  2021-12-06  9:10 ` [PATCH rdma-next 4/7] RDMA/mlx5: Change the cache structure to an RB-tree Leon Romanovsky
@ 2021-12-08 20:58   ` Jason Gunthorpe
  0 siblings, 0 replies; 15+ messages in thread
From: Jason Gunthorpe @ 2021-12-08 20:58 UTC (permalink / raw)
  To: Leon Romanovsky; +Cc: Aharon Landau, linux-kernel, linux-rdma

On Mon, Dec 06, 2021 at 11:10:49AM +0200, Leon Romanovsky wrote:

> +static struct mlx5_cache_ent *ent_search(struct mlx5_mr_cache *cache, void *mkc)

mlx5_cache_ent_find() ?

This search isn't really what I would expect; it should stop if it
finds a (somewhat) larger and otherwise compatible entry instead of
continuing to bin on power of two sizes.
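
For illustration, a lookup that stops at the smallest entry that is
big enough could look roughly like this (a sketch under the assumption
that the tree is keyed first by the access properties and then by
size; compare_access_props() is a made-up helper, not something in the
patch):

 static struct mlx5_cache_ent *
 mlx5_cache_ent_find(struct mlx5_mr_cache *cache, void *mkc,
 		    unsigned int ndescs)
 {
 	struct rb_node *node = cache->cache_root.rb_node;
 	struct mlx5_cache_ent *cur, *best = NULL;
 	int cmp;

 	while (node) {
 		cur = rb_entry(node, struct mlx5_cache_ent, node);
 		cmp = compare_access_props(mkc, cur->mkc);
 		if (cmp < 0) {
 			node = node->rb_left;
 		} else if (cmp > 0) {
 			node = node->rb_right;
 		} else if (cur->ndescs >= ndescs) {
 			/* Big enough; remember it, then look for a tighter fit. */
 			best = cur;
 			node = node->rb_left;
 		} else {
 			node = node->rb_right;
 		}
 	}
 	return best;
 }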

> +	struct rb_node *node = cache->cache_root.rb_node;
> +	int size = MLX5_ST_SZ_BYTES(mkc);

size_t not int

> +	mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry);
> +	if (dev->cache.maintained_cache && !force) {
> +		int order;

unsigned int

> +		/*
> +		 * Try to get an mkey from pool.
> +		 */
> +		order = order_base_2(ndescs) > 2 ? order_base_2(ndescs) : 2;
> +		MLX5_SET(mkc, mkc, translations_octword_size,
> +			 get_mkc_octo_size(access_mode, 1 << order));
> +		mutex_lock(&dev->cache.cache_lock);
> +		ent = ent_search(&dev->cache, mkc);
> +		mutex_unlock(&dev->cache.cache_lock);

Why is it OK to drop the lock here? What is protecting the ent pointer
against free?

> +	if (ent && (ent->limit || force)) {

What locking protects ent->limit ?

> +		xa_lock_irq(&ent->mkeys);
> +		if (!ent->stored) {
> +			if (ent->limit) {
> +				queue_adjust_cache_locked(ent);
> +				ent->miss++;
> +			}
> +			xa_unlock_irq(&ent->mkeys);
> +
> +			err = mlx5_ib_create_mkey(dev, &mr->mmkey, in, inlen);
> +			if (err)
> +				goto err;

So if there is no entry we create a bigger one rounded up to pow2?

> +
> +			WRITE_ONCE(ent->dev->cache.last_add, jiffies);
> +			xa_lock_irq(&ent->mkeys);
> +			ent->total_mrs++;
> +			xa_unlock_irq(&ent->mkeys);

May as well optimize for the normal case, do the total_mrs++ before
create_mkey while we still have the lock and undo it if it fails. It
is just minor stat reporting right?
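
In code that reordering would be something like (sketch, keeping the
surrounding flow as is):

 			ent->total_mrs++;
 			xa_unlock_irq(&ent->mkeys);
 			err = mlx5_ib_create_mkey(dev, &mr->mmkey, in, inlen);
 			if (err) {
 				xa_lock_irq(&ent->mkeys);
 				ent->total_mrs--;
 				xa_unlock_irq(&ent->mkeys);
 				goto err;
 			}
 			WRITE_ONCE(ent->dev->cache.last_add, jiffies);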

> +		} else {
> +			xa_ent = __xa_store(&ent->mkeys, --ent->stored,
> +					  NULL, GFP_KERNEL);
> +			WARN_ON(xa_ent == NULL || xa_is_err(xa_ent));
> +			WARN_ON(__xa_erase(&ent->mkeys, --ent->reserved) !=
> +				NULL);

This whole bigger block wants to be in its own function 'mlx5_ent_get_mkey()'

Then you can write it simpler without all the duplicated error
handling and deep indenting
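
As a rough illustration of the split (a sketch; the bookkeeping just
mirrors the hunk above, and only the name comes from the suggestion):

 static bool mlx5_ent_get_mkey(struct mlx5_cache_ent *ent, u32 *mkey)
 {
 	void *xa_ent;

 	lockdep_assert_held(&ent->mkeys.xa_lock);
 	if (!ent->stored)
 		return false;

 	xa_ent = __xa_store(&ent->mkeys, --ent->stored, NULL, GFP_KERNEL);
 	__xa_erase(&ent->mkeys, --ent->reserved);
 	queue_adjust_cache_locked(ent);
 	*mkey = (u32)xa_to_value(xa_ent);
 	return true;
 }

The caller in mlx5_mr_cache_alloc() could then pick between this and
the synchronous mlx5_ib_create_mkey() fallback without the nested ifs
and the duplicated unlock paths.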

>  			queue_adjust_cache_locked(ent);
> -			ent->miss++;
> -		}
> -		xa_unlock_irq(&ent->mkeys);
> -		err = create_cache_mkey(ent, &mr->mmkey.key);
> -		if (err) {
> -			kfree(mr);
> -			return ERR_PTR(err);
> +			xa_unlock_irq(&ent->mkeys);
> +			mr->mmkey.key = (u32)xa_to_value(xa_ent);
>  		}
> +		mr->cache_ent = ent;
>  	} else {
> -		mr = __xa_store(&ent->mkeys, --ent->stored, NULL,
> -				GFP_KERNEL);
> -		WARN_ON(mr == NULL || xa_is_err(mr));
> -		WARN_ON(__xa_erase(&ent->mkeys, --ent->reserved) != NULL);
> -		queue_adjust_cache_locked(ent);
> -		xa_unlock_irq(&ent->mkeys);
> -
> -		mr->mmkey.key = (u32)xa_to_value(xa_ent);
> +		/*
> +		 * Can not use a cache mkey.
> +		 * Create an mkey with the exact needed size.
> +		 */
> +		MLX5_SET(mkc, mkc, translations_octword_size,
> +			 get_mkc_octo_size(access_mode, ndescs));
> +		err = mlx5_ib_create_mkey(dev, &mr->mmkey, in, inlen);
> +		if (err)
> +			goto err;
>  	}

I think this needs to be broken into functions; it is hard to read with
all these ifs and inlined locking

>  
> +static int ent_insert(struct mlx5_mr_cache *cache, struct mlx5_cache_ent *ent)
> +{
> +	struct rb_node **new = &cache->cache_root.rb_node, *parent = NULL;
> +	int size = MLX5_ST_SZ_BYTES(mkc);
> +	struct mlx5_cache_ent *cur;
> +	int cmp;
> +
> +	/* Figure out where to put new node */
> +	while (*new) {
> +		cur = container_of(*new, struct mlx5_cache_ent, node);
> +		parent = *new;
> +		cmp = memcmp(ent->mkc, cur->mkc, size);
> +		if (cmp < 0)
> +			new = &((*new)->rb_left);
> +		else if (cmp > 0)
> +			new = &((*new)->rb_right);
> +		else
> +			return -EEXIST;
> +	}
> +
> +	/* Add new node and rebalance tree. */
> +	rb_link_node(&ent->node, parent, new);
> +	rb_insert_color(&ent->node, &cache->cache_root);
> +
> +	return 0;
> +}

This should be near the find

> +
> +static struct mlx5_cache_ent *mlx5_ib_create_cache_ent(struct mlx5_ib_dev *dev,
> +						       int order)
> +{
> +	struct mlx5_cache_ent *ent;
> +	int ret;
> +
> +	ent = kzalloc(sizeof(*ent), GFP_KERNEL);
> +	if (!ent)
> +		return ERR_PTR(-ENOMEM);
> +
> +	ent->mkc = kzalloc(MLX5_ST_SZ_BYTES(mkc), GFP_KERNEL);
> +	if (!ent->mkc) {
> +		kfree(ent);
> +		return ERR_PTR(-ENOMEM);
> +	}
> +
> +	ent->ndescs = 1 << order;
> +	mlx5_set_cache_mkc(dev, ent->mkc, 0, MLX5_MKC_ACCESS_MODE_MTT,
> +			   PAGE_SHIFT);
> +	MLX5_SET(mkc, ent->mkc, translations_octword_size,
> +		 get_mkc_octo_size(MLX5_MKC_ACCESS_MODE_MTT, ent->ndescs));
> +	mutex_lock(&dev->cache.cache_lock);
> +	ret = ent_insert(&dev->cache, ent);
> +	mutex_unlock(&dev->cache.cache_lock);
> +	if (ret) {
> +		kfree(ent->mkc);
> +		kfree(ent);
> +		return ERR_PTR(ret);
> +	}
> +
> +	xa_init_flags(&ent->mkeys, XA_FLAGS_ALLOC | XA_FLAGS_LOCK_IRQ);
> +	ent->dev = dev;
> +
> +	INIT_WORK(&ent->work, cache_work_func);
> +	INIT_DELAYED_WORK(&ent->dwork, delayed_cache_work_func);

And the ent should be fully set up before adding it to the cache

Jason

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH rdma-next 2/7] RDMA/mlx5: Replace cache list with Xarray
  2021-12-08 16:46   ` Jason Gunthorpe
@ 2021-12-09  8:03     ` Leon Romanovsky
  2021-12-09  8:21     ` Aharon Landau
  1 sibling, 0 replies; 15+ messages in thread
From: Leon Romanovsky @ 2021-12-09  8:03 UTC (permalink / raw)
  To: Jason Gunthorpe; +Cc: Aharon Landau, linux-kernel, linux-rdma

On Wed, Dec 08, 2021 at 12:46:11PM -0400, Jason Gunthorpe wrote:
> > @@ -166,14 +169,14 @@ static void create_mkey_callback(int status, struct mlx5_async_work *context)

<...>

> >  static int resize_available_mrs(struct mlx5_cache_ent *ent, unsigned int target,
> >  				bool limit_fill)
> > +	 __acquires(&ent->lock) __releases(&ent->lock)
> 
> Why?

This is mine; I got tons of complaints from sparse [1] because of this commit [2].

[1] drivers/infiniband/hw/mlx5/mr.c:280:25: warning: context imbalance in 'resize_available_mkeys' - unexpected unlock
[2] https://lwn.net/Articles/109066/

Thanks

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH rdma-next 2/7] RDMA/mlx5: Replace cache list with Xarray
  2021-12-08 16:46   ` Jason Gunthorpe
  2021-12-09  8:03     ` Leon Romanovsky
@ 2021-12-09  8:21     ` Aharon Landau
  2021-12-09 17:56       ` Jason Gunthorpe
  1 sibling, 1 reply; 15+ messages in thread
From: Aharon Landau @ 2021-12-09  8:21 UTC (permalink / raw)
  To: Jason Gunthorpe, Leon Romanovsky; +Cc: linux-kernel, linux-rdma


On 12/8/2021 6:46 PM, Jason Gunthorpe wrote:
>> @@ -166,14 +169,14 @@ static void create_mkey_callback(int status, struct mlx5_async_work *context)
>>   
>>   	WRITE_ONCE(dev->cache.last_add, jiffies);
>>   
>> -	spin_lock_irqsave(&ent->lock, flags);
>> -	list_add_tail(&mr->list, &ent->head);
>> -	ent->available_mrs++;
>> +	xa_lock_irqsave(&ent->mkeys, flags);
>> +	xa_ent = __xa_store(&ent->mkeys, ent->stored++, mr, GFP_ATOMIC);
>> +	WARN_ON(xa_ent != NULL);
>> +	ent->pending--;
>>   	ent->total_mrs++;
>>   	/* If we are doing fill_to_high_water then keep going. */
>>   	queue_adjust_cache_locked(ent);
>> -	ent->pending--;
>> -	spin_unlock_irqrestore(&ent->lock, flags);
>> +	xa_unlock_irqrestore(&ent->mkeys, flags);
>>   }
>>   
>>   static struct mlx5_ib_mr *alloc_cache_mr(struct mlx5_cache_ent *ent, void *mkc)
>> @@ -196,6 +199,25 @@ static struct mlx5_ib_mr *alloc_cache_mr(struct mlx5_cache_ent *ent, void *mkc)
>>   	return mr;
>>   }
>>   
>> +static int _push_reserve_mkey(struct mlx5_cache_ent *ent)
>> +{
>> +	unsigned long to_reserve;
>> +	int rc;
>> +
>> +	while (true) {
>> +		to_reserve = ent->reserved;
>> +		rc = xa_err(__xa_cmpxchg(&ent->mkeys, to_reserve, NULL,
>> +					 XA_ZERO_ENTRY, GFP_KERNEL));
>> +		if (rc)
>> +			return rc;
> What about when old != NULL ?
>
>> +		if (to_reserve != ent->reserved)
>> +			continue;
> There is an edge case where reserved could have shrunk a lot
> while the lock was released, and xa_cmpxchg could succeed. The above
> if will protect things, however a ZERO_ENTRY will have been written to
> some weird place in the XA. It needs a
>
>   if (old == NULL) // ie we stored something someplace weird
>      __xa_erase(&ent->mkeys, to_reserve)
>
>> +		ent->reserved++;
>> +		break;
>> +	}
>> +	return 0;
>> +}
>> +
>>   /* Asynchronously schedule new MRs to be populated in the cache. */
>>   static int add_keys(struct mlx5_cache_ent *ent, unsigned int num)
>>   {
>> @@ -217,23 +239,32 @@ static int add_keys(struct mlx5_cache_ent *ent, unsigned int num)
>>   			err = -ENOMEM;
>>   			break;
>>   		}
>> -		spin_lock_irq(&ent->lock);
>> +		xa_lock_irq(&ent->mkeys);
>>   		if (ent->pending >= MAX_PENDING_REG_MR) {
>> +			xa_unlock_irq(&ent->mkeys);
>>   			err = -EAGAIN;
>> -			spin_unlock_irq(&ent->lock);
>> +			kfree(mr);
>> +			break;
>> +		}
>> +
>> +		err = _push_reserve_mkey(ent);
> The test of ent->pending is out of date now since this can drop the
> lock
>
> It feels like pending and (reserved - stored) are really the same
> thing, so maybe just directly limit the number of reserved and test it
> after
The mlx5_ib_dereg_mr is reserving entries as well. Should I limit 
create_mkey_cb due to pending deregs?
>> @@ -287,39 +318,37 @@ static void remove_cache_mr_locked(struct mlx5_cache_ent *ent)
>>   {
>>   	struct mlx5_ib_mr *mr;
>>   
>> -	lockdep_assert_held(&ent->lock);
>> -	if (list_empty(&ent->head))
>> +	if (!ent->stored)
>>   		return;
>> -	mr = list_first_entry(&ent->head, struct mlx5_ib_mr, list);
>> -	list_del(&mr->list);
>> -	ent->available_mrs--;
>> +	mr = __xa_store(&ent->mkeys, --ent->stored, NULL, GFP_KERNEL);
>> +	WARN_ON(mr == NULL || xa_is_err(mr));
> Add an if (reserved != stored) before the below?
I initialized the xarray using XA_FLAGS_ALLOC; therefore, the __xa_store
above will mark the entry as a ZERO_ENTRY.
>
>> +	WARN_ON(__xa_erase(&ent->mkeys, --ent->reserved) != NULL);
> Also please avoid writing WARN_ON(something with side effects)
>
>    old = __xa_erase(&ent->mkeys, --ent->reserved);
>    WARN_ON(old != NULL);
>
> Same for all places
>
>>   static int resize_available_mrs(struct mlx5_cache_ent *ent, unsigned int target,
>>   				bool limit_fill)
>> +	 __acquires(&ent->lock) __releases(&ent->lock)
> Why?
>
>>   {
>>   	int err;
>>   
>> -	lockdep_assert_held(&ent->lock);
>> -
> Why?
>
>>   static void clean_keys(struct mlx5_ib_dev *dev, int c)
>>   {
>>   	struct mlx5_mr_cache *cache = &dev->cache;
>>   	struct mlx5_cache_ent *ent = &cache->ent[c];
>> -	struct mlx5_ib_mr *tmp_mr;
>>   	struct mlx5_ib_mr *mr;
>> -	LIST_HEAD(del_list);
>> +	unsigned long index;
>>   
>>   	cancel_delayed_work(&ent->dwork);
>> -	while (1) {
>> -		spin_lock_irq(&ent->lock);
>> -		if (list_empty(&ent->head)) {
>> -			spin_unlock_irq(&ent->lock);
>> -			break;
>> -		}
>> -		mr = list_first_entry(&ent->head, struct mlx5_ib_mr, list);
>> -		list_move(&mr->list, &del_list);
>> -		ent->available_mrs--;
>> +	xa_for_each(&ent->mkeys, index, mr) {
> This isn't quite the same thing, the above tolerates concurrent add,
> this does not.
>
> It should be more like
>
> while (ent->stored) {
>     mr = __xa_erase(stored--);
>
>> @@ -1886,6 +1901,17 @@ mlx5_free_priv_descs(struct mlx5_ib_mr *mr)
>>   	}
>>   }
>>   
>> +static int push_reserve_mkey(struct mlx5_cache_ent *ent)
>> +{
>> +	int ret;
>> +
>> +	xa_lock_irq(&ent->mkeys);
>> +	ret = _push_reserve_mkey(ent);
>> +	xa_unlock_irq(&ent->mkeys);
>> +
>> +	return ret;
>> +}
> Put this close to _push_reserve_mkey() please
>
> Jason

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH rdma-next 0/7] MR cache enhancement
  2021-12-06  9:10 [PATCH rdma-next 0/7] MR cache enhancement Leon Romanovsky
                   ` (6 preceding siblings ...)
  2021-12-06  9:10 ` [PATCH rdma-next 7/7] RDMA/mlx5: Rename the mkey cache variables and functions Leon Romanovsky
@ 2021-12-09  8:40 ` Leon Romanovsky
  7 siblings, 0 replies; 15+ messages in thread
From: Leon Romanovsky @ 2021-12-09  8:40 UTC (permalink / raw)
  To: Jason Gunthorpe; +Cc: Aharon Landau, linux-rdma

On Mon, Dec 06, 2021 at 11:10:45AM +0200, Leon Romanovsky wrote:
> From: Leon Romanovsky <leonro@nvidia.com>
> 
> Hi,
> 
> This series from Aharon refactors mlx5 MR cache management logic to
> speedup deregistration significantly.
> 
> Thanks
> 
> Aharon Landau (7):
>   RDMA/mlx5: Merge similar flows of allocating MR from the cache
>   RDMA/mlx5: Replace cache list with Xarray
>   RDMA/mlx5: Store in the cache mkeys instead of mrs
>   RDMA/mlx5: Change the cache structure to an RB-tree
>   RDMA/mlx5: Reorder calls to pcie_relaxed_ordering_enabled()
>   RDMA/mlx5: Delay the deregistration of a non-cache mkey
>   RDMA/mlx5: Rename the mkey cache variables and functions

Let me resubmit it after a deeper review.

Thanks

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH rdma-next 2/7] RDMA/mlx5: Replace cache list with Xarray
  2021-12-09  8:21     ` Aharon Landau
@ 2021-12-09 17:56       ` Jason Gunthorpe
  0 siblings, 0 replies; 15+ messages in thread
From: Jason Gunthorpe @ 2021-12-09 17:56 UTC (permalink / raw)
  To: Aharon Landau; +Cc: Leon Romanovsky, linux-kernel, linux-rdma

On Thu, Dec 09, 2021 at 10:21:22AM +0200, Aharon Landau wrote:

> > It feels like pending and (reserved - stored) are really the same
> > thing, so maybe just directly limit the number of reserved and test it
> > after
> The mlx5_ib_dereg_mr is reserving entries as well. Should I limit
> create_mkey_cb due to pending deregs?

Sure, it is the same concept, deregs are going to refill things just
as well as new creations.
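
Concretely, something along these lines in add_keys() (just a sketch using
the names from your patch; the re-check is there because
_push_reserve_mkey() can drop the lock while allocating):

		xa_lock_irq(&ent->mkeys);
		if (ent->reserved - ent->stored >= MAX_PENDING_REG_MR) {
			err = -EAGAIN;
		} else {
			err = _push_reserve_mkey(ent);
			/* re-check, the lock may have been dropped */
			if (!err &&
			    ent->reserved - ent->stored > MAX_PENDING_REG_MR) {
				__xa_erase(&ent->mkeys, --ent->reserved);
				err = -EAGAIN;
			}
		}
		xa_unlock_irq(&ent->mkeys);
		if (err) {
			kfree(mr);
			break;
		}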

> > > @@ -287,39 +318,37 @@ static void remove_cache_mr_locked(struct mlx5_cache_ent *ent)
> > >   {
> > >   	struct mlx5_ib_mr *mr;
> > > -	lockdep_assert_held(&ent->lock);
> > > -	if (list_empty(&ent->head))
> > > +	if (!ent->stored)
> > >   		return;
> > > -	mr = list_first_entry(&ent->head, struct mlx5_ib_mr, list);
> > > -	list_del(&mr->list);
> > > -	ent->available_mrs--;
> > > +	mr = __xa_store(&ent->mkeys, --ent->stored, NULL, GFP_KERNEL);
> > > +	WARN_ON(mr == NULL || xa_is_err(mr));
> > Add a if (reserved != stored)  before the below?
> I initiated the xarray using XA_FLAGS_ALLOC, therefore, the __xa_store above
> will mark the entry as ZERO_ENTRY.

Oh, do not use XA_FLAGS_ALLOC, this is not what it is for.
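
Reserving entries is what xa_reserve()/XA_ZERO_ENTRY are for. Under the
xa_lock it would be something like this (sketch only, reusing your
'reserved' counter; readers still see NULL for the zero entry):

	void *old;

	old = __xa_cmpxchg(&ent->mkeys, ent->reserved, NULL, XA_ZERO_ENTRY,
			   GFP_KERNEL);
	if (xa_is_err(old))
		return xa_err(old);
	WARN_ON(old);
	ent->reserved++;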

Jason

^ permalink raw reply	[flat|nested] 15+ messages in thread

end of thread, other threads:[~2021-12-09 17:56 UTC | newest]

Thread overview: 15+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2021-12-06  9:10 [PATCH rdma-next 0/7] MR cache enhancement Leon Romanovsky
2021-12-06  9:10 ` [PATCH rdma-next 1/7] RDMA/mlx5: Merge similar flows of allocating MR from the cache Leon Romanovsky
2021-12-06  9:10 ` [PATCH rdma-next 2/7] RDMA/mlx5: Replace cache list with Xarray Leon Romanovsky
2021-12-08 16:46   ` Jason Gunthorpe
2021-12-09  8:03     ` Leon Romanovsky
2021-12-09  8:21     ` Aharon Landau
2021-12-09 17:56       ` Jason Gunthorpe
2021-12-06  9:10 ` [PATCH rdma-next 3/7] RDMA/mlx5: Store in the cache mkeys instead of mrs Leon Romanovsky
2021-12-08 20:44   ` Jason Gunthorpe
2021-12-06  9:10 ` [PATCH rdma-next 4/7] RDMA/mlx5: Change the cache structure to an RB-tree Leon Romanovsky
2021-12-08 20:58   ` Jason Gunthorpe
2021-12-06  9:10 ` [PATCH rdma-next 5/7] RDMA/mlx5: Reorder calls to pcie_relaxed_ordering_enabled() Leon Romanovsky
2021-12-06  9:10 ` [PATCH rdma-next 6/7] RDMA/mlx5: Delay the deregistration of a non-cache mkey Leon Romanovsky
2021-12-06  9:10 ` [PATCH rdma-next 7/7] RDMA/mlx5: Rename the mkey cache variables and functions Leon Romanovsky
2021-12-09  8:40 ` [PATCH rdma-next 0/7] MR cache enhancement Leon Romanovsky

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).