All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH for-next 0/3] Add user MR re-registration support
@ 2014-07-31  8:01 Or Gerlitz
       [not found] ` <1406793690-3816-1-git-send-email-ogerlitz-VPRAkNaXOzVWk0Htik3J/w@public.gmane.org>
  0 siblings, 1 reply; 7+ messages in thread
From: Or Gerlitz @ 2014-07-31  8:01 UTC (permalink / raw)
  To: roland-DgEjT+Ai2ygdnm+yROfE0A
  Cc: linux-rdma-u79uwXL29TY76Z2rM5mHXA, amirv-VPRAkNaXOzVWk0Htik3J/w,
	Or Gerlitz

Memory re-registration enables one to change the attributes of a 
memory region registered by user-space, including PD, translation 
(address and length) and access flags.

Matan and Or.

Matan Barak (3):
  IB/core: Add user MR re-registration support
  mlx4_core: Add helper functions to support MR re-registration
  IB/mlx4_ib: Add support for user MR re-registration

 drivers/infiniband/core/uverbs.h                   |    1 +
 drivers/infiniband/core/uverbs_cmd.c               |   93 +++++++++++
 drivers/infiniband/core/uverbs_main.c              |    1 +
 drivers/infiniband/hw/mlx4/main.c                  |    2 +
 drivers/infiniband/hw/mlx4/mlx4_ib.h               |    4 +
 drivers/infiniband/hw/mlx4/mr.c                    |   88 +++++++++++-
 drivers/net/ethernet/mellanox/mlx4/mlx4.h          |    2 +
 drivers/net/ethernet/mellanox/mlx4/mr.c            |  160 ++++++++++++++++++++
 .../net/ethernet/mellanox/mlx4/resource_tracker.c  |   26 +++-
 include/linux/mlx4/device.h                        |   16 ++
 include/rdma/ib_verbs.h                            |   10 +-
 include/uapi/rdma/ib_user_verbs.h                  |   16 ++
 12 files changed, 415 insertions(+), 4 deletions(-)

--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 7+ messages in thread

* [PATCH for-next 1/3] IB/core: Add user MR re-registration support
       [not found] ` <1406793690-3816-1-git-send-email-ogerlitz-VPRAkNaXOzVWk0Htik3J/w@public.gmane.org>
@ 2014-07-31  8:01   ` Or Gerlitz
  2014-07-31  8:01   ` [PATCH for-next 2/3] mlx4_core: Add helper functions to support MR re-registration Or Gerlitz
  2014-07-31  8:01   ` [PATCH for-next 3/3] IB/mlx4_ib: Add support for user " Or Gerlitz
  2 siblings, 0 replies; 7+ messages in thread
From: Or Gerlitz @ 2014-07-31  8:01 UTC (permalink / raw)
  To: roland-DgEjT+Ai2ygdnm+yROfE0A
  Cc: linux-rdma-u79uwXL29TY76Z2rM5mHXA, amirv-VPRAkNaXOzVWk0Htik3J/w,
	Matan Barak, Or Gerlitz

From: Matan Barak <matanb-VPRAkNaXOzVWk0Htik3J/w@public.gmane.org>

Memory re-registration is a feature that enables one to change
the attributes of a memory region registered by user-space,
including PD, translation (address and length) and access flags.

Add the required support in uverbs and the kernel verbs API.

Signed-off-by: Matan Barak <matanb-VPRAkNaXOzVWk0Htik3J/w@public.gmane.org>
Signed-off-by: Or Gerlitz <ogerlitz-VPRAkNaXOzVWk0Htik3J/w@public.gmane.org>
---
 drivers/infiniband/core/uverbs.h      |    1 +
 drivers/infiniband/core/uverbs_cmd.c  |   93 +++++++++++++++++++++++++++++++++
 drivers/infiniband/core/uverbs_main.c |    1 +
 include/rdma/ib_verbs.h               |   10 +++-
 include/uapi/rdma/ib_user_verbs.h     |   16 ++++++
 5 files changed, 120 insertions(+), 1 deletions(-)

diff --git a/drivers/infiniband/core/uverbs.h b/drivers/infiniband/core/uverbs.h
index a283274..643c08a 100644
--- a/drivers/infiniband/core/uverbs.h
+++ b/drivers/infiniband/core/uverbs.h
@@ -221,6 +221,7 @@ IB_UVERBS_DECLARE_CMD(query_port);
 IB_UVERBS_DECLARE_CMD(alloc_pd);
 IB_UVERBS_DECLARE_CMD(dealloc_pd);
 IB_UVERBS_DECLARE_CMD(reg_mr);
+IB_UVERBS_DECLARE_CMD(rereg_mr);
 IB_UVERBS_DECLARE_CMD(dereg_mr);
 IB_UVERBS_DECLARE_CMD(alloc_mw);
 IB_UVERBS_DECLARE_CMD(dealloc_mw);
diff --git a/drivers/infiniband/core/uverbs_cmd.c b/drivers/infiniband/core/uverbs_cmd.c
index ea6203e..0600c50 100644
--- a/drivers/infiniband/core/uverbs_cmd.c
+++ b/drivers/infiniband/core/uverbs_cmd.c
@@ -1002,6 +1002,99 @@ err_free:
 	return ret;
 }
 
+ssize_t ib_uverbs_rereg_mr(struct ib_uverbs_file *file,
+			   const char __user *buf, int in_len,
+			   int out_len)
+{
+	struct ib_uverbs_rereg_mr      cmd;
+	struct ib_uverbs_rereg_mr_resp resp;
+	struct ib_udata              udata;
+	struct ib_pd                *pd = NULL;
+	struct ib_mr                *mr;
+	struct ib_pd		    *old_pd;
+	int                          ret;
+	struct ib_uobject	    *uobj;
+
+	if (out_len < sizeof(resp))
+		return -ENOSPC;
+
+	if (copy_from_user(&cmd, buf, sizeof(cmd)))
+		return -EFAULT;
+
+	INIT_UDATA(&udata, buf + sizeof(cmd),
+		   (unsigned long) cmd.response + sizeof(resp),
+		   in_len - sizeof(cmd), out_len - sizeof(resp));
+
+	if (cmd.flags & ~IB_MR_REREG_SUPPORTED || !cmd.flags)
+		return -EINVAL;
+
+	if ((cmd.flags & IB_MR_REREG_TRANS) &&
+	    (!cmd.start || !cmd.hca_va || 0 >= cmd.length ||
+	     (cmd.start & ~PAGE_MASK) != (cmd.hca_va & ~PAGE_MASK)))
+			return -EINVAL;
+
+	uobj = idr_write_uobj(&ib_uverbs_mr_idr, cmd.mr_handle,
+			      file->ucontext);
+
+	if (!uobj)
+		return -EINVAL;
+
+	mr = uobj->object;
+
+	if (cmd.flags & IB_MR_REREG_ACCESS) {
+		ret = ib_check_mr_access(cmd.access_flags);
+		if (ret)
+			goto put_uobjs;
+	}
+
+	if (cmd.flags & IB_MR_REREG_PD) {
+		pd = idr_read_pd(cmd.pd_handle, file->ucontext);
+		if (!pd) {
+			ret = -EINVAL;
+			goto put_uobjs;
+		}
+	}
+
+	if (atomic_read(&mr->usecnt)) {
+		ret = -EBUSY;
+		goto put_uobj_pd;
+	}
+
+	old_pd = mr->pd;
+	ret = mr->device->rereg_user_mr(mr, cmd.flags, cmd.start,
+					cmd.length, cmd.hca_va,
+					cmd.access_flags, pd, &udata);
+	if (!ret) {
+		if (cmd.flags & IB_MR_REREG_PD) {
+			atomic_inc(&pd->usecnt);
+			mr->pd = pd;
+			atomic_dec(&old_pd->usecnt);
+		}
+	} else {
+		goto put_uobj_pd;
+	}
+
+	memset(&resp, 0, sizeof(resp));
+	resp.lkey      = mr->lkey;
+	resp.rkey      = mr->rkey;
+
+	if (copy_to_user((void __user *)(unsigned long)cmd.response,
+			 &resp, sizeof(resp)))
+		ret = -EFAULT;
+	else
+		ret = in_len;
+
+put_uobj_pd:
+	if (cmd.flags & IB_MR_REREG_PD)
+		put_pd_read(pd);
+
+put_uobjs:
+
+	put_uobj_write(mr->uobject);
+
+	return ret;
+}
+
 ssize_t ib_uverbs_dereg_mr(struct ib_uverbs_file *file,
 			   const char __user *buf, int in_len,
 			   int out_len)
diff --git a/drivers/infiniband/core/uverbs_main.c b/drivers/infiniband/core/uverbs_main.c
index 08219fb..c73b22a 100644
--- a/drivers/infiniband/core/uverbs_main.c
+++ b/drivers/infiniband/core/uverbs_main.c
@@ -87,6 +87,7 @@ static ssize_t (*uverbs_cmd_table[])(struct ib_uverbs_file *file,
 	[IB_USER_VERBS_CMD_ALLOC_PD]		= ib_uverbs_alloc_pd,
 	[IB_USER_VERBS_CMD_DEALLOC_PD]		= ib_uverbs_dealloc_pd,
 	[IB_USER_VERBS_CMD_REG_MR]		= ib_uverbs_reg_mr,
+	[IB_USER_VERBS_CMD_REREG_MR]		= ib_uverbs_rereg_mr,
 	[IB_USER_VERBS_CMD_DEREG_MR]		= ib_uverbs_dereg_mr,
 	[IB_USER_VERBS_CMD_ALLOC_MW]		= ib_uverbs_alloc_mw,
 	[IB_USER_VERBS_CMD_DEALLOC_MW]		= ib_uverbs_dealloc_mw,
diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h
index 7ccef34..ed44cc0 100644
--- a/include/rdma/ib_verbs.h
+++ b/include/rdma/ib_verbs.h
@@ -1097,7 +1097,8 @@ struct ib_mr_attr {
 enum ib_mr_rereg_flags {
 	IB_MR_REREG_TRANS	= 1,
 	IB_MR_REREG_PD		= (1<<1),
-	IB_MR_REREG_ACCESS	= (1<<2)
+	IB_MR_REREG_ACCESS	= (1<<2),
+	IB_MR_REREG_SUPPORTED	= ((IB_MR_REREG_ACCESS << 1) - 1)
 };
 
 /**
@@ -1547,6 +1548,13 @@ struct ib_device {
 						  u64 virt_addr,
 						  int mr_access_flags,
 						  struct ib_udata *udata);
+	int			   (*rereg_user_mr)(struct ib_mr *mr,
+						    int flags,
+						    u64 start, u64 length,
+						    u64 virt_addr,
+						    int mr_access_flags,
+						    struct ib_pd *pd,
+						    struct ib_udata *udata);
 	int                        (*query_mr)(struct ib_mr *mr,
 					       struct ib_mr_attr *mr_attr);
 	int                        (*dereg_mr)(struct ib_mr *mr);
diff --git a/include/uapi/rdma/ib_user_verbs.h b/include/uapi/rdma/ib_user_verbs.h
index cbfdd4c..26daf55 100644
--- a/include/uapi/rdma/ib_user_verbs.h
+++ b/include/uapi/rdma/ib_user_verbs.h
@@ -276,6 +276,22 @@ struct ib_uverbs_reg_mr_resp {
 	__u32 rkey;
 };
 
+struct ib_uverbs_rereg_mr {
+	__u64 response;
+	__u32 mr_handle;
+	__u32 flags;
+	__u64 start;
+	__u64 length;
+	__u64 hca_va;
+	__u32 pd_handle;
+	__u32 access_flags;
+};
+
+struct ib_uverbs_rereg_mr_resp {
+	__u32 lkey;
+	__u32 rkey;
+};
+
 struct ib_uverbs_dereg_mr {
 	__u32 mr_handle;
 };
-- 
1.7.1

--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply related	[flat|nested] 7+ messages in thread

* [PATCH for-next 2/3] mlx4_core: Add helper functions to support MR re-registration
       [not found] ` <1406793690-3816-1-git-send-email-ogerlitz-VPRAkNaXOzVWk0Htik3J/w@public.gmane.org>
  2014-07-31  8:01   ` [PATCH for-next 1/3] IB/core: " Or Gerlitz
@ 2014-07-31  8:01   ` Or Gerlitz
       [not found]     ` <1406793690-3816-3-git-send-email-ogerlitz-VPRAkNaXOzVWk0Htik3J/w@public.gmane.org>
  2014-07-31  8:01   ` [PATCH for-next 3/3] IB/mlx4_ib: Add support for user " Or Gerlitz
  2 siblings, 1 reply; 7+ messages in thread
From: Or Gerlitz @ 2014-07-31  8:01 UTC (permalink / raw)
  To: roland-DgEjT+Ai2ygdnm+yROfE0A
  Cc: linux-rdma-u79uwXL29TY76Z2rM5mHXA, amirv-VPRAkNaXOzVWk0Htik3J/w,
	Matan Barak, Jack Morgenstein, Or Gerlitz

From: Matan Barak <matanb-VPRAkNaXOzVWk0Htik3J/w@public.gmane.org>

Add few helper functions to support a mechanism of getting an
MPT, modifying it and updating the HCA with the modified object.

The code takes 2 paths, one for directly changing the MPT (and sometimes
its related MTTs) and another one which queries the MPT and updates the
HCA via fw command SW2HW_MPT. The first path is used in native mode; the
second path is slower and is used only in SRIOV.

Signed-off-by: Jack Morgenstein <jackm-LDSdmyG8hGV8YrgS2mwiifqBs+8SCbDb@public.gmane.org>
Signed-off-by: Matan Barak <matanb-VPRAkNaXOzVWk0Htik3J/w@public.gmane.org>
Signed-off-by: Or Gerlitz <ogerlitz-VPRAkNaXOzVWk0Htik3J/w@public.gmane.org>
---
 drivers/net/ethernet/mellanox/mlx4/mlx4.h          |    2 +
 drivers/net/ethernet/mellanox/mlx4/mr.c            |  160 ++++++++++++++++++++
 .../net/ethernet/mellanox/mlx4/resource_tracker.c  |   26 +++-
 include/linux/mlx4/device.h                        |   16 ++
 4 files changed, 202 insertions(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx4/mlx4.h b/drivers/net/ethernet/mellanox/mlx4/mlx4.h
index 1d8af73..b40d587 100644
--- a/drivers/net/ethernet/mellanox/mlx4/mlx4.h
+++ b/drivers/net/ethernet/mellanox/mlx4/mlx4.h
@@ -279,6 +279,8 @@ struct mlx4_icm_table {
 #define MLX4_MPT_FLAG_PHYSICAL	    (1 <<  9)
 #define MLX4_MPT_FLAG_REGION	    (1 <<  8)
 
+#define MLX4_MPT_PD_MASK	    (0x1FFFFUL)
+#define MLX4_MPT_PD_VF_MASK	    (0xFE0000UL)
 #define MLX4_MPT_PD_FLAG_FAST_REG   (1 << 27)
 #define MLX4_MPT_PD_FLAG_RAE	    (1 << 28)
 #define MLX4_MPT_PD_FLAG_EN_INV	    (3 << 24)
diff --git a/drivers/net/ethernet/mellanox/mlx4/mr.c b/drivers/net/ethernet/mellanox/mlx4/mr.c
index 2839abb..7d717ec 100644
--- a/drivers/net/ethernet/mellanox/mlx4/mr.c
+++ b/drivers/net/ethernet/mellanox/mlx4/mr.c
@@ -298,6 +298,131 @@ static int mlx4_HW2SW_MPT(struct mlx4_dev *dev, struct mlx4_cmd_mailbox *mailbox
 			    MLX4_CMD_TIME_CLASS_B, MLX4_CMD_WRAPPED);
 }
 
+int mlx4_mr_hw_get_mpt(struct mlx4_dev *dev, struct mlx4_mr *mmr,
+		       struct mlx4_mpt_entry ***mpt_entry)
+{
+	int err;
+	int key = key_to_hw_index(mmr->key) & (dev->caps.num_mpts - 1);
+	struct mlx4_cmd_mailbox *mailbox = NULL;
+
+	/* Make sure that at this point we have single-threaded access only */
+
+	if (mmr->enabled != MLX4_MPT_EN_HW)
+		return -EINVAL;
+
+	err = mlx4_HW2SW_MPT(dev, NULL, key);
+
+	if (err) {
+		mlx4_warn(dev, "HW2SW_MPT failed (%d).", err);
+		mlx4_warn(dev, "Most likely the MR has MWs bound to it.\n");
+		return err;
+	}
+
+	mmr->enabled = MLX4_MPT_EN_SW;
+
+	if (!mlx4_is_mfunc(dev)) {
+		**mpt_entry = mlx4_table_find(
+				&mlx4_priv(dev)->mr_table.dmpt_table,
+				key, NULL);
+	} else {
+		mailbox = mlx4_alloc_cmd_mailbox(dev);
+		if (IS_ERR_OR_NULL(mailbox))
+			return PTR_ERR(mailbox);
+
+		err = mlx4_cmd_box(dev, 0, mailbox->dma, key,
+				   0, MLX4_CMD_QUERY_MPT,
+				   MLX4_CMD_TIME_CLASS_B,
+				   MLX4_CMD_WRAPPED);
+
+		if (err)
+			goto free_mailbox;
+
+		*mpt_entry = (struct mlx4_mpt_entry **)&mailbox->buf;
+	}
+
+	if (!(*mpt_entry) || !(**mpt_entry)) {
+		err = -ENOMEM;
+		goto free_mailbox;
+	}
+
+	return 0;
+
+free_mailbox:
+	mlx4_free_cmd_mailbox(dev, mailbox);
+	return err;
+}
+EXPORT_SYMBOL_GPL(mlx4_mr_hw_get_mpt);
+
+int mlx4_mr_hw_write_mpt(struct mlx4_dev *dev, struct mlx4_mr *mmr,
+			 struct mlx4_mpt_entry **mpt_entry)
+{
+	int err;
+
+	if (!mlx4_is_mfunc(dev)) {
+		/* Make sure any changes to this entry are flushed */
+		wmb();
+
+		*(u8 *)(*mpt_entry) = MLX4_MPT_STATUS_HW;
+
+		/* Make sure the new status is written */
+		wmb();
+
+		err = mlx4_SYNC_TPT(dev);
+	} else {
+		int key = key_to_hw_index(mmr->key) & (dev->caps.num_mpts - 1);
+
+		struct mlx4_cmd_mailbox *mailbox =
+			container_of((void *)mpt_entry, struct mlx4_cmd_mailbox,
+				     buf);
+
+		err = mlx4_SW2HW_MPT(dev, mailbox, key);
+	}
+
+	mmr->pd = be32_to_cpu((*mpt_entry)->pd_flags) & MLX4_MPT_PD_MASK;
+	if (!err)
+		mmr->enabled = MLX4_MPT_EN_HW;
+	return err;
+}
+EXPORT_SYMBOL_GPL(mlx4_mr_hw_write_mpt);
+
+void mlx4_mr_hw_put_mpt(struct mlx4_dev *dev,
+			struct mlx4_mpt_entry **mpt_entry)
+{
+	if (mlx4_is_mfunc(dev)) {
+		struct mlx4_cmd_mailbox *mailbox =
+			container_of((void *)mpt_entry, struct mlx4_cmd_mailbox,
+				     buf);
+		mlx4_free_cmd_mailbox(dev, mailbox);
+	}
+}
+EXPORT_SYMBOL_GPL(mlx4_mr_hw_put_mpt);
+
+int mlx4_mr_hw_change_pd(struct mlx4_dev *dev, struct mlx4_mpt_entry *mpt_entry,
+			 u32 pdn)
+{
+	u32 pd_flags = be32_to_cpu(mpt_entry->pd_flags);
+	/* The wrapper function will put the slave's id here */
+	if (mlx4_is_mfunc(dev))
+		pd_flags &= ~MLX4_MPT_PD_VF_MASK;
+	mpt_entry->pd_flags = cpu_to_be32((pd_flags &  ~MLX4_MPT_PD_MASK) |
+					  (pdn & MLX4_MPT_PD_MASK)
+					  | MLX4_MPT_PD_FLAG_EN_INV);
+	return 0;
+}
+EXPORT_SYMBOL_GPL(mlx4_mr_hw_change_pd);
+
+int mlx4_mr_hw_change_access(struct mlx4_dev *dev,
+			     struct mlx4_mpt_entry *mpt_entry,
+			     u32 access)
+{
+	u32 flags = (be32_to_cpu(mpt_entry->flags) & ~MLX4_PERM_MASK) |
+		    (access & MLX4_PERM_MASK);
+
+	mpt_entry->flags = cpu_to_be32(flags);
+	return 0;
+}
+EXPORT_SYMBOL_GPL(mlx4_mr_hw_change_access);
+
 static int mlx4_mr_alloc_reserved(struct mlx4_dev *dev, u32 mridx, u32 pd,
 			   u64 iova, u64 size, u32 access, int npages,
 			   int page_shift, struct mlx4_mr *mr)
@@ -463,6 +588,41 @@ int mlx4_mr_free(struct mlx4_dev *dev, struct mlx4_mr *mr)
 }
 EXPORT_SYMBOL_GPL(mlx4_mr_free);
 
+void mlx4_mr_rereg_mem_cleanup(struct mlx4_dev *dev, struct mlx4_mr *mr)
+{
+	mlx4_mtt_cleanup(dev, &mr->mtt);
+}
+EXPORT_SYMBOL_GPL(mlx4_mr_rereg_mem_cleanup);
+
+int mlx4_mr_rereg_mem_write(struct mlx4_dev *dev, struct mlx4_mr *mr,
+			    u64 iova, u64 size, int npages,
+			    int page_shift, struct mlx4_mpt_entry *mpt_entry)
+{
+	int err;
+
+	mpt_entry->start       = cpu_to_be64(mr->iova);
+	mpt_entry->length      = cpu_to_be64(mr->size);
+	mpt_entry->entity_size = cpu_to_be32(mr->mtt.page_shift);
+
+	err = mlx4_mtt_init(dev, npages, page_shift, &mr->mtt);
+	if (err)
+		return err;
+
+	if (mr->mtt.order < 0) {
+		mpt_entry->flags |= cpu_to_be32(MLX4_MPT_FLAG_PHYSICAL);
+		mpt_entry->mtt_addr = 0;
+	} else {
+		mpt_entry->mtt_addr = cpu_to_be64(mlx4_mtt_addr(dev,
+						  &mr->mtt));
+		if (mr->mtt.page_shift == 0)
+			mpt_entry->mtt_sz    = cpu_to_be32(1 << mr->mtt.order);
+	}
+	mr->enabled = MLX4_MPT_EN_SW;
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(mlx4_mr_rereg_mem_write);
+
 int mlx4_mr_enable(struct mlx4_dev *dev, struct mlx4_mr *mr)
 {
 	struct mlx4_cmd_mailbox *mailbox;
diff --git a/drivers/net/ethernet/mellanox/mlx4/resource_tracker.c b/drivers/net/ethernet/mellanox/mlx4/resource_tracker.c
index 0efc136..1089367 100644
--- a/drivers/net/ethernet/mellanox/mlx4/resource_tracker.c
+++ b/drivers/net/ethernet/mellanox/mlx4/resource_tracker.c
@@ -2613,12 +2613,34 @@ int mlx4_QUERY_MPT_wrapper(struct mlx4_dev *dev, int slave,
 	if (err)
 		return err;
 
-	if (mpt->com.from_state != RES_MPT_HW) {
+	if (mpt->com.from_state == RES_MPT_MAPPED) {
+		/* In order to allow rereg in SRIOV, we need to alter the MPT entry. To do
+		 * that, the VF must read the MPT. But since the MPT entry memory is not
+		 * in the VF's virtual memory space, it must use QUERY_MPT to obtain the
+		 * entry contents. To guarantee that the MPT cannot be changed, the driver
+		 * must perform HW2SW_MPT before this query and return the MPT entry to HW
+		 * ownership following the change. The change here allows the VF to
+		 * perform QUERY_MPT also when the entry is in SW ownership.
+		 */
+		struct mlx4_mpt_entry *mpt_entry = mlx4_table_find(
+					&mlx4_priv(dev)->mr_table.dmpt_table,
+					mpt->key, NULL);
+
+		if (NULL == mpt_entry || NULL == outbox->buf) {
+			err = -EINVAL;
+			goto out;
+		}
+
+		memcpy(outbox->buf, mpt_entry, sizeof(*mpt_entry));
+
+		err = 0;
+	} else if (mpt->com.from_state == RES_MPT_HW) {
+		err = mlx4_DMA_wrapper(dev, slave, vhcr, inbox, outbox, cmd);
+	} else {
 		err = -EBUSY;
 		goto out;
 	}
 
-	err = mlx4_DMA_wrapper(dev, slave, vhcr, inbox, outbox, cmd);
 
 out:
 	put_res(dev, slave, id, RES_MPT);
diff --git a/include/linux/mlx4/device.h b/include/linux/mlx4/device.h
index b12f4bb..a64c397 100644
--- a/include/linux/mlx4/device.h
+++ b/include/linux/mlx4/device.h
@@ -262,6 +262,7 @@ enum {
 	MLX4_PERM_REMOTE_WRITE	= 1 << 13,
 	MLX4_PERM_ATOMIC	= 1 << 14,
 	MLX4_PERM_BIND_MW	= 1 << 15,
+	MLX4_PERM_MASK		= 0xFC00
 };
 
 enum {
@@ -1243,4 +1244,19 @@ int mlx4_vf_smi_enabled(struct mlx4_dev *dev, int slave, int port);
 int mlx4_vf_get_enable_smi_admin(struct mlx4_dev *dev, int slave, int port);
 int mlx4_vf_set_enable_smi_admin(struct mlx4_dev *dev, int slave, int port,
 				 int enable);
+int mlx4_mr_hw_get_mpt(struct mlx4_dev *dev, struct mlx4_mr *mmr,
+		       struct mlx4_mpt_entry ***mpt_entry);
+int mlx4_mr_hw_write_mpt(struct mlx4_dev *dev, struct mlx4_mr *mmr,
+			 struct mlx4_mpt_entry **mpt_entry);
+int mlx4_mr_hw_change_pd(struct mlx4_dev *dev, struct mlx4_mpt_entry *mpt_entry,
+			 u32 pdn);
+int mlx4_mr_hw_change_access(struct mlx4_dev *dev,
+			     struct mlx4_mpt_entry *mpt_entry,
+			     u32 access);
+void mlx4_mr_hw_put_mpt(struct mlx4_dev *dev,
+			struct mlx4_mpt_entry **mpt_entry);
+void mlx4_mr_rereg_mem_cleanup(struct mlx4_dev *dev, struct mlx4_mr *mr);
+int mlx4_mr_rereg_mem_write(struct mlx4_dev *dev, struct mlx4_mr *mr,
+			    u64 iova, u64 size, int npages,
+			    int page_shift, struct mlx4_mpt_entry *mpt_entry);
 #endif /* MLX4_DEVICE_H */
-- 
1.7.1

--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply related	[flat|nested] 7+ messages in thread

* [PATCH for-next 3/3] IB/mlx4_ib: Add support for user MR re-registration
       [not found] ` <1406793690-3816-1-git-send-email-ogerlitz-VPRAkNaXOzVWk0Htik3J/w@public.gmane.org>
  2014-07-31  8:01   ` [PATCH for-next 1/3] IB/core: " Or Gerlitz
  2014-07-31  8:01   ` [PATCH for-next 2/3] mlx4_core: Add helper functions to support MR re-registration Or Gerlitz
@ 2014-07-31  8:01   ` Or Gerlitz
       [not found]     ` <1406793690-3816-4-git-send-email-ogerlitz-VPRAkNaXOzVWk0Htik3J/w@public.gmane.org>
  2 siblings, 1 reply; 7+ messages in thread
From: Or Gerlitz @ 2014-07-31  8:01 UTC (permalink / raw)
  To: roland-DgEjT+Ai2ygdnm+yROfE0A
  Cc: linux-rdma-u79uwXL29TY76Z2rM5mHXA, amirv-VPRAkNaXOzVWk0Htik3J/w,
	Matan Barak, Or Gerlitz

From: Matan Barak <matanb-VPRAkNaXOzVWk0Htik3J/w@public.gmane.org>

This enables the user to change the protection domain, access
flags and translation (address and length) of the MR.

Use basic mlx4_core helper functions to get, update and set
MPT and MTT objects according to the required modifications.

Signed-off-by: Matan Barak <matanb-VPRAkNaXOzVWk0Htik3J/w@public.gmane.org>
Signed-off-by: Or Gerlitz <ogerlitz-VPRAkNaXOzVWk0Htik3J/w@public.gmane.org>
---
 drivers/infiniband/hw/mlx4/main.c    |    2 +
 drivers/infiniband/hw/mlx4/mlx4_ib.h |    4 ++
 drivers/infiniband/hw/mlx4/mr.c      |   88 +++++++++++++++++++++++++++++++++-
 3 files changed, 93 insertions(+), 1 deletions(-)

diff --git a/drivers/infiniband/hw/mlx4/main.c b/drivers/infiniband/hw/mlx4/main.c
index 0f7027e..828a37b 100644
--- a/drivers/infiniband/hw/mlx4/main.c
+++ b/drivers/infiniband/hw/mlx4/main.c
@@ -2007,6 +2007,7 @@ static void *mlx4_ib_add(struct mlx4_dev *dev)
 		(1ull << IB_USER_VERBS_CMD_ALLOC_PD)		|
 		(1ull << IB_USER_VERBS_CMD_DEALLOC_PD)		|
 		(1ull << IB_USER_VERBS_CMD_REG_MR)		|
+		(1ull << IB_USER_VERBS_CMD_REREG_MR)		|
 		(1ull << IB_USER_VERBS_CMD_DEREG_MR)		|
 		(1ull << IB_USER_VERBS_CMD_CREATE_COMP_CHANNEL)	|
 		(1ull << IB_USER_VERBS_CMD_CREATE_CQ)		|
@@ -2059,6 +2060,7 @@ static void *mlx4_ib_add(struct mlx4_dev *dev)
 	ibdev->ib_dev.req_notify_cq	= mlx4_ib_arm_cq;
 	ibdev->ib_dev.get_dma_mr	= mlx4_ib_get_dma_mr;
 	ibdev->ib_dev.reg_user_mr	= mlx4_ib_reg_user_mr;
+	ibdev->ib_dev.rereg_user_mr	= mlx4_ib_rereg_user_mr;
 	ibdev->ib_dev.dereg_mr		= mlx4_ib_dereg_mr;
 	ibdev->ib_dev.alloc_fast_reg_mr = mlx4_ib_alloc_fast_reg_mr;
 	ibdev->ib_dev.alloc_fast_reg_page_list = mlx4_ib_alloc_fast_reg_page_list;
diff --git a/drivers/infiniband/hw/mlx4/mlx4_ib.h b/drivers/infiniband/hw/mlx4/mlx4_ib.h
index 369da3c..e8cad39 100644
--- a/drivers/infiniband/hw/mlx4/mlx4_ib.h
+++ b/drivers/infiniband/hw/mlx4/mlx4_ib.h
@@ -788,5 +788,9 @@ int mlx4_ib_steer_qp_alloc(struct mlx4_ib_dev *dev, int count, int *qpn);
 void mlx4_ib_steer_qp_free(struct mlx4_ib_dev *dev, u32 qpn, int count);
 int mlx4_ib_steer_qp_reg(struct mlx4_ib_dev *mdev, struct mlx4_ib_qp *mqp,
 			 int is_attach);
+int mlx4_ib_rereg_user_mr(struct ib_mr *mr, int flags,
+			  u64 start, u64 length, u64 virt_addr,
+			  int mr_access_flags, struct ib_pd *pd,
+			  struct ib_udata *udata);
 
 #endif /* MLX4_IB_H */
diff --git a/drivers/infiniband/hw/mlx4/mr.c b/drivers/infiniband/hw/mlx4/mr.c
index cb2a872..9b0e80e 100644
--- a/drivers/infiniband/hw/mlx4/mr.c
+++ b/drivers/infiniband/hw/mlx4/mr.c
@@ -144,8 +144,10 @@ struct ib_mr *mlx4_ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
 	if (!mr)
 		return ERR_PTR(-ENOMEM);
 
+	/* Force registering the memory as writable. */
+	/* Used for memory re-registration. HCA protects the access */
 	mr->umem = ib_umem_get(pd->uobject->context, start, length,
-			       access_flags, 0);
+			       access_flags | IB_ACCESS_LOCAL_WRITE, 0);
 	if (IS_ERR(mr->umem)) {
 		err = PTR_ERR(mr->umem);
 		goto err_free;
@@ -183,6 +185,90 @@ err_free:
 	return ERR_PTR(err);
 }
 
+int mlx4_ib_rereg_user_mr(struct ib_mr *mr, int flags,
+			  u64 start, u64 length, u64 virt_addr,
+			  int mr_access_flags, struct ib_pd *pd,
+			  struct ib_udata *udata)
+{
+	struct mlx4_ib_dev *dev = to_mdev(mr->device);
+	struct mlx4_ib_mr *mmr = to_mmr(mr);
+	struct mlx4_mpt_entry *mpt_entry;
+	struct mlx4_mpt_entry **pmpt_entry = &mpt_entry;
+	int err;
+
+	/* Since we synchronize this call and mlx4_ib_dereg_mr via uverbs,
+	 * we assume that the calls can't run concurrently. Otherwise, a
+	 * race exists.
+	 */
+	err =  mlx4_mr_hw_get_mpt(dev->dev, &mmr->mmr, &pmpt_entry);
+
+	if (err)
+		return err;
+
+	if (flags & IB_MR_REREG_PD) {
+		err = mlx4_mr_hw_change_pd(dev->dev, *pmpt_entry,
+					   to_mpd(pd)->pdn);
+
+		if (err)
+			goto release_mpt_entry;
+	}
+
+	if (flags & IB_MR_REREG_ACCESS) {
+		err = mlx4_mr_hw_change_access(dev->dev, *pmpt_entry,
+					       convert_access(mr_access_flags));
+
+		if (err)
+			goto release_mpt_entry;
+	}
+
+	if (flags & IB_MR_REREG_TRANS) {
+		int shift;
+		int err;
+		int n;
+
+		mlx4_mr_rereg_mem_cleanup(dev->dev, &mmr->mmr);
+		ib_umem_release(mmr->umem);
+		mmr->umem = ib_umem_get(mr->uobject->context, start, length,
+					mr_access_flags |
+					IB_ACCESS_LOCAL_WRITE,
+					0);
+		if (IS_ERR(mmr->umem)) {
+			err = PTR_ERR(mmr->umem);
+			mmr->umem = NULL;
+			goto release_mpt_entry;
+		}
+		n = ib_umem_page_count(mmr->umem);
+		shift = ilog2(mmr->umem->page_size);
+
+		mmr->mmr.iova       = virt_addr;
+		mmr->mmr.size       = length;
+		err = mlx4_mr_rereg_mem_write(dev->dev, &mmr->mmr,
+					      virt_addr, length, n, shift,
+					      *pmpt_entry);
+		if (err) {
+			ib_umem_release(mmr->umem);
+			goto release_mpt_entry;
+		}
+
+		err = mlx4_ib_umem_write_mtt(dev, &mmr->mmr.mtt, mmr->umem);
+		if (err) {
+			mlx4_mr_rereg_mem_cleanup(dev->dev, &mmr->mmr);
+			ib_umem_release(mmr->umem);
+			goto release_mpt_entry;
+		}
+	}
+
+	/* If we couldn't transfer the MR to the HCA, just remember to
+	 * return a failure. But dereg_mr will free the resources.
+	 */
+	err = mlx4_mr_hw_write_mpt(dev->dev, &mmr->mmr, pmpt_entry);
+
+release_mpt_entry:
+	mlx4_mr_hw_put_mpt(dev->dev, pmpt_entry);
+
+	return err;
+}
+
 int mlx4_ib_dereg_mr(struct ib_mr *ibmr)
 {
 	struct mlx4_ib_mr *mr = to_mmr(ibmr);
-- 
1.7.1

--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply related	[flat|nested] 7+ messages in thread

* Re: [PATCH for-next 2/3] mlx4_core: Add helper functions to support MR re-registration
       [not found]     ` <1406793690-3816-3-git-send-email-ogerlitz-VPRAkNaXOzVWk0Htik3J/w@public.gmane.org>
@ 2014-08-06 10:42       ` Sagi Grimberg
       [not found]         ` <53E206A2.6040800-LDSdmyG8hGV8YrgS2mwiifqBs+8SCbDb@public.gmane.org>
  0 siblings, 1 reply; 7+ messages in thread
From: Sagi Grimberg @ 2014-08-06 10:42 UTC (permalink / raw)
  To: Or Gerlitz, roland-DgEjT+Ai2ygdnm+yROfE0A
  Cc: linux-rdma-u79uwXL29TY76Z2rM5mHXA, amirv-VPRAkNaXOzVWk0Htik3J/w,
	Matan Barak, Jack Morgenstein

On 7/31/2014 11:01 AM, Or Gerlitz wrote:
> From: Matan Barak <matanb-VPRAkNaXOzVWk0Htik3J/w@public.gmane.org>
>
> Add few helper functions to support a mechanism of getting an
> MPT, modifying it and updating the HCA with the modified object.
>
> The code takes 2 paths, one for directly changing the MPT (and sometimes
> its related MTTs) and another one which queries the MPT and updates the
> HCA via fw command SW2HW_MPT. The first path is used in native mode; the
> second path is slower and is used only in SRIOV.
>

Hey Matan,

Couple of comments below...

> Signed-off-by: Jack Morgenstein <jackm-LDSdmyG8hGV8YrgS2mwiifqBs+8SCbDb@public.gmane.org>
> Signed-off-by: Matan Barak <matanb-VPRAkNaXOzVWk0Htik3J/w@public.gmane.org>
> Signed-off-by: Or Gerlitz <ogerlitz-VPRAkNaXOzVWk0Htik3J/w@public.gmane.org>
> ---
>   drivers/net/ethernet/mellanox/mlx4/mlx4.h          |    2 +
>   drivers/net/ethernet/mellanox/mlx4/mr.c            |  160 ++++++++++++++++++++
>   .../net/ethernet/mellanox/mlx4/resource_tracker.c  |   26 +++-
>   include/linux/mlx4/device.h                        |   16 ++
>   4 files changed, 202 insertions(+), 2 deletions(-)
>
> diff --git a/drivers/net/ethernet/mellanox/mlx4/mlx4.h b/drivers/net/ethernet/mellanox/mlx4/mlx4.h
> index 1d8af73..b40d587 100644
> --- a/drivers/net/ethernet/mellanox/mlx4/mlx4.h
> +++ b/drivers/net/ethernet/mellanox/mlx4/mlx4.h
> @@ -279,6 +279,8 @@ struct mlx4_icm_table {
>   #define MLX4_MPT_FLAG_PHYSICAL	    (1 <<  9)
>   #define MLX4_MPT_FLAG_REGION	    (1 <<  8)
>
> +#define MLX4_MPT_PD_MASK	    (0x1FFFFUL)
> +#define MLX4_MPT_PD_VF_MASK	    (0xFE0000UL)
>   #define MLX4_MPT_PD_FLAG_FAST_REG   (1 << 27)
>   #define MLX4_MPT_PD_FLAG_RAE	    (1 << 28)
>   #define MLX4_MPT_PD_FLAG_EN_INV	    (3 << 24)
> diff --git a/drivers/net/ethernet/mellanox/mlx4/mr.c b/drivers/net/ethernet/mellanox/mlx4/mr.c
> index 2839abb..7d717ec 100644
> --- a/drivers/net/ethernet/mellanox/mlx4/mr.c
> +++ b/drivers/net/ethernet/mellanox/mlx4/mr.c
> @@ -298,6 +298,131 @@ static int mlx4_HW2SW_MPT(struct mlx4_dev *dev, struct mlx4_cmd_mailbox *mailbox
>   			    MLX4_CMD_TIME_CLASS_B, MLX4_CMD_WRAPPED);
>   }
>
> +int mlx4_mr_hw_get_mpt(struct mlx4_dev *dev, struct mlx4_mr *mmr,
> +		       struct mlx4_mpt_entry ***mpt_entry)
> +{
> +	int err;
> +	int key = key_to_hw_index(mmr->key) & (dev->caps.num_mpts - 1);
> +	struct mlx4_cmd_mailbox *mailbox = NULL;
> +
> +	/* Make sure that at this point we have single-threaded access only */

This is not a specific comment for the below, but a general comment for 
the routine (otherwise I would expect the condition below to be 
atomic_read). Maybe it's better to document it as a requirement of the 
routine above (in kernel-doc style)?

> +
> +	if (mmr->enabled != MLX4_MPT_EN_HW)
> +		return -EINVAL;
> +
> +	err = mlx4_HW2SW_MPT(dev, NULL, key);

Nit: I'd lose this extra line between err=/if(err) statements.
I see you did in some sections of this patch...

> +
> +	if (err) {
> +		mlx4_warn(dev, "HW2SW_MPT failed (%d).", err);
> +		mlx4_warn(dev, "Most likely the MR has MWs bound to it.\n");
> +		return err;
> +	}
> +
> +	mmr->enabled = MLX4_MPT_EN_SW;
> +
> +	if (!mlx4_is_mfunc(dev)) {
> +		**mpt_entry = mlx4_table_find(
> +				&mlx4_priv(dev)->mr_table.dmpt_table,
> +				key, NULL);
> +	} else {
> +		mailbox = mlx4_alloc_cmd_mailbox(dev);
> +		if (IS_ERR_OR_NULL(mailbox))
> +			return PTR_ERR(mailbox);
> +
> +		err = mlx4_cmd_box(dev, 0, mailbox->dma, key,
> +				   0, MLX4_CMD_QUERY_MPT,
> +				   MLX4_CMD_TIME_CLASS_B,
> +				   MLX4_CMD_WRAPPED);
> +
> +		if (err)
> +			goto free_mailbox;
> +
> +		*mpt_entry = (struct mlx4_mpt_entry **)&mailbox->buf;
> +	}
> +
> +	if (!(*mpt_entry) || !(**mpt_entry)) {
> +		err = -ENOMEM;
> +		goto free_mailbox;
> +	}
> +
> +	return 0;
> +
> +free_mailbox:
> +	mlx4_free_cmd_mailbox(dev, mailbox);
> +	return err;
> +}
> +EXPORT_SYMBOL_GPL(mlx4_mr_hw_get_mpt);
> +
> +int mlx4_mr_hw_write_mpt(struct mlx4_dev *dev, struct mlx4_mr *mmr,
> +			 struct mlx4_mpt_entry **mpt_entry)
> +{
> +	int err;
> +
> +	if (!mlx4_is_mfunc(dev)) {
> +		/* Make sure any changes to this entry are flushed */
> +		wmb();
> +
> +		*(u8 *)(*mpt_entry) = MLX4_MPT_STATUS_HW;
> +
> +		/* Make sure the new status is written */
> +		wmb();
> +
> +		err = mlx4_SYNC_TPT(dev);
> +	} else {
> +		int key = key_to_hw_index(mmr->key) & (dev->caps.num_mpts - 1);
> +
> +		struct mlx4_cmd_mailbox *mailbox =
> +			container_of((void *)mpt_entry, struct mlx4_cmd_mailbox,
> +				     buf);
> +
> +		err = mlx4_SW2HW_MPT(dev, mailbox, key);
> +	}
> +
> +	mmr->pd = be32_to_cpu((*mpt_entry)->pd_flags) & MLX4_MPT_PD_MASK;

Here you update the PD even if you failed, while ib_core changes it 
only if it succeeded; that would cause an inconsistency. I think it 
would be better to write this update only after write/put_mpt succeeds 
(meaning it might be better to move it to the caller - mlx4_ib).

> +	if (!err)
> +		mmr->enabled = MLX4_MPT_EN_HW;
> +	return err;
> +}
> +EXPORT_SYMBOL_GPL(mlx4_mr_hw_write_mpt);
> +
> +void mlx4_mr_hw_put_mpt(struct mlx4_dev *dev,
> +			struct mlx4_mpt_entry **mpt_entry)
> +{
> +	if (mlx4_is_mfunc(dev)) {
> +		struct mlx4_cmd_mailbox *mailbox =
> +			container_of((void *)mpt_entry, struct mlx4_cmd_mailbox,
> +				     buf);
> +		mlx4_free_cmd_mailbox(dev, mailbox);
> +	}
> +}
> +EXPORT_SYMBOL_GPL(mlx4_mr_hw_put_mpt);
> +
> +int mlx4_mr_hw_change_pd(struct mlx4_dev *dev, struct mlx4_mpt_entry *mpt_entry,
> +			 u32 pdn)
> +{
> +	u32 pd_flags = be32_to_cpu(mpt_entry->pd_flags);
> +	/* The wrapper function will put the slave's id here */
> +	if (mlx4_is_mfunc(dev))
> +		pd_flags &= ~MLX4_MPT_PD_VF_MASK;
> +	mpt_entry->pd_flags = cpu_to_be32((pd_flags &  ~MLX4_MPT_PD_MASK) |

Here you protect against bad pd_flags (corrupted mpt_entry was given?)
with & ~MLX4_MPT_PD_MASK, but you might hide the MPT corruption by
masking it... isn't it better to fail (internal error) in this case?

> +					  (pdn & MLX4_MPT_PD_MASK)
> +					  | MLX4_MPT_PD_FLAG_EN_INV);
> +	return 0;
> +}
> +EXPORT_SYMBOL_GPL(mlx4_mr_hw_change_pd);
> +
> +int mlx4_mr_hw_change_access(struct mlx4_dev *dev,
> +			     struct mlx4_mpt_entry *mpt_entry,
> +			     u32 access)
> +{
> +	u32 flags = (be32_to_cpu(mpt_entry->flags) & ~MLX4_PERM_MASK) |
> +		    (access & MLX4_PERM_MASK);
> +
> +	mpt_entry->flags = cpu_to_be32(flags);

Note that you don't update the mmr access flags anywhere. After this
succeeds, query_mr will probably return the old permissions.

> +	return 0;
> +}
> +EXPORT_SYMBOL_GPL(mlx4_mr_hw_change_access);
> +
>   static int mlx4_mr_alloc_reserved(struct mlx4_dev *dev, u32 mridx, u32 pd,
>   			   u64 iova, u64 size, u32 access, int npages,
>   			   int page_shift, struct mlx4_mr *mr)
> @@ -463,6 +588,41 @@ int mlx4_mr_free(struct mlx4_dev *dev, struct mlx4_mr *mr)
>   }
>   EXPORT_SYMBOL_GPL(mlx4_mr_free);
>
> +void mlx4_mr_rereg_mem_cleanup(struct mlx4_dev *dev, struct mlx4_mr *mr)
> +{
> +	mlx4_mtt_cleanup(dev, &mr->mtt);
> +}
> +EXPORT_SYMBOL_GPL(mlx4_mr_rereg_mem_cleanup);
> +
> +int mlx4_mr_rereg_mem_write(struct mlx4_dev *dev, struct mlx4_mr *mr,
> +			    u64 iova, u64 size, int npages,
> +			    int page_shift, struct mlx4_mpt_entry *mpt_entry)
> +{
> +	int err;
> +
> +	mpt_entry->start       = cpu_to_be64(mr->iova);
> +	mpt_entry->length      = cpu_to_be64(mr->size);

You pass u64 iova and u64 size, but you don't use them...
it would probably be better to use them (and update the mmr after the
change - at mlx4_ib).

> +	mpt_entry->entity_size = cpu_to_be32(mr->mtt.page_shift);
> +
> +	err = mlx4_mtt_init(dev, npages, page_shift, &mr->mtt);
> +	if (err)
> +		return err;
> +
> +	if (mr->mtt.order < 0) {
> +		mpt_entry->flags |= cpu_to_be32(MLX4_MPT_FLAG_PHYSICAL);
> +		mpt_entry->mtt_addr = 0;
> +	} else {
> +		mpt_entry->mtt_addr = cpu_to_be64(mlx4_mtt_addr(dev,
> +						  &mr->mtt));
> +		if (mr->mtt.page_shift == 0)
> +			mpt_entry->mtt_sz    = cpu_to_be32(1 << mr->mtt.order);
> +	}
> +	mr->enabled = MLX4_MPT_EN_SW;
> +
> +	return 0;
> +}
> +EXPORT_SYMBOL_GPL(mlx4_mr_rereg_mem_write);
> +
>   int mlx4_mr_enable(struct mlx4_dev *dev, struct mlx4_mr *mr)
>   {
>   	struct mlx4_cmd_mailbox *mailbox;
> diff --git a/drivers/net/ethernet/mellanox/mlx4/resource_tracker.c b/drivers/net/ethernet/mellanox/mlx4/resource_tracker.c
> index 0efc136..1089367 100644
> --- a/drivers/net/ethernet/mellanox/mlx4/resource_tracker.c
> +++ b/drivers/net/ethernet/mellanox/mlx4/resource_tracker.c
> @@ -2613,12 +2613,34 @@ int mlx4_QUERY_MPT_wrapper(struct mlx4_dev *dev, int slave,
>   	if (err)
>   		return err;
>
> -	if (mpt->com.from_state != RES_MPT_HW) {
> +	if (mpt->com.from_state == RES_MPT_MAPPED) {
> +		/* In order to allow rereg in SRIOV, we need to alter the MPT entry. To do
> +		 * that, the VF must read the MPT. But since the MPT entry memory is not
> +		 * in the VF's virtual memory space, it must use QUERY_MPT to obtain the
> +		 * entry contents. To guarantee that the MPT cannot be changed, the driver
> +		 * must perform HW2SW_MPT before this query and return the MPT entry to HW
> +		 * ownership fofollowing the change. The change here allows the VF to
> +		 * perform QUERY_MPT also when the entry is in SW ownership.
> +		 */
> +		struct mlx4_mpt_entry *mpt_entry = mlx4_table_find(
> +					&mlx4_priv(dev)->mr_table.dmpt_table,
> +					mpt->key, NULL);
> +
> +		if (NULL == mpt_entry || NULL == outbox->buf) {
> +			err = -EINVAL;
> +			goto out;
> +		}
> +
> +		memcpy(outbox->buf, mpt_entry, sizeof(*mpt_entry));
> +
> +		err = 0;
> +	} else if (mpt->com.from_state == RES_MPT_HW) {
> +		err = mlx4_DMA_wrapper(dev, slave, vhcr, inbox, outbox, cmd);
> +	} else {
>   		err = -EBUSY;
>   		goto out;
>   	}
>
> -	err = mlx4_DMA_wrapper(dev, slave, vhcr, inbox, outbox, cmd);
>
>   out:
>   	put_res(dev, slave, id, RES_MPT);
> diff --git a/include/linux/mlx4/device.h b/include/linux/mlx4/device.h
> index b12f4bb..a64c397 100644
> --- a/include/linux/mlx4/device.h
> +++ b/include/linux/mlx4/device.h
> @@ -262,6 +262,7 @@ enum {
>   	MLX4_PERM_REMOTE_WRITE	= 1 << 13,
>   	MLX4_PERM_ATOMIC	= 1 << 14,
>   	MLX4_PERM_BIND_MW	= 1 << 15,
> +	MLX4_PERM_MASK		= 0xFC00
>   };
>
>   enum {
> @@ -1243,4 +1244,19 @@ int mlx4_vf_smi_enabled(struct mlx4_dev *dev, int slave, int port);
>   int mlx4_vf_get_enable_smi_admin(struct mlx4_dev *dev, int slave, int port);
>   int mlx4_vf_set_enable_smi_admin(struct mlx4_dev *dev, int slave, int port,
>   				 int enable);
> +int mlx4_mr_hw_get_mpt(struct mlx4_dev *dev, struct mlx4_mr *mmr,
> +		       struct mlx4_mpt_entry ***mpt_entry);
> +int mlx4_mr_hw_write_mpt(struct mlx4_dev *dev, struct mlx4_mr *mmr,
> +			 struct mlx4_mpt_entry **mpt_entry);
> +int mlx4_mr_hw_change_pd(struct mlx4_dev *dev, struct mlx4_mpt_entry *mpt_entry,
> +			 u32 pdn);
> +int mlx4_mr_hw_change_access(struct mlx4_dev *dev,
> +			     struct mlx4_mpt_entry *mpt_entry,
> +			     u32 access);
> +void mlx4_mr_hw_put_mpt(struct mlx4_dev *dev,
> +			struct mlx4_mpt_entry **mpt_entry);
> +void mlx4_mr_rereg_mem_cleanup(struct mlx4_dev *dev, struct mlx4_mr *mr);
> +int mlx4_mr_rereg_mem_write(struct mlx4_dev *dev, struct mlx4_mr *mr,
> +			    u64 iova, u64 size, int npages,
> +			    int page_shift, struct mlx4_mpt_entry *mpt_entry);
>   #endif /* MLX4_DEVICE_H */
>

--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [PATCH for-next 3/3] IB/mlx4_ib: Add support for user MR re-registration
       [not found]     ` <1406793690-3816-4-git-send-email-ogerlitz-VPRAkNaXOzVWk0Htik3J/w@public.gmane.org>
@ 2014-08-06 10:46       ` Sagi Grimberg
  0 siblings, 0 replies; 7+ messages in thread
From: Sagi Grimberg @ 2014-08-06 10:46 UTC (permalink / raw)
  To: Or Gerlitz, roland-DgEjT+Ai2ygdnm+yROfE0A
  Cc: linux-rdma-u79uwXL29TY76Z2rM5mHXA, amirv-VPRAkNaXOzVWk0Htik3J/w,
	Matan Barak

On 7/31/2014 11:01 AM, Or Gerlitz wrote:
> From: Matan Barak <matanb-VPRAkNaXOzVWk0Htik3J/w@public.gmane.org>
>
> This enables the user to change the protection domain, access
> flags and translation (address and length) of the MR.
>
> Use basic mlx4_core helper functions to get, update and set
> MPT and MTT objects according to the required modifications.
>
> Signed-off-by: Matan Barak <matanb-VPRAkNaXOzVWk0Htik3J/w@public.gmane.org>
> Signed-off-by: Or Gerlitz <ogerlitz-VPRAkNaXOzVWk0Htik3J/w@public.gmane.org>
> ---
>   drivers/infiniband/hw/mlx4/main.c    |    2 +
>   drivers/infiniband/hw/mlx4/mlx4_ib.h |    4 ++
>   drivers/infiniband/hw/mlx4/mr.c      |   88 +++++++++++++++++++++++++++++++++-
>   3 files changed, 93 insertions(+), 1 deletions(-)
>
> diff --git a/drivers/infiniband/hw/mlx4/main.c b/drivers/infiniband/hw/mlx4/main.c
> index 0f7027e..828a37b 100644
> --- a/drivers/infiniband/hw/mlx4/main.c
> +++ b/drivers/infiniband/hw/mlx4/main.c
> @@ -2007,6 +2007,7 @@ static void *mlx4_ib_add(struct mlx4_dev *dev)
>   		(1ull << IB_USER_VERBS_CMD_ALLOC_PD)		|
>   		(1ull << IB_USER_VERBS_CMD_DEALLOC_PD)		|
>   		(1ull << IB_USER_VERBS_CMD_REG_MR)		|
> +		(1ull << IB_USER_VERBS_CMD_REREG_MR)		|
>   		(1ull << IB_USER_VERBS_CMD_DEREG_MR)		|
>   		(1ull << IB_USER_VERBS_CMD_CREATE_COMP_CHANNEL)	|
>   		(1ull << IB_USER_VERBS_CMD_CREATE_CQ)		|
> @@ -2059,6 +2060,7 @@ static void *mlx4_ib_add(struct mlx4_dev *dev)
>   	ibdev->ib_dev.req_notify_cq	= mlx4_ib_arm_cq;
>   	ibdev->ib_dev.get_dma_mr	= mlx4_ib_get_dma_mr;
>   	ibdev->ib_dev.reg_user_mr	= mlx4_ib_reg_user_mr;
> +	ibdev->ib_dev.rereg_user_mr	= mlx4_ib_rereg_user_mr;
>   	ibdev->ib_dev.dereg_mr		= mlx4_ib_dereg_mr;
>   	ibdev->ib_dev.alloc_fast_reg_mr = mlx4_ib_alloc_fast_reg_mr;
>   	ibdev->ib_dev.alloc_fast_reg_page_list = mlx4_ib_alloc_fast_reg_page_list;
> diff --git a/drivers/infiniband/hw/mlx4/mlx4_ib.h b/drivers/infiniband/hw/mlx4/mlx4_ib.h
> index 369da3c..e8cad39 100644
> --- a/drivers/infiniband/hw/mlx4/mlx4_ib.h
> +++ b/drivers/infiniband/hw/mlx4/mlx4_ib.h
> @@ -788,5 +788,9 @@ int mlx4_ib_steer_qp_alloc(struct mlx4_ib_dev *dev, int count, int *qpn);
>   void mlx4_ib_steer_qp_free(struct mlx4_ib_dev *dev, u32 qpn, int count);
>   int mlx4_ib_steer_qp_reg(struct mlx4_ib_dev *mdev, struct mlx4_ib_qp *mqp,
>   			 int is_attach);
> +int mlx4_ib_rereg_user_mr(struct ib_mr *mr, int flags,
> +			  u64 start, u64 length, u64 virt_addr,
> +			  int mr_access_flags, struct ib_pd *pd,
> +			  struct ib_udata *udata);
>
>   #endif /* MLX4_IB_H */
> diff --git a/drivers/infiniband/hw/mlx4/mr.c b/drivers/infiniband/hw/mlx4/mr.c
> index cb2a872..9b0e80e 100644
> --- a/drivers/infiniband/hw/mlx4/mr.c
> +++ b/drivers/infiniband/hw/mlx4/mr.c
> @@ -144,8 +144,10 @@ struct ib_mr *mlx4_ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
>   	if (!mr)
>   		return ERR_PTR(-ENOMEM);
>
> +	/* Force registering the memory as writable. */
> +	/* Used for memory re-registeration. HCA protects the access */
>   	mr->umem = ib_umem_get(pd->uobject->context, start, length,
> -			       access_flags, 0);
> +			       access_flags | IB_ACCESS_LOCAL_WRITE, 0);
>   	if (IS_ERR(mr->umem)) {
>   		err = PTR_ERR(mr->umem);
>   		goto err_free;
> @@ -183,6 +185,90 @@ err_free:
>   	return ERR_PTR(err);
>   }
>
> +int mlx4_ib_rereg_user_mr(struct ib_mr *mr, int flags,
> +			  u64 start, u64 length, u64 virt_addr,
> +			  int mr_access_flags, struct ib_pd *pd,
> +			  struct ib_udata *udata)
> +{
> +	struct mlx4_ib_dev *dev = to_mdev(mr->device);
> +	struct mlx4_ib_mr *mmr = to_mmr(mr);
> +	struct mlx4_mpt_entry *mpt_entry;
> +	struct mlx4_mpt_entry **pmpt_entry = &mpt_entry;
> +	int err;
> +
> +	/* Since we synchronize this call and mlx4_ib_dereg_mr via uverbs,
> +	 * we assume that the calls can't run concurrently. Otherwise, a
> +	 * race exists.
> +	 */
> +	err =  mlx4_mr_hw_get_mpt(dev->dev, &mmr->mmr, &pmpt_entry);
> +
> +	if (err)
> +		return err;
> +
> +	if (flags & IB_MR_REREG_PD) {
> +		err = mlx4_mr_hw_change_pd(dev->dev, *pmpt_entry,
> +					   to_mpd(pd)->pdn);
> +
> +		if (err)
> +			goto release_mpt_entry;
> +	}
> +
> +	if (flags & IB_MR_REREG_ACCESS) {
> +		err = mlx4_mr_hw_change_access(dev->dev, *pmpt_entry,
> +					       convert_access(mr_access_flags));
> +
> +		if (err)
> +			goto release_mpt_entry;
> +	}
> +
> +	if (flags & IB_MR_REREG_TRANS) {
> +		int shift;
> +		int err;
> +		int n;
> +
> +		mlx4_mr_rereg_mem_cleanup(dev->dev, &mmr->mmr);
> +		ib_umem_release(mmr->umem);
> +		mmr->umem = ib_umem_get(mr->uobject->context, start, length,
> +					mr_access_flags |
> +					IB_ACCESS_LOCAL_WRITE,
> +					0);
> +		if (IS_ERR(mmr->umem)) {
> +			err = PTR_ERR(mmr->umem);

This NULL assignment is there so that a future dereg_mr doesn't see this
ERR_PTR. I think it's worth a short comment, since it took me some time to
figure it out...

> +			mmr->umem = NULL;
> +			goto release_mpt_entry;
> +		}
> +		n = ib_umem_page_count(mmr->umem);
> +		shift = ilog2(mmr->umem->page_size);
> +
> +		mmr->mmr.iova       = virt_addr;
> +		mmr->mmr.size       = length;

See my previous review: these updates are better placed after
mlx4_mr_rereg_mem_write succeeds.

> +		err = mlx4_mr_rereg_mem_write(dev->dev, &mmr->mmr,
> +					      virt_addr, length, n, shift,
> +					      *pmpt_entry);
> +		if (err) {
> +			ib_umem_release(mmr->umem);
> +			goto release_mpt_entry;
> +		}
> +
> +		err = mlx4_ib_umem_write_mtt(dev, &mmr->mmr.mtt, mmr->umem);
> +		if (err) {
> +			mlx4_mr_rereg_mem_cleanup(dev->dev, &mmr->mmr);
> +			ib_umem_release(mmr->umem);
> +			goto release_mpt_entry;
> +		}
> +	}
> +
> +	/* If we couldn't transfer the MR to the HCA, just remember to
> +	 * return a failure. But dereg_mr will free the resources.
> +	 */
> +	err = mlx4_mr_hw_write_mpt(dev->dev, &mmr->mmr, pmpt_entry);
> +
> +release_mpt_entry:
> +	mlx4_mr_hw_put_mpt(dev->dev, pmpt_entry);
> +
> +	return err;
> +}
> +
>   int mlx4_ib_dereg_mr(struct ib_mr *ibmr)
>   {
>   	struct mlx4_ib_mr *mr = to_mmr(ibmr);
>

--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [PATCH for-next 2/3] mlx4_core: Add helper functions to support MR re-registration
       [not found]         ` <53E206A2.6040800-LDSdmyG8hGV8YrgS2mwiifqBs+8SCbDb@public.gmane.org>
@ 2014-08-17 13:16           ` Matan Barak
  0 siblings, 0 replies; 7+ messages in thread
From: Matan Barak @ 2014-08-17 13:16 UTC (permalink / raw)
  To: Sagi Grimberg, Or Gerlitz, roland-DgEjT+Ai2ygdnm+yROfE0A
  Cc: linux-rdma-u79uwXL29TY76Z2rM5mHXA, amirv-VPRAkNaXOzVWk0Htik3J/w,
	Jack Morgenstein

On 6/8/2014 1:42 PM, Sagi Grimberg wrote:
> On 7/31/2014 11:01 AM, Or Gerlitz wrote:
>> From: Matan Barak <matanb-VPRAkNaXOzVWk0Htik3J/w@public.gmane.org>
>>
>> Add few helper functions to support a mechanism of getting an
>> MPT, modifying it and updating the HCA with the modified object.
>>
>> The code takes 2 paths, one for directly changing the MPT (and sometimes
>> its related MTTs) and another one which queries the MPT and updates the
>> HCA via fw command SW2HW_MPT. The first path is used in native mode; the
>> second path is slower and is used only in SRIOV.
>>
>
> Hey Matan,
>
> Couple of comments below...
>
>> Signed-off-by: Jack Morgenstein <jackm-LDSdmyG8hGV8YrgS2mwiifqBs+8SCbDb@public.gmane.org>
>> Signed-off-by: Matan Barak <matanb-VPRAkNaXOzVWk0Htik3J/w@public.gmane.org>
>> Signed-off-by: Or Gerlitz <ogerlitz-VPRAkNaXOzVWk0Htik3J/w@public.gmane.org>
>> ---
>>   drivers/net/ethernet/mellanox/mlx4/mlx4.h          |    2 +
>>   drivers/net/ethernet/mellanox/mlx4/mr.c            |  160
>> ++++++++++++++++++++
>>   .../net/ethernet/mellanox/mlx4/resource_tracker.c  |   26 +++-
>>   include/linux/mlx4/device.h                        |   16 ++
>>   4 files changed, 202 insertions(+), 2 deletions(-)
>>
>> diff --git a/drivers/net/ethernet/mellanox/mlx4/mlx4.h
>> b/drivers/net/ethernet/mellanox/mlx4/mlx4.h
>> index 1d8af73..b40d587 100644
>> --- a/drivers/net/ethernet/mellanox/mlx4/mlx4.h
>> +++ b/drivers/net/ethernet/mellanox/mlx4/mlx4.h
>> @@ -279,6 +279,8 @@ struct mlx4_icm_table {
>>   #define MLX4_MPT_FLAG_PHYSICAL        (1 <<  9)
>>   #define MLX4_MPT_FLAG_REGION        (1 <<  8)
>>
>> +#define MLX4_MPT_PD_MASK        (0x1FFFFUL)
>> +#define MLX4_MPT_PD_VF_MASK        (0xFE0000UL)
>>   #define MLX4_MPT_PD_FLAG_FAST_REG   (1 << 27)
>>   #define MLX4_MPT_PD_FLAG_RAE        (1 << 28)
>>   #define MLX4_MPT_PD_FLAG_EN_INV        (3 << 24)
>> diff --git a/drivers/net/ethernet/mellanox/mlx4/mr.c
>> b/drivers/net/ethernet/mellanox/mlx4/mr.c
>> index 2839abb..7d717ec 100644
>> --- a/drivers/net/ethernet/mellanox/mlx4/mr.c
>> +++ b/drivers/net/ethernet/mellanox/mlx4/mr.c
>> @@ -298,6 +298,131 @@ static int mlx4_HW2SW_MPT(struct mlx4_dev *dev,
>> struct mlx4_cmd_mailbox *mailbox
>>                   MLX4_CMD_TIME_CLASS_B, MLX4_CMD_WRAPPED);
>>   }
>>
>> +int mlx4_mr_hw_get_mpt(struct mlx4_dev *dev, struct mlx4_mr *mmr,
>> +               struct mlx4_mpt_entry ***mpt_entry)
>> +{
>> +    int err;
>> +    int key = key_to_hw_index(mmr->key) & (dev->caps.num_mpts - 1);
>> +    struct mlx4_cmd_mailbox *mailbox = NULL;
>> +
>> +    /* Make sure that at this point we have single-threaded access
>> only */
>
> This is not a specific comment for the below, but a general comment for
> the routine (otherwise I would expect the condition below to be
> atomic_read). Maybe it's better to document it as a requirement of the
> routine above (in kernel-doc style)?
>
>> +
>> +    if (mmr->enabled != MLX4_MPT_EN_HW)
>> +        return -EINVAL;
>> +
>> +    err = mlx4_HW2SW_MPT(dev, NULL, key);
>
> Nit: I'd lose this extra line between err=/if(err) statements.
> I see you did in some sections of this patch...
>
>> +
>> +    if (err) {
>> +        mlx4_warn(dev, "HW2SW_MPT failed (%d).", err);
>> +        mlx4_warn(dev, "Most likely the MR has MWs bound to it.\n");
>> +        return err;
>> +    }
>> +
>> +    mmr->enabled = MLX4_MPT_EN_SW;
>> +
>> +    if (!mlx4_is_mfunc(dev)) {
>> +        **mpt_entry = mlx4_table_find(
>> +                &mlx4_priv(dev)->mr_table.dmpt_table,
>> +                key, NULL);
>> +    } else {
>> +        mailbox = mlx4_alloc_cmd_mailbox(dev);
>> +        if (IS_ERR_OR_NULL(mailbox))
>> +            return PTR_ERR(mailbox);
>> +
>> +        err = mlx4_cmd_box(dev, 0, mailbox->dma, key,
>> +                   0, MLX4_CMD_QUERY_MPT,
>> +                   MLX4_CMD_TIME_CLASS_B,
>> +                   MLX4_CMD_WRAPPED);
>> +
>> +        if (err)
>> +            goto free_mailbox;
>> +
>> +        *mpt_entry = (struct mlx4_mpt_entry **)&mailbox->buf;
>> +    }
>> +
>> +    if (!(*mpt_entry) || !(**mpt_entry)) {
>> +        err = -ENOMEM;
>> +        goto free_mailbox;
>> +    }
>> +
>> +    return 0;
>> +
>> +free_mailbox:
>> +    mlx4_free_cmd_mailbox(dev, mailbox);
>> +    return err;
>> +}
>> +EXPORT_SYMBOL_GPL(mlx4_mr_hw_get_mpt);
>> +
>> +int mlx4_mr_hw_write_mpt(struct mlx4_dev *dev, struct mlx4_mr *mmr,
>> +             struct mlx4_mpt_entry **mpt_entry)
>> +{
>> +    int err;
>> +
>> +    if (!mlx4_is_mfunc(dev)) {
>> +        /* Make sure any changes to this entry are flushed */
>> +        wmb();
>> +
>> +        *(u8 *)(*mpt_entry) = MLX4_MPT_STATUS_HW;
>> +
>> +        /* Make sure the new status is written */
>> +        wmb();
>> +
>> +        err = mlx4_SYNC_TPT(dev);
>> +    } else {
>> +        int key = key_to_hw_index(mmr->key) & (dev->caps.num_mpts - 1);
>> +
>> +        struct mlx4_cmd_mailbox *mailbox =
>> +            container_of((void *)mpt_entry, struct mlx4_cmd_mailbox,
>> +                     buf);
>> +
>> +        err = mlx4_SW2HW_MPT(dev, mailbox, key);
>> +    }
>> +
>> +    mmr->pd = be32_to_cpu((*mpt_entry)->pd_flags) & MLX4_MPT_PD_MASK;
>
> Here you update the PD even if you failed while ib_core changes them
> only if it succeeded, that would causes inconsistency. I think it will
> be better to write this update only after write/put_mpt succeeded
> (meaning it might be better to move it to the caller - mlx4_ib).
>
>> +    if (!err)
>> +        mmr->enabled = MLX4_MPT_EN_HW;
>> +    return err;
>> +}
>> +EXPORT_SYMBOL_GPL(mlx4_mr_hw_write_mpt);
>> +
>> +void mlx4_mr_hw_put_mpt(struct mlx4_dev *dev,
>> +            struct mlx4_mpt_entry **mpt_entry)
>> +{
>> +    if (mlx4_is_mfunc(dev)) {
>> +        struct mlx4_cmd_mailbox *mailbox =
>> +            container_of((void *)mpt_entry, struct mlx4_cmd_mailbox,
>> +                     buf);
>> +        mlx4_free_cmd_mailbox(dev, mailbox);
>> +    }
>> +}
>> +EXPORT_SYMBOL_GPL(mlx4_mr_hw_put_mpt);
>> +
>> +int mlx4_mr_hw_change_pd(struct mlx4_dev *dev, struct mlx4_mpt_entry
>> *mpt_entry,
>> +             u32 pdn)
>> +{
>> +    u32 pd_flags = be32_to_cpu(mpt_entry->pd_flags);
>> +    /* The wrapper function will put the slave's id here */
>> +    if (mlx4_is_mfunc(dev))
>> +        pd_flags &= ~MLX4_MPT_PD_VF_MASK;
>> +    mpt_entry->pd_flags = cpu_to_be32((pd_flags &  ~MLX4_MPT_PD_MASK) |
>
> Here you protect against bad pd_flags (corrupted mpt_entry was given?)
> with & ~MLX4_MPT_PD_MASK, but you might hide the mpt_corruption by
> masking it... isn't it better to fail (internal error) in this case?
>
>> +                      (pdn & MLX4_MPT_PD_MASK)
>> +                      | MLX4_MPT_PD_FLAG_EN_INV);
>> +    return 0;
>> +}
>> +EXPORT_SYMBOL_GPL(mlx4_mr_hw_change_pd);
>> +
>> +int mlx4_mr_hw_change_access(struct mlx4_dev *dev,
>> +                 struct mlx4_mpt_entry *mpt_entry,
>> +                 u32 access)
>> +{
>> +    u32 flags = (be32_to_cpu(mpt_entry->flags) & ~MLX4_PERM_MASK) |
>> +            (access & MLX4_PERM_MASK);
>> +
>> +    mpt_entry->flags = cpu_to_be32(flags);
>
> Note that you don't update the mmr access flags no where. after you
> succeed query_mr will probably return the old permissions.
>
>> +    return 0;
>> +}
>> +EXPORT_SYMBOL_GPL(mlx4_mr_hw_change_access);
>> +
>>   static int mlx4_mr_alloc_reserved(struct mlx4_dev *dev, u32 mridx,
>> u32 pd,
>>                  u64 iova, u64 size, u32 access, int npages,
>>                  int page_shift, struct mlx4_mr *mr)
>> @@ -463,6 +588,41 @@ int mlx4_mr_free(struct mlx4_dev *dev, struct
>> mlx4_mr *mr)
>>   }
>>   EXPORT_SYMBOL_GPL(mlx4_mr_free);
>>
>> +void mlx4_mr_rereg_mem_cleanup(struct mlx4_dev *dev, struct mlx4_mr *mr)
>> +{
>> +    mlx4_mtt_cleanup(dev, &mr->mtt);
>> +}
>> +EXPORT_SYMBOL_GPL(mlx4_mr_rereg_mem_cleanup);
>> +
>> +int mlx4_mr_rereg_mem_write(struct mlx4_dev *dev, struct mlx4_mr *mr,
>> +                u64 iova, u64 size, int npages,
>> +                int page_shift, struct mlx4_mpt_entry *mpt_entry)
>> +{
>> +    int err;
>> +
>> +    mpt_entry->start       = cpu_to_be64(mr->iova);
>> +    mpt_entry->length      = cpu_to_be64(mr->size);
>
> You pass u64 iova, u64 size, but you don't use them...
> probably better to use them (and update the mmr after the change - at
> mlx4_ib)
>
>> +    mpt_entry->entity_size = cpu_to_be32(mr->mtt.page_shift);
>> +
>> +    err = mlx4_mtt_init(dev, npages, page_shift, &mr->mtt);
>> +    if (err)
>> +        return err;
>> +
>> +    if (mr->mtt.order < 0) {
>> +        mpt_entry->flags |= cpu_to_be32(MLX4_MPT_FLAG_PHYSICAL);
>> +        mpt_entry->mtt_addr = 0;
>> +    } else {
>> +        mpt_entry->mtt_addr = cpu_to_be64(mlx4_mtt_addr(dev,
>> +                          &mr->mtt));
>> +        if (mr->mtt.page_shift == 0)
>> +            mpt_entry->mtt_sz    = cpu_to_be32(1 << mr->mtt.order);
>> +    }
>> +    mr->enabled = MLX4_MPT_EN_SW;
>> +
>> +    return 0;
>> +}
>> +EXPORT_SYMBOL_GPL(mlx4_mr_rereg_mem_write);
>> +
>>   int mlx4_mr_enable(struct mlx4_dev *dev, struct mlx4_mr *mr)
>>   {
>>       struct mlx4_cmd_mailbox *mailbox;
>> diff --git a/drivers/net/ethernet/mellanox/mlx4/resource_tracker.c
>> b/drivers/net/ethernet/mellanox/mlx4/resource_tracker.c
>> index 0efc136..1089367 100644
>> --- a/drivers/net/ethernet/mellanox/mlx4/resource_tracker.c
>> +++ b/drivers/net/ethernet/mellanox/mlx4/resource_tracker.c
>> @@ -2613,12 +2613,34 @@ int mlx4_QUERY_MPT_wrapper(struct mlx4_dev
>> *dev, int slave,
>>       if (err)
>>           return err;
>>
>> -    if (mpt->com.from_state != RES_MPT_HW) {
>> +    if (mpt->com.from_state == RES_MPT_MAPPED) {
>> +        /* In order to allow rereg in SRIOV, we need to alter the MPT
>> entry. To do
>> +         * that, the VF must read the MPT. But since the MPT entry
>> memory is not
>> +         * in the VF's virtual memory space, it must use QUERY_MPT to
>> obtain the
>> +         * entry contents. To guarantee that the MPT cannot be
>> changed, the driver
>> +         * must perform HW2SW_MPT before this query and return the
>> MPT entry to HW
>> +         * ownership fofollowing the change. The change here allows
>> the VF to
>> +         * perform QUERY_MPT also when the entry is in SW ownership.
>> +         */
>> +        struct mlx4_mpt_entry *mpt_entry = mlx4_table_find(
>> +                    &mlx4_priv(dev)->mr_table.dmpt_table,
>> +                    mpt->key, NULL);
>> +
>> +        if (NULL == mpt_entry || NULL == outbox->buf) {
>> +            err = -EINVAL;
>> +            goto out;
>> +        }
>> +
>> +        memcpy(outbox->buf, mpt_entry, sizeof(*mpt_entry));
>> +
>> +        err = 0;
>> +    } else if (mpt->com.from_state == RES_MPT_HW) {
>> +        err = mlx4_DMA_wrapper(dev, slave, vhcr, inbox, outbox, cmd);
>> +    } else {
>>           err = -EBUSY;
>>           goto out;
>>       }
>>
>> -    err = mlx4_DMA_wrapper(dev, slave, vhcr, inbox, outbox, cmd);
>>
>>   out:
>>       put_res(dev, slave, id, RES_MPT);
>> diff --git a/include/linux/mlx4/device.h b/include/linux/mlx4/device.h
>> index b12f4bb..a64c397 100644
>> --- a/include/linux/mlx4/device.h
>> +++ b/include/linux/mlx4/device.h
>> @@ -262,6 +262,7 @@ enum {
>>       MLX4_PERM_REMOTE_WRITE    = 1 << 13,
>>       MLX4_PERM_ATOMIC    = 1 << 14,
>>       MLX4_PERM_BIND_MW    = 1 << 15,
>> +    MLX4_PERM_MASK        = 0xFC00
>>   };
>>
>>   enum {
>> @@ -1243,4 +1244,19 @@ int mlx4_vf_smi_enabled(struct mlx4_dev *dev,
>> int slave, int port);
>>   int mlx4_vf_get_enable_smi_admin(struct mlx4_dev *dev, int slave,
>> int port);
>>   int mlx4_vf_set_enable_smi_admin(struct mlx4_dev *dev, int slave,
>> int port,
>>                    int enable);
>> +int mlx4_mr_hw_get_mpt(struct mlx4_dev *dev, struct mlx4_mr *mmr,
>> +               struct mlx4_mpt_entry ***mpt_entry);
>> +int mlx4_mr_hw_write_mpt(struct mlx4_dev *dev, struct mlx4_mr *mmr,
>> +             struct mlx4_mpt_entry **mpt_entry);
>> +int mlx4_mr_hw_change_pd(struct mlx4_dev *dev, struct mlx4_mpt_entry
>> *mpt_entry,
>> +             u32 pdn);
>> +int mlx4_mr_hw_change_access(struct mlx4_dev *dev,
>> +                 struct mlx4_mpt_entry *mpt_entry,
>> +                 u32 access);
>> +void mlx4_mr_hw_put_mpt(struct mlx4_dev *dev,
>> +            struct mlx4_mpt_entry **mpt_entry);
>> +void mlx4_mr_rereg_mem_cleanup(struct mlx4_dev *dev, struct mlx4_mr
>> *mr);
>> +int mlx4_mr_rereg_mem_write(struct mlx4_dev *dev, struct mlx4_mr *mr,
>> +                u64 iova, u64 size, int npages,
>> +                int page_shift, struct mlx4_mpt_entry *mpt_entry);
>>   #endif /* MLX4_DEVICE_H */
>>
>

Hi Sagi,

Thanks for the comments.
I'll look into that and fix/reply.

Thanks,
Matan
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 7+ messages in thread

end of thread, other threads:[~2014-08-17 13:16 UTC | newest]

Thread overview: 7+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2014-07-31  8:01 [PATCH for-next 0/3] Add user MR re-registeration support Or Gerlitz
     [not found] ` <1406793690-3816-1-git-send-email-ogerlitz-VPRAkNaXOzVWk0Htik3J/w@public.gmane.org>
2014-07-31  8:01   ` [PATCH for-next 1/3] IB/core: " Or Gerlitz
2014-07-31  8:01   ` [PATCH for-next 2/3] mlx4_core: Add helper functions to support MR re-registration Or Gerlitz
     [not found]     ` <1406793690-3816-3-git-send-email-ogerlitz-VPRAkNaXOzVWk0Htik3J/w@public.gmane.org>
2014-08-06 10:42       ` Sagi Grimberg
     [not found]         ` <53E206A2.6040800-LDSdmyG8hGV8YrgS2mwiifqBs+8SCbDb@public.gmane.org>
2014-08-17 13:16           ` Matan Barak
2014-07-31  8:01   ` [PATCH for-next 3/3] IB/mlx4_ib: Add support for user " Or Gerlitz
     [not found]     ` <1406793690-3816-4-git-send-email-ogerlitz-VPRAkNaXOzVWk0Htik3J/w@public.gmane.org>
2014-08-06 10:46       ` Sagi Grimberg

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.