All of lore.kernel.org
 help / color / mirror / Atom feed
* [RFC contig pages support 0/2] Add contiguous pages support
@ 2015-12-08 15:15 Yishai Hadas
       [not found] ` <1449587707-24214-1-git-send-email-yishaih-VPRAkNaXOzVWk0Htik3J/w@public.gmane.org>
  0 siblings, 1 reply; 20+ messages in thread
From: Yishai Hadas @ 2015-12-08 15:15 UTC (permalink / raw)
  To: dledford-H+wXaHxf7aLQT0dZR+AlfA
  Cc: linux-rdma-u79uwXL29TY76Z2rM5mHXA,
	yishaih-VPRAkNaXOzVWk0Htik3J/w, ogerlitz-VPRAkNaXOzVWk0Htik3J/w,
	talal-VPRAkNaXOzVWk0Htik3J/w

This RFC introduces the ability for user space applications
to work with contiguous pages and boost their performance by
reducing the HCA's translation cycles.

IB core introduces a new structure named 'cmem' which
represents a contiguous memory region and exposes some APIs
to work with it.

An IB driver can use those APIs on behalf of a user request to
achieve this task.

Specifically, below APIs were added to enable it:
ib_cmem_alloc_contiguous_pages:
- Allocate contiguous pages based on user context, page order, total size.

ib_cmem_release_contiguous_pages:
- Release memory allocated by ib_cmem_alloc_contiguous_pages.

ib_cmem_map_contiguous_pages_to_vma:
- Map pre-allocated 'cmem' into VMA.

User space can request this service by calling mmap with a
command encoding the request. The low level driver
will use the above APIs to fulfil the request.

Patches:
#1 - This patch exposes the 'cmem' API.
#2 - This patch introduces mlx5 driver usage of this API.

Yishai Hadas (2):
  IB: Supports contiguous memory operations
  IB/mlx5: Exporting to user space the contiguous allocation capability

 drivers/infiniband/core/Makefile     |   2 +-
 drivers/infiniband/core/cmem.c       | 245 +++++++++++++++++++++++++++++++++++
 drivers/infiniband/hw/mlx5/main.c    |  35 ++++-
 drivers/infiniband/hw/mlx5/mlx5_ib.h |   4 +-
 include/rdma/ib_cmem.h               |  41 ++++++
 5 files changed, 324 insertions(+), 3 deletions(-)
 create mode 100644 drivers/infiniband/core/cmem.c
 create mode 100644 include/rdma/ib_cmem.h

-- 
1.8.3.1

--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 20+ messages in thread

* [RFC contig pages support 1/2] IB: Supports contiguous memory operations
       [not found] ` <1449587707-24214-1-git-send-email-yishaih-VPRAkNaXOzVWk0Htik3J/w@public.gmane.org>
@ 2015-12-08 15:15   ` Yishai Hadas
       [not found]     ` <1449587707-24214-2-git-send-email-yishaih-VPRAkNaXOzVWk0Htik3J/w@public.gmane.org>
  2015-12-08 15:15   ` [RFC contig pages support 2/2] IB/mlx5: Exporting to user space the contiguous allocation capability Yishai Hadas
  1 sibling, 1 reply; 20+ messages in thread
From: Yishai Hadas @ 2015-12-08 15:15 UTC (permalink / raw)
  To: dledford-H+wXaHxf7aLQT0dZR+AlfA
  Cc: linux-rdma-u79uwXL29TY76Z2rM5mHXA,
	yishaih-VPRAkNaXOzVWk0Htik3J/w, ogerlitz-VPRAkNaXOzVWk0Htik3J/w,
	talal-VPRAkNaXOzVWk0Htik3J/w

New structure 'cmem' represents the contiguous allocated memory.
It supports:
Allocate, Free, 'Map to virtual address' operations, etc.

Signed-off-by: Yishai Hadas <yishaih-VPRAkNaXOzVWk0Htik3J/w@public.gmane.org>
---
 drivers/infiniband/core/Makefile |   2 +-
 drivers/infiniband/core/cmem.c   | 245 +++++++++++++++++++++++++++++++++++++++
 include/rdma/ib_cmem.h           |  41 +++++++
 3 files changed, 287 insertions(+), 1 deletion(-)
 create mode 100644 drivers/infiniband/core/cmem.c
 create mode 100644 include/rdma/ib_cmem.h

diff --git a/drivers/infiniband/core/Makefile b/drivers/infiniband/core/Makefile
index d43a899..8549ea4 100644
--- a/drivers/infiniband/core/Makefile
+++ b/drivers/infiniband/core/Makefile
@@ -11,7 +11,7 @@ obj-$(CONFIG_INFINIBAND_USER_ACCESS) +=	ib_uverbs.o ib_ucm.o \
 ib_core-y :=			packer.o ud_header.o verbs.o sysfs.o \
 				device.o fmr_pool.o cache.o netlink.o \
 				roce_gid_mgmt.o
-ib_core-$(CONFIG_INFINIBAND_USER_MEM) += umem.o
+ib_core-$(CONFIG_INFINIBAND_USER_MEM) += umem.o cmem.o
 ib_core-$(CONFIG_INFINIBAND_ON_DEMAND_PAGING) += umem_odp.o umem_rbtree.o
 
 ib_mad-y :=			mad.o smi.o agent.o mad_rmpp.o
diff --git a/drivers/infiniband/core/cmem.c b/drivers/infiniband/core/cmem.c
new file mode 100644
index 0000000..21d8573
--- /dev/null
+++ b/drivers/infiniband/core/cmem.c
@@ -0,0 +1,245 @@
+#include <linux/mm.h>
+#include <linux/dma-mapping.h>
+#include <linux/sched.h>
+#include <linux/export.h>
+#include <linux/dma-attrs.h>
+#include <linux/slab.h>
+#include <rdma/ib_cmem.h>
+#include "uverbs.h"
+
+static void ib_cmem_release(struct kref *ref)
+{
+	struct ib_cmem *cmem;
+	struct ib_cmem_block *cmem_block, *tmp;
+	unsigned long ntotal_pages;
+
+	cmem = container_of(ref, struct ib_cmem, refcount);
+
+	list_for_each_entry_safe(cmem_block, tmp, &cmem->ib_cmem_block, list) {
+		__free_pages(cmem_block->page, cmem->block_order);
+		list_del(&cmem_block->list);
+		kfree(cmem_block);
+	}
+	/* no locking is needed:
+	  * ib_cmem_release is called from vm_close which is always called
+	  * with mm->mmap_sem held for writing.
+	  * The only exception is when the process is shutting down, but in
+	  * that case the counter is not relevant any more.
+	  */
+	if (current->mm) {
+		ntotal_pages = PAGE_ALIGN(cmem->length) >> PAGE_SHIFT;
+		current->mm->pinned_vm -= ntotal_pages;
+	}
+	kfree(cmem);
+}
+
+/**
+ * ib_cmem_release_contiguous_pages - release memory allocated by
+ *                                              ib_cmem_alloc_contiguous_pages.
+ * @cmem: cmem struct to release
+ */
+void ib_cmem_release_contiguous_pages(struct ib_cmem *cmem)
+{
+	kref_put(&cmem->refcount, ib_cmem_release);
+}
+EXPORT_SYMBOL(ib_cmem_release_contiguous_pages);
+
+static void cmem_vma_open(struct vm_area_struct *area)
+{
+	struct ib_cmem *ib_cmem;
+
+	ib_cmem = (struct ib_cmem *)(area->vm_private_data);
+
+	/* vm_open and vm_close are always called with mm->mmap_sem held for
+	  * writing. The only exception is when the process is shutting down, at
+	  * which point vm_close is called with no locks held, but since it is
+	  * after the VMAs have been detached, it is impossible that vm_open will
+	  * be called. Therefore, there is no need to synchronize the kref_get and
+	  * kref_put calls.
+	*/
+	kref_get(&ib_cmem->refcount);
+}
+
+static void cmem_vma_close(struct vm_area_struct *area)
+{
+	struct ib_cmem *cmem;
+
+	cmem = (struct ib_cmem *)(area->vm_private_data);
+
+	ib_cmem_release_contiguous_pages(cmem);
+}
+
+static const struct vm_operations_struct cmem_contig_pages_vm_ops = {
+	.open = cmem_vma_open,
+	.close = cmem_vma_close
+};
+
+/**
+ * ib_cmem_map_contiguous_pages_to_vma - map contiguous pages into VMA
+ * @ib_cmem: cmem structure returned by ib_cmem_alloc_contiguous_pages
+ * @vma: VMA to inject pages into.
+ */
+int ib_cmem_map_contiguous_pages_to_vma(struct ib_cmem *ib_cmem,
+					struct vm_area_struct *vma)
+{
+	int ret;
+	unsigned long page_entry;
+	unsigned long ntotal_pages;
+	unsigned long ncontig_pages;
+	unsigned long total_size;
+	struct page *page;
+	unsigned long vma_entry_number = 0;
+	struct ib_cmem_block *ib_cmem_block = NULL;
+
+	total_size = vma->vm_end - vma->vm_start;
+	if (ib_cmem->length != total_size)
+		return -EINVAL;
+
+	if (total_size != PAGE_ALIGN(total_size)) {
+		WARN(1,
+		     "ib_cmem_map: total size %lu not aligned to page size\n",
+		     total_size);
+		return -EINVAL;
+	}
+
+	ntotal_pages = total_size >> PAGE_SHIFT;
+	ncontig_pages = 1 << ib_cmem->block_order;
+
+	list_for_each_entry(ib_cmem_block, &ib_cmem->ib_cmem_block, list) {
+		page = ib_cmem_block->page;
+		for (page_entry = 0; page_entry < ncontig_pages; page_entry++) {
+			/* We reached end of vma - going out from both loops */
+			if (vma_entry_number >= ntotal_pages)
+				goto end;
+
+			ret = vm_insert_page(vma, vma->vm_start +
+				(vma_entry_number << PAGE_SHIFT), page);
+			if (ret < 0)
+				goto err_vm_insert;
+
+			vma_entry_number++;
+			page++;
+		}
+	}
+
+end:
+
+	/* We expect to have enough pages   */
+	if (vma_entry_number >= ntotal_pages) {
+		vma->vm_ops =  &cmem_contig_pages_vm_ops;
+		vma->vm_private_data = ib_cmem;
+		return 0;
+	}
+	/* Not expected but if we reached here
+	  * not enough contiguous pages were registered
+	  */
+	ret = -EINVAL;
+
+err_vm_insert:
+
+	zap_vma_ptes(vma, vma->vm_start, total_size);
+	return ret;
+}
+EXPORT_SYMBOL(ib_cmem_map_contiguous_pages_to_vma);
+
+/**
+ * ib_cmem_alloc_contiguous_pages - allocate contiguous pages
+ * @context: userspace context to allocate memory for
+ * @total_size: total required size for that allocation.
+ * @page_size_order: order of one contiguous page.
+ * @numa_node: NUMA node to allocate memory from;
+ *             when numa_node < 0 use the default node.
+ */
+struct ib_cmem *ib_cmem_alloc_contiguous_pages(struct ib_ucontext *context,
+					       unsigned long total_size,
+					       unsigned long page_size_order,
+					       int numa_node)
+{
+	struct ib_cmem *cmem;
+	unsigned long ntotal_pages;
+	unsigned long ncontiguous_pages;
+	unsigned long ncontiguous_groups;
+	struct page *page;
+	int i;
+	int ncontiguous_pages_order;
+	struct ib_cmem_block *ib_cmem_block;
+	unsigned long locked;
+	unsigned long lock_limit;
+
+	if (page_size_order < PAGE_SHIFT || page_size_order > 31)
+		return ERR_PTR(-EINVAL);
+
+	cmem = kzalloc(sizeof(*cmem), GFP_KERNEL);
+	if (!cmem)
+		return ERR_PTR(-ENOMEM);
+
+	kref_init(&cmem->refcount);
+	cmem->context   = context;
+	INIT_LIST_HEAD(&cmem->ib_cmem_block);
+
+	/* Total size is expected to be already page aligned -
+	  * verifying anyway.
+	  */
+	ntotal_pages = PAGE_ALIGN(total_size) >> PAGE_SHIFT;
+	/* ib_cmem_alloc_contiguous_pages is called as part of mmap
+	  * with mm->mmap_sem held for writing.
+	  * No need to lock
+	  */
+	locked     = ntotal_pages + current->mm->pinned_vm;
+	lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
+
+	if ((locked > lock_limit) && !capable(CAP_IPC_LOCK))
+		goto err_alloc;
+
+	/* How many contiguous pages do we need in 1 block */
+	ncontiguous_pages = (1 << page_size_order) >> PAGE_SHIFT;
+	ncontiguous_pages_order = ilog2(ncontiguous_pages);
+	ncontiguous_groups = (ntotal_pages >> ncontiguous_pages_order)  +
+		(!!(ntotal_pages & (ncontiguous_pages - 1)));
+
+	/* Checking MAX_ORDER to prevent WARN via calling alloc_pages below */
+	if (ncontiguous_pages_order >= MAX_ORDER)
+		goto err_alloc;
+	/* we set block_order before starting allocation to prevent
+	  * a leak in a failure flow in ib_cmem_release.
+	  * cmem->length has at that step value 0 from kzalloc as expected
+	  */
+	cmem->block_order = ncontiguous_pages_order;
+	for (i = 0; i < ncontiguous_groups; i++) {
+		/* Allocating the managed entry */
+		ib_cmem_block = kmalloc(sizeof(*ib_cmem_block),
+					GFP_KERNEL);
+		if (!ib_cmem_block)
+			goto err_alloc;
+
+		if (numa_node < 0)
+			page =  alloc_pages(GFP_HIGHUSER | __GFP_ZERO |
+					    __GFP_COMP | __GFP_NOWARN,
+					    ncontiguous_pages_order);
+		else
+			page =  alloc_pages_node(numa_node,
+						 GFP_HIGHUSER | __GFP_ZERO |
+						 __GFP_COMP | __GFP_NOWARN,
+						 ncontiguous_pages_order);
+
+		if (!page) {
+			kfree(ib_cmem_block);
+			/* We should deallocate previously successful
+			  * allocations, if any exist.
+			  */
+			goto err_alloc;
+		}
+
+		ib_cmem_block->page = page;
+		list_add_tail(&ib_cmem_block->list, &cmem->ib_cmem_block);
+	}
+
+	cmem->length = total_size;
+	current->mm->pinned_vm = locked;
+	return cmem;
+
+err_alloc:
+	ib_cmem_release_contiguous_pages(cmem);
+	return ERR_PTR(-ENOMEM);
+}
+EXPORT_SYMBOL(ib_cmem_alloc_contiguous_pages);
diff --git a/include/rdma/ib_cmem.h b/include/rdma/ib_cmem.h
new file mode 100644
index 0000000..5f26a49
--- /dev/null
+++ b/include/rdma/ib_cmem.h
@@ -0,0 +1,41 @@
+#ifndef IB_CMEM_H
+#define IB_CMEM_H
+
+#include <rdma/ib_umem.h>
+#include <rdma/ib_verbs.h>
+
+/* contiguous memory structure */
+struct ib_cmem {
+	struct ib_ucontext     *context;
+	size_t			length;
+	/* Link list of contiguous blocks being part of that cmem  */
+	struct list_head ib_cmem_block;
+
+	/* Order of cmem block,  2^ block_order will equal number
+	  * of physical pages per block
+	  */
+	unsigned long    block_order;
+	/* Reference counter for that memory area.
+	  * When the value becomes 0 pages will be returned to the kernel.
+	  */
+	struct kref refcount;
+};
+
+struct ib_cmem_block {
+	struct list_head	list;
+	/* page will point to the page struct of the head page
+	  * in the current compound page.
+	  * block order is saved once as part of ib_cmem.
+	  */
+	struct page            *page;
+};
+
+int ib_cmem_map_contiguous_pages_to_vma(struct ib_cmem *ib_cmem,
+					struct vm_area_struct *vma);
+struct ib_cmem *ib_cmem_alloc_contiguous_pages(struct ib_ucontext *context,
+					       unsigned long total_size,
+					       unsigned long page_size_order,
+					       int numa_node);
+void ib_cmem_release_contiguous_pages(struct ib_cmem *cmem);
+
+#endif
-- 
1.8.3.1

--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply related	[flat|nested] 20+ messages in thread

* [RFC contig pages support 2/2] IB/mlx5: Exporting to user space the contiguous allocation capability
       [not found] ` <1449587707-24214-1-git-send-email-yishaih-VPRAkNaXOzVWk0Htik3J/w@public.gmane.org>
  2015-12-08 15:15   ` [RFC contig pages support 1/2] IB: Supports contiguous memory operations Yishai Hadas
@ 2015-12-08 15:15   ` Yishai Hadas
  1 sibling, 0 replies; 20+ messages in thread
From: Yishai Hadas @ 2015-12-08 15:15 UTC (permalink / raw)
  To: dledford-H+wXaHxf7aLQT0dZR+AlfA
  Cc: linux-rdma-u79uwXL29TY76Z2rM5mHXA,
	yishaih-VPRAkNaXOzVWk0Htik3J/w, ogerlitz-VPRAkNaXOzVWk0Htik3J/w,
	talal-VPRAkNaXOzVWk0Htik3J/w

Extend mlx5_ib_mmap to recognize 'allocate contiguous' command.
Offset field will be encoded to hold both command and its data.
Last 8 bits will hold the command, this will enable future
extension for further commands.

Signed-off-by: Yishai Hadas <yishaih-VPRAkNaXOzVWk0Htik3J/w@public.gmane.org>
---
 drivers/infiniband/hw/mlx5/main.c    | 35 ++++++++++++++++++++++++++++++++++-
 drivers/infiniband/hw/mlx5/mlx5_ib.h |  4 +++-
 2 files changed, 37 insertions(+), 2 deletions(-)

diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c
index 7e97cb5..46de426 100644
--- a/drivers/infiniband/hw/mlx5/main.c
+++ b/drivers/infiniband/hw/mlx5/main.c
@@ -43,6 +43,7 @@
 #include <linux/mlx5/vport.h>
 #include <rdma/ib_smi.h>
 #include <rdma/ib_umem.h>
+#include <rdma/ib_cmem.h>
 #include "user.h"
 #include "mlx5_ib.h"
 
@@ -751,6 +752,11 @@ static int get_index(unsigned long offset)
 	return get_arg(offset);
 }
 
+static int get_pg_order(unsigned long offset)
+{
+	return get_arg(offset);
+}
+
 static int mlx5_ib_mmap(struct ib_ucontext *ibcontext, struct vm_area_struct *vma)
 {
 	struct mlx5_ib_ucontext *context = to_mucontext(ibcontext);
@@ -759,6 +765,11 @@ static int mlx5_ib_mmap(struct ib_ucontext *ibcontext, struct vm_area_struct *vm
 	unsigned long command;
 	unsigned long idx;
 	phys_addr_t pfn;
+	unsigned long total_size;
+	unsigned long order;
+	struct ib_cmem *ib_cmem;
+	int err;
+	int local_numa_node;
 
 	command = get_command(vma->vm_pgoff);
 	switch (command) {
@@ -784,8 +795,30 @@ static int mlx5_ib_mmap(struct ib_ucontext *ibcontext, struct vm_area_struct *vm
 			    (unsigned long long)pfn << PAGE_SHIFT);
 		break;
 
+	case MLX5_IB_MMAP_GET_CONTIGUOUS_PAGES_CPU_NUMA:
+	case MLX5_IB_MMAP_GET_CONTIGUOUS_PAGES_DEV_NUMA:
 	case MLX5_IB_MMAP_GET_CONTIGUOUS_PAGES:
-		return -ENOSYS;
+		if (command == MLX5_IB_MMAP_GET_CONTIGUOUS_PAGES_CPU_NUMA)
+			local_numa_node = numa_node_id();
+		else if (command == MLX5_IB_MMAP_GET_CONTIGUOUS_PAGES_DEV_NUMA)
+			local_numa_node = dev_to_node(&dev->mdev->pdev->dev);
+		else
+			local_numa_node = -1;
+
+		total_size = vma->vm_end - vma->vm_start;
+		order = get_pg_order(vma->vm_pgoff);
+
+		ib_cmem = ib_cmem_alloc_contiguous_pages(ibcontext, total_size,
+							 order, local_numa_node);
+		if (IS_ERR(ib_cmem))
+			return PTR_ERR(ib_cmem);
+
+		err = ib_cmem_map_contiguous_pages_to_vma(ib_cmem, vma);
+		if (err) {
+			ib_cmem_release_contiguous_pages(ib_cmem);
+			return err;
+		}
+		break;
 
 	default:
 		return -EINVAL;
diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h b/drivers/infiniband/hw/mlx5/mlx5_ib.h
index 6333472..1e2c57e 100644
--- a/drivers/infiniband/hw/mlx5/mlx5_ib.h
+++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h
@@ -62,7 +62,9 @@ enum {
 
 enum mlx5_ib_mmap_cmd {
 	MLX5_IB_MMAP_REGULAR_PAGE		= 0,
-	MLX5_IB_MMAP_GET_CONTIGUOUS_PAGES	= 1, /* always last */
+	MLX5_IB_MMAP_GET_CONTIGUOUS_PAGES	= 1,
+	MLX5_IB_MMAP_GET_CONTIGUOUS_PAGES_CPU_NUMA  = 0xFC,
+	MLX5_IB_MMAP_GET_CONTIGUOUS_PAGES_DEV_NUMA  = 0xFD,
 };
 
 enum {
-- 
1.8.3.1

--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply related	[flat|nested] 20+ messages in thread

* Re: [RFC contig pages support 1/2] IB: Supports contiguous memory operations
  2015-12-08 15:15   ` [RFC contig pages support 1/2] IB: Supports contiguous memory operations Yishai Hadas
@ 2015-12-08 15:18         ` Christoph Hellwig
  0 siblings, 0 replies; 20+ messages in thread
From: Christoph Hellwig @ 2015-12-08 15:18 UTC (permalink / raw)
  To: Yishai Hadas
  Cc: dledford-H+wXaHxf7aLQT0dZR+AlfA,
	linux-rdma-u79uwXL29TY76Z2rM5mHXA,
	ogerlitz-VPRAkNaXOzVWk0Htik3J/w, talal-VPRAkNaXOzVWk0Htik3J/w,
	linux-mm-Bw31MaZKKs3YtjvyW6yDsg

There is absolutely nothing IB specific here.  If you want to support
anonymous mmaps to allocate large contiguous pages work with the MM
folks on providing that in a generic fashion.

[full quote alert for reference:]

On Tue, Dec 08, 2015 at 05:15:06PM +0200, Yishai Hadas wrote:
> New structure 'cmem' represents the contiguous allocated memory.
> It supports:
> Allocate, Free, 'Map to virtual address' operations, etc.
> 
> Signed-off-by: Yishai Hadas <yishaih-VPRAkNaXOzVWk0Htik3J/w@public.gmane.org>
> ---
>  drivers/infiniband/core/Makefile |   2 +-
>  drivers/infiniband/core/cmem.c   | 245 +++++++++++++++++++++++++++++++++++++++
>  include/rdma/ib_cmem.h           |  41 +++++++
>  3 files changed, 287 insertions(+), 1 deletion(-)
>  create mode 100644 drivers/infiniband/core/cmem.c
>  create mode 100644 include/rdma/ib_cmem.h
> 
> diff --git a/drivers/infiniband/core/Makefile b/drivers/infiniband/core/Makefile
> index d43a899..8549ea4 100644
> --- a/drivers/infiniband/core/Makefile
> +++ b/drivers/infiniband/core/Makefile
> @@ -11,7 +11,7 @@ obj-$(CONFIG_INFINIBAND_USER_ACCESS) +=	ib_uverbs.o ib_ucm.o \
>  ib_core-y :=			packer.o ud_header.o verbs.o sysfs.o \
>  				device.o fmr_pool.o cache.o netlink.o \
>  				roce_gid_mgmt.o
> -ib_core-$(CONFIG_INFINIBAND_USER_MEM) += umem.o
> +ib_core-$(CONFIG_INFINIBAND_USER_MEM) += umem.o cmem.o
>  ib_core-$(CONFIG_INFINIBAND_ON_DEMAND_PAGING) += umem_odp.o umem_rbtree.o
>  
>  ib_mad-y :=			mad.o smi.o agent.o mad_rmpp.o
> diff --git a/drivers/infiniband/core/cmem.c b/drivers/infiniband/core/cmem.c
> new file mode 100644
> index 0000000..21d8573
> --- /dev/null
> +++ b/drivers/infiniband/core/cmem.c
> @@ -0,0 +1,245 @@
> +#include <linux/mm.h>
> +#include <linux/dma-mapping.h>
> +#include <linux/sched.h>
> +#include <linux/export.h>
> +#include <linux/dma-attrs.h>
> +#include <linux/slab.h>
> +#include <rdma/ib_cmem.h>
> +#include "uverbs.h"
> +
> +static void ib_cmem_release(struct kref *ref)
> +{
> +	struct ib_cmem *cmem;
> +	struct ib_cmem_block *cmem_block, *tmp;
> +	unsigned long ntotal_pages;
> +
> +	cmem = container_of(ref, struct ib_cmem, refcount);
> +
> +	list_for_each_entry_safe(cmem_block, tmp, &cmem->ib_cmem_block, list) {
> +		__free_pages(cmem_block->page, cmem->block_order);
> +		list_del(&cmem_block->list);
> +		kfree(cmem_block);
> +	}
> +	/* no locking is needed:
> +	  * ib_cmem_release is called from vm_close which is always called
> +	  * with mm->mmap_sem held for writing.
> +	  * The only exception is when the process is shutting down, but in
> +	  * that case the counter is not relevant any more.
> +	  */
> +	if (current->mm) {
> +		ntotal_pages = PAGE_ALIGN(cmem->length) >> PAGE_SHIFT;
> +		current->mm->pinned_vm -= ntotal_pages;
> +	}
> +	kfree(cmem);
> +}
> +
> +/**
> + * ib_cmem_release_contiguous_pages - release memory allocated by
> + *                                              ib_cmem_alloc_contiguous_pages.
> + * @cmem: cmem struct to release
> + */
> +void ib_cmem_release_contiguous_pages(struct ib_cmem *cmem)
> +{
> +	kref_put(&cmem->refcount, ib_cmem_release);
> +}
> +EXPORT_SYMBOL(ib_cmem_release_contiguous_pages);
> +
> +static void cmem_vma_open(struct vm_area_struct *area)
> +{
> +	struct ib_cmem *ib_cmem;
> +
> +	ib_cmem = (struct ib_cmem *)(area->vm_private_data);
> +
> +	/* vm_open and vm_close are always called with mm->mmap_sem held for
> +	  * writing. The only exception is when the process is shutting down, at
> +	  * which point vm_close is called with no locks held, but since it is
> +	  * after the VMAs have been detached, it is impossible that vm_open will
> +	  * be called. Therefore, there is no need to synchronize the kref_get and
> +	  * kref_put calls.
> +	*/
> +	kref_get(&ib_cmem->refcount);
> +}
> +
> +static void cmem_vma_close(struct vm_area_struct *area)
> +{
> +	struct ib_cmem *cmem;
> +
> +	cmem = (struct ib_cmem *)(area->vm_private_data);
> +
> +	ib_cmem_release_contiguous_pages(cmem);
> +}
> +
> +static const struct vm_operations_struct cmem_contig_pages_vm_ops = {
> +	.open = cmem_vma_open,
> +	.close = cmem_vma_close
> +};
> +
> +/**
> + * ib_cmem_map_contiguous_pages_to_vma - map contiguous pages into VMA
> + * @ib_cmem: cmem structure returned by ib_cmem_alloc_contiguous_pages
> + * @vma: VMA to inject pages into.
> + */
> +int ib_cmem_map_contiguous_pages_to_vma(struct ib_cmem *ib_cmem,
> +					struct vm_area_struct *vma)
> +{
> +	int ret;
> +	unsigned long page_entry;
> +	unsigned long ntotal_pages;
> +	unsigned long ncontig_pages;
> +	unsigned long total_size;
> +	struct page *page;
> +	unsigned long vma_entry_number = 0;
> +	struct ib_cmem_block *ib_cmem_block = NULL;
> +
> +	total_size = vma->vm_end - vma->vm_start;
> +	if (ib_cmem->length != total_size)
> +		return -EINVAL;
> +
> +	if (total_size != PAGE_ALIGN(total_size)) {
> +		WARN(1,
> +		     "ib_cmem_map: total size %lu not aligned to page size\n",
> +		     total_size);
> +		return -EINVAL;
> +	}
> +
> +	ntotal_pages = total_size >> PAGE_SHIFT;
> +	ncontig_pages = 1 << ib_cmem->block_order;
> +
> +	list_for_each_entry(ib_cmem_block, &ib_cmem->ib_cmem_block, list) {
> +		page = ib_cmem_block->page;
> +		for (page_entry = 0; page_entry < ncontig_pages; page_entry++) {
> +			/* We reached end of vma - going out from both loops */
> +			if (vma_entry_number >= ntotal_pages)
> +				goto end;
> +
> +			ret = vm_insert_page(vma, vma->vm_start +
> +				(vma_entry_number << PAGE_SHIFT), page);
> +			if (ret < 0)
> +				goto err_vm_insert;
> +
> +			vma_entry_number++;
> +			page++;
> +		}
> +	}
> +
> +end:
> +
> +	/* We expect to have enough pages   */
> +	if (vma_entry_number >= ntotal_pages) {
> +		vma->vm_ops =  &cmem_contig_pages_vm_ops;
> +		vma->vm_private_data = ib_cmem;
> +		return 0;
> +	}
> +	/* Not expected but if we reached here
> +	  * not enough contiguous pages were registered
> +	  */
> +	ret = -EINVAL;
> +
> +err_vm_insert:
> +
> +	zap_vma_ptes(vma, vma->vm_start, total_size);
> +	return ret;
> +}
> +EXPORT_SYMBOL(ib_cmem_map_contiguous_pages_to_vma);
> +
> +/**
> + * ib_cmem_alloc_contiguous_pages - allocate contiguous pages
> + * @context: userspace context to allocate memory for
> + * @total_size: total required size for that allocation.
> + * @page_size_order: order of one contiguous page.
> + * @numa_node: NUMA node to allocate memory from;
> + *             when numa_node < 0 use the default node.
> + */
> +struct ib_cmem *ib_cmem_alloc_contiguous_pages(struct ib_ucontext *context,
> +					       unsigned long total_size,
> +					       unsigned long page_size_order,
> +					       int numa_node)
> +{
> +	struct ib_cmem *cmem;
> +	unsigned long ntotal_pages;
> +	unsigned long ncontiguous_pages;
> +	unsigned long ncontiguous_groups;
> +	struct page *page;
> +	int i;
> +	int ncontiguous_pages_order;
> +	struct ib_cmem_block *ib_cmem_block;
> +	unsigned long locked;
> +	unsigned long lock_limit;
> +
> +	if (page_size_order < PAGE_SHIFT || page_size_order > 31)
> +		return ERR_PTR(-EINVAL);
> +
> +	cmem = kzalloc(sizeof(*cmem), GFP_KERNEL);
> +	if (!cmem)
> +		return ERR_PTR(-ENOMEM);
> +
> +	kref_init(&cmem->refcount);
> +	cmem->context   = context;
> +	INIT_LIST_HEAD(&cmem->ib_cmem_block);
> +
> +	/* Total size is expected to be already page aligned -
> +	  * verifying anyway.
> +	  */
> +	ntotal_pages = PAGE_ALIGN(total_size) >> PAGE_SHIFT;
> +	/* ib_cmem_alloc_contiguous_pages is called as part of mmap
> +	  * with mm->mmap_sem held for writing.
> +	  * No need to lock
> +	  */
> +	locked     = ntotal_pages + current->mm->pinned_vm;
> +	lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
> +
> +	if ((locked > lock_limit) && !capable(CAP_IPC_LOCK))
> +		goto err_alloc;
> +
> +	/* How many contiguous pages do we need in 1 block */
> +	ncontiguous_pages = (1 << page_size_order) >> PAGE_SHIFT;
> +	ncontiguous_pages_order = ilog2(ncontiguous_pages);
> +	ncontiguous_groups = (ntotal_pages >> ncontiguous_pages_order)  +
> +		(!!(ntotal_pages & (ncontiguous_pages - 1)));
> +
> +	/* Checking MAX_ORDER to prevent WARN via calling alloc_pages below */
> +	if (ncontiguous_pages_order >= MAX_ORDER)
> +		goto err_alloc;
> +	/* we set block_order before starting allocation to prevent
> +	  * a leak in a failure flow in ib_cmem_release.
> +	  * cmem->length has at that step value 0 from kzalloc as expected
> +	  */
> +	cmem->block_order = ncontiguous_pages_order;
> +	for (i = 0; i < ncontiguous_groups; i++) {
> +		/* Allocating the managed entry */
> +		ib_cmem_block = kmalloc(sizeof(*ib_cmem_block),
> +					GFP_KERNEL);
> +		if (!ib_cmem_block)
> +			goto err_alloc;
> +
> +		if (numa_node < 0)
> +			page =  alloc_pages(GFP_HIGHUSER | __GFP_ZERO |
> +					    __GFP_COMP | __GFP_NOWARN,
> +					    ncontiguous_pages_order);
> +		else
> +			page =  alloc_pages_node(numa_node,
> +						 GFP_HIGHUSER | __GFP_ZERO |
> +						 __GFP_COMP | __GFP_NOWARN,
> +						 ncontiguous_pages_order);
> +
> +		if (!page) {
> +			kfree(ib_cmem_block);
> +			/* We should deallocate previously successful
> +			  * allocations, if any exist.
> +			  */
> +			goto err_alloc;
> +		}
> +
> +		ib_cmem_block->page = page;
> +		list_add_tail(&ib_cmem_block->list, &cmem->ib_cmem_block);
> +	}
> +
> +	cmem->length = total_size;
> +	current->mm->pinned_vm = locked;
> +	return cmem;
> +
> +err_alloc:
> +	ib_cmem_release_contiguous_pages(cmem);
> +	return ERR_PTR(-ENOMEM);
> +}
> +EXPORT_SYMBOL(ib_cmem_alloc_contiguous_pages);
> diff --git a/include/rdma/ib_cmem.h b/include/rdma/ib_cmem.h
> new file mode 100644
> index 0000000..5f26a49
> --- /dev/null
> +++ b/include/rdma/ib_cmem.h
> @@ -0,0 +1,41 @@
> +#ifndef IB_CMEM_H
> +#define IB_CMEM_H
> +
> +#include <rdma/ib_umem.h>
> +#include <rdma/ib_verbs.h>
> +
> +/* contiguous memory structure */
> +struct ib_cmem {
> +	struct ib_ucontext     *context;
> +	size_t			length;
> +	/* Link list of contiguous blocks being part of that cmem  */
> +	struct list_head ib_cmem_block;
> +
> +	/* Order of cmem block,  2^ block_order will equal number
> +	  * of physical pages per block
> +	  */
> +	unsigned long    block_order;
> +	/* Reference counter for that memory area.
> +	  * When the value becomes 0 pages will be returned to the kernel.
> +	  */
> +	struct kref refcount;
> +};
> +
> +struct ib_cmem_block {
> +	struct list_head	list;
> +	/* page will point to the page struct of the head page
> +	  * in the current compound page.
> +	  * block order is saved once as part of ib_cmem.
> +	  */
> +	struct page            *page;
> +};
> +
> +int ib_cmem_map_contiguous_pages_to_vma(struct ib_cmem *ib_cmem,
> +					struct vm_area_struct *vma);
> +struct ib_cmem *ib_cmem_alloc_contiguous_pages(struct ib_ucontext *context,
> +					       unsigned long total_size,
> +					       unsigned long page_size_order,
> +					       int numa_node);
> +void ib_cmem_release_contiguous_pages(struct ib_cmem *cmem);
> +
> +#endif
> -- 
> 1.8.3.1
> 
> --
> To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
> the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
---end quoted text---
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [RFC contig pages support 1/2] IB: Supports contiguous memory operations
@ 2015-12-08 15:18         ` Christoph Hellwig
  0 siblings, 0 replies; 20+ messages in thread
From: Christoph Hellwig @ 2015-12-08 15:18 UTC (permalink / raw)
  To: Yishai Hadas; +Cc: dledford, linux-rdma, ogerlitz, talal, linux-mm

There is absolutely nothing IB specific here.  If you want to support
anonymous mmaps to allocate large contiguous pages work with the MM
folks on providing that in a generic fashion.

[full quote alert for reference:]

On Tue, Dec 08, 2015 at 05:15:06PM +0200, Yishai Hadas wrote:
> New structure 'cmem' represents the contiguous allocated memory.
> It supports:
> Allocate, Free, 'Map to virtual address' operations, etc.
> 
> Signed-off-by: Yishai Hadas <yishaih@mellanox.com>
> ---
>  drivers/infiniband/core/Makefile |   2 +-
>  drivers/infiniband/core/cmem.c   | 245 +++++++++++++++++++++++++++++++++++++++
>  include/rdma/ib_cmem.h           |  41 +++++++
>  3 files changed, 287 insertions(+), 1 deletion(-)
>  create mode 100644 drivers/infiniband/core/cmem.c
>  create mode 100644 include/rdma/ib_cmem.h
> 
> diff --git a/drivers/infiniband/core/Makefile b/drivers/infiniband/core/Makefile
> index d43a899..8549ea4 100644
> --- a/drivers/infiniband/core/Makefile
> +++ b/drivers/infiniband/core/Makefile
> @@ -11,7 +11,7 @@ obj-$(CONFIG_INFINIBAND_USER_ACCESS) +=	ib_uverbs.o ib_ucm.o \
>  ib_core-y :=			packer.o ud_header.o verbs.o sysfs.o \
>  				device.o fmr_pool.o cache.o netlink.o \
>  				roce_gid_mgmt.o
> -ib_core-$(CONFIG_INFINIBAND_USER_MEM) += umem.o
> +ib_core-$(CONFIG_INFINIBAND_USER_MEM) += umem.o cmem.o
>  ib_core-$(CONFIG_INFINIBAND_ON_DEMAND_PAGING) += umem_odp.o umem_rbtree.o
>  
>  ib_mad-y :=			mad.o smi.o agent.o mad_rmpp.o
> diff --git a/drivers/infiniband/core/cmem.c b/drivers/infiniband/core/cmem.c
> new file mode 100644
> index 0000000..21d8573
> --- /dev/null
> +++ b/drivers/infiniband/core/cmem.c
> @@ -0,0 +1,245 @@
> +#include <linux/mm.h>
> +#include <linux/dma-mapping.h>
> +#include <linux/sched.h>
> +#include <linux/export.h>
> +#include <linux/dma-attrs.h>
> +#include <linux/slab.h>
> +#include <rdma/ib_cmem.h>
> +#include "uverbs.h"
> +
> +static void ib_cmem_release(struct kref *ref)
> +{
> +	struct ib_cmem *cmem;
> +	struct ib_cmem_block *cmem_block, *tmp;
> +	unsigned long ntotal_pages;
> +
> +	cmem = container_of(ref, struct ib_cmem, refcount);
> +
> +	list_for_each_entry_safe(cmem_block, tmp, &cmem->ib_cmem_block, list) {
> +		__free_pages(cmem_block->page, cmem->block_order);
> +		list_del(&cmem_block->list);
> +		kfree(cmem_block);
> +	}
> +	/* no locking is needed:
> +	  * ib_cmem_release is called from vm_close which is always called
> +	  * with mm->mmap_sem held for writing.
> +	  * The only exception is when the process is shutting down, but in
> +	  * that case the counter is not relevant any more.
> +	  */
> +	if (current->mm) {
> +		ntotal_pages = PAGE_ALIGN(cmem->length) >> PAGE_SHIFT;
> +		current->mm->pinned_vm -= ntotal_pages;
> +	}
> +	kfree(cmem);
> +}
> +
> +/**
> + * ib_cmem_release_contiguous_pages - release memory allocated by
> + *                                              ib_cmem_alloc_contiguous_pages.
> + * @cmem: cmem struct to release
> + */
> +void ib_cmem_release_contiguous_pages(struct ib_cmem *cmem)
> +{
> +	kref_put(&cmem->refcount, ib_cmem_release);
> +}
> +EXPORT_SYMBOL(ib_cmem_release_contiguous_pages);
> +
> +static void cmem_vma_open(struct vm_area_struct *area)
> +{
> +	struct ib_cmem *ib_cmem;
> +
> +	ib_cmem = (struct ib_cmem *)(area->vm_private_data);
> +
> +	/* vm_open and vm_close are always called with mm->mmap_sem held for
> +	  * writing. The only exception is when the process is shutting down, at
> +	  * which point vm_close is called with no locks held, but since it is
> +	  * after the VMAs have been detached, it is impossible that vm_open will
> +	  * be called. Therefore, there is no need to synchronize the kref_get and
> +	  * kref_put calls.
> +	*/
> +	kref_get(&ib_cmem->refcount);
> +}
> +
> +static void cmem_vma_close(struct vm_area_struct *area)
> +{
> +	struct ib_cmem *cmem;
> +
> +	cmem = (struct ib_cmem *)(area->vm_private_data);
> +
> +	ib_cmem_release_contiguous_pages(cmem);
> +}
> +
> +static const struct vm_operations_struct cmem_contig_pages_vm_ops = {
> +	.open = cmem_vma_open,
> +	.close = cmem_vma_close
> +};
> +
> +/**
> + * ib_cmem_map_contiguous_pages_to_vma - map contiguous pages into VMA
> + * @ib_cmem: cmem structure returned by ib_cmem_alloc_contiguous_pages
> + * @vma: VMA to inject pages into.
> + */
> +int ib_cmem_map_contiguous_pages_to_vma(struct ib_cmem *ib_cmem,
> +					struct vm_area_struct *vma)
> +{
> +	int ret;
> +	unsigned long page_entry;
> +	unsigned long ntotal_pages;
> +	unsigned long ncontig_pages;
> +	unsigned long total_size;
> +	struct page *page;
> +	unsigned long vma_entry_number = 0;
> +	struct ib_cmem_block *ib_cmem_block = NULL;
> +
> +	total_size = vma->vm_end - vma->vm_start;
> +	if (ib_cmem->length != total_size)
> +		return -EINVAL;
> +
> +	if (total_size != PAGE_ALIGN(total_size)) {
> +		WARN(1,
> +		     "ib_cmem_map: total size %lu not aligned to page size\n",
> +		     total_size);
> +		return -EINVAL;
> +	}
> +
> +	ntotal_pages = total_size >> PAGE_SHIFT;
> +	ncontig_pages = 1 << ib_cmem->block_order;
> +
> +	list_for_each_entry(ib_cmem_block, &ib_cmem->ib_cmem_block, list) {
> +		page = ib_cmem_block->page;
> +		for (page_entry = 0; page_entry < ncontig_pages; page_entry++) {
> +			/* We reached end of vma - going out from both loops */
> +			if (vma_entry_number >= ntotal_pages)
> +				goto end;
> +
> +			ret = vm_insert_page(vma, vma->vm_start +
> +				(vma_entry_number << PAGE_SHIFT), page);
> +			if (ret < 0)
> +				goto err_vm_insert;
> +
> +			vma_entry_number++;
> +			page++;
> +		}
> +	}
> +
> +end:
> +
> +	/* We expect to have enough pages   */
> +	if (vma_entry_number >= ntotal_pages) {
> +		vma->vm_ops =  &cmem_contig_pages_vm_ops;
> +		vma->vm_private_data = ib_cmem;
> +		return 0;
> +	}
> +	/* Not expected but if we reached here
> +	  * not enough contiguous pages were registered
> +	  */
> +	ret = -EINVAL;
> +
> +err_vm_insert:
> +
> +	zap_vma_ptes(vma, vma->vm_start, total_size);
> +	return ret;
> +}
> +EXPORT_SYMBOL(ib_cmem_map_contiguous_pages_to_vma);
> +
> +/**
> + * ib_cmem_alloc_contiguous_pages - allocate contiguous pages
> + * @context: userspace context to allocate memory for
> + * @total_size: total required size for that allocation.
> + * @page_size_order: order of one contiguous page.
> + * @numa_node: NUMA node to allocate memory from;
> + *             when numa_node < 0, the default NUMA node is used.
> + */
> +struct ib_cmem *ib_cmem_alloc_contiguous_pages(struct ib_ucontext *context,
> +					       unsigned long total_size,
> +					       unsigned long page_size_order,
> +					       int numa_node)
> +{
> +	struct ib_cmem *cmem;
> +	unsigned long ntotal_pages;
> +	unsigned long ncontiguous_pages;
> +	unsigned long ncontiguous_groups;
> +	struct page *page;
> +	int i;
> +	int ncontiguous_pages_order;
> +	struct ib_cmem_block *ib_cmem_block;
> +	unsigned long locked;
> +	unsigned long lock_limit;
> +
> +	if (page_size_order < PAGE_SHIFT || page_size_order > 31)
> +		return ERR_PTR(-EINVAL);
> +
> +	cmem = kzalloc(sizeof(*cmem), GFP_KERNEL);
> +	if (!cmem)
> +		return ERR_PTR(-ENOMEM);
> +
> +	kref_init(&cmem->refcount);
> +	cmem->context   = context;
> +	INIT_LIST_HEAD(&cmem->ib_cmem_block);
> +
> +	/* Total size is expected to be already page aligned -
> +	  * verifying anyway.
> +	  */
> +	ntotal_pages = PAGE_ALIGN(total_size) >> PAGE_SHIFT;
> +	/* ib_cmem_alloc_contiguous_pages is called as part of mmap
> +	  * with mm->mmap_sem held for writing.
> +	  * No need to lock
> +	  */
> +	locked     = ntotal_pages + current->mm->pinned_vm;
> +	lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
> +
> +	if ((locked > lock_limit) && !capable(CAP_IPC_LOCK))
> +		goto err_alloc;
> +
> +	/* How many contiguous pages do we need in 1 block */
> +	ncontiguous_pages = (1 << page_size_order) >> PAGE_SHIFT;
> +	ncontiguous_pages_order = ilog2(ncontiguous_pages);
> +	ncontiguous_groups = (ntotal_pages >> ncontiguous_pages_order)  +
> +		(!!(ntotal_pages & (ncontiguous_pages - 1)));
> +
> +	/* Checking MAX_ORDER to prevent WARN via calling alloc_pages below */
> +	if (ncontiguous_pages_order >= MAX_ORDER)
> +		goto err_alloc;
> +	/* we set block_order before starting allocation to prevent
> +	  * a leak in a failure flow in ib_cmem_release.
> +	  * cmem->length has at that step value 0 from kzalloc as expected
> +	  */
> +	cmem->block_order = ncontiguous_pages_order;
> +	for (i = 0; i < ncontiguous_groups; i++) {
> +		/* Allocating the managed entry */
> +		ib_cmem_block = kmalloc(sizeof(*ib_cmem_block),
> +					GFP_KERNEL);
> +		if (!ib_cmem_block)
> +			goto err_alloc;
> +
> +		if (numa_node < 0)
> +			page =  alloc_pages(GFP_HIGHUSER | __GFP_ZERO |
> +					    __GFP_COMP | __GFP_NOWARN,
> +					    ncontiguous_pages_order);
> +		else
> +			page =  alloc_pages_node(numa_node,
> +						 GFP_HIGHUSER | __GFP_ZERO |
> +						 __GFP_COMP | __GFP_NOWARN,
> +						 ncontiguous_pages_order);
> +
> +		if (!page) {
> +			kfree(ib_cmem_block);
> +			/* We should deallocate previously successful allocations,
> +			  * if any exist.
> +			  */
> +			goto err_alloc;
> +		}
> +
> +		ib_cmem_block->page = page;
> +		list_add_tail(&ib_cmem_block->list, &cmem->ib_cmem_block);
> +	}
> +
> +	cmem->length = total_size;
> +	current->mm->pinned_vm = locked;
> +	return cmem;
> +
> +err_alloc:
> +	ib_cmem_release_contiguous_pages(cmem);
> +	return ERR_PTR(-ENOMEM);
> +}
> +EXPORT_SYMBOL(ib_cmem_alloc_contiguous_pages);
> diff --git a/include/rdma/ib_cmem.h b/include/rdma/ib_cmem.h
> new file mode 100644
> index 0000000..5f26a49
> --- /dev/null
> +++ b/include/rdma/ib_cmem.h
> @@ -0,0 +1,41 @@
> +#ifndef IB_CMEM_H
> +#define IB_CMEM_H
> +
> +#include <rdma/ib_umem.h>
> +#include <rdma/ib_verbs.h>
> +
> +/* contiguous memory structure */
> +struct ib_cmem {
> +	struct ib_ucontext     *context;
> +	size_t			length;
> +	/* Link list of contiguous blocks being part of that cmem  */
> +	struct list_head ib_cmem_block;
> +
> +	/* Order of cmem block,  2^ block_order will equal number
> +	  * of physical pages per block
> +	  */
> +	unsigned long    block_order;
> +	/* Reference counter for that memory area.
> +	  * When the value becomes 0, pages will be returned to the kernel.
> +	  */
> +	struct kref refcount;
> +};
> +
> +struct ib_cmem_block {
> +	struct list_head	list;
> +	/* page will point to the page struct of the head page
> +	  * in the current compound page.
> +	  * block order is saved once as part of ib_cmem.
> +	  */
> +	struct page            *page;
> +};
> +
> +int ib_cmem_map_contiguous_pages_to_vma(struct ib_cmem *ib_cmem,
> +					struct vm_area_struct *vma);
> +struct ib_cmem *ib_cmem_alloc_contiguous_pages(struct ib_ucontext *context,
> +					       unsigned long total_size,
> +					       unsigned long page_size_order,
> +					       int numa_node);
> +void ib_cmem_release_contiguous_pages(struct ib_cmem *cmem);
> +
> +#endif
> -- 
> 1.8.3.1
> 
> --
> To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
---end quoted text---

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [RFC contig pages support 1/2] IB: Supports contiguous memory operations
  2015-12-08 15:18         ` Christoph Hellwig
@ 2015-12-08 17:15             ` Jason Gunthorpe
  -1 siblings, 0 replies; 20+ messages in thread
From: Jason Gunthorpe @ 2015-12-08 17:15 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: Yishai Hadas, dledford-H+wXaHxf7aLQT0dZR+AlfA,
	linux-rdma-u79uwXL29TY76Z2rM5mHXA,
	ogerlitz-VPRAkNaXOzVWk0Htik3J/w, talal-VPRAkNaXOzVWk0Htik3J/w,
	linux-mm-Bw31MaZKKs3YtjvyW6yDsg

On Tue, Dec 08, 2015 at 07:18:52AM -0800, Christoph Hellwig wrote:
> There is absolutely nothing IB specific here.  If you want to support
> anonymous mmaps to allocate large contiguous pages work with the MM
> folks on providing that in a generic fashion.

Yes please.

We already have huge page mmaps, how much win is had by going from
huge page maps to this contiguous map?

Jason
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [RFC contig pages support 1/2] IB: Supports contiguous memory operations
@ 2015-12-08 17:15             ` Jason Gunthorpe
  0 siblings, 0 replies; 20+ messages in thread
From: Jason Gunthorpe @ 2015-12-08 17:15 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: Yishai Hadas, dledford, linux-rdma, ogerlitz, talal, linux-mm

On Tue, Dec 08, 2015 at 07:18:52AM -0800, Christoph Hellwig wrote:
> There is absolutely nothing IB specific here.  If you want to support
> anonymous mmaps to allocate large contiguous pages work with the MM
> folks on providing that in a generic fashion.

Yes please.

We already have huge page mmaps, how much win is had by going from
huge page maps to this contiguous map?

Jason

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 20+ messages in thread

* RE: [RFC contig pages support 1/2] IB: Supports contiguous memory operations
  2015-12-08 17:15             ` Jason Gunthorpe
  (?)
@ 2015-12-09 10:00             ` Shachar Raindel
       [not found]               ` <AM4PR05MB146005B448BEA876519335CDDCE80-n5Jp0YuYvM1n/kQvjoF5G9qRiQSDpxhJvxpqHgZTriW3zl9H0oFU5g@public.gmane.org>
  -1 siblings, 1 reply; 20+ messages in thread
From: Shachar Raindel @ 2015-12-09 10:00 UTC (permalink / raw)
  To: Jason Gunthorpe, Christoph Hellwig
  Cc: Yishai Hadas, dledford, linux-rdma, Or Gerlitz, Tal Alon, linux-mm



> -----Original Message-----
> From: owner-linux-mm@kvack.org [mailto:owner-linux-mm@kvack.org] On
> Behalf Of Jason Gunthorpe
> Sent: Tuesday, December 08, 2015 7:16 PM
> To: Christoph Hellwig <hch@infradead.org>
> Cc: Yishai Hadas <yishaih@mellanox.com>; dledford@redhat.com; linux-
> rdma@vger.kernel.org; Or Gerlitz <ogerlitz@mellanox.com>; Tal Alon
> <talal@mellanox.com>; linux-mm@kvack.org
> Subject: Re: [RFC contig pages support 1/2] IB: Supports contiguous
> memory operations
> 
> On Tue, Dec 08, 2015 at 07:18:52AM -0800, Christoph Hellwig wrote:
> > There is absolutely nothing IB specific here.  If you want to support
> > anonymous mmaps to allocate large contiguous pages work with the MM
> > folks on providing that in a generic fashion.
> 
> Yes please.
> 

Note that other HW vendors are developing similar solutions, see for example: http://www.slideshare.net/linaroorg/hkg15106-replacing-cmem-meeting-tis-soc-shared-buffer-allocation-management-and-address-translation-requirements

> We already have huge page mmaps, how much win is had by going from
> huge page maps to this contiguous map?
> 

As far as gain is concerned, we are seeing gains in two cases here:
1. If the system has lots of non-fragmented, free memory, you can create large contig blocks that are above the CPU huge page size.
2. If the system memory is very fragmented, you cannot allocate huge pages. However, an API that allows you to create small (i.e. 64KB, 128KB, etc.) contig blocks reduces the load on the HW page tables and caches.

Thanks,
--Shachar

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [RFC contig pages support 1/2] IB: Supports contiguous memory operations
  2015-12-09 10:00             ` Shachar Raindel
@ 2015-12-09 17:48                   ` Jason Gunthorpe
  0 siblings, 0 replies; 20+ messages in thread
From: Jason Gunthorpe @ 2015-12-09 17:48 UTC (permalink / raw)
  To: Shachar Raindel
  Cc: Christoph Hellwig, Yishai Hadas, dledford-H+wXaHxf7aLQT0dZR+AlfA,
	linux-rdma-u79uwXL29TY76Z2rM5mHXA, Or Gerlitz, Tal Alon,
	linux-mm-Bw31MaZKKs3YtjvyW6yDsg

On Wed, Dec 09, 2015 at 10:00:02AM +0000, Shachar Raindel wrote:
> > Yes please.

> Note that other HW vendors are developing similar solutions, see for
> example:
> http://www.slideshare.net/linaroorg/hkg15106-replacing-cmem-meeting-tis-soc-shared-buffer-allocation-management-and-address-translation-requirements

CMA and its successors are for something totally different.

> > We already have huge page mmaps, how much win is had by going from
> > huge page maps to this contiguous map?
> 
> As far as gain is concerned, we are seeing gains in two cases here:
> 1. If the system has lots of non-fragmented, free memory, you can
> create large contig blocks that are above the CPU huge page size.
> 2. If the system memory is very fragmented, you cannot allocate huge
> pages. However, an API that allows you to create small (i.e. 64KB,
> 128KB, etc.) contig blocks reduces the load on the HW page tables
> and caches.

I understand what it does, I was looking for performance numbers. The
last time I trivially benchmarked huge pages vs not huge pages on mlx4
I wasn't able to detect a performance difference.

Jason
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [RFC contig pages support 1/2] IB: Supports contiguous memory operations
@ 2015-12-09 17:48                   ` Jason Gunthorpe
  0 siblings, 0 replies; 20+ messages in thread
From: Jason Gunthorpe @ 2015-12-09 17:48 UTC (permalink / raw)
  To: Shachar Raindel
  Cc: Christoph Hellwig, Yishai Hadas, dledford, linux-rdma,
	Or Gerlitz, Tal Alon, linux-mm

On Wed, Dec 09, 2015 at 10:00:02AM +0000, Shachar Raindel wrote:
> > Yes please.

> Note that other HW vendors are developing similar solutions, see for
> example:
> http://www.slideshare.net/linaroorg/hkg15106-replacing-cmem-meeting-tis-soc-shared-buffer-allocation-management-and-address-translation-requirements

CMA and its successors are for something totally different.

> > We already have huge page mmaps, how much win is had by going from
> > huge page maps to this contiguous map?
> 
> As far as gain is concerned, we are seeing gains in two cases here:
> 1. If the system has lots of non-fragmented, free memory, you can
> create large contig blocks that are above the CPU huge page size.
> 2. If the system memory is very fragmented, you cannot allocate huge
> pages. However, an API that allows you to create small (i.e. 64KB,
> 128KB, etc.) contig blocks reduces the load on the HW page tables
> and caches.

I understand what it does, I was looking for performance numbers. The
last time I trivially benchmarked huge pages vs not huge pages on mlx4
I wasn't able to detect a performance difference.

Jason

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [RFC contig pages support 1/2] IB: Supports contiguous memory operations
  2015-12-09 10:00             ` Shachar Raindel
@ 2015-12-09 18:39                   ` Christoph Hellwig
  0 siblings, 0 replies; 20+ messages in thread
From: Christoph Hellwig @ 2015-12-09 18:39 UTC (permalink / raw)
  To: Shachar Raindel
  Cc: Jason Gunthorpe, Yishai Hadas, dledford-H+wXaHxf7aLQT0dZR+AlfA,
	linux-rdma-u79uwXL29TY76Z2rM5mHXA, Or Gerlitz, Tal Alon,
	linux-mm-Bw31MaZKKs3YtjvyW6yDsg

On Wed, Dec 09, 2015 at 10:00:02AM +0000, Shachar Raindel wrote:
> As far as gain is concerned, we are seeing gains in two cases here:
> 1. If the system has lots of non-fragmented, free memory, you can create large contig blocks that are above the CPU huge page size.
> 2. If the system memory is very fragmented, you cannot allocate huge pages. However, an API that allows you to create small (i.e. 64KB, 128KB, etc.) contig blocks reduces the load on the HW page tables and caches.

None of that is a unique requirement for the mlx4 devices.  Again,
please work with the memory management folks to address your
requirements in a generic way!
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [RFC contig pages support 1/2] IB: Supports contiguous memory operations
@ 2015-12-09 18:39                   ` Christoph Hellwig
  0 siblings, 0 replies; 20+ messages in thread
From: Christoph Hellwig @ 2015-12-09 18:39 UTC (permalink / raw)
  To: Shachar Raindel
  Cc: Jason Gunthorpe, Yishai Hadas, dledford, linux-rdma, Or Gerlitz,
	Tal Alon, linux-mm

On Wed, Dec 09, 2015 at 10:00:02AM +0000, Shachar Raindel wrote:
> As far as gain is concerned, we are seeing gains in two cases here:
> 1. If the system has lots of non-fragmented, free memory, you can create large contig blocks that are above the CPU huge page size.
> 2. If the system memory is very fragmented, you cannot allocate huge pages. However, an API that allows you to create small (i.e. 64KB, 128KB, etc.) contig blocks reduces the load on the HW page tables and caches.

None of that is a unique requirement for the mlx4 devices.  Again,
please work with the memory management folks to address your
requirements in a generic way!

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 20+ messages in thread

* RE: [RFC contig pages support 1/2] IB: Supports contiguous memory operations
  2015-12-09 18:39                   ` Christoph Hellwig
  (?)
@ 2015-12-13 12:48                   ` Shachar Raindel
       [not found]                     ` <AM4PR05MB14603FC8169D50AD2A8F5AA3DCEC0-n5Jp0YuYvM1n/kQvjoF5G9qRiQSDpxhJvxpqHgZTriW3zl9H0oFU5g@public.gmane.org>
  -1 siblings, 1 reply; 20+ messages in thread
From: Shachar Raindel @ 2015-12-13 12:48 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: Jason Gunthorpe, Yishai Hadas, dledford, linux-rdma, Or Gerlitz,
	Tal Alon, linux-mm



> -----Original Message-----
> From: Christoph Hellwig [mailto:hch@infradead.org]
> Sent: Wednesday, December 09, 2015 8:40 PM
> 
> On Wed, Dec 09, 2015 at 10:00:02AM +0000, Shachar Raindel wrote:
> > As far as gain is concerned, we are seeing gains in two cases here:
> > 1. If the system has lots of non-fragmented, free memory, you can
> create large contig blocks that are above the CPU huge page size.
> > 2. If the system memory is very fragmented, you cannot allocate huge
> pages. However, an API that allows you to create small (i.e. 64KB,
> 128KB, etc.) contig blocks reduces the load on the HW page tables and
> caches.
> 
> None of that is a uniqueue requirement for the mlx4 devices.  Again,
> please work with the memory management folks to address your
> requirements in a generic way!

I completely agree, and this RFC was sent in order to start discussion
on this subject.

Dear MM people, can you please advise on the subject?

Multiple HW vendors, from different fields, ranging between embedded SoC
devices (TI) and HPC (Mellanox) are looking for a solution to allocate
blocks of contiguous memory to user space applications, without using huge
pages.

What should be the API to expose such feature? 

Should we create a virtual FS that allows the user to create "files"
representing memory allocations, and define the contiguous level we
attempt to allocate using folders (similar to hugetlbfs)?

Should we patch hugetlbfs to allow allocation of contiguous memory chunks,
without creating larger memory mapping in the CPU page tables?

Should we create a special "allocator" virtual device, that will hand out
memory in contiguous chunks via a call to mmap with an FD connected to the
device?

Thanks,
--Shachar


--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [RFC contig pages support 1/2] IB: Supports contiguous memory operations
  2015-12-13 12:48                   ` Shachar Raindel
@ 2015-12-22 14:59                         ` Vlastimil Babka
  0 siblings, 0 replies; 20+ messages in thread
From: Vlastimil Babka @ 2015-12-22 14:59 UTC (permalink / raw)
  To: Shachar Raindel, Christoph Hellwig
  Cc: Jason Gunthorpe, Yishai Hadas, dledford-H+wXaHxf7aLQT0dZR+AlfA,
	linux-rdma-u79uwXL29TY76Z2rM5mHXA, Or Gerlitz, Tal Alon,
	linux-mm-Bw31MaZKKs3YtjvyW6yDsg

On 12/13/2015 01:48 PM, Shachar Raindel wrote:
>
>
>> -----Original Message-----
>> From: Christoph Hellwig [mailto:hch-wEGCiKHe2LqWVfeAwA7xHQ@public.gmane.org]
>> Sent: Wednesday, December 09, 2015 8:40 PM
>>
>> On Wed, Dec 09, 2015 at 10:00:02AM +0000, Shachar Raindel wrote:
>>> As far as gain is concerned, we are seeing gains in two cases here:
>>> 1. If the system has lots of non-fragmented, free memory, you can
>> create large contig blocks that are above the CPU huge page size.
>>> 2. If the system memory is very fragmented, you cannot allocate huge
>> pages. However, an API that allows you to create small (i.e. 64KB,
>> 128KB, etc.) contig blocks reduces the load on the HW page tables and
>> caches.
>>
>> None of that is a uniqueue requirement for the mlx4 devices.  Again,
>> please work with the memory management folks to address your
>> requirements in a generic way!
>
> I completely agree, and this RFC was sent in order to start discussion
> on this subject.
>
> Dear MM people, can you please advise on the subject?
>
> Multiple HW vendors, from different fields, ranging between embedded SoC
> devices (TI) and HPC (Mellanox) are looking for a solution to allocate
> blocks of contiguous memory to user space applications, without using huge
> pages.
>
> What should be the API to expose such feature?
>
> Should we create a virtual FS that allows the user to create "files"
> representing memory allocations, and define the contiguous level we
> attempt to allocate using folders (similar to hugetlbfs)?
>
> Should we patch hugetlbfs to allow allocation of contiguous memory chunks,
> without creating larger memory mapping in the CPU page tables?
>
> Should we create a special "allocator" virtual device, that will hand out
> memory in contiguous chunks via a call to mmap with an FD connected to the
> device?

How much memory do you assume to be used like this? Is this memory 
supposed to be swappable, migratable, etc? I.e. on LRU lists?
Allocating a lot of memory (e.g. most of userspace memory) that's not 
LRU wouldn't be nice. But LRU operations are not prepared to work with 
such non-standard-sized allocations, regardless of what API you use.  So 
I think that's the more fundamental questions here.

> Thanks,
> --Shachar
>
>
> --
> To unsubscribe, send a message with 'unsubscribe linux-mm' in
> the body to majordomo-Bw31MaZKKs0EbZ0PF+XxCw@public.gmane.org  For more info on Linux MM,
> see: http://www.linux-mm.org/ .
> Don't email: <a href=ilto:"dont-Bw31MaZKKs3YtjvyW6yDsg@public.gmane.org"> email-Bw31MaZKKs3YtjvyW6yDsg@public.gmane.org </a>
>

--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [RFC contig pages support 1/2] IB: Supports contiguous memory operations
@ 2015-12-22 14:59                         ` Vlastimil Babka
  0 siblings, 0 replies; 20+ messages in thread
From: Vlastimil Babka @ 2015-12-22 14:59 UTC (permalink / raw)
  To: Shachar Raindel, Christoph Hellwig
  Cc: Jason Gunthorpe, Yishai Hadas, dledford, linux-rdma, Or Gerlitz,
	Tal Alon, linux-mm

On 12/13/2015 01:48 PM, Shachar Raindel wrote:
>
>
>> -----Original Message-----
>> From: Christoph Hellwig [mailto:hch@infradead.org]
>> Sent: Wednesday, December 09, 2015 8:40 PM
>>
>> On Wed, Dec 09, 2015 at 10:00:02AM +0000, Shachar Raindel wrote:
>>> As far as gain is concerned, we are seeing gains in two cases here:
>>> 1. If the system has lots of non-fragmented, free memory, you can
>> create large contig blocks that are above the CPU huge page size.
>>> 2. If the system memory is very fragmented, you cannot allocate huge
>> pages. However, an API that allows you to create small (i.e. 64KB,
>> 128KB, etc.) contig blocks reduces the load on the HW page tables and
>> caches.
>>
>> None of that is a uniqueue requirement for the mlx4 devices.  Again,
>> please work with the memory management folks to address your
>> requirements in a generic way!
>
> I completely agree, and this RFC was sent in order to start discussion
> on this subject.
>
> Dear MM people, can you please advise on the subject?
>
> Multiple HW vendors, from different fields, ranging between embedded SoC
> devices (TI) and HPC (Mellanox) are looking for a solution to allocate
> blocks of contiguous memory to user space applications, without using huge
> pages.
>
> What should be the API to expose such feature?
>
> Should we create a virtual FS that allows the user to create "files"
> representing memory allocations, and define the contiguous level we
> attempt to allocate using folders (similar to hugetlbfs)?
>
> Should we patch hugetlbfs to allow allocation of contiguous memory chunks,
> without creating larger memory mapping in the CPU page tables?
>
> Should we create a special "allocator" virtual device, that will hand out
> memory in contiguous chunks via a call to mmap with an FD connected to the
> device?

How much memory do you assume to be used like this? Is this memory 
supposed to be swappable, migratable, etc? I.e. on LRU lists?
Allocating a lot of memory (e.g. most of userspace memory) that's not 
LRU wouldn't be nice. But LRU operations are not prepared to work with 
such non-standard-sized allocations, regardless of what API you use.  So 
I think that's the more fundamental questions here.

> Thanks,
> --Shachar
>
>
> --
> To unsubscribe, send a message with 'unsubscribe linux-mm' in
> the body to majordomo@kvack.org.  For more info on Linux MM,
> see: http://www.linux-mm.org/ .
> Don't email: <a href=ilto:"dont@kvack.org"> email@kvack.org </a>
>

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 20+ messages in thread

* RE: [RFC contig pages support 1/2] IB: Supports contiguous memory operations
  2015-12-22 14:59                         ` Vlastimil Babka
@ 2015-12-23 16:30                             ` Shachar Raindel
  -1 siblings, 0 replies; 20+ messages in thread
From: Shachar Raindel @ 2015-12-23 16:30 UTC (permalink / raw)
  To: Vlastimil Babka, Christoph Hellwig
  Cc: Jason Gunthorpe, Yishai Hadas, dledford-H+wXaHxf7aLQT0dZR+AlfA,
	linux-rdma-u79uwXL29TY76Z2rM5mHXA, Or Gerlitz, Tal Alon,
	linux-mm-Bw31MaZKKs3YtjvyW6yDsg



> -----Original Message-----
> From: Vlastimil Babka [mailto:vbabka-AlSwsSmVLrQ@public.gmane.org]
> Sent: Tuesday, December 22, 2015 4:59 PM
> 
> On 12/13/2015 01:48 PM, Shachar Raindel wrote:
> >
> >
> >> -----Original Message-----
> >> From: Christoph Hellwig [mailto:hch-wEGCiKHe2LqWVfeAwA7xHQ@public.gmane.org]
> >> Sent: Wednesday, December 09, 2015 8:40 PM
> >>
> >> On Wed, Dec 09, 2015 at 10:00:02AM +0000, Shachar Raindel wrote:
> >>> As far as gain is concerned, we are seeing gains in two cases here:
> >>> 1. If the system has lots of non-fragmented, free memory, you can
> >> create large contig blocks that are above the CPU huge page size.
> >>> 2. If the system memory is very fragmented, you cannot allocate huge
> >> pages. However, an API that allows you to create small (i.e. 64KB,
> >> 128KB, etc.) contig blocks reduces the load on the HW page tables and
> >> caches.
> >>
> >> None of that is a uniqueue requirement for the mlx4 devices.  Again,
> >> please work with the memory management folks to address your
> >> requirements in a generic way!
> >
> > I completely agree, and this RFC was sent in order to start discussion
> > on this subject.
> >
> > Dear MM people, can you please advise on the subject?
> >
> > Multiple HW vendors, from different fields, ranging between embedded
> SoC
> > devices (TI) and HPC (Mellanox) are looking for a solution to allocate
> > blocks of contiguous memory to user space applications, without using
> huge
> > pages.
> >
> > What should be the API to expose such feature?
> >
> > Should we create a virtual FS that allows the user to create "files"
> > representing memory allocations, and define the contiguous level we
> > attempt to allocate using folders (similar to hugetlbfs)?
> >
> > Should we patch hugetlbfs to allow allocation of contiguous memory
> chunks,
> > without creating larger memory mapping in the CPU page tables?
> >
> > Should we create a special "allocator" virtual device, that will hand
> out
> > memory in contiguous chunks via a call to mmap with an FD connected to
> the
> > device?
> 
> How much memory do you assume to be used like this?

Depends on the use case. Most likely several MBs/core, used for interfacing
with the HW (packet rings, frame buffers, etc.).

Some applications might want to perform calculations in such memory, to 
optimize communication time, especially in the HPC market.

> Is this memory
> supposed to be swappable, migratable, etc? I.e. on LRU lists?

Most likely not. In many of the relevant applications (embedded, HPC),
there is no swap and the application threads are pinned to specific cores
and NUMA nodes.
The biggest pain here is that these memory pages will not be eligible for
compaction, making it harder to handle fragmentations and CMA allocation
requests.

> Allocating a lot of memory (e.g. most of userspace memory) that's not
> LRU wouldn't be nice. But LRU operations are not prepared to work witch
> such non-standard-sized allocations, regardless of what API you use.  So
> I think that's the more fundamental questions here.

I agree that there are fundamental questions here. 

That being said, there is a clear need for an API allowing 
allocation, to the user space, limited size of memory that
is composed of large contiguous blocks.

What will be the best way to implement such solution?

Thanks,
--Shachar

--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 20+ messages in thread

* RE: [RFC contig pages support 1/2] IB: Supports contiguous memory operations
@ 2015-12-23 16:30                             ` Shachar Raindel
  0 siblings, 0 replies; 20+ messages in thread
From: Shachar Raindel @ 2015-12-23 16:30 UTC (permalink / raw)
  To: Vlastimil Babka, Christoph Hellwig
  Cc: Jason Gunthorpe, Yishai Hadas, dledford, linux-rdma, Or Gerlitz,
	Tal Alon, linux-mm



> -----Original Message-----
> From: Vlastimil Babka [mailto:vbabka@suse.cz]
> Sent: Tuesday, December 22, 2015 4:59 PM
> 
> On 12/13/2015 01:48 PM, Shachar Raindel wrote:
> >
> >
> >> -----Original Message-----
> >> From: Christoph Hellwig [mailto:hch@infradead.org]
> >> Sent: Wednesday, December 09, 2015 8:40 PM
> >>
> >> On Wed, Dec 09, 2015 at 10:00:02AM +0000, Shachar Raindel wrote:
> >>> As far as gain is concerned, we are seeing gains in two cases here:
> >>> 1. If the system has lots of non-fragmented, free memory, you can
> >> create large contig blocks that are above the CPU huge page size.
> >>> 2. If the system memory is very fragmented, you cannot allocate huge
> >> pages. However, an API that allows you to create small (i.e. 64KB,
> >> 128KB, etc.) contig blocks reduces the load on the HW page tables and
> >> caches.
> >>
> >> None of that is a uniqueue requirement for the mlx4 devices.  Again,
> >> please work with the memory management folks to address your
> >> requirements in a generic way!
> >
> > I completely agree, and this RFC was sent in order to start discussion
> > on this subject.
> >
> > Dear MM people, can you please advise on the subject?
> >
> > Multiple HW vendors, from different fields, ranging between embedded
> SoC
> > devices (TI) and HPC (Mellanox) are looking for a solution to allocate
> > blocks of contiguous memory to user space applications, without using
> huge
> > pages.
> >
> > What should be the API to expose such feature?
> >
> > Should we create a virtual FS that allows the user to create "files"
> > representing memory allocations, and define the contiguous level we
> > attempt to allocate using folders (similar to hugetlbfs)?
> >
> > Should we patch hugetlbfs to allow allocation of contiguous memory
> chunks,
> > without creating larger memory mapping in the CPU page tables?
> >
> > Should we create a special "allocator" virtual device, that will hand
> out
> > memory in contiguous chunks via a call to mmap with an FD connected to
> the
> > device?
> 
> How much memory do you assume to be used like this?

Depends on the use case. Most likely several MBs/core, used for interfacing
with the HW (packet rings, frame buffers, etc.).

Some applications might want to perform calculations in such memory, to 
optimize communication time, especially in the HPC market.

> Is this memory
> supposed to be swappable, migratable, etc? I.e. on LRU lists?

Most likely not. In many of the relevant applications (embedded, HPC),
there is no swap and the application threads are pinned to specific cores
and NUMA nodes.
The biggest pain here is that these memory pages will not be eligible for
compaction, making it harder to handle fragmentations and CMA allocation
requests.

> Allocating a lot of memory (e.g. most of userspace memory) that's not
> LRU wouldn't be nice. But LRU operations are not prepared to work witch
> such non-standard-sized allocations, regardless of what API you use.  So
> I think that's the more fundamental questions here.

I agree that there are fundamental questions here. 

That being said, there is a clear need for an API allowing 
allocation, to the user space, limited size of memory that
is composed of large contiguous blocks.

What will be the best way to implement such solution?

Thanks,
--Shachar

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [RFC contig pages support 1/2] IB: Supports contiguous memory operations
  2015-12-23 16:30                             ` Shachar Raindel
@ 2016-01-04 14:43                                 ` Vlastimil Babka
  -1 siblings, 0 replies; 20+ messages in thread
From: Vlastimil Babka @ 2016-01-04 14:43 UTC (permalink / raw)
  To: Shachar Raindel, Christoph Hellwig
  Cc: Jason Gunthorpe, Yishai Hadas, dledford-H+wXaHxf7aLQT0dZR+AlfA,
	linux-rdma-u79uwXL29TY76Z2rM5mHXA, Or Gerlitz, Tal Alon,
	linux-mm-Bw31MaZKKs3YtjvyW6yDsg

On 12/23/2015 05:30 PM, Shachar Raindel wrote:
 >>>
 >>> I completely agree, and this RFC was sent in order to start discussion
 >>> on this subject.
 >>>
 >>> Dear MM people, can you please advise on the subject?
 >>>
 >>> Multiple HW vendors, from different fields, ranging between embedded
 >> SoC
 >>> devices (TI) and HPC (Mellanox) are looking for a solution to allocate
 >>> blocks of contiguous memory to user space applications, without using
 >> huge
 >>> pages.
 >>>
 >>> What should be the API to expose such feature?
 >>>
 >>> Should we create a virtual FS that allows the user to create "files"
 >>> representing memory allocations, and define the contiguous level we
 >>> attempt to allocate using folders (similar to hugetlbfs)?
 >>>
 >>> Should we patch hugetlbfs to allow allocation of contiguous memory
 >> chunks,
 >>> without creating larger memory mapping in the CPU page tables?
 >>>
 >>> Should we create a special "allocator" virtual device, that will hand
 >> out
 >>> memory in contiguous chunks via a call to mmap with an FD connected to
 >> the
 >>> device?
 >>
 >> How much memory do you assume to be used like this?
 >
 > Depends on the use case. Most likely several MBs/core, used for 
interfacing
 > with the HW (packet rings, frame buffers, etc.).
 >
 > Some applications might want to perform calculations in such memory, to
 > optimize communication time, especially in the HPC market.

OK.

 >
 >> Is this memory
 >> supposed to be swappable, migratable, etc? I.e. on LRU lists?
 >
 > Most likely not. In many of the relevant applications (embedded, HPC),
 > there is no swap and the application threads are pinned to specific cores
 > and NUMA nodes.
 > The biggest pain here is that these memory pages will not be eligible for
 > compaction, making it harder to handle fragmentations and CMA allocation
 > requests.

There was a patch set to enable compaction on such pages, see 
https://lwn.net/Articles/650917/
Minchan was going to pick this after Gioh left, and then it should be 
possible. But it requires careful driver-specific cooperation, i.e. when 
a page can be isolated for the migration, see 
http://article.gmane.org/gmane.linux.kernel.mm/136457

 >> Allocating a lot of memory (e.g. most of userspace memory) that's not
 >> LRU wouldn't be nice. But LRU operations are not prepared to work witch
 >> such non-standard-sized allocations, regardless of what API you use.  So
 >> I think that's the more fundamental questions here.
 >
 > I agree that there are fundamental questions here.
 >
 > That being said, there is a clear need for an API allowing
 > allocation, to the user space, limited size of memory that
 > is composed of large contiguous blocks.
 >
 > What will be the best way to implement such solution?

Given the likely driver-specific constraints/handling of the page 
migration, I'm not sure if some completely universal API is feasible.
Maybe some reusable parts of the functionality in the patch in this 
thread could be provided by mm.

 > Thanks,
 > --Shachar
 >
 > --
 > To unsubscribe, send a message with 'unsubscribe linux-mm' in
 > the body to majordomo-Bw31MaZKKs0EbZ0PF+XxCw@public.gmane.org  For more info on Linux MM,
 > see: http://www.linux-mm.org/ .
 > Don't email: <a href=mailto:"dont-Bw31MaZKKs3YtjvyW6yDsg@public.gmane.org"> email-Bw31MaZKKs3YtjvyW6yDsg@public.gmane.org </a>
 >

--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [RFC contig pages support 1/2] IB: Supports contiguous memory operations
@ 2016-01-04 14:43                                 ` Vlastimil Babka
  0 siblings, 0 replies; 20+ messages in thread
From: Vlastimil Babka @ 2016-01-04 14:43 UTC (permalink / raw)
  To: Shachar Raindel, Christoph Hellwig
  Cc: Jason Gunthorpe, Yishai Hadas, dledford, linux-rdma, Or Gerlitz,
	Tal Alon, linux-mm

On 12/23/2015 05:30 PM, Shachar Raindel wrote:
 >>>
 >>> I completely agree, and this RFC was sent in order to start discussion
 >>> on this subject.
 >>>
 >>> Dear MM people, can you please advise on the subject?
 >>>
 >>> Multiple HW vendors, from different fields, ranging between embedded
 >> SoC
 >>> devices (TI) and HPC (Mellanox) are looking for a solution to allocate
 >>> blocks of contiguous memory to user space applications, without using
 >> huge
 >>> pages.
 >>>
 >>> What should be the API to expose such feature?
 >>>
 >>> Should we create a virtual FS that allows the user to create "files"
 >>> representing memory allocations, and define the contiguous level we
 >>> attempt to allocate using folders (similar to hugetlbfs)?
 >>>
 >>> Should we patch hugetlbfs to allow allocation of contiguous memory
 >> chunks,
 >>> without creating larger memory mapping in the CPU page tables?
 >>>
 >>> Should we create a special "allocator" virtual device, that will hand
 >> out
 >>> memory in contiguous chunks via a call to mmap with an FD connected to
 >> the
 >>> device?
 >>
 >> How much memory do you assume to be used like this?
 >
 > Depends on the use case. Most likely several MBs/core, used for 
interfacing
 > with the HW (packet rings, frame buffers, etc.).
 >
 > Some applications might want to perform calculations in such memory, to
 > optimize communication time, especially in the HPC market.

OK.

 >
 >> Is this memory
 >> supposed to be swappable, migratable, etc? I.e. on LRU lists?
 >
 > Most likely not. In many of the relevant applications (embedded, HPC),
 > there is no swap and the application threads are pinned to specific cores
 > and NUMA nodes.
 > The biggest pain here is that these memory pages will not be eligible for
 > compaction, making it harder to handle fragmentations and CMA allocation
 > requests.

There was a patch set to enable compaction on such pages, see 
https://lwn.net/Articles/650917/
Minchan was going to pick this after Gioh left, and then it should be 
possible. But it requires careful driver-specific cooperation, i.e. when 
a page can be isolated for the migration, see 
http://article.gmane.org/gmane.linux.kernel.mm/136457

 >> Allocating a lot of memory (e.g. most of userspace memory) that's not
 >> LRU wouldn't be nice. But LRU operations are not prepared to work witch
 >> such non-standard-sized allocations, regardless of what API you use.  So
 >> I think that's the more fundamental questions here.
 >
 > I agree that there are fundamental questions here.
 >
 > That being said, there is a clear need for an API allowing
 > allocation, to the user space, limited size of memory that
 > is composed of large contiguous blocks.
 >
 > What will be the best way to implement such solution?

Given the likely driver-specific constraints/handling of the page 
migration, I'm not sure if some completely universal API is feasible.
Maybe some reusable parts of the functionality in the patch in this 
thread could be provided by mm.

 > Thanks,
 > --Shachar
 >
 > --
 > To unsubscribe, send a message with 'unsubscribe linux-mm' in
 > the body to majordomo@kvack.org.  For more info on Linux MM,
 > see: http://www.linux-mm.org/ .
 > Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
 >

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [RFC contig pages support 1/2] IB: Supports contiguous memory operations
  2015-12-23 16:30                             ` Shachar Raindel
  (?)
  (?)
@ 2016-01-04 14:44                             ` Vlastimil Babka
  -1 siblings, 0 replies; 20+ messages in thread
From: Vlastimil Babka @ 2016-01-04 14:44 UTC (permalink / raw)
  To: Shachar Raindel, Christoph Hellwig
  Cc: Jason Gunthorpe, Yishai Hadas, dledford, linux-rdma, Or Gerlitz,
	Tal Alon, linux-mm, Minchan Kim

[Sorry for resending, forgot to CC Minchan]

On 12/23/2015 05:30 PM, Shachar Raindel wrote:
 >>>
 >>> I completely agree, and this RFC was sent in order to start discussion
 >>> on this subject.
 >>>
 >>> Dear MM people, can you please advise on the subject?
 >>>
 >>> Multiple HW vendors, from different fields, ranging between embedded
 >> SoC
 >>> devices (TI) and HPC (Mellanox) are looking for a solution to allocate
 >>> blocks of contiguous memory to user space applications, without using
 >> huge
 >>> pages.
 >>>
 >>> What should be the API to expose such feature?
 >>>
 >>> Should we create a virtual FS that allows the user to create "files"
 >>> representing memory allocations, and define the contiguous level we
 >>> attempt to allocate using folders (similar to hugetlbfs)?
 >>>
 >>> Should we patch hugetlbfs to allow allocation of contiguous memory
 >> chunks,
 >>> without creating larger memory mapping in the CPU page tables?
 >>>
 >>> Should we create a special "allocator" virtual device, that will hand
 >> out
 >>> memory in contiguous chunks via a call to mmap with an FD connected to
 >> the
 >>> device?
 >>
 >> How much memory do you assume to be used like this?
 >
 > Depends on the use case. Most likely several MBs/core, used for 
interfacing
 > with the HW (packet rings, frame buffers, etc.).
 >
 > Some applications might want to perform calculations in such memory, to
 > optimize communication time, especially in the HPC market.

OK.

 >
 >> Is this memory
 >> supposed to be swappable, migratable, etc? I.e. on LRU lists?
 >
 > Most likely not. In many of the relevant applications (embedded, HPC),
 > there is no swap and the application threads are pinned to specific cores
 > and NUMA nodes.
 > The biggest pain here is that these memory pages will not be eligible for
 > compaction, making it harder to handle fragmentations and CMA allocation
 > requests.

There was a patch set to enable compaction on such pages, see 
https://lwn.net/Articles/650917/
Minchan was going to pick this after Gioh left, and then it should be 
possible. But it requires careful driver-specific cooperation, i.e. when 
a page can be isolated for the migration, see 
http://article.gmane.org/gmane.linux.kernel.mm/136457

 >> Allocating a lot of memory (e.g. most of userspace memory) that's not
 >> LRU wouldn't be nice. But LRU operations are not prepared to work witch
 >> such non-standard-sized allocations, regardless of what API you use.  So
 >> I think that's the more fundamental questions here.
 >
 > I agree that there are fundamental questions here.
 >
 > That being said, there is a clear need for an API allowing
 > allocation, to the user space, limited size of memory that
 > is composed of large contiguous blocks.
 >
 > What will be the best way to implement such solution?

Given the likely driver-specific constraints/handling of the page 
migration, I'm not sure if some completely universal API is feasible.
Maybe some reusable parts of the functionality in the patch in this 
thread could be provided by mm.

 > Thanks,
 > --Shachar
 >
 > --
 > To unsubscribe, send a message with 'unsubscribe linux-mm' in
 > the body to majordomo@kvack.org.  For more info on Linux MM,
 > see: http://www.linux-mm.org/ .
 > Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
 >

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 20+ messages in thread

end of thread, other threads:[~2016-01-04 14:44 UTC | newest]

Thread overview: 20+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2015-12-08 15:15 [RFC contig pages support 0/2] Add contiguous pages support Yishai Hadas
     [not found] ` <1449587707-24214-1-git-send-email-yishaih-VPRAkNaXOzVWk0Htik3J/w@public.gmane.org>
2015-12-08 15:15   ` [RFC contig pages support 1/2] IB: Supports contiguous memory operations Yishai Hadas
     [not found]     ` <1449587707-24214-2-git-send-email-yishaih-VPRAkNaXOzVWk0Htik3J/w@public.gmane.org>
2015-12-08 15:18       ` Christoph Hellwig
2015-12-08 15:18         ` Christoph Hellwig
     [not found]         ` <20151208151852.GA6688-wEGCiKHe2LqWVfeAwA7xHQ@public.gmane.org>
2015-12-08 17:15           ` Jason Gunthorpe
2015-12-08 17:15             ` Jason Gunthorpe
2015-12-09 10:00             ` Shachar Raindel
     [not found]               ` <AM4PR05MB146005B448BEA876519335CDDCE80-n5Jp0YuYvM1n/kQvjoF5G9qRiQSDpxhJvxpqHgZTriW3zl9H0oFU5g@public.gmane.org>
2015-12-09 17:48                 ` Jason Gunthorpe
2015-12-09 17:48                   ` Jason Gunthorpe
2015-12-09 18:39                 ` Christoph Hellwig
2015-12-09 18:39                   ` Christoph Hellwig
2015-12-13 12:48                   ` Shachar Raindel
     [not found]                     ` <AM4PR05MB14603FC8169D50AD2A8F5AA3DCEC0-n5Jp0YuYvM1n/kQvjoF5G9qRiQSDpxhJvxpqHgZTriW3zl9H0oFU5g@public.gmane.org>
2015-12-22 14:59                       ` Vlastimil Babka
2015-12-22 14:59                         ` Vlastimil Babka
     [not found]                         ` <56796538.9040906-AlSwsSmVLrQ@public.gmane.org>
2015-12-23 16:30                           ` Shachar Raindel
2015-12-23 16:30                             ` Shachar Raindel
     [not found]                             ` <AM4PR05MB14603CF21CB493086BDEE026DCE60-n5Jp0YuYvM1n/kQvjoF5G9qRiQSDpxhJvxpqHgZTriW3zl9H0oFU5g@public.gmane.org>
2016-01-04 14:43                               ` Vlastimil Babka
2016-01-04 14:43                                 ` Vlastimil Babka
2016-01-04 14:44                             ` Vlastimil Babka
2015-12-08 15:15   ` [RFC contig pages support 2/2] IB/mlx5: Exporting to user space the contiguous allocation capability Yishai Hadas

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.