All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH v2] libibverbs: ibv_fork_init() and libhugetlbfs
@ 2010-05-31  9:13 Alexander Schmidt
  2010-06-02 21:49 ` Roland Dreier
  0 siblings, 1 reply; 9+ messages in thread
From: Alexander Schmidt @ 2010-05-31  9:13 UTC (permalink / raw)
  To: Roland Dreier
  Cc: Stefan Roscher, Christoph Raisch, of-ewg, Linux RDMA, Alex Vainman

Hi Roland,

we have been working on addressing your review comments and are looking for
feedback regarding v2 now.

Problem description:

When fork support is enabled in libibverbs, madvise() is called for every
memory page that is registered as a memory region. Memory ranges that
are passed to madvise() must be page aligned and the size must be a
multiple of the page size. libibverbs uses sysconf(_SC_PAGESIZE) to find
out the system page size and rounds all ranges passed to reg_mr() according
to this page size. When memory from libhugetlbfs is passed to reg_mr(), this
does not work as the page size for this memory range might be different
(e.g. 16Mb). So libibverbs would have to use the huge page size to
calculate a page aligned range for madvise.

As huge pages are provided to the application "under the hood" when
preloading libhugetlbfs, the application does not have any knowledge about
when it registers a huge page or a usual page.

To work around this issue, detect the use of huge pages in libibverbs and
align memory ranges passed to madvise according to the huge page size.

Changes since v1:

- detect use of huge pages at ibv_fork_init() time by walking through
  /sys/kernel/mm/hugepages/
- read huge page size from /proc/pid/smaps, which contains the page
  size of the mapping (thereby enabling support for multiple huge
  page sizes)
- code is independent of libhugetlbfs now, so huge pages can be provided
  to the application by any library

Performance:

PPC64 system with eHCA

without patch:
1M memory region    120usec
16M memory region  1970usec 

with patch v2:
1M memory region    172usec
16M memory region  2030usec

with patch and 16M huge pages:
1M memory region    110usec
16M memory region   193usec

Signed-off-by: Alexander Schmidt <alexs-23VcF4HTsmIX0ybBhKVfKdBPR1lH4CV8@public.gmane.org>
---
 src/memory.c |  137 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 131 insertions(+), 6 deletions(-)

--- libibverbs-1.1.2.orig/src/memory.c
+++ libibverbs-1.1.2/src/memory.c
@@ -40,6 +40,10 @@
 #include <unistd.h>
 #include <stdlib.h>
 #include <stdint.h>
+#include <stdio.h>
+#include <string.h>
+#include <dirent.h>
+#include <limits.h>
 
 #include "ibverbs.h"
 
@@ -68,12 +72,117 @@ struct ibv_mem_node {
 static struct ibv_mem_node *mm_root;
 static pthread_mutex_t mm_mutex = PTHREAD_MUTEX_INITIALIZER;
 static int page_size;
+static int huge_page_enabled;
 static int too_late;
 
+static int is_huge_page_enabled(void)
+{
+	int n, ret = 0;
+	char *bufp;
+	DIR *dir;
+	struct dirent *entry;
+	FILE *file;
+	unsigned long nr_hugepages;
+	char buf[1024];
+
+	dir = opendir("/sys/kernel/mm/hugepages/");
+	if (!dir)
+		return 0;
+
+	while ((entry = readdir(dir))) {
+		if (strncmp(entry->d_name, "hugepages-", 10))
+			continue;
+
+		snprintf(buf, sizeof(buf), "/sys/kernel/mm/hugepages/%s/nr_hugepages",
+				entry->d_name);
+
+		file = fopen(buf, "r");
+		if (!file)
+			continue;
+
+		bufp = fgets(buf, sizeof(buf), file);
+		fclose(file);
+		if (!bufp)
+			continue;
+
+		n = sscanf(buf, "%lu", &nr_hugepages);
+		if (n < 1)
+			continue;
+
+		if (nr_hugepages) {
+			ret = 1;
+			goto out;
+		}
+	}
+
+out:
+	closedir(dir);
+
+	return ret;
+}
+
+static unsigned long smaps_page_size(FILE *file)
+{
+	int n;
+	unsigned long size = page_size;
+	char buf[1024];
+
+	while (fgets(buf, sizeof(buf), file) != NULL) {
+		if (!strstr(buf, "KernelPageSize:"))
+			continue;
+
+		n = sscanf(buf, "%*s %lu", &size);
+		if (n < 1)
+			continue;
+
+		/* page size is printed in Kb */
+		size = size * 1024;
+
+		break;
+	}
+
+	return size;
+}
+
+static unsigned long get_page_size(void *base)
+{
+	unsigned long ret = page_size;
+	pid_t pid;
+	FILE *file;
+	char buf[1024];
+
+	pid = getpid();
+	snprintf(buf, sizeof(buf), "/proc/%d/smaps", pid);
+
+	file = fopen(buf, "r");
+	if (!file)
+		goto out;
+
+	while (fgets(buf, sizeof(buf), file) != NULL) {
+		int n;
+		uintptr_t range_start, range_end;
+
+		n = sscanf(buf, "%lx-%lx", &range_start, &range_end);
+
+		if (n < 2)
+			continue;
+
+		if ((uintptr_t) base >= range_start && (uintptr_t) base < range_end) {
+			ret = smaps_page_size(file);
+			break;
+		}
+	}
+	fclose(file);
+
+out:
+	return ret;
+}
+
 int ibv_fork_init(void)
 {
-	void *tmp;
+	void *tmp, *tmp_aligned;
 	int ret;
+	unsigned long size;
 
 	if (mm_root)
 		return 0;
@@ -88,8 +197,18 @@ int ibv_fork_init(void)
 	if (posix_memalign(&tmp, page_size, page_size))
 		return ENOMEM;
 
-	ret = madvise(tmp, page_size, MADV_DONTFORK) ||
-	      madvise(tmp, page_size, MADV_DOFORK);
+	huge_page_enabled = is_huge_page_enabled();
+
+	if (huge_page_enabled) {
+		size = get_page_size(tmp);
+		tmp_aligned = (void *)((uintptr_t)tmp & ~(size - 1));
+	} else {
+		size = page_size;
+		tmp_aligned = tmp;
+	}
+
+	ret = madvise(tmp_aligned, size, MADV_DONTFORK) ||
+	      madvise(tmp_aligned, size, MADV_DOFORK);
 
 	free(tmp);
 
@@ -452,15 +571,21 @@ static int ibv_madvise_range(void *base,
 	struct ibv_mem_node *node, *tmp;
 	int inc;
 	int ret = 0;
+	unsigned long range_page_size;
 
 	if (!size)
 		return 0;
 
 	inc = advice == MADV_DONTFORK ? 1 : -1;
 
-	start = (uintptr_t) base & ~(page_size - 1);
-	end   = ((uintptr_t) (base + size + page_size - 1) &
-		 ~(page_size - 1)) - 1;
+	if (huge_page_enabled)
+		range_page_size = get_page_size(base);
+	else
+		range_page_size = page_size;
+
+	start = (uintptr_t) base & ~(range_page_size - 1);
+	end   = ((uintptr_t) (base + size + range_page_size - 1) &
+		 ~(range_page_size - 1)) - 1;
 
 	pthread_mutex_lock(&mm_mutex);
 
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH v2] libibverbs: ibv_fork_init() and libhugetlbfs
  2010-05-31  9:13 [PATCH v2] libibverbs: ibv_fork_init() and libhugetlbfs Alexander Schmidt
@ 2010-06-02 21:49 ` Roland Dreier
       [not found]   ` <adamxvdqf4e.fsf-BjVyx320WGW9gfZ95n9DRSW4+XlvGpQz@public.gmane.org>
  0 siblings, 1 reply; 9+ messages in thread
From: Roland Dreier @ 2010-06-02 21:49 UTC (permalink / raw)
  To: Alexander Schmidt
  Cc: Stefan Roscher, Christoph Raisch, of-ewg, Linux RDMA, Alex Vainman

 > without patch:
 > 1M memory region    120usec
 > 16M memory region  1970usec 
 > 
 > with patch v2:
 > 1M memory region    172usec
 > 16M memory region  2030usec

So if I read this correctly this patch introduces almost a 50% overhead
in the 1M case... and probably much worse (as a fraction) in say the 64K
or 4K case.  I wonder if that's acceptable?

Alex's original approach was to try the madvise with normal page size
and then fall back to huge page size if that failed.  But of course that
wastes some time on the failed madvise in the hugepage case.

I think it would be interesting to compare timings for registering, say,
4K, 64K, 1M and 16M regions with and without huge page backing, for
three possibilities:

 - unpatched libibverbs (will obviously fail on hugepage backed regions)
 - patched with this v2
 - alternate patch that tries madvise and only does your /proc/pid/smaps
   parsing if the first madvise fails.

 - R.
-- 
Roland Dreier <rolandd-FYB4Gu1CFyUAvxtiuMwx3w@public.gmane.org> || For corporate legal information go to:
http://www.cisco.com/web/about/doing_business/legal/cri/index.html
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH v2] libibverbs: ibv_fork_init() and libhugetlbfs
       [not found]   ` <adamxvdqf4e.fsf-BjVyx320WGW9gfZ95n9DRSW4+XlvGpQz@public.gmane.org>
@ 2010-06-09  9:47     ` Alexander Schmidt
  2010-06-09 18:09       ` Roland Dreier
  0 siblings, 1 reply; 9+ messages in thread
From: Alexander Schmidt @ 2010-06-09  9:47 UTC (permalink / raw)
  To: Roland Dreier
  Cc: Stefan Roscher, Christoph Raisch, of-ewg, Linux RDMA, Alex Vainman

On Wed, 02 Jun 2010 14:49:37 -0700
Roland Dreier <rdreier-FYB4Gu1CFyUAvxtiuMwx3w@public.gmane.org> wrote:

> So if I read this correctly this patch introduces almost a 50% overhead
> in the 1M case... and probably much worse (as a fraction) in say the 64K
> or 4K case.  I wonder if that's acceptable?

We don't think this is acceptable, so we like the third approach you suggested
very much. I've written the code and attached it below. This third version
does not introduce additional overhead when not using huge pages (verified
with 4k, 64k, 1m and 16m memory regions).

Problem description:

When fork support is enabled in libibverbs, madvise() is called for every
memory page that is registered as a memory region. Memory ranges that
are passed to madvise() must be page aligned and the size must be a
multiple of the page size. libibverbs uses sysconf(_SC_PAGESIZE) to find
out the system page size and rounds all ranges passed to reg_mr() according
to this page size. When memory from libhugetlbfs is passed to reg_mr(), this
does not work as the page size for this memory range might be different
(e.g. 16Mb). So libibverbs would have to use the huge page size to
calculate a page aligned range for madvise.

As huge pages are provided to the application "under the hood" when
preloading libhugetlbfs, the application does not have any knowledge about
when it registers a huge page or a usual page.

To work around this issue, detect the use of huge pages in libibverbs and
align memory ranges passed to madvise according to the huge page size.

Changes since v1:

- detect use of huge pages at ibv_fork_init() time by walking through
  /sys/kernel/mm/hugepages/
- read huge page size from /proc/pid/smaps, which contains the page
  size of the mapping (thereby enabling support for multiple huge
  page sizes)
- code is independent of libhugetlbfs now, so huge pages can be provided
  to the application by any library

Changes since v2:

- reading from /proc/ to determine the huge page size is now only done
  when a call to madvise() using the system page size fails. So there
  is no overhead introduced when registering non-huge-page memory.

Signed-off-by: Alexander Schmidt <alexs-23VcF4HTsmIX0ybBhKVfKdBPR1lH4CV8@public.gmane.org>
---
 src/memory.c |   96 +++++++++++++++++++++++++++++++++++++++++++++++++++++++----
 1 file changed, 90 insertions(+), 6 deletions(-)

--- libibverbs.git.orig/src/memory.c
+++ libibverbs.git/src/memory.c
@@ -40,6 +40,8 @@
 #include <unistd.h>
 #include <stdlib.h>
 #include <stdint.h>
+#include <stdio.h>
+#include <string.h>
 
 #include "ibverbs.h"
 
@@ -70,10 +72,64 @@ static pthread_mutex_t mm_mutex = PTHREA
 static int page_size;
 static int too_late;
 
+static unsigned long smaps_page_size(FILE *file)
+{
+	int n;
+	unsigned long size = 0;
+	char buf[1024];
+
+	while (fgets(buf, sizeof(buf), file) != NULL) {
+		if (!strstr(buf, "KernelPageSize:"))
+			continue;
+
+		n = sscanf(buf, "%*s %lu", &size);
+		if (n < 1)
+			continue;
+
+		/* page size is printed in Kb */
+		size = size * 1024;
+
+		break;
+	}
+
+	return size;
+}
+
+static unsigned long get_page_size(void *base)
+{
+	unsigned long ret = 0;
+	FILE *file;
+	char buf[1024];
+
+	file = fopen("/proc/self/smaps", "r");
+	if (!file)
+		goto out;
+
+	while (fgets(buf, sizeof(buf), file) != NULL) {
+		int n;
+		uintptr_t range_start, range_end;
+
+		n = sscanf(buf, "%lx-%lx", &range_start, &range_end);
+
+		if (n < 2)
+			continue;
+
+		if ((uintptr_t) base >= range_start && (uintptr_t) base < range_end) {
+			ret = smaps_page_size(file);
+			break;
+		}
+	}
+	fclose(file);
+
+out:
+	return ret;
+}
+
 int ibv_fork_init(void)
 {
-	void *tmp;
+	void *tmp, *tmp_aligned;
 	int ret;
+	unsigned long size;
 
 	if (mm_root)
 		return 0;
@@ -88,8 +144,17 @@ int ibv_fork_init(void)
 	if (posix_memalign(&tmp, page_size, page_size))
 		return ENOMEM;
 
-	ret = madvise(tmp, page_size, MADV_DONTFORK) ||
-	      madvise(tmp, page_size, MADV_DOFORK);
+	size = get_page_size(tmp);
+
+	if (size)
+		tmp_aligned = (void *)((uintptr_t)tmp & ~(size - 1));
+	else {
+		size = page_size;
+		tmp_aligned = tmp;
+	}
+
+	ret = madvise(tmp_aligned, size, MADV_DONTFORK) ||
+	      madvise(tmp_aligned, size, MADV_DOFORK);
 
 	free(tmp);
 
@@ -522,7 +587,8 @@ static struct ibv_mem_node *undo_node(st
 	return node;
 }
 
-static int ibv_madvise_range(void *base, size_t size, int advice)
+static int ibv_madvise_range(void *base, size_t size, int advice,
+			     unsigned long page_size)
 {
 	uintptr_t start, end;
 	struct ibv_mem_node *node, *tmp;
@@ -612,10 +678,28 @@ out:
 	return ret;
 }
 
+static int ibv_fork_range(void *base, size_t size, int advice)
+{
+	int ret;
+	unsigned long range_page_size;
+
+	ret = ibv_madvise_range(base, size, advice, page_size);
+
+	if (ret == -1 && errno == EINVAL) {
+		range_page_size = get_page_size(base);
+
+		if (range_page_size)
+			ret = ibv_madvise_range(base, size, advice,
+						range_page_size);
+	}
+
+	return ret;
+}
+
 int ibv_dontfork_range(void *base, size_t size)
 {
 	if (mm_root)
-		return ibv_madvise_range(base, size, MADV_DONTFORK);
+		return ibv_fork_range(base, size, MADV_DONTFORK);
 	else {
 		too_late = 1;
 		return 0;
@@ -625,7 +709,7 @@ int ibv_dontfork_range(void *base, size_
 int ibv_dofork_range(void *base, size_t size)
 {
 	if (mm_root)
-		return ibv_madvise_range(base, size, MADV_DOFORK);
+		return ibv_fork_range(base, size, MADV_DOFORK);
 	else {
 		too_late = 1;
 		return 0;
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH v2] libibverbs: ibv_fork_init() and libhugetlbfs
  2010-06-09  9:47     ` Alexander Schmidt
@ 2010-06-09 18:09       ` Roland Dreier
       [not found]         ` <adamxv4m620.fsf-BjVyx320WGW9gfZ95n9DRSW4+XlvGpQz@public.gmane.org>
  0 siblings, 1 reply; 9+ messages in thread
From: Roland Dreier @ 2010-06-09 18:09 UTC (permalink / raw)
  To: Alexander Schmidt
  Cc: Stefan Roscher, Christoph Raisch, of-ewg, Linux RDMA, Alex Vainman

Thanks, nice work.  I like this approach.  Alex (Vainman) any comments
on this?

 - R.
-- 
Roland Dreier <rolandd-FYB4Gu1CFyUAvxtiuMwx3w@public.gmane.org> || For corporate legal information go to:
http://www.cisco.com/web/about/doing_business/legal/cri/index.html
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH v2] libibverbs: ibv_fork_init() and libhugetlbfs
       [not found]         ` <adamxv4m620.fsf-BjVyx320WGW9gfZ95n9DRSW4+XlvGpQz@public.gmane.org>
@ 2010-06-10 14:59           ` Alex Vainman
       [not found]             ` <4C10FDD0.8000108-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org>
  0 siblings, 1 reply; 9+ messages in thread
From: Alex Vainman @ 2010-06-10 14:59 UTC (permalink / raw)
  To: Roland Dreier
  Cc: Alexander Schmidt, Stefan Roscher, Christoph Raisch, of-ewg, Linux RDMA

Roland Dreier wrote:
> Thanks, nice work.  I like this approach.  Alex (Vainman) any comments
> on this?
> 
>  - R.

The solution looks great.

Thanks,
Alex
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH v2] libibverbs: ibv_fork_init() and libhugetlbfs
       [not found]             ` <4C10FDD0.8000108-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org>
@ 2010-06-28 15:18               ` Alexander Schmidt
  2010-07-03 20:19                 ` [ewg] " Roland Dreier
  0 siblings, 1 reply; 9+ messages in thread
From: Alexander Schmidt @ 2010-06-28 15:18 UTC (permalink / raw)
  To: alexv-smomgflXvOZWk0Htik3J/w
  Cc: alexonlists-Re5JQEeQqe8AvxtiuMwx3w, Roland Dreier,
	Stefan Roscher, Christoph Raisch, of-ewg, Linux RDMA

On Thu, 10 Jun 2010 17:59:28 +0300
Alex Vainman <alexonlists-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org> wrote:

> Wrote Roland Dreier:
> > Thanks, nice work.  I like this approach.  Alex (Vainman) any comments
> > on this?
> > 
> >  - R.
> 
> The solution looks great.

Hi all,

in our further testing, we noticed that there is a substantial problem with
the current solution. Depending on the order of memory registrations, we might
end up with a corrupted node tree which blocks regions from being registered.

 When registering two memory regions A and B from within
the same huge page, we will end up with one node in the tree which covers the
whole huge page after registering A. When the second MR is registered, a node
is created with the MR size rounded to the system page size (as there is no
need to call madvise(), it is not noticed that MR B is part of a huge page).

Now if MR A is deregistered before MR B, I see that the tree containing
mem_nodes is empty afterwards, which causes problems for the deregistration of
MR B, leaving the tree in a corrupted state with negative refcounts. This also
breaks later registrations of other memory regions within this huge page.

At the moment I do not see an obvious solution for this, but it's clear that
an overhaul of this code is needed. I'm writing this to make sure that there
won't be a release of libibverbs containing this incomplete code, but also
to ask for comments from other people who might have an idea on how to fix
this.

Thanks for any comments!

Alex
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [ewg] [PATCH v2] libibverbs: ibv_fork_init() and libhugetlbfs
  2010-06-28 15:18               ` Alexander Schmidt
@ 2010-07-03 20:19                 ` Roland Dreier
       [not found]                   ` <adaoceonwsk.fsf-BjVyx320WGW9gfZ95n9DRSW4+XlvGpQz@public.gmane.org>
  0 siblings, 1 reply; 9+ messages in thread
From: Roland Dreier @ 2010-07-03 20:19 UTC (permalink / raw)
  To: Alexander Schmidt
  Cc: alexv-smomgflXvOZWk0Htik3J/w, alexonlists-Re5JQEeQqe8AvxtiuMwx3w,
	of-ewg, Linux RDMA, Christoph Raisch, Stefan Roscher

 >  When registering two memory regions A and B from within
 > the same huge page, we will end up with one node in the tree which covers the
 > whole huge page after registering A. When the second MR is registered, a node
 > is created with the MR size rounded to the system page size (as there is no
 > need to call madvise(), it is not noticed that MR B is part of a huge page).
 > 
 > Now if MR A is deregistered before MR B, I see that the tree containing
 > mem_nodes is empty afterwards, which causes problems for the deregistration of
 > MR B, leaving the tree in a corrupted state with negative refcounts. This also
 > breaks later registrations of other memory regions within this huge page.

Good thing I didn't get around to applying the patch yet ;)

I haven't thought this through fully, but it seems that maybe we could
extend the madvise tracking tree to keep track of the page size used for
each node in the tree.  Then for the registration of MR B above, we
would find the node for MR A covered MR B and we should be able to get
the ref counting right.

 - R.
-- 
Roland Dreier <rolandd-FYB4Gu1CFyUAvxtiuMwx3w@public.gmane.org> || For corporate legal information go to:
http://www.cisco.com/web/about/doing_business/legal/cri/index.html
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [ewg] [PATCH v2] libibverbs: ibv_fork_init() and libhugetlbfs
       [not found]                   ` <adaoceonwsk.fsf-BjVyx320WGW9gfZ95n9DRSW4+XlvGpQz@public.gmane.org>
@ 2010-07-06 15:25                     ` Alexander Schmidt
  2010-07-06 21:31                       ` Roland Dreier
  0 siblings, 1 reply; 9+ messages in thread
From: Alexander Schmidt @ 2010-07-06 15:25 UTC (permalink / raw)
  To: Roland Dreier
  Cc: alexv-smomgflXvOZWk0Htik3J/w, alexonlists-Re5JQEeQqe8AvxtiuMwx3w,
	of-ewg, Linux RDMA, Christoph Raisch, Stefan Roscher

On Sat, 03 Jul 2010 13:19:07 -0700
Roland Dreier <rdreier-FYB4Gu1CFyUAvxtiuMwx3w@public.gmane.org> wrote:

>  >  When registering two memory regions A and B from within
>  > the same huge page, we will end up with one node in the tree which covers the
>  > whole huge page after registering A. When the second MR is registered, a node
>  > is created with the MR size rounded to the system page size (as there is no
>  > need to call madvise(), it is not noticed that MR B is part of a huge page).
>  > 
>  > Now if MR A is deregistered before MR B, I see that the tree containing
>  > mem_nodes is empty afterwards, which causes problems for the deregistration of
>  > MR B, leaving the tree in a corrupted state with negative refcounts. This also
>  > breaks later registrations of other memory regions within this huge page.
> 
> Good thing I didn't get around to applying the patch yet ;)
> 
> I haven't thought this through fully, but it seems that maybe we could
> extend the madvise tracking tree to keep track of the page size used for
> each node in the tree.  Then for the registration of MR B above, we
> would find the node for MR A covered MR B and we should be able to get
> the ref counting right.

We thought about this too, but in some special cases, we do not know the
correct page size of a memory range. For example when getting a 16M chunk
from a 16M huge page region which is also aligned to 16M, the first madvise()
will work fine and the code will assume that the page size is 64K.

If trying to register a 16M - 64K + 1 byte region, the first madvise() also
works fine. Now if a second memory region which resides in the last 64K is
registered, we end up with the same situation as above.

As this issue was not present in version 2 of the code, but there we had
a big performance penalty, I suggest the following: we could go back to
version 2 and introduce a new RDMAV_HUGEPAGE_SAFE env variable to let the user
decide between huge page support and better performance (the same approach we
use for the COW protection itself). Would this be okay or do you see a problem
with this?

Regards,
Alex
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [ewg] [PATCH v2] libibverbs: ibv_fork_init() and libhugetlbfs
  2010-07-06 15:25                     ` Alexander Schmidt
@ 2010-07-06 21:31                       ` Roland Dreier
  0 siblings, 0 replies; 9+ messages in thread
From: Roland Dreier @ 2010-07-06 21:31 UTC (permalink / raw)
  To: Alexander Schmidt
  Cc: alexv-smomgflXvOZWk0Htik3J/w, alexonlists-Re5JQEeQqe8AvxtiuMwx3w,
	of-ewg, Linux RDMA, Christoph Raisch, Stefan Roscher

 > We thought about this too, but in some special cases, we do not know the
 > correct page size of a memory range. For example when getting a 16M chunk
 > from a 16M huge page region which is also aligned to 16M, the first madvise()
 > will work fine and the code will assume that the page size is 64K.

I see ... yes, that does break my idea completely.

OK, another half-baked idea: what if we pay attention to when
madvise(DOFORK) fails as well as when madvise(DONTFORK) fails, and use
that as a hint that we had better check the page size?

Perhaps this adds too much complexity ... in which case your idea:

 > As this issue was not present in version 2 of the code, but there we had
 > a big performance penalty, I suggest the following: we could go back to
 > version 2 and introduce a new RDMAV_HUGEPAGE_SAFE env variable to let the user
 > decide between huge page support and better performance (the same approach we
 > use for the COW protection itself).

seems like a reasonable alternative.

Thanks,
  Roland
-- 
Roland Dreier <rolandd-FYB4Gu1CFyUAvxtiuMwx3w@public.gmane.org> || For corporate legal information go to:
http://www.cisco.com/web/about/doing_business/legal/cri/index.html
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 9+ messages in thread

end of thread, other threads:[~2010-07-06 21:31 UTC | newest]

Thread overview: 9+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2010-05-31  9:13 [PATCH v2] libibverbs: ibv_fork_init() and libhugetlbfs Alexander Schmidt
2010-06-02 21:49 ` Roland Dreier
     [not found]   ` <adamxvdqf4e.fsf-BjVyx320WGW9gfZ95n9DRSW4+XlvGpQz@public.gmane.org>
2010-06-09  9:47     ` Alexander Schmidt
2010-06-09 18:09       ` Roland Dreier
     [not found]         ` <adamxv4m620.fsf-BjVyx320WGW9gfZ95n9DRSW4+XlvGpQz@public.gmane.org>
2010-06-10 14:59           ` Alex Vainman
     [not found]             ` <4C10FDD0.8000108-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org>
2010-06-28 15:18               ` Alexander Schmidt
2010-07-03 20:19                 ` [ewg] " Roland Dreier
     [not found]                   ` <adaoceonwsk.fsf-BjVyx320WGW9gfZ95n9DRSW4+XlvGpQz@public.gmane.org>
2010-07-06 15:25                     ` Alexander Schmidt
2010-07-06 21:31                       ` Roland Dreier

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.