* [PATCH v2 1/8] cache: get rid of search loop in cache_add()
  2015-03-06 14:03 [PATCH v2 0/8] Handle mmaped regions in cache Petr Tesarik
@ 2015-03-06  8:23 ` Petr Tesarik
  2015-03-06  8:52 ` [PATCH v2 2/8] cache: allow to return a page to the pool Petr Tesarik
                   ` (8 subsequent siblings)
  9 siblings, 0 replies; 20+ messages in thread
From: Petr Tesarik @ 2015-03-06  8:23 UTC (permalink / raw)
  To: Atsushi Kumagai, Michael Holzheu; +Cc: kexec mailing list, Jan Willeke

The intention was for the cache code to be re-entrant, so every cache
entry should go through these states:

  1. free
  2. pending read
  3. used

The cache_add() function moves an entry from state 2 to state 3, but
because the caller did not know the cache entry pointer, it had to search
the pending list for a pending read at the given physical address. This
search becomes unnecessary if cache_alloc() returns the entry pointer.
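
For illustration, a minimal sketch of the intended calling pattern after
this change (readpage() stands in for any of the backend readers and is
not literal makedumpfile code):

#include "cache.h"

static void *read_page_cached(unsigned long long paddr)
{
	void *buf = cache_search(paddr);		/* state 3: used */
	if (buf)
		return buf;

	struct cache_entry *entry = cache_alloc(paddr);	/* 1 -> 2: pending */
	if (!entry)
		return NULL;

	if (!readpage(paddr, entry->bufptr))		/* fill the entry */
		return NULL;	/* a failed read leaves it pending; patch 2/8 addresses this */

	cache_add(entry);				/* 2 -> 3: no search needed */
	return entry->bufptr;
}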

Signed-off-by: Petr Tesarik <ptesarik@suse.cz>
---
 cache.c        | 26 ++++++--------------------
 cache.h        | 10 ++++++++--
 makedumpfile.c |  8 +++++---
 3 files changed, 19 insertions(+), 25 deletions(-)

diff --git a/cache.c b/cache.c
index 0dd957c..700ba0c 100644
--- a/cache.c
+++ b/cache.c
@@ -20,12 +20,6 @@
 #include "cache.h"
 #include "print_info.h"
 
-struct cache_entry {
-	unsigned long long paddr;
-	void *bufptr;
-	struct cache_entry *next, *prev;
-};
-
 struct cache {
 	struct cache_entry *head, *tail;
 };
@@ -98,38 +92,30 @@ cache_search(unsigned long long paddr)
 	return NULL;		/* cache miss */
 }
 
-void *
+struct cache_entry *
 cache_alloc(unsigned long long paddr)
 {
 	struct cache_entry *entry = NULL;
 
 	if (avail) {
 		entry = &pool[--avail];
-		entry->paddr = paddr;
 		add_entry(&pending, entry);
 	} else if (pending.tail) {
 		entry = pending.tail;
-		entry->paddr = paddr;
 	} else if (used.tail) {
 		entry = used.tail;
 		remove_entry(&used, entry);
-		entry->paddr = paddr;
 		add_entry(&pending, entry);
 	} else
 		return NULL;
 
-	return entry->bufptr;
+	entry->paddr = paddr;
+	return entry;
 }
 
 void
-cache_add(unsigned long long paddr)
+cache_add(struct cache_entry *entry)
 {
-	struct cache_entry *entry;
-	for (entry = pending.head; entry; entry = entry->next) {
-		if (entry->paddr == paddr) {
-			remove_entry(&pending, entry);
-			add_entry(&used, entry);
-			break;
-		}
-	}
+	remove_entry(&pending, entry);
+	add_entry(&used, entry);
 }
diff --git a/cache.h b/cache.h
index 4730e12..dab8eb9 100644
--- a/cache.h
+++ b/cache.h
@@ -19,9 +19,15 @@
 #ifndef _CACHE_H
 #define _CACHE_H
 
+struct cache_entry {
+	unsigned long long paddr;
+	void *bufptr;
+	struct cache_entry *next, *prev;
+};
+
 int cache_init(void);
 void *cache_search(unsigned long long paddr);
-void *cache_alloc(unsigned long long paddr);
-void cache_add(unsigned long long paddr);
+struct cache_entry *cache_alloc(unsigned long long paddr);
+void cache_add(struct cache_entry *entry);
 
 #endif	/* _CACHE_H */
diff --git a/makedumpfile.c b/makedumpfile.c
index 74bc9db..828adeb 100644
--- a/makedumpfile.c
+++ b/makedumpfile.c
@@ -591,6 +591,7 @@ readmem(int type_addr, unsigned long long addr, void *bufptr, size_t size)
 	unsigned long long paddr, maddr = NOT_PADDR;
 	unsigned long long pgaddr;
 	void *pgbuf;
+	struct cache_entry *cached;
 
 next_page:
 	switch (type_addr) {
@@ -644,9 +645,10 @@ next_page:
 	pgaddr = PAGEBASE(paddr);
 	pgbuf = cache_search(pgaddr);
 	if (!pgbuf) {
-		pgbuf = cache_alloc(pgaddr);
-		if (!pgbuf)
+		cached = cache_alloc(pgaddr);
+		if (!cached)
 			goto error;
+		pgbuf = cached->bufptr;
 
 		if (info->flag_refiltering) {
 			if (!readpage_kdump_compressed(pgaddr, pgbuf))
@@ -658,7 +660,7 @@ next_page:
 			if (!readpage_elf(pgaddr, pgbuf))
 				goto error;
 		}
-		cache_add(pgaddr);
+		cache_add(cached);
 	}
 
 	memcpy(bufptr, pgbuf + PAGEOFFSET(paddr), read_size);
-- 
1.8.4.5



* [PATCH v2 2/8] cache: allow to return a page to the pool
  2015-03-06 14:03 [PATCH v2 0/8] Handle mmaped regions in cache Petr Tesarik
  2015-03-06  8:23 ` [PATCH v2 1/8] cache: get rid of search loop in cache_add() Petr Tesarik
@ 2015-03-06  8:52 ` Petr Tesarik
  2015-03-06  8:59 ` [PATCH v2 3/8] cache: do not allocate from the pending list Petr Tesarik
                   ` (7 subsequent siblings)
  9 siblings, 0 replies; 20+ messages in thread
From: Petr Tesarik @ 2015-03-06  8:52 UTC (permalink / raw)
  To: Atsushi Kumagai, Michael Holzheu; +Cc: kexec mailing list, Jan Willeke

After a failed read, the page should no longer be on the pending list;
instead, its cache entry should be returned to the pool so it is
available for future allocation.
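
In terms of the sketch from patch 1/8, the failed-read branch now becomes
(readpage() again being a stand-in for the real backend readers):

	if (!readpage(paddr, entry->bufptr)) {
		cache_free(entry);	/* back to the pool, no longer pending */
		return NULL;
	}
	cache_add(entry);		/* successful read: move to the used list */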

Signed-off-by: Petr Tesarik <ptesarik@suse.cz>
---
 cache.c        | 15 ++++++++++++---
 cache.h        |  1 +
 makedumpfile.c |  8 +++++---
 3 files changed, 18 insertions(+), 6 deletions(-)

diff --git a/cache.c b/cache.c
index 700ba0c..ad9f0f1 100644
--- a/cache.c
+++ b/cache.c
@@ -26,7 +26,8 @@ struct cache {
 
 /* 8 pages covers 4-level paging plus 4 data pages */
 #define CACHE_SIZE	8
-static struct cache_entry pool[CACHE_SIZE];
+static struct cache_entry entries[CACHE_SIZE];
+static struct cache_entry *pool[CACHE_SIZE];
 static int avail = CACHE_SIZE;
 
 static struct cache used, pending;
@@ -44,7 +45,8 @@ cache_init(void)
 			       strerror(errno));
 			return FALSE;
 		}
-		pool[i].bufptr = bufptr;
+		entries[i].bufptr = bufptr;
+		pool[i] = &entries[i];
 	}
 
 	return TRUE;
@@ -98,7 +100,7 @@ cache_alloc(unsigned long long paddr)
 	struct cache_entry *entry = NULL;
 
 	if (avail) {
-		entry = &pool[--avail];
+		entry = pool[--avail];
 		add_entry(&pending, entry);
 	} else if (pending.tail) {
 		entry = pending.tail;
@@ -119,3 +121,10 @@ cache_add(struct cache_entry *entry)
 	remove_entry(&pending, entry);
 	add_entry(&used, entry);
 }
+
+void
+cache_free(struct cache_entry *entry)
+{
+	remove_entry(&pending, entry);
+	pool[avail++] = entry;
+}
diff --git a/cache.h b/cache.h
index dab8eb9..0e65f97 100644
--- a/cache.h
+++ b/cache.h
@@ -29,5 +29,6 @@ int cache_init(void);
 void *cache_search(unsigned long long paddr);
 struct cache_entry *cache_alloc(unsigned long long paddr);
 void cache_add(struct cache_entry *entry);
+void cache_free(struct cache_entry *entry);
 
 #endif	/* _CACHE_H */
diff --git a/makedumpfile.c b/makedumpfile.c
index 828adeb..c62d035 100644
--- a/makedumpfile.c
+++ b/makedumpfile.c
@@ -652,13 +652,13 @@ next_page:
 
 		if (info->flag_refiltering) {
 			if (!readpage_kdump_compressed(pgaddr, pgbuf))
-				goto error;
+				goto error_cached;
 		} else if (info->flag_sadump) {
 			if (!readpage_sadump(pgaddr, pgbuf))
-				goto error;
+				goto error_cached;
 		} else {
 			if (!readpage_elf(pgaddr, pgbuf))
-				goto error;
+				goto error_cached;
 		}
 		cache_add(cached);
 	}
@@ -674,6 +674,8 @@ next_page:
 
 	return size_orig;
 
+error_cached:
+	cache_free(cached);
 error:
 	ERRMSG("type_addr: %d, addr:%llx, size:%zd\n", type_addr, addr, size_orig);
 	return FALSE;
-- 
1.8.4.5



* [PATCH v2 3/8] cache: do not allocate from the pending list
  2015-03-06 14:03 [PATCH v2 0/8] Handle mmaped regions in cache Petr Tesarik
  2015-03-06  8:23 ` [PATCH v2 1/8] cache: get rid of search loop in cache_add() Petr Tesarik
  2015-03-06  8:52 ` [PATCH v2 2/8] cache: allow to return a page to the pool Petr Tesarik
@ 2015-03-06  8:59 ` Petr Tesarik
  2015-03-06  9:26 ` [PATCH v2 4/8] cache: add hit/miss statistics to the final report Petr Tesarik
                   ` (6 subsequent siblings)
  9 siblings, 0 replies; 20+ messages in thread
From: Petr Tesarik @ 2015-03-06  8:59 UTC (permalink / raw)
  To: Atsushi Kumagai, Michael Holzheu; +Cc: kexec mailing list, Jan Willeke

Since pending entries are still being read, they must not be reused. This
change allows recursive use of the cache (reading pages from within
readpage itself). Although makedumpfile does not use this feature right
now, it was the original intention of the pending list.
With this change, cache_alloc() may return NULL if and only if the
recursion depth exceeds CACHE_SIZE.
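
A hypothetical example of the recursion this enables (the pte helpers and
read_raw_block() below are illustrative names, not existing makedumpfile
functions): a backend reader that has to translate an address may call
back into readmem(), which then allocates a second cache entry while the
first one is still pending:

/* Illustrative only: outer readmem() -> readpage_translating() ->
 * nested readmem() -> cache_alloc(). Each nesting level holds one
 * pending entry, so NULL can only be returned once the nesting depth
 * exhausts all CACHE_SIZE entries. */
static int readpage_translating(unsigned long long paddr, void *bufptr)
{
	unsigned long long pte;

	/* nested read while the caller's cache entry is still pending */
	if (!readmem(PADDR, pte_address_of(paddr), &pte, sizeof(pte)))
		return FALSE;

	return read_raw_block(pte_to_file_offset(pte), bufptr);
}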

Signed-off-by: Petr Tesarik <ptesarik@suse.cz>
---
 cache.c | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/cache.c b/cache.c
index ad9f0f1..344b4f6 100644
--- a/cache.c
+++ b/cache.c
@@ -101,16 +101,13 @@ cache_alloc(unsigned long long paddr)
 
 	if (avail) {
 		entry = pool[--avail];
-		add_entry(&pending, entry);
-	} else if (pending.tail) {
-		entry = pending.tail;
 	} else if (used.tail) {
 		entry = used.tail;
 		remove_entry(&used, entry);
-		add_entry(&pending, entry);
 	} else
 		return NULL;
 
+	add_entry(&pending, entry);
 	entry->paddr = paddr;
 	return entry;
 }
-- 
1.8.4.5



* [PATCH v2 4/8] cache: add hit/miss statistics to the final report
  2015-03-06 14:03 [PATCH v2 0/8] Handle mmaped regions in cache Petr Tesarik
                   ` (2 preceding siblings ...)
  2015-03-06  8:59 ` [PATCH v2 3/8] cache: do not allocate from the pending list Petr Tesarik
@ 2015-03-06  9:26 ` Petr Tesarik
  2015-03-06 13:07 ` [PATCH v2 5/8] cache: allocate buffers in one big chunk Petr Tesarik
                   ` (5 subsequent siblings)
  9 siblings, 0 replies; 20+ messages in thread
From: Petr Tesarik @ 2015-03-06  9:26 UTC (permalink / raw)
  To: Atsushi Kumagai, Michael Holzheu; +Cc: kexec mailing list, Jan Willeke

Add the most basic cache statistics (page hits and misses). Note that
the hit rate is not printed if the cache was never used, to avoid
division by zero.

Signed-off-by: Petr Tesarik <ptesarik@suse.cz>
---
 makedumpfile.c | 13 ++++++++++++-
 1 file changed, 12 insertions(+), 1 deletion(-)

diff --git a/makedumpfile.c b/makedumpfile.c
index c62d035..d778139 100644
--- a/makedumpfile.c
+++ b/makedumpfile.c
@@ -39,6 +39,10 @@ struct SplitBlock		*splitblock = NULL;
 
 char filename_stdout[] = FILENAME_STDOUT;
 
+/* Cache statistics */
+static unsigned long long	cache_hit;
+static unsigned long long	cache_miss;
+
 static void first_cycle(mdf_pfn_t start, mdf_pfn_t max, struct cycle *cycle)
 {
 	cycle->start_pfn = round(start, info->pfn_cyclic);
@@ -645,6 +649,7 @@ next_page:
 	pgaddr = PAGEBASE(paddr);
 	pgbuf = cache_search(pgaddr);
 	if (!pgbuf) {
+		++cache_miss;
 		cached = cache_alloc(pgaddr);
 		if (!cached)
 			goto error;
@@ -661,7 +666,8 @@ next_page:
 				goto error_cached;
 		}
 		cache_add(cached);
-	}
+	} else
+		++cache_hit;
 
 	memcpy(bufptr, pgbuf + PAGEOFFSET(paddr), read_size);
 
@@ -8294,6 +8300,11 @@ print_report(void)
 	REPORT_MSG("--------------------------------------------------\n");
 	REPORT_MSG("Total pages     : 0x%016llx\n", info->max_mapnr);
 	REPORT_MSG("\n");
+	REPORT_MSG("Cache hit: %lld, miss: %lld", cache_hit, cache_miss);
+	if (cache_hit + cache_miss)
+		REPORT_MSG(", hit rate: %.1f%%",
+		    100.0 * cache_hit / (cache_hit + cache_miss));
+	REPORT_MSG("\n\n");
 }
 
 static void
-- 
1.8.4.5



* [PATCH v2 5/8] cache: allocate buffers in one big chunk
  2015-03-06 14:03 [PATCH v2 0/8] Handle mmaped regions in cache Petr Tesarik
                   ` (3 preceding siblings ...)
  2015-03-06  9:26 ` [PATCH v2 4/8] cache: add hit/miss statistics to the final report Petr Tesarik
@ 2015-03-06 13:07 ` Petr Tesarik
  2015-03-06 13:10 ` [PATCH v2 6/8] cache: allow arbitrary size of cache entries Petr Tesarik
                   ` (4 subsequent siblings)
  9 siblings, 0 replies; 20+ messages in thread
From: Petr Tesarik @ 2015-03-06 13:07 UTC (permalink / raw)
  To: Atsushi Kumagai, Michael Holzheu; +Cc: kexec mailing list, Jan Willeke

Allocating all buffers in one big chunk allows callers to change the
buffer pointer, because cache_alloc() can now recompute the default value
(from the entry index) and reinitialize it before returning the cache
entry.

Signed-off-by: Petr Tesarik <ptesarik@suse.cz>
---
 cache.c | 26 +++++++++++++++-----------
 1 file changed, 15 insertions(+), 11 deletions(-)

diff --git a/cache.c b/cache.c
index 344b4f6..ccd67ca 100644
--- a/cache.c
+++ b/cache.c
@@ -32,23 +32,23 @@ static int avail = CACHE_SIZE;
 
 static struct cache used, pending;
 
+static void *cachebuf;
+
 int
 cache_init(void)
 {
-	void *bufptr;
 	int i;
 
-	for (i = 0; i < CACHE_SIZE; ++i) {
-		bufptr = malloc(info->page_size);
-		if (bufptr == NULL) {
-			ERRMSG("Can't allocate memory for cache. %s\n",
-			       strerror(errno));
-			return FALSE;
-		}
-		entries[i].bufptr = bufptr;
-		pool[i] = &entries[i];
+	cachebuf = malloc(info->page_size * CACHE_SIZE);
+	if (cachebuf == NULL) {
+		ERRMSG("Can't allocate memory for cache. %s\n",
+		       strerror(errno));
+		return FALSE;
 	}
 
+	for (i = 0; i < CACHE_SIZE; ++i)
+		pool[i] = &entries[i];
+
 	return TRUE;
 }
 
@@ -98,6 +98,7 @@ struct cache_entry *
 cache_alloc(unsigned long long paddr)
 {
 	struct cache_entry *entry = NULL;
+	int idx;
 
 	if (avail) {
 		entry = pool[--avail];
@@ -107,8 +108,11 @@ cache_alloc(unsigned long long paddr)
 	} else
 		return NULL;
 
-	add_entry(&pending, entry);
+	idx = entry - entries;
 	entry->paddr = paddr;
+	entry->bufptr = cachebuf + idx * info->page_size;
+	add_entry(&pending, entry);
+
 	return entry;
 }
 
-- 
1.8.4.5



* [PATCH v2 6/8] cache: allow arbitrary size of cache entries
  2015-03-06 14:03 [PATCH v2 0/8] Handle mmaped regions in cache Petr Tesarik
                   ` (4 preceding siblings ...)
  2015-03-06 13:07 ` [PATCH v2 5/8] cache: allocate buffers in one big chunk Petr Tesarik
@ 2015-03-06 13:10 ` Petr Tesarik
  2015-03-06 13:23 ` [PATCH v2 7/8] cache: store mapped regions directly in the cache Petr Tesarik
                   ` (3 subsequent siblings)
  9 siblings, 0 replies; 20+ messages in thread
From: Petr Tesarik @ 2015-03-06 13:10 UTC (permalink / raw)
  To: Atsushi Kumagai, Michael Holzheu; +Cc: kexec mailing list, Jan Willeke

Until this commit, the length of every cache entry was assumed to be
exactly one page. To make other sizes possible, the length is now
stored in struct cache_entry. Note that cache_search() may now return a
pointer into the middle of a cache buffer.
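
A small worked example of the new containment test (the numbers below
are illustrative): an entry caching a 1 MiB region at 0x100000 satisfies
a 4 KiB lookup at 0x180000 and yields a pointer 0x80000 bytes into its
buffer; unsigned arithmetic makes addresses below entry->paddr fail the
"off < buflen" test automatically.

#include <stdio.h>

int main(void)
{
	unsigned long long entry_paddr = 0x100000ULL;	/* cached region start */
	unsigned long buflen = 0x100000UL;		/* 1 MiB region */
	unsigned long long paddr = 0x180000ULL;		/* requested address */
	unsigned long length = 0x1000UL;		/* 4 KiB */

	size_t off = paddr - entry_paddr;
	if (off < buflen && length <= buflen - off)
		printf("hit at offset 0x%zx into the buffer\n", off);
	else
		printf("miss\n");
	return 0;
}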

Signed-off-by: Petr Tesarik <ptesarik@suse.cz>
---
 cache.c        | 12 ++++++++----
 cache.h        |  3 ++-
 makedumpfile.c |  2 +-
 3 files changed, 11 insertions(+), 6 deletions(-)

diff --git a/cache.c b/cache.c
index ccd67ca..938eda6 100644
--- a/cache.c
+++ b/cache.c
@@ -79,17 +79,20 @@ remove_entry(struct cache *cache, struct cache_entry *entry)
 }
 
 void *
-cache_search(unsigned long long paddr)
+cache_search(unsigned long long paddr, unsigned long length)
 {
 	struct cache_entry *entry;
-	for (entry = used.head; entry; entry = entry->next)
-		if (entry->paddr == paddr) {
+	for (entry = used.head; entry; entry = entry->next) {
+		size_t off = paddr - entry->paddr;
+		if (off < entry->buflen &&
+		    length <= entry->buflen - off) {
 			if (entry != used.head) {
 				remove_entry(&used, entry);
 				add_entry(&used, entry);
 			}
-			return entry->bufptr;
+			return entry->bufptr + off;
 		}
+	}
 
 	return NULL;		/* cache miss */
 }
@@ -111,6 +114,7 @@ cache_alloc(unsigned long long paddr)
 	idx = entry - entries;
 	entry->paddr = paddr;
 	entry->bufptr = cachebuf + idx * info->page_size;
+	entry->buflen = info->page_size;
 	add_entry(&pending, entry);
 
 	return entry;
diff --git a/cache.h b/cache.h
index 0e65f97..792ba6c 100644
--- a/cache.h
+++ b/cache.h
@@ -22,11 +22,12 @@
 struct cache_entry {
 	unsigned long long paddr;
 	void *bufptr;
+	unsigned long buflen;
 	struct cache_entry *next, *prev;
 };
 
 int cache_init(void);
-void *cache_search(unsigned long long paddr);
+void *cache_search(unsigned long long paddr, unsigned long length);
 struct cache_entry *cache_alloc(unsigned long long paddr);
 void cache_add(struct cache_entry *entry);
 void cache_free(struct cache_entry *entry);
diff --git a/makedumpfile.c b/makedumpfile.c
index d778139..f1aad08 100644
--- a/makedumpfile.c
+++ b/makedumpfile.c
@@ -647,7 +647,7 @@ next_page:
 	read_size = MIN(info->page_size - PAGEOFFSET(paddr), size);
 
 	pgaddr = PAGEBASE(paddr);
-	pgbuf = cache_search(pgaddr);
+	pgbuf = cache_search(pgaddr, read_size);
 	if (!pgbuf) {
 		++cache_miss;
 		cached = cache_alloc(pgaddr);
-- 
1.8.4.5



* [PATCH v2 7/8] cache: store mapped regions directly in the cache
  2015-03-06 14:03 [PATCH v2 0/8] Handle mmaped regions in cache Petr Tesarik
                   ` (5 preceding siblings ...)
  2015-03-06 13:10 ` [PATCH v2 6/8] cache: allow arbitrary size of cache entries Petr Tesarik
@ 2015-03-06 13:23 ` Petr Tesarik
  2015-03-06 13:28 ` [PATCH v2 8/8] cleanup: remove unused page_is_fractional Petr Tesarik
                   ` (2 subsequent siblings)
  9 siblings, 0 replies; 20+ messages in thread
From: Petr Tesarik @ 2015-03-06 13:23 UTC (permalink / raw)
  To: Atsushi Kumagai, Michael Holzheu; +Cc: kexec mailing list, Jan Willeke

Avoid copying data between the mmapped region and the cache. To do that,
readmem() tries to map the page before reading it. The mmap path and
the read path are separated: mappage_elf() uses the mmap syscall, and
readpage_elf() uses the read syscall.

If mmap is successful, readmem() stores the mmap address in the cache
entry. Of course, the mapping must not be removed until the cache entry
is evicted, but the cache code has no knowledge about mmap. To solve that
in a flexible way, a discard callback is added to struct cache_entry and
called by cache eviction code.
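
Sketched usage of the new hook (unmap_region() and cache_mmapped_region()
are illustrative names, not the functions added by this patch):

#include <sys/mman.h>
#include "cache.h"

/* The discard hook runs when the entry is evicted in cache_alloc(), so
 * the mapping stays valid exactly as long as the cache may hand out
 * pointers into it. */
static void unmap_region(struct cache_entry *entry)
{
	munmap(entry->bufptr, entry->buflen);
}

static void cache_mmapped_region(struct cache_entry *entry,
				 unsigned long long region_paddr,
				 void *mapped, unsigned long maplen)
{
	entry->paddr   = region_paddr;
	entry->bufptr  = mapped;
	entry->buflen  = maplen;
	entry->discard = unmap_region;	/* called on eviction */
	cache_add(entry);
}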

Signed-off-by: Petr Tesarik <ptesarik@suse.cz>
---
 cache.c        |   3 ++
 cache.h        |   2 ++
 makedumpfile.c | 105 ++++++++++++++++++++++++++++++++-------------------------
 3 files changed, 64 insertions(+), 46 deletions(-)

diff --git a/cache.c b/cache.c
index 938eda6..963eb76 100644
--- a/cache.c
+++ b/cache.c
@@ -108,6 +108,8 @@ cache_alloc(unsigned long long paddr)
 	} else if (used.tail) {
 		entry = used.tail;
 		remove_entry(&used, entry);
+		if (entry->discard)
+			entry->discard(entry);
 	} else
 		return NULL;
 
@@ -115,6 +117,7 @@ cache_alloc(unsigned long long paddr)
 	entry->paddr = paddr;
 	entry->bufptr = cachebuf + idx * info->page_size;
 	entry->buflen = info->page_size;
+	entry->discard = NULL;
 	add_entry(&pending, entry);
 
 	return entry;
diff --git a/cache.h b/cache.h
index 792ba6c..c55cec4 100644
--- a/cache.h
+++ b/cache.h
@@ -24,6 +24,8 @@ struct cache_entry {
 	void *bufptr;
 	unsigned long buflen;
 	struct cache_entry *next, *prev;
+
+	void (*discard)(struct cache_entry *);
 };
 
 int cache_init(void);
diff --git a/makedumpfile.c b/makedumpfile.c
index f1aad08..827c36f 100644
--- a/makedumpfile.c
+++ b/makedumpfile.c
@@ -294,6 +294,12 @@ read_page_desc(unsigned long long paddr, page_desc_t *pd)
 	return TRUE;
 }
 
+static void
+unmap_cache(struct cache_entry *entry)
+{
+	munmap(entry->bufptr, entry->buflen);
+}
+
 static int
 update_mmap_range(off_t offset, int initial) {
 	off_t start_offset, end_offset;
@@ -301,9 +307,6 @@ update_mmap_range(off_t offset, int initial) {
 	off_t max_offset = get_max_file_offset();
 	off_t pt_load_end = offset_to_pt_load_end(offset);
 
-	munmap(info->mmap_buf,
-	       info->mmap_end_offset - info->mmap_start_offset);
-
 	/*
 	 * offset for mmap() must be page aligned.
 	 */
@@ -357,29 +360,45 @@ initialize_mmap(void) {
 	return TRUE;
 }
 
-static int
-read_with_mmap(off_t offset, void *bufptr, unsigned long size) {
-	size_t read_size;
+static char *
+mappage_elf(unsigned long long paddr)
+{
+	off_t offset, offset2;
 
-next_region:
+	if (info->flag_usemmap != MMAP_ENABLE)
+		return NULL;
 
-	if (!is_mapped_with_mmap(offset))
-		if (!update_mmap_range(offset, 0))
-			return FALSE;
+	offset = paddr_to_offset(paddr);
+	if (!offset)
+		return NULL;
+
+	offset2 = paddr_to_offset(paddr + info->page_size);
+	if (!offset2)
+		return NULL;
 
-	read_size = MIN(info->mmap_end_offset - offset, size);
+	if (offset2 - offset != info->page_size)
+		return NULL;
 
-	memcpy(bufptr, info->mmap_buf +
-	       (offset - info->mmap_start_offset), read_size);
+	if (!is_mapped_with_mmap(offset) &&
+	    !update_mmap_range(offset, 0)) {
+		ERRMSG("Can't read the dump memory(%s) with mmap().\n",
+		       info->name_memory);
 
-	offset += read_size;
-	bufptr += read_size;
-	size -= read_size;
+		ERRMSG("This kernel might have some problems about mmap().\n");
+		ERRMSG("read() will be used instead of mmap() from now.\n");
 
-	if (size > 0)
-		goto next_region;
+		/*
+		 * Fall back to read().
+		 */
+		info->flag_usemmap = MMAP_DISABLE;
+		return NULL;
+	}
 
-	return TRUE;
+	if (offset < info->mmap_start_offset ||
+	    offset + info->page_size > info->mmap_end_offset)
+		return NULL;
+
+	return info->mmap_buf + (offset - info->mmap_start_offset);
 }
 
 static int
@@ -387,33 +406,16 @@ read_from_vmcore(off_t offset, void *bufptr, unsigned long size)
 {
 	const off_t failed = (off_t)-1;
 
-	if (info->flag_usemmap == MMAP_ENABLE &&
-	    page_is_fractional(offset) == FALSE) {
-		if (!read_with_mmap(offset, bufptr, size)) {
-			ERRMSG("Can't read the dump memory(%s) with mmap().\n",
-			       info->name_memory);
-
-			ERRMSG("This kernel might have some problems about mmap().\n");
-			ERRMSG("read() will be used instead of mmap() from now.\n");
-
-			/*
-			 * Fall back to read().
-			 */
-			info->flag_usemmap = MMAP_DISABLE;
-			read_from_vmcore(offset, bufptr, size);
-		}
-	} else {
-		if (lseek(info->fd_memory, offset, SEEK_SET) == failed) {
-			ERRMSG("Can't seek the dump memory(%s). (offset: %llx) %s\n",
-			       info->name_memory, (unsigned long long)offset, strerror(errno));
-			return FALSE;
-		}
+	if (lseek(info->fd_memory, offset, SEEK_SET) == failed) {
+		ERRMSG("Can't seek the dump memory(%s). (offset: %llx) %s\n",
+		       info->name_memory, (unsigned long long)offset, strerror(errno));
+		return FALSE;
+	}
 
-		if (read(info->fd_memory, bufptr, size) != size) {
-			ERRMSG("Can't read the dump memory(%s). %s\n",
-			       info->name_memory, strerror(errno));
-			return FALSE;
-		}
+	if (read(info->fd_memory, bufptr, size) != size) {
+		ERRMSG("Can't read the dump memory(%s). %s\n",
+		       info->name_memory, strerror(errno));
+		return FALSE;
 	}
 
 	return TRUE;
@@ -662,7 +664,18 @@ next_page:
 			if (!readpage_sadump(pgaddr, pgbuf))
 				goto error_cached;
 		} else {
-			if (!readpage_elf(pgaddr, pgbuf))
+			char *mapbuf = mappage_elf(pgaddr);
+			size_t mapoff;
+
+			if (mapbuf) {
+				pgbuf = mapbuf;
+				mapoff = mapbuf - info->mmap_buf;
+				cached->paddr = pgaddr - mapoff;
+				cached->bufptr = info->mmap_buf;
+				cached->buflen = info->mmap_end_offset -
+					info->mmap_start_offset;
+				cached->discard = unmap_cache;
+			} else if (!readpage_elf(pgaddr, pgbuf))
 				goto error_cached;
 		}
 		cache_add(cached);
-- 
1.8.4.5



* [PATCH v2 8/8] cleanup: remove unused page_is_fractional
  2015-03-06 14:03 [PATCH v2 0/8] Handle mmaped regions in cache Petr Tesarik
                   ` (6 preceding siblings ...)
  2015-03-06 13:23 ` [PATCH v2 7/8] cache: store mapped regions directly in the cache Petr Tesarik
@ 2015-03-06 13:28 ` Petr Tesarik
  2015-03-09 16:08 ` [PATCH v2 0/8] Handle mmaped regions in cache Michael Holzheu
  2015-03-09 16:08 ` [PATCH v2 0/8] Handle mmaped regions in cache [more analysis] Michael Holzheu
  9 siblings, 0 replies; 20+ messages in thread
From: Petr Tesarik @ 2015-03-06 13:28 UTC (permalink / raw)
  To: Atsushi Kumagai, Michael Holzheu; +Cc: kexec mailing list, Jan Willeke

This function is no longer needed, because mappage_elf() now checks
whether the region is contiguous both in physical addresses and in
file offsets.

Signed-off-by: Petr Tesarik <ptesarik@suse.cz>
---
 elf_info.c | 16 ----------------
 elf_info.h |  2 --
 2 files changed, 18 deletions(-)

diff --git a/elf_info.c b/elf_info.c
index 8bce942..f584393 100644
--- a/elf_info.c
+++ b/elf_info.c
@@ -583,22 +583,6 @@ offset_to_pt_load_end(off_t offset)
 	return pt_load_end;
 }
 
-/*
- * Judge whether the page is fractional or not.
- */
-int
-page_is_fractional(off_t page_offset)
-{
-	if (page_offset % info->page_size != 0)
-		return TRUE;
-
-	if (offset_to_pt_load_end(page_offset) - page_offset
-	    < info->page_size)
-		return TRUE;
-
-	return FALSE;
-}
-
 unsigned long long
 vaddr_to_paddr_general(unsigned long long vaddr)
 {
diff --git a/elf_info.h b/elf_info.h
index e712253..7ae6bf8 100644
--- a/elf_info.h
+++ b/elf_info.h
@@ -40,8 +40,6 @@ unsigned long long vaddr_to_paddr_general(unsigned long long vaddr);
 off_t vaddr_to_offset_slow(int fd, char *filename, unsigned long long vaddr);
 unsigned long long get_max_paddr(void);
 
-int page_is_fractional(off_t page_offset);
-
 int get_elf64_ehdr(int fd, char *filename, Elf64_Ehdr *ehdr);
 int get_elf32_ehdr(int fd, char *filename, Elf32_Ehdr *ehdr);
 int get_elf_info(int fd, char *filename);
-- 
1.8.4.5



* [PATCH v2 0/8] Handle mmaped regions in cache
@ 2015-03-06 14:03 Petr Tesarik
  2015-03-06  8:23 ` [PATCH v2 1/8] cache: get rid of search loop in cache_add() Petr Tesarik
                   ` (9 more replies)
  0 siblings, 10 replies; 20+ messages in thread
From: Petr Tesarik @ 2015-03-06 14:03 UTC (permalink / raw)
  To: Atsushi Kumagai, Michael Holzheu; +Cc: kexec mailing list, Jan Willeke

Because all pages must go into the cache, data is unnecessarily
copied from mmapped regions into the cache. Avoid this copying by storing
the mmapped regions directly in the cache.

First, the cache code needs a cleanup and a clarification of the concept,
especially the meaning of the pending list (allocated cache entries
whose content is not yet valid).

Second, the cache must be able to handle differently sized objects
so that it can store individual pages as well as mmapped regions.

Last, the cache eviction code must be extended to allow either
reusing the read buffer or unmapping the region.

Changelog:
  v2: cache cleanup _and_ actual mmap implementation
  v1: only the cache cleanup

Petr Tesarik (8):
  cache: get rid of search loop in cache_add()
  cache: allow to return a page to the pool
  cache: do not allocate from the pending list
  cache: add hit/miss statistics to the final report
  cache: allocate buffers in one big chunk
  cache: allow arbitrary size of cache entries
  cache: store mapped regions directly in the cache
  cleanup: remove unused page_is_fractional

 cache.c        |  81 +++++++++++++++++----------------
 cache.h        |  16 +++++--
 elf_info.c     |  16 -------
 elf_info.h     |   2 -
 makedumpfile.c | 138 ++++++++++++++++++++++++++++++++++-----------------------
 5 files changed, 138 insertions(+), 115 deletions(-)

-- 
1.8.4.5



* Re: [PATCH v2 0/8] Handle mmaped regions in cache
  2015-03-06 14:03 [PATCH v2 0/8] Handle mmaped regions in cache Petr Tesarik
                   ` (7 preceding siblings ...)
  2015-03-06 13:28 ` [PATCH v2 8/8] cleanup: remove unused page_is_fractional Petr Tesarik
@ 2015-03-09 16:08 ` Michael Holzheu
  2015-03-09 16:08 ` [PATCH v2 0/8] Handle mmaped regions in cache [more analysis] Michael Holzheu
  9 siblings, 0 replies; 20+ messages in thread
From: Michael Holzheu @ 2015-03-09 16:08 UTC (permalink / raw)
  To: Petr Tesarik; +Cc: kexec mailing list, Atsushi Kumagai, Jan Willeke

Hello Petr,

With your patches we get a 5-10 percent performance improvement on s390.

Two examples:

$ time ./makedumpfile  -d 31 /proc/vmcore  /dev/null -f

                  user   sys  = total
=====================================
With patches      0.156  0.248  0.404
Without patches   0.168  0.274  0.442  <- No idea why we save system time?

$ time ./makedumpfile  -d 0 /proc/vmcore  /dev/null -f

                  user   sys  = total
=====================================
With patches      0.683  0.020  0.703
Without patches   0.714  0.019  0.733

Regards
Michael

On Fri, 6 Mar 2015 15:03:12 +0100
Petr Tesarik <ptesarik@suse.cz> wrote:

> Because all pages must go into the cache, data is unnecessarily
> copied from mmapped regions to cache. Avoid this copying by storing
> the mmapped regions directly in the cache.
> 
> First, the cache code needs a cleanup and a clarification of the concept,
> especially the meaning of the pending list (allocated cache entries
> whose content is not yet valid).
> 
> Second, the cache must be able to handle differently sized objects
> so that it can store individual pages as well as mmapped regions.
> 
> Last, the cache eviction code must be extended to allow either
> reusing the read buffer or unmapping the region.
> 
> Changelog:
>   v2: cache cleanup _and_ actual mmap implementation
>   v1: only the cache cleanup
> 
> Petr Tesarik (8):
>   cache: get rid of search loop in cache_add()
>   cache: allow to return a page to the pool
>   cache: do not allocate from the pending list
>   cache: add hit/miss statistics to the final report
>   cache: allocate buffers in one big chunk
>   cache: allow arbitrary size of cache entries
>   cache: store mapped regions directly in the cache
>   cleanup: remove unused page_is_fractional
> 
>  cache.c        |  81 +++++++++++++++++----------------
>  cache.h        |  16 +++++--
>  elf_info.c     |  16 -------
>  elf_info.h     |   2 -
>  makedumpfile.c | 138 ++++++++++++++++++++++++++++++++++-----------------------
>  5 files changed, 138 insertions(+), 115 deletions(-)
> 



* Re: [PATCH v2 0/8] Handle mmaped regions in cache [more analysis]
  2015-03-06 14:03 [PATCH v2 0/8] Handle mmaped regions in cache Petr Tesarik
                   ` (8 preceding siblings ...)
  2015-03-09 16:08 ` [PATCH v2 0/8] Handle mmaped regions in cache Michael Holzheu
@ 2015-03-09 16:08 ` Michael Holzheu
  2015-03-12 13:56   ` Michael Holzheu
  2015-03-12 15:38   ` Petr Tesarik
  9 siblings, 2 replies; 20+ messages in thread
From: Michael Holzheu @ 2015-03-09 16:08 UTC (permalink / raw)
  To: Petr Tesarik; +Cc: kexec mailing list, Atsushi Kumagai, Jan Willeke

Hello Petr,

With your patches I now used "perf record" and "perf stat"
to check where the CPU time is consumed for -d31 and -d0.

For -d31 the read case is better and for -d0 the mmap case
is better.

  $ time ./makedumpfile  -d 31 /proc/vmcore  /dev/null -f [--non-mmap]

                    user   sys   = total
  =======================================
  mmap              0.156  0.248   0.404
  non-mmap          0.090  0.180   0.270

  $ time ./makedumpfile  -d 0 /proc/vmcore  /dev/null -f [--non-mmap]

                    user   sys   = total
  ======================================
  mmap              0.637  0.018   0.655
  non-mmap          0.275  1.153   1.428

As already said, we think the reason is that for -d0 we issue
only a small number of mmap/munmap calls because the mmap
chunks are larger than the read chunks.

For -d31 memory is fragmented and we issue lots of small
mmap/munmap calls. Because munmap (at least on s390) is a
very expensive operation and we need two calls (mmap/munmap),
the mmap mode is slower than the read mode.

I counted the mmap and read system calls with "perf stat":

                     mmap   unmap   read =    sum
  ===============================================
  mmap -d0            482     443    165     1090          
  mmap -d31         13454   13414    165    27033 
  non-mmap -d0         34       3 458917   458954 
  non-mmap -d31        34       3  74273    74310

Here the actual results I got with "perf record":

$ time ./makedumpfile  -d 31 /proc/vmcore  /dev/null -f

  Output of "perf report" for mmap case:

   /* Most time spent for unmap in kernel */
   29.75%  makedumpfile  [kernel.kallsyms]  [k] unmap_single_vma
    9.84%  makedumpfile  [kernel.kallsyms]  [k] remap_pfn_range
    8.49%  makedumpfile  [kernel.kallsyms]  [k] vm_normal_page

   /* Still some mmap overhead in makedumpfile readmem() */
   21.56%  makedumpfile  makedumpfile       [.] readmem
    8.49%  makedumpfile  makedumpfile       [.] write_kdump_pages_cyclic

  Output of "perf report" for non-mmap case:

   /* Time spent for sys_read (that needs also two copy operations on s390 :() */
   25.32%  makedumpfile  [kernel.kallsyms]  [k] memcpy_real
   22.74%  makedumpfile  [kernel.kallsyms]  [k] __copy_to_user

   /* readmem() for read path is cheaper ? */
   13.49%  makedumpfile  makedumpfile       [.] write_kdump_pages_cyclic
    4.53%  makedumpfile  makedumpfile       [.] readmem

$ time ./makedumpfile  -d 0 /proc/vmcore  /dev/null -f

  Output of "perf report" for mmap case:

   /* Almost no kernel time because we issue very few system calls */
    0.61%  makedumpfile  [kernel.kallsyms]  [k] unmap_single_vma
    0.61%  makedumpfile  [kernel.kallsyms]  [k] sysc_do_svc

   /* Almost all time consumed in user space */
   84.64%  makedumpfile  makedumpfile       [.] readmem
    8.82%  makedumpfile  makedumpfile       [.] write_cache

  Output of "perf report" for non-mmap case:

   /* Time spent for sys_read (that needs also two copy operations on s390) */
   31.50%  makedumpfile  [kernel.kallsyms]  [k] memcpy_real
   29.33%  makedumpfile  [kernel.kallsyms]  [k] __copy_to_user

   /* Very little user space time */
    3.87%  makedumpfile  makedumpfile       [.] write_cache
    3.82%  makedumpfile  makedumpfile       [.] readmem

I hope this analysis helps more than it confuses :-)

As a conclusion, we could think of mapping larger chunks
also for the fragmented case of -d 31 to reduce the amount
of mmap/munmap calls.

Another open question was why the mmap case consumes more CPU
time in readmem() than the read case. Our theory is that the
first memory access is slower because it is not in the HW
cache. For the mmap case userspace issues the first access (copy
to the makedumpfile cache) and for the read case the kernel issues
the first access (memcpy_real/copy_to_user). Therefore the
cache miss is accounted to userspace for mmap and to kernel for
read.

And last but not least, perhaps on s390 we could replace
the bounce buffer used for memcpy_real()/copy_to_user() by
some more intelligent solution.

Best Regards
Michael

On Fri, 6 Mar 2015 15:03:12 +0100
Petr Tesarik <ptesarik@suse.cz> wrote:

> Because all pages must go into the cache, data is unnecessarily
> copied from mmapped regions to cache. Avoid this copying by storing
> the mmapped regions directly in the cache.
> 
> First, the cache code needs a cleanup and a clarification of the concept,
> especially the meaning of the pending list (allocated cache entries
> whose content is not yet valid).
> 
> Second, the cache must be able to handle differently sized objects
> so that it can store individual pages as well as mmapped regions.
> 
> Last, the cache eviction code must be extended to allow either
> reusing the read buffer or unmapping the region.
> 
> Changelog:
>   v2: cache cleanup _and_ actual mmap implementation
>   v1: only the cache cleanup
> 
> Petr Tesarik (8):
>   cache: get rid of search loop in cache_add()
>   cache: allow to return a page to the pool
>   cache: do not allocate from the pending list
>   cache: add hit/miss statistics to the final report
>   cache: allocate buffers in one big chunk
>   cache: allow arbitrary size of cache entries
>   cache: store mapped regions directly in the cache
>   cleanup: remove unused page_is_fractional
> 
>  cache.c        |  81 +++++++++++++++++----------------
>  cache.h        |  16 +++++--
>  elf_info.c     |  16 -------
>  elf_info.h     |   2 -
>  makedumpfile.c | 138 ++++++++++++++++++++++++++++++++++-----------------------
>  5 files changed, 138 insertions(+), 115 deletions(-)
> 



* Re: [PATCH v2 0/8] Handle mmaped regions in cache [more analysis]
  2015-03-09 16:08 ` [PATCH v2 0/8] Handle mmaped regions in cache [more analysis] Michael Holzheu
@ 2015-03-12 13:56   ` Michael Holzheu
  2015-03-12 15:38   ` Petr Tesarik
  1 sibling, 0 replies; 20+ messages in thread
From: Michael Holzheu @ 2015-03-12 13:56 UTC (permalink / raw)
  To: Michael Holzheu
  Cc: kexec mailing list, Atsushi Kumagai, Petr Tesarik, Jan Willeke

On Mon, 9 Mar 2015 17:08:58 +0100
Michael Holzheu <holzheu@linux.vnet.ibm.com> wrote:

> Hello Petr,
> 

[snip]

> As a conclusion, we could think of mapping larger chunks
> also for the fragmented case of -d 31 to reduce the amount
> of mmap/munmap calls.
> 

FYI: I did some more tests and I am no longer sure if the above
conclusion was correct.

A simple "copy" program that reads or maps/unmaps every page
from /proc/vmcore and then writes it to /dev/null is faster
with mmap()/munmap() than with using read():

read:
-----
# time ./copy /dev/null read

real    0m1.072s
user    0m0.010s
sys     0m1.054s

# perf stat -e syscalls:sys_enter_old_mmap,syscalls:sys_enter_munmap,syscalls:sys_enter_read ./copy /dev/null read            
                 8      syscalls:sys_enter_old_mmap                                   
                 1      syscalls:sys_enter_munmap                                   
            458753      syscalls:sys_enter_read                                     
       1.405457536 seconds time elapsed

mmap:
-----
# time ./copy /dev/null mmap

real    0m0.947s
user    0m0.314s
sys     0m0.631s

# perf stat -e syscalls:sys_enter_old_mmap,syscalls:sys_enter_munmap,syscalls:sys_enter_read ./copy /dev/null mmap
            458760      syscalls:sys_enter_old_mmap                                   
            458753      syscalls:sys_enter_munmap                                   
                 1      syscalls:sys_enter_read                                     
       1.175956735 seconds time elapsed
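
A minimal sketch of such a copy tool (not the exact program used here;
page size, alignment and error handling simplified) might look like this:

#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/mman.h>
#include <unistd.h>

#define PAGE_SZ 4096UL		/* s390 page size; adjust as needed */

int main(int argc, char **argv)
{
	static char buf[PAGE_SZ];
	int in, out, use_mmap;
	off_t off = 0;
	ssize_t n;

	if (argc != 3) {
		fprintf(stderr, "usage: %s <outfile> read|mmap\n", argv[0]);
		return 1;
	}
	use_mmap = strcmp(argv[2], "mmap") == 0;

	in = open("/proc/vmcore", O_RDONLY);
	out = open(argv[1], O_WRONLY);		/* e.g. /dev/null */
	if (in < 0 || out < 0) {
		perror("open");
		return 1;
	}

	for (;;) {
		if (use_mmap) {
			/* one mmap()/munmap() pair per page */
			void *p = mmap(NULL, PAGE_SZ, PROT_READ,
				       MAP_PRIVATE, in, off);
			if (p == MAP_FAILED)
				break;
			memcpy(buf, p, PAGE_SZ);
			munmap(p, PAGE_SZ);
			n = PAGE_SZ;
			off += PAGE_SZ;
		} else {
			/* one read() per page */
			n = read(in, buf, PAGE_SZ);
			if (n <= 0)
				break;
		}
		if (write(out, buf, n) != n)
			break;
	}
	return 0;
}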

Regards,
Michael



* Re: [PATCH v2 0/8] Handle mmaped regions in cache [more analysis]
  2015-03-09 16:08 ` [PATCH v2 0/8] Handle mmaped regions in cache [more analysis] Michael Holzheu
  2015-03-12 13:56   ` Michael Holzheu
@ 2015-03-12 15:38   ` Petr Tesarik
  2015-03-13  4:10     ` Atsushi Kumagai
  2015-03-13 16:19     ` Michael Holzheu
  1 sibling, 2 replies; 20+ messages in thread
From: Petr Tesarik @ 2015-03-12 15:38 UTC (permalink / raw)
  To: Michael Holzheu; +Cc: Atsushi Kumagai, kexec mailing list, Jan Willeke

On Mon, 9 Mar 2015 17:08:58 +0100
Michael Holzheu <holzheu@linux.vnet.ibm.com> wrote:

> Hello Petr,
> 
> With your patches I now used "perf record" and "perf stat"
> to check where the CPU time is consumed for -d31 and -d0.
> 
> For -d31 the read case is better and for -d0 the mmap case
> is better.
> 
>[...]
> 
> As already said, we think the reason is that for -d0 we issue
> only a small number of mmap/munmap calls because the mmap
> chunks are larger than the read chunks.

This is very likely.

> For -d31 memory is fragmented and we issue lots of small
> mmap/munmap calls. Because munmap (at least on s390) is a
> very expensive operation and we need two calls (mmap/munmap),
> the mmap mode is slower than the read mode.

Yes. And it may provide an explanation why my patch set improves the
situation. By keeping the mmapped regions in the cache, rather than
individual pages copied out of the mmap region, the cache is in fact
much larger, resulting in fewer mmap/munmap syscalls.

> I counted the mmap and read system calls with "perf stat":
> 
>                      mmap   unmap   read =    sum
>   ===============================================
>   mmap -d0            482     443    165     1090          
>   mmap -d31         13454   13414    165    27033 
>   non-mmap -d0         34       3 458917   458954 
>   non-mmap -d31        34       3  74273    74310

If your VM has 1.5 GiB of RAM, then the numbers for -d0 look
reasonable. For -d31, we should be able to do better than this
by allocating more cache slots and improving the algorithm.
I originally didn't deem it worth the effort, but seeing almost
30 times more mmaps than with -d0 may change my mind.

> Here the actual results I got with "perf record":
> 
> $ time ./makedumpfile  -d 31 /proc/vmcore  /dev/null -f
> 
>   Output of "perf report" for mmap case:
> 
>    /* Most time spent for unmap in kernel */
>    29.75%  makedumpfile  [kernel.kallsyms]  [k] unmap_single_vma
>     9.84%  makedumpfile  [kernel.kallsyms]  [k] remap_pfn_range
>     8.49%  makedumpfile  [kernel.kallsyms]  [k] vm_normal_page
> 
>    /* Still some mmap overhead in makedumpfile readmem() */
>    21.56%  makedumpfile  makedumpfile       [.] readmem

This number is interesting. Did you compile makedumpfile with
optimizations? If yes, then this number probably includes some
functions which were inlined.

>     8.49%  makedumpfile  makedumpfile       [.] write_kdump_pages_cyclic
> 
>   Output of "perf report" for non-mmap case:
> 
>    /* Time spent for sys_read (that needs also two copy operations on s390 :() */
>    25.32%  makedumpfile  [kernel.kallsyms]  [k] memcpy_real
>    22.74%  makedumpfile  [kernel.kallsyms]  [k] __copy_to_user
> 
>    /* readmem() for read path is cheaper ? */
>    13.49%  makedumpfile  makedumpfile       [.] write_kdump_pages_cyclic
>     4.53%  makedumpfile  makedumpfile       [.] readmem

Yes, much lower overhead of readmem is strange. For a moment I
suspected wrong accounting of the page fault handler, but then I
realized that for /proc/vmcore, all page table entries are created
with the present bit set already, so there are no page faults...

I haven't had time yet to set up a system for reproduction, but I'll
try to identify what's eating up the CPU time in readmem().

>[...]
> I hope this analysis helps more than it confuses :-)
> 
> As a conclusion, we could think of mapping larger chunks
> also for the fragmented case of -d 31 to reduce the amount
> of mmap/munmap calls.

I agree in general. Memory mapped through /proc/vmcore does not
increase run-time memory requirements, because it only adds a mapping
to the old kernel's memory. The only limiting factor is the virtual
address space. On many architectures, this is no issue at all, and we
could simply map the whole file at the beginning. On some architectures,
the virtual address space is smaller than possible physical RAM, so
this approach would not work for them.

> Another open question was why the mmap case consumes more CPU
> time in readmem() than the read case. Our theory is that the
> first memory access is slower because it is not in the HW
> cache. For the mmap case userspace issues the first access (copy
> to makdumpfile cache) and for the read case the kernel issues
> the first access (memcpy_real/copy_to_user). Therefore the
> cache miss is accounted to userspace for mmap and to kernel for
> read.

I have no idea how to measure this on s390. On x86_64 I would add some
asm code to read the TSC before and after the memory access instruction. I
guess there is a similar counter on s390. Suggestions?
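
Something along these lines, for example (a rough sketch using the
compiler's TSC intrinsic rather than hand-written asm; a STCK-based or
clock_gettime-based variant would play the same role on s390):

#include <stdint.h>
#include <stdio.h>
#include <x86intrin.h>

/* Time a single memory access in TSC cycles (x86_64 only). */
static uint64_t time_first_access(volatile char *p)
{
	unsigned int aux;
	uint64_t start, end;

	start = __rdtscp(&aux);
	(void)*p;			/* the access being measured */
	end = __rdtscp(&aux);
	return end - start;
}

int main(void)
{
	static char page[4096];

	printf("first access: %llu cycles\n",
	       (unsigned long long)time_first_access(page));
	return 0;
}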

> And last but not least, perhaps on s390 we could replace
> the bounce buffer used for memcpy_real()/copy_to_user() by
> some more intelligent solution.

Which would then improve the non-mmap times even more, right?

Petr T


* RE: [PATCH v2 0/8] Handle mmaped regions in cache [more analysis]
  2015-03-12 15:38   ` Petr Tesarik
@ 2015-03-13  4:10     ` Atsushi Kumagai
  2015-03-13  8:04       ` Petr Tesarik
  2015-03-13 16:19     ` Michael Holzheu
  1 sibling, 1 reply; 20+ messages in thread
From: Atsushi Kumagai @ 2015-03-13  4:10 UTC (permalink / raw)
  To: ptesarik; +Cc: holzheu, kexec, willeke

Hello,

(Note: my email address has changed.)

In x86_64, calling ioremap/iounmap per page in copy_oldmem_page()
causes big performance degradation, so mmap() was introduced on
/proc/vmcore. However, there is no big difference between read() and
mmap() in s390 since it doesn't need ioremap/iounmap in copy_oldmem_page(),
so other issues have been revealed, right?

[...]

>> I counted the mmap and read system calls with "perf stat":
>>
>>                      mmap   unmap   read =    sum
>>   ===============================================
>>   mmap -d0            482     443    165     1090
>>   mmap -d31         13454   13414    165    27033
>>   non-mmap -d0         34       3 458917   458954
>>   non-mmap -d31        34       3  74273    74310
>
>If your VM has 1.5 GiB of RAM, then the numbers for -d0 look
>reasonable. For -d31, we should be able to do better than this
>by allocating more cache slots and improving the algorithm.
>I originally didn't deem it worth the effort, but seeing almost
>30 times more mmaps than with -d0 may change my mind.

Are you going to do it as a v3 patch?
I'm going to release v1.5.8 soon, so I'll adopt the v2 patches
unless you plan to update them.


Thanks
Atsushi Kumagai

>
>> Here the actual results I got with "perf record":
>>
>> $ time ./makedumpfile  -d 31 /proc/vmcore  /dev/null -f
>>
>>   Output of "perf report" for mmap case:
>>
>>    /* Most time spent for unmap in kernel */
>>    29.75%  makedumpfile  [kernel.kallsyms]  [k] unmap_single_vma
>>     9.84%  makedumpfile  [kernel.kallsyms]  [k] remap_pfn_range
>>     8.49%  makedumpfile  [kernel.kallsyms]  [k] vm_normal_page
>>
>>    /* Still some mmap overhead in makedumpfile readmem() */
>>    21.56%  makedumpfile  makedumpfile       [.] readmem
>
>This number is interesting. Did you compile makedumpfile with
>optimizations? If yes, then this number probably includes some
>functions which were inlined.
>
>>     8.49%  makedumpfile  makedumpfile       [.] write_kdump_pages_cyclic
>>
>>   Output of "perf report" for non-mmap case:
>>
>>    /* Time spent for sys_read (that needs also two copy operations on s390 :() */
>>    25.32%  makedumpfile  [kernel.kallsyms]  [k] memcpy_real
>>    22.74%  makedumpfile  [kernel.kallsyms]  [k] __copy_to_user
>>
>>    /* readmem() for read path is cheaper ? */
>>    13.49%  makedumpfile  makedumpfile       [.] write_kdump_pages_cyclic
>>     4.53%  makedumpfile  makedumpfile       [.] readmem
>
>Yes, much lower overhead of readmem is strange. For a moment I
>suspected wrong accounting of the page fault handler, but then I
>realized that for /proc/vmcore, all page table entries are created
>with the present bit set already, so there are no page faults...
>
>I haven't had time yet to set up a system for reproduction, but I'll
>try to identify what's eating up the CPU time in readmem().
>
>>[...]
>> I hope this analysis helps more than it confuses :-)
>>
>> As a conclusion, we could think of mapping larger chunks
>> also for the fragmented case of -d 31 to reduce the amount
>> of mmap/munmap calls.
>
>I agree in general. Memory mapped through /proc/vmcore does not
>increase run-time memory requirements, because it only adds a mapping
>to the old kernel's memory. The only limiting factor is the virtual
>address space. On many architectures, this is no issue at all, and we
>could simply map the whole file at beginning. On some architectures,
>the virtual address space is smaller than possible physical RAM, so
>this approach would not work for them.
>
>> Another open question was why the mmap case consumes more CPU
>> time in readmem() than the read case. Our theory is that the
>> first memory access is slower because it is not in the HW
>> cache. For the mmap case userspace issues the first access (copy
>> to the makedumpfile cache) and for the read case the kernel issues
>> the first access (memcpy_real/copy_to_user). Therefore the
>> cache miss is accounted to userspace for mmap and to kernel for
>> read.
>
>I have no idea how to measure this on s390. On x86_64 I would add some
>asm code to read TSC before and after the memory access instruction. I
>guess there is a similar counter on s390. Suggestions?
>
>> And last but not least, perhaps on s390 we could replace
>> the bounce buffer used for memcpy_real()/copy_to_user() by
>> some more intelligent solution.
>
>Which would then improve the non-mmap times even more, right?
>
>Petr T


* Re: [PATCH v2 0/8] Handle mmaped regions in cache [more analysis]
  2015-03-13  4:10     ` Atsushi Kumagai
@ 2015-03-13  8:04       ` Petr Tesarik
  2015-03-16  5:14         ` Atsushi Kumagai
  0 siblings, 1 reply; 20+ messages in thread
From: Petr Tesarik @ 2015-03-13  8:04 UTC (permalink / raw)
  To: Atsushi Kumagai; +Cc: holzheu, kexec, willeke

On Fri, 13 Mar 2015 04:10:22 +0000
Atsushi Kumagai <ats-kumagai@wm.jp.nec.com> wrote:

> Hello,
> 
> (Note: my email address has changed.)
> 
> In x86_64, calling ioremap/iounmap per page in copy_oldmem_page()
> causes big performance degradation, so mmap() was introduced on
> /proc/vmcore. However, there is no big difference between read() and
> mmap() in s390 since it doesn't need ioremap/iounmap in copy_oldmem_page(),
> so other issues have been revealed, right?
> 
> [...]
> 
> >> I counted the mmap and read system calls with "perf stat":
> >>
> >>                      mmap   unmap   read =    sum
> >>   ===============================================
> >>   mmap -d0            482     443    165     1090
> >>   mmap -d31         13454   13414    165    27033
> >>   non-mmap -d0         34       3 458917   458954
> >>   non-mmap -d31        34       3  74273    74310
> >
> >If your VM has 1.5 GiB of RAM, then the numbers for -d0 look
> >reasonable. For -d31, we should be able to do better than this
> >by allocating more cache slots and improving the algorithm.
> >I originally didn't deem it worth the effort, but seeing almost
> >30 times more mmaps than with -d0 may change my mind.
> 
> Are you going to do it as a v3 patch?

No. Tuning the caching algorithm requires a lot of research. I plan to
do it, but testing it with all scenarios (and tuning the algorithm
based on the results) will probably take weeks. I don't think it makes
sense to wait for it.

> I'm going to release v1.5.8 soon, so I'll adopt the v2 patches
> unless you plan to update them.

Since v2 already brings some performance gain, I appreciate it if you
can adopt it for v1.5.8.

Thank you very much,
Petr Tesarik

> Thanks
> Atsushi Kumagai
> 
> >
> >> Here the actual results I got with "perf record":
> >>
> >> $ time ./makedumpfile  -d 31 /proc/vmcore  /dev/null -f
> >>
> >>   Output of "perf report" for mmap case:
> >>
> >>    /* Most time spent for unmap in kernel */
> >>    29.75%  makedumpfile  [kernel.kallsyms]  [k] unmap_single_vma
> >>     9.84%  makedumpfile  [kernel.kallsyms]  [k] remap_pfn_range
> >>     8.49%  makedumpfile  [kernel.kallsyms]  [k] vm_normal_page
> >>
> >>    /* Still some mmap overhead in makedumpfile readmem() */
> >>    21.56%  makedumpfile  makedumpfile       [.] readmem
> >
> >This number is interesting. Did you compile makedumpfile with
> >optimizations? If yes, then this number probably includes some
> >functions which were inlined.
> >
> >>     8.49%  makedumpfile  makedumpfile       [.]
> >> write_kdump_pages_cyclic
> >>
> >>   Output of "perf report" for non-mmap case:
> >>
> >>    /* Time spent for sys_read (that needs also two copy operations
> >> on s390 :() */ 25.32%  makedumpfile  [kernel.kallsyms]  [k]
> >> memcpy_real 22.74%  makedumpfile  [kernel.kallsyms]  [k]
> >> __copy_to_user
> >>
> >>    /* readmem() for read path is cheaper ? */
> >>    13.49%  makedumpfile  makedumpfile       [.]
> >> write_kdump_pages_cyclic 4.53%  makedumpfile  makedumpfile
> >> [.] readmem
> >
> >Yes, much lower overhead of readmem is strange. For a moment I
> >suspected wrong accounting of the page fault handler, but then I
> >realized that for /proc/vmcore, all page table entries are created
> >with the present bit set already, so there are no page faults...
> >
> >I haven't had time yet to set up a system for reproduction, but I'll
> >try to identify what's eating up the CPU time in readmem().
> >
> >>[...]
> >> I hope this analysis helps more than it confuses :-)
> >>
> >> As a conclusion, we could think of mapping larger chunks
> >> also for the fragmented case of -d 31 to reduce the amount
> >> of mmap/munmap calls.
> >
> >I agree in general. Memory mapped through /proc/vmcore does not
> >increase run-time memory requirements, because it only adds a mapping
> >to the old kernel's memory. The only limiting factor is the virtual
> >address space. On many architectures, this is no issue at all, and we
> >could simply map the whole file at beginning. On some architectures,
> >the virtual address space is smaller than possible physical RAM, so
> >this approach would not work for them.
> >
> >> Another open question was why the mmap case consumes more CPU
> >> time in readmem() than the read case. Our theory is that the
> >> first memory access is slower because it is not in the HW
> >> cache. For the mmap case userspace issues the first access (copy
> >> to the makedumpfile cache) and for the read case the kernel issues
> >> the first access (memcpy_real/copy_to_user). Therefore the
> >> cache miss is accounted to userspace for mmap and to kernel for
> >> read.
> >
> >I have no idea how to measure this on s390. On x86_64 I would add
> >some asm code to read TSC before and after the memory access
> >instruction. I guess there is a similar counter on s390. Suggestions?
> >
> >> And last but not least, perhaps on s390 we could replace
> >> the bounce buffer used for memcpy_real()/copy_to_user() by
> >> some more intelligent solution.
> >
> >Which would then improve the non-mmap times even more, right?
> >
> >Petr T



* Re: [PATCH v2 0/8] Handle mmaped regions in cache [more analysis]
  2015-03-12 15:38   ` Petr Tesarik
  2015-03-13  4:10     ` Atsushi Kumagai
@ 2015-03-13 16:19     ` Michael Holzheu
  1 sibling, 0 replies; 20+ messages in thread
From: Michael Holzheu @ 2015-03-13 16:19 UTC (permalink / raw)
  To: Petr Tesarik; +Cc: Atsushi Kumagai, kexec mailing list, Jan Willeke

On Thu, 12 Mar 2015 16:38:22 +0100
Petr Tesarik <ptesarik@suse.cz> wrote:

> On Mon, 9 Mar 2015 17:08:58 +0100
> Michael Holzheu <holzheu@linux.vnet.ibm.com> wrote:

[snip]

> > I counted the mmap and read system calls with "perf stat":
> > 
> >                      mmap   unmap   read =    sum
> >   ===============================================
> >   mmap -d0            482     443    165     1090          
> >   mmap -d31         13454   13414    165    27033 
> >   non-mmap -d0         34       3 458917   458954 
> >   non-mmap -d31        34       3  74273    74310
> 
> If your VM has 1.5 GiB of RAM, then the numbers for -d0 look
> reasonable. 

I have 1792 MiB RAM.

> For -d31, we should be able to do better than this
> by allocating more cache slots and improving the algorithm.
> I originally didn't deem it worth the effort, but seeing almost
> 30 times more mmaps than with -d0 may change my mind.

ok.

> 
> > Here the actual results I got with "perf record":
> > 
> > $ time ./makedumpfile  -d 31 /proc/vmcore  /dev/null -f
> > 
> >   Output of "perf report" for mmap case:
> > 
> >    /* Most time spent for unmap in kernel */
> >    29.75%  makedumpfile  [kernel.kallsyms]  [k] unmap_single_vma
> >     9.84%  makedumpfile  [kernel.kallsyms]  [k] remap_pfn_range
> >     8.49%  makedumpfile  [kernel.kallsyms]  [k] vm_normal_page
> > 
> >    /* Still some mmap overhead in makedumpfile readmem() */
> >    21.56%  makedumpfile  makedumpfile       [.] readmem
> 
> This number is interesting. Did you compile makedumpfile with
> optimizations? If yes, then this number probably includes some
> functions which were inlined.

Yes, I used the default Makefile (-O2) so most functions are inlined.

With -O0 I get the following:

 15.35%  makedumpfile  libc-2.15.so       [.] memcpy
  2.14%  makedumpfile  makedumpfile       [.] __exclude_unnecessary_pages
  1.82%  makedumpfile  makedumpfile       [.] test_bit
  1.82%  makedumpfile  makedumpfile       [.] set_bitmap_cyclic
  1.32%  makedumpfile  makedumpfile       [.] clear_bit_on_2nd_bitmap
  1.32%  makedumpfile  makedumpfile       [.] write_kdump_pages_cyclic
  1.01%  makedumpfile  makedumpfile       [.] is_on
  0.88%  makedumpfile  makedumpfile       [.] paddr_to_offset
  0.75%  makedumpfile  makedumpfile       [.] is_dumpable_cyclic
  0.69%  makedumpfile  makedumpfile       [.] exclude_range
  0.63%  makedumpfile  makedumpfile       [.] clear_bit_on_2nd_bitmap_for_kernel
  0.63%  makedumpfile  [vdso]             [.] __kernel_gettimeofday
  0.57%  makedumpfile  makedumpfile       [.] print_progress
  0.50%  makedumpfile  makedumpfile       [.] cache_search

> >     8.49%  makedumpfile  makedumpfile       [.] write_kdump_pages_cyclic
> > 
> >   Output of "perf report" for non-mmap case:
> > 
> >    /* Time spent for sys_read (that needs also two copy operations on s390 :() */
> >    25.32%  makedumpfile  [kernel.kallsyms]  [k] memcpy_real
> >    22.74%  makedumpfile  [kernel.kallsyms]  [k] __copy_to_user
> > 
> >    /* readmem() for read path is cheaper ? */
> >    13.49%  makedumpfile  makedumpfile       [.] write_kdump_pages_cyclic
> >     4.53%  makedumpfile  makedumpfile       [.] readmem
> 
> Yes, much lower overhead of readmem is strange. For a moment I
> suspected wrong accounting of the page fault handler, but then I
> realized that for /proc/vmcore, all page table entries are created
> with the present bit set already, so there are no page faults...

Right, as said below, perhaps it is the HW caching issue.
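
To illustrate why there are no faults: the mmap path populates all
PTEs up front with remap_pfn_range() (that is also why the function
shows up in the mmap perf report above). A rough kernel-side sketch
of such an mmap handler, simplified and not the actual
fs/proc/vmcore.c code:

  #include <linux/fs.h>
  #include <linux/mm.h>

  /* Sketch only: map the physical range eagerly so that every PTE is
   * present before userspace touches the pages; the first access then
   * costs a cache miss, not a fault. Assumes vm_pgoff already holds
   * the starting PFN. */
  static int demo_mmap(struct file *file, struct vm_area_struct *vma)
  {
          unsigned long size = vma->vm_end - vma->vm_start;

          return remap_pfn_range(vma, vma->vm_start, vma->vm_pgoff,
                                 size, vma->vm_page_prot);
  }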
 
> I haven't had time yet to set up a system for reproduction, but I'll
> try to identify what's eating up the CPU time in readmem().
> 
> >[...]
> > I hope this analysis helps more than it confuses :-)
> > 
> > As a conclusion, we could think of mapping larger chunks
> > also for the fragmented case of -d 31 to reduce the amount
> > of mmap/munmap calls.
> 
> I agree in general. Memory mapped through /proc/vmcore does not
> increase run-time memory requirements, because it only adds a mapping
> to the old kernel's memory.

At least you need the page table memory for the /proc/vmcore
mapping, right?

> The only limiting factor is the virtual
> address space. On many architectures, this is no issue at all, and we
> could simply map the whole file at beginning. On some architectures,
> the virtual address space is smaller than possible physical RAM, so
> this approach would not work for them.
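
For 64-bit architectures the map-everything-once variant would be
roughly this simple (a sketch only; a real implementation would still
have to walk the PT_LOAD headers and handle fractional pages):

  #include <fcntl.h>
  #include <stdio.h>
  #include <sys/mman.h>
  #include <sys/stat.h>
  #include <unistd.h>

  int main(void)
  {
          int fd = open("/proc/vmcore", O_RDONLY);
          struct stat st;
          char *base;

          if (fd < 0 || fstat(fd, &st) < 0) {
                  perror("/proc/vmcore");
                  return 1;
          }

          /* One mmap() for the whole dump: this consumes only virtual
           * address space (plus page tables), no extra RAM. */
          base = mmap(NULL, st.st_size, PROT_READ, MAP_PRIVATE, fd, 0);
          if (base == MAP_FAILED) {
                  perror("mmap");
                  return 1;
          }

          printf("mapped %lld bytes at %p\n", (long long)st.st_size, base);
          munmap(base, st.st_size);
          close(fd);
          return 0;
  }
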
> 
> > Another open question was why the mmap case consumes more CPU
> > time in readmem() than the read case. Our theory is that the
> > first memory access is slower because it is not in the HW
> > cache. For the mmap case userspace issues the first access (copy
> > to makedumpfile cache) and for the read case the kernel issues
> > the first access (memcpy_real/copy_to_user). Therefore the
> > cache miss is accounted to userspace for mmap and to kernel for
> > read.
> 
> I have no idea how to measure this on s390. On x86_64 I would add some
> asm code to read TSC before and after the memory access instruction. I
> guess there is a similar counter on s390. Suggestions?
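
For the x86_64 case a few lines around the access would probably be
enough, along these lines (just a sketch; the serializing
instructions around rdtsc are left out):

  #include <stdint.h>
  #include <stdio.h>
  #include <string.h>

  static inline uint64_t rdtsc(void)
  {
          uint32_t lo, hi;

          __asm__ __volatile__("rdtsc" : "=a"(lo), "=d"(hi));
          return ((uint64_t)hi << 32) | lo;
  }

  int main(void)
  {
          static char src[4096], dst[4096];
          uint64_t before, after;

          before = rdtsc();
          memcpy(dst, src, sizeof(dst));  /* first (cold) touch of src */
          after = rdtsc();

          printf("copy took %llu cycles\n",
                 (unsigned long long)(after - before));
          return 0;
  }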

On s390 under LPAR we have hardware counters for cache misses:

# perf stat -e cpum_cf/L1D_PENALTY_CYCLES/,cpum_cf/PROBLEM_STATE_L1D_PENALTY_CYCLES/ ./makedumpfile -d31 /proc/vmcore /dev/null -f 

 Performance counter stats for './makedumpfile -d31 /proc/vmcore /dev/null -f':
        1180577929      L1D_PENALTY_CYCLES                                          
        1166005960      PROBLEM_STATE_L1D_PENALTY_CYCLES                                   

# perf stat -e cpum_cf/L1D_PENALTY_CYCLES/,cpum_cf/PROBLEM_STATE_L1D_PENALTY_CYCLES/ ./makedumpfile -d31 /proc/vmcore /dev/null -f  --non-mmap

 Performance counter stats for './makedumpfile -d31 /proc/vmcore /dev/null -f --non-mmap':

        1691463111      L1D_PENALTY_CYCLES                                          
         151987617      PROBLEM_STATE_L1D_PENALTY_CYCLES                                   

AFAIK:

- L1D_PENALTY_CYCLES: Cycles wasted due to L1 cache misses (kernel + userspace)
- PROBLEM_STATE_L1D_PENALTY_CYCLES: Cycles wasted due to L1 cache misses (userspace only)

So if I got it right, for the mmap() case the cache misses are
accounted almost entirely to userspace, and for the read() case
they are accounted to the kernel.

Interestingly, on that machine (4 GiB, LPAR, newer model) mmap()
was faster also for -d 31:

$ time ./makedumpfile /proc/vmcore -d 31 /dev/null -f 
real    0m0.125s
user    0m0.120s
sys     0m0.004s

$ time ./makedumpfile /proc/vmcore -d 31 /dev/null -f --non-mmap
real    0m0.238s
user    0m0.065s
sys     0m0.171s

...

> 
> > And last but not least, perhaps on s390 we could replace
> > the bounce buffer used for memcpy_real()/copy_to_user() by
> > some more intelligent solution.
> 
> Which would then improve the non-mmap times even more, right?

Correct.

Best Regards,
Michael


_______________________________________________
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec

^ permalink raw reply	[flat|nested] 20+ messages in thread

* RE: [PATCH v2 0/8] Handle mmaped regions in cache [more analysis]
  2015-03-13  8:04       ` Petr Tesarik
@ 2015-03-16  5:14         ` Atsushi Kumagai
  2015-03-16  7:26           ` Petr Tesarik
  0 siblings, 1 reply; 20+ messages in thread
From: Atsushi Kumagai @ 2015-03-16  5:14 UTC (permalink / raw)
  To: ptesarik; +Cc: holzheu, kexec, willeke

>On Fri, 13 Mar 2015 04:10:22 +0000
>Atsushi Kumagai <ats-kumagai@wm.jp.nec.com> wrote:
>
>> Hello,
>>
>> (Note: my email address has changed.)
>>
>> In x86_64, calling ioremap/iounmap per page in copy_oldmem_page()
>> causes big performance degradation, so mmap() was introduced on
>> /proc/vmcore. However, there is no big difference between read() and
>> mmap() in s390 since it doesn't need ioremap/iounmap in copy_oldmem_page(),
>> so other issues have been revealed, right?
>>
>> [...]
>>
>> >> I counted the mmap and read system calls with "perf stat":
>> >>
>> >>                      mmap   unmap   read =    sum
>> >>   ===============================================
>> >>   mmap -d0            482     443    165     1090
>> >>   mmap -d31         13454   13414    165    27033
>> >>   non-mmap -d0         34       3 458917   458954
>> >>   non-mmap -d31        34       3  74273    74310
>> >
>> >If your VM has 1.5 GiB of RAM, then the numbers for -d0 look
>> >reasonable. For -d31, we should be able to do better than this
>> >by allocating more cache slots and improving the algorithm.
>> >I originally didn't deem it worth the effort, but seeing almost
>> >30 times more mmaps than with -d0 may change my mind.
>>
>> Are you going to do it as v3 patch?
>
>No. Tuning the caching algorithm requires a lot of research. I plan to
>do it, but testing it with all scenarios (and tuning the algorithm
>based on the results) will probably take weeks. I don't think it makes
>sense to wait for it.
>
>> I'm going to release v1.5.8 soon, so I'll adopt the v2 patch if
>> you don't plan to update it.
>
>Since v2 already brings some performance gain, I appreciate it if you
>can adopt it for v1.5.8.

Ok, but unfortunately I got errors like the following during my test:

  $ ./makedumpfile -d31 /tmp/vmcore ./dumpfile.d31
  Excluding free pages               : [  0.0 %] /
  reset_bitmap_of_free_pages: The free list is broken.
  reset_bitmap_of_free_pages: The free list is broken.

  makedumpfile Failed.
  $

All of the errors are the same as the above, at least in my test.
I identified [PATCH v2 7/8] as the cause with git bisect,
but the root cause is still under investigation.

   4064                 if (!readmem(VADDR, curr+OFFSET(list_head.prev),
   4065                              &curr_prev, sizeof curr_prev)) {     // get wrong value here
   4066                         ERRMSG("Can't get prev list_head.\n");
   4067                         return FALSE;
   4068                 }
   4069                 if (previous != curr_prev) {
   4070                         ERRMSG("The free list is broken.\n");
   4071                         retcd = ANALYSIS_FAILED;
   4072                         return FALSE;
   4073                 }
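
For reference, the check is just the usual doubly-linked list
invariant while walking the kernel's free_list; in isolation it
amounts to this (a simplified userspace model, not the makedumpfile
code itself):

  #include <assert.h>

  struct list_head {
          struct list_head *next, *prev;
  };

  /* Walk a circular list and verify that each node's prev pointer
   * points back at the node we came from; a mismatch is what triggers
   * "The free list is broken." above. */
  static int list_is_consistent(struct list_head *head)
  {
          struct list_head *prev = head, *curr;

          for (curr = head->next; curr != head; curr = curr->next) {
                  if (curr->prev != prev)
                          return 0;
                  prev = curr;
          }
          return 1;
  }

  int main(void)
  {
          struct list_head head = { &head, &head };   /* empty list */

          assert(list_is_consistent(&head));
          return 0;
  }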


Thanks
Atsushi Kumagai

>Thank you very much,
>Petr Tesarik
>
>> Thanks
>> Atsushi Kumagai
>>
>> >
>> >> Here the actual results I got with "perf record":
>> >>
>> >> $ time ./makedumpfile  -d 31 /proc/vmcore  /dev/null -f
>> >>
>> >>   Output of "perf report" for mmap case:
>> >>
>> >>    /* Most time spent for unmap in kernel */
>> >>    29.75%  makedumpfile  [kernel.kallsyms]  [k] unmap_single_vma
>> >>     9.84%  makedumpfile  [kernel.kallsyms]  [k] remap_pfn_range
>> >>     8.49%  makedumpfile  [kernel.kallsyms]  [k] vm_normal_page
>> >>
>> >>    /* Still some mmap overhead in makedumpfile readmem() */
>> >>    21.56%  makedumpfile  makedumpfile       [.] readmem
>> >
>> >This number is interesting. Did you compile makedumpfile with
>> >optimizations? If yes, then this number probably includes some
>> >functions which were inlined.
>> >
>> >>     8.49%  makedumpfile  makedumpfile       [.] write_kdump_pages_cyclic
>> >>
>> >>   Output of "perf report" for non-mmap case:
>> >>
>> >>    /* Time spent for sys_read (that needs also two copy operations on s390 :() */
>> >>    25.32%  makedumpfile  [kernel.kallsyms]  [k] memcpy_real
>> >>    22.74%  makedumpfile  [kernel.kallsyms]  [k] __copy_to_user
>> >>
>> >>    /* readmem() for read path is cheaper ? */
>> >>    13.49%  makedumpfile  makedumpfile       [.] write_kdump_pages_cyclic
>> >>     4.53%  makedumpfile  makedumpfile       [.] readmem
>> >
>> >Yes, much lower overhead of readmem is strange. For a moment I
>> >suspected wrong accounting of the page fault handler, but then I
>> >realized that for /proc/vmcore, all page table entries are created
>> >with the present bit set already, so there are no page faults...
>> >
>> >I haven't had time yet to set up a system for reproduction, but I'll
>> >try to identify what's eating up the CPU time in readmem().
>> >
>> >>[...]
>> >> I hope this analysis helps more than it confuses :-)
>> >>
>> >> As a conclusion, we could think of mapping larger chunks
>> >> also for the fragmented case of -d 31 to reduce the amount
>> >> of mmap/munmap calls.
>> >
>> >I agree in general. Memory mapped through /proc/vmcore does not
>> >increase run-time memory requirements, because it only adds a mapping
>> >to the old kernel's memory. The only limiting factor is the virtual
>> >address space. On many architectures, this is no issue at all, and we
>> >could simply map the whole file at beginning. On some architectures,
>> >the virtual address space is smaller than possible physical RAM, so
>> >this approach would not work for them.
>> >
>> >> Another open question was why the mmap case consumes more CPU
>> >> time in readmem() than the read case. Our theory is that the
>> >> first memory access is slower because it is not in the HW
>> >> cache. For the mmap case userspace issues the first access (copy
>> >> to makedumpfile cache) and for the read case the kernel issues
>> >> the first access (memcpy_real/copy_to_user). Therefore the
>> >> cache miss is accounted to userspace for mmap and to kernel for
>> >> read.
>> >
>> >I have no idea how to measure this on s390. On x86_64 I would add
>> >some asm code to read TSC before and after the memory access
>> >instruction. I guess there is a similar counter on s390. Suggestions?
>> >
>> >> And last but not least, perhaps on s390 we could replace
>> >> the bounce buffer used for memcpy_real()/copy_to_user() by
>> >> some more intelligent solution.
>> >
>> >Which would then improve the non-mmap times even more, right?
>> >
>> >Petr T

_______________________________________________
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH v2 0/8] Handle mmaped regions in cache [more analysis]
  2015-03-16  5:14         ` Atsushi Kumagai
@ 2015-03-16  7:26           ` Petr Tesarik
  2015-03-16  8:06             ` Atsushi Kumagai
  0 siblings, 1 reply; 20+ messages in thread
From: Petr Tesarik @ 2015-03-16  7:26 UTC (permalink / raw)
  To: Atsushi Kumagai; +Cc: holzheu, kexec, willeke

On Mon, 16 Mar 2015 05:14:25 +0000
Atsushi Kumagai <ats-kumagai@wm.jp.nec.com> wrote:

> >On Fri, 13 Mar 2015 04:10:22 +0000
> >Atsushi Kumagai <ats-kumagai@wm.jp.nec.com> wrote:
>[...]
> >> I'm going to release v1.5.8 soon, so I'll adopt the v2 patch if
> >> you don't plan to update it.
> >
> >Since v2 already brings some performance gain, I appreciate it if you
> >can adopt it for v1.5.8.
> 
> Ok, but unfortunately I got errors like the following during my test:
> 
>   $ ./makedumpfile -d31 /tmp/vmcore ./dumpfile.d31
>   Excluding free pages               : [  0.0 %] /
>   reset_bitmap_of_free_pages: The free list is broken.
>   reset_bitmap_of_free_pages: The free list is broken.
> 
>   makedumpfile Failed.
>   $
> 
> All of the errors are the same as the above, at least in my test.
> I identified [PATCH v2 7/8] as the cause with git bisect,
> but the root cause is still under investigation.

The only change I can think of is the removal of page_is_fractional.
Originally, LOADs that do not start on a page boundary were never
mmapped. With this patch, this check is removed.

Can you try adding the following check to mappage_elf (and dropping
patch 8/8)?

	if (page_is_fractional(offset))
		return NULL;
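
The check itself is trivial: a LOAD is fractional when its file
offset does not start on a page boundary. A standalone sketch, with
the page size passed in explicitly instead of being taken from
makedumpfile's info structure:

  #include <stdio.h>
  #include <sys/types.h>

  /* Sketch of the fractional-page test; the page size is a parameter
   * here purely for illustration. */
  static int page_is_fractional(off_t offset, long page_size)
  {
          return offset % page_size != 0;
  }

  int main(void)
  {
          printf("%d %d\n",
                 page_is_fractional(0x200000, 4096),   /* 0: aligned    */
                 page_is_fractional(0x200800, 4096));  /* 1: fractional */
          return 0;
  }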

Petr T

P.S. This reminds me I should try to get some kernel dumps with
fractional pages for regression testing...

> >Thank you very much,
> >Petr Tesarik
> >
> >> Thanks
> >> Atsushi Kumagai
> >>
> >> >
> >> >> Here the actual results I got with "perf record":
> >> >>
> >> >> $ time ./makedumpfile  -d 31 /proc/vmcore  /dev/null -f
> >> >>
> >> >>   Output of "perf report" for mmap case:
> >> >>
> >> >>    /* Most time spent for unmap in kernel */
> >> >>    29.75%  makedumpfile  [kernel.kallsyms]  [k] unmap_single_vma
> >> >>     9.84%  makedumpfile  [kernel.kallsyms]  [k] remap_pfn_range
> >> >>     8.49%  makedumpfile  [kernel.kallsyms]  [k] vm_normal_page
> >> >>
> >> >>    /* Still some mmap overhead in makedumpfile readmem() */
> >> >>    21.56%  makedumpfile  makedumpfile       [.] readmem
> >> >
> >> >This number is interesting. Did you compile makedumpfile with
> >> >optimizations? If yes, then this number probably includes some
> >> >functions which were inlined.
> >> >
> >> >>     8.49%  makedumpfile  makedumpfile       [.] write_kdump_pages_cyclic
> >> >>
> >> >>   Output of "perf report" for non-mmap case:
> >> >>
> >> >>    /* Time spent for sys_read (that needs also two copy operations on s390 :() */
> >> >>    25.32%  makedumpfile  [kernel.kallsyms]  [k] memcpy_real
> >> >>    22.74%  makedumpfile  [kernel.kallsyms]  [k] __copy_to_user
> >> >>
> >> >>    /* readmem() for read path is cheaper ? */
> >> >>    13.49%  makedumpfile  makedumpfile       [.] write_kdump_pages_cyclic
> >> >>     4.53%  makedumpfile  makedumpfile       [.] readmem
> >> >
> >> >Yes, much lower overhead of readmem is strange. For a moment I
> >> >suspected wrong accounting of the page fault handler, but then I
> >> >realized that for /proc/vmcore, all page table entries are created
> >> >with the present bit set already, so there are no page faults...
> >> >
> >> >I haven't had time yet to set up a system for reproduction, but
> >> >I'll try to identify what's eating up the CPU time in readmem().
> >> >
> >> >>[...]
> >> >> I hope this analysis helps more than it confuses :-)
> >> >>
> >> >> As a conclusion, we could think of mapping larger chunks
> >> >> also for the fragmented case of -d 31 to reduce the amount
> >> >> of mmap/munmap calls.
> >> >
> >> >I agree in general. Memory mapped through /proc/vmcore does not
> >> >increase run-time memory requirements, because it only adds a
> >> >mapping to the old kernel's memory. The only limiting factor is
> >> >the virtual address space. On many architectures, this is no
> >> >issue at all, and we could simply map the whole file at
> >> >beginning. On some architectures, the virtual address space is
> >> >smaller than possible physical RAM, so this approach would not
> >> >work for them.
> >> >
> >> >> Another open question was why the mmap case consumes more CPU
> >> >> time in readmem() than the read case. Our theory is that the
> >> >> first memory access is slower because it is not in the HW
> >> >> cache. For the mmap case userspace issues the first access (copy
> >> >> to makedumpfile cache) and for the read case the kernel issues
> >> >> the first access (memcpy_real/copy_to_user). Therefore the
> >> >> cache miss is accounted to userspace for mmap and to kernel for
> >> >> read.
> >> >
> >> >I have no idea how to measure this on s390. On x86_64 I would add
> >> >some asm code to read TSC before and after the memory access
> >> >instruction. I guess there is a similar counter on s390.
> >> >Suggestions?
> >> >
> >> >> And last but not least, perhaps on s390 we could replace
> >> >> the bounce buffer used for memcpy_real()/copy_to_user() by
> >> >> some more intelligent solution.
> >> >
> >> >Which would then improve the non-mmap times even more, right?
> >> >
> >> >Petr T
> 
> _______________________________________________
> kexec mailing list
> kexec@lists.infradead.org
> http://lists.infradead.org/mailman/listinfo/kexec


_______________________________________________
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec

^ permalink raw reply	[flat|nested] 20+ messages in thread

* RE: [PATCH v2 0/8] Handle mmaped regions in cache [more analysis]
  2015-03-16  7:26           ` Petr Tesarik
@ 2015-03-16  8:06             ` Atsushi Kumagai
  2015-03-16  8:24               ` Petr Tesarik
  0 siblings, 1 reply; 20+ messages in thread
From: Atsushi Kumagai @ 2015-03-16  8:06 UTC (permalink / raw)
  To: ptesarik; +Cc: holzheu, kexec, willeke

>On Mon, 16 Mar 2015 05:14:25 +0000
>Atsushi Kumagai <ats-kumagai@wm.jp.nec.com> wrote:
>
>> >On Fri, 13 Mar 2015 04:10:22 +0000
>> >Atsushi Kumagai <ats-kumagai@wm.jp.nec.com> wrote:
>>[...]
>> >> I'm going to release v1.5.8 soon, so I'll adopt the v2 patch if
>> >> you don't plan to update it.
>> >
>> >Since v2 already brings some performance gain, I appreciate it if you
>> >can adopt it for v1.5.8.
>>
>> Ok, but unfortunately I got errors like the following during my test:
>>
>>   $ ./makedumpfile -d31 /tmp/vmcore ./dumpfile.d31
>>   Excluding free pages               : [  0.0 %] /
>>   reset_bitmap_of_free_pages: The free list is broken.
>>   reset_bitmap_of_free_pages: The free list is broken.
>>
>>   makedumpfile Failed.
>>   $
>>
>> All of the errors are the same as the above, at least in my test.
>> I identified [PATCH v2 7/8] as the cause with git bisect,
>> but the root cause is still under investigation.
>
>The only change I can think of is the removal of page_is_fractional.
>Originally, LOADs that do not start on a page boundary were never
>mmapped. With this patch, this check is removed.
>
>Can you try adding the following check to mappage_elf (and dropping
>patch 8/8)?
>
>	if (page_is_fractional(offset))
>		return NULL;

It worked, thanks!

Additionally, I've remembered that we should keep page_is_fractional()
for old kernels.

  https://lkml.org/lkml/2013/11/13/439


Thanks
Atsushi Kumagai

>Petr T
>
>P.S. This reminds me I should try to get some kernel dumps with
>fractional pages for regression testing...
>
>> >Thank you very much,
>> >Petr Tesarik
>> >
>> >> Thanks
>> >> Atsushi Kumagai
>> >>
>> >> >
>> >> >> Here the actual results I got with "perf record":
>> >> >>
>> >> >> $ time ./makedumpfile  -d 31 /proc/vmcore  /dev/null -f
>> >> >>
>> >> >>   Output of "perf report" for mmap case:
>> >> >>
>> >> >>    /* Most time spent for unmap in kernel */
>> >> >>    29.75%  makedumpfile  [kernel.kallsyms]  [k] unmap_single_vma
>> >> >>     9.84%  makedumpfile  [kernel.kallsyms]  [k] remap_pfn_range
>> >> >>     8.49%  makedumpfile  [kernel.kallsyms]  [k] vm_normal_page
>> >> >>
>> >> >>    /* Still some mmap overhead in makedumpfile readmem() */
>> >> >>    21.56%  makedumpfile  makedumpfile       [.] readmem
>> >> >
>> >> >This number is interesting. Did you compile makedumpfile with
>> >> >optimizations? If yes, then this number probably includes some
>> >> >functions which were inlined.
>> >> >
>> >> >>     8.49%  makedumpfile  makedumpfile       [.] write_kdump_pages_cyclic
>> >> >>
>> >> >>   Output of "perf report" for non-mmap case:
>> >> >>
>> >> >>    /* Time spent for sys_read (that needs also two copy operations on s390 :() */
>> >> >>    25.32%  makedumpfile  [kernel.kallsyms]  [k] memcpy_real
>> >> >>    22.74%  makedumpfile  [kernel.kallsyms]  [k] __copy_to_user
>> >> >>
>> >> >>    /* readmem() for read path is cheaper ? */
>> >> >>    13.49%  makedumpfile  makedumpfile       [.] write_kdump_pages_cyclic
>> >> >>     4.53%  makedumpfile  makedumpfile       [.] readmem
>> >> >
>> >> >Yes, much lower overhead of readmem is strange. For a moment I
>> >> >suspected wrong accounting of the page fault handler, but then I
>> >> >realized that for /proc/vmcore, all page table entries are created
>> >> >with the present bit set already, so there are no page faults...
>> >> >
>> >> >I haven't had time yet to set up a system for reproduction, but
>> >> >I'll try to identify what's eating up the CPU time in readmem().
>> >> >
>> >> >>[...]
>> >> >> I hope this analysis helps more than it confuses :-)
>> >> >>
>> >> >> As a conclusion, we could think of mapping larger chunks
>> >> >> also for the fragmented case of -d 31 to reduce the amount
>> >> >> of mmap/munmap calls.
>> >> >
>> >> >I agree in general. Memory mapped through /proc/vmcore does not
>> >> >increase run-time memory requirements, because it only adds a
>> >> >mapping to the old kernel's memory. The only limiting factor is
>> >> >the virtual address space. On many architectures, this is no
>> >> >issue at all, and we could simply map the whole file at
>> >> >beginning. On some architectures, the virtual address space is
>> >> >smaller than possible physical RAM, so this approach would not
>> >> >work for them.
>> >> >
>> >> >> Another open question was why the mmap case consumes more CPU
>> >> >> time in readmem() than the read case. Our theory is that the
>> >> >> first memory access is slower because it is not in the HW
>> >> >> cache. For the mmap case userspace issues the first access (copy
>> >> >> to makedumpfile cache) and for the read case the kernel issues
>> >> >> the first access (memcpy_real/copy_to_user). Therefore the
>> >> >> cache miss is accounted to userspace for mmap and to kernel for
>> >> >> read.
>> >> >
>> >> >I have no idea how to measure this on s390. On x86_64 I would add
>> >> >some asm code to read TSC before and after the memory access
>> >> >instruction. I guess there is a similar counter on s390.
>> >> >Suggestions?
>> >> >
>> >> >> And last but not least, perhaps on s390 we could replace
>> >> >> the bounce buffer used for memcpy_real()/copy_to_user() by
>> >> >> some more intelligent solution.
>> >> >
>> >> >Which would then improve the non-mmap times even more, right?
>> >> >
>> >> >Petr T
>>
>> _______________________________________________
>> kexec mailing list
>> kexec@lists.infradead.org
>> http://lists.infradead.org/mailman/listinfo/kexec


_______________________________________________
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH v2 0/8] Handle mmaped regions in cache [more analysis]
  2015-03-16  8:06             ` Atsushi Kumagai
@ 2015-03-16  8:24               ` Petr Tesarik
  0 siblings, 0 replies; 20+ messages in thread
From: Petr Tesarik @ 2015-03-16  8:24 UTC (permalink / raw)
  To: Atsushi Kumagai; +Cc: holzheu, kexec, willeke

On Mon, 16 Mar 2015 08:06:14 +0000
Atsushi Kumagai <ats-kumagai@wm.jp.nec.com> wrote:

> >On Mon, 16 Mar 2015 05:14:25 +0000
> >Atsushi Kumagai <ats-kumagai@wm.jp.nec.com> wrote:
> >
> >> >On Fri, 13 Mar 2015 04:10:22 +0000
> >> >Atsushi Kumagai <ats-kumagai@wm.jp.nec.com> wrote:
> >>[...]
> >> >> I'm going to release v1.5.8 soon, so I'll adopt the v2 patch if
> >> >> you don't plan to update it.
> >> >
> >> >Since v2 already brings some performance gain, I appreciate it if you
> >> >can adopt it for v1.5.8.
> >>
> >> Ok, but unfortunately I got errors like the following during my test:
> >>
> >>   $ ./makedumpfile -d31 /tmp/vmcore ./dumpfile.d31
> >>   Excluding free pages               : [  0.0 %] /
> >>   reset_bitmap_of_free_pages: The free list is broken.
> >>   reset_bitmap_of_free_pages: The free list is broken.
> >>
> >>   makedumpfile Failed.
> >>   $
> >>
> >> All of the errors are the same as the above, at least in my test.
> >> I identified [PATCH v2 7/8] as the cause with git bisect,
> >> but the root cause is still under investigation.
> >
> >The only change I can think of is the removal of page_is_fractional.
> >Originally, LOADs that do not start on a page boundary were never
> >mmapped. With this patch, this check is removed.
> >
> >Can you try adding the following check to mappage_elf (and dropping
> >patch 8/8)?
> >
> >	if (page_is_fractional(offset))
> >		return NULL;
> 
> It worked, thanks!
> 
> Additionally, I've remembered that we should keep page_is_fractional()
> for old kernels.

Well, if a LOAD segment is not page-aligned, the previous code would
have mapped only the page-aligned portion, and never beyond its
boundaries. I'm unsure what causes the bug, but I don't have time
to find the real root cause, so let's leave the fractional check in
place for now.
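
To spell out the page-aligned portion: for a fractional LOAD this
means rounding the start up and the end down to page boundaries and
touching only the interior, e.g. (illustrative numbers, not
makedumpfile code):

  #include <stdio.h>

  #define PAGE_SIZE 4096UL

  int main(void)
  {
          /* Hypothetical fractional LOAD: offsets not page-aligned. */
          unsigned long load_start = 0x10000200UL;
          unsigned long load_end   = 0x10200300UL;

          /* Round the start up and the end down to page boundaries. */
          unsigned long map_start = (load_start + PAGE_SIZE - 1) & ~(PAGE_SIZE - 1);
          unsigned long map_end   = load_end & ~(PAGE_SIZE - 1);

          printf("mappable interior: 0x%lx - 0x%lx\n", map_start, map_end);
          return 0;
  }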

I'll be sending v3 of the patch set shortly.

Thank you for testing!

Petr T

>   https://lkml.org/lkml/2013/11/13/439
> 
> 
> Thanks
> Atsushi Kumagai
> 
> >Petr T
> >
> >P.S. This reminds me I should try to get some kernel dumps with
> >fractional pages for regression testing...
> >
> >> >Thank you very much,
> >> >Petr Tesarik
> >> >
> >> >> Thanks
> >> >> Atsushi Kumagai
> >> >>
> >> >> >
> >> >> >> Here the actual results I got with "perf record":
> >> >> >>
> >> >> >> $ time ./makedumpfile  -d 31 /proc/vmcore  /dev/null -f
> >> >> >>
> >> >> >>   Output of "perf report" for mmap case:
> >> >> >>
> >> >> >>    /* Most time spent for unmap in kernel */
> >> >> >>    29.75%  makedumpfile  [kernel.kallsyms]  [k] unmap_single_vma
> >> >> >>     9.84%  makedumpfile  [kernel.kallsyms]  [k] remap_pfn_range
> >> >> >>     8.49%  makedumpfile  [kernel.kallsyms]  [k] vm_normal_page
> >> >> >>
> >> >> >>    /* Still some mmap overhead in makedumpfile readmem() */
> >> >> >>    21.56%  makedumpfile  makedumpfile       [.] readmem
> >> >> >
> >> >> >This number is interesting. Did you compile makedumpfile with
> >> >> >optimizations? If yes, then this number probably includes some
> >> >> >functions which were inlined.
> >> >> >
> >> >> >>     8.49%  makedumpfile  makedumpfile       [.] write_kdump_pages_cyclic
> >> >> >>
> >> >> >>   Output of "perf report" for non-mmap case:
> >> >> >>
> >> >> >>    /* Time spent for sys_read (that needs also two copy operations on s390 :() */
> >> >> >>    25.32%  makedumpfile  [kernel.kallsyms]  [k] memcpy_real
> >> >> >>    22.74%  makedumpfile  [kernel.kallsyms]  [k] __copy_to_user
> >> >> >>
> >> >> >>    /* readmem() for read path is cheaper ? */
> >> >> >>    13.49%  makedumpfile  makedumpfile       [.] write_kdump_pages_cyclic
> >> >> >>     4.53%  makedumpfile  makedumpfile       [.] readmem
> >> >> >
> >> >> >Yes, much lower overhead of readmem is strange. For a moment I
> >> >> >suspected wrong accounting of the page fault handler, but then
> >> >> >I realized that for /proc/vmcore, all page table entries are
> >> >> >created with the present bit set already, so there are no page
> >> >> >faults...
> >> >> >
> >> >> >I haven't had time yet to set up a system for reproduction, but
> >> >> >I'll try to identify what's eating up the CPU time in
> >> >> >readmem().
> >> >> >
> >> >> >>[...]
> >> >> >> I hope this analysis helps more than it confuses :-)
> >> >> >>
> >> >> >> As a conclusion, we could think of mapping larger chunks
> >> >> >> also for the fragmented case of -d 31 to reduce the amount
> >> >> >> of mmap/munmap calls.
> >> >> >
> >> >> >I agree in general. Memory mapped through /proc/vmcore does not
> >> >> >increase run-time memory requirements, because it only adds a
> >> >> >mapping to the old kernel's memory. The only limiting factor is
> >> >> >the virtual address space. On many architectures, this is no
> >> >> >issue at all, and we could simply map the whole file at
> >> >> >beginning. On some architectures, the virtual address space is
> >> >> >smaller than possible physical RAM, so this approach would not
> >> >> >work for them.
> >> >> >
> >> >> >> Another open question was why the mmap case consumes more CPU
> >> >> >> time in readmem() than the read case. Our theory is that the
> >> >> >> first memory access is slower because it is not in the HW
> >> >> >> cache. For the mmap case userspace issues the first access
> >> >> >> (copy to makedumpfile cache) and for the read case the kernel
> >> >> >> issues the first access (memcpy_real/copy_to_user).
> >> >> >> Therefore the cache miss is accounted to userspace for mmap
> >> >> >> and to kernel for read.
> >> >> >
> >> >> >I have no idea how to measure this on s390. On x86_64 I would
> >> >> >add some asm code to read TSC before and after the memory
> >> >> >access instruction. I guess there is a similar counter on s390.
> >> >> >Suggestions?
> >> >> >
> >> >> >> And last but not least, perhaps on s390 we could replace
> >> >> >> the bounce buffer used for memcpy_real()/copy_to_user() by
> >> >> >> some more intelligent solution.
> >> >> >
> >> >> >Which would then improve the non-mmap times even more, right?
> >> >> >
> >> >> >Petr T
> >>
> >> _______________________________________________
> >> kexec mailing list
> >> kexec@lists.infradead.org
> >> http://lists.infradead.org/mailman/listinfo/kexec
> 


_______________________________________________
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec

^ permalink raw reply	[flat|nested] 20+ messages in thread

end of thread, other threads:[~2015-03-16  8:24 UTC | newest]

Thread overview: 20+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2015-03-06 14:03 [PATCH v2 0/8] Handle mmaped regions in cache Petr Tesarik
2015-03-06  8:23 ` [PATCH v2 1/8] cache: get rid of search loop in cache_add() Petr Tesarik
2015-03-06  8:52 ` [PATCH v2 2/8] cache: allow to return a page to the pool Petr Tesarik
2015-03-06  8:59 ` [PATCH v2 3/8] cache: do not allocate from the pending list Petr Tesarik
2015-03-06  9:26 ` [PATCH v2 4/8] cache: add hit/miss statistics to the final report Petr Tesarik
2015-03-06 13:07 ` [PATCH v2 5/8] cache: allocate buffers in one big chunk Petr Tesarik
2015-03-06 13:10 ` [PATCH v2 6/8] cache: allow arbitrary size of cache entries Petr Tesarik
2015-03-06 13:23 ` [PATCH v2 7/8] cache: store mapped regions directly in the cache Petr Tesarik
2015-03-06 13:28 ` [PATCH v2 8/8] cleanup: remove unused page_is_fractional Petr Tesarik
2015-03-09 16:08 ` [PATCH v2 0/8] Handle mmaped regions in cache Michael Holzheu
2015-03-09 16:08 ` [PATCH v2 0/8] Handle mmaped regions in cache [more analysis] Michael Holzheu
2015-03-12 13:56   ` Michael Holzheu
2015-03-12 15:38   ` Petr Tesarik
2015-03-13  4:10     ` Atsushi Kumagai
2015-03-13  8:04       ` Petr Tesarik
2015-03-16  5:14         ` Atsushi Kumagai
2015-03-16  7:26           ` Petr Tesarik
2015-03-16  8:06             ` Atsushi Kumagai
2015-03-16  8:24               ` Petr Tesarik
2015-03-13 16:19     ` Michael Holzheu
