* [PATCH v2 1/8] cache: get rid of search loop in cache_add()
2015-03-06 14:03 [PATCH v2 0/8] Handle mmaped regions in cache Petr Tesarik
@ 2015-03-06 8:23 ` Petr Tesarik
2015-03-06 8:52 ` [PATCH v2 2/8] cache: allow to return a page to the pool Petr Tesarik
` (8 subsequent siblings)
9 siblings, 0 replies; 20+ messages in thread
From: Petr Tesarik @ 2015-03-06 8:23 UTC (permalink / raw)
To: Atsushi Kumagai, Michael Holzheu; +Cc: kexec mailing list, Jan Willeke
The intention was that cache code is re-entrant, so all cache entries
should go through these states:
1. free
2. pending read
3. used
The cache_add() function is used to move an entry from state 2 to 3, but
since the caller did not know cache entry pointer, it had to search the
pending list for a pending read for the given physical address. This is
not needed if cache_alloc() returns this pointer.
Signed-off-by: Petr Tesarik <ptesarik@suse.cz>
---
cache.c | 26 ++++++--------------------
cache.h | 10 ++++++++--
makedumpfile.c | 8 +++++---
3 files changed, 19 insertions(+), 25 deletions(-)
diff --git a/cache.c b/cache.c
index 0dd957c..700ba0c 100644
--- a/cache.c
+++ b/cache.c
@@ -20,12 +20,6 @@
#include "cache.h"
#include "print_info.h"
-struct cache_entry {
- unsigned long long paddr;
- void *bufptr;
- struct cache_entry *next, *prev;
-};
-
struct cache {
struct cache_entry *head, *tail;
};
@@ -98,38 +92,30 @@ cache_search(unsigned long long paddr)
return NULL; /* cache miss */
}
-void *
+struct cache_entry *
cache_alloc(unsigned long long paddr)
{
struct cache_entry *entry = NULL;
if (avail) {
entry = &pool[--avail];
- entry->paddr = paddr;
add_entry(&pending, entry);
} else if (pending.tail) {
entry = pending.tail;
- entry->paddr = paddr;
} else if (used.tail) {
entry = used.tail;
remove_entry(&used, entry);
- entry->paddr = paddr;
add_entry(&pending, entry);
} else
return NULL;
- return entry->bufptr;
+ entry->paddr = paddr;
+ return entry;
}
void
-cache_add(unsigned long long paddr)
+cache_add(struct cache_entry *entry)
{
- struct cache_entry *entry;
- for (entry = pending.head; entry; entry = entry->next) {
- if (entry->paddr == paddr) {
- remove_entry(&pending, entry);
- add_entry(&used, entry);
- break;
- }
- }
+ remove_entry(&pending, entry);
+ add_entry(&used, entry);
}
diff --git a/cache.h b/cache.h
index 4730e12..dab8eb9 100644
--- a/cache.h
+++ b/cache.h
@@ -19,9 +19,15 @@
#ifndef _CACHE_H
#define _CACHE_H
+struct cache_entry {
+ unsigned long long paddr;
+ void *bufptr;
+ struct cache_entry *next, *prev;
+};
+
int cache_init(void);
void *cache_search(unsigned long long paddr);
-void *cache_alloc(unsigned long long paddr);
-void cache_add(unsigned long long paddr);
+struct cache_entry *cache_alloc(unsigned long long paddr);
+void cache_add(struct cache_entry *entry);
#endif /* _CACHE_H */
diff --git a/makedumpfile.c b/makedumpfile.c
index 74bc9db..828adeb 100644
--- a/makedumpfile.c
+++ b/makedumpfile.c
@@ -591,6 +591,7 @@ readmem(int type_addr, unsigned long long addr, void *bufptr, size_t size)
unsigned long long paddr, maddr = NOT_PADDR;
unsigned long long pgaddr;
void *pgbuf;
+ struct cache_entry *cached;
next_page:
switch (type_addr) {
@@ -644,9 +645,10 @@ next_page:
pgaddr = PAGEBASE(paddr);
pgbuf = cache_search(pgaddr);
if (!pgbuf) {
- pgbuf = cache_alloc(pgaddr);
- if (!pgbuf)
+ cached = cache_alloc(pgaddr);
+ if (!cached)
goto error;
+ pgbuf = cached->bufptr;
if (info->flag_refiltering) {
if (!readpage_kdump_compressed(pgaddr, pgbuf))
@@ -658,7 +660,7 @@ next_page:
if (!readpage_elf(pgaddr, pgbuf))
goto error;
}
- cache_add(pgaddr);
+ cache_add(cached);
}
memcpy(bufptr, pgbuf + PAGEOFFSET(paddr), read_size);
--
1.8.4.5
_______________________________________________
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec
^ permalink raw reply related [flat|nested] 20+ messages in thread
* [PATCH v2 2/8] cache: allow to return a page to the pool
2015-03-06 14:03 [PATCH v2 0/8] Handle mmaped regions in cache Petr Tesarik
2015-03-06 8:23 ` [PATCH v2 1/8] cache: get rid of search loop in cache_add() Petr Tesarik
@ 2015-03-06 8:52 ` Petr Tesarik
2015-03-06 8:59 ` [PATCH v2 3/8] cache: do not allocate from the pending list Petr Tesarik
` (7 subsequent siblings)
9 siblings, 0 replies; 20+ messages in thread
From: Petr Tesarik @ 2015-03-06 8:52 UTC (permalink / raw)
To: Atsushi Kumagai, Michael Holzheu; +Cc: kexec mailing list, Jan Willeke
After a failed read, the page should no longer be pending, but rather
available for future allocation.
Signed-off-by: Petr Tesarik <ptesarik@suse.cz>
---
cache.c | 15 ++++++++++++---
cache.h | 1 +
makedumpfile.c | 8 +++++---
3 files changed, 18 insertions(+), 6 deletions(-)
diff --git a/cache.c b/cache.c
index 700ba0c..ad9f0f1 100644
--- a/cache.c
+++ b/cache.c
@@ -26,7 +26,8 @@ struct cache {
/* 8 pages covers 4-level paging plus 4 data pages */
#define CACHE_SIZE 8
-static struct cache_entry pool[CACHE_SIZE];
+static struct cache_entry entries[CACHE_SIZE];
+static struct cache_entry *pool[CACHE_SIZE];
static int avail = CACHE_SIZE;
static struct cache used, pending;
@@ -44,7 +45,8 @@ cache_init(void)
strerror(errno));
return FALSE;
}
- pool[i].bufptr = bufptr;
+ entries[i].bufptr = bufptr;
+ pool[i] = &entries[i];
}
return TRUE;
@@ -98,7 +100,7 @@ cache_alloc(unsigned long long paddr)
struct cache_entry *entry = NULL;
if (avail) {
- entry = &pool[--avail];
+ entry = pool[--avail];
add_entry(&pending, entry);
} else if (pending.tail) {
entry = pending.tail;
@@ -119,3 +121,10 @@ cache_add(struct cache_entry *entry)
remove_entry(&pending, entry);
add_entry(&used, entry);
}
+
+void
+cache_free(struct cache_entry *entry)
+{
+ remove_entry(&pending, entry);
+ pool[avail++] = entry;
+}
diff --git a/cache.h b/cache.h
index dab8eb9..0e65f97 100644
--- a/cache.h
+++ b/cache.h
@@ -29,5 +29,6 @@ int cache_init(void);
void *cache_search(unsigned long long paddr);
struct cache_entry *cache_alloc(unsigned long long paddr);
void cache_add(struct cache_entry *entry);
+void cache_free(struct cache_entry *entry);
#endif /* _CACHE_H */
diff --git a/makedumpfile.c b/makedumpfile.c
index 828adeb..c62d035 100644
--- a/makedumpfile.c
+++ b/makedumpfile.c
@@ -652,13 +652,13 @@ next_page:
if (info->flag_refiltering) {
if (!readpage_kdump_compressed(pgaddr, pgbuf))
- goto error;
+ goto error_cached;
} else if (info->flag_sadump) {
if (!readpage_sadump(pgaddr, pgbuf))
- goto error;
+ goto error_cached;
} else {
if (!readpage_elf(pgaddr, pgbuf))
- goto error;
+ goto error_cached;
}
cache_add(cached);
}
@@ -674,6 +674,8 @@ next_page:
return size_orig;
+error_cached:
+ cache_free(cached);
error:
ERRMSG("type_addr: %d, addr:%llx, size:%zd\n", type_addr, addr, size_orig);
return FALSE;
--
1.8.4.5
_______________________________________________
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec
^ permalink raw reply related [flat|nested] 20+ messages in thread
* [PATCH v2 3/8] cache: do not allocate from the pending list
2015-03-06 14:03 [PATCH v2 0/8] Handle mmaped regions in cache Petr Tesarik
2015-03-06 8:23 ` [PATCH v2 1/8] cache: get rid of search loop in cache_add() Petr Tesarik
2015-03-06 8:52 ` [PATCH v2 2/8] cache: allow to return a page to the pool Petr Tesarik
@ 2015-03-06 8:59 ` Petr Tesarik
2015-03-06 9:26 ` [PATCH v2 4/8] cache: add hit/miss statistics to the final report Petr Tesarik
` (6 subsequent siblings)
9 siblings, 0 replies; 20+ messages in thread
From: Petr Tesarik @ 2015-03-06 8:59 UTC (permalink / raw)
To: Atsushi Kumagai, Michael Holzheu; +Cc: kexec mailing list, Jan Willeke
Since pending entries are under read, they should not be reused. This
change allows recursive use of the cache (reading pages from within
readpage itself). Although this feature is not used by makedumpfile
right now, this was the original intention of the pending list.
The cache_alloc() function may return NULL if and only if the recursion
level is bigger than CACHE_SIZE.
Signed-off-by: Petr Tesarik <ptesarik@suse.cz>
---
cache.c | 5 +----
1 file changed, 1 insertion(+), 4 deletions(-)
diff --git a/cache.c b/cache.c
index ad9f0f1..344b4f6 100644
--- a/cache.c
+++ b/cache.c
@@ -101,16 +101,13 @@ cache_alloc(unsigned long long paddr)
if (avail) {
entry = pool[--avail];
- add_entry(&pending, entry);
- } else if (pending.tail) {
- entry = pending.tail;
} else if (used.tail) {
entry = used.tail;
remove_entry(&used, entry);
- add_entry(&pending, entry);
} else
return NULL;
+ add_entry(&pending, entry);
entry->paddr = paddr;
return entry;
}
--
1.8.4.5
_______________________________________________
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec
^ permalink raw reply related [flat|nested] 20+ messages in thread
* [PATCH v2 4/8] cache: add hit/miss statistics to the final report
2015-03-06 14:03 [PATCH v2 0/8] Handle mmaped regions in cache Petr Tesarik
` (2 preceding siblings ...)
2015-03-06 8:59 ` [PATCH v2 3/8] cache: do not allocate from the pending list Petr Tesarik
@ 2015-03-06 9:26 ` Petr Tesarik
2015-03-06 13:07 ` [PATCH v2 5/8] cache: allocate buffers in one big chunk Petr Tesarik
` (5 subsequent siblings)
9 siblings, 0 replies; 20+ messages in thread
From: Petr Tesarik @ 2015-03-06 9:26 UTC (permalink / raw)
To: Atsushi Kumagai, Michael Holzheu; +Cc: kexec mailing list, Jan Willeke
Add the most basic cache statistics (pages hit and missed). Note that
the hit rate is not printed if cache was not used to avoid division
by zero.
Signed-off-by: Petr Tesarik <ptesarik@suse.cz>
---
makedumpfile.c | 13 ++++++++++++-
1 file changed, 12 insertions(+), 1 deletion(-)
diff --git a/makedumpfile.c b/makedumpfile.c
index c62d035..d778139 100644
--- a/makedumpfile.c
+++ b/makedumpfile.c
@@ -39,6 +39,10 @@ struct SplitBlock *splitblock = NULL;
char filename_stdout[] = FILENAME_STDOUT;
+/* Cache statistics */
+static unsigned long long cache_hit;
+static unsigned long long cache_miss;
+
static void first_cycle(mdf_pfn_t start, mdf_pfn_t max, struct cycle *cycle)
{
cycle->start_pfn = round(start, info->pfn_cyclic);
@@ -645,6 +649,7 @@ next_page:
pgaddr = PAGEBASE(paddr);
pgbuf = cache_search(pgaddr);
if (!pgbuf) {
+ ++cache_miss;
cached = cache_alloc(pgaddr);
if (!cached)
goto error;
@@ -661,7 +666,8 @@ next_page:
goto error_cached;
}
cache_add(cached);
- }
+ } else
+ ++cache_hit;
memcpy(bufptr, pgbuf + PAGEOFFSET(paddr), read_size);
@@ -8294,6 +8300,11 @@ print_report(void)
REPORT_MSG("--------------------------------------------------\n");
REPORT_MSG("Total pages : 0x%016llx\n", info->max_mapnr);
REPORT_MSG("\n");
+ REPORT_MSG("Cache hit: %lld, miss: %lld", cache_hit, cache_miss);
+ if (cache_hit + cache_miss)
+ REPORT_MSG(", hit rate: %.1f%%",
+ 100.0 * cache_hit / (cache_hit + cache_miss));
+ REPORT_MSG("\n\n");
}
static void
--
1.8.4.5
_______________________________________________
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec
^ permalink raw reply related [flat|nested] 20+ messages in thread
* [PATCH v2 5/8] cache: allocate buffers in one big chunk
2015-03-06 14:03 [PATCH v2 0/8] Handle mmaped regions in cache Petr Tesarik
` (3 preceding siblings ...)
2015-03-06 9:26 ` [PATCH v2 4/8] cache: add hit/miss statistics to the final report Petr Tesarik
@ 2015-03-06 13:07 ` Petr Tesarik
2015-03-06 13:10 ` [PATCH v2 6/8] cache: allow arbitrary size of cache entries Petr Tesarik
` (4 subsequent siblings)
9 siblings, 0 replies; 20+ messages in thread
From: Petr Tesarik @ 2015-03-06 13:07 UTC (permalink / raw)
To: Atsushi Kumagai, Michael Holzheu; +Cc: kexec mailing list, Jan Willeke
This allows callers to change the buffer pointer, because it can be
reinitialized to the default value in cache_alloc() before returning
the cache entry.
Signed-off-by: Petr Tesarik <ptesarik@suse.cz>
---
cache.c | 26 +++++++++++++++-----------
1 file changed, 15 insertions(+), 11 deletions(-)
diff --git a/cache.c b/cache.c
index 344b4f6..ccd67ca 100644
--- a/cache.c
+++ b/cache.c
@@ -32,23 +32,23 @@ static int avail = CACHE_SIZE;
static struct cache used, pending;
+static void *cachebuf;
+
int
cache_init(void)
{
- void *bufptr;
int i;
- for (i = 0; i < CACHE_SIZE; ++i) {
- bufptr = malloc(info->page_size);
- if (bufptr == NULL) {
- ERRMSG("Can't allocate memory for cache. %s\n",
- strerror(errno));
- return FALSE;
- }
- entries[i].bufptr = bufptr;
- pool[i] = &entries[i];
+ cachebuf = malloc(info->page_size * CACHE_SIZE);
+ if (cachebuf == NULL) {
+ ERRMSG("Can't allocate memory for cache. %s\n",
+ strerror(errno));
+ return FALSE;
}
+ for (i = 0; i < CACHE_SIZE; ++i)
+ pool[i] = &entries[i];
+
return TRUE;
}
@@ -98,6 +98,7 @@ struct cache_entry *
cache_alloc(unsigned long long paddr)
{
struct cache_entry *entry = NULL;
+ int idx;
if (avail) {
entry = pool[--avail];
@@ -107,8 +108,11 @@ cache_alloc(unsigned long long paddr)
} else
return NULL;
- add_entry(&pending, entry);
+ idx = entry - entries;
entry->paddr = paddr;
+ entry->bufptr = cachebuf + idx * info->page_size;
+ add_entry(&pending, entry);
+
return entry;
}
--
1.8.4.5
_______________________________________________
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec
^ permalink raw reply related [flat|nested] 20+ messages in thread
* [PATCH v2 6/8] cache: allow arbitrary size of cache entries
2015-03-06 14:03 [PATCH v2 0/8] Handle mmaped regions in cache Petr Tesarik
` (4 preceding siblings ...)
2015-03-06 13:07 ` [PATCH v2 5/8] cache: allocate buffers in one big chunk Petr Tesarik
@ 2015-03-06 13:10 ` Petr Tesarik
2015-03-06 13:23 ` [PATCH v2 7/8] cache: store mapped regions directly in the cache Petr Tesarik
` (3 subsequent siblings)
9 siblings, 0 replies; 20+ messages in thread
From: Petr Tesarik @ 2015-03-06 13:10 UTC (permalink / raw)
To: Atsushi Kumagai, Michael Holzheu; +Cc: kexec mailing list, Jan Willeke
Until this commit the assumed length of all cache entries has been
exactly one page. To make other sizes possible, the length is now
stored in struct cache_entry. Note that cache_search may return a
pointer in the middle of a cache buffer.
Signed-off-by: Petr Tesarik <ptesarik@suse.cz>
---
cache.c | 12 ++++++++----
cache.h | 3 ++-
makedumpfile.c | 2 +-
3 files changed, 11 insertions(+), 6 deletions(-)
diff --git a/cache.c b/cache.c
index ccd67ca..938eda6 100644
--- a/cache.c
+++ b/cache.c
@@ -79,17 +79,20 @@ remove_entry(struct cache *cache, struct cache_entry *entry)
}
void *
-cache_search(unsigned long long paddr)
+cache_search(unsigned long long paddr, unsigned long length)
{
struct cache_entry *entry;
- for (entry = used.head; entry; entry = entry->next)
- if (entry->paddr == paddr) {
+ for (entry = used.head; entry; entry = entry->next) {
+ size_t off = paddr - entry->paddr;
+ if (off < entry->buflen &&
+ length <= entry->buflen - off) {
if (entry != used.head) {
remove_entry(&used, entry);
add_entry(&used, entry);
}
- return entry->bufptr;
+ return entry->bufptr + off;
}
+ }
return NULL; /* cache miss */
}
@@ -111,6 +114,7 @@ cache_alloc(unsigned long long paddr)
idx = entry - entries;
entry->paddr = paddr;
entry->bufptr = cachebuf + idx * info->page_size;
+ entry->buflen = info->page_size;
add_entry(&pending, entry);
return entry;
diff --git a/cache.h b/cache.h
index 0e65f97..792ba6c 100644
--- a/cache.h
+++ b/cache.h
@@ -22,11 +22,12 @@
struct cache_entry {
unsigned long long paddr;
void *bufptr;
+ unsigned long buflen;
struct cache_entry *next, *prev;
};
int cache_init(void);
-void *cache_search(unsigned long long paddr);
+void *cache_search(unsigned long long paddr, unsigned long length);
struct cache_entry *cache_alloc(unsigned long long paddr);
void cache_add(struct cache_entry *entry);
void cache_free(struct cache_entry *entry);
diff --git a/makedumpfile.c b/makedumpfile.c
index d778139..f1aad08 100644
--- a/makedumpfile.c
+++ b/makedumpfile.c
@@ -647,7 +647,7 @@ next_page:
read_size = MIN(info->page_size - PAGEOFFSET(paddr), size);
pgaddr = PAGEBASE(paddr);
- pgbuf = cache_search(pgaddr);
+ pgbuf = cache_search(pgaddr, read_size);
if (!pgbuf) {
++cache_miss;
cached = cache_alloc(pgaddr);
--
1.8.4.5
_______________________________________________
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec
^ permalink raw reply related [flat|nested] 20+ messages in thread
* [PATCH v2 7/8] cache: store mapped regions directly in the cache
2015-03-06 14:03 [PATCH v2 0/8] Handle mmaped regions in cache Petr Tesarik
` (5 preceding siblings ...)
2015-03-06 13:10 ` [PATCH v2 6/8] cache: allow arbitrary size of cache entries Petr Tesarik
@ 2015-03-06 13:23 ` Petr Tesarik
2015-03-06 13:28 ` [PATCH v2 8/8] cleanup: remove unused page_is_fractional Petr Tesarik
` (2 subsequent siblings)
9 siblings, 0 replies; 20+ messages in thread
From: Petr Tesarik @ 2015-03-06 13:23 UTC (permalink / raw)
To: Atsushi Kumagai, Michael Holzheu; +Cc: kexec mailing list, Jan Willeke
Avoid copying data between the mmapped region and the cache. To do that,
readmem() tries to map the page before reading it. The mmap path and
the read path are separated: mappage_elf() uses the mmap syscall, and
readpage_elf() uses the read syscall.
If mmap is successful, readmem() stores the mmap address in the cache
entry. Of course, the mapping must not be removed until the cache entry
is evicted, but the cache code has no knowledge about mmap. To solve that
in a flexible way, a discard callback is added to struct cache_entry and
called by cache eviction code.
Signed-off-by: Petr Tesarik <ptesarik@suse.cz>
---
cache.c | 3 ++
cache.h | 2 ++
makedumpfile.c | 105 ++++++++++++++++++++++++++++++++-------------------------
3 files changed, 64 insertions(+), 46 deletions(-)
diff --git a/cache.c b/cache.c
index 938eda6..963eb76 100644
--- a/cache.c
+++ b/cache.c
@@ -108,6 +108,8 @@ cache_alloc(unsigned long long paddr)
} else if (used.tail) {
entry = used.tail;
remove_entry(&used, entry);
+ if (entry->discard)
+ entry->discard(entry);
} else
return NULL;
@@ -115,6 +117,7 @@ cache_alloc(unsigned long long paddr)
entry->paddr = paddr;
entry->bufptr = cachebuf + idx * info->page_size;
entry->buflen = info->page_size;
+ entry->discard = NULL;
add_entry(&pending, entry);
return entry;
diff --git a/cache.h b/cache.h
index 792ba6c..c55cec4 100644
--- a/cache.h
+++ b/cache.h
@@ -24,6 +24,8 @@ struct cache_entry {
void *bufptr;
unsigned long buflen;
struct cache_entry *next, *prev;
+
+ void (*discard)(struct cache_entry *);
};
int cache_init(void);
diff --git a/makedumpfile.c b/makedumpfile.c
index f1aad08..827c36f 100644
--- a/makedumpfile.c
+++ b/makedumpfile.c
@@ -294,6 +294,12 @@ read_page_desc(unsigned long long paddr, page_desc_t *pd)
return TRUE;
}
+static void
+unmap_cache(struct cache_entry *entry)
+{
+ munmap(entry->bufptr, entry->buflen);
+}
+
static int
update_mmap_range(off_t offset, int initial) {
off_t start_offset, end_offset;
@@ -301,9 +307,6 @@ update_mmap_range(off_t offset, int initial) {
off_t max_offset = get_max_file_offset();
off_t pt_load_end = offset_to_pt_load_end(offset);
- munmap(info->mmap_buf,
- info->mmap_end_offset - info->mmap_start_offset);
-
/*
* offset for mmap() must be page aligned.
*/
@@ -357,29 +360,45 @@ initialize_mmap(void) {
return TRUE;
}
-static int
-read_with_mmap(off_t offset, void *bufptr, unsigned long size) {
- size_t read_size;
+static char *
+mappage_elf(unsigned long long paddr)
+{
+ off_t offset, offset2;
-next_region:
+ if (info->flag_usemmap != MMAP_ENABLE)
+ return NULL;
- if (!is_mapped_with_mmap(offset))
- if (!update_mmap_range(offset, 0))
- return FALSE;
+ offset = paddr_to_offset(paddr);
+ if (!offset)
+ return NULL;
+
+ offset2 = paddr_to_offset(paddr + info->page_size);
+ if (!offset2)
+ return NULL;
- read_size = MIN(info->mmap_end_offset - offset, size);
+ if (offset2 - offset != info->page_size)
+ return NULL;
- memcpy(bufptr, info->mmap_buf +
- (offset - info->mmap_start_offset), read_size);
+ if (!is_mapped_with_mmap(offset) &&
+ !update_mmap_range(offset, 0)) {
+ ERRMSG("Can't read the dump memory(%s) with mmap().\n",
+ info->name_memory);
- offset += read_size;
- bufptr += read_size;
- size -= read_size;
+ ERRMSG("This kernel might have some problems about mmap().\n");
+ ERRMSG("read() will be used instead of mmap() from now.\n");
- if (size > 0)
- goto next_region;
+ /*
+ * Fall back to read().
+ */
+ info->flag_usemmap = MMAP_DISABLE;
+ return NULL;
+ }
- return TRUE;
+ if (offset < info->mmap_start_offset ||
+ offset + info->page_size > info->mmap_end_offset)
+ return NULL;
+
+ return info->mmap_buf + (offset - info->mmap_start_offset);
}
static int
@@ -387,33 +406,16 @@ read_from_vmcore(off_t offset, void *bufptr, unsigned long size)
{
const off_t failed = (off_t)-1;
- if (info->flag_usemmap == MMAP_ENABLE &&
- page_is_fractional(offset) == FALSE) {
- if (!read_with_mmap(offset, bufptr, size)) {
- ERRMSG("Can't read the dump memory(%s) with mmap().\n",
- info->name_memory);
-
- ERRMSG("This kernel might have some problems about mmap().\n");
- ERRMSG("read() will be used instead of mmap() from now.\n");
-
- /*
- * Fall back to read().
- */
- info->flag_usemmap = MMAP_DISABLE;
- read_from_vmcore(offset, bufptr, size);
- }
- } else {
- if (lseek(info->fd_memory, offset, SEEK_SET) == failed) {
- ERRMSG("Can't seek the dump memory(%s). (offset: %llx) %s\n",
- info->name_memory, (unsigned long long)offset, strerror(errno));
- return FALSE;
- }
+ if (lseek(info->fd_memory, offset, SEEK_SET) == failed) {
+ ERRMSG("Can't seek the dump memory(%s). (offset: %llx) %s\n",
+ info->name_memory, (unsigned long long)offset, strerror(errno));
+ return FALSE;
+ }
- if (read(info->fd_memory, bufptr, size) != size) {
- ERRMSG("Can't read the dump memory(%s). %s\n",
- info->name_memory, strerror(errno));
- return FALSE;
- }
+ if (read(info->fd_memory, bufptr, size) != size) {
+ ERRMSG("Can't read the dump memory(%s). %s\n",
+ info->name_memory, strerror(errno));
+ return FALSE;
}
return TRUE;
@@ -662,7 +664,18 @@ next_page:
if (!readpage_sadump(pgaddr, pgbuf))
goto error_cached;
} else {
- if (!readpage_elf(pgaddr, pgbuf))
+ char *mapbuf = mappage_elf(pgaddr);
+ size_t mapoff;
+
+ if (mapbuf) {
+ pgbuf = mapbuf;
+ mapoff = mapbuf - info->mmap_buf;
+ cached->paddr = pgaddr - mapoff;
+ cached->bufptr = info->mmap_buf;
+ cached->buflen = info->mmap_end_offset -
+ info->mmap_start_offset;
+ cached->discard = unmap_cache;
+ } else if (!readpage_elf(pgaddr, pgbuf))
goto error_cached;
}
cache_add(cached);
--
1.8.4.5
_______________________________________________
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec
^ permalink raw reply related [flat|nested] 20+ messages in thread
* [PATCH v2 8/8] cleanup: remove unused page_is_fractional
2015-03-06 14:03 [PATCH v2 0/8] Handle mmaped regions in cache Petr Tesarik
` (6 preceding siblings ...)
2015-03-06 13:23 ` [PATCH v2 7/8] cache: store mapped regions directly in the cache Petr Tesarik
@ 2015-03-06 13:28 ` Petr Tesarik
2015-03-09 16:08 ` [PATCH v2 0/8] Handle mmaped regions in cache Michael Holzheu
2015-03-09 16:08 ` [PATCH v2 0/8] Handle mmaped regions in cache [more analysis] Michael Holzheu
9 siblings, 0 replies; 20+ messages in thread
From: Petr Tesarik @ 2015-03-06 13:28 UTC (permalink / raw)
To: Atsushi Kumagai, Michael Holzheu; +Cc: kexec mailing list, Jan Willeke
This function is no longer needed, because mappage_elf() now checks
whether the region is contiguous both in physical addresses and in
file offsets.
Signed-off-by: Petr Tesarik <ptesarik@suse.cz>
---
elf_info.c | 16 ----------------
elf_info.h | 2 --
2 files changed, 18 deletions(-)
diff --git a/elf_info.c b/elf_info.c
index 8bce942..f584393 100644
--- a/elf_info.c
+++ b/elf_info.c
@@ -583,22 +583,6 @@ offset_to_pt_load_end(off_t offset)
return pt_load_end;
}
-/*
- * Judge whether the page is fractional or not.
- */
-int
-page_is_fractional(off_t page_offset)
-{
- if (page_offset % info->page_size != 0)
- return TRUE;
-
- if (offset_to_pt_load_end(page_offset) - page_offset
- < info->page_size)
- return TRUE;
-
- return FALSE;
-}
-
unsigned long long
vaddr_to_paddr_general(unsigned long long vaddr)
{
diff --git a/elf_info.h b/elf_info.h
index e712253..7ae6bf8 100644
--- a/elf_info.h
+++ b/elf_info.h
@@ -40,8 +40,6 @@ unsigned long long vaddr_to_paddr_general(unsigned long long vaddr);
off_t vaddr_to_offset_slow(int fd, char *filename, unsigned long long vaddr);
unsigned long long get_max_paddr(void);
-int page_is_fractional(off_t page_offset);
-
int get_elf64_ehdr(int fd, char *filename, Elf64_Ehdr *ehdr);
int get_elf32_ehdr(int fd, char *filename, Elf32_Ehdr *ehdr);
int get_elf_info(int fd, char *filename);
--
1.8.4.5
_______________________________________________
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec
^ permalink raw reply related [flat|nested] 20+ messages in thread
* [PATCH v2 0/8] Handle mmaped regions in cache
@ 2015-03-06 14:03 Petr Tesarik
2015-03-06 8:23 ` [PATCH v2 1/8] cache: get rid of search loop in cache_add() Petr Tesarik
` (9 more replies)
0 siblings, 10 replies; 20+ messages in thread
From: Petr Tesarik @ 2015-03-06 14:03 UTC (permalink / raw)
To: Atsushi Kumagai, Michael Holzheu; +Cc: kexec mailing list, Jan Willeke
Because all pages must go into the cache, data is unnecessarily
copied from mmapped regions to cache. Avoid this copying by storing
the mmapped regions directly in the cache.
First, the cache code needs a cleanup and clarification of the concept,
especially the meaning of the pending list (allocated cache entries
whose content is not yet valid).
Second, the cache must be able to handle differently sized objects
so that it can store individual pages as well as mmapped regions.
Last, the cache eviction code must be extended to allow either
reusing the read buffer or unmapping the region.
Changelog:
v2: cache cleanup _and_ actual mmap implementation
v1: only the cache cleanup
Petr Tesarik (8):
cache: get rid of search loop in cache_add()
cache: allow to return a page to the pool
cache: do not allocate from the pending list
cache: add hit/miss statistics to the final report
cache: allocate buffers in one big chunk
cache: allow arbitrary size of cache entries
cache: store mapped regions directly in the cache
cleanup: remove unused page_is_fractional
cache.c | 81 +++++++++++++++++----------------
cache.h | 16 +++++--
elf_info.c | 16 -------
elf_info.h | 2 -
makedumpfile.c | 138 ++++++++++++++++++++++++++++++++++-----------------------
5 files changed, 138 insertions(+), 115 deletions(-)
--
1.8.4.5
_______________________________________________
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec
^ permalink raw reply [flat|nested] 20+ messages in thread
* Re: [PATCH v2 0/8] Handle mmaped regions in cache
2015-03-06 14:03 [PATCH v2 0/8] Handle mmaped regions in cache Petr Tesarik
` (7 preceding siblings ...)
2015-03-06 13:28 ` [PATCH v2 8/8] cleanup: remove unused page_is_fractional Petr Tesarik
@ 2015-03-09 16:08 ` Michael Holzheu
2015-03-09 16:08 ` [PATCH v2 0/8] Handle mmaped regions in cache [more analysis] Michael Holzheu
9 siblings, 0 replies; 20+ messages in thread
From: Michael Holzheu @ 2015-03-09 16:08 UTC (permalink / raw)
To: Petr Tesarik; +Cc: kexec mailing list, Atsushi Kumagai, Jan Willeke
Hello Petr,
With your patch we get 5-10 percent performance improvement on s390.
Two examples:
$ time ./makedumpfile -d 31 /proc/vmcore /dev/null -f
user sys = total
=====================================
With patches 0.156 0.248 0.404
Without patches 0.168 0.274 0.442 <- No idea why we save system time?
$ time ./makedumpfile -d 0 /proc/vmcore /dev/null -f
user sys = total
=====================================
With patches 0.683 0.020 0.703
Without patches 0.714 0.019 0.733
Regards
Michael
On Fri, 6 Mar 2015 15:03:12 +0100
Petr Tesarik <ptesarik@suse.cz> wrote:
> Because all pages must go into the cache, data is unnecessarily
> copied from mmapped regions to cache. Avoid this copying by storing
> the mmapped regions directly in the cache.
>
> First, the cache code needs a clean up clarification of the concept,
> especially the meaning of the pending list (allocated cache entries
> whose content is not yet valid).
>
> Second, the cache must be able to handle differently sized objects
> so that it can store individual pages as well as mmapped regions.
>
> Last, the cache eviction code must be extended to allow either
> reusing the read buffer or unmapping the region.
>
> Changelog:
> v2: cache cleanup _and_ actuall mmap implementation
> v1: only the cache cleanup
>
> Petr Tesarik (8):
> cache: get rid of search loop in cache_add()
> cache: allow to return a page to the pool
> cache: do not allocate from the pending list
> cache: add hit/miss statistics to the final report
> cache: allocate buffers in one big chunk
> cache: allow arbitrary size of cache entries
> cache: store mapped regions directly in the cache
> cleanup: remove unused page_is_fractional
>
> cache.c | 81 +++++++++++++++++----------------
> cache.h | 16 +++++--
> elf_info.c | 16 -------
> elf_info.h | 2 -
> makedumpfile.c | 138 ++++++++++++++++++++++++++++++++++-----------------------
> 5 files changed, 138 insertions(+), 115 deletions(-)
>
_______________________________________________
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec
^ permalink raw reply [flat|nested] 20+ messages in thread
* Re: [PATCH v2 0/8] Handle mmaped regions in cache [more analysis]
2015-03-06 14:03 [PATCH v2 0/8] Handle mmaped regions in cache Petr Tesarik
` (8 preceding siblings ...)
2015-03-09 16:08 ` [PATCH v2 0/8] Handle mmaped regions in cache Michael Holzheu
@ 2015-03-09 16:08 ` Michael Holzheu
2015-03-12 13:56 ` Michael Holzheu
2015-03-12 15:38 ` Petr Tesarik
9 siblings, 2 replies; 20+ messages in thread
From: Michael Holzheu @ 2015-03-09 16:08 UTC (permalink / raw)
To: Petr Tesarik; +Cc: kexec mailing list, Atsushi Kumagai, Jan Willeke
Hello Petr,
With your patches I now used "perf record" and "perf stat"
to check where the CPU time is consumed for -d31 and -d0.
For -d31 the read case is better and for -d0 the mmap case
is better.
$ time ./makedumpfile -d 31 /proc/vmcore /dev/null -f [--non-mmap]
user sys = total
=======================================
mmap 0.156 0.248 0.404
non-mmap 0.090 0.180 0.270
$ time ./makedumpfile -d 0 /proc/vmcore /dev/null -f [--non-mmap]
user sys = total
======================================
mmap 0.637 0.018 0.655
non-mmap 0.275 1.153 1.428
As already said, we think the reason is that for -d0 we issue
only a small number of mmap/munmap calls because the mmap
chunks are larger than the read chunks.
For -d31 memory is fragmented and we issue lots of small
mmap/munmap calls. Because munmap (at least on s390) is a
very expensive operation and we need two calls (mmap/munmap),
the mmap mode is slower than the read mode.
I counted the mmap and read system calls with "perf stat":
mmap unmap read = sum
===============================================
mmap -d0 482 443 165 1090
mmap -d31 13454 13414 165 27033
non-mmap -d0 34 3 458917 458954
non-mmap -d31 34 3 74273 74310
Here the actual results I got with "perf record":
$ time ./makedumpfile -d 31 /proc/vmcore /dev/null -f
Output of "perf report" for mmap case:
/* Most time spent for unmap in kernel */
29.75% makedumpfile [kernel.kallsyms] [k] unmap_single_vma
9.84% makedumpfile [kernel.kallsyms] [k] remap_pfn_range
8.49% makedumpfile [kernel.kallsyms] [k] vm_normal_page
/* Still some mmap overhead in makedumpfile readmem() */
21.56% makedumpfile makedumpfile [.] readmem
8.49% makedumpfile makedumpfile [.] write_kdump_pages_cyclic
Output of "perf report" for non-mmap case:
/* Time spent for sys_read (that needs also two copy operations on s390 :() */
25.32% makedumpfile [kernel.kallsyms] [k] memcpy_real
22.74% makedumpfile [kernel.kallsyms] [k] __copy_to_user
/* readmem() for read path is cheaper ? */
13.49% makedumpfile makedumpfile [.] write_kdump_pages_cyclic
4.53% makedumpfile makedumpfile [.] readmem
$ time ./makedumpfile -d 0 /proc/vmcore /dev/null -f
Output of "perf report" for mmap case:
/* Almost no kernel time because we issue very few system calls */
0.61% makedumpfile [kernel.kallsyms] [k] unmap_single_vma
0.61% makedumpfile [kernel.kallsyms] [k] sysc_do_svc
/* Almost all time consumed in user space */
84.64% makedumpfile makedumpfile [.] readmem
8.82% makedumpfile makedumpfile [.] write_cache
Output of "perf report" for non-mmap case:
/* Time spent for sys_read (that needs also two copy operations on s390) */
31.50% makedumpfile [kernel.kallsyms] [k] memcpy_real
29.33% makedumpfile [kernel.kallsyms] [k] __copy_to_user
/* Very little user space time */
3.87% makedumpfile makedumpfile [.] write_cache
3.82% makedumpfile makedumpfile [.] readmem
I hope this analysis helps more than it confuses :-)
As a conclusion, we could think of mapping larger chunks
also for the fragmented case of -d 31 to reduce the amount
of mmap/munmap calls.
Another open question was why the mmap case consumes more CPU
time in readmem() than the read case. Our theory is that the
first memory access is slower because it is not in the HW
cache. For the mmap case userspace issues the first access (copy
to makedumpfile cache) and for the read case the kernel issues
the first access (memcpy_real/copy_to_user). Therefore the
cache miss is accounted to userspace for mmap and to kernel for
read.
And last but not least, perhaps on s390 we could replace
the bounce buffer used for memcpy_real()/copy_to_user() by
some more intelligent solution.
Best Regards
Michael
On Fri, 6 Mar 2015 15:03:12 +0100
Petr Tesarik <ptesarik@suse.cz> wrote:
> Because all pages must go into the cache, data is unnecessarily
> copied from mmapped regions to cache. Avoid this copying by storing
> the mmapped regions directly in the cache.
>
> First, the cache code needs a cleanup and clarification of the concept,
> especially the meaning of the pending list (allocated cache entries
> whose content is not yet valid).
>
> Second, the cache must be able to handle differently sized objects
> so that it can store individual pages as well as mmapped regions.
>
> Last, the cache eviction code must be extended to allow either
> reusing the read buffer or unmapping the region.
>
> Changelog:
> v2: cache cleanup _and_ actual mmap implementation
> v1: only the cache cleanup
>
> Petr Tesarik (8):
> cache: get rid of search loop in cache_add()
> cache: allow to return a page to the pool
> cache: do not allocate from the pending list
> cache: add hit/miss statistics to the final report
> cache: allocate buffers in one big chunk
> cache: allow arbitrary size of cache entries
> cache: store mapped regions directly in the cache
> cleanup: remove unused page_is_fractional
>
> cache.c | 81 +++++++++++++++++----------------
> cache.h | 16 +++++--
> elf_info.c | 16 -------
> elf_info.h | 2 -
> makedumpfile.c | 138 ++++++++++++++++++++++++++++++++++-----------------------
> 5 files changed, 138 insertions(+), 115 deletions(-)
>
_______________________________________________
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec
^ permalink raw reply [flat|nested] 20+ messages in thread
* Re: [PATCH v2 0/8] Handle mmaped regions in cache [more analysis]
2015-03-09 16:08 ` [PATCH v2 0/8] Handle mmaped regions in cache [more analysis] Michael Holzheu
@ 2015-03-12 13:56 ` Michael Holzheu
2015-03-12 15:38 ` Petr Tesarik
1 sibling, 0 replies; 20+ messages in thread
From: Michael Holzheu @ 2015-03-12 13:56 UTC (permalink / raw)
To: Michael Holzheu
Cc: kexec mailing list, Atsushi Kumagai, Petr Tesarik, Jan Willeke
On Mon, 9 Mar 2015 17:08:58 +0100
Michael Holzheu <holzheu@linux.vnet.ibm.com> wrote:
> Hello Petr,
>
[snip]
> As a conclusion, we could think of mapping larger chunks
> also for the fragmented case of -d 31 to reduce the amount
> of mmap/munmap calls.
>
FYI: I did some more tests and I am no longer sure if the above
conclusion was correct.
A simple "copy" program that reads or maps/unmaps every page
from /proc/vmcore and then writes it to /dev/null is faster
with mmap()/munmap() than with using read():
read:
-----
# time ./copy /dev/null read
real 0m1.072s
user 0m0.010s
sys 0m1.054s
# perf stat -e syscalls:sys_enter_old_mmap,syscalls:sys_enter_munmap,syscalls:sys_enter_read ./copy /dev/null read
8 syscalls:sys_enter_old_mmap
1 syscalls:sys_enter_munmap
458753 syscalls:sys_enter_read
1.405457536 seconds time elapsed
mmap:
-----
# time ./copy /dev/null mmap
real 0m0.947s
user 0m0.314s
sys 0m0.631s
# perf stat -e syscalls:sys_enter_old_mmap,syscalls:sys_enter_munmap,syscalls:sys_enter_read ./copy /dev/null mmap
458760 syscalls:sys_enter_old_mmap
458753 syscalls:sys_enter_munmap
1 syscalls:sys_enter_read
1.175956735 seconds time elapsed
Regards,
Michael
_______________________________________________
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec
^ permalink raw reply [flat|nested] 20+ messages in thread
* Re: [PATCH v2 0/8] Handle mmaped regions in cache [more analysis]
2015-03-09 16:08 ` [PATCH v2 0/8] Handle mmaped regions in cache [more analysis] Michael Holzheu
2015-03-12 13:56 ` Michael Holzheu
@ 2015-03-12 15:38 ` Petr Tesarik
2015-03-13 4:10 ` Atsushi Kumagai
2015-03-13 16:19 ` Michael Holzheu
1 sibling, 2 replies; 20+ messages in thread
From: Petr Tesarik @ 2015-03-12 15:38 UTC (permalink / raw)
To: Michael Holzheu; +Cc: Atsushi Kumagai, kexec mailing list, Jan Willeke
On Mon, 9 Mar 2015 17:08:58 +0100
Michael Holzheu <holzheu@linux.vnet.ibm.com> wrote:
> Hello Petr,
>
> With your patches I now used "perf record" and "perf stat"
> to check where the CPU time is consumed for -d31 and -d0.
>
> For -d31 the read case is better and for -d0 the mmap case
> is better.
>
>[...]
>
> As already said, we think the reason is that for -d0 we issue
> only a small number of mmap/munmap calls because the mmap
> chunks are larger than the read chunks.
This is very likely.
> For -d31 memory is fragmented and we issue lots of small
> mmap/munmap calls. Because munmap (at least on s390) is a
> very expensive operation and we need two calls (mmap/munmap),
> the mmap mode is slower that the read mode.
Yes. And it may provide an explanation why my patch set improves the
situation. By keeping the mmapped regions in the cache, rather than
individual pages copied out of the mmap region, the cache is in fact
much larger, resulting in less mmap/munmap syscalls.
> I counted the mmap and read system calls with "perf stat":
>
> mmap unmap read = sum
> ===============================================
> mmap -d0 482 443 165 1090
> mmap -d31 13454 13414 165 27033
> non-mmap -d0 34 3 458917 458954
> non-mmap -d31 34 3 74273 74310
If your VM has 1.5 GiB of RAM, then the numbers for -d0 look
reasonable. For -d31, we should be able to do better than this
by allocating more cache slots and improving the algorithm.
I originally didn't deem it worth the effort, but seeing almost
30 times more mmaps than with -d0 may change my mind.
> Here the actual results I got with "perf record":
>
> $ time ./makedumpfile -d 31 /proc/vmcore /dev/null -f
>
> Output of "perf report" for mmap case:
>
> /* Most time spent for unmap in kernel */
> 29.75% makedumpfile [kernel.kallsyms] [k] unmap_single_vma
> 9.84% makedumpfile [kernel.kallsyms] [k] remap_pfn_range
> 8.49% makedumpfile [kernel.kallsyms] [k] vm_normal_page
>
> /* Still some mmap overhead in makedumpfile readmem() */
> 21.56% makedumpfile makedumpfile [.] readmem
This number is interesting. Did you compile makedumpfile with
optimizations? If yes, then this number probably includes some
functions which were inlined.
> 8.49% makedumpfile makedumpfile [.] write_kdump_pages_cyclic
>
> Output of "perf report" for non-mmap case:
>
> /* Time spent for sys_read (that needs also two copy operations on s390 :() */
> 25.32% makedumpfile [kernel.kallsyms] [k] memcpy_real
> 22.74% makedumpfile [kernel.kallsyms] [k] __copy_to_user
>
> /* readmem() for read path is cheaper ? */
> 13.49% makedumpfile makedumpfile [.] write_kdump_pages_cyclic
> 4.53% makedumpfile makedumpfile [.] readmem
Yes, much lower overhead of readmem is strange. For a moment I
suspected wrong accounting of the page fault handler, but then I
realized that for /proc/vmcore, all page table entries are created
with the present bit set already, so there are no page faults...
I haven't had time yet to set up a system for reproduction, but I'll
try to identify what's eating up the CPU time in readmem().
>[...]
> I hope this analysis helps more than it confuses :-)
>
> As a conclusion, we could think of mapping larger chunks
> also for the fragmented case of -d 31 to reduce the amount
> of mmap/munmap calls.
I agree in general. Memory mapped through /proc/vmcore does not
increase run-time memory requirements, because it only adds a mapping
to the old kernel's memory. The only limiting factor is the virtual
address space. On many architectures, this is no issue at all, and we
could simply map the whole file at beginning. On some architectures,
the virtual address space is smaller than possible physical RAM, so
this approach would not work for them.
> Another open question was why the mmap case consumes more CPU
> time in readmem() than the read case. Our theory is that the
> first memory access is slower because it is not in the HW
> cache. For the mmap case userspace issues the first access (copy
> to makedumpfile cache) and for the read case the kernel issues
> the first access (memcpy_real/copy_to_user). Therefore the
> cache miss is accounted to userspace for mmap and to kernel for
> read.
I have no idea how to measure this on s390. On x86_64 I would add some
asm code to read TSC before and after the memory access instruction. I
guess there is a similar counter on s390. Suggestions?
> And last but not least, perhaps on s390 we could replace
> the bounce buffer used for memcpy_real()/copy_to_user() by
> some more intelligent solution.
Which would then improve the non-mmap times even more, right?
Petr T
_______________________________________________
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec
^ permalink raw reply [flat|nested] 20+ messages in thread
* RE: [PATCH v2 0/8] Handle mmaped regions in cache [more analysis]
2015-03-12 15:38 ` Petr Tesarik
@ 2015-03-13 4:10 ` Atsushi Kumagai
2015-03-13 8:04 ` Petr Tesarik
2015-03-13 16:19 ` Michael Holzheu
1 sibling, 1 reply; 20+ messages in thread
From: Atsushi Kumagai @ 2015-03-13 4:10 UTC (permalink / raw)
To: ptesarik; +Cc: holzheu, kexec, willeke
Hello,
(Note: my email address has changed.)
In x86_64, calling ioremap/iounmap per page in copy_oldmem_page()
causes big performance degradation, so mmap() was introduced on
/proc/vmcore. However, there is no big difference between read() and
mmap() in s390 since it doesn't need ioremap/iounmap in copy_oldmem_page(),
so other issues have been revealed, right?
[...]
>> I counted the mmap and read system calls with "perf stat":
>>
>> mmap unmap read = sum
>> ===============================================
>> mmap -d0 482 443 165 1090
>> mmap -d31 13454 13414 165 27033
>> non-mmap -d0 34 3 458917 458954
>> non-mmap -d31 34 3 74273 74310
>
>If your VM has 1.5 GiB of RAM, then the numbers for -d0 look
>reasonable. For -d31, we should be able to do better than this
>by allocating more cache slots and improving the algorithm.
>I originally didn't deem it worth the effort, but seeing almost
>30 times more mmaps than with -d0 may change my mind.
Are you going to do it as v3 patch?
I'm going to release v1.5.8 soon, so I'll adopt v2 patch if
you don't think updating it.
Thanks
Atsushi Kumagai
>
>> Here the actual results I got with "perf record":
>>
>> $ time ./makedumpfile -d 31 /proc/vmcore /dev/null -f
>>
>> Output of "perf report" for mmap case:
>>
>> /* Most time spent for unmap in kernel */
>> 29.75% makedumpfile [kernel.kallsyms] [k] unmap_single_vma
>> 9.84% makedumpfile [kernel.kallsyms] [k] remap_pfn_range
>> 8.49% makedumpfile [kernel.kallsyms] [k] vm_normal_page
>>
>> /* Still some mmap overhead in makedumpfile readmem() */
>> 21.56% makedumpfile makedumpfile [.] readmem
>
>This number is interesting. Did you compile makedumpfile with
>optimizations? If yes, then this number probably includes some
>functions which were inlined.
>
>> 8.49% makedumpfile makedumpfile [.] write_kdump_pages_cyclic
>>
>> Output of "perf report" for non-mmap case:
>>
>> /* Time spent for sys_read (that needs also two copy operations on s390 :() */
>> 25.32% makedumpfile [kernel.kallsyms] [k] memcpy_real
>> 22.74% makedumpfile [kernel.kallsyms] [k] __copy_to_user
>>
>> /* readmem() for read path is cheaper ? */
>> 13.49% makedumpfile makedumpfile [.] write_kdump_pages_cyclic
>> 4.53% makedumpfile makedumpfile [.] readmem
>
>Yes, much lower overhead of readmem is strange. For a moment I
>suspected wrong accounting of the page fault handler, but then I
>realized that for /proc/vmcore, all page table entries are created
>with the present bit set already, so there are no page faults...
>
>I haven't had time yet to set up a system for reproduction, but I'll
>try to identify what's eating up the CPU time in readmem().
>
>>[...]
>> I hope this analysis helps more than it confuses :-)
>>
>> As a conclusion, we could think of mapping larger chunks
>> also for the fragmented case of -d 31 to reduce the amount
>> of mmap/munmap calls.
>
>I agree in general. Memory mapped through /proc/vmcore does not
>increase run-time memory requirements, because it only adds a mapping
>to the old kernel's memory. The only limiting factor is the virtual
>address space. On many architectures, this is no issue at all, and we
>could simply map the whole file at beginning. On some architectures,
>the virtual address space is smaller than possible physical RAM, so
>this approach would not work for them.
>
>> Another open question was why the mmap case consumes more CPU
>> time in readmem() than the read case. Our theory is that the
>> first memory access is slower because it is not in the HW
>> cache. For the mmap case userspace issues the first access (copy
>> to makedumpfile cache) and for the read case the kernel issues
>> the first access (memcpy_real/copy_to_user). Therefore the
>> cache miss is accounted to userspace for mmap and to kernel for
>> read.
>
>I have no idea how to measure this on s390. On x86_64 I would add some
>asm code to read TSC before and after the memory access instruction. I
>guess there is a similar counter on s390. Suggestions?
>
>> And last but not least, perhaps on s390 we could replace
>> the bounce buffer used for memcpy_real()/copy_to_user() by
>> some more intelligent solution.
>
>Which would then improve the non-mmap times even more, right?
>
>Petr T
_______________________________________________
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec
^ permalink raw reply [flat|nested] 20+ messages in thread
* Re: [PATCH v2 0/8] Handle mmaped regions in cache [more analysis]
2015-03-13 4:10 ` Atsushi Kumagai
@ 2015-03-13 8:04 ` Petr Tesarik
2015-03-16 5:14 ` Atsushi Kumagai
0 siblings, 1 reply; 20+ messages in thread
From: Petr Tesarik @ 2015-03-13 8:04 UTC (permalink / raw)
To: Atsushi Kumagai; +Cc: holzheu, kexec, willeke
On Fri, 13 Mar 2015 04:10:22 +0000
Atsushi Kumagai <ats-kumagai@wm.jp.nec.com> wrote:
> Hello,
>
> (Note: my email address has changed.)
>
> In x86_64, calling ioremap/iounmap per page in copy_oldmem_page()
> causes big performance degradation, so mmap() was introduced on
> /proc/vmcore. However, there is no big difference between read() and
> mmap() in s390 since it doesn't need ioremap/iounmap in copy_oldmem_page(),
> so other issues have been revealed, right?
>
> [...]
>
> >> I counted the mmap and read system calls with "perf stat":
> >>
> >> mmap unmap read = sum
> >> ===============================================
> >> mmap -d0 482 443 165 1090
> >> mmap -d31 13454 13414 165 27033
> >> non-mmap -d0 34 3 458917 458954
> >> non-mmap -d31 34 3 74273 74310
> >
> >If your VM has 1.5 GiB of RAM, then the numbers for -d0 look
> >reasonable. For -d31, we should be able to do better than this
> >by allocating more cache slots and improving the algorithm.
> >I originally didn't deem it worth the effort, but seeing almost
> >30 times more mmaps than with -d0 may change my mind.
>
> Are you going to do it as v3 patch?
No. Tuning the caching algorithm requires a lot of research. I plan to
do it, but testing it with all scenarios (and tuning the algorithm
based on the results) will probably take weeks. I don't think it makes
sense to wait for it.
> I'm going to release v1.5.8 soon, so I'll adopt v2 patch if
> you don't think updating it.
Since v2 already brings some performance gain, I appreciate it if you
can adopt it for v1.5.8.
Thank you very much,
Petr Tesarik
> Thanks
> Atsushi Kumagai
>
> >
> >> Here the actual results I got with "perf record":
> >>
> >> $ time ./makedumpfile -d 31 /proc/vmcore /dev/null -f
> >>
> >> Output of "perf report" for mmap case:
> >>
> >> /* Most time spent for unmap in kernel */
> >> 29.75% makedumpfile [kernel.kallsyms] [k] unmap_single_vma
> >> 9.84% makedumpfile [kernel.kallsyms] [k] remap_pfn_range
> >> 8.49% makedumpfile [kernel.kallsyms] [k] vm_normal_page
> >>
> >> /* Still some mmap overhead in makedumpfile readmem() */
> >> 21.56% makedumpfile makedumpfile [.] readmem
> >
> >This number is interesting. Did you compile makedumpfile with
> >optimizations? If yes, then this number probably includes some
> >functions which were inlined.
> >
> >> 8.49% makedumpfile makedumpfile [.]
> >> write_kdump_pages_cyclic
> >>
> >> Output of "perf report" for non-mmap case:
> >>
> >> /* Time spent for sys_read (that needs also two copy operations
> >> on s390 :() */ 25.32% makedumpfile [kernel.kallsyms] [k]
> >> memcpy_real 22.74% makedumpfile [kernel.kallsyms] [k]
> >> __copy_to_user
> >>
> >> /* readmem() for read path is cheaper ? */
> >> 13.49% makedumpfile makedumpfile [.]
> >> write_kdump_pages_cyclic 4.53% makedumpfile makedumpfile
> >> [.] readmem
> >
> >Yes, much lower overhead of readmem is strange. For a moment I
> >suspected wrong accounting of the page fault handler, but then I
> >realized that for /proc/vmcore, all page table entries are created
> >with the present bit set already, so there are no page faults...
> >
> >I haven't had time yet to set up a system for reproduction, but I'll
> >try to identify what's eating up the CPU time in readmem().
> >
> >>[...]
> >> I hope this analysis helps more than it confuses :-)
> >>
> >> As a conclusion, we could think of mapping larger chunks
> >> also for the fragmented case of -d 31 to reduce the amount
> >> of mmap/munmap calls.
> >
> >I agree in general. Memory mapped through /proc/vmcore does not
> >increase run-time memory requirements, because it only adds a mapping
> >to the old kernel's memory. The only limiting factor is the virtual
> >address space. On many architectures, this is no issue at all, and we
> >could simply map the whole file at beginning. On some architectures,
> >the virtual address space is smaller than possible physical RAM, so
> >this approach would not work for them.
> >
> >> Another open question was why the mmap case consumes more CPU
> >> time in readmem() than the read case. Our theory is that the
> >> first memory access is slower because it is not in the HW
> >> cache. For the mmap case userspace issues the first access (copy
> >> to makedumpfile cache) and for the read case the kernel issues
> >> the first access (memcpy_real/copy_to_user). Therefore the
> >> cache miss is accounted to userspace for mmap and to kernel for
> >> read.
> >
> >I have no idea how to measure this on s390. On x86_64 I would add
> >some asm code to read TSC before and after the memory access
> >instruction. I guess there is a similar counter on s390. Suggestions?
> >
> >> And last but not least, perhaps on s390 we could replace
> >> the bounce buffer used for memcpy_real()/copy_to_user() by
> >> some more intelligent solution.
> >
> >Which would then improve the non-mmap times even more, right?
> >
> >Petr T
_______________________________________________
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec
^ permalink raw reply [flat|nested] 20+ messages in thread
* Re: [PATCH v2 0/8] Handle mmaped regions in cache [more analysis]
2015-03-12 15:38 ` Petr Tesarik
2015-03-13 4:10 ` Atsushi Kumagai
@ 2015-03-13 16:19 ` Michael Holzheu
1 sibling, 0 replies; 20+ messages in thread
From: Michael Holzheu @ 2015-03-13 16:19 UTC (permalink / raw)
To: Petr Tesarik; +Cc: Atsushi Kumagai, kexec mailing list, Jan Willeke
On Thu, 12 Mar 2015 16:38:22 +0100
Petr Tesarik <ptesarik@suse.cz> wrote:
> On Mon, 9 Mar 2015 17:08:58 +0100
> Michael Holzheu <holzheu@linux.vnet.ibm.com> wrote:
[snip]
> > I counted the mmap and read system calls with "perf stat":
> >
> > mmap unmap read = sum
> > ===============================================
> > mmap -d0 482 443 165 1090
> > mmap -d31 13454 13414 165 27033
> > non-mmap -d0 34 3 458917 458954
> > non-mmap -d31 34 3 74273 74310
>
> If your VM has 1.5 GiB of RAM, then the numbers for -d0 look
> reasonable.
I have 1792 MiB RAM.
> For -d31, we should be able to do better than this
> by allocating more cache slots and improving the algorithm.
> I originally didn't deem it worth the effort, but seeing almost
> 30 times more mmaps than with -d0 may change my mind.
ok.
>
> > Here the actual results I got with "perf record":
> >
> > $ time ./makedumpfile -d 31 /proc/vmcore /dev/null -f
> >
> > Output of "perf report" for mmap case:
> >
> > /* Most time spent for unmap in kernel */
> > 29.75% makedumpfile [kernel.kallsyms] [k] unmap_single_vma
> > 9.84% makedumpfile [kernel.kallsyms] [k] remap_pfn_range
> > 8.49% makedumpfile [kernel.kallsyms] [k] vm_normal_page
> >
> > /* Still some mmap overhead in makedumpfile readmem() */
> > 21.56% makedumpfile makedumpfile [.] readmem
>
> This number is interesting. Did you compile makedumpfile with
> optimizations? If yes, then this number probably includes some
> functions which were inlined.
Yes, I used the default Makefile (-O2) so most functions are inlined.
With -O0 I get the following:
15.35% makedumpfile libc-2.15.so [.] memcpy
2.14% makedumpfile makedumpfile [.] __exclude_unnecessary_pages
1.82% makedumpfile makedumpfile [.] test_bit
1.82% makedumpfile makedumpfile [.] set_bitmap_cyclic
1.32% makedumpfile makedumpfile [.] clear_bit_on_2nd_bitmap
1.32% makedumpfile makedumpfile [.] write_kdump_pages_cyclic
1.01% makedumpfile makedumpfile [.] is_on
0.88% makedumpfile makedumpfile [.] paddr_to_offset
0.75% makedumpfile makedumpfile [.] is_dumpable_cyclic
0.69% makedumpfile makedumpfile [.] exclude_range
0.63% makedumpfile makedumpfile [.] clear_bit_on_2nd_bitmap_for_kernel
0.63% makedumpfile [vdso] [.] __kernel_gettimeofday
0.57% makedumpfile makedumpfile [.] print_progress
0.50% makedumpfile makedumpfile [.] cache_search
> > 8.49% makedumpfile makedumpfile [.] write_kdump_pages_cyclic
> >
> > Output of "perf report" for non-mmap case:
> >
> > /* Time spent for sys_read (that needs also two copy operations on s390 :() */
> > 25.32% makedumpfile [kernel.kallsyms] [k] memcpy_real
> > 22.74% makedumpfile [kernel.kallsyms] [k] __copy_to_user
> >
> > /* readmem() for read path is cheaper ? */
> > 13.49% makedumpfile makedumpfile [.] write_kdump_pages_cyclic
> > 4.53% makedumpfile makedumpfile [.] readmem
>
> Yes, much lower overhead of readmem is strange. For a moment I
> suspected wrong accounting of the page fault handler, but then I
> realized that for /proc/vmcore, all page table entries are created
> with the present bit set already, so there are no page faults...
Right, as said below, perhaps it is the HW caching issue.
> I haven't had time yet to set up a system for reproduction, but I'll
> try to identify what's eating up the CPU time in readmem().
>
> >[...]
> > I hope this analysis helps more than it confuses :-)
> >
> > As a conclusion, we could think of mapping larger chunks
> > also for the fragmented case of -d 31 to reduce the amount
> > of mmap/munmap calls.
>
> I agree in general. Memory mapped through /proc/vmcore does not
> increase run-time memory requirements, because it only adds a mapping
> to the old kernel's memory.
At least you need the page table memory for the /proc/vmcore
mapping, right?
> The only limiting factor is the virtual
> address space. On many architectures, this is no issue at all, and we
> could simply map the whole file at beginning. On some architectures,
> the virtual address space is smaller than possible physical RAM, so
> this approach would not work for them.
>
> > Another open question was why the mmap case consumes more CPU
> > time in readmem() than the read case. Our theory is that the
> > first memory access is slower because it is not in the HW
> > cache. For the mmap case userspace issues the first access (copy
> > to makedumpfile cache) and for the read case the kernel issues
> > the first access (memcpy_real/copy_to_user). Therefore the
> > cache miss is accounted to userspace for mmap and to kernel for
> > read.
>
> I have no idea how to measure this on s390. On x86_64 I would add some
> asm code to read TSC before and after the memory access instruction. I
> guess there is a similar counter on s390. Suggestions?
On s390 under LPAR we have hardware counters for cache misses:
# perf stat -e cpum_cf/L1D_PENALTY_CYCLES/,cpum_cf/PROBLEM_STATE_L1D_PENALTY_CYCLES/ ./makedumpfile -d31 /proc/vmcore /dev/null -f
Performance counter stats for './makedumpfile -d31 /proc/vmcore /dev/null -f':
1180577929 L1D_PENALTY_CYCLES
1166005960 PROBLEM_STATE_L1D_PENALTY_CYCLES
# perf stat -e cpum_cf/L1D_PENALTY_CYCLES/,cpum_cf/PROBLEM_STATE_L1D_PENALTY_CYCLES/ ./makedumpfile -d31 /proc/vmcore /dev/null -f --non-mmap
Performance counter stats for './makedumpfile -d31 /proc/vmcore /dev/null -f --non-mmap':
1691463111 L1D_PENALTY_CYCLES
151987617 PROBLEM_STATE_L1D_PENALTY_CYCLES
AFAIK:
- L1D_PENALTY_CYCLES: Cycles wasted due to L1 cache misses (kernel + userspace)
- PROBLEM_STATE_L1D_PENALTY_CYCLES: Cycles wasted due to L1 cache misses (userspace only)
So if I got it right, we see that for the mmap() case the cache
misses are almost all in userspace and for the read() case they
are in kernel.
Interestingly on that machine (4 GiB, LPAR and newer model) for
mmap() was faster also for -d 31:
$ time ./makedumpfile /proc/vmcore -d 31 /dev/null -f
real 0m0.125s
user 0m0.120s
sys 0m0.004s
$ time ./makedumpfile /proc/vmcore -d 31 /dev/null -f --non-mmap
real 0m0.238s
user 0m0.065s
sys 0m0.171
...
>
> > And last but not least, perhaps on s390 we could replace
> > the bounce buffer used for memcpy_real()/copy_to_user() by
> > some more intelligent solution.
>
> Which would then improve the non-mmap times even more, right?
Correct.
Best Regards,
Michael
_______________________________________________
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec
^ permalink raw reply [flat|nested] 20+ messages in thread
* RE: [PATCH v2 0/8] Handle mmaped regions in cache [more analysis]
2015-03-13 8:04 ` Petr Tesarik
@ 2015-03-16 5:14 ` Atsushi Kumagai
2015-03-16 7:26 ` Petr Tesarik
0 siblings, 1 reply; 20+ messages in thread
From: Atsushi Kumagai @ 2015-03-16 5:14 UTC (permalink / raw)
To: ptesarik; +Cc: holzheu, kexec, willeke
>On Fri, 13 Mar 2015 04:10:22 +0000
>Atsushi Kumagai <ats-kumagai@wm.jp.nec.com> wrote:
>
>> Hello,
>>
>> (Note: my email address has changed.)
>>
>> In x86_64, calling ioremap/iounmap per page in copy_oldmem_page()
>> causes big performance degradation, so mmap() was introduced on
>> /proc/vmcore. However, there is no big difference between read() and
>> mmap() in s390 since it doesn't need ioremap/iounmap in copy_oldmem_page(),
>> so other issues have been revealed, right?
>>
>> [...]
>>
>> >> I counted the mmap and read system calls with "perf stat":
>> >>
>> >> mmap unmap read = sum
>> >> ===============================================
>> >> mmap -d0 482 443 165 1090
>> >> mmap -d31 13454 13414 165 27033
>> >> non-mmap -d0 34 3 458917 458954
>> >> non-mmap -d31 34 3 74273 74310
>> >
>> >If your VM has 1.5 GiB of RAM, then the numbers for -d0 look
>> >reasonable. For -d31, we should be able to do better than this
>> >by allocating more cache slots and improving the algorithm.
>> >I originally didn't deem it worth the effort, but seeing almost
>> >30 times more mmaps than with -d0 may change my mind.
>>
>> Are you going to do it as v3 patch?
>
>No. Tuning the caching algorithm requires a lot of research. I plan to
>do it, but testing it with all scenarios (and tuning the algorithm
>based on the results) will probably take weeks. I don't think it makes
>sense to wait for it.
>
>> I'm going to release v1.5.8 soon, so I'll adopt v2 patch if
>> you don't think updating it.
>
>Since v2 already brings some performance gain, I appreciate it if you
>can adopt it for v1.5.8.
Ok, but unfortunately I got some error log during my test like below:
$ ./makedumpfile -d31 /tmp/vmcore ./dumpfile.d31
Excluding free pages : [ 0.0 %] /
reset_bitmap_of_free_pages: The free list is broken.
reset_bitmap_of_free_pages: The free list is broken.
makedumpfile Failed.
$
All of errors are the same as the above at least in my test.
I clarified that [PATCH v2 7/8] causes this by git bisect,
but the root cause is under investigation.
4064 if (!readmem(VADDR, curr+OFFSET(list_head.prev),
4065 &curr_prev, sizeof curr_prev)) { // get wrong value here
4066 ERRMSG("Can't get prev list_head.\n");
4067 return FALSE;
4068 }
4069 if (previous != curr_prev) {
4070 ERRMSG("The free list is broken.\n");
4071 retcd = ANALYSIS_FAILED;
4072 return FALSE;
4073 }
Thanks
Atsushi Kumagai
>Thank you very much,
>Petr Tesarik
>
>> Thanks
>> Atsushi Kumagai
>>
>> >
>> >> Here the actual results I got with "perf record":
>> >>
>> >> $ time ./makedumpfile -d 31 /proc/vmcore /dev/null -f
>> >>
>> >> Output of "perf report" for mmap case:
>> >>
>> >> /* Most time spent for unmap in kernel */
>> >> 29.75% makedumpfile [kernel.kallsyms] [k] unmap_single_vma
>> >> 9.84% makedumpfile [kernel.kallsyms] [k] remap_pfn_range
>> >> 8.49% makedumpfile [kernel.kallsyms] [k] vm_normal_page
>> >>
>> >> /* Still some mmap overhead in makedumpfile readmem() */
>> >> 21.56% makedumpfile makedumpfile [.] readmem
>> >
>> >This number is interesting. Did you compile makedumpfile with
>> >optimizations? If yes, then this number probably includes some
>> >functions which were inlined.
>> >
>> >> 8.49% makedumpfile makedumpfile [.]
>> >> write_kdump_pages_cyclic
>> >>
>> >> Output of "perf report" for non-mmap case:
>> >>
>> >> /* Time spent for sys_read (that needs also two copy operations
>> >> on s390 :() */ 25.32% makedumpfile [kernel.kallsyms] [k]
>> >> memcpy_real 22.74% makedumpfile [kernel.kallsyms] [k]
>> >> __copy_to_user
>> >>
>> >> /* readmem() for read path is cheaper ? */
>> >> 13.49% makedumpfile makedumpfile [.]
>> >> write_kdump_pages_cyclic 4.53% makedumpfile makedumpfile
>> >> [.] readmem
>> >
>> >Yes, much lower overhead of readmem is strange. For a moment I
>> >suspected wrong accounting of the page fault handler, but then I
>> >realized that for /proc/vmcore, all page table entries are created
>> >with the present bit set already, so there are no page faults...
>> >
>> >I haven't had time yet to set up a system for reproduction, but I'll
>> >try to identify what's eating up the CPU time in readmem().
>> >
>> >>[...]
>> >> I hope this analysis helps more than it confuses :-)
>> >>
>> >> As a conclusion, we could think of mapping larger chunks
>> >> also for the fragmented case of -d 31 to reduce the amount
>> >> of mmap/munmap calls.
>> >
>> >I agree in general. Memory mapped through /proc/vmcore does not
>> >increase run-time memory requirements, because it only adds a mapping
>> >to the old kernel's memory. The only limiting factor is the virtual
>> >address space. On many architectures, this is no issue at all, and we
>> >could simply map the whole file at beginning. On some architectures,
>> >the virtual address space is smaller than possible physical RAM, so
>> >this approach would not work for them.
>> >
>> >> Another open question was why the mmap case consumes more CPU
>> >> time in readmem() than the read case. Our theory is that the
>> >> first memory access is slower because it is not in the HW
>> >> cache. For the mmap case userspace issues the first access (copy
>> >> to makedumpfile cache) and for the read case the kernel issues
>> >> the first access (memcpy_real/copy_to_user). Therefore the
>> >> cache miss is accounted to userspace for mmap and to kernel for
>> >> read.
>> >
>> >I have no idea how to measure this on s390. On x86_64 I would add
>> >some asm code to read TSC before and after the memory access
>> >instruction. I guess there is a similar counter on s390. Suggestions?
>> >
>> >> And last but not least, perhaps on s390 we could replace
>> >> the bounce buffer used for memcpy_real()/copy_to_user() by
>> >> some more intelligent solution.
>> >
>> >Which would then improve the non-mmap times even more, right?
>> >
>> >Petr T
_______________________________________________
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec
^ permalink raw reply [flat|nested] 20+ messages in thread
* Re: [PATCH v2 0/8] Handle mmaped regions in cache [more analysis]
2015-03-16 5:14 ` Atsushi Kumagai
@ 2015-03-16 7:26 ` Petr Tesarik
2015-03-16 8:06 ` Atsushi Kumagai
0 siblings, 1 reply; 20+ messages in thread
From: Petr Tesarik @ 2015-03-16 7:26 UTC (permalink / raw)
To: Atsushi Kumagai; +Cc: holzheu, kexec, willeke
On Mon, 16 Mar 2015 05:14:25 +0000
Atsushi Kumagai <ats-kumagai@wm.jp.nec.com> wrote:
> >On Fri, 13 Mar 2015 04:10:22 +0000
> >Atsushi Kumagai <ats-kumagai@wm.jp.nec.com> wrote:
>[...]
> >> I'm going to release v1.5.8 soon, so I'll adopt v2 patch if
> >> you don't think updating it.
> >
> >Since v2 already brings some performance gain, I appreciate it if you
> >can adopt it for v1.5.8.
>
> Ok, but unfortunately I got some error log during my test like below:
>
> $ ./makedumpfile -d31 /tmp/vmcore ./dumpfile.d31
> Excluding free pages : [ 0.0 %] /
> reset_bitmap_of_free_pages: The free list is broken.
> reset_bitmap_of_free_pages: The free list is broken.
>
> makedumpfile Failed.
> $
>
> All of errors are the same as the above at least in my test.
> I clarified that [PATCH v2 7/8] causes this by git bisect,
> but the root cause is under investigation.
The only change I can think of is the removal of page_is_fractional.
Originally, LOADs that do not start on a page boundary were never
mmapped. With this patch, this check is removed.
Can you try adding the following check to mappage_elf (and dropping
patch 8/8)?
if (page_is_fractional(offset))
return NULL;
Petr T
P.S. This reminds me I should try to get some kernel dumps with
fractional pages for regression testing...
> >Thank you very much,
> >Petr Tesarik
> >
> >> Thanks
> >> Atsushi Kumagai
> >>
> >> >
> >> >> Here the actual results I got with "perf record":
> >> >>
> >> >> $ time ./makedumpfile -d 31 /proc/vmcore /dev/null -f
> >> >>
> >> >> Output of "perf report" for mmap case:
> >> >>
> >> >> /* Most time spent for unmap in kernel */
> >> >> 29.75% makedumpfile [kernel.kallsyms] [k] unmap_single_vma
> >> >> 9.84% makedumpfile [kernel.kallsyms] [k] remap_pfn_range
> >> >> 8.49% makedumpfile [kernel.kallsyms] [k] vm_normal_page
> >> >>
> >> >> /* Still some mmap overhead in makedumpfile readmem() */
> >> >> 21.56% makedumpfile makedumpfile [.] readmem
> >> >
> >> >This number is interesting. Did you compile makedumpfile with
> >> >optimizations? If yes, then this number probably includes some
> >> >functions which were inlined.
> >> >
> >> >> 8.49% makedumpfile makedumpfile [.]
> >> >> write_kdump_pages_cyclic
> >> >>
> >> >> Output of "perf report" for non-mmap case:
> >> >>
> >> >> /* Time spent for sys_read (that needs also two copy
> >> >> operations on s390 :() */ 25.32% makedumpfile
> >> >> [kernel.kallsyms] [k] memcpy_real 22.74% makedumpfile
> >> >> [kernel.kallsyms] [k] __copy_to_user
> >> >>
> >> >> /* readmem() for read path is cheaper ? */
> >> >> 13.49% makedumpfile makedumpfile [.]
> >> >> write_kdump_pages_cyclic 4.53% makedumpfile makedumpfile
> >> >> [.] readmem
> >> >
> >> >Yes, much lower overhead of readmem is strange. For a moment I
> >> >suspected wrong accounting of the page fault handler, but then I
> >> >realized that for /proc/vmcore, all page table entries are created
> >> >with the present bit set already, so there are no page faults...
> >> >
> >> >I haven't had time yet to set up a system for reproduction, but
> >> >I'll try to identify what's eating up the CPU time in readmem().
> >> >
> >> >>[...]
> >> >> I hope this analysis helps more than it confuses :-)
> >> >>
> >> >> As a conclusion, we could think of mapping larger chunks
> >> >> also for the fragmented case of -d 31 to reduce the amount
> >> >> of mmap/munmap calls.
> >> >
> >> >I agree in general. Memory mapped through /proc/vmcore does not
> >> >increase run-time memory requirements, because it only adds a
> >> >mapping to the old kernel's memory. The only limiting factor is
> >> >the virtual address space. On many architectures, this is no
> >> >issue at all, and we could simply map the whole file at
> >> >beginning. On some architectures, the virtual address space is
> >> >smaller than possible physical RAM, so this approach would not
> >> >work for them.
> >> >
> >> >> Another open question was why the mmap case consumes more CPU
> >> >> time in readmem() than the read case. Our theory is that the
> >> >> first memory access is slower because it is not in the HW
> >> >> cache. For the mmap case userspace issues the first access (copy
> >> >> to makedumpfile cache) and for the read case the kernel issues
> >> >> the first access (memcpy_real/copy_to_user). Therefore the
> >> >> cache miss is accounted to userspace for mmap and to kernel for
> >> >> read.
> >> >
> >> >I have no idea how to measure this on s390. On x86_64 I would add
> >> >some asm code to read TSC before and after the memory access
> >> >instruction. I guess there is a similar counter on s390.
> >> >Suggestions?
> >> >
> >> >> And last but not least, perhaps on s390 we could replace
> >> >> the bounce buffer used for memcpy_real()/copy_to_user() by
> >> >> some more intelligent solution.
> >> >
> >> >Which would then improve the non-mmap times even more, right?
> >> >
> >> >Petr T
>
> _______________________________________________
> kexec mailing list
> kexec@lists.infradead.org
> http://lists.infradead.org/mailman/listinfo/kexec
_______________________________________________
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec
^ permalink raw reply [flat|nested] 20+ messages in thread
* RE: [PATCH v2 0/8] Handle mmaped regions in cache [more analysis]
2015-03-16 7:26 ` Petr Tesarik
@ 2015-03-16 8:06 ` Atsushi Kumagai
2015-03-16 8:24 ` Petr Tesarik
0 siblings, 1 reply; 20+ messages in thread
From: Atsushi Kumagai @ 2015-03-16 8:06 UTC (permalink / raw)
To: ptesarik; +Cc: holzheu, kexec, willeke
>On Mon, 16 Mar 2015 05:14:25 +0000
>Atsushi Kumagai <ats-kumagai@wm.jp.nec.com> wrote:
>
>> >On Fri, 13 Mar 2015 04:10:22 +0000
>> >Atsushi Kumagai <ats-kumagai@wm.jp.nec.com> wrote:
>>[...]
>> >> I'm going to release v1.5.8 soon, so I'll adopt v2 patch if
>> >> you don't think updating it.
>> >
>> >Since v2 already brings some performance gain, I appreciate it if you
>> >can adopt it for v1.5.8.
>>
>> Ok, but unfortunately I got some error log during my test like below:
>>
>> $ ./makedumpfile -d31 /tmp/vmcore ./dumpfile.d31
>> Excluding free pages : [ 0.0 %] /
>> reset_bitmap_of_free_pages: The free list is broken.
>> reset_bitmap_of_free_pages: The free list is broken.
>>
>> makedumpfile Failed.
>> $
>>
>> All of errors are the same as the above at least in my test.
>> I clarified that [PATCH v2 7/8] causes this by git bisect,
>> but the root cause is under investigation.
>
>The only change I can think of is the removal of page_is_fractional.
>Originally, LOADs that do not start on a page boundary were never
>mmapped. With this patch, this check is removed.
>
>Can you try adding the following check to mappage_elf (and dropping
>patch 8/8)?
>
> if (page_is_fractional(offset))
> return NULL;
It worked, thanks!
Additionally, I've remembered that we should keep page_is_fractional()
for old kernels.
https://lkml.org/lkml/2013/11/13/439
Thanks
Atsushi Kumagai
>Petr T
>
>P.S. This reminds me I should try to get some kernel dumps with
>fractional pages for regression testing...
>
>> >Thank you very much,
>> >Petr Tesarik
>> >
>> >> Thanks
>> >> Atsushi Kumagai
>> >>
>> >> >
>> >> >> Here the actual results I got with "perf record":
>> >> >>
>> >> >> $ time ./makedumpfile -d 31 /proc/vmcore /dev/null -f
>> >> >>
>> >> >> Output of "perf report" for mmap case:
>> >> >>
>> >> >> /* Most time spent for unmap in kernel */
>> >> >> 29.75% makedumpfile [kernel.kallsyms] [k] unmap_single_vma
>> >> >> 9.84% makedumpfile [kernel.kallsyms] [k] remap_pfn_range
>> >> >> 8.49% makedumpfile [kernel.kallsyms] [k] vm_normal_page
>> >> >>
>> >> >> /* Still some mmap overhead in makedumpfile readmem() */
>> >> >> 21.56% makedumpfile makedumpfile [.] readmem
>> >> >
>> >> >This number is interesting. Did you compile makedumpfile with
>> >> >optimizations? If yes, then this number probably includes some
>> >> >functions which were inlined.
>> >> >
>> >> >> 8.49% makedumpfile makedumpfile [.]
>> >> >> write_kdump_pages_cyclic
>> >> >>
>> >> >> Output of "perf report" for non-mmap case:
>> >> >>
>> >> >> /* Time spent for sys_read (that needs also two copy
>> >> >> operations on s390 :() */ 25.32% makedumpfile
>> >> >> [kernel.kallsyms] [k] memcpy_real 22.74% makedumpfile
>> >> >> [kernel.kallsyms] [k] __copy_to_user
>> >> >>
>> >> >> /* readmem() for read path is cheaper ? */
>> >> >> 13.49% makedumpfile makedumpfile [.]
>> >> >> write_kdump_pages_cyclic 4.53% makedumpfile makedumpfile
>> >> >> [.] readmem
>> >> >
>> >> >Yes, much lower overhead of readmem is strange. For a moment I
>> >> >suspected wrong accounting of the page fault handler, but then I
>> >> >realized that for /proc/vmcore, all page table entries are created
>> >> >with the present bit set already, so there are no page faults...
>> >> >
>> >> >I haven't had time yet to set up a system for reproduction, but
>> >> >I'll try to identify what's eating up the CPU time in readmem().
>> >> >
>> >> >>[...]
>> >> >> I hope this analysis helps more than it confuses :-)
>> >> >>
>> >> >> As a conclusion, we could think of mapping larger chunks
>> >> >> also for the fragmented case of -d 31 to reduce the amount
>> >> >> of mmap/munmap calls.
>> >> >
>> >> >I agree in general. Memory mapped through /proc/vmcore does not
>> >> >increase run-time memory requirements, because it only adds a
>> >> >mapping to the old kernel's memory. The only limiting factor is
>> >> >the virtual address space. On many architectures, this is no
>> >> >issue at all, and we could simply map the whole file at
>> >> >beginning. On some architectures, the virtual address space is
>> >> >smaller than possible physical RAM, so this approach would not
>> >> >work for them.
>> >> >
>> >> >> Another open question was why the mmap case consumes more CPU
>> >> >> time in readmem() than the read case. Our theory is that the
>> >> >> first memory access is slower because it is not in the HW
>> >> >> cache. For the mmap case userspace issues the first access (copy
>> >> >> to makedumpfile cache) and for the read case the kernel issues
>> >> >> the first access (memcpy_real/copy_to_user). Therefore the
>> >> >> cache miss is accounted to userspace for mmap and to kernel for
>> >> >> read.
>> >> >
>> >> >I have no idea how to measure this on s390. On x86_64 I would add
>> >> >some asm code to read TSC before and after the memory access
>> >> >instruction. I guess there is a similar counter on s390.
>> >> >Suggestions?
>> >> >
>> >> >> And last but not least, perhaps on s390 we could replace
>> >> >> the bounce buffer used for memcpy_real()/copy_to_user() by
>> >> >> some more intelligent solution.
>> >> >
>> >> >Which would then improve the non-mmap times even more, right?
>> >> >
>> >> >Petr T
>>
>> _______________________________________________
>> kexec mailing list
>> kexec@lists.infradead.org
>> http://lists.infradead.org/mailman/listinfo/kexec
_______________________________________________
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec
^ permalink raw reply [flat|nested] 20+ messages in thread
* Re: [PATCH v2 0/8] Handle mmaped regions in cache [more analysis]
2015-03-16 8:06 ` Atsushi Kumagai
@ 2015-03-16 8:24 ` Petr Tesarik
0 siblings, 0 replies; 20+ messages in thread
From: Petr Tesarik @ 2015-03-16 8:24 UTC (permalink / raw)
To: Atsushi Kumagai; +Cc: holzheu, kexec, willeke
On Mon, 16 Mar 2015 08:06:14 +0000
Atsushi Kumagai <ats-kumagai@wm.jp.nec.com> wrote:
> >On Mon, 16 Mar 2015 05:14:25 +0000
> >Atsushi Kumagai <ats-kumagai@wm.jp.nec.com> wrote:
> >
> >> >On Fri, 13 Mar 2015 04:10:22 +0000
> >> >Atsushi Kumagai <ats-kumagai@wm.jp.nec.com> wrote:
> >>[...]
> >> >> I'm going to release v1.5.8 soon, so I'll adopt v2 patch if
> >> >> you don't think updating it.
> >> >
> >> >Since v2 already brings some performance gain, I appreciate it if you
> >> >can adopt it for v1.5.8.
> >>
> >> Ok, but unfortunately I got some error log during my test like below:
> >>
> >> $ ./makedumpfile -d31 /tmp/vmcore ./dumpfile.d31
> >> Excluding free pages : [ 0.0 %] /
> >> reset_bitmap_of_free_pages: The free list is broken.
> >> reset_bitmap_of_free_pages: The free list is broken.
> >>
> >> makedumpfile Failed.
> >> $
> >>
> >> All of errors are the same as the above at least in my test.
> >> I clarified that [PATCH v2 7/8] causes this by git bisect,
> >> but the root cause is under investigation.
> >
> >The only change I can think of is the removal of page_is_fractional.
> >Originally, LOADs that do not start on a page boundary were never
> >mmapped. With this patch, this check is removed.
> >
> >Can you try adding the following check to mappage_elf (and dropping
> >patch 8/8)?
> >
> > if (page_is_fractional(offset))
> > return NULL;
>
> It worked, thanks!
>
> Additionally, I've remembered that we should keep page_is_fractional()
> for old kernels.
Well, if a LOAD segment is not page-aligned, the previous code would
have mapped only the page-aligned portion, and never beyond its
boundaries. I'm unsure what causes the bug, but I don't have time
to find the real root cause, so let's leave the fractional check in
place for now.
I'll be sending v3 of the patch set shortly.
Thank you for testing!
Petr T
> https://lkml.org/lkml/2013/11/13/439
>
>
> Thanks
> Atsushi Kumagai
>
> >Petr T
> >
> >P.S. This reminds me I should try to get some kernel dumps with
> >fractional pages for regression testing...
> >
> >> >Thank you very much,
> >> >Petr Tesarik
> >> >
> >> >> Thanks
> >> >> Atsushi Kumagai
> >> >>
> >> >> >
> >> >> >> Here the actual results I got with "perf record":
> >> >> >>
> >> >> >> $ time ./makedumpfile -d 31 /proc/vmcore /dev/null -f
> >> >> >>
> >> >> >> Output of "perf report" for mmap case:
> >> >> >>
> >> >> >> /* Most time spent for unmap in kernel */
> >> >> >> 29.75% makedumpfile [kernel.kallsyms] [k]
> >> >> >> unmap_single_vma 9.84% makedumpfile [kernel.kallsyms] [k]
> >> >> >> remap_pfn_range 8.49% makedumpfile [kernel.kallsyms] [k]
> >> >> >> vm_normal_page
> >> >> >>
> >> >> >> /* Still some mmap overhead in makedumpfile readmem() */
> >> >> >> 21.56% makedumpfile makedumpfile [.] readmem
> >> >> >
> >> >> >This number is interesting. Did you compile makedumpfile with
> >> >> >optimizations? If yes, then this number probably includes some
> >> >> >functions which were inlined.
> >> >> >
> >> >> >> 8.49% makedumpfile makedumpfile [.]
> >> >> >> write_kdump_pages_cyclic
> >> >> >>
> >> >> >> Output of "perf report" for non-mmap case:
> >> >> >>
> >> >> >> /* Time spent for sys_read (that needs also two copy
> >> >> >> operations on s390 :() */ 25.32% makedumpfile
> >> >> >> [kernel.kallsyms] [k] memcpy_real 22.74% makedumpfile
> >> >> >> [kernel.kallsyms] [k] __copy_to_user
> >> >> >>
> >> >> >> /* readmem() for read path is cheaper ? */
> >> >> >> 13.49% makedumpfile makedumpfile [.]
> >> >> >> write_kdump_pages_cyclic 4.53% makedumpfile makedumpfile
> >> >> >> [.] readmem
> >> >> >
> >> >> >Yes, much lower overhead of readmem is strange. For a moment I
> >> >> >suspected wrong accounting of the page fault handler, but then
> >> >> >I realized that for /proc/vmcore, all page table entries are
> >> >> >created with the present bit set already, so there are no page
> >> >> >faults...
> >> >> >
> >> >> >I haven't had time yet to set up a system for reproduction, but
> >> >> >I'll try to identify what's eating up the CPU time in
> >> >> >readmem().
> >> >> >
> >> >> >>[...]
> >> >> >> I hope this analysis helps more than it confuses :-)
> >> >> >>
> >> >> >> As a conclusion, we could think of mapping larger chunks
> >> >> >> also for the fragmented case of -d 31 to reduce the amount
> >> >> >> of mmap/munmap calls.
> >> >> >
> >> >> >I agree in general. Memory mapped through /proc/vmcore does not
> >> >> >increase run-time memory requirements, because it only adds a
> >> >> >mapping to the old kernel's memory. The only limiting factor is
> >> >> >the virtual address space. On many architectures, this is no
> >> >> >issue at all, and we could simply map the whole file at
> >> >> >beginning. On some architectures, the virtual address space is
> >> >> >smaller than possible physical RAM, so this approach would not
> >> >> >work for them.
> >> >> >
> >> >> >> Another open question was why the mmap case consumes more CPU
> >> >> >> time in readmem() than the read case. Our theory is that the
> >> >> >> first memory access is slower because it is not in the HW
> >> >> >> cache. For the mmap case userspace issues the first access
> >> >> >> (copy to makedumpfile cache) and for the read case the kernel
> >> >> >> issues the first access (memcpy_real/copy_to_user).
> >> >> >> Therefore the cache miss is accounted to userspace for mmap
> >> >> >> and to kernel for read.
> >> >> >
> >> >> >I have no idea how to measure this on s390. On x86_64 I would
> >> >> >add some asm code to read TSC before and after the memory
> >> >> >access instruction. I guess there is a similar counter on s390.
> >> >> >Suggestions?
> >> >> >
> >> >> >> And last but not least, perhaps on s390 we could replace
> >> >> >> the bounce buffer used for memcpy_real()/copy_to_user() by
> >> >> >> some more intelligent solution.
> >> >> >
> >> >> >Which would then improve the non-mmap times even more, right?
> >> >> >
> >> >> >Petr T
> >>
> >> _______________________________________________
> >> kexec mailing list
> >> kexec@lists.infradead.org
> >> http://lists.infradead.org/mailman/listinfo/kexec
>
_______________________________________________
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec
^ permalink raw reply [flat|nested] 20+ messages in thread
end of thread, other threads:[~2015-03-16 8:24 UTC | newest]
Thread overview: 20+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2015-03-06 14:03 [PATCH v2 0/8] Handle mmaped regions in cache Petr Tesarik
2015-03-06 8:23 ` [PATCH v2 1/8] cache: get rid of search loop in cache_add() Petr Tesarik
2015-03-06 8:52 ` [PATCH v2 2/8] cache: allow to return a page to the pool Petr Tesarik
2015-03-06 8:59 ` [PATCH v2 3/8] cache: do not allocate from the pending list Petr Tesarik
2015-03-06 9:26 ` [PATCH v2 4/8] cache: add hit/miss statistics to the final report Petr Tesarik
2015-03-06 13:07 ` [PATCH v2 5/8] cache: allocate buffers in one big chunk Petr Tesarik
2015-03-06 13:10 ` [PATCH v2 6/8] cache: allow arbitrary size of cache entries Petr Tesarik
2015-03-06 13:23 ` [PATCH v2 7/8] cache: store mapped regions directly in the cache Petr Tesarik
2015-03-06 13:28 ` [PATCH v2 8/8] cleanup: remove unused page_is_fractional Petr Tesarik
2015-03-09 16:08 ` [PATCH v2 0/8] Handle mmaped regions in cache Michael Holzheu
2015-03-09 16:08 ` [PATCH v2 0/8] Handle mmaped regions in cache [more analysis] Michael Holzheu
2015-03-12 13:56 ` Michael Holzheu
2015-03-12 15:38 ` Petr Tesarik
2015-03-13 4:10 ` Atsushi Kumagai
2015-03-13 8:04 ` Petr Tesarik
2015-03-16 5:14 ` Atsushi Kumagai
2015-03-16 7:26 ` Petr Tesarik
2015-03-16 8:06 ` Atsushi Kumagai
2015-03-16 8:24 ` Petr Tesarik
2015-03-13 16:19 ` Michael Holzheu
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.