All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH] makedumpfile: Improve performance for parallel compression with zlib.
@ 2015-10-14  5:24 Atsushi Kumagai
  2015-10-22  3:11 ` "Zhou, Wenjian/周文?"
  0 siblings, 1 reply; 3+ messages in thread
From: Atsushi Kumagai @ 2015-10-14  5:24 UTC (permalink / raw)
  To: kexec; +Cc: qiaonuohan, zhouwj-fnst

Hello,

I have fixed the performance issue with parallel compression
that we faced in:

  http://lists.infradead.org/pipermail/kexec/2015-July/014137.html

The cause of the issue is that compress2() calls malloc() and free()
for a temporary buffer on every call. This can cause many page faults,
since makedumpfile has to call compress2() for each page.

The issue is easy to avoid: just divide compress2() into three
functions, namely an initialization part, a compression part and a
finalization part. Then we don't need to call the initialization
function and the finalization function for each page.

To benchmark the change, I measured the execution time and the number of
page faults with *perf stat -e page-faults* on the current devel branch (v1.5.8+).

Here are the results:

  CPU:   Dual-Core AMD Opteron(tm) Processor 2218 @ 2.6GHz (4 cores)
  Memory:  5GB
  zlib:  1.2.3-29
  glibc: 2.12-1.132

        version      |  num-threads  |  time(sec)  |   page-faults
     ----------------+---------------+-------------+------------------
          devel      |       1       |   133.96    |    21,801,120
          devel      |       3       |    87.25    |    21,801,150
      + this patch   |       1       |    47.80    |     1,036,408
      + this patch   |       3       |    39.14    |     1,036,478


Thanks
Atsushi Kumagai


From: Atsushi Kumagai <ats-kumagai@wm.jp.nec.com>
Date: Thu, 8 Oct 2015 15:06:08 +0900
Subject: [PATCH] Improve performance for parallel compression with zlib.

compress2() allocates a buffer, compresses the input data and
deallocates the buffer on every call. makedumpfile has to call
compress2() for each page, which can cause a big performance
degradation due to many page faults. This issue is especially
apparent with multi-threaded compression, since a per-thread
arena is grown and trimmed more easily than the main arena.

Fortunately, the zlib functions called in compress2() are global,
so it's easy to extract the allocation and deallocation parts from
compress2().

Signed-off-by: Atsushi Kumagai <ats-kumagai@wm.jp.nec.com>
---
 makedumpfile.c | 57 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++-
 makedumpfile.h |  4 ++++
 2 files changed, 60 insertions(+), 1 deletion(-)

diff --git a/makedumpfile.c b/makedumpfile.c
index 06c8baf..fa0b779 100644
--- a/makedumpfile.c
+++ b/makedumpfile.c
@@ -25,6 +25,7 @@
 #include <sys/time.h>
 #include <limits.h>
 #include <assert.h>
+#include <zlib.h>
 
 struct symbol_table	symbol_table;
 struct size_table	size_table;
@@ -3538,6 +3539,11 @@ initial_for_parallel()
 		MMAP_CACHE_PARALLEL(i)->mmap_start_offset = 0;
 		MMAP_CACHE_PARALLEL(i)->mmap_end_offset = 0;
 
+		if (initialize_zlib(&ZLIB_STREAM_PARALLEL(i), Z_BEST_SPEED) == FALSE) {
+			ERRMSG("zlib initialization failed.\n");
+			return FALSE;
+		}
+
 #ifdef USELZO
 		if ((WRKMEM_PARALLEL(i) = malloc(LZO1X_1_MEM_COMPRESS)) == NULL) {
 			MSG("Can't allocate memory for the working memory. %s\n",
@@ -3628,6 +3634,7 @@ free_for_parallel()
 
 				free(MMAP_CACHE_PARALLEL(i));
 			}
+			finalize_zlib(&ZLIB_STREAM_PARALLEL(i));
 #ifdef USELZO
 			if (WRKMEM_PARALLEL(i) != NULL)
 				free(WRKMEM_PARALLEL(i));
@@ -7017,6 +7024,53 @@ write_kdump_page(struct cache_data *cd_header, struct cache_data *cd_page,
 	return TRUE;
 }
 
+int initialize_zlib(z_stream *stream, int level)
+{
+	int err;
+
+	stream->zalloc = (alloc_func)Z_NULL;
+	stream->zfree = (free_func)Z_NULL;
+	stream->opaque = (voidpf)Z_NULL;
+
+	err = deflateInit(stream, level);
+	if (err != Z_OK) {
+		ERRMSG("deflateInit failed: %s\n", zError(err));
+		return FALSE;
+	}
+	return TRUE;
+}
+
+int compress_mdf (z_stream *stream, Bytef *dest, uLongf *destLen,
+		  const Bytef *source, uLong sourceLen, int level)
+{
+	int err;
+	stream->next_in = (Bytef*)source;
+	stream->avail_in = (uInt)sourceLen;
+	stream->next_out = dest;
+	stream->avail_out = (uInt)*destLen;
+	if ((uLong)stream->avail_out != *destLen)
+		return Z_BUF_ERROR;
+
+	err = deflate(stream, Z_FINISH);
+
+	if (err != Z_STREAM_END) {
+		deflateReset(stream);
+		return err == Z_OK ? Z_BUF_ERROR : err;
+	}
+	*destLen = stream->total_out;
+
+	err = deflateReset(stream);
+	return err;
+}
+
+int finalize_zlib(z_stream *stream)
+{
+	int err;
+	err = deflateEnd(stream);
+
+	return err;
+}
+
 void *
 kdump_thread_function_cyclic(void *arg) {
 	void *retval = PTHREAD_FAIL;
@@ -7035,6 +7089,7 @@ kdump_thread_function_cyclic(void *arg) {
 	struct mmap_cache *mmap_cache =
 			MMAP_CACHE_PARALLEL(kdump_thread_args->thread_num);
 	unsigned long size_out;
+	z_stream *stream = &ZLIB_STREAM_PARALLEL(kdump_thread_args->thread_num);
 #ifdef USELZO
 	lzo_bytep wrkmem = WRKMEM_PARALLEL(kdump_thread_args->thread_num);
 #endif
@@ -7135,7 +7190,7 @@ kdump_thread_function_cyclic(void *arg) {
 			size_out = kdump_thread_args->len_buf_out;
 			if ((info->flag_compress & DUMP_DH_COMPRESSED_ZLIB)
 			    && ((size_out = kdump_thread_args->len_buf_out),
-				compress2(buf_out, &size_out, buf,
+				compress_mdf(stream, buf_out, &size_out, buf,
 					  info->page_size,
 					  Z_BEST_SPEED) == Z_OK)
 			    && (size_out < info->page_size)) {
diff --git a/makedumpfile.h b/makedumpfile.h
index 0bd6425..cb8f0f3 100644
--- a/makedumpfile.h
+++ b/makedumpfile.h
@@ -438,6 +438,7 @@ do { \
 #define BUF_PARALLEL(i)			info->parallel_info[i].buf
 #define BUF_OUT_PARALLEL(i)		info->parallel_info[i].buf_out
 #define MMAP_CACHE_PARALLEL(i)		info->parallel_info[i].mmap_cache
+#define ZLIB_STREAM_PARALLEL(i)		info->parallel_info[i].zlib_stream
 #ifdef USELZO
 #define WRKMEM_PARALLEL(i)		info->parallel_info[i].wrkmem
 #endif
@@ -1050,6 +1051,7 @@ struct parallel_info {
 	unsigned char		*buf;
 	unsigned char 		*buf_out;
 	struct mmap_cache	*mmap_cache;
+	z_stream		zlib_stream;
 #ifdef USELZO
 	lzo_bytep		wrkmem;
 #endif
@@ -2051,5 +2053,7 @@ int initial_xen(void);
 unsigned long long get_free_memory_size(void);
 int calculate_cyclic_buffer_size(void);
 int prepare_splitblock_table(void);
+int initialize_zlib(z_stream *stream, int level);
+int finalize_zlib(z_stream *stream);
 
 #endif /* MAKEDUMPFILE_H */
-- 
1.9.0

_______________________________________________
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec

^ permalink raw reply related	[flat|nested] 3+ messages in thread

* Re: [PATCH] makedumpfile: Improve performance for parallel compression with zlib.
  2015-10-14  5:24 [PATCH] makedumpfile: Improve performance for parallel compression with zlib Atsushi Kumagai
@ 2015-10-22  3:11 ` "Zhou, Wenjian/周文?"
  2015-10-22  4:07   ` qiaonuohan
  0 siblings, 1 reply; 3+ messages in thread
From: "Zhou, Wenjian/周文?" @ 2015-10-22  3:11 UTC (permalink / raw)
  To: Atsushi Kumagai, kexec; +Cc: qiaonuohan, HATAYAMA Daisuke

Hello Kumagai,

I tested it, and it works well. The following are the results.

in virtual machine(memory 2G):
with empty memory:
          version      |  num-threads  |  time(sec)
       ----------------+---------------+-------------
            devel      |       0       |    12.76
            devel      |       1       |    19.29
            devel      |       2       |    11.56
        + this patch   |       0       |    12.85
        + this patch   |       1       |     5.61
        + this patch   |       2       |     2.68

with full memory:
          version      |  num-threads  |  time(sec)
       ----------------+---------------+-------------
            devel      |       0       |    51.18
            devel      |       1       |    57.82
            devel      |       2       |    41.54
        + this patch   |       0       |    49.25
        + this patch   |       1       |    44.80
        + this patch   |       2       |    33.87


in real machine(memory 16G):
with empty memory:
          version      |  num-threads  |  time(sec)
       ----------------+---------------+-------------
            devel      |       0       |    86.12
            devel      |       1       |   222.37
            devel      |       8       |    81.50
            devel      |       16      |    98.44
        + this patch   |       0       |    86.07
        + this patch   |       1       |    84.33
        + this patch   |       8       |    14.95
        + this patch   |       16      |    13.96

with full memory:
          version      |  num-threads  |  time(sec)
       ----------------+---------------+-------------
            devel      |       0       |   540.89
            devel      |       1       |   715.25
            devel      |       8       |   132.54
            devel      |       16      |   112.89
        + this patch   |       0       |   542.79
        + this patch   |       1       |   538.22
        + this patch   |       8       |   108.28
        + this patch   |       16      |   107.83

-- 
Thanks
Zhou

On 10/14/2015 01:24 PM, Atsushi Kumagai wrote:
> Hello,
> 
> I have improved the performance issue of parallel compression
> which we faced in:
> 
>    http://lists.infradead.org/pipermail/kexec/2015-July/014137.html
> 
> The cause of the issue is that compress2() calls malloc() and free()
> for a temp buffer in each call, it can cause many page faults since
> makedumpfile has to call compress2() for each page.
> 
> It's easy to avoid the issue, just divide compress2() into three
> functions as initialization part, compression part and finalization
> part. Then we don't need to call the initialization function and the
> finalization function for each page.
> 
> In order to benchmark, I measured the execution time and the number of
> page faults by *perf stat -e page-faults* on the current devel branch(v1.5.8+).
> 
> The result is here:
> 
>    CPU:   Dual-Core AMD Opteron(tm) Processor 2218 @ 2.6GHz (4 cores)
>    Memory:  5GB
>    zlib:  1.2.3-29
>    glibc: 2.12-1.132
> 
>          version      |  num-threads  |  time(sec)  |   page-faults
>       ----------------+---------------+-------------+------------------
>            devel      |       1       |   133.96    |    21,801,120
>            devel      |       3       |    87.25    |    21,801,150
>        + this patch   |       1       |    47.80    |     1,036,408
>        + this patch   |       3       |    39.14    |     1,036,478
> 
> 
> Thanks
> Atsushi Kumagai
> 
> 
> From: Atsushi Kumagai <ats-kumagai@wm.jp.nec.com>
> Date: Thu, 8 Oct 2015 15:06:08 +0900
> Subject: [PATCH] Improve performance for parallel compression with zlib.
> 
> compress2() allocates a buffer, compresses a input data and
> deallocates the buffer in each call. makedumpfile has to call
> compress2() for each page, it can cause big performance
> degradation due to many page faults. This issue will be
> especially apparent in the case of multi thread compression
> since per-thread arena is easy to be grown and trimmed compared
> with main arena.
> 
> Fortunately, the zlib functions called in compress2() are global,
> it's easy to extract the allocation and deallocation part from
> compress2().
> 
> Signed-off-by: Atsushi Kumagai <ats-kumagai@wm.jp.nec.com>
> ---
>   makedumpfile.c | 57 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++-
>   makedumpfile.h |  4 ++++
>   2 files changed, 60 insertions(+), 1 deletion(-)
> 
> diff --git a/makedumpfile.c b/makedumpfile.c
> index 06c8baf..fa0b779 100644
> --- a/makedumpfile.c
> +++ b/makedumpfile.c
> @@ -25,6 +25,7 @@
>   #include <sys/time.h>
>   #include <limits.h>
>   #include <assert.h>
> +#include <zlib.h>
>   
>   struct symbol_table	symbol_table;
>   struct size_table	size_table;
> @@ -3538,6 +3539,11 @@ initial_for_parallel()
>   		MMAP_CACHE_PARALLEL(i)->mmap_start_offset = 0;
>   		MMAP_CACHE_PARALLEL(i)->mmap_end_offset = 0;
>   
> +		if (initialize_zlib(&ZLIB_STREAM_PARALLEL(i), Z_BEST_SPEED) == FALSE) {
> +			ERRMSG("zlib initialization failed.\n");
> +			return FALSE;
> +		}
> +
>   #ifdef USELZO
>   		if ((WRKMEM_PARALLEL(i) = malloc(LZO1X_1_MEM_COMPRESS)) == NULL) {
>   			MSG("Can't allocate memory for the working memory. %s\n",
> @@ -3628,6 +3634,7 @@ free_for_parallel()
>   
>   				free(MMAP_CACHE_PARALLEL(i));
>   			}
> +			finalize_zlib(&ZLIB_STREAM_PARALLEL(i));
>   #ifdef USELZO
>   			if (WRKMEM_PARALLEL(i) != NULL)
>   				free(WRKMEM_PARALLEL(i));
> @@ -7017,6 +7024,53 @@ write_kdump_page(struct cache_data *cd_header, struct cache_data *cd_page,
>   	return TRUE;
>   }
>   
> +int initialize_zlib(z_stream *stream, int level)
> +{
> +	int err;
> +
> +	stream->zalloc = (alloc_func)Z_NULL;
> +	stream->zfree = (free_func)Z_NULL;
> +	stream->opaque = (voidpf)Z_NULL;
> +
> +	err = deflateInit(stream, level);
> +	if (err != Z_OK) {
> +		ERRMSG("deflateInit failed: %s\n", zError(err));
> +		return FALSE;
> +	}
> +	return TRUE;
> +}
> +
> +int compress_mdf (z_stream *stream, Bytef *dest, uLongf *destLen,
> +		  const Bytef *source, uLong sourceLen, int level)
> +{
> +	int err;
> +	stream->next_in = (Bytef*)source;
> +	stream->avail_in = (uInt)sourceLen;
> +	stream->next_out = dest;
> +	stream->avail_out = (uInt)*destLen;
> +	if ((uLong)stream->avail_out != *destLen)
> +		return Z_BUF_ERROR;
> +
> +	err = deflate(stream, Z_FINISH);
> +
> +	if (err != Z_STREAM_END) {
> +		deflateReset(stream);
> +		return err == Z_OK ? Z_BUF_ERROR : err;
> +	}
> +	*destLen = stream->total_out;
> +
> +	err = deflateReset(stream);
> +	return err;
> +}
> +
> +int finalize_zlib(z_stream *stream)
> +{
> +	int err;
> +	err = deflateEnd(stream);
> +
> +	return err;
> +}
> +
>   void *
>   kdump_thread_function_cyclic(void *arg) {
>   	void *retval = PTHREAD_FAIL;
> @@ -7035,6 +7089,7 @@ kdump_thread_function_cyclic(void *arg) {
>   	struct mmap_cache *mmap_cache =
>   			MMAP_CACHE_PARALLEL(kdump_thread_args->thread_num);
>   	unsigned long size_out;
> +	z_stream *stream = &ZLIB_STREAM_PARALLEL(kdump_thread_args->thread_num);
>   #ifdef USELZO
>   	lzo_bytep wrkmem = WRKMEM_PARALLEL(kdump_thread_args->thread_num);
>   #endif
> @@ -7135,7 +7190,7 @@ kdump_thread_function_cyclic(void *arg) {
>   			size_out = kdump_thread_args->len_buf_out;
>   			if ((info->flag_compress & DUMP_DH_COMPRESSED_ZLIB)
>   			    && ((size_out = kdump_thread_args->len_buf_out),
> -				compress2(buf_out, &size_out, buf,
> +				compress_mdf(stream, buf_out, &size_out, buf,
>   					  info->page_size,
>   					  Z_BEST_SPEED) == Z_OK)
>   			    && (size_out < info->page_size)) {
> diff --git a/makedumpfile.h b/makedumpfile.h
> index 0bd6425..cb8f0f3 100644
> --- a/makedumpfile.h
> +++ b/makedumpfile.h
> @@ -438,6 +438,7 @@ do { \
>   #define BUF_PARALLEL(i)			info->parallel_info[i].buf
>   #define BUF_OUT_PARALLEL(i)		info->parallel_info[i].buf_out
>   #define MMAP_CACHE_PARALLEL(i)		info->parallel_info[i].mmap_cache
> +#define ZLIB_STREAM_PARALLEL(i)		info->parallel_info[i].zlib_stream
>   #ifdef USELZO
>   #define WRKMEM_PARALLEL(i)		info->parallel_info[i].wrkmem
>   #endif
> @@ -1050,6 +1051,7 @@ struct parallel_info {
>   	unsigned char		*buf;
>   	unsigned char 		*buf_out;
>   	struct mmap_cache	*mmap_cache;
> +	z_stream		zlib_stream;
>   #ifdef USELZO
>   	lzo_bytep		wrkmem;
>   #endif
> @@ -2051,5 +2053,7 @@ int initial_xen(void);
>   unsigned long long get_free_memory_size(void);
>   int calculate_cyclic_buffer_size(void);
>   int prepare_splitblock_table(void);
> +int initialize_zlib(z_stream *stream, int level);
> +int finalize_zlib(z_stream *stream);
>   
>   #endif /* MAKEDUMPFILE_H */
> 

_______________________________________________
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec

^ permalink raw reply	[flat|nested] 3+ messages in thread

* Re: [PATCH] makedumpfile: Improve performance for parallel compression with zlib.
  2015-10-22  3:11 ` "Zhou, Wenjian/周文?"
@ 2015-10-22  4:07   ` qiaonuohan
  0 siblings, 0 replies; 3+ messages in thread
From: qiaonuohan @ 2015-10-22  4:07 UTC (permalink / raw)
  To: "Zhou, Wenjian/周文?"
  Cc: kernel@kyup.com >> Nikolay Borisov, Atsushi Kumagai, kexec,
	HATAYAMA Daisuke

Hello Zhou,

The performance improvement looks great according to your data. However,
the data is not enough to show how the change behaves in common use.

Since the amount of 0s and 1s in a page affects the time needed for
compression, I would like you to run more benchmarks on different cores
with different amounts of 0s and 1s.

On 10/22/2015 11:11 AM, "Zhou, Wenjian/周文?" wrote:
> Hello Kumagai,
>
> I test it, and it works well. The following is the results.
>
> in virtual machine(memory 2G):
> with empty memory:
>            version      |  num-threads  |  time(sec)
>         ----------------+---------------+-------------
>              devel      |       0       |    12.76
>              devel      |       1       |    19.29
>              devel      |       2       |    11.56
>          + this patch   |       0       |    12.85
>          + this patch   |       1       |     5.61
>          + this patch   |       2       |     2.68
>
> with full memory:

Does "full memory" mean that memory is filled with 1s, with no 0s?

>            version      |  num-threads  |  time(sec)
>         ----------------+---------------+-------------
>              devel      |       0       |    51.18
>              devel      |       1       |    57.82
>              devel      |       2       |    41.54
>          + this patch   |       0       |    49.25
>          + this patch   |       1       |    44.80
>          + this patch   |       2       |    33.87
>
>
> in real machine(memory 16G):
> with empty memory:
>            version      |  num-threads  |  time(sec)
>         ----------------+---------------+-------------
>              devel      |       0       |    86.12
>              devel      |       1       |   222.37
>              devel      |       8       |    81.50
>              devel      |       16      |    98.44
>          + this patch   |       0       |    86.07
>          + this patch   |       1       |    84.33
>          + this patch   |       8       |    14.95
>          + this patch   |       16      |    13.96
>
> with full memory:
>            version      |  num-threads  |  time(sec)
>         ----------------+---------------+-------------
>              devel      |       0       |   540.89
>              devel      |       1       |   715.25
>              devel      |       8       |   132.54
>              devel      |       16      |   112.89
>          + this patch   |       0       |   542.79
>          + this patch   |       1       |   538.22
>          + this patch   |       8       |   108.28
>          + this patch   |       16      |   107.83
>


-- 
Regards
Qiao Nuohan

_______________________________________________
kexec mailing list
kexec@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/kexec

^ permalink raw reply	[flat|nested] 3+ messages in thread

end of thread, other threads:[~2015-10-22  4:07 UTC | newest]

Thread overview: 3+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2015-10-14  5:24 [PATCH] makedumpfile: Improve performance for parallel compression with zlib Atsushi Kumagai
2015-10-22  3:11 ` "Zhou, Wenjian/周文?"
2015-10-22  4:07   ` qiaonuohan

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.