* [PATCH 1/3] buffer: add an aligned buffer with less alignment than a page
@ 2014-09-15 15:55 Janne Grunau
  2014-09-15 15:55 ` [PATCH 2/3] ec: make use of added aligned buffers Janne Grunau
                   ` (4 more replies)
  0 siblings, 5 replies; 41+ messages in thread
From: Janne Grunau @ 2014-09-15 15:55 UTC (permalink / raw)
  To: ceph-devel

SIMD optimized erasure code computation needs aligned memory. Page-aligned
buffers are wasteful for it, though, since the buffers used for the erasure
code computation are typically smaller than a page.

Currently an alignment of 16 bytes is enough for the SIMD instruction sets
in use (SSE4 and NEON).

Signed-off-by: Janne Grunau <j@jannau.net>
---
 configure.ac         |   9 +++++
 src/common/buffer.cc | 100 +++++++++++++++++++++++++++++++++++++++++++++++++++
 src/include/buffer.h |  10 ++++++
 3 files changed, 119 insertions(+)

diff --git a/configure.ac b/configure.ac
index cccf2d9..1bb27c4 100644
--- a/configure.ac
+++ b/configure.ac
@@ -793,6 +793,15 @@ AC_MSG_RESULT([no])
 ])
 
 #
+# Check for functions to provide aligned memory
+#
+AC_CHECK_HEADERS([malloc.h])
+AC_CHECK_FUNCS([posix_memalign _aligned_malloc memalign aligned_malloc],
+               [found_memalign=yes; break])
+
+AS_IF([test "x$found_memalign" != "xyes"], [AC_MSG_WARN([No function for aligned memory allocation found])])
+
+#
 # Check for pthread spinlock (depends on ACX_PTHREAD)
 #
 saved_LIBS="$LIBS"
diff --git a/src/common/buffer.cc b/src/common/buffer.cc
index b141759..acc221f 100644
--- a/src/common/buffer.cc
+++ b/src/common/buffer.cc
@@ -30,6 +30,10 @@
 #include <sys/uio.h>
 #include <limits.h>
 
+#ifdef HAVE_MALLOC_H
+#include <malloc.h>
+#endif
+
 namespace ceph {
 
 #ifdef BUFFER_DEBUG
@@ -155,9 +159,15 @@ static simple_spinlock_t buffer_debug_lock = SIMPLE_SPINLOCK_INITIALIZER;
     virtual int zero_copy_to_fd(int fd, loff_t *offset) {
       return -ENOTSUP;
     }
+    virtual bool is_aligned() {
+      return ((long)data & ~CEPH_ALIGN_MASK) == 0;
+    }
     virtual bool is_page_aligned() {
       return ((long)data & ~CEPH_PAGE_MASK) == 0;
     }
+    bool is_n_align_sized() {
+      return (len & ~CEPH_ALIGN_MASK) == 0;
+    }
     bool is_n_page_sized() {
       return (len & ~CEPH_PAGE_MASK) == 0;
     }
@@ -209,6 +219,41 @@ static simple_spinlock_t buffer_debug_lock = SIMPLE_SPINLOCK_INITIALIZER;
     }
   };
 
+  class buffer::raw_aligned : public buffer::raw {
+  public:
+    raw_aligned(unsigned l) : raw(l) {
+      if (len) {
+#if HAVE_POSIX_MEMALIGN
+        if (posix_memalign((void **) &data, CEPH_ALIGN, len))
+          data = 0;
+#elif HAVE__ALIGNED_MALLOC
+        data = _aligned_malloc(len, CEPH_ALIGN);
+#elif HAVE_MEMALIGN
+        data = memalign(CEPH_ALIGN, len);
+#elif HAVE_ALIGNED_MALLOC
+        data = aligned_malloc((len + CEPH_ALIGN - 1) & ~CEPH_ALIGN_MASK,
+                              CEPH_ALIGN);
+#else
+        data = malloc(len);
+#endif
+        if (!data)
+          throw bad_alloc();
+      } else {
+        data = 0;
+      }
+      inc_total_alloc(len);
+      bdout << "raw_aligned " << this << " alloc " << (void *)data << " " << l << " " << buffer::get_total_alloc() << bendl;
+    }
+    ~raw_aligned() {
+      free(data);
+      dec_total_alloc(len);
+      bdout << "raw_aligned " << this << " free " << (void *)data << " " << buffer::get_total_alloc() << bendl;
+    }
+    raw* clone_empty() {
+      return new raw_aligned(len);
+    }
+  };
+
 #ifndef __CYGWIN__
   class buffer::raw_mmap_pages : public buffer::raw {
   public:
@@ -334,6 +379,10 @@ static simple_spinlock_t buffer_debug_lock = SIMPLE_SPINLOCK_INITIALIZER;
       return true;
     }
 
+    bool is_aligned() {
+      return false;
+    }
+
     bool is_page_aligned() {
       return false;
     }
@@ -520,6 +569,9 @@ static simple_spinlock_t buffer_debug_lock = SIMPLE_SPINLOCK_INITIALIZER;
   buffer::raw* buffer::create_static(unsigned len, char *buf) {
     return new raw_static(buf, len);
   }
+  buffer::raw* buffer::create_aligned(unsigned len) {
+    return new raw_aligned(len);
+  }
   buffer::raw* buffer::create_page_aligned(unsigned len) {
 #ifndef __CYGWIN__
     //return new raw_mmap_pages(len);
@@ -1013,6 +1065,16 @@ static simple_spinlock_t buffer_debug_lock = SIMPLE_SPINLOCK_INITIALIZER;
     return true;
   }
 
+  bool buffer::list::is_aligned() const
+  {
+    for (std::list<ptr>::const_iterator it = _buffers.begin();
+         it != _buffers.end();
+         ++it)
+      if (!it->is_aligned())
+        return false;
+    return true;
+  }
+
   bool buffer::list::is_page_aligned() const
   {
     for (std::list<ptr>::const_iterator it = _buffers.begin();
@@ -1101,6 +1163,44 @@ static simple_spinlock_t buffer_debug_lock = SIMPLE_SPINLOCK_INITIALIZER;
     _buffers.push_back(nb);
   }
 
+void buffer::list::rebuild_aligned()
+{
+  std::list<ptr>::iterator p = _buffers.begin();
+  while (p != _buffers.end()) {
+    // keep anything that's already sized and aligned to CEPH_ALIGN
+    if (p->is_aligned() && p->is_n_align_sized()) {
+      /*cout << " segment " << (void*)p->c_str()
+             << " offset " << ((unsigned long)p->c_str() & ~CEPH_ALIGN_MASK)
+             << " length " << p->length()
+             << " " << (p->length() & ~CEPH_ALIGN_MASK) << " ok" << std::endl;
+      */
+      ++p;
+      continue;
+    }
+
+    // consolidate unaligned items, until we get something that is sized+aligned
+    list unaligned;
+    unsigned offset = 0;
+    do {
+      /*cout << " segment " << (void*)p->c_str()
+             << " offset " << ((unsigned long)p->c_str() & ~CEPH_ALIGN_MASK)
+             << " length " << p->length() << " " << (p->length() & ~CEPH_ALIGN_MASK)
+             << " overall offset " << offset << " " << (offset & ~CEPH_ALIGN_MASK)
+             << " not ok" << std::endl;
+      */
+      offset += p->length();
+      unaligned.push_back(*p);
+      _buffers.erase(p++);
+    } while (p != _buffers.end() &&
+	     (!p->is_aligned() ||
+	      !p->is_n_align_sized() ||
+	      (offset & ~CEPH_ALIGN_MASK)));
+    ptr nb(buffer::create_aligned(unaligned._len));
+    unaligned.rebuild(nb);
+    _buffers.insert(p, unaligned._buffers.front());
+  }
+}
+
 void buffer::list::rebuild_page_aligned()
 {
   std::list<ptr>::iterator p = _buffers.begin();
diff --git a/src/include/buffer.h b/src/include/buffer.h
index e5c1b50..ea362e7 100644
--- a/src/include/buffer.h
+++ b/src/include/buffer.h
@@ -56,6 +56,9 @@
 # include <assert.h>
 #endif
 
+#define CEPH_ALIGN 16
+#define CEPH_ALIGN_MASK (~(CEPH_ALIGN - 1LLU))
+
 namespace ceph {
 
 class buffer {
@@ -124,6 +127,7 @@ private:
    */
   class raw;
   class raw_malloc;
+  class raw_aligned;
   class raw_static;
   class raw_mmap_pages;
   class raw_posix_aligned;
@@ -144,6 +148,7 @@ public:
   static raw* create_malloc(unsigned len);
   static raw* claim_malloc(unsigned len, char *buf);
   static raw* create_static(unsigned len, char *buf);
+  static raw* create_aligned(unsigned len);
   static raw* create_page_aligned(unsigned len);
   static raw* create_zero_copy(unsigned len, int fd, int64_t *offset);
 
@@ -177,7 +182,9 @@ public:
     bool at_buffer_head() const { return _off == 0; }
     bool at_buffer_tail() const;
 
+    bool is_aligned() const { return ((long)c_str() & ~CEPH_ALIGN_MASK) == 0; }
     bool is_page_aligned() const { return ((long)c_str() & ~CEPH_PAGE_MASK) == 0; }
+    bool is_n_align_sized() const { return (length() & ~CEPH_ALIGN_MASK) == 0; }
     bool is_n_page_sized() const { return (length() & ~CEPH_PAGE_MASK) == 0; }
 
     // accessors
@@ -344,7 +351,9 @@ public:
     bool contents_equal(buffer::list& other);
 
     bool can_zero_copy() const;
+    bool is_aligned() const;
     bool is_page_aligned() const;
+    bool is_n_align_sized() const;
     bool is_n_page_sized() const;
 
     bool is_zero() const;
@@ -382,6 +391,7 @@ public:
     bool is_contiguous();
     void rebuild();
     void rebuild(ptr& nb);
+    void rebuild_aligned();
     void rebuild_page_aligned();
 
     // sort-of-like-assignment-op
-- 
2.1.0


^ permalink raw reply related	[flat|nested] 41+ messages in thread
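
For readers unfamiliar with the mask idiom in the patch above, here is a small
standalone sketch (not part of the patch) of how the 16-byte aligned allocation
and the is_aligned()-style check fit together. It assumes posix_memalign() is
available, mirroring the HAVE_POSIX_MEMALIGN branch, and uses the same
CEPH_ALIGN/CEPH_ALIGN_MASK arithmetic (~CEPH_ALIGN_MASK is 0xf, so the check
simply tests the low four address bits):

#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <new>

#define ALIGN 16                      /* mirrors CEPH_ALIGN */
#define ALIGN_MASK (~(ALIGN - 1LLU))  /* mirrors CEPH_ALIGN_MASK */

static bool is_aligned(const void *p)
{
  /* same test as raw::is_aligned(): the low 4 bits of the address are zero */
  return ((uintptr_t)p & ~ALIGN_MASK) == 0;
}

static char *alloc_aligned(size_t len)
{
  void *data = 0;
  if (posix_memalign(&data, ALIGN, len))  /* HAVE_POSIX_MEMALIGN branch */
    throw std::bad_alloc();
  return (char *)data;
}

int main()
{
  char *buf = alloc_aligned(144);
  printf("%p aligned: %d\n", (void *)buf, is_aligned(buf)); /* prints 1 */
  free(buf);
  return 0;
}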

* [PATCH 2/3] ec: make use of added aligned buffers
  2014-09-15 15:55 [PATCH 1/3] buffer: add an aligned buffer with less alignment than a page Janne Grunau
@ 2014-09-15 15:55 ` Janne Grunau
  2014-09-15 17:20   ` Loic Dachary
  2014-09-15 15:55 ` [PATCH 3/3] ceph_erasure_code_benchmark: align the encoding input Janne Grunau
                   ` (3 subsequent siblings)
  4 siblings, 1 reply; 41+ messages in thread
From: Janne Grunau @ 2014-09-15 15:55 UTC (permalink / raw)
  To: ceph-devel

Requiring page aligned buffers and realigning the input if necessary
creates measurable overhead. ceph_erasure_code_benchmark is ~30% faster
with this change for technique=reed_sol_van,k=2,m=1.

This also prevents a misaligned buffer when bufferlist::c_str() has to
allocate a new buffer to provide a contiguous one. See bug #9408

Signed-off-by: Janne Grunau <j@jannau.net>
---
 src/erasure-code/ErasureCode.cc | 46 +++++++++++++++++++++++++----------------
 src/erasure-code/ErasureCode.h  |  3 ++-
 2 files changed, 30 insertions(+), 19 deletions(-)

diff --git a/src/erasure-code/ErasureCode.cc b/src/erasure-code/ErasureCode.cc
index 5953f49..078f60b 100644
--- a/src/erasure-code/ErasureCode.cc
+++ b/src/erasure-code/ErasureCode.cc
@@ -54,22 +54,38 @@ int ErasureCode::minimum_to_decode_with_cost(const set<int> &want_to_read,
 }
 
 int ErasureCode::encode_prepare(const bufferlist &raw,
-                                bufferlist *prepared) const
+                                map<int, bufferlist> &encoded) const
 {
   unsigned int k = get_data_chunk_count();
   unsigned int m = get_chunk_count() - k;
   unsigned blocksize = get_chunk_size(raw.length());
-  unsigned padded_length = blocksize * k;
-  *prepared = raw;
-  if (padded_length - raw.length() > 0) {
-    bufferptr pad(padded_length - raw.length());
-    pad.zero();
-    prepared->push_back(pad);
+  unsigned pad_len = blocksize * k - raw.length();
+
+  bufferlist prepared = raw;
+
+  if (!prepared.is_aligned()) {
+    prepared.rebuild_aligned();
+  }
+
+  for (unsigned int i = 0; i < k - !!pad_len; i++) {
+    int chunk_index = chunk_mapping.size() > 0 ? chunk_mapping[i] : i;
+    bufferlist &chunk = encoded[chunk_index];
+    chunk.substr_of(prepared, i * blocksize, blocksize);
+  }
+  if (pad_len > 0) {
+    int chunk_index = chunk_mapping.size() > 0 ? chunk_mapping[k - 1] : k - 1;
+    bufferlist &chunk = encoded[chunk_index];
+    bufferptr padded(buffer::create_aligned(blocksize));
+    raw.copy((k - 1) * blocksize, blocksize - pad_len, padded.c_str());
+    padded.zero(blocksize - pad_len, pad_len);
+    chunk.push_back(padded);
   }
-  unsigned coding_length = blocksize * m;
-  bufferptr coding(buffer::create_page_aligned(coding_length));
-  prepared->push_back(coding);
-  prepared->rebuild_page_aligned();
+  for (unsigned int i = k; i < k + m; i++) {
+    int chunk_index = chunk_mapping.size() > 0 ? chunk_mapping[i] : i;
+    bufferlist &chunk = encoded[chunk_index];
+    chunk.push_back(buffer::create_aligned(blocksize));
+  }
+
   return 0;
 }
 
@@ -80,15 +96,9 @@ int ErasureCode::encode(const set<int> &want_to_encode,
   unsigned int k = get_data_chunk_count();
   unsigned int m = get_chunk_count() - k;
   bufferlist out;
-  int err = encode_prepare(in, &out);
+  int err = encode_prepare(in, *encoded);
   if (err)
     return err;
-  unsigned blocksize = get_chunk_size(in.length());
-  for (unsigned int i = 0; i < k + m; i++) {
-    int chunk_index = chunk_mapping.size() > 0 ? chunk_mapping[i] : i;
-    bufferlist &chunk = (*encoded)[chunk_index];
-    chunk.substr_of(out, i * blocksize, blocksize);
-  }
   encode_chunks(want_to_encode, encoded);
   for (unsigned int i = 0; i < k + m; i++) {
     if (want_to_encode.count(i) == 0)
diff --git a/src/erasure-code/ErasureCode.h b/src/erasure-code/ErasureCode.h
index 7aaea95..62aa383 100644
--- a/src/erasure-code/ErasureCode.h
+++ b/src/erasure-code/ErasureCode.h
@@ -46,7 +46,8 @@ namespace ceph {
                                             const map<int, int> &available,
                                             set<int> *minimum);
 
-    int encode_prepare(const bufferlist &raw, bufferlist *prepared) const;
+    int encode_prepare(const bufferlist &raw,
+                       map<int, bufferlist> &encoded) const;
 
     virtual int encode(const set<int> &want_to_encode,
                        const bufferlist &in,
-- 
2.1.0


^ permalink raw reply related	[flat|nested] 41+ messages in thread
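
To make the new chunk layout in encode_prepare() above easier to follow, here
is a small standalone sketch (not part of the patch) that just prints which
chunk gets what. get_chunk_size() is plugin-specific; the sketch assumes it
rounds length/k up to the 16-byte alignment, which matches the numbers
discussed later in this thread:

#include <stdio.h>

int main()
{
  const unsigned k = 2, m = 1, align = 16;
  const unsigned raw_len = 1000;                    /* arbitrary input size */
  const unsigned per_chunk = (raw_len + k - 1) / k; /* 500 */
  const unsigned blocksize = (per_chunk + align - 1) / align * align; /* 512 */
  const unsigned pad_len = blocksize * k - raw_len; /* 24 */

  /* the first k - !!pad_len chunks are aligned substrings of the input */
  for (unsigned i = 0; i < k - !!pad_len; i++)
    printf("data chunk %u: substr_of(prepared, %u, %u)\n",
           i, i * blocksize, blocksize);

  /* the last data chunk is a fresh aligned buffer: the remaining input bytes
     are copied in and pad_len bytes are zeroed at the end */
  if (pad_len)
    printf("data chunk %u: %u bytes copied + %u zero bytes\n",
           k - 1, blocksize - pad_len, pad_len);

  /* chunks k..k+m-1 are empty aligned coding buffers for encode_chunks() */
  for (unsigned i = k; i < k + m; i++)
    printf("coding chunk %u: create_aligned(%u)\n", i, blocksize);
  return 0;
}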

* [PATCH 3/3] ceph_erasure_code_benchmark: align the encoding input
  2014-09-15 15:55 [PATCH 1/3] buffer: add an aligned buffer with less alignment than a page Janne Grunau
  2014-09-15 15:55 ` [PATCH 2/3] ec: make use of added aligned buffers Janne Grunau
@ 2014-09-15 15:55 ` Janne Grunau
  2014-09-15 16:46 ` [PATCH 1/3] buffer: add an aligned buffer with less alignment than a page Loic Dachary
                   ` (2 subsequent siblings)
  4 siblings, 0 replies; 41+ messages in thread
From: Janne Grunau @ 2014-09-15 15:55 UTC (permalink / raw)
  To: ceph-devel

The benchmark is supposed to measure the encoding speed and not the
overhead of buffer realignments.

Signed-off-by: Janne Grunau <j@jannau.net>
---
 src/test/erasure-code/ceph_erasure_code_benchmark.cc | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/test/erasure-code/ceph_erasure_code_benchmark.cc b/src/test/erasure-code/ceph_erasure_code_benchmark.cc
index 84bd7da..f63fd77 100644
--- a/src/test/erasure-code/ceph_erasure_code_benchmark.cc
+++ b/src/test/erasure-code/ceph_erasure_code_benchmark.cc
@@ -144,6 +144,7 @@ int ErasureCodeBench::encode()
 
   bufferlist in;
   in.append(string(in_size, 'X'));
+  in.rebuild_aligned();
   set<int> want_to_encode;
   for (int i = 0; i < k + m; i++) {
     want_to_encode.insert(i);
-- 
2.1.0


^ permalink raw reply related	[flat|nested] 41+ messages in thread

* Re: [PATCH 1/3] buffer: add an aligned buffer with less alignment than a page
  2014-09-15 15:55 [PATCH 1/3] buffer: add an aligned buffer with less alignment than a page Janne Grunau
  2014-09-15 15:55 ` [PATCH 2/3] ec: make use of added aligned buffers Janne Grunau
  2014-09-15 15:55 ` [PATCH 3/3] ceph_erasure_code_benchmark: align the encoding input Janne Grunau
@ 2014-09-15 16:46 ` Loic Dachary
  2014-09-18 10:33 ` v2 aligned buffer changes for erasure codes Janne Grunau
  2014-09-29 12:34 ` [PATCH v3 0/4] buffer alignment for erasure code SIMD Janne Grunau
  4 siblings, 0 replies; 41+ messages in thread
From: Loic Dachary @ 2014-09-15 16:46 UTC (permalink / raw)
  To: Janne Grunau, ceph-devel

[-- Attachment #1: Type: text/plain, Size: 8674 bytes --]

Looks great!

Running on git builder under the branch wip-9408-buffer-alignment at http://ceph.com/gitbuilder.cgi

On 15/09/2014 17:55, Janne Grunau wrote:
> [...]

-- 
Loïc Dachary, Artisan Logiciel Libre


[-- Attachment #2: OpenPGP digital signature --]
[-- Type: application/pgp-signature, Size: 263 bytes --]

^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [PATCH 2/3] ec: make use of added aligned buffers
  2014-09-15 15:55 ` [PATCH 2/3] ec: make use of added aligned buffers Janne Grunau
@ 2014-09-15 17:20   ` Loic Dachary
  2014-09-15 23:56     ` Ma, Jianpeng
  0 siblings, 1 reply; 41+ messages in thread
From: Loic Dachary @ 2014-09-15 17:20 UTC (permalink / raw)
  To: Janne Grunau, ceph-devel

[-- Attachment #1: Type: text/plain, Size: 4532 bytes --]

Hi Janne,

See below:

On 15/09/2014 17:55, Janne Grunau wrote:
> Requiring page aligned buffers and realigning the input if necessary
> creates measurable oberhead. ceph_erasure_code_benchmark is ~30% faster
> with this change for technique=reed_sol_van,k=2,m=1.
> 
> Also prevents a misaligned buffer when bufferlist::c_str(bufferlist)
> has to allocate a new buffer to provide continuous one. See bug #9408
> 
> Signed-off-by: Janne Grunau <j@jannau.net>
> ---
>  src/erasure-code/ErasureCode.cc | 46 +++++++++++++++++++++++++----------------
>  src/erasure-code/ErasureCode.h  |  3 ++-
>  2 files changed, 30 insertions(+), 19 deletions(-)
> 
> diff --git a/src/erasure-code/ErasureCode.cc b/src/erasure-code/ErasureCode.cc
> index 5953f49..078f60b 100644
> --- a/src/erasure-code/ErasureCode.cc
> +++ b/src/erasure-code/ErasureCode.cc
> @@ -54,22 +54,38 @@ int ErasureCode::minimum_to_decode_with_cost(const set<int> &want_to_read,
>  }
>  
>  int ErasureCode::encode_prepare(const bufferlist &raw,
> -                                bufferlist *prepared) const
> +                                map<int, bufferlist> &encoded) const
>  {
>    unsigned int k = get_data_chunk_count();
>    unsigned int m = get_chunk_count() - k;
>    unsigned blocksize = get_chunk_size(raw.length());
> -  unsigned padded_length = blocksize * k;
> -  *prepared = raw;
> -  if (padded_length - raw.length() > 0) {
> -    bufferptr pad(padded_length - raw.length());
> -    pad.zero();
> -    prepared->push_back(pad);
> +  unsigned pad_len = blocksize * k - raw.length();
> +
> +  bufferlist prepared = raw;
> +
> +  if (!prepared.is_aligned()) {
> +    prepared.rebuild_aligned();
> +  }
> +
> +  for (unsigned int i = 0; i < k - !!pad_len; i++) {
> +    int chunk_index = chunk_mapping.size() > 0 ? chunk_mapping[i] : i;
> +    bufferlist &chunk = encoded[chunk_index];
> +    chunk.substr_of(prepared, i * blocksize, blocksize);
> +  }

It is possible for more than one chunk to be padding. It's a corner case, but for instance with alignment = 16, k=12 and an input of length 1550 you end up with two padding chunks because the blocksize is 144.
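
Working the numbers: ceil(1550/12) = 130, rounded up to a multiple of 16 gives a blocksize of 144; 12 * 144 = 1728, so there are 178 bytes of padding, which is more than one 144-byte block. Chunk 11 would be pure padding, chunk 10 would need 34 padding bytes as well, and the substr_of() for chunk 10 above would reach offset 1584, past the end of the 1550-byte input.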

> +  if (pad_len > 0) {
> +    int chunk_index = chunk_mapping.size() > 0 ? chunk_mapping[k - 1] : k - 1;
> +    bufferlist &chunk = encoded[chunk_index];
> +    bufferptr padded(buffer::create_aligned(blocksize));
> +    raw.copy((k - 1) * blocksize, blocksize - pad_len, padded.c_str());
> +    padded.zero(blocksize - pad_len, pad_len);
> +    chunk.push_back(padded);
>    }
> -  unsigned coding_length = blocksize * m;
> -  bufferptr coding(buffer::create_page_aligned(coding_length));
> -  prepared->push_back(coding);
> -  prepared->rebuild_page_aligned();
> +  for (unsigned int i = k; i < k + m; i++) {
> +    int chunk_index = chunk_mapping.size() > 0 ? chunk_mapping[i] : i;
> +    bufferlist &chunk = encoded[chunk_index];
> +    chunk.push_back(buffer::create_aligned(blocksize));
> +  }
> +
>    return 0;
>  }
>  
> @@ -80,15 +96,9 @@ int ErasureCode::encode(const set<int> &want_to_encode,
>    unsigned int k = get_data_chunk_count();
>    unsigned int m = get_chunk_count() - k;
>    bufferlist out;
> -  int err = encode_prepare(in, &out);
> +  int err = encode_prepare(in, *encoded);
>    if (err)
>      return err;
> -  unsigned blocksize = get_chunk_size(in.length());
> -  for (unsigned int i = 0; i < k + m; i++) {
> -    int chunk_index = chunk_mapping.size() > 0 ? chunk_mapping[i] : i;
> -    bufferlist &chunk = (*encoded)[chunk_index];
> -    chunk.substr_of(out, i * blocksize, blocksize);
> -  }
>    encode_chunks(want_to_encode, encoded);
>    for (unsigned int i = 0; i < k + m; i++) {
>      if (want_to_encode.count(i) == 0)
> diff --git a/src/erasure-code/ErasureCode.h b/src/erasure-code/ErasureCode.h
> index 7aaea95..62aa383 100644
> --- a/src/erasure-code/ErasureCode.h
> +++ b/src/erasure-code/ErasureCode.h
> @@ -46,7 +46,8 @@ namespace ceph {
>                                              const map<int, int> &available,
>                                              set<int> *minimum);
>  
> -    int encode_prepare(const bufferlist &raw, bufferlist *prepared) const;
> +    int encode_prepare(const bufferlist &raw,
> +                       map<int, bufferlist> &encoded) const;
>  
>      virtual int encode(const set<int> &want_to_encode,
>                         const bufferlist &in,
> 

-- 
Loïc Dachary, Artisan Logiciel Libre


[-- Attachment #2: OpenPGP digital signature --]
[-- Type: application/pgp-signature, Size: 263 bytes --]

^ permalink raw reply	[flat|nested] 41+ messages in thread

* RE: [PATCH 2/3] ec: make use of added aligned buffers
  2014-09-15 17:20   ` Loic Dachary
@ 2014-09-15 23:56     ` Ma, Jianpeng
  2014-09-16  0:02       ` Sage Weil
  0 siblings, 1 reply; 41+ messages in thread
From: Ma, Jianpeng @ 2014-09-15 23:56 UTC (permalink / raw)
  To: Loic Dachary, Janne Grunau, ceph-devel

What if we modify bufferlist::c_str() to bufferlist::c_str(bool align)?

  if (align)
    posix_memalign(data, CEPH_PAGE_SIZE, len)
  else
    // original code

I think this is simple and correct code.

Jianpeng
Thanks!
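
For illustration only, a toy sketch of the c_str(align) idea proposed here
(Sage refines it below to an integer alignment defaulting to 1). This is not
the existing bufferlist API; it just shows a fragmented buffer being flattened
into one allocation whose alignment is chosen by the caller:

#include <stdlib.h>
#include <string.h>
#include <new>
#include <string>
#include <vector>

/* Flatten a list of fragments into one contiguous buffer. align = 1 mimics
 * today's plain rebuild; align = 16 or a page size mimics the proposal. */
static char *flatten(const std::vector<std::string> &fragments,
                     size_t align = 1)
{
  size_t total = 0;
  for (size_t i = 0; i < fragments.size(); i++)
    total += fragments[i].size();

  char *out;
  if (align > 1) {
    void *p = 0;
    if (posix_memalign(&p, align, total))     /* aligned rebuild, as proposed */
      throw std::bad_alloc();
    out = static_cast<char *>(p);
  } else {
    out = static_cast<char *>(malloc(total)); /* ordinary rebuild */
    if (!out)
      throw std::bad_alloc();
  }

  size_t off = 0;
  for (size_t i = 0; i < fragments.size(); i++) {
    memcpy(out + off, fragments[i].data(), fragments[i].size());
    off += fragments[i].size();
  }
  return out;                                 /* caller frees with free() */
}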

> -----Original Message-----
> From: ceph-devel-owner@vger.kernel.org
> [mailto:ceph-devel-owner@vger.kernel.org] On Behalf Of Loic Dachary
> Sent: Tuesday, September 16, 2014 1:20 AM
> To: Janne Grunau; ceph-devel@vger.kernel.org
> Subject: Re: [PATCH 2/3] ec: make use of added aligned buffers
> 
> Hi Janne,
> 
> See below:
> 
> On 15/09/2014 17:55, Janne Grunau wrote:
> > Requiring page aligned buffers and realigning the input if necessary
> > creates measurable oberhead. ceph_erasure_code_benchmark is ~30%
> > faster with this change for technique=reed_sol_van,k=2,m=1.
> >
> > Also prevents a misaligned buffer when bufferlist::c_str(bufferlist)
> > has to allocate a new buffer to provide continuous one. See bug #9408
> >
> > Signed-off-by: Janne Grunau <j@jannau.net>
> > ---
> >  src/erasure-code/ErasureCode.cc | 46
> > +++++++++++++++++++++++++----------------
> >  src/erasure-code/ErasureCode.h  |  3 ++-
> >  2 files changed, 30 insertions(+), 19 deletions(-)
> >
> > diff --git a/src/erasure-code/ErasureCode.cc
> > b/src/erasure-code/ErasureCode.cc index 5953f49..078f60b 100644
> > --- a/src/erasure-code/ErasureCode.cc
> > +++ b/src/erasure-code/ErasureCode.cc
> > @@ -54,22 +54,38 @@ int
> ErasureCode::minimum_to_decode_with_cost(const
> > set<int> &want_to_read,  }
> >
> >  int ErasureCode::encode_prepare(const bufferlist &raw,
> > -                                bufferlist *prepared) const
> > +                                map<int, bufferlist> &encoded)
> const
> >  {
> >    unsigned int k = get_data_chunk_count();
> >    unsigned int m = get_chunk_count() - k;
> >    unsigned blocksize = get_chunk_size(raw.length());
> > -  unsigned padded_length = blocksize * k;
> > -  *prepared = raw;
> > -  if (padded_length - raw.length() > 0) {
> > -    bufferptr pad(padded_length - raw.length());
> > -    pad.zero();
> > -    prepared->push_back(pad);
> > +  unsigned pad_len = blocksize * k - raw.length();
> > +
> > +  bufferlist prepared = raw;
> > +
> > +  if (!prepared.is_aligned()) {
> > +    prepared.rebuild_aligned();
> > +  }
> > +
> > +  for (unsigned int i = 0; i < k - !!pad_len; i++) {
> > +    int chunk_index = chunk_mapping.size() > 0 ? chunk_mapping[i] : i;
> > +    bufferlist &chunk = encoded[chunk_index];
> > +    chunk.substr_of(prepared, i * blocksize, blocksize);  }
> 
> It is possible for more than one chunk to be padding. It's a border case but... for
> instance with alignment = 16, k=12 and in of length 1550 you end up with two
> padding chunks because the blocksize is 144.
> 
> > +  if (pad_len > 0) {
> > +    int chunk_index = chunk_mapping.size() > 0 ? chunk_mapping[k - 1] : k -
> 1;
> > +    bufferlist &chunk = encoded[chunk_index];
> > +    bufferptr padded(buffer::create_aligned(blocksize));
> > +    raw.copy((k - 1) * blocksize, blocksize - pad_len, padded.c_str());
> > +    padded.zero(blocksize - pad_len, pad_len);
> > +    chunk.push_back(padded);
> >    }
> > -  unsigned coding_length = blocksize * m;
> > -  bufferptr coding(buffer::create_page_aligned(coding_length));
> > -  prepared->push_back(coding);
> > -  prepared->rebuild_page_aligned();
> > +  for (unsigned int i = k; i < k + m; i++) {
> > +    int chunk_index = chunk_mapping.size() > 0 ? chunk_mapping[i] : i;
> > +    bufferlist &chunk = encoded[chunk_index];
> > +    chunk.push_back(buffer::create_aligned(blocksize));
> > +  }
> > +
> >    return 0;
> >  }
> >
> > @@ -80,15 +96,9 @@ int ErasureCode::encode(const set<int>
> &want_to_encode,
> >    unsigned int k = get_data_chunk_count();
> >    unsigned int m = get_chunk_count() - k;
> >    bufferlist out;
> > -  int err = encode_prepare(in, &out);
> > +  int err = encode_prepare(in, *encoded);
> >    if (err)
> >      return err;
> > -  unsigned blocksize = get_chunk_size(in.length());
> > -  for (unsigned int i = 0; i < k + m; i++) {
> > -    int chunk_index = chunk_mapping.size() > 0 ? chunk_mapping[i] : i;
> > -    bufferlist &chunk = (*encoded)[chunk_index];
> > -    chunk.substr_of(out, i * blocksize, blocksize);
> > -  }
> >    encode_chunks(want_to_encode, encoded);
> >    for (unsigned int i = 0; i < k + m; i++) {
> >      if (want_to_encode.count(i) == 0) diff --git
> > a/src/erasure-code/ErasureCode.h b/src/erasure-code/ErasureCode.h
> > index 7aaea95..62aa383 100644
> > --- a/src/erasure-code/ErasureCode.h
> > +++ b/src/erasure-code/ErasureCode.h
> > @@ -46,7 +46,8 @@ namespace ceph {
> >                                              const map<int, int>
> &available,
> >                                              set<int> *minimum);
> >
> > -    int encode_prepare(const bufferlist &raw, bufferlist *prepared) const;
> > +    int encode_prepare(const bufferlist &raw,
> > +                       map<int, bufferlist> &encoded) const;
> >
> >      virtual int encode(const set<int> &want_to_encode,
> >                         const bufferlist &in,
> >
> 
> --
> Loïc Dachary, Artisan Logiciel Libre

--
To unsubscribe from this list: send the line "unsubscribe ceph-devel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 41+ messages in thread

* RE: [PATCH 2/3] ec: make use of added aligned buffers
  2014-09-15 23:56     ` Ma, Jianpeng
@ 2014-09-16  0:02       ` Sage Weil
  2014-09-16  0:08         ` Ma, Jianpeng
  0 siblings, 1 reply; 41+ messages in thread
From: Sage Weil @ 2014-09-16  0:02 UTC (permalink / raw)
  To: Ma, Jianpeng; +Cc: Loic Dachary, Janne Grunau, ceph-devel

On Mon, 15 Sep 2014, Ma, Jianpeng wrote:
> If we modify bufferlist::c_str()  to bufferlist::c_str(bool align).
> If (align)
>   Posix_memalign(data, CEPH_PAGE_SIZE, len)
> Else
>   Origin code.

Alignment isn't really a bool, it's an int.  c_str(int align=1) ?

sage

> 
> I think this is simple and correctly code.
> 
> Jianpeng
> Thanks!
> 
> > -----Original Message-----
> > From: ceph-devel-owner@vger.kernel.org
> > [mailto:ceph-devel-owner@vger.kernel.org] On Behalf Of Loic Dachary
> > Sent: Tuesday, September 16, 2014 1:20 AM
> > To: Janne Grunau; ceph-devel@vger.kernel.org
> > Subject: Re: [PATCH 2/3] ec: make use of added aligned buffers
> > 
> > Hi Janne,
> > 
> > See below:
> > 
> > On 15/09/2014 17:55, Janne Grunau wrote:
> > > Requiring page aligned buffers and realigning the input if necessary
> > > creates measurable oberhead. ceph_erasure_code_benchmark is ~30%
> > > faster with this change for technique=reed_sol_van,k=2,m=1.
> > >
> > > Also prevents a misaligned buffer when bufferlist::c_str(bufferlist)
> > > has to allocate a new buffer to provide continuous one. See bug #9408
> > >
> > > Signed-off-by: Janne Grunau <j@jannau.net>
> > > ---
> > >  src/erasure-code/ErasureCode.cc | 46
> > > +++++++++++++++++++++++++----------------
> > >  src/erasure-code/ErasureCode.h  |  3 ++-
> > >  2 files changed, 30 insertions(+), 19 deletions(-)
> > >
> > > diff --git a/src/erasure-code/ErasureCode.cc
> > > b/src/erasure-code/ErasureCode.cc index 5953f49..078f60b 100644
> > > --- a/src/erasure-code/ErasureCode.cc
> > > +++ b/src/erasure-code/ErasureCode.cc
> > > @@ -54,22 +54,38 @@ int
> > ErasureCode::minimum_to_decode_with_cost(const
> > > set<int> &want_to_read,  }
> > >
> > >  int ErasureCode::encode_prepare(const bufferlist &raw,
> > > -                                bufferlist *prepared) const
> > > +                                map<int, bufferlist> &encoded)
> > const
> > >  {
> > >    unsigned int k = get_data_chunk_count();
> > >    unsigned int m = get_chunk_count() - k;
> > >    unsigned blocksize = get_chunk_size(raw.length());
> > > -  unsigned padded_length = blocksize * k;
> > > -  *prepared = raw;
> > > -  if (padded_length - raw.length() > 0) {
> > > -    bufferptr pad(padded_length - raw.length());
> > > -    pad.zero();
> > > -    prepared->push_back(pad);
> > > +  unsigned pad_len = blocksize * k - raw.length();
> > > +
> > > +  bufferlist prepared = raw;
> > > +
> > > +  if (!prepared.is_aligned()) {
> > > +    prepared.rebuild_aligned();
> > > +  }
> > > +
> > > +  for (unsigned int i = 0; i < k - !!pad_len; i++) {
> > > +    int chunk_index = chunk_mapping.size() > 0 ? chunk_mapping[i] : i;
> > > +    bufferlist &chunk = encoded[chunk_index];
> > > +    chunk.substr_of(prepared, i * blocksize, blocksize);  }
> > 
> > It is possible for more than one chunk to be padding. It's a border case but... for
> > instance with alignment = 16, k=12 and in of length 1550 you end up with two
> > padding chunks because the blocksize is 144.
> > 
> > > +  if (pad_len > 0) {
> > > +    int chunk_index = chunk_mapping.size() > 0 ? chunk_mapping[k - 1] : k -
> > 1;
> > > +    bufferlist &chunk = encoded[chunk_index];
> > > +    bufferptr padded(buffer::create_aligned(blocksize));
> > > +    raw.copy((k - 1) * blocksize, blocksize - pad_len, padded.c_str());
> > > +    padded.zero(blocksize - pad_len, pad_len);
> > > +    chunk.push_back(padded);
> > >    }
> > > -  unsigned coding_length = blocksize * m;
> > > -  bufferptr coding(buffer::create_page_aligned(coding_length));
> > > -  prepared->push_back(coding);
> > > -  prepared->rebuild_page_aligned();
> > > +  for (unsigned int i = k; i < k + m; i++) {
> > > +    int chunk_index = chunk_mapping.size() > 0 ? chunk_mapping[i] : i;
> > > +    bufferlist &chunk = encoded[chunk_index];
> > > +    chunk.push_back(buffer::create_aligned(blocksize));
> > > +  }
> > > +
> > >    return 0;
> > >  }
> > >
> > > @@ -80,15 +96,9 @@ int ErasureCode::encode(const set<int>
> > &want_to_encode,
> > >    unsigned int k = get_data_chunk_count();
> > >    unsigned int m = get_chunk_count() - k;
> > >    bufferlist out;
> > > -  int err = encode_prepare(in, &out);
> > > +  int err = encode_prepare(in, *encoded);
> > >    if (err)
> > >      return err;
> > > -  unsigned blocksize = get_chunk_size(in.length());
> > > -  for (unsigned int i = 0; i < k + m; i++) {
> > > -    int chunk_index = chunk_mapping.size() > 0 ? chunk_mapping[i] : i;
> > > -    bufferlist &chunk = (*encoded)[chunk_index];
> > > -    chunk.substr_of(out, i * blocksize, blocksize);
> > > -  }
> > >    encode_chunks(want_to_encode, encoded);
> > >    for (unsigned int i = 0; i < k + m; i++) {
> > >      if (want_to_encode.count(i) == 0) diff --git
> > > a/src/erasure-code/ErasureCode.h b/src/erasure-code/ErasureCode.h
> > > index 7aaea95..62aa383 100644
> > > --- a/src/erasure-code/ErasureCode.h
> > > +++ b/src/erasure-code/ErasureCode.h
> > > @@ -46,7 +46,8 @@ namespace ceph {
> > >                                              const map<int, int>
> > &available,
> > >                                              set<int> *minimum);
> > >
> > > -    int encode_prepare(const bufferlist &raw, bufferlist *prepared) const;
> > > +    int encode_prepare(const bufferlist &raw,
> > > +                       map<int, bufferlist> &encoded) const;
> > >
> > >      virtual int encode(const set<int> &want_to_encode,
> > >                         const bufferlist &in,
> > >
> > 
> > --
> > Loïc Dachary, Artisan Logiciel Libre
> 
> --
> To unsubscribe from this list: send the line "unsubscribe ceph-devel" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> 
> 

^ permalink raw reply	[flat|nested] 41+ messages in thread

* RE: [PATCH 2/3] ec: make use of added aligned buffers
  2014-09-16  0:02       ` Sage Weil
@ 2014-09-16  0:08         ` Ma, Jianpeng
  2014-09-16  6:47           ` Loic Dachary
  0 siblings, 1 reply; 41+ messages in thread
From: Ma, Jianpeng @ 2014-09-16  0:08 UTC (permalink / raw)
  To: Sage Weil; +Cc: Loic Dachary, Janne Grunau, ceph-devel

> -----Original Message-----
> From: Sage Weil [mailto:sweil@redhat.com]
> Sent: Tuesday, September 16, 2014 8:02 AM
> To: Ma, Jianpeng
> Cc: Loic Dachary; Janne Grunau; ceph-devel@vger.kernel.org
> Subject: RE: [PATCH 2/3] ec: make use of added aligned buffers
> 
> On Mon, 15 Sep 2014, Ma, Jianpeng wrote:
> > If we modify bufferlist::c_str()  to bufferlist::c_str(bool align).
> > If (align)
> >   Posix_memalign(data, CEPH_PAGE_SIZE, len) Else
> >   Origin code.
> 
> Alignment isn't really a bool, it's an int.  c_str(int align=1) ?
> 
I mean: if we need aligned memory after bufferlist::c_str(), we can pass align = true.
Right now, whether c_str() returns aligned memory depends on the total size, because of
the rebuild it performs when the bufferlist has more than one ptr.

BTW, we could also change rebuild() and rebuild(ptr), merging the two functions into a
single rebuild(bool align) that decides from the align parameter whether to allocate
aligned memory or not.

Or am I missing something?

Jianpeng
> sage
> 
> >
> > I think this is simple and correctly code.
> >
> > Jianpeng
> > Thanks!
> >
> > > -----Original Message-----
> > > From: ceph-devel-owner@vger.kernel.org
> > > [mailto:ceph-devel-owner@vger.kernel.org] On Behalf Of Loic Dachary
> > > Sent: Tuesday, September 16, 2014 1:20 AM
> > > To: Janne Grunau; ceph-devel@vger.kernel.org
> > > Subject: Re: [PATCH 2/3] ec: make use of added aligned buffers
> > >
> > > Hi Janne,
> > >
> > > See below:
> > >
> > > On 15/09/2014 17:55, Janne Grunau wrote:
> > > > Requiring page aligned buffers and realigning the input if
> > > > necessary creates measurable oberhead. ceph_erasure_code_benchmark
> > > > is ~30% faster with this change for technique=reed_sol_van,k=2,m=1.
> > > >
> > > > Also prevents a misaligned buffer when
> > > > bufferlist::c_str(bufferlist) has to allocate a new buffer to
> > > > provide continuous one. See bug #9408
> > > >
> > > > Signed-off-by: Janne Grunau <j@jannau.net>
> > > > ---
> > > >  src/erasure-code/ErasureCode.cc | 46
> > > > +++++++++++++++++++++++++----------------
> > > >  src/erasure-code/ErasureCode.h  |  3 ++-
> > > >  2 files changed, 30 insertions(+), 19 deletions(-)
> > > >
> > > > diff --git a/src/erasure-code/ErasureCode.cc
> > > > b/src/erasure-code/ErasureCode.cc index 5953f49..078f60b 100644
> > > > --- a/src/erasure-code/ErasureCode.cc
> > > > +++ b/src/erasure-code/ErasureCode.cc
> > > > @@ -54,22 +54,38 @@ int
> > > ErasureCode::minimum_to_decode_with_cost(const
> > > > set<int> &want_to_read,  }
> > > >
> > > >  int ErasureCode::encode_prepare(const bufferlist &raw,
> > > > -                                bufferlist *prepared) const
> > > > +                                map<int, bufferlist> &encoded)
> > > const
> > > >  {
> > > >    unsigned int k = get_data_chunk_count();
> > > >    unsigned int m = get_chunk_count() - k;
> > > >    unsigned blocksize = get_chunk_size(raw.length());
> > > > -  unsigned padded_length = blocksize * k;
> > > > -  *prepared = raw;
> > > > -  if (padded_length - raw.length() > 0) {
> > > > -    bufferptr pad(padded_length - raw.length());
> > > > -    pad.zero();
> > > > -    prepared->push_back(pad);
> > > > +  unsigned pad_len = blocksize * k - raw.length();
> > > > +
> > > > +  bufferlist prepared = raw;
> > > > +
> > > > +  if (!prepared.is_aligned()) {
> > > > +    prepared.rebuild_aligned();
> > > > +  }
> > > > +
> > > > +  for (unsigned int i = 0; i < k - !!pad_len; i++) {
> > > > +    int chunk_index = chunk_mapping.size() > 0 ? chunk_mapping[i] : i;
> > > > +    bufferlist &chunk = encoded[chunk_index];
> > > > +    chunk.substr_of(prepared, i * blocksize, blocksize);  }
> > >
> > > It is possible for more than one chunk to be padding. It's a border
> > > case but... for instance with alignment = 16, k=12 and in of length
> > > 1550 you end up with two padding chunks because the blocksize is 144.
> > >
> > > > +  if (pad_len > 0) {
> > > > +    int chunk_index = chunk_mapping.size() > 0 ? chunk_mapping[k
> > > > + - 1] : k -
> > > 1;
> > > > +    bufferlist &chunk = encoded[chunk_index];
> > > > +    bufferptr padded(buffer::create_aligned(blocksize));
> > > > +    raw.copy((k - 1) * blocksize, blocksize - pad_len, padded.c_str());
> > > > +    padded.zero(blocksize - pad_len, pad_len);
> > > > +    chunk.push_back(padded);
> > > >    }
> > > > -  unsigned coding_length = blocksize * m;
> > > > -  bufferptr coding(buffer::create_page_aligned(coding_length));
> > > > -  prepared->push_back(coding);
> > > > -  prepared->rebuild_page_aligned();
> > > > +  for (unsigned int i = k; i < k + m; i++) {
> > > > +    int chunk_index = chunk_mapping.size() > 0 ? chunk_mapping[i] : i;
> > > > +    bufferlist &chunk = encoded[chunk_index];
> > > > +    chunk.push_back(buffer::create_aligned(blocksize));
> > > > +  }
> > > > +
> > > >    return 0;
> > > >  }
> > > >
> > > > @@ -80,15 +96,9 @@ int ErasureCode::encode(const set<int>
> > > &want_to_encode,
> > > >    unsigned int k = get_data_chunk_count();
> > > >    unsigned int m = get_chunk_count() - k;
> > > >    bufferlist out;
> > > > -  int err = encode_prepare(in, &out);
> > > > +  int err = encode_prepare(in, *encoded);
> > > >    if (err)
> > > >      return err;
> > > > -  unsigned blocksize = get_chunk_size(in.length());
> > > > -  for (unsigned int i = 0; i < k + m; i++) {
> > > > -    int chunk_index = chunk_mapping.size() > 0 ? chunk_mapping[i] : i;
> > > > -    bufferlist &chunk = (*encoded)[chunk_index];
> > > > -    chunk.substr_of(out, i * blocksize, blocksize);
> > > > -  }
> > > >    encode_chunks(want_to_encode, encoded);
> > > >    for (unsigned int i = 0; i < k + m; i++) {
> > > >      if (want_to_encode.count(i) == 0) diff --git
> > > > a/src/erasure-code/ErasureCode.h b/src/erasure-code/ErasureCode.h
> > > > index 7aaea95..62aa383 100644
> > > > --- a/src/erasure-code/ErasureCode.h
> > > > +++ b/src/erasure-code/ErasureCode.h
> > > > @@ -46,7 +46,8 @@ namespace ceph {
> > > >                                              const map<int,
> int>
> > > &available,
> > > >                                              set<int>
> *minimum);
> > > >
> > > > -    int encode_prepare(const bufferlist &raw, bufferlist *prepared)
> const;
> > > > +    int encode_prepare(const bufferlist &raw,
> > > > +                       map<int, bufferlist> &encoded) const;
> > > >
> > > >      virtual int encode(const set<int> &want_to_encode,
> > > >                         const bufferlist &in,
> > > >
> > >
> > > --
> > > Loïc Dachary, Artisan Logiciel Libre
> >
> > --
> > To unsubscribe from this list: send the line "unsubscribe ceph-devel"
> > in the body of a message to majordomo@vger.kernel.org More majordomo
> > info at  http://vger.kernel.org/majordomo-info.html
> >
> >

^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [PATCH 2/3] ec: make use of added aligned buffers
  2014-09-16  0:08         ` Ma, Jianpeng
@ 2014-09-16  6:47           ` Loic Dachary
  2014-09-16  6:59             ` Ma, Jianpeng
  0 siblings, 1 reply; 41+ messages in thread
From: Loic Dachary @ 2014-09-16  6:47 UTC (permalink / raw)
  To: Ma, Jianpeng; +Cc: ceph-devel

[-- Attachment #1: Type: text/plain, Size: 7371 bytes --]



On 16/09/2014 02:08, Ma, Jianpeng wrote:
>> -----Original Message-----
>> From: Sage Weil [mailto:sweil@redhat.com]
>> Sent: Tuesday, September 16, 2014 8:02 AM
>> To: Ma, Jianpeng
>> Cc: Loic Dachary; Janne Grunau; ceph-devel@vger.kernel.org
>> Subject: RE: [PATCH 2/3] ec: make use of added aligned buffers
>>
>> On Mon, 15 Sep 2014, Ma, Jianpeng wrote:
>>> If we modify bufferlist::c_str()  to bufferlist::c_str(bool align).
>>> If (align)
>>>   Posix_memalign(data, CEPH_PAGE_SIZE, len) Else
>>>   Origin code.
>>
>> Alignment isn't really a bool, it's an int.  c_str(int align=1) ?
>>
> I mean if we need a align memory after bufferlist::c_str. We can set the align = true; 
> Now bufferlist::c_str depend on the size when using rebuild if bufferlist have more prt.
> 
> BTW, we also can change the rebuild() && rebuild(ptr). Merge two func into one rebuild(bool align).
> By judge the parameter align to alloc align memory or not.
> 
> Or am I missing something?

I don't think there is a need for c_str(int align). We make every effort to allocate buffers that are properly aligned. If c_str() does not return an aligned buffer, the proper fix is to align the allocated buffer at the source, not to allocate a new aligned buffer and copy the content of the unaligned buffer into it.

Do you see a reason why that would be a bad way to deal with alignment ?

Cheers

> 
> Jianpeng
>> sage
>>
>>>
>>> I think this is simple and correctly code.
>>>
>>> Jianpeng
>>> Thanks!
>>>
>>>> -----Original Message-----
>>>> From: ceph-devel-owner@vger.kernel.org
>>>> [mailto:ceph-devel-owner@vger.kernel.org] On Behalf Of Loic Dachary
>>>> Sent: Tuesday, September 16, 2014 1:20 AM
>>>> To: Janne Grunau; ceph-devel@vger.kernel.org
>>>> Subject: Re: [PATCH 2/3] ec: make use of added aligned buffers
>>>>
>>>> Hi Janne,
>>>>
>>>> See below:
>>>>
>>>> On 15/09/2014 17:55, Janne Grunau wrote:
>>>>> Requiring page aligned buffers and realigning the input if
>>>>> necessary creates measurable oberhead. ceph_erasure_code_benchmark
>>>>> is ~30% faster with this change for technique=reed_sol_van,k=2,m=1.
>>>>>
>>>>> Also prevents a misaligned buffer when
>>>>> bufferlist::c_str(bufferlist) has to allocate a new buffer to
>>>>> provide continuous one. See bug #9408
>>>>>
>>>>> Signed-off-by: Janne Grunau <j@jannau.net>
>>>>> ---
>>>>>  src/erasure-code/ErasureCode.cc | 46
>>>>> +++++++++++++++++++++++++----------------
>>>>>  src/erasure-code/ErasureCode.h  |  3 ++-
>>>>>  2 files changed, 30 insertions(+), 19 deletions(-)
>>>>>
>>>>> diff --git a/src/erasure-code/ErasureCode.cc
>>>>> b/src/erasure-code/ErasureCode.cc index 5953f49..078f60b 100644
>>>>> --- a/src/erasure-code/ErasureCode.cc
>>>>> +++ b/src/erasure-code/ErasureCode.cc
>>>>> @@ -54,22 +54,38 @@ int
>>>> ErasureCode::minimum_to_decode_with_cost(const
>>>>> set<int> &want_to_read,  }
>>>>>
>>>>>  int ErasureCode::encode_prepare(const bufferlist &raw,
>>>>> -                                bufferlist *prepared) const
>>>>> +                                map<int, bufferlist> &encoded)
>>>> const
>>>>>  {
>>>>>    unsigned int k = get_data_chunk_count();
>>>>>    unsigned int m = get_chunk_count() - k;
>>>>>    unsigned blocksize = get_chunk_size(raw.length());
>>>>> -  unsigned padded_length = blocksize * k;
>>>>> -  *prepared = raw;
>>>>> -  if (padded_length - raw.length() > 0) {
>>>>> -    bufferptr pad(padded_length - raw.length());
>>>>> -    pad.zero();
>>>>> -    prepared->push_back(pad);
>>>>> +  unsigned pad_len = blocksize * k - raw.length();
>>>>> +
>>>>> +  bufferlist prepared = raw;
>>>>> +
>>>>> +  if (!prepared.is_aligned()) {
>>>>> +    prepared.rebuild_aligned();
>>>>> +  }
>>>>> +
>>>>> +  for (unsigned int i = 0; i < k - !!pad_len; i++) {
>>>>> +    int chunk_index = chunk_mapping.size() > 0 ? chunk_mapping[i] : i;
>>>>> +    bufferlist &chunk = encoded[chunk_index];
>>>>> +    chunk.substr_of(prepared, i * blocksize, blocksize);  }
>>>>
>>>> It is possible for more than one chunk to be padding. It's a border
>>>> case but... for instance with alignment = 16, k=12 and in of length
>>>> 1550 you end up with two padding chunks because the blocksize is 144.
>>>>
>>>>> +  if (pad_len > 0) {
>>>>> +    int chunk_index = chunk_mapping.size() > 0 ? chunk_mapping[k
>>>>> + - 1] : k -
>>>> 1;
>>>>> +    bufferlist &chunk = encoded[chunk_index];
>>>>> +    bufferptr padded(buffer::create_aligned(blocksize));
>>>>> +    raw.copy((k - 1) * blocksize, blocksize - pad_len, padded.c_str());
>>>>> +    padded.zero(blocksize - pad_len, pad_len);
>>>>> +    chunk.push_back(padded);
>>>>>    }
>>>>> -  unsigned coding_length = blocksize * m;
>>>>> -  bufferptr coding(buffer::create_page_aligned(coding_length));
>>>>> -  prepared->push_back(coding);
>>>>> -  prepared->rebuild_page_aligned();
>>>>> +  for (unsigned int i = k; i < k + m; i++) {
>>>>> +    int chunk_index = chunk_mapping.size() > 0 ? chunk_mapping[i] : i;
>>>>> +    bufferlist &chunk = encoded[chunk_index];
>>>>> +    chunk.push_back(buffer::create_aligned(blocksize));
>>>>> +  }
>>>>> +
>>>>>    return 0;
>>>>>  }
>>>>>
>>>>> @@ -80,15 +96,9 @@ int ErasureCode::encode(const set<int>
>>>> &want_to_encode,
>>>>>    unsigned int k = get_data_chunk_count();
>>>>>    unsigned int m = get_chunk_count() - k;
>>>>>    bufferlist out;
>>>>> -  int err = encode_prepare(in, &out);
>>>>> +  int err = encode_prepare(in, *encoded);
>>>>>    if (err)
>>>>>      return err;
>>>>> -  unsigned blocksize = get_chunk_size(in.length());
>>>>> -  for (unsigned int i = 0; i < k + m; i++) {
>>>>> -    int chunk_index = chunk_mapping.size() > 0 ? chunk_mapping[i] : i;
>>>>> -    bufferlist &chunk = (*encoded)[chunk_index];
>>>>> -    chunk.substr_of(out, i * blocksize, blocksize);
>>>>> -  }
>>>>>    encode_chunks(want_to_encode, encoded);
>>>>>    for (unsigned int i = 0; i < k + m; i++) {
>>>>>      if (want_to_encode.count(i) == 0) diff --git
>>>>> a/src/erasure-code/ErasureCode.h b/src/erasure-code/ErasureCode.h
>>>>> index 7aaea95..62aa383 100644
>>>>> --- a/src/erasure-code/ErasureCode.h
>>>>> +++ b/src/erasure-code/ErasureCode.h
>>>>> @@ -46,7 +46,8 @@ namespace ceph {
>>>>>                                              const map<int,
>> int>
>>>> &available,
>>>>>                                              set<int>
>> *minimum);
>>>>>
>>>>> -    int encode_prepare(const bufferlist &raw, bufferlist *prepared)
>> const;
>>>>> +    int encode_prepare(const bufferlist &raw,
>>>>> +                       map<int, bufferlist> &encoded) const;
>>>>>
>>>>>      virtual int encode(const set<int> &want_to_encode,
>>>>>                         const bufferlist &in,
>>>>>
>>>>
>>>> --
>>>> Loïc Dachary, Artisan Logiciel Libre
>>>
>>> --
>>> To unsubscribe from this list: send the line "unsubscribe ceph-devel"
>>> in the body of a message to majordomo@vger.kernel.org More majordomo
>>> info at  http://vger.kernel.org/majordomo-info.html
>>>
>>>
> --
> To unsubscribe from this list: send the line "unsubscribe ceph-devel" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> 

-- 
Loïc Dachary, Artisan Logiciel Libre


[-- Attachment #2: OpenPGP digital signature --]
[-- Type: application/pgp-signature, Size: 263 bytes --]

^ permalink raw reply	[flat|nested] 41+ messages in thread

* RE: [PATCH 2/3] ec: make use of added aligned buffers
  2014-09-16  6:47           ` Loic Dachary
@ 2014-09-16  6:59             ` Ma, Jianpeng
  2014-09-16  7:55               ` Loic Dachary
  0 siblings, 1 reply; 41+ messages in thread
From: Ma, Jianpeng @ 2014-09-16  6:59 UTC (permalink / raw)
  To: Loic Dachary; +Cc: ceph-devel

> -----Original Message-----
> From: Loic Dachary [mailto:loic@dachary.org]
> Sent: Tuesday, September 16, 2014 2:47 PM
> To: Ma, Jianpeng
> Cc: ceph-devel@vger.kernel.org
> Subject: Re: [PATCH 2/3] ec: make use of added aligned buffers
> 
> 
> 
> On 16/09/2014 02:08, Ma, Jianpeng wrote:
> >> -----Original Message-----
> >> From: Sage Weil [mailto:sweil@redhat.com]
> >> Sent: Tuesday, September 16, 2014 8:02 AM
> >> To: Ma, Jianpeng
> >> Cc: Loic Dachary; Janne Grunau; ceph-devel@vger.kernel.org
> >> Subject: RE: [PATCH 2/3] ec: make use of added aligned buffers
> >>
> >> On Mon, 15 Sep 2014, Ma, Jianpeng wrote:
> >>> If we modify bufferlist::c_str()  to bufferlist::c_str(bool align).
> >>> If (align)
> >>>   Posix_memalign(data, CEPH_PAGE_SIZE, len) Else
> >>>   Origin code.
> >>
> >> Alignment isn't really a bool, it's an int.  c_str(int align=1) ?
> >>
> > I mean if we need a align memory after bufferlist::c_str. We can set
> > the align = true; Now bufferlist::c_str depend on the size when using rebuild if
> bufferlist have more prt.
> >
> > BTW, we also can change the rebuild() && rebuild(ptr). Merge two func into
> one rebuild(bool align).
> > By judge the parameter align to alloc align memory or not.
> >
> > Or am I missing something?
> 
> I don't think there is a need for c_str(int align). We make every effort to allocate
> buffers that are properly aligned. If c_str() does not return an aligned buffer,
> the proper fix is to align the allocated buffer at the source, not to allocate a
> new aligned buffer and copy the content of the non aligned buffer into it.
> 

> Do you see a reason why that would be a bad way to deal with alignment ?
 We only guarantee memory alignment after c_str() and don't depend on the previous steps.
Suppose encoded[i] has more than one ptr and each of them is aligned memory.
How do we then guarantee aligned memory after c_str()?
If the bufferlist has more than one ptr, whether the rebuilt memory is aligned depends on the size of the bufferlist.
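
To illustrate the concern, a minimal standalone sketch (plain C++, not Ceph code; the 32-byte figure is the alignment discussed for AVX2 in this thread) showing that flattening segments with plain malloc() gives no 32-byte guarantee:

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

static bool is_aligned32(const void *p) {
  return ((uintptr_t)p & 31u) == 0;
}

int main() {
  const char *segs[2] = { "0123456789abcdef", "ghijklmnopqrstuv" };
  size_t total = strlen(segs[0]) + strlen(segs[1]);

  // Flattening a multi-segment list into one buffer: malloc() only promises
  // alignment suitable for standard types (often 16 bytes), not necessarily 32.
  char *flat = (char *)malloc(total);
  memcpy(flat, segs[0], strlen(segs[0]));
  memcpy(flat + strlen(segs[0]), segs[1], strlen(segs[1]));

  printf("flat=%p 32-byte aligned: %s\n", (void *)flat,
         is_aligned32(flat) ? "yes" : "no");
  free(flat);
  return 0;
}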

Jianpeng
> 
> Cheers
> 
> >
> > Jianpeng
> >> sage
> >>
> >>>
> >>> I think this is simple and correctly code.
> >>>
> >>> Jianpeng
> >>> Thanks!
> >>>
> >>>> -----Original Message-----
> >>>> From: ceph-devel-owner@vger.kernel.org
> >>>> [mailto:ceph-devel-owner@vger.kernel.org] On Behalf Of Loic Dachary
> >>>> Sent: Tuesday, September 16, 2014 1:20 AM
> >>>> To: Janne Grunau; ceph-devel@vger.kernel.org
> >>>> Subject: Re: [PATCH 2/3] ec: make use of added aligned buffers
> >>>>
> >>>> Hi Janne,
> >>>>
> >>>> See below:
> >>>>
> >>>> On 15/09/2014 17:55, Janne Grunau wrote:
> >>>>> Requiring page aligned buffers and realigning the input if
> >>>>> necessary creates measurable overhead.
> ceph_erasure_code_benchmark
> >>>>> is ~30% faster with this change for technique=reed_sol_van,k=2,m=1.
> >>>>>
> >>>>> Also prevents a misaligned buffer when
> >>>>> bufferlist::c_str(bufferlist) has to allocate a new buffer to
> >>>>> provide a contiguous one. See bug #9408
> >>>>>
> >>>>> Signed-off-by: Janne Grunau <j@jannau.net>
> >>>>> ---
> >>>>>  src/erasure-code/ErasureCode.cc | 46
> >>>>> +++++++++++++++++++++++++----------------
> >>>>>  src/erasure-code/ErasureCode.h  |  3 ++-
> >>>>>  2 files changed, 30 insertions(+), 19 deletions(-)
> >>>>>
> >>>>> diff --git a/src/erasure-code/ErasureCode.cc
> >>>>> b/src/erasure-code/ErasureCode.cc index 5953f49..078f60b 100644
> >>>>> --- a/src/erasure-code/ErasureCode.cc
> >>>>> +++ b/src/erasure-code/ErasureCode.cc
> >>>>> @@ -54,22 +54,38 @@ int
> >>>> ErasureCode::minimum_to_decode_with_cost(const
> >>>>> set<int> &want_to_read,  }
> >>>>>
> >>>>>  int ErasureCode::encode_prepare(const bufferlist &raw,
> >>>>> -                                bufferlist *prepared) const
> >>>>> +                                map<int, bufferlist> &encoded)
> >>>> const
> >>>>>  {
> >>>>>    unsigned int k = get_data_chunk_count();
> >>>>>    unsigned int m = get_chunk_count() - k;
> >>>>>    unsigned blocksize = get_chunk_size(raw.length());
> >>>>> -  unsigned padded_length = blocksize * k;
> >>>>> -  *prepared = raw;
> >>>>> -  if (padded_length - raw.length() > 0) {
> >>>>> -    bufferptr pad(padded_length - raw.length());
> >>>>> -    pad.zero();
> >>>>> -    prepared->push_back(pad);
> >>>>> +  unsigned pad_len = blocksize * k - raw.length();
> >>>>> +
> >>>>> +  bufferlist prepared = raw;
> >>>>> +
> >>>>> +  if (!prepared.is_aligned()) {
> >>>>> +    prepared.rebuild_aligned();
> >>>>> +  }
> >>>>> +
> >>>>> +  for (unsigned int i = 0; i < k - !!pad_len; i++) {
> >>>>> +    int chunk_index = chunk_mapping.size() > 0 ? chunk_mapping[i] : i;
> >>>>> +    bufferlist &chunk = encoded[chunk_index];
> >>>>> +    chunk.substr_of(prepared, i * blocksize, blocksize);  }
> >>>>
> >>>> It is possible for more than one chunk to be padding. It's a border
> >>>> case but... for instance with alignment = 16, k=12 and in of length
> >>>> 1550 you end up with two padding chunks because the blocksize is 144.
> >>>>
> >>>>> +  if (pad_len > 0) {
> >>>>> +    int chunk_index = chunk_mapping.size() > 0 ? chunk_mapping[k
> >>>>> + - 1] : k -
> >>>> 1;
> >>>>> +    bufferlist &chunk = encoded[chunk_index];
> >>>>> +    bufferptr padded(buffer::create_aligned(blocksize));
> >>>>> +    raw.copy((k - 1) * blocksize, blocksize - pad_len, padded.c_str());
> >>>>> +    padded.zero(blocksize - pad_len, pad_len);
> >>>>> +    chunk.push_back(padded);
> >>>>>    }
> >>>>> -  unsigned coding_length = blocksize * m;
> >>>>> -  bufferptr coding(buffer::create_page_aligned(coding_length));
> >>>>> -  prepared->push_back(coding);
> >>>>> -  prepared->rebuild_page_aligned();
> >>>>> +  for (unsigned int i = k; i < k + m; i++) {
> >>>>> +    int chunk_index = chunk_mapping.size() > 0 ? chunk_mapping[i] : i;
> >>>>> +    bufferlist &chunk = encoded[chunk_index];
> >>>>> +    chunk.push_back(buffer::create_aligned(blocksize));
> >>>>> +  }
> >>>>> +
> >>>>>    return 0;
> >>>>>  }
> >>>>>
> >>>>> @@ -80,15 +96,9 @@ int ErasureCode::encode(const set<int>
> >>>> &want_to_encode,
> >>>>>    unsigned int k = get_data_chunk_count();
> >>>>>    unsigned int m = get_chunk_count() - k;
> >>>>>    bufferlist out;
> >>>>> -  int err = encode_prepare(in, &out);
> >>>>> +  int err = encode_prepare(in, *encoded);
> >>>>>    if (err)
> >>>>>      return err;
> >>>>> -  unsigned blocksize = get_chunk_size(in.length());
> >>>>> -  for (unsigned int i = 0; i < k + m; i++) {
> >>>>> -    int chunk_index = chunk_mapping.size() > 0 ? chunk_mapping[i] : i;
> >>>>> -    bufferlist &chunk = (*encoded)[chunk_index];
> >>>>> -    chunk.substr_of(out, i * blocksize, blocksize);
> >>>>> -  }
> >>>>>    encode_chunks(want_to_encode, encoded);
> >>>>>    for (unsigned int i = 0; i < k + m; i++) {
> >>>>>      if (want_to_encode.count(i) == 0) diff --git
> >>>>> a/src/erasure-code/ErasureCode.h b/src/erasure-code/ErasureCode.h
> >>>>> index 7aaea95..62aa383 100644
> >>>>> --- a/src/erasure-code/ErasureCode.h
> >>>>> +++ b/src/erasure-code/ErasureCode.h
> >>>>> @@ -46,7 +46,8 @@ namespace ceph {
> >>>>>                                              const map<int,
> >> int>
> >>>> &available,
> >>>>>                                              set<int>
> >> *minimum);
> >>>>>
> >>>>> -    int encode_prepare(const bufferlist &raw, bufferlist *prepared)
> >> const;
> >>>>> +    int encode_prepare(const bufferlist &raw,
> >>>>> +                       map<int, bufferlist> &encoded) const;
> >>>>>
> >>>>>      virtual int encode(const set<int> &want_to_encode,
> >>>>>                         const bufferlist &in,
> >>>>>
> >>>>
> >>>> --
> >>>> Loïc Dachary, Artisan Logiciel Libre
> >>>
> >>> --
> >>> To unsubscribe from this list: send the line "unsubscribe ceph-devel"
> >>> in the body of a message to majordomo@vger.kernel.org More majordomo
> >>> info at  http://vger.kernel.org/majordomo-info.html
> >>>
> >>>
> > --
> > To unsubscribe from this list: send the line "unsubscribe ceph-devel"
> > in the body of a message to majordomo@vger.kernel.org More majordomo
> > info at  http://vger.kernel.org/majordomo-info.html
> >
> 
> --
> Loïc Dachary, Artisan Logiciel Libre

--
To unsubscribe from this list: send the line "unsubscribe ceph-devel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [PATCH 2/3] ec: make use of added aligned buffers
  2014-09-16  6:59             ` Ma, Jianpeng
@ 2014-09-16  7:55               ` Loic Dachary
  2014-09-16  8:23                 ` Ma, Jianpeng
  0 siblings, 1 reply; 41+ messages in thread
From: Loic Dachary @ 2014-09-16  7:55 UTC (permalink / raw)
  To: Ma, Jianpeng; +Cc: ceph-devel

[-- Attachment #1: Type: text/plain, Size: 9354 bytes --]



On 16/09/2014 08:59, Ma, Jianpeng wrote:
>> -----Original Message-----
>> From: Loic Dachary [mailto:loic@dachary.org]
>> Sent: Tuesday, September 16, 2014 2:47 PM
>> To: Ma, Jianpeng
>> Cc: ceph-devel@vger.kernel.org
>> Subject: Re: [PATCH 2/3] ec: make use of added aligned buffers
>>
>>
>>
>> On 16/09/2014 02:08, Ma, Jianpeng wrote:
>>>> -----Original Message-----
>>>> From: Sage Weil [mailto:sweil@redhat.com]
>>>> Sent: Tuesday, September 16, 2014 8:02 AM
>>>> To: Ma, Jianpeng
>>>> Cc: Loic Dachary; Janne Grunau; ceph-devel@vger.kernel.org
>>>> Subject: RE: [PATCH 2/3] ec: make use of added aligned buffers
>>>>
>>>> On Mon, 15 Sep 2014, Ma, Jianpeng wrote:
>>>>> If we modify bufferlist::c_str()  to bufferlist::c_str(bool align).
>>>>> If (align)
>>>>>   Posix_memalign(data, CEPH_PAGE_SIZE, len) Else
>>>>>   Origin code.
>>>>
>>>> Alignment isn't really a bool, it's an int.  c_str(int align=1) ?
>>>>
>>> I mean if we need a align memory after bufferlist::c_str. We can set
>>> the align = true; Now bufferlist::c_str depend on the size when using rebuild if
>> bufferlist have more prt.
>>>
>>> BTW, we also can change the rebuild() && rebuild(ptr). Merge two func into
>> one rebuild(bool align).
>>> By judge the parameter align to alloc align memory or not.
>>>
>>> Or am I missing something?
>>
>> I don't think there is a need for c_str(int align). We make every effort to allocate
>> buffers that are properly aligned. If c_str() does not return an aligned buffer,
>> the proper fix is to align the allocated buffer at the source, not to allocate a
>> new aligned buffer and copy the content of the non aligned buffer into it.
>>
> 
>> Do you see a reason why that would be a bad way to deal with alignment ?
>  We only guarantee memory align after c_str and don't depend on the previous steps.

This is a benefit indeed: less room for error in general.

> If encode[i] have more ptr suppose they all aligned memory.
> But how to guarantee aligned memory after c_str.
> If bufferlist have more ptr, the aligned memory depend on the size of bufferlist.

The ErasureCode::encode_prepare prepares the allocated buffer so that

*) it holds enough room to contain all chunks
*) c_str() on a chunk boundary will return a pointer that does not require allocating memory

The ErasureCodeJerasure::get_chunk_size function determines the size of the buffer allocated by encode_prepare and further guarantees that each chunk is:

*) size aligned on 16 bytes
*) memory aligned on 16 bytes so that SIMD instructions can use it without moving it

In other words, if c_str() had to re-allocate the buffer because it is not aligned, it would mean that the allocation of the buffer was not done as it should be.
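
As an illustration only, a small sketch (plain C++, not the Ceph API; chunk_is_usable and the alignment constant are made up here) of the per-chunk invariant those guarantees imply:

#include <stddef.h>
#include <stdint.h>

// 16 bytes matches the SSE/NEON case above; the v2 patches use 32 for AVX2.
static const size_t SIMD_ALIGN = 16;

// A chunk can be used by the SIMD code as-is when its start address is
// aligned and its length is a multiple of the alignment, so a contiguous
// pointer can be handed out without any re-allocation or copy.
static bool chunk_is_usable(const void *data, size_t len) {
  return ((uintptr_t)data % SIMD_ALIGN) == 0 && (len % SIMD_ALIGN) == 0;
}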

Cheers

> Jianpeng
>>
>> Cheers
>>
>>>
>>> Jianpeng
>>>> sage
>>>>
>>>>>
>>>>> I think this is simple and correctly code.
>>>>>
>>>>> Jianpeng
>>>>> Thanks!
>>>>>
>>>>>> -----Original Message-----
>>>>>> From: ceph-devel-owner@vger.kernel.org
>>>>>> [mailto:ceph-devel-owner@vger.kernel.org] On Behalf Of Loic Dachary
>>>>>> Sent: Tuesday, September 16, 2014 1:20 AM
>>>>>> To: Janne Grunau; ceph-devel@vger.kernel.org
>>>>>> Subject: Re: [PATCH 2/3] ec: make use of added aligned buffers
>>>>>>
>>>>>> Hi Janne,
>>>>>>
>>>>>> See below:
>>>>>>
>>>>>> On 15/09/2014 17:55, Janne Grunau wrote:
>>>>>>> Requiring page aligned buffers and realigning the input if
>>>>>>> necessary creates measurable overhead.
>> ceph_erasure_code_benchmark
>>>>>>> is ~30% faster with this change for technique=reed_sol_van,k=2,m=1.
>>>>>>>
>>>>>>> Also prevents a misaligned buffer when
>>>>>>> bufferlist::c_str(bufferlist) has to allocate a new buffer to
>>>>>>> provide a contiguous one. See bug #9408
>>>>>>>
>>>>>>> Signed-off-by: Janne Grunau <j@jannau.net>
>>>>>>> ---
>>>>>>>  src/erasure-code/ErasureCode.cc | 46
>>>>>>> +++++++++++++++++++++++++----------------
>>>>>>>  src/erasure-code/ErasureCode.h  |  3 ++-
>>>>>>>  2 files changed, 30 insertions(+), 19 deletions(-)
>>>>>>>
>>>>>>> diff --git a/src/erasure-code/ErasureCode.cc
>>>>>>> b/src/erasure-code/ErasureCode.cc index 5953f49..078f60b 100644
>>>>>>> --- a/src/erasure-code/ErasureCode.cc
>>>>>>> +++ b/src/erasure-code/ErasureCode.cc
>>>>>>> @@ -54,22 +54,38 @@ int
>>>>>> ErasureCode::minimum_to_decode_with_cost(const
>>>>>>> set<int> &want_to_read,  }
>>>>>>>
>>>>>>>  int ErasureCode::encode_prepare(const bufferlist &raw,
>>>>>>> -                                bufferlist *prepared) const
>>>>>>> +                                map<int, bufferlist> &encoded)
>>>>>> const
>>>>>>>  {
>>>>>>>    unsigned int k = get_data_chunk_count();
>>>>>>>    unsigned int m = get_chunk_count() - k;
>>>>>>>    unsigned blocksize = get_chunk_size(raw.length());
>>>>>>> -  unsigned padded_length = blocksize * k;
>>>>>>> -  *prepared = raw;
>>>>>>> -  if (padded_length - raw.length() > 0) {
>>>>>>> -    bufferptr pad(padded_length - raw.length());
>>>>>>> -    pad.zero();
>>>>>>> -    prepared->push_back(pad);
>>>>>>> +  unsigned pad_len = blocksize * k - raw.length();
>>>>>>> +
>>>>>>> +  bufferlist prepared = raw;
>>>>>>> +
>>>>>>> +  if (!prepared.is_aligned()) {
>>>>>>> +    prepared.rebuild_aligned();
>>>>>>> +  }
>>>>>>> +
>>>>>>> +  for (unsigned int i = 0; i < k - !!pad_len; i++) {
>>>>>>> +    int chunk_index = chunk_mapping.size() > 0 ? chunk_mapping[i] : i;
>>>>>>> +    bufferlist &chunk = encoded[chunk_index];
>>>>>>> +    chunk.substr_of(prepared, i * blocksize, blocksize);  }
>>>>>>
>>>>>> It is possible for more than one chunk to be padding. It's a border
>>>>>> case but... for instance with alignment = 16, k=12 and in of length
>>>>>> 1550 you end up with two padding chunks because the blocksize is 144.
>>>>>>
>>>>>>> +  if (pad_len > 0) {
>>>>>>> +    int chunk_index = chunk_mapping.size() > 0 ? chunk_mapping[k
>>>>>>> + - 1] : k -
>>>>>> 1;
>>>>>>> +    bufferlist &chunk = encoded[chunk_index];
>>>>>>> +    bufferptr padded(buffer::create_aligned(blocksize));
>>>>>>> +    raw.copy((k - 1) * blocksize, blocksize - pad_len, padded.c_str());
>>>>>>> +    padded.zero(blocksize - pad_len, pad_len);
>>>>>>> +    chunk.push_back(padded);
>>>>>>>    }
>>>>>>> -  unsigned coding_length = blocksize * m;
>>>>>>> -  bufferptr coding(buffer::create_page_aligned(coding_length));
>>>>>>> -  prepared->push_back(coding);
>>>>>>> -  prepared->rebuild_page_aligned();
>>>>>>> +  for (unsigned int i = k; i < k + m; i++) {
>>>>>>> +    int chunk_index = chunk_mapping.size() > 0 ? chunk_mapping[i] : i;
>>>>>>> +    bufferlist &chunk = encoded[chunk_index];
>>>>>>> +    chunk.push_back(buffer::create_aligned(blocksize));
>>>>>>> +  }
>>>>>>> +
>>>>>>>    return 0;
>>>>>>>  }
>>>>>>>
>>>>>>> @@ -80,15 +96,9 @@ int ErasureCode::encode(const set<int>
>>>>>> &want_to_encode,
>>>>>>>    unsigned int k = get_data_chunk_count();
>>>>>>>    unsigned int m = get_chunk_count() - k;
>>>>>>>    bufferlist out;
>>>>>>> -  int err = encode_prepare(in, &out);
>>>>>>> +  int err = encode_prepare(in, *encoded);
>>>>>>>    if (err)
>>>>>>>      return err;
>>>>>>> -  unsigned blocksize = get_chunk_size(in.length());
>>>>>>> -  for (unsigned int i = 0; i < k + m; i++) {
>>>>>>> -    int chunk_index = chunk_mapping.size() > 0 ? chunk_mapping[i] : i;
>>>>>>> -    bufferlist &chunk = (*encoded)[chunk_index];
>>>>>>> -    chunk.substr_of(out, i * blocksize, blocksize);
>>>>>>> -  }
>>>>>>>    encode_chunks(want_to_encode, encoded);
>>>>>>>    for (unsigned int i = 0; i < k + m; i++) {
>>>>>>>      if (want_to_encode.count(i) == 0) diff --git
>>>>>>> a/src/erasure-code/ErasureCode.h b/src/erasure-code/ErasureCode.h
>>>>>>> index 7aaea95..62aa383 100644
>>>>>>> --- a/src/erasure-code/ErasureCode.h
>>>>>>> +++ b/src/erasure-code/ErasureCode.h
>>>>>>> @@ -46,7 +46,8 @@ namespace ceph {
>>>>>>>                                              const map<int,
>>>> int>
>>>>>> &available,
>>>>>>>                                              set<int>
>>>> *minimum);
>>>>>>>
>>>>>>> -    int encode_prepare(const bufferlist &raw, bufferlist *prepared)
>>>> const;
>>>>>>> +    int encode_prepare(const bufferlist &raw,
>>>>>>> +                       map<int, bufferlist> &encoded) const;
>>>>>>>
>>>>>>>      virtual int encode(const set<int> &want_to_encode,
>>>>>>>                         const bufferlist &in,
>>>>>>>
>>>>>>
>>>>>> --
> >>>>>> Loïc Dachary, Artisan Logiciel Libre
>>>>>
>>>>> --
>>>>> To unsubscribe from this list: send the line "unsubscribe ceph-devel"
>>>>> in the body of a message to majordomo@vger.kernel.org More majordomo
>>>>> info at  http://vger.kernel.org/majordomo-info.html
>>>>>
>>>>>
>>> --
>>> To unsubscribe from this list: send the line "unsubscribe ceph-devel"
>>> in the body of a message to majordomo@vger.kernel.org More majordomo
>>> info at  http://vger.kernel.org/majordomo-info.html
>>>
>>
>> --
>> Loïc Dachary, Artisan Logiciel Libre
> 
> --
> To unsubscribe from this list: send the line "unsubscribe ceph-devel" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> 

-- 
Loïc Dachary, Artisan Logiciel Libre


[-- Attachment #2: OpenPGP digital signature --]
[-- Type: application/pgp-signature, Size: 263 bytes --]

^ permalink raw reply	[flat|nested] 41+ messages in thread

* RE: [PATCH 2/3] ec: make use of added aligned buffers
  2014-09-16  7:55               ` Loic Dachary
@ 2014-09-16  8:23                 ` Ma, Jianpeng
  0 siblings, 0 replies; 41+ messages in thread
From: Ma, Jianpeng @ 2014-09-16  8:23 UTC (permalink / raw)
  To: Loic Dachary; +Cc: ceph-devel

I see it. Thanks very much!

Jianpeng
> -----Original Message-----
> From: Loic Dachary [mailto:loic@dachary.org]
> Sent: Tuesday, September 16, 2014 3:55 PM
> To: Ma, Jianpeng
> Cc: ceph-devel@vger.kernel.org
> Subject: Re: [PATCH 2/3] ec: make use of added aligned buffers
> 
> 
> 
> On 16/09/2014 08:59, Ma, Jianpeng wrote:
> >> -----Original Message-----
> >> From: Loic Dachary [mailto:loic@dachary.org]
> >> Sent: Tuesday, September 16, 2014 2:47 PM
> >> To: Ma, Jianpeng
> >> Cc: ceph-devel@vger.kernel.org
> >> Subject: Re: [PATCH 2/3] ec: make use of added aligned buffers
> >>
> >>
> >>
> >> On 16/09/2014 02:08, Ma, Jianpeng wrote:
> >>>> -----Original Message-----
> >>>> From: Sage Weil [mailto:sweil@redhat.com]
> >>>> Sent: Tuesday, September 16, 2014 8:02 AM
> >>>> To: Ma, Jianpeng
> >>>> Cc: Loic Dachary; Janne Grunau; ceph-devel@vger.kernel.org
> >>>> Subject: RE: [PATCH 2/3] ec: make use of added aligned buffers
> >>>>
> >>>> On Mon, 15 Sep 2014, Ma, Jianpeng wrote:
> >>>>> If we modify bufferlist::c_str()  to bufferlist::c_str(bool align).
> >>>>> If (align)
> >>>>>   Posix_memalign(data, CEPH_PAGE_SIZE, len) Else
> >>>>>   Origin code.
> >>>>
> >>>> Alignment isn't really a bool, it's an int.  c_str(int align=1) ?
> >>>>
> >>> I mean if we need a align memory after bufferlist::c_str. We can set
> >>> the align = true; Now bufferlist::c_str depend on the size when
> >>> using rebuild if
> >> bufferlist have more prt.
> >>>
> >>> BTW, we also can change the rebuild() && rebuild(ptr). Merge two
> >>> func into
> >> one rebuild(bool align).
> >>> By judge the parameter align to alloc align memory or not.
> >>>
> >>> Or am I missing something?
> >>
> >> I don't think there is a need for c_str(int align). We make every
> >> effort to allocate buffers that are properly aligned. If c_str() does
> >> not return an aligned buffer, the proper fix is to align the
> >> allocated buffer at the source, not to allocate a new aligned buffer and copy
> the content of the non aligned buffer into it.
> >>
> >
> >> Do you see a reason why that would be a bad way to deal with alignment ?
> >  We only guarantee memory align after c_str and don't depend on the
> previous steps.
> 
> This is a benefit indeed: less room for error in general.
> 
> > If encode[i] have more ptr suppose they all aligned memory.
> > But how to guarantee aligned memory after c_str.
> > If bufferlist have more ptr, the aligned memory depend on the size of
> bufferlist.
> 
> The ErasureCode::encode_prepare prepares the allocated buffer so that
> 
> *) it holds enough room to contain all chunks
> *) c_str() on a chunk boundary will return a pointer that does not require
> allocating memory
> 
> The ErasureCodeJerasure::get_chunk_size function determines the size of the
> buffer allocated by encode_prepare and further guarantees that each chunk is:
> 
> *) size aligned on 16 bytes
> *) memory aligned on 16 bytes so that SIMD instructions can use it without
> moving it
> 
> In other words, if c_str() had to re-allocate the buffer because it is not aligned,
> it would mean that the allocation of the buffer is not done as it should.
> 

> Cheers
> 
> > Jianpeng
> >>
> >> Cheers
> >>
> >>>
> >>> Jianpeng
> >>>> sage
> >>>>
> >>>>>
> >>>>> I think this is simple and correctly code.
> >>>>>
> >>>>> Jianpeng
> >>>>> Thanks!
> >>>>>
> >>>>>> -----Original Message-----
> >>>>>> From: ceph-devel-owner@vger.kernel.org
> >>>>>> [mailto:ceph-devel-owner@vger.kernel.org] On Behalf Of Loic
> >>>>>> Dachary
> >>>>>> Sent: Tuesday, September 16, 2014 1:20 AM
> >>>>>> To: Janne Grunau; ceph-devel@vger.kernel.org
> >>>>>> Subject: Re: [PATCH 2/3] ec: make use of added aligned buffers
> >>>>>>
> >>>>>> Hi Janne,
> >>>>>>
> >>>>>> See below:
> >>>>>>
> >>>>>> On 15/09/2014 17:55, Janne Grunau wrote:
> >>>>>>> Requiring page aligned buffers and realigning the input if
> >>>>>>> necessary creates measurable overhead.
> >> ceph_erasure_code_benchmark
> >>>>>>> is ~30% faster with this change for technique=reed_sol_van,k=2,m=1.
> >>>>>>>
> >>>>>>> Also prevents a misaligned buffer when
> >>>>>>> bufferlist::c_str(bufferlist) has to allocate a new buffer to
> >>>>>>> provide a contiguous one. See bug #9408
> >>>>>>>
> >>>>>>> Signed-off-by: Janne Grunau <j@jannau.net>
> >>>>>>> ---
> >>>>>>>  src/erasure-code/ErasureCode.cc | 46
> >>>>>>> +++++++++++++++++++++++++----------------
> >>>>>>>  src/erasure-code/ErasureCode.h  |  3 ++-
> >>>>>>>  2 files changed, 30 insertions(+), 19 deletions(-)
> >>>>>>>
> >>>>>>> diff --git a/src/erasure-code/ErasureCode.cc
> >>>>>>> b/src/erasure-code/ErasureCode.cc index 5953f49..078f60b 100644
> >>>>>>> --- a/src/erasure-code/ErasureCode.cc
> >>>>>>> +++ b/src/erasure-code/ErasureCode.cc
> >>>>>>> @@ -54,22 +54,38 @@ int
> >>>>>> ErasureCode::minimum_to_decode_with_cost(const
> >>>>>>> set<int> &want_to_read,  }
> >>>>>>>
> >>>>>>>  int ErasureCode::encode_prepare(const bufferlist &raw,
> >>>>>>> -                                bufferlist *prepared) const
> >>>>>>> +                                map<int, bufferlist>
> &encoded)
> >>>>>> const
> >>>>>>>  {
> >>>>>>>    unsigned int k = get_data_chunk_count();
> >>>>>>>    unsigned int m = get_chunk_count() - k;
> >>>>>>>    unsigned blocksize = get_chunk_size(raw.length());
> >>>>>>> -  unsigned padded_length = blocksize * k;
> >>>>>>> -  *prepared = raw;
> >>>>>>> -  if (padded_length - raw.length() > 0) {
> >>>>>>> -    bufferptr pad(padded_length - raw.length());
> >>>>>>> -    pad.zero();
> >>>>>>> -    prepared->push_back(pad);
> >>>>>>> +  unsigned pad_len = blocksize * k - raw.length();
> >>>>>>> +
> >>>>>>> +  bufferlist prepared = raw;
> >>>>>>> +
> >>>>>>> +  if (!prepared.is_aligned()) {
> >>>>>>> +    prepared.rebuild_aligned();  }
> >>>>>>> +
> >>>>>>> +  for (unsigned int i = 0; i < k - !!pad_len; i++) {
> >>>>>>> +    int chunk_index = chunk_mapping.size() > 0 ? chunk_mapping[i] :
> i;
> >>>>>>> +    bufferlist &chunk = encoded[chunk_index];
> >>>>>>> +    chunk.substr_of(prepared, i * blocksize, blocksize);  }
> >>>>>>
> >>>>>> It is possible for more than one chunk to be padding. It's a
> >>>>>> border case but... for instance with alignment = 16, k=12 and in
> >>>>>> of length
> >>>>>> 1550 you end up with two padding chunks because the blocksize is 144.
> >>>>>>
> >>>>>>> +  if (pad_len > 0) {
> >>>>>>> +    int chunk_index = chunk_mapping.size() > 0 ?
> >>>>>>> + chunk_mapping[k
> >>>>>>> + - 1] : k -
> >>>>>> 1;
> >>>>>>> +    bufferlist &chunk = encoded[chunk_index];
> >>>>>>> +    bufferptr padded(buffer::create_aligned(blocksize));
> >>>>>>> +    raw.copy((k - 1) * blocksize, blocksize - pad_len, padded.c_str());
> >>>>>>> +    padded.zero(blocksize - pad_len, pad_len);
> >>>>>>> +    chunk.push_back(padded);
> >>>>>>>    }
> >>>>>>> -  unsigned coding_length = blocksize * m;
> >>>>>>> -  bufferptr coding(buffer::create_page_aligned(coding_length));
> >>>>>>> -  prepared->push_back(coding);
> >>>>>>> -  prepared->rebuild_page_aligned();
> >>>>>>> +  for (unsigned int i = k; i < k + m; i++) {
> >>>>>>> +    int chunk_index = chunk_mapping.size() > 0 ? chunk_mapping[i] :
> i;
> >>>>>>> +    bufferlist &chunk = encoded[chunk_index];
> >>>>>>> +    chunk.push_back(buffer::create_aligned(blocksize));
> >>>>>>> +  }
> >>>>>>> +
> >>>>>>>    return 0;
> >>>>>>>  }
> >>>>>>>
> >>>>>>> @@ -80,15 +96,9 @@ int ErasureCode::encode(const set<int>
> >>>>>> &want_to_encode,
> >>>>>>>    unsigned int k = get_data_chunk_count();
> >>>>>>>    unsigned int m = get_chunk_count() - k;
> >>>>>>>    bufferlist out;
> >>>>>>> -  int err = encode_prepare(in, &out);
> >>>>>>> +  int err = encode_prepare(in, *encoded);
> >>>>>>>    if (err)
> >>>>>>>      return err;
> >>>>>>> -  unsigned blocksize = get_chunk_size(in.length());
> >>>>>>> -  for (unsigned int i = 0; i < k + m; i++) {
> >>>>>>> -    int chunk_index = chunk_mapping.size() > 0 ? chunk_mapping[i] :
> i;
> >>>>>>> -    bufferlist &chunk = (*encoded)[chunk_index];
> >>>>>>> -    chunk.substr_of(out, i * blocksize, blocksize);
> >>>>>>> -  }
> >>>>>>>    encode_chunks(want_to_encode, encoded);
> >>>>>>>    for (unsigned int i = 0; i < k + m; i++) {
> >>>>>>>      if (want_to_encode.count(i) == 0) diff --git
> >>>>>>> a/src/erasure-code/ErasureCode.h
> >>>>>>> b/src/erasure-code/ErasureCode.h index 7aaea95..62aa383 100644
> >>>>>>> --- a/src/erasure-code/ErasureCode.h
> >>>>>>> +++ b/src/erasure-code/ErasureCode.h
> >>>>>>> @@ -46,7 +46,8 @@ namespace ceph {
> >>>>>>>                                              const
> map<int,
> >>>> int>
> >>>>>> &available,
> >>>>>>>                                              set<int>
> >>>> *minimum);
> >>>>>>>
> >>>>>>> -    int encode_prepare(const bufferlist &raw, bufferlist *prepared)
> >>>> const;
> >>>>>>> +    int encode_prepare(const bufferlist &raw,
> >>>>>>> +                       map<int, bufferlist> &encoded) const;
> >>>>>>>
> >>>>>>>      virtual int encode(const set<int> &want_to_encode,
> >>>>>>>                         const bufferlist &in,
> >>>>>>>
> >>>>>>
> >>>>>> --
> >>>>>> Loïc Dachary, Artisan Logiciel Libre
> >>>>>
> >>>>> --
> >>>>> To unsubscribe from this list: send the line "unsubscribe ceph-devel"
> >>>>> in the body of a message to majordomo@vger.kernel.org More
> >>>>> majordomo info at  http://vger.kernel.org/majordomo-info.html
> >>>>>
> >>>>>
> >>> --
> >>> To unsubscribe from this list: send the line "unsubscribe ceph-devel"
> >>> in the body of a message to majordomo@vger.kernel.org More majordomo
> >>> info at  http://vger.kernel.org/majordomo-info.html
> >>>
> >>
> >> --
> >> Loïc Dachary, Artisan Logiciel Libre
> >
> > --
> > To unsubscribe from this list: send the line "unsubscribe ceph-devel"
> > in the body of a message to majordomo@vger.kernel.org More majordomo
> > info at  http://vger.kernel.org/majordomo-info.html
> >
> 
> --
> Loïc Dachary, Artisan Logiciel Libre

--
To unsubscribe from this list: send the line "unsubscribe ceph-devel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 41+ messages in thread

* v2 aligned buffer changes for erasure codes
  2014-09-15 15:55 [PATCH 1/3] buffer: add an aligned buffer with less alignment than a page Janne Grunau
                   ` (2 preceding siblings ...)
  2014-09-15 16:46 ` [PATCH 1/3] buffer: add an aligned buffer with less alignment than a page Loic Dachary
@ 2014-09-18 10:33 ` Janne Grunau
  2014-09-18 10:33   ` [PATCH v2 1/3] buffer: add an aligned buffer with less alignment than a page Janne Grunau
                     ` (3 more replies)
  2014-09-29 12:34 ` [PATCH v3 0/4] buffer alignment for erasure code SIMD Janne Grunau
  4 siblings, 4 replies; 41+ messages in thread
From: Janne Grunau @ 2014-09-18 10:33 UTC (permalink / raw)
  To: ceph-devel

Hi,

following is an updated patchset. It now passes make check in src.

It has the following changes:
 * use 32-byte alignment since the isa plugin uses AVX2
   (src/erasure-code/isa/README claims it needs 16*k byte aligned buffers,
   but I can't see a reason why it would need more than 32 bytes)
 * ErasureCode::encode_prepare() handles more than one chunk with padding

cheers

Janne

^ permalink raw reply	[flat|nested] 41+ messages in thread

* [PATCH v2 1/3] buffer: add an aligned buffer with less alignment than a page
  2014-09-18 10:33 ` v2 aligned buffer changes for erasure codes Janne Grunau
@ 2014-09-18 10:33   ` Janne Grunau
  2014-09-18 10:33   ` [PATCH v2 2/3] ec: use 32-byte aligned buffers Janne Grunau
                     ` (2 subsequent siblings)
  3 siblings, 0 replies; 41+ messages in thread
From: Janne Grunau @ 2014-09-18 10:33 UTC (permalink / raw)
  To: ceph-devel

SIMD optimized erasure code computation needs aligned memory. Buffers
aligned to a page boundary are wasted on it, though. The buffers used
for the erasure code computation are typically smaller than a page.

An alignment of 32 bytes is chosen to satisfy the needs of AVX/AVX2.
This could be made arch-specific to reduce the alignment to 16 bytes for
arm/aarch64 NEON.

Signed-off-by: Janne Grunau <j@jannau.net>
---
 configure.ac         |   9 +++++
 src/common/buffer.cc | 100 +++++++++++++++++++++++++++++++++++++++++++++++++++
 src/include/buffer.h |  10 ++++++
 3 files changed, 119 insertions(+)

diff --git a/configure.ac b/configure.ac
index cccf2d9..1bb27c4 100644
--- a/configure.ac
+++ b/configure.ac
@@ -793,6 +793,15 @@ AC_MSG_RESULT([no])
 ])
 
 #
+# Check for functions to provide aligned memory
+#
+AC_CHECK_HEADERS([malloc.h])
+AC_CHECK_FUNCS([posix_memalign _aligned_malloc memalign aligned_malloc],
+               [found_memalign=yes; break])
+
+AS_IF([test "x$found_memalign" != "xyes"], [AC_MSG_WARN([No function for aligned memory allocation found])])
+
+#
 # Check for pthread spinlock (depends on ACX_PTHREAD)
 #
 saved_LIBS="$LIBS"
diff --git a/src/common/buffer.cc b/src/common/buffer.cc
index b141759..acc221f 100644
--- a/src/common/buffer.cc
+++ b/src/common/buffer.cc
@@ -30,6 +30,10 @@
 #include <sys/uio.h>
 #include <limits.h>
 
+#ifdef HAVE_MALLOC_H
+#include <malloc.h>
+#endif
+
 namespace ceph {
 
 #ifdef BUFFER_DEBUG
@@ -155,9 +159,15 @@ static simple_spinlock_t buffer_debug_lock = SIMPLE_SPINLOCK_INITIALIZER;
     virtual int zero_copy_to_fd(int fd, loff_t *offset) {
       return -ENOTSUP;
     }
+    virtual bool is_aligned() {
+      return ((long)data & ~CEPH_ALIGN_MASK) == 0;
+    }
     virtual bool is_page_aligned() {
       return ((long)data & ~CEPH_PAGE_MASK) == 0;
     }
+    bool is_n_align_sized() {
+      return (len & ~CEPH_ALIGN_MASK) == 0;
+    }
     bool is_n_page_sized() {
       return (len & ~CEPH_PAGE_MASK) == 0;
     }
@@ -209,6 +219,41 @@ static simple_spinlock_t buffer_debug_lock = SIMPLE_SPINLOCK_INITIALIZER;
     }
   };
 
+  class buffer::raw_aligned : public buffer::raw {
+  public:
+    raw_aligned(unsigned l) : raw(l) {
+      if (len) {
+#if HAVE_POSIX_MEMALIGN
+        if (posix_memalign((void **) &data, CEPH_ALIGN, len))
+          data = 0;
+#elif HAVE__ALIGNED_MALLOC
+        data = _aligned_malloc(len, CEPH_ALIGN);
+#elif HAVE_MEMALIGN
+        data = memalign(CEPH_ALIGN, len);
+#elif HAVE_ALIGNED_MALLOC
+        data = aligned_malloc((len + CEPH_ALIGN - 1) & ~CEPH_ALIGN_MASK,
+                              CEPH_ALIGN);
+#else
+        data = malloc(len);
+#endif
+        if (!data)
+          throw bad_alloc();
+      } else {
+        data = 0;
+      }
+      inc_total_alloc(len);
+      bdout << "raw_aligned " << this << " alloc " << (void *)data << " " << l << " " << buffer::get_total_alloc() << bendl;
+    }
+    ~raw_aligned() {
+      free(data);
+      dec_total_alloc(len);
+      bdout << "raw_aligned " << this << " free " << (void *)data << " " << buffer::get_total_alloc() << bendl;
+    }
+    raw* clone_empty() {
+      return new raw_aligned(len);
+    }
+  };
+
 #ifndef __CYGWIN__
   class buffer::raw_mmap_pages : public buffer::raw {
   public:
@@ -334,6 +379,10 @@ static simple_spinlock_t buffer_debug_lock = SIMPLE_SPINLOCK_INITIALIZER;
       return true;
     }
 
+    bool is_aligned() {
+      return false;
+    }
+
     bool is_page_aligned() {
       return false;
     }
@@ -520,6 +569,9 @@ static simple_spinlock_t buffer_debug_lock = SIMPLE_SPINLOCK_INITIALIZER;
   buffer::raw* buffer::create_static(unsigned len, char *buf) {
     return new raw_static(buf, len);
   }
+  buffer::raw* buffer::create_aligned(unsigned len) {
+    return new raw_aligned(len);
+  }
   buffer::raw* buffer::create_page_aligned(unsigned len) {
 #ifndef __CYGWIN__
     //return new raw_mmap_pages(len);
@@ -1013,6 +1065,16 @@ static simple_spinlock_t buffer_debug_lock = SIMPLE_SPINLOCK_INITIALIZER;
     return true;
   }
 
+  bool buffer::list::is_aligned() const
+  {
+    for (std::list<ptr>::const_iterator it = _buffers.begin();
+         it != _buffers.end();
+         ++it)
+      if (!it->is_aligned())
+        return false;
+    return true;
+  }
+
   bool buffer::list::is_page_aligned() const
   {
     for (std::list<ptr>::const_iterator it = _buffers.begin();
@@ -1101,6 +1163,44 @@ static simple_spinlock_t buffer_debug_lock = SIMPLE_SPINLOCK_INITIALIZER;
     _buffers.push_back(nb);
   }
 
+void buffer::list::rebuild_aligned()
+{
+  std::list<ptr>::iterator p = _buffers.begin();
+  while (p != _buffers.end()) {
+    // keep anything that's already align-sized and aligned
+    if (p->is_aligned() && p->is_n_align_sized()) {
+      /*cout << " segment " << (void*)p->c_str()
+             << " offset " << ((unsigned long)p->c_str() & ~CEPH_ALIGN_MASK)
+             << " length " << p->length()
+             << " " << (p->length() & ~CEPH_ALIGN_MASK) << " ok" << std::endl;
+      */
+      ++p;
+      continue;
+    }
+
+    // consolidate unaligned items, until we get something that is sized+aligned
+    list unaligned;
+    unsigned offset = 0;
+    do {
+      /*cout << " segment " << (void*)p->c_str()
+             << " offset " << ((unsigned long)p->c_str() & ~CEPH_ALIGN_MASK)
+             << " length " << p->length() << " " << (p->length() & ~CEPH_ALIGN_MASK)
+             << " overall offset " << offset << " " << (offset & ~CEPH_ALIGN_MASK)
+             << " not ok" << std::endl;
+      */
+      offset += p->length();
+      unaligned.push_back(*p);
+      _buffers.erase(p++);
+    } while (p != _buffers.end() &&
+	     (!p->is_aligned() ||
+	      !p->is_n_align_sized() ||
+	      (offset & ~CEPH_ALIGN_MASK)));
+    ptr nb(buffer::create_aligned(unaligned._len));
+    unaligned.rebuild(nb);
+    _buffers.insert(p, unaligned._buffers.front());
+  }
+}
+
 void buffer::list::rebuild_page_aligned()
 {
   std::list<ptr>::iterator p = _buffers.begin();
diff --git a/src/include/buffer.h b/src/include/buffer.h
index e5c1b50..ecf6013 100644
--- a/src/include/buffer.h
+++ b/src/include/buffer.h
@@ -56,6 +56,9 @@
 # include <assert.h>
 #endif
 
+#define CEPH_ALIGN 32
+#define CEPH_ALIGN_MASK (~(CEPH_ALIGN - 1LLU))
+
 namespace ceph {
 
 class buffer {
@@ -124,6 +127,7 @@ private:
    */
   class raw;
   class raw_malloc;
+  class raw_aligned;
   class raw_static;
   class raw_mmap_pages;
   class raw_posix_aligned;
@@ -144,6 +148,7 @@ public:
   static raw* create_malloc(unsigned len);
   static raw* claim_malloc(unsigned len, char *buf);
   static raw* create_static(unsigned len, char *buf);
+  static raw* create_aligned(unsigned len);
   static raw* create_page_aligned(unsigned len);
   static raw* create_zero_copy(unsigned len, int fd, int64_t *offset);
 
@@ -177,7 +182,9 @@ public:
     bool at_buffer_head() const { return _off == 0; }
     bool at_buffer_tail() const;
 
+    bool is_aligned() const { return ((long)c_str() & ~CEPH_ALIGN_MASK) == 0; }
     bool is_page_aligned() const { return ((long)c_str() & ~CEPH_PAGE_MASK) == 0; }
+    bool is_n_align_sized() const { return (length() & ~CEPH_ALIGN_MASK) == 0; }
     bool is_n_page_sized() const { return (length() & ~CEPH_PAGE_MASK) == 0; }
 
     // accessors
@@ -344,7 +351,9 @@ public:
     bool contents_equal(buffer::list& other);
 
     bool can_zero_copy() const;
+    bool is_aligned() const;
     bool is_page_aligned() const;
+    bool is_n_align_sized() const;
     bool is_n_page_sized() const;
 
     bool is_zero() const;
@@ -382,6 +391,7 @@ public:
     bool is_contiguous();
     void rebuild();
     void rebuild(ptr& nb);
+    void rebuild_aligned();
     void rebuild_page_aligned();
 
     // sort-of-like-assignment-op
-- 
2.1.0
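
For reference, a standalone sketch of the mask arithmetic introduced above, using the same constants as the patch; the assertions spell out what is_aligned() and is_n_align_sized() test for a 32-byte alignment:

#include <assert.h>
#include <stdint.h>

#define CEPH_ALIGN 32
#define CEPH_ALIGN_MASK (~(CEPH_ALIGN - 1LLU))

int main() {
  // ~CEPH_ALIGN_MASK == 31, so "x & ~CEPH_ALIGN_MASK" keeps the low 5 bits.
  uintptr_t aligned_addr   = 0x1000;  // multiple of 32
  uintptr_t unaligned_addr = 0x1010;  // multiple of 16 only
  assert((aligned_addr   & ~CEPH_ALIGN_MASK) == 0);  // is_aligned() would say yes
  assert((unaligned_addr & ~CEPH_ALIGN_MASK) != 0);  // is_aligned() would say no

  unsigned len = 4096;                               // multiple of 32
  assert((len & ~CEPH_ALIGN_MASK) == 0);             // is_n_align_sized() -> true
  return 0;
}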


^ permalink raw reply related	[flat|nested] 41+ messages in thread

* [PATCH v2 2/3] ec: use 32-byte aligned buffers
  2014-09-18 10:33 ` v2 aligned buffer changes for erasure codes Janne Grunau
  2014-09-18 10:33   ` [PATCH v2 1/3] buffer: add an aligned buffer with less alignment than a page Janne Grunau
@ 2014-09-18 10:33   ` Janne Grunau
  2014-09-19  9:47     ` Loic Dachary
  2014-09-18 10:33   ` [PATCH v2 3/3] ceph_erasure_code_benchmark: align the encoding input Janne Grunau
  2014-09-18 12:18   ` v2 aligned buffer changes for erasure codes Andreas Joachim Peters
  3 siblings, 1 reply; 41+ messages in thread
From: Janne Grunau @ 2014-09-18 10:33 UTC (permalink / raw)
  To: ceph-devel

Requiring page aligned buffers and realigning the input if necessary
creates measurable overhead. ceph_erasure_code_benchmark is ~30% faster
with this change for technique=reed_sol_van,k=2,m=1.

Also prevents a misaligned buffer when bufferlist::c_str(bufferlist)
has to allocate a new buffer to provide a contiguous one. See bug #9408

Signed-off-by: Janne Grunau <j@jannau.net>
---
 src/erasure-code/ErasureCode.cc | 57 ++++++++++++++++++++++++++++-------------
 src/erasure-code/ErasureCode.h  |  3 ++-
 2 files changed, 41 insertions(+), 19 deletions(-)

diff --git a/src/erasure-code/ErasureCode.cc b/src/erasure-code/ErasureCode.cc
index 5953f49..7aa5235 100644
--- a/src/erasure-code/ErasureCode.cc
+++ b/src/erasure-code/ErasureCode.cc
@@ -54,22 +54,49 @@ int ErasureCode::minimum_to_decode_with_cost(const set<int> &want_to_read,
 }
 
 int ErasureCode::encode_prepare(const bufferlist &raw,
-                                bufferlist *prepared) const
+                                map<int, bufferlist> &encoded) const
 {
   unsigned int k = get_data_chunk_count();
   unsigned int m = get_chunk_count() - k;
   unsigned blocksize = get_chunk_size(raw.length());
-  unsigned padded_length = blocksize * k;
-  *prepared = raw;
-  if (padded_length - raw.length() > 0) {
-    bufferptr pad(padded_length - raw.length());
-    pad.zero();
-    prepared->push_back(pad);
+  unsigned pad_len = blocksize * k - raw.length();
+  unsigned padded_chunks = k - raw.length() / blocksize;
+  bufferlist prepared = raw;
+
+  if (!prepared.is_aligned()) {
+    // splice padded chunks off to make the rebuild faster
+    if (padded_chunks)
+      prepared.splice((k - padded_chunks) * blocksize,
+                      padded_chunks * blocksize - pad_len);
+    prepared.rebuild_aligned();
+  }
+
+  for (unsigned int i = 0; i < k - padded_chunks; i++) {
+    int chunk_index = chunk_mapping.size() > 0 ? chunk_mapping[i] : i;
+    bufferlist &chunk = encoded[chunk_index];
+    chunk.substr_of(prepared, i * blocksize, blocksize);
+  }
+  if (padded_chunks) {
+    unsigned remainder = raw.length() - (k - padded_chunks) * blocksize;
+    bufferlist padded;
+    bufferptr buf(buffer::create_aligned(padded_chunks * blocksize));
+
+    raw.copy((k - padded_chunks) * blocksize, remainder, buf.c_str());
+    buf.zero(remainder, pad_len);
+    padded.push_back(buf);
+
+    for (unsigned int i = k - padded_chunks; i < k; i++) {
+      int chunk_index = chunk_mapping.size() > 0 ? chunk_mapping[i] : i;
+      bufferlist &chunk = encoded[chunk_index];
+      chunk.substr_of(padded, (i - (k - padded_chunks)) * blocksize, blocksize);
+    }
+  }
+  for (unsigned int i = k; i < k + m; i++) {
+    int chunk_index = chunk_mapping.size() > 0 ? chunk_mapping[i] : i;
+    bufferlist &chunk = encoded[chunk_index];
+    chunk.push_back(buffer::create_aligned(blocksize));
   }
-  unsigned coding_length = blocksize * m;
-  bufferptr coding(buffer::create_page_aligned(coding_length));
-  prepared->push_back(coding);
-  prepared->rebuild_page_aligned();
+
   return 0;
 }
 
@@ -80,15 +107,9 @@ int ErasureCode::encode(const set<int> &want_to_encode,
   unsigned int k = get_data_chunk_count();
   unsigned int m = get_chunk_count() - k;
   bufferlist out;
-  int err = encode_prepare(in, &out);
+  int err = encode_prepare(in, *encoded);
   if (err)
     return err;
-  unsigned blocksize = get_chunk_size(in.length());
-  for (unsigned int i = 0; i < k + m; i++) {
-    int chunk_index = chunk_mapping.size() > 0 ? chunk_mapping[i] : i;
-    bufferlist &chunk = (*encoded)[chunk_index];
-    chunk.substr_of(out, i * blocksize, blocksize);
-  }
   encode_chunks(want_to_encode, encoded);
   for (unsigned int i = 0; i < k + m; i++) {
     if (want_to_encode.count(i) == 0)
diff --git a/src/erasure-code/ErasureCode.h b/src/erasure-code/ErasureCode.h
index 7aaea95..62aa383 100644
--- a/src/erasure-code/ErasureCode.h
+++ b/src/erasure-code/ErasureCode.h
@@ -46,7 +46,8 @@ namespace ceph {
                                             const map<int, int> &available,
                                             set<int> *minimum);
 
-    int encode_prepare(const bufferlist &raw, bufferlist *prepared) const;
+    int encode_prepare(const bufferlist &raw,
+                       map<int, bufferlist> &encoded) const;
 
     virtual int encode(const set<int> &want_to_encode,
                        const bufferlist &in,
-- 
2.1.0
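
As a quick check of the multi-padded-chunk handling above, a standalone sketch of the border case raised in review (alignment 16, k=12, input length 1550); get_chunk_size() here is an assumed stand-in that rounds ceil(len/k) up to the alignment:

#include <stdio.h>

// Assumed stand-in for the plugin's get_chunk_size(): ceil(len / k),
// rounded up to the SIMD alignment.
static unsigned get_chunk_size(unsigned len, unsigned k, unsigned align) {
  unsigned chunk = (len + k - 1) / k;
  return ((chunk + align - 1) / align) * align;
}

int main() {
  unsigned k = 12, len = 1550, align = 16;
  unsigned blocksize = get_chunk_size(len, k, align);  // 144
  unsigned pad_len = blocksize * k - len;              // 178 bytes of padding
  unsigned padded_chunks = k - len / blocksize;        // 2 chunks carry padding
  printf("blocksize=%u pad_len=%u padded_chunks=%u\n",
         blocksize, pad_len, padded_chunks);
  return 0;
}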


^ permalink raw reply related	[flat|nested] 41+ messages in thread

* [PATCH v2 3/3] ceph_erasure_code_benchmark: align the encoding input
  2014-09-18 10:33 ` v2 aligned buffer changes for erasure codes Janne Grunau
  2014-09-18 10:33   ` [PATCH v2 1/3] buffer: add an aligned buffer with less alignment than a page Janne Grunau
  2014-09-18 10:33   ` [PATCH v2 2/3] ec: use 32-byte aligned buffers Janne Grunau
@ 2014-09-18 10:33   ` Janne Grunau
  2014-09-18 12:18   ` v2 aligned buffer changes for erasure codes Andreas Joachim Peters
  3 siblings, 0 replies; 41+ messages in thread
From: Janne Grunau @ 2014-09-18 10:33 UTC (permalink / raw)
  To: ceph-devel

The benchmark is supposed to measure the encoding speed and not the
overhead of buffer realignments.

Signed-off-by: Janne Grunau <j@jannau.net>
---
 src/test/erasure-code/ceph_erasure_code_benchmark.cc | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/test/erasure-code/ceph_erasure_code_benchmark.cc b/src/test/erasure-code/ceph_erasure_code_benchmark.cc
index c6a4228..3fedcd5 100644
--- a/src/test/erasure-code/ceph_erasure_code_benchmark.cc
+++ b/src/test/erasure-code/ceph_erasure_code_benchmark.cc
@@ -144,6 +144,7 @@ int ErasureCodeBench::encode()
 
   bufferlist in;
   in.append(string(in_size, 'X'));
+  in.rebuild_aligned();
   set<int> want_to_encode;
   for (int i = 0; i < k + m; i++) {
     want_to_encode.insert(i);
-- 
2.1.0


^ permalink raw reply related	[flat|nested] 41+ messages in thread

* RE: v2 aligned buffer changes for erasure codes
  2014-09-18 10:33 ` v2 aligned buffer changes for erasure codes Janne Grunau
                     ` (2 preceding siblings ...)
  2014-09-18 10:33   ` [PATCH v2 3/3] ceph_erasure_code_benchmark: align the encoding input Janne Grunau
@ 2014-09-18 12:18   ` Andreas Joachim Peters
  2014-09-18 12:34     ` Andreas Joachim Peters
  2014-09-18 12:40     ` Janne Grunau
  3 siblings, 2 replies; 41+ messages in thread
From: Andreas Joachim Peters @ 2014-09-18 12:18 UTC (permalink / raw)
  To: Janne Grunau, ceph-devel

Hi Janne, 
=> (src/erasure-code/isa/README claims it needs 16*k byte aligned buffers

I should update the README since it is misleading ... it should say 8*k or 16*k byte aligned chunk size depending on the compiler/platform used; it is not the alignment of the allocated buffer addresses. The get_alignment function in the plug-in is used to compute the chunk size for the encoding (as I said, not the start address alignment).

If you pass k buffers for decoding, each buffer should be aligned to at least 16 or, as you pointed out, better 32 bytes.

For encoding there is normally a single buffer split 'virtually' into k pieces. To make all pieces start at an aligned address one needs to align the chunk size to e.g. 16*k. For the best possible performance on all platforms we should change the get_alignment function in the ISA plug-in to return 32*k if there are no other objections ?!?!
 
Cheers Andreas.
________________________________________
From: ceph-devel-owner@vger.kernel.org [ceph-devel-owner@vger.kernel.org] on behalf of Janne Grunau [j@jannau.net]
Sent: 18 September 2014 12:33
To: ceph-devel@vger.kernel.org
Subject: v2 aligned buffer changes for erasure codes

Hi,

following a is an updated patchset. It passes now make check in src

It has following changes:
 * use 32-byte alignment since the isa plugin use AVX2
   (src/erasure-code/isa/README claims it needs 16*k byte aligned buffers
   but I can't see a reason why it would need more than 32-bytes
 * ErasureCode::encode_prepare() handles more than one chunk with padding

cheers

Janne
--
To unsubscribe from this list: send the line "unsubscribe ceph-devel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 41+ messages in thread

* RE: v2 aligned buffer changes for erasure codes
  2014-09-18 12:18   ` v2 aligned buffer changes for erasure codes Andreas Joachim Peters
@ 2014-09-18 12:34     ` Andreas Joachim Peters
  2014-09-18 12:53       ` Janne Grunau
  2014-09-19  9:18       ` Loic Dachary
  2014-09-18 12:40     ` Janne Grunau
  1 sibling, 2 replies; 41+ messages in thread
From: Andreas Joachim Peters @ 2014-09-18 12:34 UTC (permalink / raw)
  To: Janne Grunau, ceph-devel

Hi Janne/Loic, 
there is more confusion, at least on my side ...

I have now had a look at the jerasure plug-in and I am slightly confused why there are two ways to compute the return value of get_alignment ... one is what I assumed and the other one uses "per_chunk_alignment" ... what should the function return, Loic?

Cheers Andreas.
________________________________________
From: ceph-devel-owner@vger.kernel.org [ceph-devel-owner@vger.kernel.org] on behalf of Andreas Joachim Peters [Andreas.Joachim.Peters@cern.ch]
Sent: 18 September 2014 14:18
To: Janne Grunau; ceph-devel@vger.kernel.org
Subject: RE: v2 aligned buffer changes for erasure codes

Hi Janne,
=> (src/erasure-code/isa/README claims it needs 16*k byte aligned buffers

I should update the README since it is misleading ... it should say 8*k or 16*k byte aligned chunk size depending on the compiler/platform used, it is not the alignment of the allocated buffer addresses.The get_alignment in the plug-in function is used to compute the chunk size for the encoding (as I said not the start address alignment).

If you pass k buffers for decoding each buffer should be aligned at least to 16 or as you pointed out better 32 bytes.

For encoding there is normally a single buffer split 'virtually' into k pieces. To make all pieces starting at an aligned address one needs to align the chunk size to e.g. 16*k. For the best possible performance on all platforms we should change the get_alignment function in the ISA plug-in to return 32*k if there are no other objections ?!?!

Cheers Andreas.
________________________________________
From: ceph-devel-owner@vger.kernel.org [ceph-devel-owner@vger.kernel.org] on behalf of Janne Grunau [j@jannau.net]
Sent: 18 September 2014 12:33
To: ceph-devel@vger.kernel.org
Subject: v2 aligned buffer changes for erasure codes

Hi,

following a is an updated patchset. It passes now make check in src

It has following changes:
 * use 32-byte alignment since the isa plugin use AVX2
   (src/erasure-code/isa/README claims it needs 16*k byte aligned buffers
   but I can't see a reason why it would need more than 32-bytes
 * ErasureCode::encode_prepare() handles more than one chunk with padding

cheers

Janne
--
To unsubscribe from this list: send the line "unsubscribe ceph-devel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: v2 aligned buffer changes for erasure codes
  2014-09-18 12:18   ` v2 aligned buffer changes for erasure codes Andreas Joachim Peters
  2014-09-18 12:34     ` Andreas Joachim Peters
@ 2014-09-18 12:40     ` Janne Grunau
  2014-09-18 13:01       ` Andreas Joachim Peters
  1 sibling, 1 reply; 41+ messages in thread
From: Janne Grunau @ 2014-09-18 12:40 UTC (permalink / raw)
  To: Andreas Joachim Peters; +Cc: ceph-devel

Hi,

On 2014-09-18 12:18:59 +0000, Andreas Joachim Peters wrote:
> 
> => (src/erasure-code/isa/README claims it needs 16*k byte aligned buffers
> 
> I should update the README since it is misleading ... it should say 
> 8*k or 16*k byte aligned chunk size depending on the compiler/platform 
> used, it is not the alignment of the allocated buffer addresses.The 
> get_alignment in the plug-in function is used to compute the chunk 
> size for the encoding (as I said not the start address alignment). 

I've seen that

> If you pass k buffers for decoding each buffer should be aligned at 
> least to 16 or as you pointed out better 32 bytes. 

ok, that makes sense

> For encoding there is normally a single buffer split 'virtually' into 
> k pieces. To make all pieces starting at an aligned address one needs 
> to align the chunk size to e.g. 16*k.

I don't get that. How is the buffer split? Into k (+ m) chunk-size
parts? As long as the start and the length are both 16 (or 32) byte
aligned, all parts are properly aligned too. I don't see where the k
comes into play.
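
A tiny sketch of that argument (the numbers are assumptions): if the start address and the chunk length are both multiples of the alignment, every chunk boundary is too:

#include <assert.h>
#include <stdint.h>

int main() {
  const uintptr_t base = 0x100000;  // assumed 32-byte aligned start address
  const unsigned blocksize = 4160;  // any multiple of 32
  for (unsigned i = 0; i < 8; i++)
    assert(((base + i * blocksize) % 32) == 0);  // every chunk start stays aligned
  return 0;
}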

cheers

Janne

^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: v2 aligned buffer changes for erasure codes
  2014-09-18 12:34     ` Andreas Joachim Peters
@ 2014-09-18 12:53       ` Janne Grunau
  2014-09-19  9:18       ` Loic Dachary
  1 sibling, 0 replies; 41+ messages in thread
From: Janne Grunau @ 2014-09-18 12:53 UTC (permalink / raw)
  To: Andreas Joachim Peters; +Cc: ceph-devel

Hi,

On 2014-09-18 12:34:49 +0000, Andreas Joachim Peters wrote:
>
> there is more confusion atleast on my side ...
> 
> I had now a look at the jerasure plug-in and I am now slightly 
> confused why you have two ways to return in get_alignment ... one is 
> as I assume and another one is "per_chunk_alignment" ... what should 
> the function return Loic?

The per_chunk_alignment flag is just a bool which says that each chunk has to
start at an aligned address.

get_alignment() seems to be used to align the chunk size.

It might come from gf-complete's strange alignment requirements. Instead 
of requiring aligned buffers it requires that the src and dst buffers have 
the same remainder when divided by 16. The best way to achieve that is 
to align the length to 16 and use a single buffer.
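
A small sketch of that requirement as described here (not the actual gf-complete API; same_residue_mod16 is made up): src and dst only need the same offset modulo 16, which holds automatically when both live in one aligned buffer whose length is a multiple of 16:

#include <stdint.h>

// True when src and dst have the same remainder modulo 16, which is the
// requirement described above instead of absolute 16-byte alignment.
static bool same_residue_mod16(const void *src, const void *dst) {
  return ((uintptr_t)src % 16) == ((uintptr_t)dst % 16);
}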

I agree it's convoluted.

Janne

^ permalink raw reply	[flat|nested] 41+ messages in thread

* RE: v2 aligned buffer changes for erasure codes
  2014-09-18 12:40     ` Janne Grunau
@ 2014-09-18 13:01       ` Andreas Joachim Peters
  2014-09-18 13:23         ` Janne Grunau
  0 siblings, 1 reply; 41+ messages in thread
From: Andreas Joachim Peters @ 2014-09-18 13:01 UTC (permalink / raw)
  To: Janne Grunau; +Cc: ceph-devel

Hi Janne, 

>> For encoding there is normally a single buffer split 'virtually' into
>> k pieces. To make all pieces starting at an aligned address one needs
>> to align the chunk size to e.g. 16*k.

>I don't get that. How is the buffer splitted? into k (+ m) chunk size
>parts? As long as the start and the length are both 16 (or 32) byte
>aligned all parts are properly aligned too. I don't see where the k
>comes into play.

The original data block to encode has to be split into k equally long pieces. Each piece is given as one of the k input buffers to the erasure code algorithm, which produces m output buffers, and each piece has to have an aligned starting address and length.

If you deal with a 128-byte data input buffer for k=4 it splits like:

offset=00 len=32 as chunk1
offset=32 len=32 as chunk2
offset=64 len=32 as chunk3
offset=96 len=32 as chunk4

If the desired IO size were 196 bytes, the 32-byte alignment requirement blows this buffer up to 256 bytes:

offset=00 len=64 as chunk1
offset=64 len=64 as chunk2
offset=128 len=64 as chunk3
offset=192 len=64 as chunk4

For the typical 4 KiB block only k=2,4,8,16,32,64,128 do not increase the buffer. If someone configures e.g. k=10 the buffer is increased from 4096 to 4160 bytes, which creates ~1.5% storage volume overhead.
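
A short sketch of that arithmetic (padded_object_size is an assumed helper, not the plug-in's code), reproducing the numbers above:

#include <stdio.h>

// Pad a write of object_size bytes so that each of the k data chunks has a
// length that is a multiple of 'align' bytes.
static unsigned padded_object_size(unsigned object_size, unsigned k, unsigned align) {
  unsigned chunk = (object_size + k - 1) / k;     // ceil(object_size / k)
  chunk = ((chunk + align - 1) / align) * align;  // round the chunk size up
  return chunk * k;
}

int main() {
  printf("%u\n", padded_object_size(128, 4, 32));    // 128: no padding needed
  printf("%u\n", padded_object_size(196, 4, 32));    // 256: as in the example above
  printf("%u\n", padded_object_size(4096, 10, 32));  // 4160: ~1.5% overhead for k=10
  return 0;
}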

Cheers Andreas.




________________________________________
From: ceph-devel-owner@vger.kernel.org [ceph-devel-owner@vger.kernel.org] on behalf of Janne Grunau [j@jannau.net]
Sent: 18 September 2014 14:40
To: Andreas Joachim Peters
Cc: ceph-devel@vger.kernel.org
Subject: Re: v2 aligned buffer changes for erasure codes

Hi,

On 2014-09-18 12:18:59 +0000, Andreas Joachim Peters wrote:
>
> => (src/erasure-code/isa/README claims it needs 16*k byte aligned buffers
>
> I should update the README since it is misleading ... it should say
> 8*k or 16*k byte aligned chunk size depending on the compiler/platform
> used, it is not the alignment of the allocated buffer addresses.The
> get_alignment in the plug-in function is used to compute the chunk
> size for the encoding (as I said not the start address alignment).

I've seen that

> If you pass k buffers for decoding each buffer should be aligned at
> least to 16 or as you pointed out better 32 bytes.

ok, that makes sense

> For encoding there is normally a single buffer split 'virtually' into
> k pieces. To make all pieces starting at an aligned address one needs
> to align the chunk size to e.g. 16*k.

I don't get that. How is the buffer splitted? into k (+ m) chunk size
parts? As long as the start and the length are both 16 (or 32) byte
aligned all parts are properly aligned too. I don't see where the k
comes into play.

cheers

Janne
--
To unsubscribe from this list: send the line "unsubscribe ceph-devel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: v2 aligned buffer changes for erasure codes
  2014-09-18 13:01       ` Andreas Joachim Peters
@ 2014-09-18 13:23         ` Janne Grunau
  2014-09-18 14:47           ` Andreas Joachim Peters
  0 siblings, 1 reply; 41+ messages in thread
From: Janne Grunau @ 2014-09-18 13:23 UTC (permalink / raw)
  To: Andreas Joachim Peters; +Cc: ceph-devel

On 2014-09-18 13:01:03 +0000, Andreas Joachim Peters wrote:
> 
> >> For encoding there is normally a single buffer split 'virtually' 
> >> into
> >> k pieces. To make all pieces starting at an aligned address one 
> >> needs
> >> to align the chunk size to e.g. 16*k.
> 
> >I don't get that. How is the buffer splitted? into k (+ m) chunk size
> >parts? As long as the start and the length are both 16 (or 32) byte
> >aligned all parts are properly aligned too. I don't see where the k
> >comes into play.
> 
> The original data block to encode has to be split into k equally long 
> pieces. Each piece is given as one of the k input buffers to the 
> erasure code algorithm producing m output buffers and each piece has 
> to have an aligned starting address and length.
> 
> If you deal with 128 byte data input buffers for k=4 it splits like
> 
> offset=00 len=32 as chunk1
> offset=32 len=32 as chunk2
> offset=64 len=32 as chunk3
> offset=96 len=32 as chunk4
> 
> If the desired IO size would be 196 bytes the 32 byte alignment 
> requirement blows this buffer up to 256 bytes:
> 
> offset=00 len=64 as chunk1
> offset=64 len=64 as chunk2
> offset=128 len=64 as chunk3
> offset=196 len=64 as chunk4

I fail to see how the 32 * k is related to alignment. It's only used 
to pad the total size so it becomes a multiple of k * 32. That is ok 
since we want k 32-byte aligned chunks. The alignment for each chunk is 
just 32 bytes.

Janne

^ permalink raw reply	[flat|nested] 41+ messages in thread

* RE: v2 aligned buffer changes for erasure codes
  2014-09-18 13:23         ` Janne Grunau
@ 2014-09-18 14:47           ` Andreas Joachim Peters
  0 siblings, 0 replies; 41+ messages in thread
From: Andreas Joachim Peters @ 2014-09-18 14:47 UTC (permalink / raw)
  To: Janne Grunau; +Cc: ceph-devel

> I fail to see how the 32 * k is related to alignment. It's only used
> to pad the total size so it becomes a multiple of k * 32. That is ok
> since we want k 32-byte aligned chunks. The alignment for each chunk is
> just 32 bytes.

Yes, agreed! The alignment for each chunk should be 32 bytes. 

And the implementation is most efficient if the given encoding buffer is already padded to k*32 bytes, since that avoids an additional buffer allocation and copy.

Cheers Andreas.

^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: v2 aligned buffer changes for erasure codes
  2014-09-18 12:34     ` Andreas Joachim Peters
  2014-09-18 12:53       ` Janne Grunau
@ 2014-09-19  9:18       ` Loic Dachary
  1 sibling, 0 replies; 41+ messages in thread
From: Loic Dachary @ 2014-09-19  9:18 UTC (permalink / raw)
  To: Andreas Joachim Peters, Janne Grunau, ceph-devel

[-- Attachment #1: Type: text/plain, Size: 3383 bytes --]

Hi Andreas,

The per_chunk_alignment option addresses a backward compatible change in the way chunk sizes are calculated. The problem was that the initial calculation led to oversized chunks. The long explanation is at https://github.com/ceph/ceph/commit/c7daaaf5e63d0bd1d444385e62611fe276f6ce29

Please let me know if you see something wrong :-)

Cheers

On 18/09/2014 14:34, Andreas Joachim Peters wrote:
> Hi Janne/Loic, 
> there is more confusion, at least on my side ...
> 
> I have now had a look at the jerasure plug-in and I am slightly confused why there are two ways to compute the return value of get_alignment ... one is as I assumed and another one is "per_chunk_alignment" ... what should the function return, Loic?
> 
> Cheers Andreas.
> ________________________________________
> From: ceph-devel-owner@vger.kernel.org [ceph-devel-owner@vger.kernel.org] on behalf of Andreas Joachim Peters [Andreas.Joachim.Peters@cern.ch]
> Sent: 18 September 2014 14:18
> To: Janne Grunau; ceph-devel@vger.kernel.org
> Subject: RE: v2 aligned buffer changes for erasure codes
> 
> Hi Janne,
> => (src/erasure-code/isa/README claims it needs 16*k byte aligned buffers
> 
> I should update the README since it is misleading ... it should say 8*k or 16*k byte aligned chunk size depending on the compiler/platform used; it is not the alignment of the allocated buffer addresses. The get_alignment function in the plug-in is used to compute the chunk size for the encoding (as I said, not the start address alignment).
> 
> If you pass k buffers for decoding, each buffer should be aligned at least to 16 or, as you pointed out, better 32 bytes.
> 
> For encoding there is normally a single buffer split 'virtually' into k pieces. To make all pieces start at an aligned address one needs to align the chunk size to e.g. 16*k. For the best possible performance on all platforms we should change the get_alignment function in the ISA plug-in to return 32*k if there are no other objections ?!?!
> 
> Cheers Andreas.
> ________________________________________
> From: ceph-devel-owner@vger.kernel.org [ceph-devel-owner@vger.kernel.org] on behalf of Janne Grunau [j@jannau.net]
> Sent: 18 September 2014 12:33
> To: ceph-devel@vger.kernel.org
> Subject: v2 aligned buffer changes for erasure codes
> 
> Hi,
> 
> following is an updated patchset. It now passes make check in src
> 
> It has the following changes:
>  * use 32-byte alignment since the isa plugin uses AVX2
>    (src/erasure-code/isa/README claims it needs 16*k byte aligned buffers
>    but I can't see a reason why it would need more than 32 bytes)
>  * ErasureCode::encode_prepare() handles more than one chunk with padding
> 
> cheers
> 
> Janne
> --
> To unsubscribe from this list: send the line "unsubscribe ceph-devel" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> --
> To unsubscribe from this list: send the line "unsubscribe ceph-devel" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> --
> To unsubscribe from this list: send the line "unsubscribe ceph-devel" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> 

-- 
Loïc Dachary, Artisan Logiciel Libre


[-- Attachment #2: OpenPGP digital signature --]
[-- Type: application/pgp-signature, Size: 263 bytes --]

^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [PATCH v2 2/3] ec: use 32-byte aligned buffers
  2014-09-18 10:33   ` [PATCH v2 2/3] ec: use 32-byte aligned buffers Janne Grunau
@ 2014-09-19  9:47     ` Loic Dachary
  0 siblings, 0 replies; 41+ messages in thread
From: Loic Dachary @ 2014-09-19  9:47 UTC (permalink / raw)
  To: Janne Grunau, ceph-devel

[-- Attachment #1: Type: text/plain, Size: 5057 bytes --]

Hi Janne,

This looks good ! The 32 byte aligned buffer applies to the diff related to buffer.h though, could you update the title ? I tend to prefer erasure-code over ec : it is easier to grep / search ;-)

Cheers

On 18/09/2014 12:33, Janne Grunau wrote:
> Requiring page aligned buffers and realigning the input if necessary
> creates measurable overhead. ceph_erasure_code_benchmark is ~30% faster
> with this change for technique=reed_sol_van,k=2,m=1.
> 
> Also prevents a misaligned buffer when bufferlist::c_str(bufferlist)
> has to allocate a new buffer to provide a contiguous one. See bug #9408.
> 
> Signed-off-by: Janne Grunau <j@jannau.net>
> ---
>  src/erasure-code/ErasureCode.cc | 57 ++++++++++++++++++++++++++++-------------
>  src/erasure-code/ErasureCode.h  |  3 ++-
>  2 files changed, 41 insertions(+), 19 deletions(-)
> 
> diff --git a/src/erasure-code/ErasureCode.cc b/src/erasure-code/ErasureCode.cc
> index 5953f49..7aa5235 100644
> --- a/src/erasure-code/ErasureCode.cc
> +++ b/src/erasure-code/ErasureCode.cc
> @@ -54,22 +54,49 @@ int ErasureCode::minimum_to_decode_with_cost(const set<int> &want_to_read,
>  }
>  
>  int ErasureCode::encode_prepare(const bufferlist &raw,
> -                                bufferlist *prepared) const
> +                                map<int, bufferlist> &encoded) const
>  {
>    unsigned int k = get_data_chunk_count();
>    unsigned int m = get_chunk_count() - k;
>    unsigned blocksize = get_chunk_size(raw.length());
> -  unsigned padded_length = blocksize * k;
> -  *prepared = raw;
> -  if (padded_length - raw.length() > 0) {
> -    bufferptr pad(padded_length - raw.length());
> -    pad.zero();
> -    prepared->push_back(pad);
> +  unsigned pad_len = blocksize * k - raw.length();
> +  unsigned padded_chunks = k - raw.length() / blocksize;
> +  bufferlist prepared = raw;
> +
> +  if (!prepared.is_aligned()) {
> +    // splice padded chunks off to make the rebuild faster
> +    if (padded_chunks)
> +      prepared.splice((k - padded_chunks) * blocksize,
> +                      padded_chunks * blocksize - pad_len);
> +    prepared.rebuild_aligned();
> +  }
> +
> +  for (unsigned int i = 0; i < k - padded_chunks; i++) {
> +    int chunk_index = chunk_mapping.size() > 0 ? chunk_mapping[i] : i;
> +    bufferlist &chunk = encoded[chunk_index];
> +    chunk.substr_of(prepared, i * blocksize, blocksize);
> +  }
> +  if (padded_chunks) {
> +    unsigned remainder = raw.length() - (k - padded_chunks) * blocksize;
> +    bufferlist padded;
> +    bufferptr buf(buffer::create_aligned(padded_chunks * blocksize));
> +
> +    raw.copy((k - padded_chunks) * blocksize, remainder, buf.c_str());
> +    buf.zero(remainder, pad_len);
> +    padded.push_back(buf);
> +
> +    for (unsigned int i = k - padded_chunks; i < k; i++) {
> +      int chunk_index = chunk_mapping.size() > 0 ? chunk_mapping[i] : i;
> +      bufferlist &chunk = encoded[chunk_index];
> +      chunk.substr_of(padded, (i - (k - padded_chunks)) * blocksize, blocksize);
> +    }
> +  }
> +  for (unsigned int i = k; i < k + m; i++) {
> +    int chunk_index = chunk_mapping.size() > 0 ? chunk_mapping[i] : i;
> +    bufferlist &chunk = encoded[chunk_index];
> +    chunk.push_back(buffer::create_aligned(blocksize));
>    }
> -  unsigned coding_length = blocksize * m;
> -  bufferptr coding(buffer::create_page_aligned(coding_length));
> -  prepared->push_back(coding);
> -  prepared->rebuild_page_aligned();
> +
>    return 0;
>  }
>  
> @@ -80,15 +107,9 @@ int ErasureCode::encode(const set<int> &want_to_encode,
>    unsigned int k = get_data_chunk_count();
>    unsigned int m = get_chunk_count() - k;
>    bufferlist out;
> -  int err = encode_prepare(in, &out);
> +  int err = encode_prepare(in, *encoded);
>    if (err)
>      return err;
> -  unsigned blocksize = get_chunk_size(in.length());
> -  for (unsigned int i = 0; i < k + m; i++) {
> -    int chunk_index = chunk_mapping.size() > 0 ? chunk_mapping[i] : i;
> -    bufferlist &chunk = (*encoded)[chunk_index];
> -    chunk.substr_of(out, i * blocksize, blocksize);
> -  }
>    encode_chunks(want_to_encode, encoded);
>    for (unsigned int i = 0; i < k + m; i++) {
>      if (want_to_encode.count(i) == 0)
> diff --git a/src/erasure-code/ErasureCode.h b/src/erasure-code/ErasureCode.h
> index 7aaea95..62aa383 100644
> --- a/src/erasure-code/ErasureCode.h
> +++ b/src/erasure-code/ErasureCode.h
> @@ -46,7 +46,8 @@ namespace ceph {
>                                              const map<int, int> &available,
>                                              set<int> *minimum);
>  
> -    int encode_prepare(const bufferlist &raw, bufferlist *prepared) const;
> +    int encode_prepare(const bufferlist &raw,
> +                       map<int, bufferlist> &encoded) const;
>  
>      virtual int encode(const set<int> &want_to_encode,
>                         const bufferlist &in,
> 

-- 
Loïc Dachary, Artisan Logiciel Libre


[-- Attachment #2: OpenPGP digital signature --]
[-- Type: application/pgp-signature, Size: 263 bytes --]

^ permalink raw reply	[flat|nested] 41+ messages in thread

* [PATCH v3 0/4] buffer alignment for erasure code SIMD
  2014-09-15 15:55 [PATCH 1/3] buffer: add an aligned buffer with less alignment than a page Janne Grunau
                   ` (3 preceding siblings ...)
  2014-09-18 10:33 ` v2 aligned buffer changes for erasure codes Janne Grunau
@ 2014-09-29 12:34 ` Janne Grunau
  2014-09-29 12:34   ` [PATCH v3 1/4] buffer: add an aligned buffer with less alignment than a page Janne Grunau
                     ` (5 more replies)
  4 siblings, 6 replies; 41+ messages in thread
From: Janne Grunau @ 2014-09-29 12:34 UTC (permalink / raw)
  To: ceph-devel

Hi,

reworked patchset to address the comments in https://github.com/ceph/ceph/pull/2558

variable alignment instead of the hardcoded 32-byte alignment
fixed copy and paste error in a comment
change buffer alignment for decoding too (much simpler than the encoding
changes)

I'll do a github pull request once the last make check run finishes
locally.

Also available from http://git.jannau.net/ceph.git/log/?h=buffer_align

regards

Janne


^ permalink raw reply	[flat|nested] 41+ messages in thread

* [PATCH v3 1/4] buffer: add an aligned buffer with less alignment than a page
  2014-09-29 12:34 ` [PATCH v3 0/4] buffer alignment for erasure code SIMD Janne Grunau
@ 2014-09-29 12:34   ` Janne Grunau
  2014-09-29 13:12     ` Loic Dachary
  2014-09-29 13:27     ` Loic Dachary
  2014-09-29 12:34   ` [PATCH v3 2/4] erasure code: use a function for the chunk mapping index Janne Grunau
                     ` (4 subsequent siblings)
  5 siblings, 2 replies; 41+ messages in thread
From: Janne Grunau @ 2014-09-29 12:34 UTC (permalink / raw)
  To: ceph-devel

SIMD optimized erasure code computation needs aligned memory. Buffers
aligned to a page boundary are not needed though. The buffers used
for the erasure code computation are typically smaller than a page.

The typical alignment requirements for SIMD operations are 16 bytes for
SSE2 and NEON and 32 bytes for AVX/AVX2.

Signed-off-by: Janne Grunau <j@jannau.net>
---
 configure.ac         |   9 +++++
 src/common/buffer.cc | 106 +++++++++++++++++++++++++++++++++++++++++++++++++++
 src/include/buffer.h |  15 ++++++++
 3 files changed, 130 insertions(+)

diff --git a/configure.ac b/configure.ac
index cccf2d9..1bb27c4 100644
--- a/configure.ac
+++ b/configure.ac
@@ -793,6 +793,15 @@ AC_MSG_RESULT([no])
 ])
 
 #
+# Check for functions to provide aligned memory
+#
+AC_CHECK_HEADERS([malloc.h])
+AC_CHECK_FUNCS([posix_memalign _aligned_malloc memalign aligned_malloc],
+               [found_memalign=yes; break])
+
+AS_IF([test "x$found_memalign" != "xyes"], [AC_MSG_WARN([No function for aligned memory allocation found])])
+
+#
 # Check for pthread spinlock (depends on ACX_PTHREAD)
 #
 saved_LIBS="$LIBS"
diff --git a/src/common/buffer.cc b/src/common/buffer.cc
index af298ac..dfe887e 100644
--- a/src/common/buffer.cc
+++ b/src/common/buffer.cc
@@ -30,6 +30,10 @@
 #include <sys/uio.h>
 #include <limits.h>
 
+#ifdef HAVE_MALLOC_H
+#include <malloc.h>
+#endif
+
 namespace ceph {
 
 #ifdef BUFFER_DEBUG
@@ -155,9 +159,17 @@ static simple_spinlock_t buffer_debug_lock = SIMPLE_SPINLOCK_INITIALIZER;
     virtual int zero_copy_to_fd(int fd, loff_t *offset) {
       return -ENOTSUP;
     }
+    virtual bool is_aligned(unsigned align) {
+      assert((align >= sizeof(void *)) && (align & (align - 1)) == 0);
+      return ((intptr_t)data & (align - 1)) == 0;
+    }
     virtual bool is_page_aligned() {
       return ((long)data & ~CEPH_PAGE_MASK) == 0;
     }
+    bool is_n_align_sized(unsigned align) {
+      assert((align >= sizeof(void *)) && (align & (align - 1)) == 0);
+      return (len % align) == 0;
+    }
     bool is_n_page_sized() {
       return (len & ~CEPH_PAGE_MASK) == 0;
     }
@@ -209,6 +221,43 @@ static simple_spinlock_t buffer_debug_lock = SIMPLE_SPINLOCK_INITIALIZER;
     }
   };
 
+  class buffer::raw_aligned : public buffer::raw {
+    unsigned _align;
+  public:
+    raw_aligned(unsigned l, unsigned align) : raw(l) {
+      _align = align;
+      assert((align >= sizeof(void *)) && (align & (align - 1)) == 0);
+      if (len) {
+#if HAVE_POSIX_MEMALIGN
+        if (posix_memalign((void **) &data, align, len))
+          data = 0;
+#elif HAVE__ALIGNED_MALLOC
+        data = _aligned_malloc(len, align);
+#elif HAVE_MEMALIGN
+        data = memalign(align, len);
+#elif HAVE_ALIGNED_MALLOC
+        data = aligned_malloc((len + align - 1) & ~(align - 1), align);
+#else
+        data = malloc(len);
+#endif
+        if (!data)
+          throw bad_alloc();
+      } else {
+        data = 0;
+      }
+      inc_total_alloc(len);
+      bdout << "raw_aligned " << this << " alloc " << (void *)data << " " << l << " " << buffer::get_total_alloc() << bendl;
+    }
+    ~raw_aligned() {
+      free(data);
+      dec_total_alloc(len);
+      bdout << "raw_aligned " << this << " free " << (void *)data << " " << buffer::get_total_alloc() << bendl;
+    }
+    raw* clone_empty() {
+      return new raw_aligned(len, _align);
+    }
+  };
+
 #ifndef __CYGWIN__
   class buffer::raw_mmap_pages : public buffer::raw {
   public:
@@ -334,6 +383,10 @@ static simple_spinlock_t buffer_debug_lock = SIMPLE_SPINLOCK_INITIALIZER;
       return true;
     }
 
+    bool is_aligned(unsigned) {
+      return false;
+    }
+
     bool is_page_aligned() {
       return false;
     }
@@ -520,6 +573,9 @@ static simple_spinlock_t buffer_debug_lock = SIMPLE_SPINLOCK_INITIALIZER;
   buffer::raw* buffer::create_static(unsigned len, char *buf) {
     return new raw_static(buf, len);
   }
+  buffer::raw* buffer::create_aligned(unsigned len, unsigned align) {
+    return new raw_aligned(len, align);
+  }
   buffer::raw* buffer::create_page_aligned(unsigned len) {
 #ifndef __CYGWIN__
     //return new raw_mmap_pages(len);
@@ -1013,6 +1069,17 @@ static simple_spinlock_t buffer_debug_lock = SIMPLE_SPINLOCK_INITIALIZER;
     return true;
   }
 
+  bool buffer::list::is_aligned(unsigned align) const
+  {
+    assert((align >= sizeof(void *)) && (align & (align - 1)) == 0);
+    for (std::list<ptr>::const_iterator it = _buffers.begin();
+         it != _buffers.end();
+         ++it)
+      if (!it->is_aligned(align))
+        return false;
+    return true;
+  }
+
   bool buffer::list::is_page_aligned() const
   {
     for (std::list<ptr>::const_iterator it = _buffers.begin();
@@ -1101,6 +1168,45 @@ static simple_spinlock_t buffer_debug_lock = SIMPLE_SPINLOCK_INITIALIZER;
     _buffers.push_back(nb);
   }
 
+void buffer::list::rebuild_aligned(unsigned align)
+{
+  std::list<ptr>::iterator p = _buffers.begin();
+  assert((align >= sizeof(void *)) && (align & (align - 1)) == 0);
+  while (p != _buffers.end()) {
+    // keep anything that's already align sized+aligned
+    if (p->is_aligned(align) && p->is_n_align_sized(align)) {
+      /*cout << " segment " << (void*)p->c_str()
+             << " offset " << ((unsigned long)p->c_str() & (align - 1))
+             << " length " << p->length()
+             << " " << (p->length() & (align - 1)) << " ok" << std::endl;
+      */
+      ++p;
+      continue;
+    }
+
+    // consolidate unaligned items, until we get something that is sized+aligned
+    list unaligned;
+    unsigned offset = 0;
+    do {
+      /*cout << " segment " << (void*)p->c_str()
+             << " offset " << ((unsigned long)p->c_str() & (align - 1))
+             << " length " << p->length() << " " << (p->length() & (align - 1))
+             << " overall offset " << offset << " " << (offset & (align - 1))
+             << " not ok" << std::endl;
+      */
+      offset += p->length();
+      unaligned.push_back(*p);
+      _buffers.erase(p++);
+    } while (p != _buffers.end() &&
+	     (!p->is_aligned(align) ||
+	      !p->is_n_align_sized(align) ||
+	      (offset % align)));
+    ptr nb(buffer::create_aligned(unaligned._len, align));
+    unaligned.rebuild(nb);
+    _buffers.insert(p, unaligned._buffers.front());
+  }
+}
+
 void buffer::list::rebuild_page_aligned()
 {
   std::list<ptr>::iterator p = _buffers.begin();
diff --git a/src/include/buffer.h b/src/include/buffer.h
index e5c1b50..2a32cf4 100644
--- a/src/include/buffer.h
+++ b/src/include/buffer.h
@@ -124,6 +124,7 @@ private:
    */
   class raw;
   class raw_malloc;
+  class raw_aligned;
   class raw_static;
   class raw_mmap_pages;
   class raw_posix_aligned;
@@ -144,6 +145,7 @@ public:
   static raw* create_malloc(unsigned len);
   static raw* claim_malloc(unsigned len, char *buf);
   static raw* create_static(unsigned len, char *buf);
+  static raw* create_aligned(unsigned len, unsigned align);
   static raw* create_page_aligned(unsigned len);
   static raw* create_zero_copy(unsigned len, int fd, int64_t *offset);
 
@@ -177,7 +179,17 @@ public:
     bool at_buffer_head() const { return _off == 0; }
     bool at_buffer_tail() const;
 
+    bool is_aligned(unsigned align) const
+    {
+      assert((align >= sizeof(void *)) && (align & (align - 1)) == 0);
+      return ((intptr_t)c_str() & (align - 1)) == 0;
+    }
     bool is_page_aligned() const { return ((long)c_str() & ~CEPH_PAGE_MASK) == 0; }
+    bool is_n_align_sized(unsigned align) const
+    {
+      assert((align >= sizeof(void *)) && (align & (align - 1)) == 0);
+      return (length() % align) == 0;
+    }
     bool is_n_page_sized() const { return (length() & ~CEPH_PAGE_MASK) == 0; }
 
     // accessors
@@ -344,7 +356,9 @@ public:
     bool contents_equal(buffer::list& other);
 
     bool can_zero_copy() const;
+    bool is_aligned(unsigned align) const;
     bool is_page_aligned() const;
+    bool is_n_align_sized(unsigned align) const;
     bool is_n_page_sized() const;
 
     bool is_zero() const;
@@ -382,6 +396,7 @@ public:
     bool is_contiguous();
     void rebuild();
     void rebuild(ptr& nb);
+    void rebuild_aligned(unsigned align);
     void rebuild_page_aligned();
 
     // sort-of-like-assignment-op
-- 
2.1.1
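
A rough usage sketch of the interfaces the patch above adds (untested; names
as in the diff, and it assumes a tree with the patch applied):

  #include <cstring>
  #include "include/buffer.h"

  void example()
  {
    // allocate a 32-byte aligned raw buffer directly
    ceph::bufferptr ptr(ceph::buffer::create_aligned(4096, 32));

    // or take an arbitrary bufferlist and consolidate it so that every
    // segment starts on a 32-byte boundary
    ceph::bufferlist bl;
    const char *payload = "data that may span several unaligned segments";
    bl.append(payload, strlen(payload));
    bl.rebuild_aligned(32);
    // bl.is_aligned(32) should now return true
  }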


^ permalink raw reply related	[flat|nested] 41+ messages in thread

* [PATCH v3 2/4] erasure code: use a function for the chunk mapping index
  2014-09-29 12:34 ` [PATCH v3 0/4] buffer alignment for erasure code SIMD Janne Grunau
  2014-09-29 12:34   ` [PATCH v3 1/4] buffer: add an aligned buffer with less alignment than a page Janne Grunau
@ 2014-09-29 12:34   ` Janne Grunau
  2014-09-29 12:34   ` [PATCH v3 3/4] erasure code: use 32-byte aligned buffers Janne Grunau
                     ` (3 subsequent siblings)
  5 siblings, 0 replies; 41+ messages in thread
From: Janne Grunau @ 2014-09-29 12:34 UTC (permalink / raw)
  To: ceph-devel

---
 src/erasure-code/ErasureCode.cc | 14 ++++++++------
 src/erasure-code/ErasureCode.h  |  3 +++
 2 files changed, 11 insertions(+), 6 deletions(-)

diff --git a/src/erasure-code/ErasureCode.cc b/src/erasure-code/ErasureCode.cc
index 5953f49..8b6c57f 100644
--- a/src/erasure-code/ErasureCode.cc
+++ b/src/erasure-code/ErasureCode.cc
@@ -22,6 +22,11 @@
 #include "common/strtol.h"
 #include "ErasureCode.h"
 
+int ErasureCode::chunk_index(unsigned int i) const
+{
+  return chunk_mapping.size() > i ? chunk_mapping[i] : i;
+}
+
 int ErasureCode::minimum_to_decode(const set<int> &want_to_read,
                                    const set<int> &available_chunks,
                                    set<int> *minimum)
@@ -85,8 +90,7 @@ int ErasureCode::encode(const set<int> &want_to_encode,
     return err;
   unsigned blocksize = get_chunk_size(in.length());
   for (unsigned int i = 0; i < k + m; i++) {
-    int chunk_index = chunk_mapping.size() > 0 ? chunk_mapping[i] : i;
-    bufferlist &chunk = (*encoded)[chunk_index];
+    bufferlist &chunk = (*encoded)[chunk_index(i)];
     chunk.substr_of(out, i * blocksize, blocksize);
   }
   encode_chunks(want_to_encode, encoded);
@@ -223,15 +227,13 @@ int ErasureCode::decode_concat(const map<int, bufferlist> &chunks,
   set<int> want_to_read;
 
   for (unsigned int i = 0; i < get_data_chunk_count(); i++) {
-    int chunk = chunk_mapping.size() > i ? chunk_mapping[i] : i;
-    want_to_read.insert(chunk);
+    want_to_read.insert(chunk_index(i));
   }
   map<int, bufferlist> decoded_map;
   int r = decode(want_to_read, chunks, &decoded_map);
   if (r == 0) {
     for (unsigned int i = 0; i < get_data_chunk_count(); i++) {
-      int chunk = chunk_mapping.size() > i ? chunk_mapping[i] : i;
-      decoded->claim_append(decoded_map[chunk]);
+      decoded->claim_append(decoded_map[chunk_index(i)]);
     }
   }
   return r;
diff --git a/src/erasure-code/ErasureCode.h b/src/erasure-code/ErasureCode.h
index 7aaea95..ab00120 100644
--- a/src/erasure-code/ErasureCode.h
+++ b/src/erasure-code/ErasureCode.h
@@ -85,6 +85,9 @@ namespace ceph {
 
     virtual int decode_concat(const map<int, bufferlist> &chunks,
 			      bufferlist *decoded);
+
+  private:
+    int chunk_index(unsigned int i) const;
   };
 }
 
-- 
2.1.1


^ permalink raw reply related	[flat|nested] 41+ messages in thread

* [PATCH v3 3/4] erasure code: use 32-byte aligned buffers
  2014-09-29 12:34 ` [PATCH v3 0/4] buffer alignment for erasure code SIMD Janne Grunau
  2014-09-29 12:34   ` [PATCH v3 1/4] buffer: add an aligned buffer with less alignment than a page Janne Grunau
  2014-09-29 12:34   ` [PATCH v3 2/4] erasure code: use a function for the chunk mapping index Janne Grunau
@ 2014-09-29 12:34   ` Janne Grunau
  2014-09-29 12:34   ` [PATCH v3 4/4] ceph_erasure_code_benchmark: use 32-byte aligned input Janne Grunau
                     ` (2 subsequent siblings)
  5 siblings, 0 replies; 41+ messages in thread
From: Janne Grunau @ 2014-09-29 12:34 UTC (permalink / raw)
  To: ceph-devel

Requiring page aligned buffers and realigning the input if necessary
creates measurable overhead. ceph_erasure_code_benchmark is between
10% and 20% faster depending on the workload.

Also prevents a misaligned buffer when bufferlist::c_str(bufferlist)
has to allocate a new buffer to provide a contiguous one. See bug #9408.
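
A hedged illustration of that second paragraph (the demo function and values
are made up; only the c_str() rebuild behaviour is taken from the description
above):

  #include <cassert>
  #include "include/buffer.h"

  void demo()
  {
    ceph::bufferlist bl;
    bl.append("first segment", 13);
    bl.append("second segment", 14);

    // c_str() on a multi-segment list has to rebuild it into one contiguous
    // allocation; without this series that allocation carried no SIMD
    // alignment guarantee, which is what bug #9408 ran into.
    char *flat = bl.c_str();
    (void)flat;

    // with this series the erasure-code path rebuilds with an explicit
    // alignment instead of relying on what c_str() happens to return:
    bl.rebuild_aligned(32);
    assert(bl.is_aligned(32));
  }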

Signed-off-by: Janne Grunau <j@jannau.net>
---
 src/erasure-code/ErasureCode.cc | 59 ++++++++++++++++++++++++++++-------------
 src/erasure-code/ErasureCode.h  |  3 ++-
 2 files changed, 42 insertions(+), 20 deletions(-)

diff --git a/src/erasure-code/ErasureCode.cc b/src/erasure-code/ErasureCode.cc
index 8b6c57f..7087f51 100644
--- a/src/erasure-code/ErasureCode.cc
+++ b/src/erasure-code/ErasureCode.cc
@@ -22,6 +22,8 @@
 #include "common/strtol.h"
 #include "ErasureCode.h"
 
+static const unsigned SIMD_ALIGN = 32;
+
 int ErasureCode::chunk_index(unsigned int i) const
 {
   return chunk_mapping.size() > i ? chunk_mapping[i] : i;
@@ -59,22 +61,46 @@ int ErasureCode::minimum_to_decode_with_cost(const set<int> &want_to_read,
 }
 
 int ErasureCode::encode_prepare(const bufferlist &raw,
-                                bufferlist *prepared) const
+                                map<int, bufferlist> &encoded) const
 {
   unsigned int k = get_data_chunk_count();
   unsigned int m = get_chunk_count() - k;
   unsigned blocksize = get_chunk_size(raw.length());
-  unsigned padded_length = blocksize * k;
-  *prepared = raw;
-  if (padded_length - raw.length() > 0) {
-    bufferptr pad(padded_length - raw.length());
-    pad.zero();
-    prepared->push_back(pad);
+  unsigned pad_len = blocksize * k - raw.length();
+  unsigned padded_chunks = k - raw.length() / blocksize;
+  bufferlist prepared = raw;
+
+  if (!prepared.is_aligned(SIMD_ALIGN)) {
+    // splice padded chunks off to make the rebuild faster
+    if (padded_chunks)
+      prepared.splice((k - padded_chunks) * blocksize,
+                      padded_chunks * blocksize - pad_len);
+    prepared.rebuild_aligned(SIMD_ALIGN);
+  }
+
+  for (unsigned int i = 0; i < k - padded_chunks; i++) {
+    bufferlist &chunk = encoded[chunk_index(i)];
+    chunk.substr_of(prepared, i * blocksize, blocksize);
+  }
+  if (padded_chunks) {
+    unsigned remainder = raw.length() - (k - padded_chunks) * blocksize;
+    bufferptr buf(buffer::create_aligned(blocksize, SIMD_ALIGN));
+
+    raw.copy((k - padded_chunks) * blocksize, remainder, buf.c_str());
+    buf.zero(remainder, blocksize - remainder);
+    encoded[chunk_index(k-padded_chunks)].push_back(buf);
+
+    for (unsigned int i = k - padded_chunks + 1; i < k; i++) {
+      bufferptr buf(buffer::create_aligned(blocksize, SIMD_ALIGN));
+      buf.zero();
+      encoded[chunk_index(i)].push_back(buf);
+    }
+  }
+  for (unsigned int i = k; i < k + m; i++) {
+    bufferlist &chunk = encoded[chunk_index(i)];
+    chunk.push_back(buffer::create_aligned(blocksize, SIMD_ALIGN));
   }
-  unsigned coding_length = blocksize * m;
-  bufferptr coding(buffer::create_page_aligned(coding_length));
-  prepared->push_back(coding);
-  prepared->rebuild_page_aligned();
+
   return 0;
 }
 
@@ -85,14 +111,9 @@ int ErasureCode::encode(const set<int> &want_to_encode,
   unsigned int k = get_data_chunk_count();
   unsigned int m = get_chunk_count() - k;
   bufferlist out;
-  int err = encode_prepare(in, &out);
+  int err = encode_prepare(in, *encoded);
   if (err)
     return err;
-  unsigned blocksize = get_chunk_size(in.length());
-  for (unsigned int i = 0; i < k + m; i++) {
-    bufferlist &chunk = (*encoded)[chunk_index(i)];
-    chunk.substr_of(out, i * blocksize, blocksize);
-  }
   encode_chunks(want_to_encode, encoded);
   for (unsigned int i = 0; i < k + m; i++) {
     if (want_to_encode.count(i) == 0)
@@ -132,11 +153,11 @@ int ErasureCode::decode(const set<int> &want_to_read,
   unsigned blocksize = (*chunks.begin()).second.length();
   for (unsigned int i =  0; i < k + m; i++) {
     if (chunks.find(i) == chunks.end()) {
-      bufferptr ptr(buffer::create_page_aligned(blocksize));
+      bufferptr ptr(buffer::create_aligned(blocksize, SIMD_ALIGN));
       (*decoded)[i].push_front(ptr);
     } else {
       (*decoded)[i] = chunks.find(i)->second;
-      (*decoded)[i].rebuild_page_aligned();
+      (*decoded)[i].rebuild_aligned(SIMD_ALIGN);
     }
   }
   return decode_chunks(want_to_read, chunks, decoded);
diff --git a/src/erasure-code/ErasureCode.h b/src/erasure-code/ErasureCode.h
index ab00120..11c3491 100644
--- a/src/erasure-code/ErasureCode.h
+++ b/src/erasure-code/ErasureCode.h
@@ -46,7 +46,8 @@ namespace ceph {
                                             const map<int, int> &available,
                                             set<int> *minimum);
 
-    int encode_prepare(const bufferlist &raw, bufferlist *prepared) const;
+    int encode_prepare(const bufferlist &raw,
+                       map<int, bufferlist> &encoded) const;
 
     virtual int encode(const set<int> &want_to_encode,
                        const bufferlist &in,
-- 
2.1.1


^ permalink raw reply related	[flat|nested] 41+ messages in thread

* [PATCH v3 4/4] ceph_erasure_code_benchmark: use 32-byte aligned input
  2014-09-29 12:34 ` [PATCH v3 0/4] buffer alignment for erasure code SIMD Janne Grunau
                     ` (2 preceding siblings ...)
  2014-09-29 12:34   ` [PATCH v3 3/4] erasure code: use 32-byte aligned buffers Janne Grunau
@ 2014-09-29 12:34   ` Janne Grunau
  2014-09-29 13:15   ` [PATCH v3 0/4] buffer alignment for erasure code SIMD Loic Dachary
  2014-09-29 15:18   ` Milosz Tanski
  5 siblings, 0 replies; 41+ messages in thread
From: Janne Grunau @ 2014-09-29 12:34 UTC (permalink / raw)
  To: ceph-devel

The benchmark is supposed to measure the encoding/decoding speed and
not the overhead of buffer realignments.

Signed-off-by: Janne Grunau <j@jannau.net>
---
 src/test/erasure-code/ceph_erasure_code_benchmark.cc | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/src/test/erasure-code/ceph_erasure_code_benchmark.cc b/src/test/erasure-code/ceph_erasure_code_benchmark.cc
index c6a4228..71d22a7 100644
--- a/src/test/erasure-code/ceph_erasure_code_benchmark.cc
+++ b/src/test/erasure-code/ceph_erasure_code_benchmark.cc
@@ -144,6 +144,7 @@ int ErasureCodeBench::encode()
 
   bufferlist in;
   in.append(string(in_size, 'X'));
+  in.rebuild_aligned(32);
   set<int> want_to_encode;
   for (int i = 0; i < k + m; i++) {
     want_to_encode.insert(i);
@@ -183,6 +184,7 @@ int ErasureCodeBench::decode()
   }
   bufferlist in;
   in.append(string(in_size, 'X'));
+  in.rebuild_aligned(32);
 
   set<int> want_to_encode;
   for (int i = 0; i < k + m; i++) {
-- 
2.1.1


^ permalink raw reply related	[flat|nested] 41+ messages in thread

* Re: [PATCH v3 1/4] buffer: add an aligned buffer with less alignment than a page
  2014-09-29 12:34   ` [PATCH v3 1/4] buffer: add an aligned buffer with less alignment than a page Janne Grunau
@ 2014-09-29 13:12     ` Loic Dachary
  2014-10-02 12:09       ` Janne Grunau
  2014-09-29 13:27     ` Loic Dachary
  1 sibling, 1 reply; 41+ messages in thread
From: Loic Dachary @ 2014-09-29 13:12 UTC (permalink / raw)
  To: Janne Grunau, ceph-devel

[-- Attachment #1: Type: text/plain, Size: 9154 bytes --]



On 29/09/2014 14:34, Janne Grunau wrote:
> SIMD optimized erasure code computation needs aligned memory. Buffers
> aligned to a page boundary are not needed though. The buffers used
> for the erasure code computation are typically smaller than a page.
> 
> The typical alignment requirements for SIMD operations are 16 bytes for
> SSE2 and NEON and 32 bytes for AVX/AVX2.
> 
> Signed-off-by: Janne Grunau <j@jannau.net>
> ---
>  configure.ac         |   9 +++++
>  src/common/buffer.cc | 106 +++++++++++++++++++++++++++++++++++++++++++++++++++
>  src/include/buffer.h |  15 ++++++++
>  3 files changed, 130 insertions(+)
> 
> diff --git a/configure.ac b/configure.ac
> index cccf2d9..1bb27c4 100644
> --- a/configure.ac
> +++ b/configure.ac
> @@ -793,6 +793,15 @@ AC_MSG_RESULT([no])
>  ])
>  
>  #
> +# Check for functions to provide aligned memory
> +#
> +AC_CHECK_HEADERS([malloc.h])
> +AC_CHECK_FUNCS([posix_memalign _aligned_malloc memalign aligned_malloc],
> +               [found_memalign=yes; break])
> +
> +AS_IF([test "x$found_memalign" != "xyes"], [AC_MSG_WARN([No function for aligned memory allocation found])])
> +
> +#
>  # Check for pthread spinlock (depends on ACX_PTHREAD)
>  #
>  saved_LIBS="$LIBS"
> diff --git a/src/common/buffer.cc b/src/common/buffer.cc
> index af298ac..dfe887e 100644
> --- a/src/common/buffer.cc
> +++ b/src/common/buffer.cc
> @@ -30,6 +30,10 @@
>  #include <sys/uio.h>
>  #include <limits.h>
>  
> +#ifdef HAVE_MALLOC_H
> +#include <malloc.h>
> +#endif
> +
>  namespace ceph {
>  
>  #ifdef BUFFER_DEBUG
> @@ -155,9 +159,17 @@ static simple_spinlock_t buffer_debug_lock = SIMPLE_SPINLOCK_INITIALIZER;
>      virtual int zero_copy_to_fd(int fd, loff_t *offset) {
>        return -ENOTSUP;
>      }
> +    virtual bool is_aligned(unsigned align) {
> +      assert((align >= sizeof(void *)) && (align & (align - 1)) == 0);
> +      return ((intptr_t)data & (align - 1)) == 0;
> +    }
>      virtual bool is_page_aligned() {
>        return ((long)data & ~CEPH_PAGE_MASK) == 0;
>      }
> +    bool is_n_align_sized(unsigned align) {
> +      assert((align >= sizeof(void *)) && (align & (align - 1)) == 0);

This does not seem necessary in this context ?

> +      return (len % align) == 0;
> +    }
>      bool is_n_page_sized() {
>        return (len & ~CEPH_PAGE_MASK) == 0;
>      }
> @@ -209,6 +221,43 @@ static simple_spinlock_t buffer_debug_lock = SIMPLE_SPINLOCK_INITIALIZER;
>      }
>    };
>  
> +  class buffer::raw_aligned : public buffer::raw {
> +    unsigned _align;
> +  public:
> +    raw_aligned(unsigned l, unsigned align) : raw(l) {
> +      _align = align;
> +      assert((align >= sizeof(void *)) && (align & (align - 1)) == 0);
> +      if (len) {
> +#if HAVE_POSIX_MEMALIGN
> +        if (posix_memalign((void **) &data, align, len))
> +          data = 0;
> +#elif HAVE__ALIGNED_MALLOC
> +        data = _aligned_malloc(len, align);
> +#elif HAVE_MEMALIGN
> +        data = memalign(align, len);
> +#elif HAVE_ALIGNED_MALLOC
> +        data = aligned_malloc((len + align - 1) & ~(align - 1), align);
> +#else
> +        data = malloc(len);
> +#endif
> +        if (!data)
> +          throw bad_alloc();
> +      } else {
> +        data = 0;
> +      }
> +      inc_total_alloc(len);
> +      bdout << "raw_aligned " << this << " alloc " << (void *)data << " " << l << " " << buffer::get_total_alloc() << bendl;
> +    }
> +    ~raw_aligned() {
> +      free(data);
> +      dec_total_alloc(len);
> +      bdout << "raw_aligned " << this << " free " << (void *)data << " " << buffer::get_total_alloc() << bendl;
> +    }
> +    raw* clone_empty() {
> +      return new raw_aligned(len, _align);
> +    }
> +  };
> +
>  #ifndef __CYGWIN__
>    class buffer::raw_mmap_pages : public buffer::raw {
>    public:
> @@ -334,6 +383,10 @@ static simple_spinlock_t buffer_debug_lock = SIMPLE_SPINLOCK_INITIALIZER;
>        return true;
>      }
>  
> +    bool is_aligned(unsigned) {
> +      return false;
> +    }
> +
>      bool is_page_aligned() {
>        return false;
>      }
> @@ -520,6 +573,9 @@ static simple_spinlock_t buffer_debug_lock = SIMPLE_SPINLOCK_INITIALIZER;
>    buffer::raw* buffer::create_static(unsigned len, char *buf) {
>      return new raw_static(buf, len);
>    }
> +  buffer::raw* buffer::create_aligned(unsigned len, unsigned align) {
> +    return new raw_aligned(len, align);
> +  }
>    buffer::raw* buffer::create_page_aligned(unsigned len) {
>  #ifndef __CYGWIN__
>      //return new raw_mmap_pages(len);
> @@ -1013,6 +1069,17 @@ static simple_spinlock_t buffer_debug_lock = SIMPLE_SPINLOCK_INITIALIZER;
>      return true;
>    }
>  
> +  bool buffer::list::is_aligned(unsigned align) const
> +  {
> +    assert((align >= sizeof(void *)) && (align & (align - 1)) == 0);
> +    for (std::list<ptr>::const_iterator it = _buffers.begin();
> +         it != _buffers.end();
> +         ++it)
> +      if (!it->is_aligned(align))
> +        return false;
> +    return true;
> +  }
> +
>    bool buffer::list::is_page_aligned() const
>    {
>      for (std::list<ptr>::const_iterator it = _buffers.begin();
> @@ -1101,6 +1168,45 @@ static simple_spinlock_t buffer_debug_lock = SIMPLE_SPINLOCK_INITIALIZER;
>      _buffers.push_back(nb);
>    }
>  
> +void buffer::list::rebuild_aligned(unsigned align)
> +{
> +  std::list<ptr>::iterator p = _buffers.begin();
> +  assert((align >= sizeof(void *)) && (align & (align - 1)) == 0);
> +  while (p != _buffers.end()) {
> +    // keep anything that's already align sized+aligned
> +    if (p->is_aligned(align) && p->is_n_align_sized(align)) {
> +      /*cout << " segment " << (void*)p->c_str()
> +             << " offset " << ((unsigned long)p->c_str() & (align - 1))
> +             << " length " << p->length()
> +             << " " << (p->length() & (align - 1)) << " ok" << std::endl;
> +      */
> +      ++p;
> +      continue;
> +    }
> +
> +    // consolidate unaligned items, until we get something that is sized+aligned
> +    list unaligned;
> +    unsigned offset = 0;
> +    do {
> +      /*cout << " segment " << (void*)p->c_str()
> +             << " offset " << ((unsigned long)p->c_str() & (align - 1))
> +             << " length " << p->length() << " " << (p->length() & (align - 1))
> +             << " overall offset " << offset << " " << (offset & (align - 1))
> +             << " not ok" << std::endl;
> +      */
> +      offset += p->length();
> +      unaligned.push_back(*p);
> +      _buffers.erase(p++);
> +    } while (p != _buffers.end() &&
> +	     (!p->is_aligned(align) ||
> +	      !p->is_n_align_sized(align) ||
> +	      (offset % align)));
> +    ptr nb(buffer::create_aligned(unaligned._len, align));
> +    unaligned.rebuild(nb);
> +    _buffers.insert(p, unaligned._buffers.front());
> +  }
> +}
> +
>  void buffer::list::rebuild_page_aligned()
>  {
>    std::list<ptr>::iterator p = _buffers.begin();
> diff --git a/src/include/buffer.h b/src/include/buffer.h
> index e5c1b50..2a32cf4 100644
> --- a/src/include/buffer.h
> +++ b/src/include/buffer.h
> @@ -124,6 +124,7 @@ private:
>     */
>    class raw;
>    class raw_malloc;
> +  class raw_aligned;
>    class raw_static;
>    class raw_mmap_pages;
>    class raw_posix_aligned;
> @@ -144,6 +145,7 @@ public:
>    static raw* create_malloc(unsigned len);
>    static raw* claim_malloc(unsigned len, char *buf);
>    static raw* create_static(unsigned len, char *buf);
> +  static raw* create_aligned(unsigned len, unsigned align);
>    static raw* create_page_aligned(unsigned len);
>    static raw* create_zero_copy(unsigned len, int fd, int64_t *offset);
>  
> @@ -177,7 +179,17 @@ public:
>      bool at_buffer_head() const { return _off == 0; }
>      bool at_buffer_tail() const;
>  
> +    bool is_aligned(unsigned align) const
> +    {
> +      assert((align >= sizeof(void *)) && (align & (align - 1)) == 0);
> +      return ((intptr_t)c_str() & (align - 1)) == 0;
> +    }
>      bool is_page_aligned() const { return ((long)c_str() & ~CEPH_PAGE_MASK) == 0; }
> +    bool is_n_align_sized(unsigned align) const
> +    {
> +      assert((align >= sizeof(void *)) && (align & (align - 1)) == 0);
> +      return (length() % align) == 0;
> +    }
>      bool is_n_page_sized() const { return (length() & ~CEPH_PAGE_MASK) == 0; }
>  
>      // accessors
> @@ -344,7 +356,9 @@ public:
>      bool contents_equal(buffer::list& other);
>  
>      bool can_zero_copy() const;
> +    bool is_aligned(unsigned align) const;
>      bool is_page_aligned() const;
> +    bool is_n_align_sized(unsigned align) const;
>      bool is_n_page_sized() const;
>  
>      bool is_zero() const;
> @@ -382,6 +396,7 @@ public:
>      bool is_contiguous();
>      void rebuild();
>      void rebuild(ptr& nb);
> +    void rebuild_aligned(unsigned align);
>      void rebuild_page_aligned();
>  
>      // sort-of-like-assignment-op
> 

-- 
Loïc Dachary, Artisan Logiciel Libre


[-- Attachment #2: OpenPGP digital signature --]
[-- Type: application/pgp-signature, Size: 263 bytes --]

^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [PATCH v3 0/4] buffer alignment for erasure code SIMD
  2014-09-29 12:34 ` [PATCH v3 0/4] buffer alignment for erasure code SIMD Janne Grunau
                     ` (3 preceding siblings ...)
  2014-09-29 12:34   ` [PATCH v3 4/4] ceph_erasure_code_benchmark: use 32-byte aligned input Janne Grunau
@ 2014-09-29 13:15   ` Loic Dachary
  2014-09-29 15:18   ` Milosz Tanski
  5 siblings, 0 replies; 41+ messages in thread
From: Loic Dachary @ 2014-09-29 13:15 UTC (permalink / raw)
  To: Janne Grunau, ceph-devel

[-- Attachment #1: Type: text/plain, Size: 899 bytes --]

Hi,

On 29/09/2014 14:34, Janne Grunau wrote:
> Hi,
> 
> reworked patchset to address the comments in https://github.com/ceph/ceph/pull/2558
> 
> variable alignment instead of the hardcoded 32-byte alignment
> fixed copy and paste error in a comment
> change buffer alignment for decoding too (much simpler than the encoding
> changes)
> 
> I'll do a github pull request once the last make check run finishes
> locally.

I was too quick it seems : https://github.com/ceph/ceph/pull/2595

Looks great !

Cheers

> 
> Also available from http://git.jannau.net/ceph.git/log/?h=buffer_align
> 
> regards
> 
> Janne
> 
> --
> To unsubscribe from this list: send the line "unsubscribe ceph-devel" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> 

-- 
Loïc Dachary, Artisan Logiciel Libre


[-- Attachment #2: OpenPGP digital signature --]
[-- Type: application/pgp-signature, Size: 263 bytes --]

^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [PATCH v3 1/4] buffer: add an aligned buffer with less alignment than a page
  2014-09-29 12:34   ` [PATCH v3 1/4] buffer: add an aligned buffer with less alignment than a page Janne Grunau
  2014-09-29 13:12     ` Loic Dachary
@ 2014-09-29 13:27     ` Loic Dachary
  2014-10-02 12:12       ` Janne Grunau
  1 sibling, 1 reply; 41+ messages in thread
From: Loic Dachary @ 2014-09-29 13:27 UTC (permalink / raw)
  To: Janne Grunau, ceph-devel

[-- Attachment #1: Type: text/plain, Size: 9410 bytes --]

Hi Janne,

I think the implementation of the bufferlist is_n_align_sized function is missing. That showed up when adding unit tests for it:

https://github.com/dachary/ceph/commit/fd01824f1681edd0ec029b02318b6c40b923c156

Feel free to pick up where I left off; I won't work on it any more today ;-)
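
For context, the kind of test meant here, sketched against the API in the
patch (the test name and expectations are illustrative, not the contents of
the commit linked above):

  #include "gtest/gtest.h"
  #include "include/buffer.h"

  TEST(BufferList, rebuild_aligned) {
    ceph::bufferlist bl;
    bl.append('x');
    bl.append('y');
    bl.rebuild_aligned(32);
    EXPECT_TRUE(bl.is_aligned(32));
    // bufferlist::is_n_align_sized() is declared in buffer.h but has no
    // definition in buffer.cc yet, so the next line would fail to link:
    // EXPECT_TRUE(bl.is_n_align_sized(32));
  }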

Cheers

On 29/09/2014 14:34, Janne Grunau wrote:
> SIMD optimized erasure code computation needs aligned memory. Buffers
> aligned to a page boundary are not needed though. The buffers used
> for the erasure code computation are typically smaller than a page.
> 
> The typical alignment requirements for SIMD operations are 16 bytes for
> SSE2 and NEON and 32 bytes for AVX/AVX2.
> 
> Signed-off-by: Janne Grunau <j@jannau.net>
> ---
>  configure.ac         |   9 +++++
>  src/common/buffer.cc | 106 +++++++++++++++++++++++++++++++++++++++++++++++++++
>  src/include/buffer.h |  15 ++++++++
>  3 files changed, 130 insertions(+)
> 
> diff --git a/configure.ac b/configure.ac
> index cccf2d9..1bb27c4 100644
> --- a/configure.ac
> +++ b/configure.ac
> @@ -793,6 +793,15 @@ AC_MSG_RESULT([no])
>  ])
>  
>  #
> +# Check for functions to provide aligned memory
> +#
> +AC_CHECK_HEADERS([malloc.h])
> +AC_CHECK_FUNCS([posix_memalign _aligned_malloc memalign aligned_malloc],
> +               [found_memalign=yes; break])
> +
> +AS_IF([test "x$found_memalign" != "xyes"], [AC_MSG_WARN([No function for aligned memory allocation found])])
> +
> +#
>  # Check for pthread spinlock (depends on ACX_PTHREAD)
>  #
>  saved_LIBS="$LIBS"
> diff --git a/src/common/buffer.cc b/src/common/buffer.cc
> index af298ac..dfe887e 100644
> --- a/src/common/buffer.cc
> +++ b/src/common/buffer.cc
> @@ -30,6 +30,10 @@
>  #include <sys/uio.h>
>  #include <limits.h>
>  
> +#ifdef HAVE_MALLOC_H
> +#include <malloc.h>
> +#endif
> +
>  namespace ceph {
>  
>  #ifdef BUFFER_DEBUG
> @@ -155,9 +159,17 @@ static simple_spinlock_t buffer_debug_lock = SIMPLE_SPINLOCK_INITIALIZER;
>      virtual int zero_copy_to_fd(int fd, loff_t *offset) {
>        return -ENOTSUP;
>      }
> +    virtual bool is_aligned(unsigned align) {
> +      assert((align >= sizeof(void *)) && (align & (align - 1)) == 0);
> +      return ((intptr_t)data & (align - 1)) == 0;
> +    }
>      virtual bool is_page_aligned() {
>        return ((long)data & ~CEPH_PAGE_MASK) == 0;
>      }
> +    bool is_n_align_sized(unsigned align) {
> +      assert((align >= sizeof(void *)) && (align & (align - 1)) == 0);
> +      return (len % align) == 0;
> +    }
>      bool is_n_page_sized() {
>        return (len & ~CEPH_PAGE_MASK) == 0;
>      }
> @@ -209,6 +221,43 @@ static simple_spinlock_t buffer_debug_lock = SIMPLE_SPINLOCK_INITIALIZER;
>      }
>    };
>  
> +  class buffer::raw_aligned : public buffer::raw {
> +    unsigned _align;
> +  public:
> +    raw_aligned(unsigned l, unsigned align) : raw(l) {
> +      _align = align;
> +      assert((align >= sizeof(void *)) && (align & (align - 1)) == 0);
> +      if (len) {
> +#if HAVE_POSIX_MEMALIGN
> +        if (posix_memalign((void **) &data, align, len))
> +          data = 0;
> +#elif HAVE__ALIGNED_MALLOC
> +        data = _aligned_malloc(len, align);
> +#elif HAVE_MEMALIGN
> +        data = memalign(align, len);
> +#elif HAVE_ALIGNED_MALLOC
> +        data = aligned_malloc((len + align - 1) & ~(align - 1), align);
> +#else
> +        data = malloc(len);
> +#endif
> +        if (!data)
> +          throw bad_alloc();
> +      } else {
> +        data = 0;
> +      }
> +      inc_total_alloc(len);
> +      bdout << "raw_aligned " << this << " alloc " << (void *)data << " " << l << " " << buffer::get_total_alloc() << bendl;
> +    }
> +    ~raw_aligned() {
> +      free(data);
> +      dec_total_alloc(len);
> +      bdout << "raw_aligned " << this << " free " << (void *)data << " " << buffer::get_total_alloc() << bendl;
> +    }
> +    raw* clone_empty() {
> +      return new raw_aligned(len, _align);
> +    }
> +  };
> +
>  #ifndef __CYGWIN__
>    class buffer::raw_mmap_pages : public buffer::raw {
>    public:
> @@ -334,6 +383,10 @@ static simple_spinlock_t buffer_debug_lock = SIMPLE_SPINLOCK_INITIALIZER;
>        return true;
>      }
>  
> +    bool is_aligned(unsigned) {
> +      return false;
> +    }
> +
>      bool is_page_aligned() {
>        return false;
>      }
> @@ -520,6 +573,9 @@ static simple_spinlock_t buffer_debug_lock = SIMPLE_SPINLOCK_INITIALIZER;
>    buffer::raw* buffer::create_static(unsigned len, char *buf) {
>      return new raw_static(buf, len);
>    }
> +  buffer::raw* buffer::create_aligned(unsigned len, unsigned align) {
> +    return new raw_aligned(len, align);
> +  }
>    buffer::raw* buffer::create_page_aligned(unsigned len) {
>  #ifndef __CYGWIN__
>      //return new raw_mmap_pages(len);
> @@ -1013,6 +1069,17 @@ static simple_spinlock_t buffer_debug_lock = SIMPLE_SPINLOCK_INITIALIZER;
>      return true;
>    }
>  
> +  bool buffer::list::is_aligned(unsigned align) const
> +  {
> +    assert((align >= sizeof(void *)) && (align & (align - 1)) == 0);
> +    for (std::list<ptr>::const_iterator it = _buffers.begin();
> +         it != _buffers.end();
> +         ++it)
> +      if (!it->is_aligned(align))
> +        return false;
> +    return true;
> +  }
> +
>    bool buffer::list::is_page_aligned() const
>    {
>      for (std::list<ptr>::const_iterator it = _buffers.begin();
> @@ -1101,6 +1168,45 @@ static simple_spinlock_t buffer_debug_lock = SIMPLE_SPINLOCK_INITIALIZER;
>      _buffers.push_back(nb);
>    }
>  
> +void buffer::list::rebuild_aligned(unsigned align)
> +{
> +  std::list<ptr>::iterator p = _buffers.begin();
> +  assert((align >= sizeof(void *)) && (align & (align - 1)) == 0);
> +  while (p != _buffers.end()) {
> +    // keep anything that's already align sized+aligned
> +    if (p->is_aligned(align) && p->is_n_align_sized(align)) {
> +      /*cout << " segment " << (void*)p->c_str()
> +             << " offset " << ((unsigned long)p->c_str() & (align - 1))
> +             << " length " << p->length()
> +             << " " << (p->length() & (align - 1)) << " ok" << std::endl;
> +      */
> +      ++p;
> +      continue;
> +    }
> +
> +    // consolidate unaligned items, until we get something that is sized+aligned
> +    list unaligned;
> +    unsigned offset = 0;
> +    do {
> +      /*cout << " segment " << (void*)p->c_str()
> +             << " offset " << ((unsigned long)p->c_str() & (align - 1))
> +             << " length " << p->length() << " " << (p->length() & (align - 1))
> +             << " overall offset " << offset << " " << (offset & (align - 1))
> +             << " not ok" << std::endl;
> +      */
> +      offset += p->length();
> +      unaligned.push_back(*p);
> +      _buffers.erase(p++);
> +    } while (p != _buffers.end() &&
> +	     (!p->is_aligned(align) ||
> +	      !p->is_n_align_sized(align) ||
> +	      (offset % align)));
> +    ptr nb(buffer::create_aligned(unaligned._len, align));
> +    unaligned.rebuild(nb);
> +    _buffers.insert(p, unaligned._buffers.front());
> +  }
> +}
> +
>  void buffer::list::rebuild_page_aligned()
>  {
>    std::list<ptr>::iterator p = _buffers.begin();
> diff --git a/src/include/buffer.h b/src/include/buffer.h
> index e5c1b50..2a32cf4 100644
> --- a/src/include/buffer.h
> +++ b/src/include/buffer.h
> @@ -124,6 +124,7 @@ private:
>     */
>    class raw;
>    class raw_malloc;
> +  class raw_aligned;
>    class raw_static;
>    class raw_mmap_pages;
>    class raw_posix_aligned;
> @@ -144,6 +145,7 @@ public:
>    static raw* create_malloc(unsigned len);
>    static raw* claim_malloc(unsigned len, char *buf);
>    static raw* create_static(unsigned len, char *buf);
> +  static raw* create_aligned(unsigned len, unsigned align);
>    static raw* create_page_aligned(unsigned len);
>    static raw* create_zero_copy(unsigned len, int fd, int64_t *offset);
>  
> @@ -177,7 +179,17 @@ public:
>      bool at_buffer_head() const { return _off == 0; }
>      bool at_buffer_tail() const;
>  
> +    bool is_aligned(unsigned align) const
> +    {
> +      assert((align >= sizeof(void *)) && (align & (align - 1)) == 0);
> +      return ((intptr_t)c_str() & (align - 1)) == 0;
> +    }
>      bool is_page_aligned() const { return ((long)c_str() & ~CEPH_PAGE_MASK) == 0; }
> +    bool is_n_align_sized(unsigned align) const
> +    {
> +      assert((align >= sizeof(void *)) && (align & (align - 1)) == 0);
> +      return (length() % align) == 0;
> +    }
>      bool is_n_page_sized() const { return (length() & ~CEPH_PAGE_MASK) == 0; }
>  
>      // accessors
> @@ -344,7 +356,9 @@ public:
>      bool contents_equal(buffer::list& other);
>  
>      bool can_zero_copy() const;
> +    bool is_aligned(unsigned align) const;
>      bool is_page_aligned() const;
> +    bool is_n_align_sized(unsigned align) const;
>      bool is_n_page_sized() const;
>  
>      bool is_zero() const;
> @@ -382,6 +396,7 @@ public:
>      bool is_contiguous();
>      void rebuild();
>      void rebuild(ptr& nb);
> +    void rebuild_aligned(unsigned align);
>      void rebuild_page_aligned();
>  
>      // sort-of-like-assignment-op
> 

-- 
Loïc Dachary, Artisan Logiciel Libre


[-- Attachment #2: OpenPGP digital signature --]
[-- Type: application/pgp-signature, Size: 263 bytes --]

^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [PATCH v3 0/4] buffer alignment for erasure code SIMD
  2014-09-29 12:34 ` [PATCH v3 0/4] buffer alignment for erasure code SIMD Janne Grunau
                     ` (4 preceding siblings ...)
  2014-09-29 13:15   ` [PATCH v3 0/4] buffer alignment for erasure code SIMD Loic Dachary
@ 2014-09-29 15:18   ` Milosz Tanski
  2014-09-29 15:24     ` C++11 Sage Weil
  2014-10-02 12:15     ` [PATCH v3 0/4] buffer alignment for erasure code SIMD Janne Grunau
  5 siblings, 2 replies; 41+ messages in thread
From: Milosz Tanski @ 2014-09-29 15:18 UTC (permalink / raw)
  To: Janne Grunau; +Cc: ceph-devel

Janne, I have a couple questions and forgive me if I missed something
from the past.

Did you consider using std::aligned_storage? That way you don't have
to worry as much about portability yourself. Also, there is a
boost::aligned_storage as well if Ceph can't build against C++11
goodness.
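
For readers who haven't used it, roughly what std::aligned_storage provides
(the sizes here are just for illustration):

  #include <type_traits>

  void example()
  {
    // 4096 bytes of storage aligned to 32 bytes; note that both numbers have
    // to be compile-time constants, unlike the runtime length and alignment
    // that buffer::raw deals with.
    typedef std::aligned_storage<4096, 32>::type block_t;
    block_t block;
    char *data = reinterpret_cast<char *>(&block);
    (void)data;
  }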

A second more general Ceph question is somewhat off-topic. What about
C++11 use in the Ceph code base (like in this case)? It's not
explicitly prohibited by the coding style document, but I imagine the
goal is to build on as many systems as possible and quite a few
supported distros have pretty old versions of GCC. I'm asking this
because I imagine some of the performance work that's about to happen
will want to use things like lockless queues, and then you get into
C++11 memory model and std::atomic... etc.

- Milosz

On Mon, Sep 29, 2014 at 8:34 AM, Janne Grunau <j@jannau.net> wrote:
> Hi,
>
> reworked patchset to address the comments in https://github.com/ceph/ceph/pull/2558
>
> variable alignment instead of the hardcoded 32-byte alignment
> fixed copy and paste error in a comment
> change buffer alignment for decoding too (much simpler than the encoding
> changes)
>
> I'll do a github pull request once the last make check run finishes
> locally.
>
> Also available from http://git.jannau.net/ceph.git/log/?h=buffer_align
>
> regards
>
> Janne
>
> --
> To unsubscribe from this list: send the line "unsubscribe ceph-devel" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html



-- 
Milosz Tanski
CTO
16 East 34th Street, 15th floor
New York, NY 10016

p: 646-253-9055
e: milosz@adfin.com

^ permalink raw reply	[flat|nested] 41+ messages in thread

* C++11
  2014-09-29 15:18   ` Milosz Tanski
@ 2014-09-29 15:24     ` Sage Weil
  2014-09-29 15:44       ` C++11 Milosz Tanski
  2014-09-29 17:56       ` C++11 Wido den Hollander
  2014-10-02 12:15     ` [PATCH v3 0/4] buffer alignment for erasure code SIMD Janne Grunau
  1 sibling, 2 replies; 41+ messages in thread
From: Sage Weil @ 2014-09-29 15:24 UTC (permalink / raw)
  To: Milosz Tanski; +Cc: Janne Grunau, ceph-devel

On Mon, 29 Sep 2014, Milosz Tanski wrote:
> A second more general Ceph question is somewhat off-topic. What about
> C++11 use in the Ceph code base (like in this case)? It's not
> explicitly prohibited by the coding style document, but I imagine the
> goal is to build on as many systems as possible and quite a few
> supported distros have pretty old versions of GCC. I'm asking this
> because I imagine some of the performance work that's about to happen
> will want to use things like lockless queues, and then you get into
> C++11 memory model and std::atomic... etc.

We are all very eager to move to C++11.  The challenge is that we still 
need to build packages for the target distros.  That doesn't necessarily 
mean that the compilers on those distros need to support c++11 as long as 
the runtime does... if we can make the build environment sane.

I'm really curious what other projects do here...

sage

^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: C++11
  2014-09-29 15:24     ` C++11 Sage Weil
@ 2014-09-29 15:44       ` Milosz Tanski
  2014-09-29 17:56       ` C++11 Wido den Hollander
  1 sibling, 0 replies; 41+ messages in thread
From: Milosz Tanski @ 2014-09-29 15:44 UTC (permalink / raw)
  To: Sage Weil; +Cc: Janne Grunau, ceph-devel

On Mon, Sep 29, 2014 at 11:24 AM, Sage Weil <sweil@redhat.com> wrote:
> On Mon, 29 Sep 2014, Milosz Tanski wrote:
>> A second more general Ceph question is somewhat off-topic. What about
>> C++11 use in the Ceph code base (like in this case)? It's not
>> explicitly prohibited by the coding style document, but I imagine the
>> goal is to build on as many systems as possible and quite a few
>> supported distros have pretty old versions of GCC. I'm asking this
>> because I imagine some of the performance work that's about to happen
>> will want to use things like lockless queues, and then you get into
>> C++11 memory model and std::atomic... etc.
>
> We are all very eager to move to C++11.  The challenge is that we still
> need to build packages for the target distros.  That doesn't necessarily
> mean that the compilers on those distros need to support c++11 as long as
> the runtime does... if we can make the build environment sane.
>
> I'm really curious what other projects do here...
>
> sage

I know that Ubuntu (12.04) and RHEL (6) provide updated compilers. I
think you can get 4.8.x in Ubuntu and 4.7 in RHEL6. One problem you're
going to run into is that the system (e.g. old) version of boost will
be built with whatever the default compiler is (4.6 for Ubuntu, 4.4 for
RHEL6), without C++11 features. Your new runtime vector becomes very large.

-- 
Milosz Tanski
CTO
16 East 34th Street, 15th floor
New York, NY 10016

p: 646-253-9055
e: milosz@adfin.com

^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: C++11
  2014-09-29 15:24     ` C++11 Sage Weil
  2014-09-29 15:44       ` C++11 Milosz Tanski
@ 2014-09-29 17:56       ` Wido den Hollander
  1 sibling, 0 replies; 41+ messages in thread
From: Wido den Hollander @ 2014-09-29 17:56 UTC (permalink / raw)
  To: Sage Weil, Milosz Tanski; +Cc: Janne Grunau, ceph-devel

On 29-09-14 17:24, Sage Weil wrote:
> On Mon, 29 Sep 2014, Milosz Tanski wrote:
>> A second more general Ceph question is somewhat off-topic. What about
>> C++11 use in the Ceph code base (like in this case)? It's not
>> explicitly prohibited by the coding style document, but I imagine the
>> goal is to build on as many systems as possible and quite a few
>> supported distros have pretty old versions of GCC. I'm asking this
>> because I imagine some of the performance work that's about to happen
>> will want to use things like lockless queues, and then you get into
>> C++11 memory model and std::atomic... etc.
> 
> We are all very eager to move to C++11.  The challenge is that we still 
> need to build packages for the target distros.  That doesn't necessarily 
> mean that the compilers on those distros need to support C++11, as long as 
> the runtime does... if we can make the build environment sane.
> 
> I'm really curious what other projects do here...
> 

At the CloudStack project we recently switched from Java 6 to Java 7 and
said that from version X onwards Java 7 is required on the system.

For Ceph, what keeps us from saying that version H (after Giant)
requires a C++11-compliant compiler?

That might rule out Ubuntu 12.04 and CentOS6/RHEL6, but does that really
matter that much for something 'new' like Ceph?

> sage


-- 
Wido den Hollander
42on B.V.

Phone: +31 (0)20 700 9902
Skype: contact42on

^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [PATCH v3 1/4] buffer: add an aligned buffer with less alignment than a page
  2014-09-29 13:12     ` Loic Dachary
@ 2014-10-02 12:09       ` Janne Grunau
  0 siblings, 0 replies; 41+ messages in thread
From: Janne Grunau @ 2014-10-02 12:09 UTC (permalink / raw)
  To: Loic Dachary; +Cc: ceph-devel

On 2014-09-29 15:12:58 +0200, Loic Dachary wrote:
> 
> 
> On 29/09/2014 14:34, Janne Grunau wrote:
> > SIMD optimized erasure code computation needs aligned memory. Buffers
> > aligned to a page boundary are not needed though. The buffers used
> > for the erasure code computation are typical smaller than a page.
> > 
> > The typical alignment requirements SIMD operations are 16 bytes for
> > SSE2 and NEON and 32 bytes for AVX/AVX2.
> > 
> > Signed-off-by: Janne Grunau <j@jannau.net>
> > ---
> >  configure.ac         |   9 +++++
> >  src/common/buffer.cc | 106 +++++++++++++++++++++++++++++++++++++++++++++++++++
> >  src/include/buffer.h |  15 ++++++++
> >  3 files changed, 130 insertions(+)
> > 
> > diff --git a/configure.ac b/configure.ac
> > index cccf2d9..1bb27c4 100644
> > --- a/configure.ac
> > +++ b/configure.ac
> > @@ -793,6 +793,15 @@ AC_MSG_RESULT([no])
> >  ])
> >  
> >  #
> > +# Check for functions to provide aligned memory
> > +#
> > +AC_CHECK_HEADERS([malloc.h])
> > +AC_CHECK_FUNCS([posix_memalign _aligned_malloc memalign aligned_malloc],
> > +               [found_memalign=yes; break])
> > +
> > +AS_IF([test "x$found_memalign" != "xyes"], [AC_MSG_WARN([No function for aligned memory allocation found])])
> > +
> > +#
> >  # Check for pthread spinlock (depends on ACX_PTHREAD)
> >  #
> >  saved_LIBS="$LIBS"
> > diff --git a/src/common/buffer.cc b/src/common/buffer.cc
> > index af298ac..dfe887e 100644
> > --- a/src/common/buffer.cc
> > +++ b/src/common/buffer.cc
> > @@ -30,6 +30,10 @@
> >  #include <sys/uio.h>
> >  #include <limits.h>
> >  
> > +#ifdef HAVE_MALLOC_H
> > +#include <malloc.h>
> > +#endif
> > +
> >  namespace ceph {
> >  
> >  #ifdef BUFFER_DEBUG
> > @@ -155,9 +159,17 @@ static simple_spinlock_t buffer_debug_lock = SIMPLE_SPINLOCK_INITIALIZER;
> >      virtual int zero_copy_to_fd(int fd, loff_t *offset) {
> >        return -ENOTSUP;
> >      }
> > +    virtual bool is_aligned(unsigned align) {
> > +      assert((align >= sizeof(void *)) && (align & (align - 1)) == 0);
> > +      return ((intptr_t)data & (align - 1)) == 0;
> > +    }
> >      virtual bool is_page_aligned() {
> >        return ((long)data & ~CEPH_PAGE_MASK) == 0;
> >      }
> > +    bool is_n_align_sized(unsigned align) {
> > +      assert((align >= sizeof(void *)) && (align & (align - 1)) == 0);
> 
> This does not seem necessary in this context ?

It is not strictly necessary, but I would still consider it a bug if 
is_n_align_sized() is called with an align parameter that is not 
supported for allocating buffers.
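
For readers following along, a minimal standalone sketch of the checks
being discussed (illustrative only, with made-up function names; the
real code lives in src/common/buffer.cc):

    #include <cassert>
    #include <cstdint>

    // align must be a power of two and at least pointer-sized.
    static bool valid_align(unsigned align) {
      return align >= sizeof(void *) && (align & (align - 1)) == 0;
    }

    // Pointer alignment check, same idea as is_aligned(align).
    static bool ptr_is_aligned(const void *data, unsigned align) {
      assert(valid_align(align));
      return ((uintptr_t)data & (align - 1)) == 0;
    }

    // Length check, same idea as is_n_align_sized(align): the length is
    // a multiple of align exactly when its low bits are zero.
    static bool len_is_align_multiple(unsigned len, unsigned align) {
      assert(valid_align(align));
      return (len & (align - 1)) == 0;
    }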

Janne

^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [PATCH v3 1/4] buffer: add an aligned buffer with less alignment than a page
  2014-09-29 13:27     ` Loic Dachary
@ 2014-10-02 12:12       ` Janne Grunau
  2014-10-02 14:17         ` Loic Dachary
  0 siblings, 1 reply; 41+ messages in thread
From: Janne Grunau @ 2014-10-02 12:12 UTC (permalink / raw)
  To: Loic Dachary; +Cc: ceph-devel

Hi,

On 2014-09-29 15:27:02 +0200, Loic Dachary wrote:
> 
> I think the implementation of the bufferlist n_align_sized functions is missing. That showed when adding unit tests for them

Yes, I didn't see the need for it.

> https://github.com/dachary/ceph/commit/fd01824f1681edd0ec029b02318b6c40b923c156
> 
> Feel free to pick up where I left, I won't work on them any more today ;-)

Did you continue to work on them? If not, I'll start from there now.
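
As a rough idea of what the missing bufferlist helpers could look like,
here is a hypothetical sketch over a plain list of (pointer, length)
chunks; it is not the actual bufferlist implementation, whose internals
differ:

    #include <cassert>
    #include <cstdint>
    #include <list>
    #include <utility>

    // Hypothetical: true if every chunk is aligned to 'align' and has a
    // length that is a multiple of 'align'.
    static bool chunks_are_aligned(
        const std::list<std::pair<const void *, unsigned> > &chunks,
        unsigned align) {
      assert(align >= sizeof(void *) && (align & (align - 1)) == 0);
      std::list<std::pair<const void *, unsigned> >::const_iterator it;
      for (it = chunks.begin(); it != chunks.end(); ++it) {
        if (((uintptr_t)it->first & (align - 1)) != 0 ||
            (it->second & (align - 1)) != 0)
          return false;
      }
      return true;
    }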

Janne



^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [PATCH v3 0/4] buffer alignment for erasure code SIMD
  2014-09-29 15:18   ` Milosz Tanski
  2014-09-29 15:24     ` C++11 Sage Weil
@ 2014-10-02 12:15     ` Janne Grunau
  1 sibling, 0 replies; 41+ messages in thread
From: Janne Grunau @ 2014-10-02 12:15 UTC (permalink / raw)
  To: Milosz Tanski; +Cc: ceph-devel

Hi,

On 2014-09-29 11:18:03 -0400, Milosz Tanski wrote:
> Janne, I have a couple questions and forgive me if I missed something
> from the past.
> 
> Did you consider using std::aligned_storage? That way you don't have
> to worry as much about portability yourself. Also, there is a
> boost::aligned_storage as well if Ceph can't build against C++11
> goodness.

No, I'm more familiar with C than C++, so those are not in my active 
C/C++ vocabulary.
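
For reference, what std::aligned_storage provides looks roughly like the
sketch below. Note that it takes the size and alignment at compile time,
so it would not directly replace a runtime-sized aligned heap allocation
like raw_aligned; this is only an illustration:

    #include <type_traits>

    // C++11: an uninitialized 4 KiB block of storage aligned to 32 bytes.
    typedef std::aligned_storage<4096, 32>::type scratch_t;

    static scratch_t scratch;  // suitably aligned object
    static unsigned char *scratch_bytes =
        reinterpret_cast<unsigned char *>(&scratch);  // use as a byte buffer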

Janne

^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [PATCH v3 1/4] buffer: add an aligned buffer with less alignment than a page
  2014-10-02 12:12       ` Janne Grunau
@ 2014-10-02 14:17         ` Loic Dachary
  0 siblings, 0 replies; 41+ messages in thread
From: Loic Dachary @ 2014-10-02 14:17 UTC (permalink / raw)
  To: Janne Grunau; +Cc: ceph-devel




On 02/10/2014 14:12, Janne Grunau wrote:
> Hi,
> 
> On 2014-09-29 15:27:02 +0200, Loic Dachary wrote:
>>
>> I think the implementation of the bufferlist n_align_sized functions is missing. That showed when adding unit tests for them
> 
> Yes, I didn't see the need for it.

Then the declaration should be removed, don't you think?

> 
>> https://github.com/dachary/ceph/commit/fd01824f1681edd0ec029b02318b6c40b923c156
>>
>> Feel free to pick up where I left, I won't work on them any more today ;-)
> 
> Did you continue to work on them? If not I'll start from there now.
> 

I was busy elsewhere; it's all yours if you want them :-)

Cheers

-- 
Loïc Dachary, Artisan Logiciel Libre



^ permalink raw reply	[flat|nested] 41+ messages in thread

end of thread, other threads:[~2014-10-02 14:17 UTC | newest]

Thread overview: 41+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2014-09-15 15:55 [PATCH 1/3] buffer: add an aligned buffer with less alignment than a page Janne Grunau
2014-09-15 15:55 ` [PATCH 2/3] ec: make use of added aligned buffers Janne Grunau
2014-09-15 17:20   ` Loic Dachary
2014-09-15 23:56     ` Ma, Jianpeng
2014-09-16  0:02       ` Sage Weil
2014-09-16  0:08         ` Ma, Jianpeng
2014-09-16  6:47           ` Loic Dachary
2014-09-16  6:59             ` Ma, Jianpeng
2014-09-16  7:55               ` Loic Dachary
2014-09-16  8:23                 ` Ma, Jianpeng
2014-09-15 15:55 ` [PATCH 3/3] ceph_erasure_code_benchmark: align the encoding input Janne Grunau
2014-09-15 16:46 ` [PATCH 1/3] buffer: add an aligned buffer with less alignment than a page Loic Dachary
2014-09-18 10:33 ` v2 aligned buffer changes for erasure codes Janne Grunau
2014-09-18 10:33   ` [PATCH v2 1/3] buffer: add an aligned buffer with less alignment than a page Janne Grunau
2014-09-18 10:33   ` [PATCH v2 2/3] ec: use 32-byte aligned buffers Janne Grunau
2014-09-19  9:47     ` Loic Dachary
2014-09-18 10:33   ` [PATCH v2 3/3] ceph_erasure_code_benchmark: align the encoding input Janne Grunau
2014-09-18 12:18   ` v2 aligned buffer changes for erasure codes Andreas Joachim Peters
2014-09-18 12:34     ` Andreas Joachim Peters
2014-09-18 12:53       ` Janne Grunau
2014-09-19  9:18       ` Loic Dachary
2014-09-18 12:40     ` Janne Grunau
2014-09-18 13:01       ` Andreas Joachim Peters
2014-09-18 13:23         ` Janne Grunau
2014-09-18 14:47           ` Andreas Joachim Peters
2014-09-29 12:34 ` [PATCH v3 0/4] buffer alignment for erasure code SIMD Janne Grunau
2014-09-29 12:34   ` [PATCH v3 1/4] buffer: add an aligned buffer with less alignment than a page Janne Grunau
2014-09-29 13:12     ` Loic Dachary
2014-10-02 12:09       ` Janne Grunau
2014-09-29 13:27     ` Loic Dachary
2014-10-02 12:12       ` Janne Grunau
2014-10-02 14:17         ` Loic Dachary
2014-09-29 12:34   ` [PATCH v3 2/4] erasure code: use a function for the chunk mapping index Janne Grunau
2014-09-29 12:34   ` [PATCH v3 3/4] erasure code: use 32-byte aligned buffers Janne Grunau
2014-09-29 12:34   ` [PATCH v3 4/4] ceph_erasure_code_benchmark: use 32-byte aligned input Janne Grunau
2014-09-29 13:15   ` [PATCH v3 0/4] buffer alignment for erasure code SIMD Loic Dachary
2014-09-29 15:18   ` Milosz Tanski
2014-09-29 15:24     ` C++11 Sage Weil
2014-09-29 15:44       ` C++11 Milosz Tanski
2014-09-29 17:56       ` C++11 Wido den Hollander
2014-10-02 12:15     ` [PATCH v3 0/4] buffer alignment for erasure code SIMD Janne Grunau
