* [PATCH 1/4] erofs-utils: introduce z_erofs_inmem_extent
@ 2022-09-06 11:40 ZiyangZhang
  2022-09-06 11:40 ` [PATCH 2/4] erofs-utils: lib: add rb-tree implementation ZiyangZhang
                   ` (2 more replies)
  0 siblings, 3 replies; 6+ messages in thread
From: ZiyangZhang @ 2022-09-06 11:40 UTC (permalink / raw)
  To: linux-erofs; +Cc: Gao Xiang, Ziyang Zhang

From: Gao Xiang <hsiangkao@linux.alibaba.com>

In order to introduce deduplication for compressed pclusters, a
lookahead extent is recorded so that the following deduplication
process can adjust the previous extent on demand.

In the future, it can also be used for parallel compression.
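
For illustration, a rough sketch of how a later pass shrinks the
recorded extent before its indexes are emitted (this is what the
deduplication pass introduced later in this series does; `delta'
stands for the number of bytes handed over to a dedupe match):

	ctx->e.length -= delta;		/* shorten the previous extent */
	z_erofs_write_indexes(ctx);	/* indexes reflect the final extent */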

Signed-off-by: Gao Xiang <hsiangkao@linux.alibaba.com>
Signed-off-by: Ziyang Zhang <ZiyangZhang@linux.alibaba.com>
---
 lib/compress.c | 87 ++++++++++++++++++++++++++++----------------------
 1 file changed, 48 insertions(+), 39 deletions(-)

diff --git a/lib/compress.c b/lib/compress.c
index fd02053..3c1d9db 100644
--- a/lib/compress.c
+++ b/lib/compress.c
@@ -22,12 +22,19 @@
 static struct erofs_compress compresshandle;
 static unsigned int algorithmtype[2];
 
-struct z_erofs_vle_compress_ctx {
-	u8 *metacur;
+struct z_erofs_inmem_extent {
+	erofs_blk_t blkaddr;
+	unsigned int compressedblks;
+	unsigned int length;
+	bool raw;
+};
 
+struct z_erofs_vle_compress_ctx {
 	u8 queue[EROFS_CONFIG_COMPR_MAX_SZ * 2];
+	struct z_erofs_inmem_extent e;	/* (lookahead) extent */
+
+	u8 *metacur;
 	unsigned int head, tail;
-	unsigned int compressedblks;
 	erofs_blk_t blkaddr;		/* pointing to the next blkaddr */
 	u16 clusterofs;
 };
@@ -43,7 +50,7 @@ static unsigned int vle_compressmeta_capacity(erofs_off_t filesize)
 	return Z_EROFS_LEGACY_MAP_HEADER_SIZE + indexsize;
 }
 
-static void vle_write_indexes_final(struct z_erofs_vle_compress_ctx *ctx)
+static void z_erofs_write_indexes_final(struct z_erofs_vle_compress_ctx *ctx)
 {
 	const unsigned int type = Z_EROFS_VLE_CLUSTER_TYPE_PLAIN;
 	struct z_erofs_vle_decompressed_index di;
@@ -59,10 +66,10 @@ static void vle_write_indexes_final(struct z_erofs_vle_compress_ctx *ctx)
 	ctx->metacur += sizeof(di);
 }
 
-static void vle_write_indexes(struct z_erofs_vle_compress_ctx *ctx,
-			      unsigned int count, bool raw)
+static void z_erofs_write_indexes(struct z_erofs_vle_compress_ctx *ctx)
 {
 	unsigned int clusterofs = ctx->clusterofs;
+	unsigned int count = ctx->e.length;
 	unsigned int d0 = 0, d1 = (clusterofs + count) / EROFS_BLKSIZ;
 	struct z_erofs_vle_decompressed_index di;
 	unsigned int type;
@@ -76,13 +83,13 @@ static void vle_write_indexes(struct z_erofs_vle_compress_ctx *ctx,
 		 * A lcluster cannot have three parts with the middle one which
 		 * is well-compressed for !ztailpacking cases.
 		 */
-		DBG_BUGON(!raw && !cfg.c_ztailpacking);
-		type = raw ? Z_EROFS_VLE_CLUSTER_TYPE_PLAIN :
+		DBG_BUGON(!ctx->e.raw && !cfg.c_ztailpacking);
+		type = ctx->e.raw ? Z_EROFS_VLE_CLUSTER_TYPE_PLAIN :
 			Z_EROFS_VLE_CLUSTER_TYPE_HEAD;
 		advise = cpu_to_le16(type << Z_EROFS_VLE_DI_CLUSTER_TYPE_BIT);
 
 		di.di_advise = advise;
-		di.di_u.blkaddr = cpu_to_le32(ctx->blkaddr);
+		di.di_u.blkaddr = cpu_to_le32(ctx->e.blkaddr);
 		memcpy(ctx->metacur, &di, sizeof(di));
 		ctx->metacur += sizeof(di);
 
@@ -95,7 +102,7 @@ static void vle_write_indexes(struct z_erofs_vle_compress_ctx *ctx,
 		/* XXX: big pcluster feature should be per-inode */
 		if (d0 == 1 && erofs_sb_has_big_pcluster()) {
 			type = Z_EROFS_VLE_CLUSTER_TYPE_NONHEAD;
-			di.di_u.delta[0] = cpu_to_le16(ctx->compressedblks |
+			di.di_u.delta[0] = cpu_to_le16(ctx->e.compressedblks |
 					Z_EROFS_VLE_DI_D0_CBLKCNT);
 			di.di_u.delta[1] = cpu_to_le16(d1);
 		} else if (d0) {
@@ -119,9 +126,9 @@ static void vle_write_indexes(struct z_erofs_vle_compress_ctx *ctx,
 				di.di_u.delta[0] = cpu_to_le16(d0);
 			di.di_u.delta[1] = cpu_to_le16(d1);
 		} else {
-			type = raw ? Z_EROFS_VLE_CLUSTER_TYPE_PLAIN :
+			type = ctx->e.raw ? Z_EROFS_VLE_CLUSTER_TYPE_PLAIN :
 				Z_EROFS_VLE_CLUSTER_TYPE_HEAD;
-			di.di_u.blkaddr = cpu_to_le32(ctx->blkaddr);
+			di.di_u.blkaddr = cpu_to_le32(ctx->e.blkaddr);
 		}
 		advise = cpu_to_le16(type << Z_EROFS_VLE_DI_CLUSTER_TYPE_BIT);
 		di.di_advise = advise;
@@ -226,18 +233,16 @@ static int vle_compress_one(struct erofs_inode *inode,
 			    struct z_erofs_vle_compress_ctx *ctx,
 			    bool final)
 {
+	static char dstbuf[EROFS_CONFIG_COMPR_MAX_SZ + EROFS_BLKSIZ];
+	char *const dst = dstbuf + EROFS_BLKSIZ;
 	struct erofs_compress *const h = &compresshandle;
 	unsigned int len = ctx->tail - ctx->head;
-	unsigned int count;
 	int ret;
-	static char dstbuf[EROFS_CONFIG_COMPR_MAX_SZ + EROFS_BLKSIZ];
-	char *const dst = dstbuf + EROFS_BLKSIZ;
 
 	while (len) {
 		unsigned int pclustersize =
 			z_erofs_get_max_pclusterblks(inode) * EROFS_BLKSIZ;
 		bool may_inline = (cfg.c_ztailpacking && final);
-		bool raw;
 
 		if (len <= pclustersize) {
 			if (!final)
@@ -246,10 +251,11 @@ static int vle_compress_one(struct erofs_inode *inode,
 				goto nocompression;
 		}
 
-		count = min(len, cfg.c_max_decompressed_extent_bytes);
+		ctx->e.length = min(len,
+				cfg.c_max_decompressed_extent_bytes);
 		ret = erofs_compress_destsize(h, ctx->queue + ctx->head,
-					      &count, dst, pclustersize,
-					      !(final && len == count));
+				&ctx->e.length, dst, pclustersize,
+				!(final && len == ctx->e.length));
 		if (ret <= 0) {
 			if (ret != -EAGAIN) {
 				erofs_err("failed to compress %s: %s",
@@ -267,17 +273,17 @@ nocompression:
 
 			if (ret < 0)
 				return ret;
-			count = ret;
+			ctx->e.length = ret;
 
 			/*
 			 * XXX: For now, we have to leave `ctx->compressedblks
 			 * = 1' since there is no way to generate compressed
 			 * indexes after the time that ztailpacking is decided.
 			 */
-			ctx->compressedblks = 1;
-			raw = true;
+			ctx->e.compressedblks = 1;
+			ctx->e.raw = true;
 		/* tailpcluster should be less than 1 block */
-		} else if (may_inline && len == count &&
+		} else if (may_inline && len == ctx->e.length &&
 			   ret < EROFS_BLKSIZ) {
 			if (ctx->clusterofs + len <= EROFS_BLKSIZ) {
 				inode->eof_tailraw = malloc(len);
@@ -292,19 +298,20 @@ nocompression:
 			ret = z_erofs_fill_inline_data(inode, dst, ret, false);
 			if (ret < 0)
 				return ret;
-			ctx->compressedblks = 1;
-			raw = false;
+			ctx->e.compressedblks = 1;
+			ctx->e.raw = false;
 		} else {
 			unsigned int tailused, padding;
 
-			if (may_inline && len == count)
+			if (may_inline && len == ctx->e.length)
 				tryrecompress_trailing(ctx->queue + ctx->head,
-						       &count, dst, &ret);
+						&ctx->e.length, dst, &ret);
 
 			tailused = ret & (EROFS_BLKSIZ - 1);
 			padding = 0;
-			ctx->compressedblks = BLK_ROUND_UP(ret);
-			DBG_BUGON(ctx->compressedblks * EROFS_BLKSIZ >= count);
+			ctx->e.compressedblks = BLK_ROUND_UP(ret);
+			DBG_BUGON(ctx->e.compressedblks * EROFS_BLKSIZ >=
+				  ctx->e.length);
 
 			/* zero out garbage trailing data for non-0padding */
 			if (!erofs_sb_has_lz4_0padding())
@@ -315,21 +322,22 @@ nocompression:
 
 			/* write compressed data */
 			erofs_dbg("Writing %u compressed data to %u of %u blocks",
-				  count, ctx->blkaddr, ctx->compressedblks);
+				  ctx->e.length, ctx->blkaddr,
+				  ctx->e.compressedblks);
 
 			ret = blk_write(dst - padding, ctx->blkaddr,
-					ctx->compressedblks);
+					ctx->e.compressedblks);
 			if (ret)
 				return ret;
-			raw = false;
+			ctx->e.raw = false;
 		}
+		/* write indexes for this pcluster */
+		ctx->e.blkaddr = ctx->blkaddr;
+		z_erofs_write_indexes(ctx);
 
-		ctx->head += count;
-		/* write compression indexes for this pcluster */
-		vle_write_indexes(ctx, count, raw);
-
-		ctx->blkaddr += ctx->compressedblks;
-		len -= count;
+		ctx->blkaddr += ctx->e.compressedblks;
+		ctx->head += ctx->e.length;
+		len -= ctx->e.length;
 
 		if (!final && ctx->head >= EROFS_CONFIG_COMPR_MAX_SZ) {
 			const unsigned int qh_aligned =
@@ -654,6 +662,7 @@ int erofs_write_compressed_file(struct erofs_inode *inode)
 	ctx.metacur = compressmeta + Z_EROFS_LEGACY_MAP_HEADER_SIZE;
 	ctx.head = ctx.tail = 0;
 	ctx.clusterofs = 0;
+	ctx.e.length = 0;
 	remaining = inode->i_size;
 
 	while (remaining) {
@@ -679,7 +688,7 @@ int erofs_write_compressed_file(struct erofs_inode *inode)
 	DBG_BUGON(compressed_blocks < !!inode->idata_size);
 	compressed_blocks -= !!inode->idata_size;
 
-	vle_write_indexes_final(&ctx);
+	z_erofs_write_indexes_final(&ctx);
 	legacymetasize = ctx.metacur - compressmeta;
 	/* estimate if data compression saves space or not */
 	if (compressed_blocks * EROFS_BLKSIZ + inode->idata_size +
-- 
2.27.0



* [PATCH 2/4] erofs-utils: lib: add rb-tree implementation
  2022-09-06 11:40 [PATCH 1/4] erofs-utils: introduce z_erofs_inmem_extent ZiyangZhang
@ 2022-09-06 11:40 ` ZiyangZhang
  2022-09-06 11:40 ` [PATCH 3/4] erofs-utils: fuse: introduce partial-referenced pclusters ZiyangZhang
  2022-09-06 11:40 ` [PATCH 4/4] erofs-utils: mkfs: introduce global compressed data deduplication ZiyangZhang
  2 siblings, 0 replies; 6+ messages in thread
From: ZiyangZhang @ 2022-09-06 11:40 UTC (permalink / raw)
  To: linux-erofs; +Cc: Ziyang Zhang

From: Ziyang Zhang <ZiyangZhang@linux.alibaba.com>

Introduce a simple rb-tree implementation to index the hash entries
used for deduplication.
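
For reference, a minimal usage sketch of the API (the caller-side
code is hypothetical; struct item, item_cmp() and new_item are
illustrative only):

	static int item_cmp(struct rb_tree *t, struct rb_node *a,
			    struct rb_node *b)
	{
		struct item *x = a->value, *y = b->value;

		return (x->key > y->key) - (x->key < y->key);
	}

	struct rb_tree *tree = rb_tree_create(item_cmp);
	struct item needle = { .key = 42 }, *match;

	rb_tree_insert(tree, new_item);		/* returns 1 on success */
	match = rb_tree_find(tree, &needle);	/* NULL if not found */
	rb_tree_dealloc(tree, rb_tree_node_dealloc_cb);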

Signed-off-by: Ziyang Zhang <ZiyangZhang@linux.alibaba.com>
---
 lib/Makefile.am |   2 +-
 lib/rb_tree.c   | 512 ++++++++++++++++++++++++++++++++++++++++++++++++
 lib/rb_tree.h   | 104 ++++++++++
 3 files changed, 617 insertions(+), 1 deletion(-)
 create mode 100644 lib/rb_tree.c
 create mode 100644 lib/rb_tree.h

diff --git a/lib/Makefile.am b/lib/Makefile.am
index 3fad357..b92adee 100644
--- a/lib/Makefile.am
+++ b/lib/Makefile.am
@@ -27,7 +27,7 @@ noinst_HEADERS = $(top_srcdir)/include/erofs_fs.h \
 noinst_HEADERS += compressor.h
 liberofs_la_SOURCES = config.c io.c cache.c super.c inode.c xattr.c exclude.c \
 		      namei.c data.c compress.c compressor.c zmap.c decompress.c \
-		      compress_hints.c hashmap.c sha256.c blobchunk.c dir.c
+		      compress_hints.c hashmap.c sha256.c blobchunk.c dir.c rb_tree.c
 liberofs_la_CFLAGS = -Wall -I$(top_srcdir)/include
 if ENABLE_LZ4
 liberofs_la_CFLAGS += ${LZ4_CFLAGS}
diff --git a/lib/rb_tree.c b/lib/rb_tree.c
new file mode 100644
index 0000000..28800a9
--- /dev/null
+++ b/lib/rb_tree.c
@@ -0,0 +1,512 @@
+// SPDX-License-Identifier: Unlicense
+//
+// Based on Julienne Walker's <http://eternallyconfuzzled.com/> rb_tree
+// implementation.
+//
+// Modified by Mirek Rusin <http://github.com/mirek/rb_tree>.
+//
+// This is free and unencumbered software released into the public domain.
+//
+// Anyone is free to copy, modify, publish, use, compile, sell, or
+// distribute this software, either in source code form or as a compiled
+// binary, for any purpose, commercial or non-commercial, and by any
+// means.
+//
+// In jurisdictions that recognize copyright laws, the author or authors
+// of this software dedicate any and all copyright interest in the
+// software to the public domain. We make this dedication for the benefit
+// of the public at large and to the detriment of our heirs and
+// successors. We intend this dedication to be an overt act of
+// relinquishment in perpetuity of all present and future rights to this
+// software under copyright law.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+// MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+// IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+// OTHER DEALINGS IN THE SOFTWARE.
+//
+// For more information, please refer to <http://unlicense.org/>
+//
+
+#include "rb_tree.h"
+
+// rb_node
+
+struct rb_node *
+rb_node_alloc () {
+    return malloc(sizeof(struct rb_node));
+}
+
+struct rb_node *
+rb_node_init (struct rb_node *self, void *value) {
+    if (self) {
+        self->red = 1;
+        self->link[0] = self->link[1] = NULL;
+        self->value = value;
+    }
+    return self;
+}
+
+struct rb_node *
+rb_node_create (void *value) {
+    return rb_node_init(rb_node_alloc(), value);
+}
+
+void
+rb_node_dealloc (struct rb_node *self) {
+    if (self) {
+        free(self);
+    }
+}
+
+static int
+rb_node_is_red (const struct rb_node *self) {
+    return self ? self->red : 0;
+}
+
+static struct rb_node *
+rb_node_rotate (struct rb_node *self, int dir) {
+    struct rb_node *result = NULL;
+    if (self) {
+        result = self->link[!dir];
+        self->link[!dir] = result->link[dir];
+        result->link[dir] = self;
+        self->red = 1;
+        result->red = 0;
+    }
+    return result;
+}
+
+static struct rb_node *
+rb_node_rotate2 (struct rb_node *self, int dir) {
+    struct rb_node *result = NULL;
+    if (self) {
+        self->link[!dir] = rb_node_rotate(self->link[!dir], !dir);
+        result = rb_node_rotate(self, dir);
+    }
+    return result;
+}
+
+// rb_tree - default callbacks
+
+int
+rb_tree_node_cmp_ptr_cb (struct rb_tree *self, struct rb_node *a, struct rb_node *b) {
+    return (a->value > b->value) - (a->value < b->value);
+}
+
+void
+rb_tree_node_dealloc_cb (struct rb_tree *self, struct rb_node *node) {
+    if (self) {
+        if (node) {
+            rb_node_dealloc(node);
+        }
+    }
+}
+
+// rb_tree
+
+struct rb_tree *
+rb_tree_alloc () {
+    return malloc(sizeof(struct rb_tree));
+}
+
+struct rb_tree *
+rb_tree_init (struct rb_tree *self, rb_tree_node_cmp_f node_cmp_cb) {
+    if (self) {
+        self->root = NULL;
+        self->size = 0;
+        self->cmp = node_cmp_cb ? node_cmp_cb : rb_tree_node_cmp_ptr_cb;
+    }
+    return self;
+}
+
+struct rb_tree *
+rb_tree_create (rb_tree_node_cmp_f node_cb) {
+    return rb_tree_init(rb_tree_alloc(), node_cb);
+}
+
+void
+rb_tree_dealloc (struct rb_tree *self, rb_tree_node_f node_cb) {
+    if (self) {
+        if (node_cb) {
+            struct rb_node *node = self->root;
+            struct rb_node *save = NULL;
+
+            // Rotate away the left links so that
+            // we can treat this like the destruction
+            // of a linked list
+            while (node) {
+                if (node->link[0] == NULL) {
+
+                    // No left links, just kill the node and move on
+                    save = node->link[1];
+                    node_cb(self, node);
+                    node = NULL;
+                } else {
+
+                    // Rotate away the left link and check again
+                    save = node->link[0];
+                    node->link[0] = save->link[1];
+                    save->link[1] = node;
+                }
+                node = save;
+            }
+        }
+        free(self);
+    }
+}
+
+int
+rb_tree_test (struct rb_tree *self, struct rb_node *root) {
+    int lh, rh;
+
+    if ( root == NULL )
+        return 1;
+    else {
+        struct rb_node *ln = root->link[0];
+        struct rb_node *rn = root->link[1];
+
+        /* Consecutive red links */
+        if (rb_node_is_red(root)) {
+            if (rb_node_is_red(ln) || rb_node_is_red(rn)) {
+                printf("Red violation");
+                return 0;
+            }
+        }
+
+        lh = rb_tree_test(self, ln);
+        rh = rb_tree_test(self, rn);
+
+        /* Invalid binary search tree */
+        if ( ( ln != NULL && self->cmp(self, ln, root) >= 0 )
+            || ( rn != NULL && self->cmp(self, rn, root) <= 0))
+        {
+            puts ( "Binary tree violation" );
+            return 0;
+        }
+
+        /* Black height mismatch */
+        if ( lh != 0 && rh != 0 && lh != rh ) {
+            puts ( "Black violation" );
+            return 0;
+        }
+
+        /* Only count black links */
+        if ( lh != 0 && rh != 0 )
+            return rb_node_is_red ( root ) ? lh : lh + 1;
+        else
+            return 0;
+    }
+}
+
+void *
+rb_tree_find(struct rb_tree *self, void *value) {
+    void *result = NULL;
+    if (self) {
+        struct rb_node node = { .value = value };
+        struct rb_node *it = self->root;
+        int cmp = 0;
+        while (it) {
+            if ((cmp = self->cmp(self, it, &node))) {
+
+                // If the tree supports duplicates, they should be
+                // chained to the right subtree for this to work
+                it = it->link[cmp < 0];
+            } else {
+                break;
+            }
+        }
+        result = it ? it->value : NULL;
+    }
+    return result;
+}
+
+// Creates (malloc'ates)
+int
+rb_tree_insert (struct rb_tree *self, void *value) {
+    return rb_tree_insert_node(self, rb_node_create(value));
+}
+
+// Returns 1 on success, 0 otherwise.
+int
+rb_tree_insert_node (struct rb_tree *self, struct rb_node *node) {
+    if (self && node) {
+        if (self->root == NULL) {
+            self->root = node;
+        } else {
+            struct rb_node head = { 0 }; // False tree root
+            struct rb_node *g, *t;       // Grandparent & parent
+            struct rb_node *p, *q;       // Iterator & parent
+            int dir = 0, last = 0;
+
+            // Set up our helpers
+            t = &head;
+            g = p = NULL;
+            q = t->link[1] = self->root;
+
+            // Search down the tree for a place to insert
+            while (1) {
+                if (q == NULL) {
+
+                    // Insert node at the first null link.
+                    p->link[dir] = q = node;
+                } else if (rb_node_is_red(q->link[0]) && rb_node_is_red(q->link[1])) {
+
+                    // Simple red violation: color flip
+                    q->red = 1;
+                    q->link[0]->red = 0;
+                    q->link[1]->red = 0;
+                }
+
+                if (rb_node_is_red(q) && rb_node_is_red(p)) {
+
+                    // Hard red violation: rotations necessary
+                    int dir2 = t->link[1] == g;
+                    if (q == p->link[last]) {
+                        t->link[dir2] = rb_node_rotate(g, !last);
+                    } else {
+                        t->link[dir2] = rb_node_rotate2(g, !last);
+                    }
+                }
+
+                // Stop working if we inserted a node. This
+                // check also disallows duplicates in the tree
+                if (self->cmp(self, q, node) == 0) {
+                    break;
+                }
+
+                last = dir;
+                dir = self->cmp(self, q, node) < 0;
+
+                // Move the helpers down
+                if (g != NULL) {
+                    t = g;
+                }
+
+                g = p, p = q;
+                q = q->link[dir];
+            }
+
+            // Update the root (it may be different)
+            self->root = head.link[1];
+        }
+
+        // Make the root black for simplified logic
+        self->root->red = 0;
+        ++self->size;
+        return 1;
+    }
+    return 0;
+}
+
+// Returns 1 if the value was removed, 0 otherwise. Optional node callback
+// can be provided to dealloc node and/or user data. Use rb_tree_node_dealloc
+// default callback to deallocate node created by rb_tree_insert(...).
+int
+rb_tree_remove_with_cb (struct rb_tree *self, void *value, rb_tree_node_f node_cb) {
+    if (self->root != NULL) {
+        struct rb_node head = {0}; // False tree root
+        struct rb_node node = { .value = value }; // Value wrapper node
+        struct rb_node *q, *p, *g; // Helpers
+        struct rb_node *f = NULL;  // Found item
+        int dir = 1;
+
+        // Set up our helpers
+        q = &head;
+        g = p = NULL;
+        q->link[1] = self->root;
+
+        // Search and push a red node down
+        // to fix red violations as we go
+        while (q->link[dir] != NULL) {
+            int last = dir;
+
+            // Move the helpers down
+            g = p, p = q;
+            q = q->link[dir];
+            dir = self->cmp(self, q, &node) < 0;
+
+            // Save the node with matching value and keep
+            // going; we'll do removal tasks at the end
+            if (self->cmp(self, q, &node) == 0) {
+                f = q;
+            }
+
+            // Push the red node down with rotations and color flips
+            if (!rb_node_is_red(q) && !rb_node_is_red(q->link[dir])) {
+                if (rb_node_is_red(q->link[!dir])) {
+                    p = p->link[last] = rb_node_rotate(q, dir);
+                } else if (!rb_node_is_red(q->link[!dir])) {
+                    struct rb_node *s = p->link[!last];
+                    if (s) {
+                        if (!rb_node_is_red(s->link[!last]) && !rb_node_is_red(s->link[last])) {
+
+                            // Color flip
+                            p->red = 0;
+                            s->red = 1;
+                            q->red = 1;
+                        } else {
+                            int dir2 = g->link[1] == p;
+                            if (rb_node_is_red(s->link[last])) {
+                                g->link[dir2] = rb_node_rotate2(p, last);
+                            } else if (rb_node_is_red(s->link[!last])) {
+                                g->link[dir2] = rb_node_rotate(p, last);
+                            }
+
+                            // Ensure correct coloring
+                            q->red = g->link[dir2]->red = 1;
+                            g->link[dir2]->link[0]->red = 0;
+                            g->link[dir2]->link[1]->red = 0;
+                        }
+                    }
+                }
+            }
+        }
+
+        // Replace and remove the saved node
+        if (f) {
+            void *tmp = f->value;
+            f->value = q->value;
+            q->value = tmp;
+
+            p->link[p->link[1] == q] = q->link[q->link[0] == NULL];
+
+            if (node_cb) {
+                node_cb(self, q);
+            }
+            q = NULL;
+        }
+
+        // Update the root (it may be different)
+        self->root = head.link[1];
+
+        // Make the root black for simplified logic
+        if (self->root != NULL) {
+            self->root->red = 0;
+        }
+
+        --self->size;
+    }
+    return 1;
+}
+
+int
+rb_tree_remove (struct rb_tree *self, void *value) {
+    int result = 0;
+    if (self) {
+        result = rb_tree_remove_with_cb(self, value, rb_tree_node_dealloc_cb);
+    }
+    return result;
+}
+
+size_t
+rb_tree_size (struct rb_tree *self) {
+    size_t result = 0;
+    if (self) {
+        result = self->size;
+    }
+    return result;
+}
+
+// rb_iter
+
+struct rb_iter *
+rb_iter_alloc () {
+    return malloc(sizeof(struct rb_iter));
+}
+
+struct rb_iter *
+rb_iter_init (struct rb_iter *self) {
+    if (self) {
+        self->tree = NULL;
+        self->node = NULL;
+        self->top = 0;
+    }
+    return self;
+}
+
+struct rb_iter *
+rb_iter_create () {
+    return rb_iter_init(rb_iter_alloc());
+}
+
+void
+rb_iter_dealloc (struct rb_iter *self) {
+    if (self) {
+        free(self);
+    }
+}
+
+// Internal function, init traversal object, dir determines whether
+// to begin traversal at the smallest or largest valued node.
+static void *
+rb_iter_start (struct rb_iter *self, struct rb_tree *tree, int dir) {
+    void *result = NULL;
+    if (self) {
+        self->tree = tree;
+        self->node = tree->root;
+        self->top = 0;
+
+        // Save the path for later traversal
+        if (self->node != NULL) {
+            while (self->node->link[dir] != NULL) {
+                self->path[self->top++] = self->node;
+                self->node = self->node->link[dir];
+            }
+        }
+
+        result = self->node == NULL ? NULL : self->node->value;
+    }
+    return result;
+}
+
+// Traverse a red black tree in the user-specified direction (0 asc, 1 desc)
+static void *
+rb_iter_move (struct rb_iter *self, int dir) {
+    if (self->node->link[dir] != NULL) {
+
+        // Continue down this branch
+        self->path[self->top++] = self->node;
+        self->node = self->node->link[dir];
+        while ( self->node->link[!dir] != NULL ) {
+            self->path[self->top++] = self->node;
+            self->node = self->node->link[!dir];
+        }
+    } else {
+
+        // Move to the next branch
+        struct rb_node *last = NULL;
+        do {
+            if (self->top == 0) {
+                self->node = NULL;
+                break;
+            }
+            last = self->node;
+            self->node = self->path[--self->top];
+        } while (last == self->node->link[dir]);
+    }
+    return self->node == NULL ? NULL : self->node->value;
+}
+
+void *
+rb_iter_first (struct rb_iter *self, struct rb_tree *tree) {
+    return rb_iter_start(self, tree, 0);
+}
+
+void *
+rb_iter_last (struct rb_iter *self, struct rb_tree *tree) {
+    return rb_iter_start(self, tree, 1);
+}
+
+void *
+rb_iter_next (struct rb_iter *self) {
+    return rb_iter_move(self, 1);
+}
+
+void *
+rb_iter_prev (struct rb_iter *self) {
+    return rb_iter_move(self, 0);
+}
diff --git a/lib/rb_tree.h b/lib/rb_tree.h
new file mode 100644
index 0000000..5b35c74
--- /dev/null
+++ b/lib/rb_tree.h
@@ -0,0 +1,104 @@
+/* SPDX-License-Identifier: Unlicense */
+//
+// Based on Julienne Walker's <http://eternallyconfuzzled.com/> rb_tree
+// implementation.
+//
+// Modified by Mirek Rusin <http://github.com/mirek/rb_tree>.
+//
+// This is free and unencumbered software released into the public domain.
+//
+// Anyone is free to copy, modify, publish, use, compile, sell, or
+// distribute this software, either in source code form or as a compiled
+// binary, for any purpose, commercial or non-commercial, and by any
+// means.
+//
+// In jurisdictions that recognize copyright laws, the author or authors
+// of this software dedicate any and all copyright interest in the
+// software to the public domain. We make this dedication for the benefit
+// of the public at large and to the detriment of our heirs and
+// successors. We intend this dedication to be an overt act of
+// relinquishment in perpetuity of all present and future rights to this
+// software under copyright law.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+// MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+// IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+// OTHER DEALINGS IN THE SOFTWARE.
+//
+// For more information, please refer to <http://unlicense.org/>
+//
+
+#ifndef __RB_TREE_H__
+#define __RB_TREE_H__ 1
+
+#include <stdio.h>
+#include <stdint.h>
+#include <stddef.h>
+#include <stdlib.h>
+
+#ifndef RB_ITER_MAX_HEIGHT
+#define RB_ITER_MAX_HEIGHT 64 // Tallest allowable tree to iterate
+#endif
+
+struct rb_node;
+struct rb_tree;
+
+typedef int  (*rb_tree_node_cmp_f) (struct rb_tree *self, struct rb_node *a, struct rb_node *b);
+typedef void (*rb_tree_node_f)     (struct rb_tree *self, struct rb_node *node);
+
+struct rb_node {
+    int             red;     // Color red (1), black (0)
+    struct rb_node *link[2]; // Link left [0] and right [1]
+    void           *value;   // User provided, used indirectly via rb_tree_node_cmp_f.
+};
+
+struct rb_tree {
+    struct rb_node    *root;
+    rb_tree_node_cmp_f cmp;
+    size_t             size;
+    void              *info; // User provided, not used by rb_tree.
+};
+
+struct rb_iter {
+    struct rb_tree *tree;
+    struct rb_node *node;                     // Current node
+    struct rb_node *path[RB_ITER_MAX_HEIGHT]; // Traversal path
+    size_t          top;                      // Top of stack
+    void           *info;                     // User provided, not used by rb_iter.
+};
+
+int             rb_tree_node_cmp_ptr_cb (struct rb_tree *self, struct rb_node *a, struct rb_node *b);
+void            rb_tree_node_dealloc_cb (struct rb_tree *self, struct rb_node *node);
+
+struct rb_node *rb_node_alloc           ();
+struct rb_node *rb_node_create          (void *value);
+struct rb_node *rb_node_init            (struct rb_node *self, void *value);
+void            rb_node_dealloc         (struct rb_node *self);
+
+struct rb_tree *rb_tree_alloc           ();
+struct rb_tree *rb_tree_create          (rb_tree_node_cmp_f cmp);
+struct rb_tree *rb_tree_init            (struct rb_tree *self, rb_tree_node_cmp_f cmp);
+void            rb_tree_dealloc         (struct rb_tree *self, rb_tree_node_f node_cb);
+void           *rb_tree_find            (struct rb_tree *self, void *value);
+int             rb_tree_insert          (struct rb_tree *self, void *value);
+int             rb_tree_remove          (struct rb_tree *self, void *value);
+size_t          rb_tree_size            (struct rb_tree *self);
+
+int             rb_tree_insert_node     (struct rb_tree *self, struct rb_node *node);
+int             rb_tree_remove_with_cb  (struct rb_tree *self, void *value, rb_tree_node_f node_cb);
+
+int             rb_tree_test            (struct rb_tree *self, struct rb_node *root);
+
+struct rb_iter *rb_iter_alloc           ();
+struct rb_iter *rb_iter_init            (struct rb_iter *self);
+struct rb_iter *rb_iter_create          ();
+void            rb_iter_dealloc         (struct rb_iter *self);
+void           *rb_iter_first           (struct rb_iter *self, struct rb_tree *tree);
+void           *rb_iter_last            (struct rb_iter *self, struct rb_tree *tree);
+void           *rb_iter_next            (struct rb_iter *self);
+void           *rb_iter_prev            (struct rb_iter *self);
+
+#endif
-- 
2.27.0



* [PATCH 3/4] erofs-utils: fuse: introduce partial-referenced pclusters
  2022-09-06 11:40 [PATCH 1/4] erofs-utils: introduce z_erofs_inmem_extent ZiyangZhang
  2022-09-06 11:40 ` [PATCH 2/4] erofs-utils: lib: add rb-tree implementation ZiyangZhang
@ 2022-09-06 11:40 ` ZiyangZhang
  2022-09-06 11:40 ` [PATCH 4/4] erofs-utils: mkfs: introduce global compressed data deduplication ZiyangZhang
  2 siblings, 0 replies; 6+ messages in thread
From: ZiyangZhang @ 2022-09-06 11:40 UTC (permalink / raw)
  To: linux-erofs; +Cc: Gao Xiang, Ziyang Zhang

From: Gao Xiang <hsiangkao@linux.alibaba.com>

Due to deduplication for compressed data, pclusters can now be
partially referenced by their prefixes.

Decompression algorithms need to know this in advance, otherwise they
will fail unexpectedly.
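
For example, if a later extent reuses only the first 8KiB of a 16KiB
pcluster generated earlier (sizes illustrative), that extent is marked
with EROFS_MAP_PARTIAL_REF, and readers then treat it like an
incompletely-mapped extent, as the lib/data.c hunk below does:

	partial = !(map.m_flags & EROFS_MAP_FULL_MAPPED) ||
		  (map.m_flags & EROFS_MAP_PARTIAL_REF);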

Signed-off-by: Gao Xiang <hsiangkao@linux.alibaba.com>
Signed-off-by: Ziyang Zhang <ZiyangZhang@linux.alibaba.com>
---
 include/erofs/internal.h | 3 +++
 include/erofs_fs.h       | 3 +++
 lib/data.c               | 3 ++-
 lib/zmap.c               | 6 ++++++
 4 files changed, 14 insertions(+), 1 deletion(-)

diff --git a/include/erofs/internal.h b/include/erofs/internal.h
index 2e0aae8..2ae7737 100644
--- a/include/erofs/internal.h
+++ b/include/erofs/internal.h
@@ -279,6 +279,7 @@ enum {
 	BH_Mapped,
 	BH_Encoded,
 	BH_FullMapped,
+	BH_Partialref,
 };
 
 /* Has a disk mapping */
@@ -289,6 +290,8 @@ enum {
 #define EROFS_MAP_ENCODED	(1 << BH_Encoded)
 /* The length of extent is full */
 #define EROFS_MAP_FULL_MAPPED	(1 << BH_FullMapped)
+/* The extent refers to partial compressed data */
+#define EROFS_MAP_PARTIAL_REF	(1 << BH_Partialref)
 
 struct erofs_map_blocks {
 	char mpage[EROFS_BLKSIZ];
diff --git a/include/erofs_fs.h b/include/erofs_fs.h
index 08f9761..09f672b 100644
--- a/include/erofs_fs.h
+++ b/include/erofs_fs.h
@@ -355,6 +355,9 @@ enum {
 #define Z_EROFS_VLE_DI_CLUSTER_TYPE_BITS        2
 #define Z_EROFS_VLE_DI_CLUSTER_TYPE_BIT         0
 
+/* (noncompact, HEAD) This pcluster refers to compressed data partially */
+#define Z_EROFS_VLE_DI_PARTIAL_REF		(1 << 15)
+
 /*
  * D0_CBLKCNT will be marked _only_ at the 1st non-head lcluster to store the
  * compressed block count of a compressed extent (in logical clusters, aka.
diff --git a/lib/data.c b/lib/data.c
index ad7b2cb..9b2ab6a 100644
--- a/lib/data.c
+++ b/lib/data.c
@@ -258,7 +258,8 @@ static int z_erofs_read_data(struct erofs_inode *inode, char *buffer,
 		} else {
 			DBG_BUGON(end != map.m_la + map.m_llen);
 			length = map.m_llen;
-			partial = !(map.m_flags & EROFS_MAP_FULL_MAPPED);
+			partial = !(map.m_flags & EROFS_MAP_FULL_MAPPED) ||
+				(map.m_flags & EROFS_MAP_PARTIAL_REF);
 		}
 
 		if (map.m_la < offset) {
diff --git a/lib/zmap.c b/lib/zmap.c
index abe0d31..56efaa7 100644
--- a/lib/zmap.c
+++ b/lib/zmap.c
@@ -99,6 +99,7 @@ struct z_erofs_maprecorder {
 	u16 delta[2];
 	erofs_blk_t pblk, compressedlcs;
 	erofs_off_t nextpackoff;
+	bool partialref;
 };
 
 static int z_erofs_reload_indexes(struct z_erofs_maprecorder *m,
@@ -161,6 +162,9 @@ static int legacy_load_cluster_from_disk(struct z_erofs_maprecorder *m,
 		break;
 	case Z_EROFS_VLE_CLUSTER_TYPE_PLAIN:
 	case Z_EROFS_VLE_CLUSTER_TYPE_HEAD:
+		if (advise & Z_EROFS_VLE_DI_PARTIAL_REF)
+			m->partialref = true;
+
 		m->clusterofs = le16_to_cpu(di->di_clusterofs);
 		m->pblk = le32_to_cpu(di->di_u.blkaddr);
 		break;
@@ -602,6 +606,8 @@ static int z_erofs_do_map_blocks(struct erofs_inode *vi,
 		goto out;
 	}
 
+	if (m.partialref)
+		map->m_flags |= EROFS_MAP_PARTIAL_REF;
 	map->m_llen = end - map->m_la;
 	if (flags & EROFS_GET_BLOCKS_FINDTAIL)
 		vi->z_tailextent_headlcn = m.lcn;
-- 
2.27.0



* [PATCH 4/4] erofs-utils: mkfs: introduce global compressed data deduplication
  2022-09-06 11:40 [PATCH 1/4] erofs-utils: introduce z_erofs_inmem_extent ZiyangZhang
  2022-09-06 11:40 ` [PATCH 2/4] erofs-utils: lib: add rb-tree implementation ZiyangZhang
  2022-09-06 11:40 ` [PATCH 3/4] erofs-utils: fuse: introduce partial-referenced pclusters ZiyangZhang
@ 2022-09-06 11:40 ` ZiyangZhang
  2022-09-07  5:55   ` Gao Xiang
  2 siblings, 1 reply; 6+ messages in thread
From: ZiyangZhang @ 2022-09-06 11:40 UTC (permalink / raw)
  To: linux-erofs; +Cc: Gao Xiang, Ziyang Zhang

From: Ziyang Zhang <ZiyangZhang@linux.alibaba.com>

This patch introduces global compressed data deduplication to
reuse potential prefixes for each pcluster.

It also uses a rolling hash and shortens the previous compressed
extent in order to explore more possibilities for deduplication.
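
As a quick sketch, the helpers added in lib/rolling_hash.h compose as
follows when sliding a window of size `w' forward over a buffer (buf,
len and the loop are illustrative only; the dedupe code itself walks
backward and therefore initializes the hash with backwards == true):

	long long rm = erofs_rollinghash_calc_rm(w);
	long long h = erofs_rolling_hash_init(buf, w, false);

	for (i = 0; i + w < len; ++i)
		/* drop buf[i] from the window, take in buf[i + w] */
		h = erofs_rolling_hash_advance(h, rm, buf[i], buf[i + w]);

Once applied, the feature is switched on with the `-Ededupe' extended
option, e.g. `mkfs.erofs -zlz4hc,9 -Ededupe foo.erofs.img foo/' (the
image and directory names are illustrative).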

Co-developed-by: Gao Xiang <hsiangkao@linux.alibaba.com>
Signed-off-by: Ziyang Zhang <ZiyangZhang@linux.alibaba.com>
---
 include/erofs/config.h |   1 +
 include/erofs/dedupe.h |  39 +++++++++
 lib/Makefile.am        |   4 +-
 lib/compress.c         | 112 +++++++++++++++++++++-----
 lib/dedupe.c           | 174 +++++++++++++++++++++++++++++++++++++++++
 lib/rolling_hash.h     |  60 ++++++++++++++
 mkfs/main.c            |  19 +++++
 7 files changed, 387 insertions(+), 22 deletions(-)
 create mode 100644 include/erofs/dedupe.h
 create mode 100644 lib/dedupe.c
 create mode 100644 lib/rolling_hash.h

diff --git a/include/erofs/config.h b/include/erofs/config.h
index 539d813..6f94183 100644
--- a/include/erofs/config.h
+++ b/include/erofs/config.h
@@ -44,6 +44,7 @@ struct erofs_configure {
 	char c_chunkbits;
 	bool c_noinline_data;
 	bool c_ztailpacking;
+	bool c_dedupe;
 	bool c_ignore_mtime;
 	bool c_showprogress;
 
diff --git a/include/erofs/dedupe.h b/include/erofs/dedupe.h
new file mode 100644
index 0000000..153bd4c
--- /dev/null
+++ b/include/erofs/dedupe.h
@@ -0,0 +1,39 @@
+/* SPDX-License-Identifier: GPL-2.0+ OR Apache-2.0 */
+/*
+ * Copyright (C) 2022 Alibaba Cloud
+ */
+#ifndef __EROFS_DEDUPE_H
+#define __EROFS_DEDUPE_H
+
+#ifdef __cplusplus
+extern "C"
+{
+#endif
+
+#include "internal.h"
+
+struct z_erofs_inmem_extent {
+	erofs_blk_t blkaddr;
+	unsigned int compressedblks;
+	unsigned int length;
+	bool raw, partial;
+};
+
+struct z_erofs_dedupe_ctx {
+	u8		*start, *end;
+	u8		*cur;
+	struct z_erofs_inmem_extent	e;
+};
+
+int z_erofs_dedupe_match(struct z_erofs_dedupe_ctx *ctx);
+int z_erofs_dedupe_insert(struct z_erofs_inmem_extent *e,
+			  void *original_data);
+void z_erofs_dedupe_commit(bool drop);
+int z_erofs_dedupe_init(unsigned int wsiz);
+void z_erofs_dedupe_exit(void);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/lib/Makefile.am b/lib/Makefile.am
index b92adee..8bf29d9 100644
--- a/lib/Makefile.am
+++ b/lib/Makefile.am
@@ -27,7 +27,9 @@ noinst_HEADERS = $(top_srcdir)/include/erofs_fs.h \
 noinst_HEADERS += compressor.h
 liberofs_la_SOURCES = config.c io.c cache.c super.c inode.c xattr.c exclude.c \
 		      namei.c data.c compress.c compressor.c zmap.c decompress.c \
-		      compress_hints.c hashmap.c sha256.c blobchunk.c dir.c rb_tree.c
+		      compress_hints.c hashmap.c sha256.c blobchunk.c dir.c rb_tree.c \
+		      dedupe.c
+
 liberofs_la_CFLAGS = -Wall -I$(top_srcdir)/include
 if ENABLE_LZ4
 liberofs_la_CFLAGS += ${LZ4_CFLAGS}
diff --git a/lib/compress.c b/lib/compress.c
index 3c1d9db..bdb6e78 100644
--- a/lib/compress.c
+++ b/lib/compress.c
@@ -15,6 +15,7 @@
 #include "erofs/io.h"
 #include "erofs/cache.h"
 #include "erofs/compress.h"
+#include "erofs/dedupe.h"
 #include "compressor.h"
 #include "erofs/block_list.h"
 #include "erofs/compress_hints.h"
@@ -22,13 +23,6 @@
 static struct erofs_compress compresshandle;
 static unsigned int algorithmtype[2];
 
-struct z_erofs_inmem_extent {
-	erofs_blk_t blkaddr;
-	unsigned int compressedblks;
-	unsigned int length;
-	bool raw;
-};
-
 struct z_erofs_vle_compress_ctx {
 	u8 queue[EROFS_CONFIG_COMPR_MAX_SZ * 2];
 	struct z_erofs_inmem_extent e;	/* (lookahead) extent */
@@ -72,9 +66,12 @@ static void z_erofs_write_indexes(struct z_erofs_vle_compress_ctx *ctx)
 	unsigned int count = ctx->e.length;
 	unsigned int d0 = 0, d1 = (clusterofs + count) / EROFS_BLKSIZ;
 	struct z_erofs_vle_decompressed_index di;
-	unsigned int type;
-	__le16 advise;
+	unsigned int type, advise;
 
+	if (!count)
+		return;
+
+	ctx->e.length = 0;	/* mark as written first */
 	di.di_clusterofs = cpu_to_le16(ctx->clusterofs);
 
 	/* whether the tail-end (un)compressed block or not */
@@ -84,11 +81,12 @@ static void z_erofs_write_indexes(struct z_erofs_vle_compress_ctx *ctx)
 		 * is well-compressed for !ztailpacking cases.
 		 */
 		DBG_BUGON(!ctx->e.raw && !cfg.c_ztailpacking);
+		DBG_BUGON(ctx->e.partial);
 		type = ctx->e.raw ? Z_EROFS_VLE_CLUSTER_TYPE_PLAIN :
 			Z_EROFS_VLE_CLUSTER_TYPE_HEAD;
-		advise = cpu_to_le16(type << Z_EROFS_VLE_DI_CLUSTER_TYPE_BIT);
+		advise = type << Z_EROFS_VLE_DI_CLUSTER_TYPE_BIT;
+		di.di_advise = cpu_to_le16(advise);
 
-		di.di_advise = advise;
 		di.di_u.blkaddr = cpu_to_le32(ctx->e.blkaddr);
 		memcpy(ctx->metacur, &di, sizeof(di));
 		ctx->metacur += sizeof(di);
@@ -99,6 +97,7 @@ static void z_erofs_write_indexes(struct z_erofs_vle_compress_ctx *ctx)
 	}
 
 	do {
+		advise = 0;
 		/* XXX: big pcluster feature should be per-inode */
 		if (d0 == 1 && erofs_sb_has_big_pcluster()) {
 			type = Z_EROFS_VLE_CLUSTER_TYPE_NONHEAD;
@@ -129,9 +128,14 @@ static void z_erofs_write_indexes(struct z_erofs_vle_compress_ctx *ctx)
 			type = ctx->e.raw ? Z_EROFS_VLE_CLUSTER_TYPE_PLAIN :
 				Z_EROFS_VLE_CLUSTER_TYPE_HEAD;
 			di.di_u.blkaddr = cpu_to_le32(ctx->e.blkaddr);
+
+			if (ctx->e.partial) {
+				DBG_BUGON(ctx->e.raw);
+				advise |= Z_EROFS_VLE_DI_PARTIAL_REF;
+			}
 		}
-		advise = cpu_to_le16(type << Z_EROFS_VLE_DI_CLUSTER_TYPE_BIT);
-		di.di_advise = advise;
+		advise |= type << Z_EROFS_VLE_DI_CLUSTER_TYPE_BIT;
+		di.di_advise = cpu_to_le16(advise);
 
 		memcpy(ctx->metacur, &di, sizeof(di));
 		ctx->metacur += sizeof(di);
@@ -146,6 +150,64 @@ static void z_erofs_write_indexes(struct z_erofs_vle_compress_ctx *ctx)
 	ctx->clusterofs = clusterofs + count;
 }
 
+static int z_erofs_compress_dedupe(struct erofs_inode *inode,
+				   struct z_erofs_vle_compress_ctx *ctx,
+				   unsigned int *len)
+{
+	int ret = 0;
+
+	do {
+		struct z_erofs_dedupe_ctx dctx = {
+			.start = ctx->queue + ctx->head -
+				(ctx->e.length < EROFS_BLKSIZ ? 0 :
+					ctx->e.length - EROFS_BLKSIZ),
+			.end = ctx->queue + ctx->head + *len,
+			.cur = ctx->queue + ctx->head,
+		};
+		int delta;
+
+		if (z_erofs_dedupe_match(&dctx))
+			break;
+
+		/* fall back to noncompact indexes for deduplication */
+		inode->z_advise &= ~Z_EROFS_ADVISE_COMPACTED_2B;
+		inode->datalayout = EROFS_INODE_FLAT_COMPRESSION_LEGACY;
+
+		delta = ctx->queue + ctx->head - dctx.cur;
+		if (delta) {
+			DBG_BUGON(delta < 0);
+			DBG_BUGON(!ctx->e.length);
+			ctx->e.partial = true;
+			ctx->e.length -= delta;
+		}
+
+		erofs_dbg("Dedupe %u %scompressed data (delta %d) to %u of %u blocks",
+			  dctx.e.length, dctx.e.raw ? "un" : "",
+			  delta, dctx.e.blkaddr, dctx.e.compressedblks);
+		z_erofs_write_indexes(ctx);
+		ctx->e = dctx.e;
+		ctx->head += dctx.e.length - delta;
+		DBG_BUGON(*len < dctx.e.length - delta);
+		*len -= dctx.e.length - delta;
+
+		if (ctx->head >= EROFS_CONFIG_COMPR_MAX_SZ) {
+			const unsigned int qh_aligned =
+				round_down(ctx->head, EROFS_BLKSIZ);
+			const unsigned int qh_after = ctx->head - qh_aligned;
+
+			memmove(ctx->queue, ctx->queue + qh_aligned,
+				*len + qh_after);
+			ctx->head = qh_after;
+			ctx->tail = qh_after + *len;
+			ret = -EAGAIN;
+			break;
+		}
+	} while (*len);
+
+	z_erofs_write_indexes(ctx);
+	return ret;
+}
+
 static int write_uncompressed_extent(struct z_erofs_vle_compress_ctx *ctx,
 				     unsigned int *len, char *dst)
 {
@@ -244,8 +306,11 @@ static int vle_compress_one(struct erofs_inode *inode,
 			z_erofs_get_max_pclusterblks(inode) * EROFS_BLKSIZ;
 		bool may_inline = (cfg.c_ztailpacking && final);
 
+		if (z_erofs_compress_dedupe(inode, ctx, &len) && !final)
+			break;
+
 		if (len <= pclustersize) {
-			if (!final)
+			if (!final || !len)
 				break;
 			if (!may_inline && len <= EROFS_BLKSIZ)
 				goto nocompression;
@@ -263,13 +328,15 @@ static int vle_compress_one(struct erofs_inode *inode,
 					  erofs_strerror(ret));
 			}
 
-			if (may_inline && len < EROFS_BLKSIZ)
+			if (may_inline && len < EROFS_BLKSIZ) {
 				ret = z_erofs_fill_inline_data(inode,
 						ctx->queue + ctx->head,
 						len, true);
-			else
+			} else {
+				may_inline = false;
 nocompression:
 				ret = write_uncompressed_extent(ctx, &len, dst);
+			}
 
 			if (ret < 0)
 				return ret;
@@ -330,11 +397,13 @@ nocompression:
 			if (ret)
 				return ret;
 			ctx->e.raw = false;
+			may_inline = false;
 		}
-		/* write indexes for this pcluster */
+		ctx->e.partial = false;
 		ctx->e.blkaddr = ctx->blkaddr;
-		z_erofs_write_indexes(ctx);
-
+		if (!may_inline)
+			(void)z_erofs_dedupe_insert(&ctx->e,
+						    ctx->queue + ctx->head);
 		ctx->blkaddr += ctx->e.compressedblks;
 		ctx->head += ctx->e.length;
 		len -= ctx->e.length;
@@ -688,22 +757,23 @@ int erofs_write_compressed_file(struct erofs_inode *inode)
 	DBG_BUGON(compressed_blocks < !!inode->idata_size);
 	compressed_blocks -= !!inode->idata_size;
 
+	z_erofs_write_indexes(&ctx);
 	z_erofs_write_indexes_final(&ctx);
 	legacymetasize = ctx.metacur - compressmeta;
 	/* estimate if data compression saves space or not */
 	if (compressed_blocks * EROFS_BLKSIZ + inode->idata_size +
 	    legacymetasize >= inode->i_size) {
+		z_erofs_dedupe_commit(true);
 		ret = -ENOSPC;
 		goto err_free_idata;
 	}
+	z_erofs_dedupe_commit(false);
 	z_erofs_write_mapheader(inode, compressmeta);
 
 	close(fd);
 	if (compressed_blocks) {
 		ret = erofs_bh_balloon(bh, blknr_to_addr(compressed_blocks));
 		DBG_BUGON(ret != EROFS_BLKSIZ);
-	} else {
-		DBG_BUGON(!inode->idata_size);
 	}
 
 	erofs_info("compressed %s (%llu bytes) into %u blocks",
diff --git a/lib/dedupe.c b/lib/dedupe.c
new file mode 100644
index 0000000..c53a64e
--- /dev/null
+++ b/lib/dedupe.c
@@ -0,0 +1,174 @@
+/* SPDX-License-Identifier: GPL-2.0+ OR Apache-2.0 */
+/*
+ * Copyright (C) 2022 Alibaba Cloud
+ */
+#include "erofs/dedupe.h"
+#include "erofs/print.h"
+#include "rb_tree.h"
+#include "rolling_hash.h"
+
+void erofs_sha256(const unsigned char *in, unsigned long in_size,
+		  unsigned char out[32]);
+
+static unsigned int window_size, rollinghash_rm;
+static struct rb_tree *dedupe_tree, *dedupe_subtree;
+
+struct z_erofs_dedupe_item {
+	long long	hash;
+	u8		prefix_sha256[32];
+
+	erofs_blk_t	compressed_blkaddr;
+	unsigned int	compressed_blks;
+
+	int		original_length;
+	bool		partial;
+	u8		extra_data[];
+};
+
+static int z_erofs_dedupe_rbtree_cmp(struct rb_tree *self,
+		struct rb_node *node_a, struct rb_node *node_b)
+{
+	struct z_erofs_dedupe_item *e_a = node_a->value;
+	struct z_erofs_dedupe_item *e_b = node_b->value;
+
+	return (e_a->hash > e_b->hash) - (e_a->hash < e_b->hash);
+}
+
+int z_erofs_dedupe_match(struct z_erofs_dedupe_ctx *ctx)
+{
+	struct z_erofs_dedupe_item e_find;
+	u8 *cur;
+	bool initial = true;
+
+	if (!dedupe_tree)
+		return -ENOENT;
+
+	if (ctx->cur > ctx->end - window_size)
+		cur = ctx->end - window_size;
+	else
+		cur = ctx->cur;
+
+	/* move backward byte-by-byte */
+	for (; cur >= ctx->start; --cur) {
+		struct z_erofs_dedupe_item *e;
+		unsigned int extra;
+		u8 sha256[32];
+
+		if (initial) {
+			/* initial try */
+			e_find.hash = erofs_rolling_hash_init(cur, window_size, true);
+			initial = false;
+		} else {
+			e_find.hash = erofs_rolling_hash_advance(e_find.hash,
+				rollinghash_rm, cur[window_size], cur[0]);
+		}
+
+		e = rb_tree_find(dedupe_tree, &e_find);
+		if (!e) {
+			e = rb_tree_find(dedupe_subtree, &e_find);
+			if (!e)
+				continue;
+		}
+
+		erofs_sha256(cur, window_size, sha256);
+		if (memcmp(sha256, e->prefix_sha256, sizeof(sha256)))
+			continue;
+
+		extra = 0;
+		while (cur + window_size + extra < ctx->end &&
+		       window_size + extra < e->original_length &&
+		       e->extra_data[extra] == cur[window_size + extra])
+			++extra;
+
+		if (window_size + extra <= ctx->cur - cur)
+			continue;
+		ctx->cur = cur;
+		ctx->e.length = window_size + extra;
+		ctx->e.partial = e->partial ||
+			(window_size + extra < e->original_length);
+		ctx->e.blkaddr = e->compressed_blkaddr;
+		ctx->e.compressedblks = e->compressed_blks;
+		return 0;
+	}
+	return -ENOENT;
+}
+
+int z_erofs_dedupe_insert(struct z_erofs_inmem_extent *e,
+			  void *original_data)
+{
+	struct z_erofs_dedupe_item *di;
+
+	if (e->length < window_size)
+		return 0;
+
+	di = malloc(sizeof(*di) + e->length - window_size);
+	if (!di)
+		return -ENOMEM;
+
+	di->original_length = e->length;
+	erofs_sha256(original_data, window_size, di->prefix_sha256);
+	di->hash = erofs_rolling_hash_init(original_data,
+			window_size, true);
+	memcpy(di->extra_data, original_data + window_size,
+	       e->length - window_size);
+	di->compressed_blkaddr = e->blkaddr;
+	di->compressed_blks = e->compressedblks;
+	di->partial = e->partial;
+
+	/* with the same rolling hash */
+	if (!rb_tree_insert(dedupe_subtree, di))
+		free(di);
+	return 0;
+}
+
+static void z_erofs_dedupe_node_free_cb(struct rb_tree *self,
+					struct rb_node *node)
+{
+	free(node->value);
+	rb_tree_node_dealloc_cb(self, node);
+}
+
+void z_erofs_dedupe_commit(bool drop)
+{
+	if (!dedupe_subtree)
+		return;
+	if (!drop) {
+		struct rb_iter iter;
+		struct z_erofs_dedupe_item *di;
+
+		di = rb_iter_first(&iter, dedupe_subtree);
+		while (di) {
+			if (!rb_tree_insert(dedupe_tree, di))
+				DBG_BUGON(1);
+			di = rb_iter_next(&iter);
+		}
+		/*rb_iter_dealloc(iter);*/
+		rb_tree_dealloc(dedupe_subtree, rb_tree_node_dealloc_cb);
+	} else {
+		rb_tree_dealloc(dedupe_subtree, z_erofs_dedupe_node_free_cb);
+	}
+	dedupe_subtree = rb_tree_create(z_erofs_dedupe_rbtree_cmp);
+}
+
+int z_erofs_dedupe_init(unsigned int wsiz)
+{
+	dedupe_tree = rb_tree_create(z_erofs_dedupe_rbtree_cmp);
+	if (!dedupe_tree)
+		return -ENOMEM;
+
+	dedupe_subtree = rb_tree_create(z_erofs_dedupe_rbtree_cmp);
+	if (!dedupe_subtree) {
+		rb_tree_dealloc(dedupe_tree, NULL);
+		return -ENOMEM;
+	}
+	window_size = wsiz;
+	rollinghash_rm = erofs_rollinghash_calc_rm(window_size);
+	return 0;
+}
+
+void z_erofs_dedupe_exit(void)
+{
+	z_erofs_dedupe_commit(true);
+	rb_tree_dealloc(dedupe_subtree, NULL);
+	rb_tree_dealloc(dedupe_tree, z_erofs_dedupe_node_free_cb);
+}
diff --git a/lib/rolling_hash.h b/lib/rolling_hash.h
new file mode 100644
index 0000000..448db34
--- /dev/null
+++ b/lib/rolling_hash.h
@@ -0,0 +1,60 @@
+/* SPDX-License-Identifier: GPL-2.0+ OR Apache-2.0 */
+/*
+ * Copyright (C) 2022 Alibaba Cloud
+ */
+#ifndef __ROLLING_HASH_H__
+#define __ROLLING_HASH_H__
+
+#include <erofs/defs.h>
+
+#define PRIME_NUMBER	4294967295LL
+#define RADIX		256
+
+static inline long long erofs_rolling_hash_init(u8 *input,
+						int len, bool backwards)
+{
+	long long hash = 0;
+
+	if (!backwards) {
+		int i;
+
+		for (i = 0; i < len; ++i)
+			hash = (RADIX * hash + input[i]) % PRIME_NUMBER;
+	} else {
+		while (len)
+			hash = (RADIX * hash + input[--len]) % PRIME_NUMBER;
+	}
+	return hash;
+}
+
+/* RM = R ^ (M-1) % Q */
+/*
+ * NOTE: the value of "hash" could be negative so we cannot use unsigned types
+ * for "hash"; "long long" is used here and PRIME_NUMBER can be ULONG_MAX
+ */
+static inline long long erofs_rolling_hash_advance(long long old_hash,
+						   unsigned long long RM,
+						   u8 to_remove, u8 to_add)
+{
+	long long hash = old_hash;
+	long long to_remove_val = (to_remove * RM) % PRIME_NUMBER;
+
+	hash = RADIX * (old_hash - to_remove_val) % PRIME_NUMBER;
+	hash = (hash + to_add) % PRIME_NUMBER;
+
+	/* We might get negative value of hash, converting it to positive */
+	if (hash < 0)
+		hash += PRIME_NUMBER;
+	return hash;
+}
+
+static inline long long erofs_rollinghash_calc_rm(int window_size)
+{
+	int i;
+	long long RM = 1;
+
+	for (i = 0; i < window_size - 1; ++i)
+		RM = (RM * RADIX) % PRIME_NUMBER;
+	return RM;
+}
+#endif
diff --git a/mkfs/main.c b/mkfs/main.c
index b969b35..deba5ec 100644
--- a/mkfs/main.c
+++ b/mkfs/main.c
@@ -18,6 +18,7 @@
 #include "erofs/inode.h"
 #include "erofs/io.h"
 #include "erofs/compress.h"
+#include "erofs/dedupe.h"
 #include "erofs/xattr.h"
 #include "erofs/exclude.h"
 #include "erofs/block_list.h"
@@ -202,6 +203,12 @@ static int parse_extended_opts(const char *opts)
 				return -EINVAL;
 			cfg.c_ztailpacking = true;
 		}
+
+		if (MATCH_EXTENTED_OPT("dedupe", token, keylen)) {
+			if (vallen)
+				return -EINVAL;
+			cfg.c_dedupe = true;
+		}
 	}
 	return 0;
 }
@@ -670,6 +677,8 @@ int main(int argc, char **argv)
 		erofs_warn("EXPERIMENTAL chunked file feature in use. Use at your own risk!");
 	if (cfg.c_ztailpacking)
 		erofs_warn("EXPERIMENTAL compressed inline data feature in use. Use at your own risk!");
+	if (cfg.c_dedupe)
+		erofs_warn("EXPERIMENTAL data deduplication feature in use. Use at your own risk!");
 	erofs_set_fs_root(cfg.c_src_path);
 #ifndef NDEBUG
 	if (cfg.c_random_pclusterblks)
@@ -696,6 +705,15 @@ int main(int argc, char **argv)
 		goto exit;
 	}
 
+	if (cfg.c_dedupe) {
+		err = z_erofs_dedupe_init(EROFS_BLKSIZ);
+		if (err) {
+			erofs_err("failed to initialize deduplication: %s",
+				  erofs_strerror(err));
+			goto exit;
+		}
+	}
+
 	err = z_erofs_compress_init(sb_bh);
 	if (err) {
 		erofs_err("failed to initialize compressor: %s",
@@ -753,6 +771,7 @@ int main(int argc, char **argv)
 		err = erofs_mkfs_superblock_csum_set();
 exit:
 	z_erofs_compress_exit();
+	z_erofs_dedupe_exit();
 #ifdef WITH_ANDROID
 	erofs_droid_blocklist_fclose();
 #endif
-- 
2.27.0



* Re: [PATCH 4/4] erofs-utils: mkfs: introduce global compressed data deduplication
  2022-09-06 11:40 ` [PATCH 4/4] erofs-utils: mkfs: introduce global compressed data deduplication ZiyangZhang
@ 2022-09-07  5:55   ` Gao Xiang
  2022-09-08  0:56     ` Gao Xiang
  0 siblings, 1 reply; 6+ messages in thread
From: Gao Xiang @ 2022-09-07  5:55 UTC (permalink / raw)
  To: ZiyangZhang; +Cc: linux-erofs

On Tue, Sep 06, 2022 at 07:40:57PM +0800, ZiyangZhang wrote:
> From: Ziyang Zhang <ZiyangZhang@linux.alibaba.com>
> 
> This patch introduces global compressed data deduplication to
> reuse potential prefixes for each pcluster.
> 
> It also uses a rolling hash and shortens the previous compressed
> extent in order to explore more possibilities for deduplication.
> 
> Co-developed-by: Gao Xiang <hsiangkao@linux.alibaba.com>
> Signed-off-by: Ziyang Zhang <ZiyangZhang@linux.alibaba.com>

Some preliminary numbers:
Image				Fs	Type						Size
system.raven.87e115a1           erofs   uncompressed                                    910082048
system.raven.87e115a1           erofs   4k pcluster + lz4hc,9 + ztailpacking            584970240       -35.7% off
system.raven.87e115a1           erofs   4k pcluster + lz4hc,9 + ztailpacking + dedupe   569376768       -37.4% off

linux-5.10 + linux-5.10.87      erofs   uncompressed                                     1943691264
linux-5.10 + linux-5.10.87      erofs   4k pcluster + lz4hc,9 + ztailpacking             661987328      -65.9% off
linux-5.10 + linux-5.10.87      erofs   4k pcluster + lz4hc,9 + ztailpacking + dedupe    490295296      -74.8% off

linux-5.10.87                   erofs   4k pcluster + lz4hc,9                            331292672
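
(The savings figures are 1 - compressed/uncompressed; e.g.
1 - 661987328/1943691264 ≈ 65.9% for the dual linux-5.10 tree.)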

One observation is that since tailpacking pclusters don't have a
blkaddr, data belonging to tailpacking pclusters cannot be deduped.

On the other hand, this can later work together with the `fragment'
feature to minimize image sizes.


Attached is a fix for uncompressed pclusters:

diff --git a/lib/dedupe.c b/lib/dedupe.c
index c53a64edfc8d..c382303e2ceb 100644
--- a/lib/dedupe.c
+++ b/lib/dedupe.c
@@ -21,7 +21,7 @@ struct z_erofs_dedupe_item {
 	unsigned int	compressed_blks;
 
 	int		original_length;
-	bool		partial;
+	bool		partial, raw;
 	u8		extra_data[];
 };
 
@@ -86,6 +86,7 @@ int z_erofs_dedupe_match(struct z_erofs_dedupe_ctx *ctx)
 		ctx->e.length = window_size + extra;
 		ctx->e.partial = e->partial ||
 			(window_size + extra < e->original_length);
+		ctx->e.raw = e->raw;
 		ctx->e.blkaddr = e->compressed_blkaddr;
 		ctx->e.compressedblks = e->compressed_blks;
 		return 0;
@@ -114,6 +115,7 @@ int z_erofs_dedupe_insert(struct z_erofs_inmem_extent *e,
 	di->compressed_blkaddr = e->blkaddr;
 	di->compressed_blks = e->compressedblks;
 	di->partial = e->partial;
+	di->raw = e->raw;
 
 	/* with the same rolling hash */
 	if (!rb_tree_insert(dedupe_subtree, di))
-- 
2.30.2



* Re: [PATCH 4/4] erofs-utils: mkfs: introduce global compressed data deduplication
  2022-09-07  5:55   ` Gao Xiang
@ 2022-09-08  0:56     ` Gao Xiang
  0 siblings, 0 replies; 6+ messages in thread
From: Gao Xiang @ 2022-09-08  0:56 UTC (permalink / raw)
  To: Gao Xiang; +Cc: linux-erofs, ZiyangZhang

On Wed, Sep 07, 2022 at 01:55:50PM +0800, Gao Xiang wrote:
> On Tue, Sep 06, 2022 at 07:40:57PM +0800, ZiyangZhang wrote:
> > From: Ziyang Zhang <ZiyangZhang@linux.alibaba.com>
> > 
> > This patch introduces global compressed data deduplication to
> > reuse potential prefixes for each pcluster.
> > 
> > It also uses a rolling hash and shortens the previous compressed
> > extent in order to explore more possibilities for deduplication.
> > 
> > Co-developed-by: Gao Xiang <hsiangkao@linux.alibaba.com>
> > Signed-off-by: Ziyang Zhang <ZiyangZhang@linux.alibaba.com>
> 
> Some preliminary numbers:
> Image				Fs	Type						Size
> system.raven.87e115a1           erofs   uncompressed                                    910082048
> system.raven.87e115a1           erofs   4k pcluster + lz4hc,9 + ztailpacking            584970240       -35.7% off
> system.raven.87e115a1           erofs   4k pcluster + lz4hc,9 + ztailpacking + dedupe   569376768       -37.4% off
> 
> linux-5.10 + linux-5.10.87      erofs   uncompressed                                     1943691264
> linux-5.10 + linux-5.10.87      erofs   4k pcluster + lz4hc,9 + ztailpacking             661987328      -65.9% off
> linux-5.10 + linux-5.10.87      erofs   4k pcluster + lz4hc,9 + ztailpacking + dedupe    490295296      -74.8% off
> 
> linux-5.10.87                   erofs   4k pcluster + lz4hc,9                            331292672
> 
> One observation is that since tailpacking pclusters don't have a
> blkaddr, data belonging to tailpacking pclusters cannot be deduped.
> 
> On the other hand, this can later work together with the `fragment'
> feature to minimize image sizes.
> 
> 
> Attached is a fix for uncompressed pclusters:
> 
> diff --git a/lib/dedupe.c b/lib/dedupe.c
> index c53a64edfc8d..c382303e2ceb 100644
> --- a/lib/dedupe.c
> +++ b/lib/dedupe.c
> @@ -21,7 +21,7 @@ struct z_erofs_dedupe_item {
>  	unsigned int	compressed_blks;
>  
>  	int		original_length;
> -	bool		partial;
> +	bool		partial, raw;
>  	u8		extra_data[];
>  };
>  
> @@ -86,6 +86,7 @@ int z_erofs_dedupe_match(struct z_erofs_dedupe_ctx *ctx)
>  		ctx->e.length = window_size + extra;
>  		ctx->e.partial = e->partial ||
>  			(window_size + extra < e->original_length);
> +		ctx->e.raw = e->raw;
>  		ctx->e.blkaddr = e->compressed_blkaddr;
>  		ctx->e.compressedblks = e->compressed_blks;
>  		return 0;
> @@ -114,6 +115,7 @@ int z_erofs_dedupe_insert(struct z_erofs_inmem_extent *e,
>  	di->compressed_blkaddr = e->blkaddr;
>  	di->compressed_blks = e->compressedblks;
>  	di->partial = e->partial;
> +	di->raw = e->raw;
>  
>  	/* with the same rolling hash */
>  	if (!rb_tree_insert(dedupe_subtree, di))
> -- 
> 2.30.2
>

Another fix, for an issue detected with an Android system image:

diff --git a/lib/compress.c b/lib/compress.c
index bdb6e78d32ca..3247835b75b6 100644
--- a/lib/compress.c
+++ b/lib/compress.c
@@ -158,9 +158,14 @@ static int z_erofs_compress_dedupe(struct erofs_inode *inode,
 
 	do {
 		struct z_erofs_dedupe_ctx dctx = {
-			.start = ctx->queue + ctx->head -
-				(ctx->e.length < EROFS_BLKSIZ ? 0 :
-					ctx->e.length - EROFS_BLKSIZ),
+			.start = ctx->queue + ctx->head - ({ int rc;
+				if (ctx->e.length <= EROFS_BLKSIZ)
+					rc = 0;
+				else if (ctx->e.length - EROFS_BLKSIZ >= ctx->head)
+					rc = ctx->head;
+				else
+					rc = ctx->e.length - EROFS_BLKSIZ;
+				rc; }),
 			.end = ctx->queue + ctx->head + *len,
 			.cur = ctx->queue + ctx->head,
 		};
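
The statement expression above clamps the lookback window so that
dctx.start can never underflow ctx->queue; an equivalent formulation
(sketch only, using the min() helper from erofs/defs.h):

	lookback = min(ctx->e.length > EROFS_BLKSIZ ?
		       ctx->e.length - EROFS_BLKSIZ : 0, ctx->head);
	dctx.start = ctx->queue + ctx->head - lookback;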


