* [PATCH RFCv2 02/10] dm-dedup: core deduplication logic
@ 2014-08-28 22:02 Vasily Tarasov
From: Vasily Tarasov @ 2014-08-28 22:02 UTC (permalink / raw)
  To: dm-devel
  Cc: Joe Thornber, Mike Snitzer, Christoph Hellwig, Philip Shilane,
	Sonam Mandal, Erez Zadok

This patch implements the constructor, destructor, and map functions.

The map function simply puts all I/O requests on a work queue. For
reads, the worker thread looks up the Physical Block Number (PBN) in
the LBN-to-PBN mapping and remaps the I/O there. If no PBN is found, a
block of zeroes is returned.

For writes, the worker thread computes the hash of the block and looks
it up in the HASH-to-PBN mapping. If the hash is found, only the
LBN-to-PBN mapping needs to be updated. If the hash is not found, the
request is remapped to a new location on the data device and both the
LBN-to-PBN and HASH-to-PBN mappings are updated.
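
In rough pseudocode, the write path looks like this (an illustrative
sketch only, not code from this patch; hash_pbn_lookup(),
lbn_pbn_insert() and hash_pbn_insert() are invented stand-ins for the
kvs_* calls below, and error handling is omitted):

	/* sketch only */
	compute_hash_bio(dc->desc_table, bio, hash);

	if (hash_pbn_lookup(dc, hash, &pbn)) {
		/* duplicate data: point this LBN at the existing PBN */
		lbn_pbn_insert(dc, lbn, pbn);
		bio_endio(bio, 0);
	} else {
		/* unique data: write to a new PBN, record both mappings */
		allocate_block(dc, &pbn);
		do_io(dc, bio, pbn);
		lbn_pbn_insert(dc, lbn, pbn);
		hash_pbn_insert(dc, hash, pbn);
	}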

Signed-off-by: Vasily Tarasov <tarasov@vasily.name>
---
 drivers/md/dm-dedup-target.c |  912 ++++++++++++++++++++++++++++++++++++++++++
 1 files changed, 912 insertions(+), 0 deletions(-)
 create mode 100644 drivers/md/dm-dedup-target.c

diff --git a/drivers/md/dm-dedup-target.c b/drivers/md/dm-dedup-target.c
new file mode 100644
index 0000000..0887543
--- /dev/null
+++ b/drivers/md/dm-dedup-target.c
@@ -0,0 +1,912 @@
+/*
+ * Copyright (C) 2012-2014 Vasily Tarasov
+ * Copyright (C) 2012-2014 Geoff Kuenning
+ * Copyright (C) 2012-2014 Sonam Mandal
+ * Copyright (C) 2012-2014 Karthikeyani Palanisami
+ * Copyright (C) 2012-2014 Philip Shilane
+ * Copyright (C) 2012-2014 Sagar Trehan
+ * Copyright (C) 2012-2014 Erez Zadok
+ *
+ * This file is released under the GPL.
+ */
+
+#include "dm-dedup-target.h"
+#include "dm-dedup-rw.h"
+#include "dm-dedup-hash.h"
+#include "dm-dedup-backend.h"
+#include "dm-dedup-ram.h"
+#include "dm-dedup-cbt.h"
+#include "dm-dedup-kvstore.h"
+
+#define MAX_DEV_NAME_LEN (64)
+#define MAX_BACKEND_NAME_LEN (64)
+
+#define MIN_DATA_DEV_BLOCK_SIZE (4 * 1024)
+#define MAX_DATA_DEV_BLOCK_SIZE (1024 * 1024)
+
+struct on_disk_stats {
+	uint64_t physical_block_counter;
+	uint64_t logical_block_counter;
+};
+
+/*
+ * All incoming requests are packed in the dedup_work structure
+ * for further processing by the workqueue thread.
+ */
+struct dedup_work {
+	struct work_struct worker;
+	struct dedup_config *config;
+	struct bio *bio;
+};
+
+struct mark_and_sweep_data {
+	unsigned long *bitmap;
+	uint64_t bitmap_len;
+	uint64_t cleanup_count; /* number of hashes cleaned up */
+	struct dedup_config *dc;
+};
+
+enum backend {
+	BKND_INRAM,
+	BKND_COWBTREE
+};
+
+static void bio_zero_endio(struct bio *bio)
+{
+	zero_fill_bio(bio);
+	bio_endio(bio, 0);
+}
+
+static uint64_t bio_lbn(struct dedup_config *dc, struct bio *bio)
+{
+	return bio->bi_iter.bi_sector / dc->sectors_per_block;
+}
+
+static void do_io(struct dedup_config *dc, struct bio *bio,
+			uint64_t pbn)
+{
+	int offset;
+
+	offset = (sector_t) bio->bi_iter.bi_sector % dc->sectors_per_block;
+	bio->bi_iter.bi_sector = (sector_t)pbn * dc->sectors_per_block + offset;
+
+	bio->bi_bdev = dc->data_dev->bdev;
+
+	generic_make_request(bio);
+}
+
+static void handle_read(struct dedup_config *dc, struct bio *bio)
+{
+	uint64_t lbn;
+	uint32_t vsize;
+	struct lbn_pbn_value lbnpbn_value;
+	int r;
+
+	lbn = bio_lbn(dc, bio);
+
+	r = dc->kvs_lbn_pbn->kvs_lookup(dc->kvs_lbn_pbn, (void *)&lbn,
+			sizeof(lbn), (void *)&lbnpbn_value, &vsize);
+	if (r == 0)
+		bio_zero_endio(bio);
+	else if (r == 1)
+		do_io(dc, bio, lbnpbn_value.pbn);
+	else
+		BUG();
+}
+
+static int allocate_block(struct dedup_config *dc, uint64_t *pbn_new)
+{
+	int r;
+
+	r = dc->mdops->alloc_data_block(dc->bmd, pbn_new);
+
+	if (!r) {
+		dc->logical_block_counter++;
+		dc->physical_block_counter++;
+	}
+
+	return r;
+}
+
+static int write_to_new_block(struct dedup_config *dc, uint64_t *pbn_new,
+				struct bio *bio, uint64_t lbn)
+{
+	int r = 0;
+	struct lbn_pbn_value lbnpbn_value;
+
+	r = allocate_block(dc, pbn_new);
+	if (r < 0) {
+		r = -EIO;
+		return r;
+	}
+
+	lbnpbn_value.pbn = *pbn_new;
+
+	do_io(dc, bio, *pbn_new);
+
+	r = dc->kvs_lbn_pbn->kvs_insert(dc->kvs_lbn_pbn, (void *)&lbn,
+		sizeof(lbn), (void *)&lbnpbn_value, sizeof(lbnpbn_value));
+	if (r < 0)
+		BUG();
+
+	return r;
+}
+
+static int handle_write_no_hash(struct dedup_config *dc,
+			struct bio *bio, uint64_t lbn, u8 *hash)
+{
+	int r;
+	uint32_t vsize;
+	uint64_t pbn_new, pbn_old;
+	struct lbn_pbn_value lbnpbn_value;
+	struct hash_pbn_value hashpbn_value;
+
+	dc->uniqwrites++;
+
+	r = dc->kvs_lbn_pbn->kvs_lookup(dc->kvs_lbn_pbn, (void *)&lbn,
+			sizeof(lbn), (void *)&lbnpbn_value, &vsize);
+	if (r == 0) {
+		/* No LBN->PBN mapping entry */
+		dc->newwrites++;
+		r = write_to_new_block(dc, &pbn_new, bio, lbn);
+		if (r < 0)
+			goto out;
+
+		hashpbn_value.pbn = pbn_new;
+
+		r = dc->kvs_hash_pbn->kvs_insert(dc->kvs_hash_pbn, (void *)hash,
+				dc->crypto_key_size, (void *)&hashpbn_value,
+				sizeof(hashpbn_value));
+		if (r < 0)
+			BUG();
+
+		r = dc->mdops->inc_refcount(dc->bmd, pbn_new);
+		if (r < 0)
+			BUG();
+
+		goto out;
+	} else if (r < 0)
+		BUG();
+
+	/* LBN->PBN mappings exist */
+	dc->overwrites++;
+	r = write_to_new_block(dc, &pbn_new, bio, lbn);
+	if (r < 0)
+		goto out;
+
+	pbn_old = lbnpbn_value.pbn;
+	r = dc->mdops->dec_refcount(dc->bmd, pbn_old);
+	if (r < 0)
+		BUG();
+
+	dc->logical_block_counter--;
+
+	hashpbn_value.pbn = pbn_new;
+
+	r = dc->kvs_hash_pbn->kvs_insert(dc->kvs_hash_pbn, (void *)hash,
+				dc->crypto_key_size, (void *)&hashpbn_value,
+				sizeof(hashpbn_value));
+	if (r < 0)
+		BUG();
+
+	r = dc->mdops->inc_refcount(dc->bmd, pbn_new);
+	if (r < 0)
+		BUG();
+out:
+	return r;
+}
+
+static int handle_write_with_hash(struct dedup_config *dc, struct bio *bio,
+				uint64_t lbn, u8 *final_hash,
+				struct hash_pbn_value hashpbn_value)
+{
+	int r;
+	uint32_t vsize;
+	uint64_t pbn_new, pbn_old;
+	struct lbn_pbn_value lbnpbn_value;
+	struct lbn_pbn_value new_lbnpbn_value;
+
+	dc->dupwrites++;
+
+	pbn_new = hashpbn_value.pbn;
+	r = dc->kvs_lbn_pbn->kvs_lookup(dc->kvs_lbn_pbn, (void *)&lbn,
+			sizeof(lbn), (void *)&lbnpbn_value, &vsize);
+	if (r == 0) {
+		/* No LBN->PBN mapping entry */
+		dc->newwrites++;
+
+		r = dc->mdops->inc_refcount(dc->bmd, pbn_new);
+		if (r < 0)
+			BUG();
+
+		lbnpbn_value.pbn = pbn_new;
+
+		r = dc->kvs_lbn_pbn->kvs_insert(dc->kvs_lbn_pbn, (void *)&lbn,
+				sizeof(lbn), (void *)&lbnpbn_value,
+				sizeof(lbnpbn_value));
+		if (r < 0)
+			BUG();
+
+		dc->logical_block_counter++;
+
+		goto out;
+	} else if (r < 0)
+		BUG();
+
+	/* LBN->PBN mapping entry exists */
+	dc->overwrites++;
+	pbn_old = lbnpbn_value.pbn;
+	if (pbn_new != pbn_old) {
+		r = dc->mdops->inc_refcount(dc->bmd, pbn_new);
+		if (r < 0)
+			BUG();
+
+		new_lbnpbn_value.pbn = pbn_new;
+
+		r = dc->kvs_lbn_pbn->kvs_insert(dc->kvs_lbn_pbn, (void *)&lbn,
+			sizeof(lbn), (void *)&new_lbnpbn_value,
+			sizeof(new_lbnpbn_value));
+		if (r < 0)
+			BUG();
+
+		r = dc->mdops->dec_refcount(dc->bmd, pbn_old);
+		if (r < 0)
+			BUG();
+
+		goto out;
+	}
+
+	/* Nothing to do if writing same data to same LBN */
+
+out:
+	bio_endio(bio, 0);
+	return r;
+}
+
+static void handle_write(struct dedup_config *dc, struct bio *bio)
+{
+	uint64_t lbn;
+	u8 hash[MAX_DIGEST_SIZE];
+	struct hash_pbn_value hashpbn_value;
+	uint32_t vsize;
+	struct bio *new_bio = NULL;
+	int r;
+
+	dc->writes++;
+
+	/* Read-on-write handling */
+	if (bio->bi_iter.bi_size < dc->block_size) {
+		dc->reads_on_writes++;
+		new_bio = prepare_bio_on_write(dc, bio);
+		if (!new_bio) {
+			bio_endio(bio, -ENOMEM);
+			return;
+		}
+		bio = new_bio;
+	}
+
+	lbn = bio_lbn(dc, bio);
+
+	r = compute_hash_bio(dc->desc_table, bio, hash);
+	if (r)
+		BUG();
+
+	r = dc->kvs_hash_pbn->kvs_lookup(dc->kvs_hash_pbn, hash,
+				dc->crypto_key_size, &hashpbn_value, &vsize);
+
+	if (r == 0)
+		r = handle_write_no_hash(dc, bio, lbn, hash);
+	else if (r > 0)
+		r = handle_write_with_hash(dc, bio, lbn, hash,
+					hashpbn_value);
+	else
+		BUG();
+
+	dc->writes_after_flush++;
+	if ((dc->flushrq && dc->writes_after_flush >= dc->flushrq) ||
+			(bio->bi_rw & (REQ_FLUSH | REQ_FUA))) {
+		r = dc->mdops->flush_meta(dc->bmd);
+		dc->writes_after_flush = 0;
+	}
+}
+
+static void process_bio(struct dedup_config *dc, struct bio *bio)
+{
+	switch (bio_data_dir(bio)) {
+	case READ:
+		handle_read(dc, bio);
+		break;
+	case WRITE:
+		handle_write(dc, bio);
+		break;
+	default:
+		DMERR("Unknown request type!");
+		bio_endio(bio, -EINVAL);
+	}
+}
+
+static void do_work(struct work_struct *ws)
+{
+	struct dedup_work *data = container_of(ws, struct dedup_work, worker);
+	struct dedup_config *dc = (struct dedup_config *)data->config;
+	struct bio *bio = (struct bio *)data->bio;
+
+	mempool_free(data, dc->dedup_work_pool);
+
+	process_bio(dc, bio);
+}
+
+static void dedup_defer_bio(struct dedup_config *dc, struct bio *bio)
+{
+	struct dedup_work *data;
+
+	data = mempool_alloc(dc->dedup_work_pool, GFP_NOIO);
+	if (!data) {
+		bio_endio(bio, -ENOMEM);
+		return;
+	}
+
+	data->bio = bio;
+	data->config = dc;
+
+	INIT_WORK(&(data->worker), do_work);
+
+	queue_work(dc->workqueue, &(data->worker));
+}
+
+static int dm_dedup_map_fn(struct dm_target *ti, struct bio *bio)
+{
+	dedup_defer_bio(ti->private, bio);
+
+	return DM_MAPIO_SUBMITTED;
+}
+
+struct dedup_args {
+	struct dm_target *ti;
+
+	struct dm_dev *meta_dev;
+
+	struct dm_dev *data_dev;
+	uint64_t data_size;
+
+	uint32_t block_size;
+
+	char hash_algo[CRYPTO_ALG_NAME_LEN];
+
+	enum backend backend;
+
+	uint32_t flushrq;
+};
+
+static int parse_meta_dev(struct dedup_args *da, struct dm_arg_set *as,
+				char **err)
+{
+	int r;
+
+	r = dm_get_device(da->ti, dm_shift_arg(as),
+			dm_table_get_mode(da->ti->table), &da->meta_dev);
+	if (r)
+		*err = "Error opening metadata device";
+
+	return r;
+}
+
+static int parse_data_dev(struct dedup_args *da, struct dm_arg_set *as,
+				char **err)
+{
+	int r;
+
+	r = dm_get_device(da->ti, dm_shift_arg(as),
+			dm_table_get_mode(da->ti->table), &da->data_dev);
+	if (r)
+		*err = "Error opening data device";
+
+	da->data_size = i_size_read(da->data_dev->bdev->bd_inode);
+
+	return r;
+}
+
+static int parse_block_size(struct dedup_args *da, struct dm_arg_set *as,
+				char **err)
+{
+	uint32_t block_size;
+
+	if (kstrtou32(dm_shift_arg(as), 10, &block_size) ||
+		!block_size ||
+		block_size < MIN_DATA_DEV_BLOCK_SIZE ||
+		block_size > MAX_DATA_DEV_BLOCK_SIZE ||
+		!is_power_of_2(block_size)) {
+		*err = "Invalid data block size";
+		return -EINVAL;
+	}
+
+	if (block_size > da->data_size) {
+		*err = "Data block size is larger than the data device";
+		return -EINVAL;
+	}
+
+	da->block_size = block_size;
+
+	return 0;
+}
+
+static int parse_hash_algo(struct dedup_args *da, struct dm_arg_set *as,
+				char **err)
+{
+	strlcpy(da->hash_algo, dm_shift_arg(as), CRYPTO_ALG_NAME_LEN);
+
+	if (!crypto_has_hash(da->hash_algo, 0, CRYPTO_ALG_ASYNC)) {
+		*err = "Unrecognized hash algorithm";
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+static int parse_backend(struct dedup_args *da, struct dm_arg_set *as,
+				char **err)
+{
+	char backend[MAX_BACKEND_NAME_LEN];
+
+	strlcpy(backend, dm_shift_arg(as), MAX_BACKEND_NAME_LEN);
+
+	if (!strcmp(backend, "inram"))
+		da->backend = BKND_INRAM;
+	else if (!strcmp(backend, "cowbtree"))
+		da->backend = BKND_COWBTREE;
+	else {
+		*err = "Unsupported metadata backend";
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+static int parse_flushrq(struct dedup_args *da, struct dm_arg_set *as,
+				char **err)
+{
+	if (kstrtou32(dm_shift_arg(as), 10, &da->flushrq))
+		return -EINVAL;
+
+	return 0;
+}
+
+static int parse_dedup_args(struct dedup_args *da, int argc,
+				char **argv, char **err)
+{
+	struct dm_arg_set as;
+	int r;
+
+	if (argc < 6) {
+		*err = "Insufficient args";
+		return -EINVAL;
+	}
+
+	if (argc > 6) {
+		*err = "Too many args";
+		return -EINVAL;
+	}
+
+	as.argc = argc;
+	as.argv = argv;
+
+	r = parse_meta_dev(da, &as, err);
+	if (r)
+		return r;
+
+	r = parse_data_dev(da, &as, err);
+	if (r)
+		return r;
+
+	r = parse_block_size(da, &as, err);
+	if (r)
+		return r;
+
+	r = parse_hash_algo(da, &as, err);
+	if (r)
+		return r;
+
+	r = parse_backend(da, &as, err);
+	if (r)
+		return r;
+
+	r = parse_flushrq(da, &as, err);
+	if (r)
+		return r;
+
+	return 0;
+}
+
+static void destroy_dedup_args(struct dedup_args *da)
+{
+	if (da->meta_dev)
+		dm_put_device(da->ti, da->meta_dev);
+
+	if (da->data_dev)
+		dm_put_device(da->ti, da->data_dev);
+
+	kfree(da);
+}
+
+static int dm_dedup_ctr_fn(struct dm_target *ti, unsigned int argc, char **argv)
+{
+	struct dedup_args *da;
+	struct dedup_config *dc;
+	struct workqueue_struct *wq;
+
+	struct init_param_inram iparam_inram;
+	struct init_param_cowbtree iparam_cowbtree;
+	void *iparam;
+	struct metadata *md = NULL;
+
+	uint64_t data_size;
+	int r;
+	int crypto_key_size;
+
+	struct on_disk_stats *data = NULL;
+	uint64_t logical_block_counter = 0;
+	uint64_t physical_block_counter = 0;
+
+	uint32_t flushrq = 0;
+	mempool_t *dedup_work_pool = NULL;
+
+	bool unformatted;
+
+	da = kzalloc(sizeof(*da), GFP_KERNEL);
+	if (!da) {
+		ti->error = "Error allocating memory for dedup arguments";
+		return -ENOMEM;
+	}
+
+	da->ti = ti;
+
+	r = parse_dedup_args(da, argc, argv, &ti->error);
+	if (r)
+		goto out;
+
+	dc = kmalloc(sizeof(*dc), GFP_NOIO);
+	if (!dc) {
+		ti->error = "Error allocating memory for dedup config";
+		r = -ENOMEM;
+		goto out;
+	}
+
+	wq = create_singlethread_workqueue("dm-dedup");
+	if (!wq) {
+		ti->error = "failed to create workqueue";
+		r = -ENOMEM;
+		goto bad_wq;
+	}
+
+	dedup_work_pool = mempool_create_kmalloc_pool(MIN_DEDUP_WORK_IO,
+						sizeof(struct dedup_work));
+	if (!dedup_work_pool) {
+		r = -ENOMEM;
+		ti->error = "failed to create mempool";
+		goto bad_mempool;
+	}
+
+	dc->io_client = dm_io_client_create();
+	if (IS_ERR(dc->io_client)) {
+		r = PTR_ERR(dc->io_client);
+		ti->error = "failed to create dm_io_client";
+		goto bad_io_client;
+	}
+
+	dc->block_size = da->block_size;
+	dc->sectors_per_block = to_sector(da->block_size);
+	dc->lblocks = ti->len / dc->sectors_per_block;
+
+	data_size = i_size_read(da->data_dev->bdev->bd_inode);
+	dc->pblocks = data_size / da->block_size;
+
+	/* Meta-data backend specific part */
+	if (da->backend == BKND_INRAM) {
+		dc->mdops = &metadata_ops_inram;
+		iparam_inram.blocks = dc->pblocks;
+		iparam = &iparam_inram;
+	} else if (da->backend == BKND_COWBTREE) {
+		r = -EINVAL;
+		dc->mdops = &metadata_ops_cowbtree;
+		iparam_cowbtree.blocks = dc->pblocks;
+		iparam_cowbtree.metadata_bdev = da->meta_dev->bdev;
+		iparam = &iparam_cowbtree;
+	} else
+		BUG();
+
+	md = dc->mdops->init_meta(iparam, &unformatted);
+	if (IS_ERR(md)) {
+		ti->error = "failed to initialize backend metadata";
+		r = PTR_ERR(md);
+		goto bad_metadata_init;
+	}
+
+	dc->desc_table = desc_table_init(da->hash_algo);
+	if (!dc->desc_table || IS_ERR(dc->desc_table)) {
+		ti->error = "failed to initialize crypto API";
+		r = PTR_ERR(dc->desc_table);
+		goto bad_metadata_init;
+	}
+
+	crypto_key_size = get_hash_digestsize(dc->desc_table);
+
+	dc->kvs_hash_pbn = dc->mdops->kvs_create_sparse(md, crypto_key_size,
+				sizeof(struct hash_pbn_value),
+				dc->pblocks, unformatted);
+	if (IS_ERR(dc->kvs_hash_pbn)) {
+		r = PTR_ERR(dc->kvs_hash_pbn);
+		ti->error = "failed to create sparse KVS";
+		goto bad_kvstore_init;
+	}
+
+	dc->kvs_lbn_pbn = dc->mdops->kvs_create_linear(md, 8,
+			sizeof(struct lbn_pbn_value), dc->lblocks, unformatted);
+	if (IS_ERR(dc->kvs_lbn_pbn)) {
+		ti->error = "failed to create linear KVS";
+		r = PTR_ERR(dc->kvs_lbn_pbn);
+		goto bad_kvstore_init;
+	}
+
+	r = dc->mdops->flush_meta(md);
+	if (r < 0)
+		BUG();
+
+	if (!unformatted && dc->mdops->get_private_data) {
+		r = dc->mdops->get_private_data(md, (void **)&data,
+				sizeof(struct on_disk_stats));
+		if (r < 0)
+			BUG();
+
+		logical_block_counter = data->logical_block_counter;
+		physical_block_counter = data->physical_block_counter;
+	}
+
+	dc->data_dev = da->data_dev;
+	dc->metadata_dev = da->meta_dev;
+
+	dc->workqueue = wq;
+	dc->dedup_work_pool = dedup_work_pool;
+	dc->bmd = md;
+
+	dc->logical_block_counter = logical_block_counter;
+	dc->physical_block_counter = physical_block_counter;
+
+	dc->writes = 0;
+	dc->dupwrites = 0;
+	dc->uniqwrites = 0;
+	dc->reads_on_writes = 0;
+	dc->overwrites = 0;
+	dc->newwrites = 0;
+
+	strcpy(dc->crypto_alg, da->hash_algo);
+	dc->crypto_key_size = crypto_key_size;
+
+	dc->flushrq = flushrq;
+	dc->writes_after_flush = 0;
+
+	dm_set_target_max_io_len(ti, dc->sectors_per_block);
+	ti->private = dc;
+
+	da->meta_dev = da->data_dev = NULL;
+	r = 0;
+	goto out;
+
+bad_kvstore_init:
+	desc_table_deinit(dc->desc_table);
+bad_metadata_init:
+	if (md && !IS_ERR(md))
+		dc->mdops->exit_meta(md);
+	dm_io_client_destroy(dc->io_client);
+bad_io_client:
+	mempool_destroy(dedup_work_pool);
+bad_mempool:
+	destroy_workqueue(wq);
+bad_wq:
+	kfree(dc);
+out:
+	destroy_dedup_args(da);
+	return r;
+}
+
+static void dm_dedup_dtr_fn(struct dm_target *ti)
+{
+	struct dedup_config *dc = ti->private;
+	struct on_disk_stats data;
+	int ret;
+
+	if (dc->mdops->set_private_data) {
+		data.physical_block_counter = dc->physical_block_counter;
+		data.logical_block_counter = dc->logical_block_counter;
+
+		ret = dc->mdops->set_private_data(dc->bmd, &data,
+				sizeof(struct on_disk_stats));
+		if (ret < 0)
+			BUG();
+	}
+
+	ret = dc->mdops->flush_meta(dc->bmd);
+	if (ret < 0)
+		BUG();
+
+	flush_workqueue(dc->workqueue);
+	destroy_workqueue(dc->workqueue);
+
+	mempool_destroy(dc->dedup_work_pool);
+
+	dc->mdops->exit_meta(dc->bmd);
+
+	dm_io_client_destroy(dc->io_client);
+
+	dm_put_device(ti, dc->data_dev);
+	dm_put_device(ti, dc->metadata_dev);
+	desc_table_deinit(dc->desc_table);
+
+	kfree(dc);
+}
+
+static int mark_lbn_pbn_bitmap(void *key, int32_t ksize,
+		void *value, int32_t vsize, void *data)
+{
+	int ret = 0;
+	struct mark_and_sweep_data *ms_data =
+		(struct mark_and_sweep_data *)data;
+	uint64_t pbn_val = *((uint64_t *)value);
+
+	BUG_ON(!data);
+	BUG_ON(!ms_data->bitmap);
+	BUG_ON(pbn_val > ms_data->bitmap_len);
+
+	bitmap_set(ms_data->bitmap, pbn_val, 1);
+
+	return ret;
+}
+
+static int cleanup_hash_pbn(void *key, int32_t ksize, void *value,
+		int32_t vsize, void *data)
+{
+	int ret = 0, r = 0;
+	uint64_t pbn_val = 0;
+	struct mark_and_sweep_data *ms_data =
+		(struct mark_and_sweep_data *)data;
+	struct hash_pbn_value hashpbn_value = *((struct hash_pbn_value *)value);
+	struct dedup_config *dc = ms_data->dc;
+
+	BUG_ON(!data);
+	BUG_ON(!ms_data->bitmap);
+
+	pbn_val = hashpbn_value.pbn;
+	BUG_ON(pbn_val > ms_data->bitmap_len);
+
+	if (test_bit(pbn_val, ms_data->bitmap) == 0) {
+		ret = dc->kvs_hash_pbn->kvs_delete(dc->kvs_hash_pbn,
+							key, ksize);
+		if (ret < 0)
+			BUG();
+
+		r = dc->mdops->dec_refcount(ms_data->dc->bmd, pbn_val);
+		if (r < 0)
+			BUG();
+
+		ms_data->cleanup_count++;
+	}
+
+	return ret;
+}
+
+/*
+ * Creates a bitmap of all PBNs in use by walking through
+ * kvs_lbn_pbn. Then walks through kvs_hash_pbn and deletes
+ * any entries which do not have an lbn-to-pbn mapping.
+ */
+static int mark_and_sweep(struct dedup_config *dc)
+{
+	int err = 0;
+	uint64_t data_size = 0;
+	uint64_t bitmap_size = 0;
+	struct mark_and_sweep_data ms_data;
+
+	BUG_ON(!dc);
+
+	data_size = i_size_read(dc->data_dev->bdev->bd_inode);
+	bitmap_size = data_size / dc->block_size;
+
+	memset(&ms_data, 0, sizeof(struct mark_and_sweep_data));
+
+	ms_data.bitmap = vmalloc(BITS_TO_LONGS(bitmap_size) *
+			sizeof(unsigned long));
+	if (!ms_data.bitmap) {
+		DMERR("Could not vmalloc ms_data.bitmap");
+		err = -ENOMEM;
+		goto out;
+	}
+	bitmap_zero(ms_data.bitmap, bitmap_size);
+
+	ms_data.bitmap_len = bitmap_size;
+	ms_data.cleanup_count = 0;
+	ms_data.dc = dc;
+
+	/* Create bitmap of used pbn blocks */
+	err = dc->kvs_lbn_pbn->kvs_iterate(dc->kvs_lbn_pbn,
+			&mark_lbn_pbn_bitmap, (void *)&ms_data);
+	if (err < 0)
+		goto out_free;
+
+	/* Cleanup hashes based on above bitmap of used pbn blocks */
+	err = dc->kvs_hash_pbn->kvs_iterate(dc->kvs_hash_pbn,
+			&cleanup_hash_pbn, (void *)&ms_data);
+	if (err < 0)
+		goto out_free;
+
+	dc->physical_block_counter -= ms_data.cleanup_count;
+
+out_free:
+	vfree(ms_data.bitmap);
+out:
+	return err;
+}
+
+static int dm_dedup_message_fn(struct dm_target *ti,
+				unsigned argc, char **argv)
+{
+	int r = 0;
+
+	struct dedup_config *dc = ti->private;
+
+	if (!strcasecmp(argv[0], "mark_and_sweep")) {
+		r = mark_and_sweep(dc);
+		if (r < 0)
+			DMERR("Error in performing mark_and_sweep: %d.", r);
+	} else if (!strcasecmp(argv[0], "drop_bufio_cache")) {
+		if (dc->mdops->flush_bufio_cache)
+			dc->mdops->flush_bufio_cache(dc->bmd);
+		else
+			r = -ENOTSUPP;
+	} else
+		r = -EINVAL;
+
+	return r;
+}
+
+static int dm_dedup_endio_fn(struct dm_target *ti, struct bio *bio, int error)
+{
+	if (error || bio_data_dir(bio) != READ)
+		return 0;
+
+	return 0;
+}
+
+static struct target_type dm_dedup_target = {
+	.name = "dedup",
+	.version = {1, 0, 0},
+	.module = THIS_MODULE,
+	.features = 0x0,
+	.ctr = dm_dedup_ctr_fn,
+	.dtr = dm_dedup_dtr_fn,
+	.map = dm_dedup_map_fn,
+	.end_io = dm_dedup_endio_fn,
+	.message = dm_dedup_message_fn,
+};
+
+static int __init dm_dedup_init(void)
+{
+	int r;
+
+	r = dm_register_target(&dm_dedup_target);
+	if (r < 0)
+		DMERR("target registration failed: %d.", r);
+	else
+		DMINFO("target registered successfully.");
+
+	return r;
+}
+
+static void __exit dm_dedup_exit(void)
+{
+	dm_unregister_target(&dm_dedup_target);
+}
+
+module_init(dm_dedup_init);
+module_exit(dm_dedup_exit);
+
+MODULE_DESCRIPTION(DM_NAME " target for data deduplication");
+MODULE_LICENSE("GPL");
-- 
1.7.1

* Re: [PATCH RFCv2 02/10] dm-dedup: core deduplication logic
From: Vivek Goyal @ 2015-02-02 15:23 UTC (permalink / raw)
  To: Vasily Tarasov
  Cc: Joe Thornber, Mike Snitzer, Christoph Hellwig, dm-devel,
	Philip Shilane, Sonam Mandal, Erez Zadok

On Thu, Aug 28, 2014 at 06:02:11PM -0400, Vasily Tarasov wrote:

Hi,

I spent some time looking at this patch. A few comments are inline.

> +static void process_bio(struct dedup_config *dc, struct bio *bio)
> +{
> +	switch (bio_data_dir(bio)) {
> +	case READ:
> +		handle_read(dc, bio);
> +		break;
> +	case WRITE:
> +		handle_write(dc, bio);
> +		break;
> +	default:
> +		DMERR("Unknown request type!");
> +		bio_endio(bio, -EINVAL);
> +	}

I guess some of this error checking could go in the map function: if a bio
is not of the desired type, return an error right then and there.

> +}
> +
> +static void do_work(struct work_struct *ws)
> +{
> +	struct dedup_work *data = container_of(ws, struct dedup_work, worker);
> +	struct dedup_config *dc = (struct dedup_config *)data->config;
> +	struct bio *bio = (struct bio *)data->bio;
> +
> +	mempool_free(data, dc->dedup_work_pool);
> +
> +	process_bio(dc, bio);
> +}
> +
> +static void dedup_defer_bio(struct dedup_config *dc, struct bio *bio)
> +{
> +	struct dedup_work *data;
> +
> +	data = mempool_alloc(dc->dedup_work_pool, GFP_NOIO);
> +	if (!data) {
> +		bio_endio(bio, -ENOMEM);
> +		return;
> +	}

So why are we allocating one work struct for each bio? Why not have a
single work struct embedded in, say, dedup_config, queue incoming bios on
a list, and wake up the worker? The worker would then go through the list
of bios and process them.

It does not look like there is a need to create one work struct for each
bio.

Please take a look at how other targets have handled this.
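
For illustration, that pattern would look roughly like this (a sketch
only, assuming dedup_config gains a spinlock, a deferred bio list, and
an embedded work_struct; this is not code from the patch):

/* Sketch only: one work item embedded in dedup_config, bios queued on a
 * list under a lock. dc->worker would be INIT_WORK()ed once in the ctr. */
static void dedup_defer_bio(struct dedup_config *dc, struct bio *bio)
{
	spin_lock_irq(&dc->lock);
	bio_list_add(&dc->deferred_bios, bio);
	spin_unlock_irq(&dc->lock);

	queue_work(dc->workqueue, &dc->worker);
}

static void do_work(struct work_struct *ws)
{
	struct dedup_config *dc = container_of(ws, struct dedup_config, worker);
	struct bio_list bios;
	struct bio *bio;

	bio_list_init(&bios);

	/* grab the whole list of deferred bios in one go */
	spin_lock_irq(&dc->lock);
	bio_list_merge(&bios, &dc->deferred_bios);
	bio_list_init(&dc->deferred_bios);
	spin_unlock_irq(&dc->lock);

	while ((bio = bio_list_pop(&bios)))
		process_bio(dc, bio);
}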


[..]
> +static int dm_dedup_ctr_fn(struct dm_target *ti, unsigned int argc, char **argv)
> +{
> +	struct dedup_args *da;
> +	struct dedup_config *dc;
> +	struct workqueue_struct *wq;
> +
> +	struct init_param_inram iparam_inram;
> +	struct init_param_cowbtree iparam_cowbtree;
> +	void *iparam;
> +	struct metadata *md = NULL;
> +
> +	uint64_t data_size;
> +	int r;
> +	int crypto_key_size;
> +
> +	struct on_disk_stats *data = NULL;
> +	uint64_t logical_block_counter = 0;
> +	uint64_t physical_block_counter = 0;
> +
> +	uint32_t flushrq = 0;
> +	mempool_t *dedup_work_pool = NULL;
> +
> +	bool unformatted;
> +
> +	da = kzalloc(sizeof(*da), GFP_KERNEL);
> +	if (!da) {
> +		ti->error = "Error allocating memory for dedup arguments";
> +		return -ENOMEM;
> +	}
> +
> +	da->ti = ti;
> +
> +	r = parse_dedup_args(da, argc, argv, &ti->error);
> +	if (r)
> +		goto out;
> +
> +	dc = kmalloc(sizeof(*dc), GFP_NOIO);
> +	if (!dc) {
> +		ti->error = "Error allocating memory for dedup config";
> +		r = -ENOMEM;
> +		goto out;
> +	}
> +
> +	wq = create_singlethread_workqueue("dm-dedup");
> +	if (!wq) {
> +		ti->error = "failed to create workqueue";
> +		r = -ENOMEM;
> +		goto bad_wq;
> +	}
> +
> +	dedup_work_pool = mempool_create_kmalloc_pool(MIN_DEDUP_WORK_IO,
> +						sizeof(struct dedup_work));
> +	if (!dedup_work_pool) {
> +		r = -ENOMEM;
> +		ti->error = "failed to create mempool";
> +		goto bad_mempool;
> +	}
> +
> +	dc->io_client = dm_io_client_create();
> +	if (IS_ERR(dc->io_client)) {
> +		r = PTR_ERR(dc->io_client);
> +		ti->error = "failed to create dm_io_client";
> +		goto bad_io_client;
> +	}
> +
> +	dc->block_size = da->block_size;
> +	dc->sectors_per_block = to_sector(da->block_size);
> +	dc->lblocks = ti->len / dc->sectors_per_block;
> +
> +	data_size = i_size_read(da->data_dev->bdev->bd_inode);
> +	dc->pblocks = data_size / da->block_size;
> +
> +	/* Meta-data backend specific part */
> +	if (da->backend == BKND_INRAM) {
> +		dc->mdops = &metadata_ops_inram;
> +		iparam_inram.blocks = dc->pblocks;
> +		iparam = &iparam_inram;
> +	} else if (da->backend == BKND_COWBTREE) {
> +		r = -EINVAL;
> +		dc->mdops = &metadata_ops_cowbtree;
> +		iparam_cowbtree.blocks = dc->pblocks;
> +		iparam_cowbtree.metadata_bdev = da->meta_dev->bdev;
> +		iparam = &iparam_cowbtree;
> +	} else
> +		BUG();
> +
> +	md = dc->mdops->init_meta(iparam, &unformatted);
> +	if (IS_ERR(md)) {
> +		ti->error = "failed to initialize backend metadata";
> +		r = PTR_ERR(md);
> +		goto bad_metadata_init;
> +	}
> +
> +	dc->desc_table = desc_table_init(da->hash_algo);
> +	if (!dc->desc_table || IS_ERR(dc->desc_table)) {
> +		ti->error = "failed to initialize crypto API";
> +		r = PTR_ERR(dc->desc_table);
> +		goto bad_metadata_init;
> +	}
> +
> +	crypto_key_size = get_hash_digestsize(dc->desc_table);

Looks like this function is defined in patch 3 but used in patch 2.
That means compilation will fail if the series is applied only up to
patch 2. That's not a good idea: your patch series should be bisectable.

So any definitions you want to use, introduce them first.

Secondly, I had a quick look at patch 3, and this function also seems
to be doing the following.

+
+       slot = get_next_slot(desc_table);
+       desc = slot_to_desc(desc_table, slot);

This is odd. This function is supposed to return the size of the hash,
not do the extra magic of getting the next free slot and then the desc
associated with that slot.

Break this logic out into a separate function as appropriate.
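
For example, something along these lines (the names here are
hypothetical; the real desc_table types live in patch 3):

/* Hypothetical split, for illustration only: one helper that just
 * reports the digest size, another that hands out a free descriptor. */
static unsigned int get_hash_digestsize(struct hash_desc_table *desc_table);
static struct hash_desc *get_free_hash_desc(struct hash_desc_table *desc_table);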

[..]
> +static int dm_dedup_message_fn(struct dm_target *ti,
> +				unsigned argc, char **argv)
> +{
> +	int r = 0;
> +

I think we should document these messages in a documentation file and
explain what they do.

Also, it looks like mark_and_sweep is replaced by garbage_collect later
in the series. Why?

> +	struct dedup_config *dc = ti->private;
> +
> +	if (!strcasecmp(argv[0], "mark_and_sweep")) {
> +		r = mark_and_sweep(dc);
> +		if (r < 0)
> +			DMERR("Error in performing mark_and_sweep: %d.", r);
> +	} else if (!strcasecmp(argv[0], "drop_bufio_cache")) {
> +		if (dc->mdops->flush_bufio_cache)
> +			dc->mdops->flush_bufio_cache(dc->bmd);
> +		else
> +			r = -ENOTSUPP;
> +	} else
> +		r = -EINVAL;
> +
> +	return r;
> +}
> +
> +static int dm_dedup_endio_fn(struct dm_target *ti, struct bio *bio, int error)
> +{
> +	if (error || bio_data_dir(bio) != READ)
> +		return 0;
> +
> +	return 0;
> +}

So this function always returns 0. What's the point of that if block then?

Thanks
Vivek

* Re: [PATCH RFCv2 02/10] dm-dedup: core deduplication logic
From: Vivek Goyal @ 2015-02-05 20:21 UTC (permalink / raw)
  To: Vasily Tarasov
  Cc: Joe Thornber, Mike Snitzer, Christoph Hellwig, dm-devel,
	Philip Shilane, Sonam Mandal, Erez Zadok

On Thu, Aug 28, 2014 at 06:02:11PM -0400, Vasily Tarasov wrote:

[..]
> +static void handle_read(struct dedup_config *dc, struct bio *bio)
> +{
> +	uint64_t lbn;
> +	uint32_t vsize;
> +	struct lbn_pbn_value lbnpbn_value;
> +	int r;
> +
> +	lbn = bio_lbn(dc, bio);
> +
> +	r = dc->kvs_lbn_pbn->kvs_lookup(dc->kvs_lbn_pbn, (void *)&lbn,
> +			sizeof(lbn), (void *)&lbnpbn_value, &vsize);
> +	if (r == 0)
> +		bio_zero_endio(bio);
> +	else if (r == 1)
> +		do_io(dc, bio, lbnpbn_value.pbn);
> +	else
> +		BUG();

The kvs_lookup() return values look a little odd to me. I am wondering
how this function will ever usefully return an error, because if it
returns one, we call BUG().

I think this function should return 0 upon success and a negative error
code in case of error, and that error code should be propagated upwards.

To differentiate whether the key was found or not, maybe you can set
vsize to zero before calling the function and check its value after the
call. If it is non-zero, you know the key has been found.
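
Something along these lines in handle_read(), for example (a sketch of
the suggested convention only, not the existing code):

	/* Sketch only: kvs_lookup() returns 0 or a negative errno;
	 * vsize == 0 after the call means the key was not found. */
	vsize = 0;
	r = dc->kvs_lbn_pbn->kvs_lookup(dc->kvs_lbn_pbn, (void *)&lbn,
			sizeof(lbn), (void *)&lbnpbn_value, &vsize);
	if (r < 0) {
		bio_endio(bio, r);	/* propagate the error, don't BUG() */
		return;
	}

	if (vsize == 0)
		bio_zero_endio(bio);	/* no LBN->PBN entry */
	else
		do_io(dc, bio, lbnpbn_value.pbn);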

> +}
> +
> +static int allocate_block(struct dedup_config *dc, uint64_t *pbn_new)
> +{
> +	int r;
> +
> +	r = dc->mdops->alloc_data_block(dc->bmd, pbn_new);
> +
> +	if (!r) {
> +		dc->logical_block_counter++;

I am not sure why we are bumping up logical_block_counter here. Right
now we are just allocating a physical block and that has nothing to
do with bumping up logical block count.

> +		dc->physical_block_counter++;
> +	}
> +
> +	return r;
> +}
> +
> +static int write_to_new_block(struct dedup_config *dc, uint64_t *pbn_new,
> +				struct bio *bio, uint64_t lbn)
> +{
> +	int r = 0;
> +	struct lbn_pbn_value lbnpbn_value;
> +
> +	r = allocate_block(dc, pbn_new);
> +	if (r < 0) {
> +		r = -EIO;

Is it common practice to overwrite the error code with -EIO?

> +		return r;
> +	}
> +
> +	lbnpbn_value.pbn = *pbn_new;
> +
> +	do_io(dc, bio, *pbn_new);
> +
> +	r = dc->kvs_lbn_pbn->kvs_insert(dc->kvs_lbn_pbn, (void *)&lbn,
> +		sizeof(lbn), (void *)&lbnpbn_value, sizeof(lbnpbn_value));
> +	if (r < 0)
> +		BUG();
> +

I think we need to fix this BUG() pattern everywhere. If there is an
error, propagate it upwards instead of calling BUG(). Let the I/O fail,
but why crash the OS?

Also, I am assuming that kvs_insert() is going to allocate memory. What
if that memory allocation fails? The write to the block will succeed,
but we will never create an entry in the LBN-to-PBN kvs, so the next
time a read comes we cannot find that data, and the data is lost?

I think this needs to be thought through more carefully, and one needs
to consider all the corner error cases.
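
As a sketch of the direction in write_to_new_block(), for instance
(illustrative only; a real fix would also have to release the newly
allocated block and deal with the data that was already written):

	r = dc->kvs_lbn_pbn->kvs_insert(dc->kvs_lbn_pbn, (void *)&lbn,
		sizeof(lbn), (void *)&lbnpbn_value, sizeof(lbnpbn_value));
	if (r < 0)
		return r;	/* let the caller fail the bio instead of BUG() */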

Thanks
Vivek

* Re: [PATCH RFCv2 02/10] dm-dedup: core deduplication logic
From: Vivek Goyal @ 2015-02-06 16:53 UTC (permalink / raw)
  To: Vasily Tarasov
  Cc: Joe Thornber, Mike Snitzer, Christoph Hellwig, dm-devel,
	Philip Shilane, Sonam Mandal, Erez Zadok

On Thu, Aug 28, 2014 at 06:02:11PM -0400, Vasily Tarasov wrote:

[..]
> +/*
> + * Creates a bitmap of all PBNs in use by walking through
> + * kvs_lbn_pbn. Then walks through kvs_hash_pbn and deletes
> + * any entries which do not have an lbn-to-pbn mapping.
> + */
> +static int mark_and_sweep(struct dedup_config *dc)
> +{
> +	int err = 0;
> +	uint64_t data_size = 0;
> +	uint64_t bitmap_size = 0;
> +	struct mark_and_sweep_data ms_data;
> +
> +	BUG_ON(!dc);
> +
> +	data_size = i_size_read(dc->data_dev->bdev->bd_inode);
> +	bitmap_size = data_size / dc->block_size;
> +
> +	memset(&ms_data, 0, sizeof(struct mark_and_sweep_data));
> +
> +	ms_data.bitmap = vmalloc(BITS_TO_LONGS(bitmap_size) *
> +			sizeof(unsigned long));
> +	if (!ms_data.bitmap) {
> +		DMERR("Could not vmalloc ms_data.bitmap");
> +		err = -ENOMEM;
> +		goto out;
> +	}
> +	bitmap_zero(ms_data.bitmap, bitmap_size);
> +
> +	ms_data.bitmap_len = bitmap_size;
> +	ms_data.cleanup_count = 0;
> +	ms_data.dc = dc;
> +
> +	/* Create bitmap of used pbn blocks */
> +	err = dc->kvs_lbn_pbn->kvs_iterate(dc->kvs_lbn_pbn,
> +			&mark_lbn_pbn_bitmap, (void *)&ms_data);
> +	if (err < 0)
> +		goto out_free;
> +
> +	/* Cleanup hashes based on above bitmap of used pbn blocks */
> +	err = dc->kvs_hash_pbn->kvs_iterate(dc->kvs_hash_pbn,
> +			&cleanup_hash_pbn, (void *)&ms_data);

How does locking work in general for this target? So far I have not
seen any locks being taken anywhere. I want to know what the thought
process is, which locks are where, and what they are protecting.

For example, here we first create a bitmap of the PBNs in use and then
go on to remove the unused ones. How do we know that some unused PBN
did not get reused by the time we called cleanup_hash_pbn()?

I see that the last patch in the series removes the mark_and_sweep logic
and replaces it with garbage_collect. Please rework the series so that
mark_and_sweep is not introduced to begin with.

Thanks
Vivek
