* [PATCH RFCv2 02/10] dm-dedup: core deduplication logic
@ 2014-08-28 22:02 Vasily Tarasov
From: Vasily Tarasov @ 2014-08-28 22:02 UTC (permalink / raw)
  To: dm-devel
  Cc: Joe Thornber, Mike Snitzer, Christoph Hellwig, Philip Shilane,
	Sonam Mandal, Erez Zadok

This patch implements the constructor, destructor, and map functions.

The map function simply puts all I/O requests on a work queue. For
reads, the worker thread looks up the Physical Block Number (PBN) in
the LBN-to-PBN mapping and remaps the I/O there. If no PBN is found, a
block of zeroes is returned.

For writes, the worker thread computes the hash of the block and looks
it up in the HASH-to-PBN mapping. If the hash is found, only the
LBN-to-PBN mapping needs to be updated. If the hash is not found, the
request is remapped to a new location on the data device and both the
LBN-to-PBN and HASH-to-PBN mappings are updated.
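
In rough pseudocode, the write path looks like this (an illustrative
sketch only, not code from this patch; hash_pbn_lookup(),
lbn_pbn_insert() and hash_pbn_insert() are invented stand-ins for the
kvs_* calls below, and error handling is omitted):

	/* sketch only */
	compute_hash_bio(dc->desc_table, bio, hash);

	if (hash_pbn_lookup(dc, hash, &pbn)) {
		/* duplicate data: point this LBN at the existing PBN */
		lbn_pbn_insert(dc, lbn, pbn);
		bio_endio(bio, 0);
	} else {
		/* unique data: write to a new PBN, record both mappings */
		allocate_block(dc, &pbn);
		do_io(dc, bio, pbn);
		lbn_pbn_insert(dc, lbn, pbn);
		hash_pbn_insert(dc, hash, pbn);
	}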

Signed-off-by: Vasily Tarasov <tarasov@vasily.name>
---
 drivers/md/dm-dedup-target.c |  912 ++++++++++++++++++++++++++++++++++++++++++
 1 files changed, 912 insertions(+), 0 deletions(-)
 create mode 100644 drivers/md/dm-dedup-target.c

diff --git a/drivers/md/dm-dedup-target.c b/drivers/md/dm-dedup-target.c
new file mode 100644
index 0000000..0887543
--- /dev/null
+++ b/drivers/md/dm-dedup-target.c
@@ -0,0 +1,912 @@
+/*
+ * Copyright (C) 2012-2014 Vasily Tarasov
+ * Copyright (C) 2012-2014 Geoff Kuenning
+ * Copyright (C) 2012-2014 Sonam Mandal
+ * Copyright (C) 2012-2014 Karthikeyani Palanisami
+ * Copyright (C) 2012-2014 Philip Shilane
+ * Copyright (C) 2012-2014 Sagar Trehan
+ * Copyright (C) 2012-2014 Erez Zadok
+ *
+ * This file is released under the GPL.
+ */
+
+#include "dm-dedup-target.h"
+#include "dm-dedup-rw.h"
+#include "dm-dedup-hash.h"
+#include "dm-dedup-backend.h"
+#include "dm-dedup-ram.h"
+#include "dm-dedup-cbt.h"
+#include "dm-dedup-kvstore.h"
+
+#define MAX_DEV_NAME_LEN (64)
+#define MAX_BACKEND_NAME_LEN (64)
+
+#define MIN_DATA_DEV_BLOCK_SIZE (4 * 1024)
+#define MAX_DATA_DEV_BLOCK_SIZE (1024 * 1024)
+
+struct on_disk_stats {
+	uint64_t physical_block_counter;
+	uint64_t logical_block_counter;
+};
+
+/*
+ * All incoming requests are packed in the dedup_work structure
+ * for further processing by the workqueue thread.
+ */
+struct dedup_work {
+	struct work_struct worker;
+	struct dedup_config *config;
+	struct bio *bio;
+};
+
+struct mark_and_sweep_data {
+	unsigned long *bitmap;
+	uint64_t bitmap_len;
+	uint64_t cleanup_count; /* number of hashes cleaned up */
+	struct dedup_config *dc;
+};
+
+enum backend {
+	BKND_INRAM,
+	BKND_COWBTREE
+};
+
+static void bio_zero_endio(struct bio *bio)
+{
+	zero_fill_bio(bio);
+	bio_endio(bio, 0);
+}
+
+static uint64_t bio_lbn(struct dedup_config *dc, struct bio *bio)
+{
+	return bio->bi_iter.bi_sector / dc->sectors_per_block;
+}
+
+static void do_io(struct dedup_config *dc, struct bio *bio,
+			uint64_t pbn)
+{
+	int offset;
+
+	offset = (sector_t) bio->bi_iter.bi_sector % dc->sectors_per_block;
+	bio->bi_iter.bi_sector = (sector_t)pbn * dc->sectors_per_block + offset;
+
+	bio->bi_bdev = dc->data_dev->bdev;
+
+	generic_make_request(bio);
+}
+
+static void handle_read(struct dedup_config *dc, struct bio *bio)
+{
+	uint64_t lbn;
+	uint32_t vsize;
+	struct lbn_pbn_value lbnpbn_value;
+	int r;
+
+	lbn = bio_lbn(dc, bio);
+
+	r = dc->kvs_lbn_pbn->kvs_lookup(dc->kvs_lbn_pbn, (void *)&lbn,
+			sizeof(lbn), (void *)&lbnpbn_value, &vsize);
+	if (r == 0)
+		bio_zero_endio(bio);
+	else if (r == 1)
+		do_io(dc, bio, lbnpbn_value.pbn);
+	else
+		BUG();
+}
+
+static int allocate_block(struct dedup_config *dc, uint64_t *pbn_new)
+{
+	int r;
+
+	r = dc->mdops->alloc_data_block(dc->bmd, pbn_new);
+
+	if (!r) {
+		dc->logical_block_counter++;
+		dc->physical_block_counter++;
+	}
+
+	return r;
+}
+
+static int write_to_new_block(struct dedup_config *dc, uint64_t *pbn_new,
+				struct bio *bio, uint64_t lbn)
+{
+	int r = 0;
+	struct lbn_pbn_value lbnpbn_value;
+
+	r = allocate_block(dc, pbn_new);
+	if (r < 0) {
+		r = -EIO;
+		return r;
+	}
+
+	lbnpbn_value.pbn = *pbn_new;
+
+	do_io(dc, bio, *pbn_new);
+
+	r = dc->kvs_lbn_pbn->kvs_insert(dc->kvs_lbn_pbn, (void *)&lbn,
+		sizeof(lbn), (void *)&lbnpbn_value, sizeof(lbnpbn_value));
+	if (r < 0)
+		BUG();
+
+	return r;
+}
+
+static int handle_write_no_hash(struct dedup_config *dc,
+			struct bio *bio, uint64_t lbn, u8 *hash)
+{
+	int r;
+	uint32_t vsize;
+	uint64_t pbn_new, pbn_old;
+	struct lbn_pbn_value lbnpbn_value;
+	struct hash_pbn_value hashpbn_value;
+
+	dc->uniqwrites++;
+
+	r = dc->kvs_lbn_pbn->kvs_lookup(dc->kvs_lbn_pbn, (void *)&lbn,
+			sizeof(lbn), (void *)&lbnpbn_value, &vsize);
+	if (r == 0) {
+		/* No LBN->PBN mapping entry */
+		dc->newwrites++;
+		r = write_to_new_block(dc, &pbn_new, bio, lbn);
+		if (r < 0)
+			goto out;
+
+		hashpbn_value.pbn = pbn_new;
+
+		r = dc->kvs_hash_pbn->kvs_insert(dc->kvs_hash_pbn, (void *)hash,
+				dc->crypto_key_size, (void *)&hashpbn_value,
+				sizeof(hashpbn_value));
+		if (r < 0)
+			BUG();
+
+		r = dc->mdops->inc_refcount(dc->bmd, pbn_new);
+		if (r < 0)
+			BUG();
+
+		goto out;
+	} else if (r < 0)
+		BUG();
+
+	/* LBN->PBN mappings exist */
+	dc->overwrites++;
+	r = write_to_new_block(dc, &pbn_new, bio, lbn);
+	if (r < 0)
+		goto out;
+
+	pbn_old = lbnpbn_value.pbn;
+	r = dc->mdops->dec_refcount(dc->bmd, pbn_old);
+	if (r < 0)
+		BUG();
+
+	dc->logical_block_counter--;
+
+	hashpbn_value.pbn = pbn_new;
+
+	r = dc->kvs_hash_pbn->kvs_insert(dc->kvs_hash_pbn, (void *)hash,
+				dc->crypto_key_size, (void *)&hashpbn_value,
+				sizeof(hashpbn_value));
+	if (r < 0)
+		BUG();
+
+	r = dc->mdops->inc_refcount(dc->bmd, pbn_new);
+	if (r < 0)
+		BUG();
+out:
+	return r;
+}
+
+static int handle_write_with_hash(struct dedup_config *dc, struct bio *bio,
+				uint64_t lbn, u8 *final_hash,
+				struct hash_pbn_value hashpbn_value)
+{
+	int r;
+	uint32_t vsize;
+	uint64_t pbn_new, pbn_old;
+	struct lbn_pbn_value lbnpbn_value;
+	struct lbn_pbn_value new_lbnpbn_value;
+
+	dc->dupwrites++;
+
+	pbn_new = hashpbn_value.pbn;
+	r = dc->kvs_lbn_pbn->kvs_lookup(dc->kvs_lbn_pbn, (void *)&lbn,
+			sizeof(lbn), (void *)&lbnpbn_value, &vsize);
+	if (r == 0) {
+		/* No LBN->PBN mapping entry */
+		dc->newwrites++;
+
+		r = dc->mdops->inc_refcount(dc->bmd, pbn_new);
+		if (r < 0)
+			BUG();
+
+		lbnpbn_value.pbn = pbn_new;
+
+		r = dc->kvs_lbn_pbn->kvs_insert(dc->kvs_lbn_pbn, (void *)&lbn,
+				sizeof(lbn), (void *)&lbnpbn_value,
+				sizeof(lbnpbn_value));
+		if (r < 0)
+			BUG();
+
+		dc->logical_block_counter++;
+
+		goto out;
+	} else if (r < 0)
+		BUG();
+
+	/* LBN->PBN mapping entry exists */
+	dc->overwrites++;
+	pbn_old = lbnpbn_value.pbn;
+	if (pbn_new != pbn_old) {
+		r = dc->mdops->inc_refcount(dc->bmd, pbn_new);
+		if (r < 0)
+			BUG();
+
+		new_lbnpbn_value.pbn = pbn_new;
+
+		r = dc->kvs_lbn_pbn->kvs_insert(dc->kvs_lbn_pbn, (void *)&lbn,
+			sizeof(lbn), (void *)&new_lbnpbn_value,
+			sizeof(new_lbnpbn_value));
+		if (r < 0)
+			BUG();
+
+		r = dc->mdops->dec_refcount(dc->bmd, pbn_old);
+		if (r < 0)
+			BUG();
+
+		goto out;
+	}
+
+	/* Nothing to do if writing same data to same LBN */
+
+out:
+	bio_endio(bio, 0);
+	return r;
+}
+
+static void handle_write(struct dedup_config *dc, struct bio *bio)
+{
+	uint64_t lbn;
+	u8 hash[MAX_DIGEST_SIZE];
+	struct hash_pbn_value hashpbn_value;
+	uint32_t vsize;
+	struct bio *new_bio = NULL;
+	int r;
+
+	dc->writes++;
+
+	/* Read-on-write handling */
+	if (bio->bi_iter.bi_size < dc->block_size) {
+		dc->reads_on_writes++;
+		new_bio = prepare_bio_on_write(dc, bio);
+		if (!new_bio) {
+			bio_endio(bio, -ENOMEM);
+			return;
+		}
+		bio = new_bio;
+	}
+
+	lbn = bio_lbn(dc, bio);
+
+	r = compute_hash_bio(dc->desc_table, bio, hash);
+	if (r)
+		BUG();
+
+	r = dc->kvs_hash_pbn->kvs_lookup(dc->kvs_hash_pbn, hash,
+				dc->crypto_key_size, &hashpbn_value, &vsize);
+
+	if (r == 0)
+		r = handle_write_no_hash(dc, bio, lbn, hash);
+	else if (r > 0)
+		r = handle_write_with_hash(dc, bio, lbn, hash,
+					hashpbn_value);
+	else
+		BUG();
+
+	dc->writes_after_flush++;
+	if ((dc->flushrq && dc->writes_after_flush >= dc->flushrq) ||
+			(bio->bi_rw & (REQ_FLUSH | REQ_FUA))) {
+		r = dc->mdops->flush_meta(dc->bmd);
+		dc->writes_after_flush = 0;
+	}
+}
+
+static void process_bio(struct dedup_config *dc, struct bio *bio)
+{
+	switch (bio_data_dir(bio)) {
+	case READ:
+		handle_read(dc, bio);
+		break;
+	case WRITE:
+		handle_write(dc, bio);
+		break;
+	default:
+		DMERR("Unknown request type!");
+		bio_endio(bio, -EINVAL);
+	}
+}
+
+static void do_work(struct work_struct *ws)
+{
+	struct dedup_work *data = container_of(ws, struct dedup_work, worker);
+	struct dedup_config *dc = (struct dedup_config *)data->config;
+	struct bio *bio = (struct bio *)data->bio;
+
+	mempool_free(data, dc->dedup_work_pool);
+
+	process_bio(dc, bio);
+}
+
+static void dedup_defer_bio(struct dedup_config *dc, struct bio *bio)
+{
+	struct dedup_work *data;
+
+	data = mempool_alloc(dc->dedup_work_pool, GFP_NOIO);
+	if (!data) {
+		bio_endio(bio, -ENOMEM);
+		return;
+	}
+
+	data->bio = bio;
+	data->config = dc;
+
+	INIT_WORK(&(data->worker), do_work);
+
+	queue_work(dc->workqueue, &(data->worker));
+}
+
+static int dm_dedup_map_fn(struct dm_target *ti, struct bio *bio)
+{
+	dedup_defer_bio(ti->private, bio);
+
+	return DM_MAPIO_SUBMITTED;
+}
+
+struct dedup_args {
+	struct dm_target *ti;
+
+	struct dm_dev *meta_dev;
+
+	struct dm_dev *data_dev;
+	uint64_t data_size;
+
+	uint32_t block_size;
+
+	char hash_algo[CRYPTO_ALG_NAME_LEN];
+
+	enum backend backend;
+
+	uint32_t flushrq;
+};
+
+static int parse_meta_dev(struct dedup_args *da, struct dm_arg_set *as,
+				char **err)
+{
+	int r;
+
+	r = dm_get_device(da->ti, dm_shift_arg(as),
+			dm_table_get_mode(da->ti->table), &da->meta_dev);
+	if (r)
+		*err = "Error opening metadata device";
+
+	return r;
+}
+
+static int parse_data_dev(struct dedup_args *da, struct dm_arg_set *as,
+				char **err)
+{
+	int r;
+
+	r = dm_get_device(da->ti, dm_shift_arg(as),
+			dm_table_get_mode(da->ti->table), &da->data_dev);
+	if (r)
+		*err = "Error opening data device";
+
+	da->data_size = i_size_read(da->data_dev->bdev->bd_inode);
+
+	return r;
+}
+
+static int parse_block_size(struct dedup_args *da, struct dm_arg_set *as,
+				char **err)
+{
+	uint32_t block_size;
+
+	if (kstrtou32(dm_shift_arg(as), 10, &block_size) ||
+		!block_size ||
+		block_size < MIN_DATA_DEV_BLOCK_SIZE ||
+		block_size > MAX_DATA_DEV_BLOCK_SIZE ||
+		!is_power_of_2(block_size)) {
+		*err = "Invalid data block size";
+		return -EINVAL;
+	}
+
+	if (block_size > da->data_size) {
+		*err = "Data block size is larger than the data device";
+		return -EINVAL;
+	}
+
+	da->block_size = block_size;
+
+	return 0;
+}
+
+static int parse_hash_algo(struct dedup_args *da, struct dm_arg_set *as,
+				char **err)
+{
+	strlcpy(da->hash_algo, dm_shift_arg(as), CRYPTO_ALG_NAME_LEN);
+
+	if (!crypto_has_hash(da->hash_algo, 0, CRYPTO_ALG_ASYNC)) {
+		*err = "Unrecognized hash algorithm";
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+static int parse_backend(struct dedup_args *da, struct dm_arg_set *as,
+				char **err)
+{
+	char backend[MAX_BACKEND_NAME_LEN];
+
+	strlcpy(backend, dm_shift_arg(as), MAX_BACKEND_NAME_LEN);
+
+	if (!strcmp(backend, "inram"))
+		da->backend = BKND_INRAM;
+	else if (!strcmp(backend, "cowbtree"))
+		da->backend = BKND_COWBTREE;
+	else {
+		*err = "Unsupported metadata backend";
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+static int parse_flushrq(struct dedup_args *da, struct dm_arg_set *as,
+				char **err)
+{
+	if (kstrtou32(dm_shift_arg(as), 10, &da->flushrq))
+		return -EINVAL;
+
+	return 0;
+}
+
+static int parse_dedup_args(struct dedup_args *da, int argc,
+				char **argv, char **err)
+{
+	struct dm_arg_set as;
+	int r;
+
+	if (argc < 6) {
+		*err = "Insufficient args";
+		return -EINVAL;
+	}
+
+	if (argc > 6) {
+		*err = "Too many args";
+		return -EINVAL;
+	}
+
+	as.argc = argc;
+	as.argv = argv;
+
+	r = parse_meta_dev(da, &as, err);
+	if (r)
+		return r;
+
+	r = parse_data_dev(da, &as, err);
+	if (r)
+		return r;
+
+	r = parse_block_size(da, &as, err);
+	if (r)
+		return r;
+
+	r = parse_hash_algo(da, &as, err);
+	if (r)
+		return r;
+
+	r = parse_backend(da, &as, err);
+	if (r)
+		return r;
+
+	r = parse_flushrq(da, &as, err);
+	if (r)
+		return r;
+
+	return 0;
+}
+
+static void destroy_dedup_args(struct dedup_args *da)
+{
+	if (da->meta_dev)
+		dm_put_device(da->ti, da->meta_dev);
+
+	if (da->data_dev)
+		dm_put_device(da->ti, da->data_dev);
+
+	kfree(da);
+}
+
+static int dm_dedup_ctr_fn(struct dm_target *ti, unsigned int argc, char **argv)
+{
+	struct dedup_args *da;
+	struct dedup_config *dc;
+	struct workqueue_struct *wq;
+
+	struct init_param_inram iparam_inram;
+	struct init_param_cowbtree iparam_cowbtree;
+	void *iparam;
+	struct metadata *md = NULL;
+
+	uint64_t data_size;
+	int r;
+	int crypto_key_size;
+
+	struct on_disk_stats *data = NULL;
+	uint64_t logical_block_counter = 0;
+	uint64_t physical_block_counter = 0;
+
+	uint32_t flushrq = 0;
+	mempool_t *dedup_work_pool = NULL;
+
+	bool unformatted;
+
+	da = kzalloc(sizeof(*da), GFP_KERNEL);
+	if (!da) {
+		ti->error = "Error allocating memory for dedup arguments";
+		return -ENOMEM;
+	}
+
+	da->ti = ti;
+
+	r = parse_dedup_args(da, argc, argv, &ti->error);
+	if (r)
+		goto out;
+
+	dc = kmalloc(sizeof(*dc), GFP_NOIO);
+	if (!dc) {
+		ti->error = "Error allocating memory for dedup config";
+		r = -ENOMEM;
+		goto out;
+	}
+
+	wq = create_singlethread_workqueue("dm-dedup");
+	if (!wq) {
+		ti->error = "failed to create workqueue";
+		r = -ENOMEM;
+		goto bad_wq;
+	}
+
+	dedup_work_pool = mempool_create_kmalloc_pool(MIN_DEDUP_WORK_IO,
+						sizeof(struct dedup_work));
+	if (!dedup_work_pool) {
+		r = -ENOMEM;
+		ti->error = "failed to create mempool";
+		goto bad_mempool;
+	}
+
+	dc->io_client = dm_io_client_create();
+	if (IS_ERR(dc->io_client)) {
+		r = PTR_ERR(dc->io_client);
+		ti->error = "failed to create dm_io_client";
+		goto bad_io_client;
+	}
+
+	dc->block_size = da->block_size;
+	dc->sectors_per_block = to_sector(da->block_size);
+	dc->lblocks = ti->len / dc->sectors_per_block;
+
+	data_size = i_size_read(da->data_dev->bdev->bd_inode);
+	dc->pblocks = data_size / da->block_size;
+
+	/* Meta-data backend specific part */
+	if (da->backend == BKND_INRAM) {
+		dc->mdops = &metadata_ops_inram;
+		iparam_inram.blocks = dc->pblocks;
+		iparam = &iparam_inram;
+	} else if (da->backend == BKND_COWBTREE) {
+		r = -EINVAL;
+		dc->mdops = &metadata_ops_cowbtree;
+		iparam_cowbtree.blocks = dc->pblocks;
+		iparam_cowbtree.metadata_bdev = da->meta_dev->bdev;
+		iparam = &iparam_cowbtree;
+	} else
+		BUG();
+
+	md = dc->mdops->init_meta(iparam, &unformatted);
+	if (IS_ERR(md)) {
+		ti->error = "failed to initialize backend metadata";
+		r = PTR_ERR(md);
+		goto bad_metadata_init;
+	}
+
+	dc->desc_table = desc_table_init(da->hash_algo);
+	if (!dc->desc_table || IS_ERR(dc->desc_table)) {
+		ti->error = "failed to initialize crypto API";
+		r = PTR_ERR(dc->desc_table);
+		goto bad_metadata_init;
+	}
+
+	crypto_key_size = get_hash_digestsize(dc->desc_table);
+
+	dc->kvs_hash_pbn = dc->mdops->kvs_create_sparse(md, crypto_key_size,
+				sizeof(struct hash_pbn_value),
+				dc->pblocks, unformatted);
+	if (IS_ERR(dc->kvs_hash_pbn)) {
+		r = PTR_ERR(dc->kvs_hash_pbn);
+		ti->error = "failed to create sparse KVS";
+		goto bad_kvstore_init;
+	}
+
+	dc->kvs_lbn_pbn = dc->mdops->kvs_create_linear(md, 8,
+			sizeof(struct lbn_pbn_value), dc->lblocks, unformatted);
+	if (IS_ERR(dc->kvs_lbn_pbn)) {
+		ti->error = "failed to create linear KVS";
+		r = PTR_ERR(dc->kvs_lbn_pbn);
+		goto bad_kvstore_init;
+	}
+
+	r = dc->mdops->flush_meta(md);
+	if (r < 0)
+		BUG();
+
+	if (!unformatted && dc->mdops->get_private_data) {
+		r = dc->mdops->get_private_data(md, (void **)&data,
+				sizeof(struct on_disk_stats));
+		if (r < 0)
+			BUG();
+
+		logical_block_counter = data->logical_block_counter;
+		physical_block_counter = data->physical_block_counter;
+	}
+
+	dc->data_dev = da->data_dev;
+	dc->metadata_dev = da->meta_dev;
+
+	dc->workqueue = wq;
+	dc->dedup_work_pool = dedup_work_pool;
+	dc->bmd = md;
+
+	dc->logical_block_counter = logical_block_counter;
+	dc->physical_block_counter = physical_block_counter;
+
+	dc->writes = 0;
+	dc->dupwrites = 0;
+	dc->uniqwrites = 0;
+	dc->reads_on_writes = 0;
+	dc->overwrites = 0;
+	dc->newwrites = 0;
+
+	strcpy(dc->crypto_alg, da->hash_algo);
+	dc->crypto_key_size = crypto_key_size;
+
+	dc->flushrq = flushrq;
+	dc->writes_after_flush = 0;
+
+	dm_set_target_max_io_len(ti, dc->sectors_per_block);
+	ti->private = dc;
+
+	da->meta_dev = da->data_dev = NULL;
+	r = 0;
+	goto out;
+
+bad_kvstore_init:
+	desc_table_deinit(dc->desc_table);
+bad_metadata_init:
+	if (md && !IS_ERR(md))
+		dc->mdops->exit_meta(md);
+	dm_io_client_destroy(dc->io_client);
+bad_io_client:
+	mempool_destroy(dedup_work_pool);
+bad_mempool:
+	destroy_workqueue(wq);
+bad_wq:
+	kfree(dc);
+out:
+	destroy_dedup_args(da);
+	return r;
+}
+
+static void dm_dedup_dtr_fn(struct dm_target *ti)
+{
+	struct dedup_config *dc = ti->private;
+	struct on_disk_stats data;
+	int ret;
+
+	if (dc->mdops->set_private_data) {
+		data.physical_block_counter = dc->physical_block_counter;
+		data.logical_block_counter = dc->logical_block_counter;
+
+		ret = dc->mdops->set_private_data(dc->bmd, &data,
+				sizeof(struct on_disk_stats));
+		if (ret < 0)
+			BUG();
+	}
+
+	ret = dc->mdops->flush_meta(dc->bmd);
+	if (ret < 0)
+		BUG();
+
+	flush_workqueue(dc->workqueue);
+	destroy_workqueue(dc->workqueue);
+
+	mempool_destroy(dc->dedup_work_pool);
+
+	dc->mdops->exit_meta(dc->bmd);
+
+	dm_io_client_destroy(dc->io_client);
+
+	dm_put_device(ti, dc->data_dev);
+	dm_put_device(ti, dc->metadata_dev);
+	desc_table_deinit(dc->desc_table);
+
+	kfree(dc);
+}
+
+static int mark_lbn_pbn_bitmap(void *key, int32_t ksize,
+		void *value, int32_t vsize, void *data)
+{
+	int ret = 0;
+	struct mark_and_sweep_data *ms_data =
+		(struct mark_and_sweep_data *)data;
+	uint64_t pbn_val = *((uint64_t *)value);
+
+	BUG_ON(!data);
+	BUG_ON(!ms_data->bitmap);
+	BUG_ON(pbn_val > ms_data->bitmap_len);
+
+	bitmap_set(ms_data->bitmap, pbn_val, 1);
+
+	return ret;
+}
+
+static int cleanup_hash_pbn(void *key, int32_t ksize, void *value,
+		int32_t vsize, void *data)
+{
+	int ret = 0, r = 0;
+	uint64_t pbn_val = 0;
+	struct mark_and_sweep_data *ms_data =
+		(struct mark_and_sweep_data *)data;
+	struct hash_pbn_value hashpbn_value = *((struct hash_pbn_value *)value);
+	struct dedup_config *dc = ms_data->dc;
+
+	BUG_ON(!data);
+	BUG_ON(!ms_data->bitmap);
+
+	pbn_val = hashpbn_value.pbn;
+	BUG_ON(pbn_val > ms_data->bitmap_len);
+
+	if (test_bit(pbn_val, ms_data->bitmap) == 0) {
+		ret = dc->kvs_hash_pbn->kvs_delete(dc->kvs_hash_pbn,
+							key, ksize);
+		if (ret < 0)
+			BUG();
+
+		r = dc->mdops->dec_refcount(ms_data->dc->bmd, pbn_val);
+		if (r < 0)
+			BUG();
+
+		ms_data->cleanup_count++;
+	}
+
+	return ret;
+}
+
+/*
+ * Creates a bitmap of all PBNs in use by walking through
+ * kvs_lbn_pbn. Then walks through kvs_hash_pbn and deletes
+ * any entries which do not have an lbn-to-pbn mapping.
+ */
+static int mark_and_sweep(struct dedup_config *dc)
+{
+	int err = 0;
+	uint64_t data_size = 0;
+	uint64_t bitmap_size = 0;
+	struct mark_and_sweep_data ms_data;
+
+	BUG_ON(!dc);
+
+	data_size = i_size_read(dc->data_dev->bdev->bd_inode);
+	bitmap_size = data_size / dc->block_size;
+
+	memset(&ms_data, 0, sizeof(struct mark_and_sweep_data));
+
+	ms_data.bitmap = vmalloc(BITS_TO_LONGS(bitmap_size) *
+			sizeof(unsigned long));
+	if (!ms_data.bitmap) {
+		DMERR("Could not vmalloc ms_data.bitmap");
+		err = -ENOMEM;
+		goto out;
+	}
+	bitmap_zero(ms_data.bitmap, bitmap_size);
+
+	ms_data.bitmap_len = bitmap_size;
+	ms_data.cleanup_count = 0;
+	ms_data.dc = dc;
+
+	/* Create bitmap of used pbn blocks */
+	err = dc->kvs_lbn_pbn->kvs_iterate(dc->kvs_lbn_pbn,
+			&mark_lbn_pbn_bitmap, (void *)&ms_data);
+	if (err < 0)
+		goto out_free;
+
+	/* Cleanup hashes based on above bitmap of used pbn blocks */
+	err = dc->kvs_hash_pbn->kvs_iterate(dc->kvs_hash_pbn,
+			&cleanup_hash_pbn, (void *)&ms_data);
+	if (err < 0)
+		goto out_free;
+
+	dc->physical_block_counter -= ms_data.cleanup_count;
+
+out_free:
+	vfree(ms_data.bitmap);
+out:
+	return err;
+}
+
+static int dm_dedup_message_fn(struct dm_target *ti,
+				unsigned argc, char **argv)
+{
+	int r = 0;
+
+	struct dedup_config *dc = ti->private;
+
+	if (!strcasecmp(argv[0], "mark_and_sweep")) {
+		r = mark_and_sweep(dc);
+		if (r < 0)
+			DMERR("Error in performing mark_and_sweep: %d.", r);
+	} else if (!strcasecmp(argv[0], "drop_bufio_cache")) {
+		if (dc->mdops->flush_bufio_cache)
+			dc->mdops->flush_bufio_cache(dc->bmd);
+		else
+			r = -ENOTSUPP;
+	} else
+		r = -EINVAL;
+
+	return r;
+}
+
+static int dm_dedup_endio_fn(struct dm_target *ti, struct bio *bio, int error)
+{
+	if (error || bio_data_dir(bio) != READ)
+		return 0;
+
+	return 0;
+}
+
+static struct target_type dm_dedup_target = {
+	.name = "dedup",
+	.version = {1, 0, 0},
+	.module = THIS_MODULE,
+	.features = 0x0,
+	.ctr = dm_dedup_ctr_fn,
+	.dtr = dm_dedup_dtr_fn,
+	.map = dm_dedup_map_fn,
+	.end_io = dm_dedup_endio_fn,
+	.message = dm_dedup_message_fn,
+};
+
+static int __init dm_dedup_init(void)
+{
+	int r;
+
+	r = dm_register_target(&dm_dedup_target);
+	if (r < 0)
+		DMERR("target registration failed: %d.", r);
+	else
+		DMINFO("target registered successfully.");
+
+	return r;
+}
+
+static void __exit dm_dedup_exit(void)
+{
+	dm_unregister_target(&dm_dedup_target);
+}
+
+module_init(dm_dedup_init);
+module_exit(dm_dedup_exit);
+
+MODULE_DESCRIPTION(DM_NAME " target for data deduplication");
+MODULE_LICENSE("GPL");
-- 
1.7.1

* Re: [PATCH RFCv2 02/10] dm-dedup: core deduplication logic
From: Vivek Goyal @ 2015-02-02 15:23 UTC (permalink / raw)
  To: Vasily Tarasov
  Cc: Joe Thornber, Mike Snitzer, Christoph Hellwig, dm-devel,
	Philip Shilane, Sonam Mandal, Erez Zadok

On Thu, Aug 28, 2014 at 06:02:11PM -0400, Vasily Tarasov wrote:

Hi,

I spent some time looking at this patch. A few comments are inline.

> +static void process_bio(struct dedup_config *dc, struct bio *bio)
> +{
> +	switch (bio_data_dir(bio)) {
> +	case READ:
> +		handle_read(dc, bio);
> +		break;
> +	case WRITE:
> +		handle_write(dc, bio);
> +		break;
> +	default:
> +		DMERR("Unknown request type!");
> +		bio_endio(bio, -EINVAL);
> +	}

I guess some of this error checking could go in the map function: if a bio
is not of the desired type, return an error right then and there.

> +}
> +
> +static void do_work(struct work_struct *ws)
> +{
> +	struct dedup_work *data = container_of(ws, struct dedup_work, worker);
> +	struct dedup_config *dc = (struct dedup_config *)data->config;
> +	struct bio *bio = (struct bio *)data->bio;
> +
> +	mempool_free(data, dc->dedup_work_pool);
> +
> +	process_bio(dc, bio);
> +}
> +
> +static void dedup_defer_bio(struct dedup_config *dc, struct bio *bio)
> +{
> +	struct dedup_work *data;
> +
> +	data = mempool_alloc(dc->dedup_work_pool, GFP_NOIO);
> +	if (!data) {
> +		bio_endio(bio, -ENOMEM);
> +		return;
> +	}

So why are we allocating one work struct for each bio? Why not have a
single work struct embedded in, say, dedup_config, queue incoming bios on
a list, and wake up the worker? The worker would then go through the list
of bios and process them.

It does not look like there is a need to create one work struct for each
bio.

Please take a look at how other targets have handled this.
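
For illustration, that pattern would look roughly like this (a sketch
only, assuming dedup_config gains a spinlock, a deferred bio list, and
an embedded work_struct; this is not code from the patch):

/* Sketch only: one work item embedded in dedup_config, bios queued on a
 * list under a lock. dc->worker would be INIT_WORK()ed once in the ctr. */
static void dedup_defer_bio(struct dedup_config *dc, struct bio *bio)
{
	spin_lock_irq(&dc->lock);
	bio_list_add(&dc->deferred_bios, bio);
	spin_unlock_irq(&dc->lock);

	queue_work(dc->workqueue, &dc->worker);
}

static void do_work(struct work_struct *ws)
{
	struct dedup_config *dc = container_of(ws, struct dedup_config, worker);
	struct bio_list bios;
	struct bio *bio;

	bio_list_init(&bios);

	/* grab the whole list of deferred bios in one go */
	spin_lock_irq(&dc->lock);
	bio_list_merge(&bios, &dc->deferred_bios);
	bio_list_init(&dc->deferred_bios);
	spin_unlock_irq(&dc->lock);

	while ((bio = bio_list_pop(&bios)))
		process_bio(dc, bio);
}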


[..]
> +static int dm_dedup_ctr_fn(struct dm_target *ti, unsigned int argc, char **argv)
> +{
> +	struct dedup_args *da;
> +	struct dedup_config *dc;
> +	struct workqueue_struct *wq;
> +
> +	struct init_param_inram iparam_inram;
> +	struct init_param_cowbtree iparam_cowbtree;
> +	void *iparam;
> +	struct metadata *md = NULL;
> +
> +	uint64_t data_size;
> +	int r;
> +	int crypto_key_size;
> +
> +	struct on_disk_stats *data = NULL;
> +	uint64_t logical_block_counter = 0;
> +	uint64_t physical_block_counter = 0;
> +
> +	uint32_t flushrq = 0;
> +	mempool_t *dedup_work_pool = NULL;
> +
> +	bool unformatted;
> +
> +	da = kzalloc(sizeof(*da), GFP_KERNEL);
> +	if (!da) {
> +		ti->error = "Error allocating memory for dedup arguments";
> +		return -ENOMEM;
> +	}
> +
> +	da->ti = ti;
> +
> +	r = parse_dedup_args(da, argc, argv, &ti->error);
> +	if (r)
> +		goto out;
> +
> +	dc = kmalloc(sizeof(*dc), GFP_NOIO);
> +	if (!dc) {
> +		ti->error = "Error allocating memory for dedup config";
> +		r = -ENOMEM;
> +		goto out;
> +	}
> +
> +	wq = create_singlethread_workqueue("dm-dedup");
> +	if (!wq) {
> +		ti->error = "failed to create workqueue";
> +		r = -ENOMEM;
> +		goto bad_wq;
> +	}
> +
> +	dedup_work_pool = mempool_create_kmalloc_pool(MIN_DEDUP_WORK_IO,
> +						sizeof(struct dedup_work));
> +	if (!dedup_work_pool) {
> +		r = -ENOMEM;
> +		ti->error = "failed to create mempool";
> +		goto bad_mempool;
> +	}
> +
> +	dc->io_client = dm_io_client_create();
> +	if (IS_ERR(dc->io_client)) {
> +		r = PTR_ERR(dc->io_client);
> +		ti->error = "failed to create dm_io_client";
> +		goto bad_io_client;
> +	}
> +
> +	dc->block_size = da->block_size;
> +	dc->sectors_per_block = to_sector(da->block_size);
> +	dc->lblocks = ti->len / dc->sectors_per_block;
> +
> +	data_size = i_size_read(da->data_dev->bdev->bd_inode);
> +	dc->pblocks = data_size / da->block_size;
> +
> +	/* Meta-data backend specific part */
> +	if (da->backend == BKND_INRAM) {
> +		dc->mdops = &metadata_ops_inram;
> +		iparam_inram.blocks = dc->pblocks;
> +		iparam = &iparam_inram;
> +	} else if (da->backend == BKND_COWBTREE) {
> +		r = -EINVAL;
> +		dc->mdops = &metadata_ops_cowbtree;
> +		iparam_cowbtree.blocks = dc->pblocks;
> +		iparam_cowbtree.metadata_bdev = da->meta_dev->bdev;
> +		iparam = &iparam_cowbtree;
> +	} else
> +		BUG();
> +
> +	md = dc->mdops->init_meta(iparam, &unformatted);
> +	if (IS_ERR(md)) {
> +		ti->error = "failed to initialize backend metadata";
> +		r = PTR_ERR(md);
> +		goto bad_metadata_init;
> +	}
> +
> +	dc->desc_table = desc_table_init(da->hash_algo);
> +	if (!dc->desc_table || IS_ERR(dc->desc_table)) {
> +		ti->error = "failed to initialize crypto API";
> +		r = PTR_ERR(dc->desc_table);
> +		goto bad_metadata_init;
> +	}
> +
> +	crypto_key_size = get_hash_digestsize(dc->desc_table);

Looks like this function is defined in patch 3 but used in patch 2.
That means compilation will fail if the series is applied only up to
patch 2. That's not a good idea: your patch series should be bisectable.

So any definitions you want to use, introduce them first.

Secondly, I had a quick look at patch 3, and this function also seems
to be doing the following.

+
+       slot = get_next_slot(desc_table);
+       desc = slot_to_desc(desc_table, slot);

This is odd. This function is supposed to return the size of the hash,
not do the extra magic of getting the next free slot and then the desc
associated with that slot.

Break this logic out into a separate function as appropriate.
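
For example, something along these lines (the names here are
hypothetical; the real desc_table types live in patch 3):

/* Hypothetical split, for illustration only: one helper that just
 * reports the digest size, another that hands out a free descriptor. */
static unsigned int get_hash_digestsize(struct hash_desc_table *desc_table);
static struct hash_desc *get_free_hash_desc(struct hash_desc_table *desc_table);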

[..]
> +static int dm_dedup_message_fn(struct dm_target *ti,
> +				unsigned argc, char **argv)
> +{
> +	int r = 0;
> +

I think we should document these messages in a documentation file and
explain what they do.

Also, it looks like mark_and_sweep is replaced by garbage_collect later
in the series. Why?

> +	struct dedup_config *dc = ti->private;
> +
> +	if (!strcasecmp(argv[0], "mark_and_sweep")) {
> +		r = mark_and_sweep(dc);
> +		if (r < 0)
> +			DMERR("Error in performing mark_and_sweep: %d.", r);
> +	} else if (!strcasecmp(argv[0], "drop_bufio_cache")) {
> +		if (dc->mdops->flush_bufio_cache)
> +			dc->mdops->flush_bufio_cache(dc->bmd);
> +		else
> +			r = -ENOTSUPP;
> +	} else
> +		r = -EINVAL;
> +
> +	return r;
> +}
> +
> +static int dm_dedup_endio_fn(struct dm_target *ti, struct bio *bio, int error)
> +{
> +	if (error || bio_data_dir(bio) != READ)
> +		return 0;
> +
> +	return 0;
> +}

So this function always returns 0. What's the point of that if block then?

Thanks
Vivek

* Re: [PATCH RFCv2 02/10] dm-dedup: core deduplication logic
From: Vivek Goyal @ 2015-02-05 20:21 UTC (permalink / raw)
  To: Vasily Tarasov
  Cc: Joe Thornber, Mike Snitzer, Christoph Hellwig, dm-devel,
	Philip Shilane, Sonam Mandal, Erez Zadok

On Thu, Aug 28, 2014 at 06:02:11PM -0400, Vasily Tarasov wrote:

[..]
> +static void handle_read(struct dedup_config *dc, struct bio *bio)
> +{
> +	uint64_t lbn;
> +	uint32_t vsize;
> +	struct lbn_pbn_value lbnpbn_value;
> +	int r;
> +
> +	lbn = bio_lbn(dc, bio);
> +
> +	r = dc->kvs_lbn_pbn->kvs_lookup(dc->kvs_lbn_pbn, (void *)&lbn,
> +			sizeof(lbn), (void *)&lbnpbn_value, &vsize);
> +	if (r == 0)
> +		bio_zero_endio(bio);
> +	else if (r == 1)
> +		do_io(dc, bio, lbnpbn_value.pbn);
> +	else
> +		BUG();

The kvs_lookup() return values look a little odd to me. I am wondering
how this function will ever usefully return an error, because if it
returns one, we call BUG().

I think this function should return 0 upon success and a negative error
code in case of error, and that error code should be propagated upwards.

To differentiate whether the key was found or not, maybe you can set
vsize to zero before calling the function and check its value after the
call. If it is non-zero, you know the key has been found.
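
Something along these lines in handle_read(), for example (a sketch of
the suggested convention only, not the existing code):

	/* Sketch only: kvs_lookup() returns 0 or a negative errno;
	 * vsize == 0 after the call means the key was not found. */
	vsize = 0;
	r = dc->kvs_lbn_pbn->kvs_lookup(dc->kvs_lbn_pbn, (void *)&lbn,
			sizeof(lbn), (void *)&lbnpbn_value, &vsize);
	if (r < 0) {
		bio_endio(bio, r);	/* propagate the error, don't BUG() */
		return;
	}

	if (vsize == 0)
		bio_zero_endio(bio);	/* no LBN->PBN entry */
	else
		do_io(dc, bio, lbnpbn_value.pbn);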

> +}
> +
> +static int allocate_block(struct dedup_config *dc, uint64_t *pbn_new)
> +{
> +	int r;
> +
> +	r = dc->mdops->alloc_data_block(dc->bmd, pbn_new);
> +
> +	if (!r) {
> +		dc->logical_block_counter++;

I am not sure why we are bumping up logical_block_counter here. Right
now we are just allocating a physical block and that has nothing to
do with bumping up logical block count.

> +		dc->physical_block_counter++;
> +	}
> +
> +	return r;
> +}
> +
> +static int write_to_new_block(struct dedup_config *dc, uint64_t *pbn_new,
> +				struct bio *bio, uint64_t lbn)
> +{
> +	int r = 0;
> +	struct lbn_pbn_value lbnpbn_value;
> +
> +	r = allocate_block(dc, pbn_new);
> +	if (r < 0) {
> +		r = -EIO;

Is it common practice to overwrite the error code with -EIO?

> +		return r;
> +	}
> +
> +	lbnpbn_value.pbn = *pbn_new;
> +
> +	do_io(dc, bio, *pbn_new);
> +
> +	r = dc->kvs_lbn_pbn->kvs_insert(dc->kvs_lbn_pbn, (void *)&lbn,
> +		sizeof(lbn), (void *)&lbnpbn_value, sizeof(lbnpbn_value));
> +	if (r < 0)
> +		BUG();
> +

I think we need to fix this BUG() pattern everywhere. If there is an
error, propagate it upwards instead of calling BUG(). Let the I/O fail,
but why crash the OS?

Also, I am assuming that kvs_insert() is going to allocate memory. What
if that memory allocation fails? The write to the block will succeed,
but we will never create an entry in the LBN-to-PBN kvs, so the next
time a read comes we cannot find that data, and the data is lost?

I think this needs to be thought through more carefully, and one needs
to consider all the corner error cases.
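
As a sketch of the direction in write_to_new_block(), for instance
(illustrative only; a real fix would also have to release the newly
allocated block and deal with the data that was already written):

	r = dc->kvs_lbn_pbn->kvs_insert(dc->kvs_lbn_pbn, (void *)&lbn,
		sizeof(lbn), (void *)&lbnpbn_value, sizeof(lbnpbn_value));
	if (r < 0)
		return r;	/* let the caller fail the bio instead of BUG() */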

Thanks
Vivek

* Re: [PATCH RFCv2 02/10] dm-dedup: core deduplication logic
From: Vivek Goyal @ 2015-02-06 16:53 UTC (permalink / raw)
  To: Vasily Tarasov
  Cc: Joe Thornber, Mike Snitzer, Christoph Hellwig, dm-devel,
	Philip Shilane, Sonam Mandal, Erez Zadok

On Thu, Aug 28, 2014 at 06:02:11PM -0400, Vasily Tarasov wrote:

[..]
> +/*
> + * Creates a bitmap of all PBNs in use by walking through
> + * kvs_lbn_pbn. Then walks through kvs_hash_pbn and deletes
> + * any entries which do not have an lbn-to-pbn mapping.
> + */
> +static int mark_and_sweep(struct dedup_config *dc)
> +{
> +	int err = 0;
> +	uint64_t data_size = 0;
> +	uint64_t bitmap_size = 0;
> +	struct mark_and_sweep_data ms_data;
> +
> +	BUG_ON(!dc);
> +
> +	data_size = i_size_read(dc->data_dev->bdev->bd_inode);
> +	bitmap_size = data_size / dc->block_size;
> +
> +	memset(&ms_data, 0, sizeof(struct mark_and_sweep_data));
> +
> +	ms_data.bitmap = vmalloc(BITS_TO_LONGS(bitmap_size) *
> +			sizeof(unsigned long));
> +	if (!ms_data.bitmap) {
> +		DMERR("Could not vmalloc ms_data.bitmap");
> +		err = -ENOMEM;
> +		goto out;
> +	}
> +	bitmap_zero(ms_data.bitmap, bitmap_size);
> +
> +	ms_data.bitmap_len = bitmap_size;
> +	ms_data.cleanup_count = 0;
> +	ms_data.dc = dc;
> +
> +	/* Create bitmap of used pbn blocks */
> +	err = dc->kvs_lbn_pbn->kvs_iterate(dc->kvs_lbn_pbn,
> +			&mark_lbn_pbn_bitmap, (void *)&ms_data);
> +	if (err < 0)
> +		goto out_free;
> +
> +	/* Cleanup hashes based on above bitmap of used pbn blocks */
> +	err = dc->kvs_hash_pbn->kvs_iterate(dc->kvs_hash_pbn,
> +			&cleanup_hash_pbn, (void *)&ms_data);

How does locking work in general for this target? So far I have not
seen any locks being taken anywhere. I want to know what the thought
process is, which locks are where, and what they are protecting.

For example, here we first create a bitmap of the PBNs in use and then
go on to remove the unused ones. How do we know that some unused PBN
did not get reused by the time we called cleanup_hash_pbn()?

I see that the last patch in the series removes the mark_and_sweep logic
and replaces it with garbage_collect. Please rework the series so that
mark_and_sweep is not introduced to begin with.

Thanks
Vivek
