From: Jonathan Brassow <jbrassow@redhat.com>
To: dm-devel@redhat.com
Subject: [PATCH 1 of 1] DM Exception Store:  clusterized type
Date: Wed, 25 Mar 2009 16:46:14 -0500
Message-ID: <200903252146.n2PLkExv025563@hydrogen.msp.redhat.com>

Patch name: dm-exception-store-clusterized-type.patch

This patch provides an exception store implementation that is
capable of "wrapping" other exception store implementations and
making them cluster-aware.  It is not a stand-alone implementation.
It merely uses distributed locking to protect the exception store
metadata while the single-machine "core" exception stores perform
their actions independently.  This is why the module uses the
term "clusterized" instead of "clustered".  This is just a toy
right now.  I'm not sure how it will perform - I still have more
optimizations to do.

Preliminary testing shows that the concept works to provide
cluster-aware snapshots.  Testing is somewhat difficult, and there
is quite a bit left to do.  I have a script that converts
single-machine snapshots to cluster-aware snapshots, which I am
willing to share with those who wish to test them.
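
For reference, the clusterized store takes the same constructor
arguments as the exception store it wraps, plus a trailing
cluster_uuid:<UUID> argument (see clusterized_ctr below).  As a
rough, hypothetical sketch only - the device names, sizes, chunk
size and UUID are placeholders, and the exact snapshot table layout
depends on the exception store interface in this tree - a converted
snapshot table might look something like:

  0 2097152 snapshot /dev/vg0/origin /dev/vg0/cow \
      clusterized-persistent 8 cluster_uuid:f00f-example-uuid

The "clusterized-persistent" name selects this wrapper around the
standard persistent store, and the UUID is used as the DLM lock
resource name shared by all nodes accessing the snapshot.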

Index: linux-2.6/drivers/md/Kconfig
===================================================================
--- linux-2.6.orig/drivers/md/Kconfig
+++ linux-2.6/drivers/md/Kconfig
@@ -244,10 +244,10 @@ config DM_CRYPT
 	  If unsure, say N.
 
 config DM_SNAPSHOT
-       tristate "Snapshot target"
-       depends on BLK_DEV_DM
-       ---help---
-         Allow volume managers to take writable snapshots of a device.
+	tristate "Snapshot target"
+	depends on BLK_DEV_DM
+	---help---
+	  Allow volume managers to take writable snapshots of a device.
 
 config DM_EXSTORE_SHARED
 	tristate "Shared exception store (EXPERIMENTAL)"
@@ -257,6 +257,19 @@ config DM_EXSTORE_SHARED
 	  yields space and performance gains when more than one
 	  snapshot is taken of a device.
 
+config DM_EXSTORE_CLUSTERIZED
+	tristate "Cluster-aware exception store wrapper (EXPERIMENTAL)"
+	depends on BLK_DEV_DM && DM_SNAPSHOT
+	select DLM
+	---help---
+	  An exception store is a module that is used by snapshots to
+	  record COW areas.  This module is capable of wrapping certain
+	  exception stores so that they appear to be cluster-aware.  This
+	  has the effect of making device-mapper snapshots cluster-aware.
+	  Not every exception store type can be wrapped.  Check the end
+	  of drivers/md/dm-ex-store-clusterized.c to find out what stores
+	  are supported.
+
 config DM_MIRROR
        tristate "Mirror target"
        depends on BLK_DEV_DM
Index: linux-2.6/drivers/md/Makefile
===================================================================
--- linux-2.6.orig/drivers/md/Makefile
+++ linux-2.6/drivers/md/Makefile
@@ -8,6 +8,7 @@ dm-multipath-objs := dm-path-selector.o 
 dm-snapshot-objs := dm-snap.o dm-exception.o dm-exception-store.o \
 		    dm-snap-persistent.o dm-snap-transient.o
 dm-exstore-shared-objs := dm-ex-store-shared.o
+dm-exstore-clusterized-objs := dm-ex-store-clusterized.o
 dm-mirror-objs	:= dm-raid1.o
 md-mod-objs     := md.o bitmap.o
 raid456-objs	:= raid5.o raid6algos.o raid6recov.o raid6tables.o \
@@ -37,6 +38,7 @@ obj-$(CONFIG_DM_DELAY)		+= dm-delay.o
 obj-$(CONFIG_DM_MULTIPATH)	+= dm-multipath.o dm-round-robin.o
 obj-$(CONFIG_DM_SNAPSHOT)	+= dm-snapshot.o
 obj-$(CONFIG_DM_EXSTORE_SHARED) += dm-exstore-shared.o
+obj-$(CONFIG_DM_EXSTORE_CLUSTERIZED) += dm-exstore-clusterized.o
 obj-$(CONFIG_DM_MIRROR)		+= dm-mirror.o dm-log.o dm-region-hash.o
 obj-$(CONFIG_DM_ZERO)		+= dm-zero.o
 
Index: linux-2.6/drivers/md/dm-ex-store-clusterized.c
===================================================================
--- /dev/null
+++ linux-2.6/drivers/md/dm-ex-store-clusterized.c
@@ -0,0 +1,522 @@
+/*
+ * Copyright (C) 2009 Red Hat, Inc. All rights reserved.
+ *
+ * Device-mapper exception structure and associated functions.
+ *
+ * This file is released under the GPL.
+ */
+#include <linux/device-mapper.h>
+#include <linux/dlm.h>
+#include "dm-exception-store.h"
+
+#define DM_MSG_PREFIX "clusterized exception store"
+
+struct clusterized_c {
+	struct dm_exception_store *core_store;
+
+	atomic_t prepared_exceptions;
+
+	struct completion completion;
+
+	int current_lock_mode;
+	struct semaphore serialize; /* serialize DLM lock modes */
+	dlm_lockspace_t *lockspace;
+	struct dlm_lksb lksb;
+
+	uint64_t metadata_counter;
+	uint64_t cluster_metadata_counter;
+
+	char uuid[0]; /* must be last */
+};
+
+static void lock_obtained(void *context)
+{
+	struct clusterized_c *cc = context;
+
+	complete(&cc->completion);
+}
+
+static int cluster_lock(struct clusterized_c *cc, int mode)
+{
+	int r;
+	uint32_t flags = DLM_LKF_VALBLK;
+
+	down(&cc->serialize);
+	if (mode == DLM_LOCK_NL) { /* Only for first acquisition */
+		flags |= DLM_LKF_EXPEDITE;
+		up(&cc->serialize);
+	} else if (mode == cc->current_lock_mode)
+		DMERR("*** Lock already acquired in the requested mode ***");
+	else
+		flags |= DLM_LKF_CONVERT;
+
+	r = dlm_lock(cc->lockspace, mode, &cc->lksb,
+		     flags, cc->uuid, strlen(cc->uuid), 0,
+		     lock_obtained, cc, NULL);
+
+	if (r) {
+		DMERR("cluster_lock failure: %d", r);
+		up(&cc->serialize);
+		return r;
+	}
+
+	wait_for_completion(&cc->completion);
+
+	if (cc->lksb.sb_status) {
+		DMERR("cluster_lock failure: -EAGAIN (sb_status = %d)",
+		      cc->lksb.sb_status);
+		return -EAGAIN; /* not entirely true for unlock ops */
+	}
+
+	cc->current_lock_mode = mode;
+	return 0;
+}
+
+/*
+ * cluster_unlock
+ * @cc
+ *
+ * Doesn't completely unlock, but rather puts the lock back into
+ * the DLM_LOCK_NL mode.  This preserves the LVB.
+ *
+ */
+static int cluster_unlock(struct clusterized_c *cc)
+{
+	int r;
+	uint32_t flags = DLM_LKF_VALBLK;
+
+	if (cc->current_lock_mode == DLM_LOCK_NL) {
+		DMERR("Final unlock issued");
+		dlm_unlock(cc->lockspace, cc->lksb.sb_lkid,
+			   DLM_LKF_FORCEUNLOCK, &cc->lksb, cc);
+		/* FIXME: do I need wait_for_completion? */
+		return 0;
+	}
+
+	flags |= DLM_LKF_CONVERT;
+
+	if (cc->current_lock_mode == DLM_LOCK_EX) {
+		/* FIXME: endian issues? */
+		if (cc->metadata_counter != cc->cluster_metadata_counter)
+			cc->cluster_metadata_counter = cc->metadata_counter;
+	}
+
+	r = dlm_lock(cc->lockspace, DLM_LOCK_NL, &cc->lksb,
+		     flags, cc->uuid, strlen(cc->uuid), 0,
+		     lock_obtained, cc, NULL);
+
+	if (r) {
+		DMERR("cluster_unlock failed to convert to NL: %d", r);
+		up(&cc->serialize);
+		return r;
+	}
+
+	wait_for_completion(&cc->completion);
+
+	if (cc->lksb.sb_status) {
+		DMERR("cluster_unlock failure: -EAGAIN (sb_status = %d)",
+		      cc->lksb.sb_status);
+		return -EAGAIN; /* not entirely true for unlock ops */
+	}
+
+	cc->current_lock_mode = DLM_LOCK_NL;
+	up(&cc->serialize);
+	return 0;
+}
+
+/*
+ * clusterized_ctr
+ * @store
+ * @argc
+ * @argv
+ *
+ * The mapping table will be the same as the exception
+ * store it is covering, but will also include the
+ * argument:
+ *	<non-clustered args> cluster_uuid:<UUID>
+ *
+ * Returns: 0 on success, -EXXX on failure
+ */
+static int clusterized_ctr(struct dm_exception_store *store,
+			   unsigned argc, char **argv)
+{
+	int r;
+	unsigned i, j, len;
+	unsigned my_argc = argc + 1;
+	char *my_argv[my_argc];
+	char chunk_size_str[32];
+	char *core_name;
+	struct clusterized_c *cc = NULL;
+
+	/*
+	 * First, in order to pass down to non-clustered
+	 * core, we must add back the COW and chunk size
+	 * arguments
+	 */
+	my_argv[0] = store->cow->name;
+	sprintf(chunk_size_str, "%llu", (unsigned long long)store->chunk_size);
+	my_argv[1] = chunk_size_str;
+
+	/* Now we strip off the cluster_uuid argument */
+	argc--;
+	if (strncmp("cluster_uuid:", argv[argc], 13)) {
+		DMERR("No 'cluster_uuid:' argument provided.");
+		return -EINVAL;
+	}
+	for (i = 0, j = 2; i < argc; i++, j++)
+		my_argv[j] = argv[i];
+
+	/*
+	 * We just want to count the actual UUID, plus 1
+	 * for the trailing NULL.  (With MAX size being
+	 * what is able to fit in the LVB of a DLM lock.)
+	 */
+	len = strlen(argv[argc] + 13) + 1;
+	len = (len > DLM_RESNAME_MAXLEN) ? DLM_RESNAME_MAXLEN : len;
+	cc = kzalloc(sizeof(*cc) + len, GFP_KERNEL);
+	if (!cc)
+		return -ENOMEM;
+	strncpy(cc->uuid, argv[argc] + 13, len);
+	cc->lksb.sb_lvbptr = (char *)&cc->cluster_metadata_counter;
+
+	init_completion(&cc->completion);
+
+	init_MUTEX(&cc->serialize);
+
+	/* Create (or join) the lock space */
+	r = dlm_new_lockspace(store->type->name, strlen(store->type->name),
+			      &cc->lockspace, 0, sizeof(uint64_t));
+
+	if (r) {
+		DMERR("Unable to create DLM lockspace for %s",
+		      store->type->name);
+		kfree(cc);
+		return r;
+	}
+	r = cluster_lock(cc, DLM_LOCK_NL);
+
+	/*
+	 * Now we find the non-clustered exception store name.
+ * It will be whatever is left when we strip 'clusterized-' off.
+	 */
+	core_name = strstr(store->type->name, "-");
+	BUG_ON(!core_name);
+	core_name++;
+
+	r = dm_exception_store_create(core_name, store->ti, my_argc, my_argv,
+				      &cc->core_store);
+
+	if (r) {
+		DMERR("Failed to create foundational exception store, %s",
+		      core_name);
+		dlm_release_lockspace(cc->lockspace, 1);
+		kfree(cc);
+		return r;
+	}
+
+	/* If the core store is shared, we are shared */
+	store->shared_uuid = cc->core_store->shared_uuid;
+
+	store->context = cc;
+
+	return 0;
+}
+
+static void clusterized_dtr(struct dm_exception_store *store)
+{
+	struct clusterized_c *cc = store->context;
+
+	cc->core_store->type->dtr(cc->core_store);
+	cluster_unlock(cc);
+	dlm_release_lockspace(cc->lockspace, 1);
+	kfree(cc);
+}
+
+static int clusterized_resume(struct dm_exception_store *store)
+{
+	int r;
+	struct clusterized_c *cc = store->context;
+
+	cluster_lock(cc, DLM_LOCK_CR);
+
+	r = cc->core_store->type->resume(cc->core_store);
+	cc->metadata_counter = cc->cluster_metadata_counter;
+
+	cluster_unlock(cc);
+
+	return r;
+}
+
+static void clusterized_presuspend(struct dm_exception_store *store)
+{
+	struct clusterized_c *cc = store->context;
+
+	if (cc->core_store->type->presuspend)
+		cc->core_store->type->presuspend(cc->core_store);
+}
+
+static void clusterized_postsuspend(struct dm_exception_store *store)
+{
+	struct clusterized_c *cc = store->context;
+
+	if (cc->core_store->type->postsuspend)
+		cc->core_store->type->postsuspend(cc->core_store);
+}
+
+static int clusterized_prepare_exception(struct dm_exception_store *store,
+					 struct dm_exception *e, int group)
+{
+	int r;
+	struct clusterized_c *cc = store->context;
+
+	if (atomic_inc_return(&cc->prepared_exceptions) == 1)
+		cluster_lock(cc, DLM_LOCK_EX);
+
+	r = cc->core_store->type->prepare_exception(cc->core_store, e, group);
+
+	if (r) {
+		DMERR("Core store failed to prepare_exception");
+		atomic_dec(&cc->prepared_exceptions);
+		cluster_unlock(cc);
+	}
+
+	return r;
+}
+
+/* cbc - callback context */
+struct cbc {
+	struct clusterized_c *cc;
+
+	void (*callback) (void *, int success);
+	void *callback_data;
+};
+
+static void commit_callback(void *data, int success)
+{
+	struct cbc *context = data;
+
+	context->cc->metadata_counter++;
+	if (atomic_dec_and_test(&context->cc->prepared_exceptions))
+		cluster_unlock(context->cc);
+
+	context->callback(context->callback_data, success);
+	kfree(context);
+}
+
+static void clusterized_commit_exception(struct dm_exception_store *store,
+					 struct dm_exception *e,
+					 void (*callback) (void *, int success),
+					 void *callback_context)
+{
+	struct clusterized_c *cc = store->context;
+	struct cbc *cbc;
+
+	cbc = kmalloc(sizeof(*cbc), GFP_NOIO);
+	if (!cbc) {
+		callback(callback_context, 0);
+		return;
+	}
+
+	cbc->cc = cc;
+	cbc->callback = callback;
+	cbc->callback_data = callback_context;
+
+	cc->core_store->type->commit_exception(cc->core_store, e,
+					       commit_callback, cbc);
+}
+
+/*
+ * clusterized_lookup_exception
+ * @store
+ * @old
+ * @new: NULL if they don't want data back
+ * @group
+ * @can_block
+ *
+ * A "shared" exception store can alter the metadata
+ * outside the scope of our cluster-wide LVB counter.
+ * We have no way of knowing whether we need to re-read/resume
+ * the metadata if a "shared" exception store is in use.
+ *
+ * We could re-read the metadata regardless, but that seems
+ * like an awful waste... just don't allow "shared"
+ * exception stores right now (enforced in the ctr).
+ *
+ * Returns: 0 if found, -ENOENT if not found, -Exxx otherwise
+ */
+static int clusterized_lookup_exception(struct dm_exception_store *store,
+					chunk_t old, chunk_t *new,
+					int group, int can_block)
+{
+	int r;
+	struct clusterized_c *cc = store->context;
+
+	/*
+	 * Even if the metadata counters don't match, we don't
+	 * need to re-read the metadata if we can find the
+	 * exception right now.  In fact, we don't even need to
+	 * take out the cluster lock if we are just looking in our
+	 * local cache.
+	 */
+	r = cc->core_store->type->lookup_exception(cc->core_store, old,
+						   new, group, can_block);
+
+	/* If we found the exception or there was an error, we can return */
+	if (r != -ENOENT)
+		return r;
+
+	/* We block when we acquire the DLM lock - respect !can_block */
+	if (!can_block)
+		return -EWOULDBLOCK;
+
+	cluster_lock(cc, DLM_LOCK_CR);
+
+	/*
+	 * If a "shared" core exception store is used, then the
+	 * metadata_counter is incapable of keeping track of all
+	 * changes that occur, so we must re-read the metadata
+	 * (i.e. resume).
+	 */
+	if (!store->shared_uuid &&
+	    (cc->cluster_metadata_counter == cc->metadata_counter)) {
+		/*
+		 * Exception was not found, and the metadata was not
+	 * changed by another node.
+		 */
+		cluster_unlock(cc);
+		return -ENOENT;
+	}
+
+	/*
+	 * The core exception store's resume method must be capable of
+	 * re-reading its metadata and updating its cache.  IOW, it must
+	 * be able to resume multiple times before a suspend is issued.
+	 */
+	cc->core_store->type->resume(cc->core_store);
+
+	cc->metadata_counter = cc->cluster_metadata_counter;
+	cluster_unlock(cc);
+
+	/* Now, try to find the exception again. */
+	r = cc->core_store->type->lookup_exception(cc->core_store, old,
+						   new, group, can_block);
+	return r;
+}
+
+static void clusterized_fraction_full(struct dm_exception_store *store,
+				      sector_t *numerator, sector_t *denominator)
+{
+	struct clusterized_c *cc = store->context;
+
+	/*
+	 * FIXME: If we want more exact numbers, then we should
+	 * check the LVB for changes and potentially force the
+	 * core store to re-read metadata.
+	 */
+	cc->core_store->type->fraction_full(cc->core_store, numerator,
+					    denominator);
+}
+
+static unsigned clusterized_status(struct dm_exception_store *store,
+				   status_type_t status, char *result,
+				   unsigned int maxlen)
+{
+	int sz = 0;
+	char *tmp_result;
+	struct clusterized_c *cc = store->context;
+
+	switch (status) {
+	case STATUSTYPE_INFO:
+		break;
+	case STATUSTYPE_TABLE:
+		DMEMIT(" clusterized");
+		tmp_result = result + sz;
+		sz += cc->core_store->type->status(cc->core_store, status,
+						   result+sz, maxlen-sz);
+		tmp_result[0] = '-'; /* s/ /-/ */
+
+		/* FIXME: inc parameter count to account for cluster_uuid */
+
+		DMEMIT(" cluster_uuid:%s", cc->uuid);
+	}
+
+	return sz;
+}
+
+static int clusterized_message(struct dm_exception_store *store,
+			       unsigned argc, char **argv)
+{
+	int r;
+	struct clusterized_c *cc = store->context;
+
+	cluster_lock(cc, DLM_LOCK_EX);
+
+	r = cc->core_store->type->message(cc->core_store, argc, argv);
+
+	cc->metadata_counter++;
+	cluster_unlock(cc);
+
+	return r;
+}
+
+/*
+ * Here is where we define what core exception store types are
+ * valid for this module to clusterize.  The necessary qualities
+ * of the core exception store are:
+ *	1) Must be able to resume multiple times (i.e. re-read
+ *	   its metadata).  This is because other nodes are allowed
+ *	   to add/alter the metadata underneath you.  Ideally, only
+ *	   the deltas will be picked up when the metadata is
+ *	   re-read - as is the case with the "persistent" store.
+ *	*2) Must not be a "shared" exception store.  IOW, the alteration
+ *	   of one exception store cannot affect another.  Currently, this
+ *	   situation is not adequately handled (but could be handled if
+ *	   people really want it).
+ *
+ * If the above conditions are met, then you can simply add an additional
+ * 'dm_exception_store_type' below.  In fact, you could copy the block of
+ * code that is there and replace 'persistent' with the name of the
+ * exception store type that is being covered.
+ */
+static struct dm_exception_store_type _clusterized_persistent = {
+	.name = "clusterized-persistent",
+	.module = THIS_MODULE,
+	.ctr = clusterized_ctr,
+	.dtr = clusterized_dtr,
+	.resume = clusterized_resume,
+	.presuspend = clusterized_presuspend,
+	.postsuspend = clusterized_postsuspend,
+	.prepare_exception = clusterized_prepare_exception,
+	.commit_exception = clusterized_commit_exception,
+	.lookup_exception = clusterized_lookup_exception,
+	.fraction_full = clusterized_fraction_full,
+	.status = clusterized_status,
+	.message = clusterized_message,
+};
+
+static int __init dm_clusterized_exception_store_init(void)
+{
+	int r;
+
+	r = dm_exception_store_type_register(&_clusterized_persistent);
+	if (r)
+		DMERR("Unable to register clusterized-persistent"
+		      " exception store type: %d", r);
+	else
+		DMINFO("(built %s %s) installed", __DATE__, __TIME__);
+
+	return r;
+}
+
+static void __exit dm_clusterized_exception_store_exit(void)
+{
+	dm_exception_store_type_unregister(&_clusterized_persistent);
+	DMINFO("(built %s %s) removed", __DATE__, __TIME__);
+}
+
+module_init(dm_clusterized_exception_store_init);
+module_exit(dm_clusterized_exception_store_exit);
+
+MODULE_DESCRIPTION(DM_MSG_PREFIX);
+MODULE_AUTHOR("Jonathan Brassow <jbrassow@redhat.com>");
+MODULE_LICENSE("GPL");
