All of lore.kernel.org
 help / color / mirror / Atom feed
* [Cluster-devel] cluster/cmirror-kernel/src dm-cmirror-client.c ...
@ 2007-02-20 19:35 jbrassow
  0 siblings, 0 replies; 40+ messages in thread
From: jbrassow @ 2007-02-20 19:35 UTC (permalink / raw)
  To: cluster-devel.redhat.com

CVSROOT:	/cvs/cluster
Module name:	cluster
Branch: 	RHEL4
Changes by:	jbrassow at sourceware.org	2007-02-20 19:35:10

Modified files:
	cmirror-kernel/src: dm-cmirror-client.c dm-cmirror-cman.c 
	                    dm-cmirror-cman.h dm-cmirror-server.c 

Log message:
	Bug 217895: lost election results from cmirror server cause mirror ...
	
	There was a race happening as a result of simultaneous cman issued
	'starts'.  The client receives the start requests, but the server
	processes them.  So, it was possible for the server to reset the
	event id/type while the client was trying to set them.  This would
	cause the next kcl_start_done command issued by the server to fail.
	
	The bug can be interpreted many different ways depending on which
	machine in the cluster you are looking at when it happens.
	
	The fix was to have the client wait to set the event id/type until
	it knows the server has completed the previous request.
	
	This fix may resolve other bugs as well, but I will test them
	individually.

Patches:
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cmirror-kernel/src/dm-cmirror-client.c.diff?cvsroot=cluster&only_with_tag=RHEL4&r1=1.1.2.37&r2=1.1.2.38
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cmirror-kernel/src/dm-cmirror-cman.c.diff?cvsroot=cluster&only_with_tag=RHEL4&r1=1.1.2.5&r2=1.1.2.6
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cmirror-kernel/src/dm-cmirror-cman.h.diff?cvsroot=cluster&only_with_tag=RHEL4&r1=1.1.2.1&r2=1.1.2.2
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cmirror-kernel/src/dm-cmirror-server.c.diff?cvsroot=cluster&only_with_tag=RHEL4&r1=1.1.2.22&r2=1.1.2.23

--- cluster/cmirror-kernel/src/Attic/dm-cmirror-client.c	2007/02/19 16:29:42	1.1.2.37
+++ cluster/cmirror-kernel/src/Attic/dm-cmirror-client.c	2007/02/20 19:35:10	1.1.2.38
@@ -45,6 +45,7 @@
 static int shutting_down=0;
 static atomic_t suspend_client;
 static wait_queue_head_t suspend_client_queue;
+static wait_queue_head_t event_queue;
 
 static DECLARE_MUTEX(consult_server_lock);
 
@@ -1228,8 +1229,11 @@
 	kcl_get_node_by_nodeid(0, &node);
 	my_id = node.node_id;
 
+	/* Wait for any outstanding starts to complete */
+	suspend_on(&event_queue, atomic_read(&restart_event_type));
+
 	restart_event_id = event_id;
-	restart_event_type = type;
+	atomic_set(&restart_event_type, type);
 
 	switch(type){
 	case SERVICE_NODE_LEAVE:
@@ -1391,6 +1395,7 @@
 	}
 
 	init_waitqueue_head(&suspend_client_queue);
+	init_waitqueue_head(&event_queue);
 
 	r = dm_register_dirty_log_type(&_clustered_core_type);
 	if (r) {
--- cluster/cmirror-kernel/src/Attic/dm-cmirror-cman.c	2006/06/15 19:48:00	1.1.2.5
+++ cluster/cmirror-kernel/src/Attic/dm-cmirror-cman.c	2007/02/20 19:35:10	1.1.2.6
@@ -27,7 +27,7 @@
 int global_count=0;
 uint32_t *global_nodeids=NULL;
 
-int restart_event_type=0;
+atomic_t restart_event_type = ATOMIC_INIT(0);
 int restart_event_id=0;
 
 uint32_t nodeid_to_ipaddr(uint32_t nodeid){
--- cluster/cmirror-kernel/src/Attic/dm-cmirror-cman.h	2005/07/27 16:09:31	1.1.2.1
+++ cluster/cmirror-kernel/src/Attic/dm-cmirror-cman.h	2007/02/20 19:35:10	1.1.2.2
@@ -12,7 +12,7 @@
 extern int global_count;
 extern uint32_t *global_nodeids;
 
-extern int restart_event_type;
+extern atomic_t restart_event_type;
 extern int restart_event_id;
 
 uint32_t nodeid_to_ipaddr(uint32_t nodeid);
--- cluster/cmirror-kernel/src/Attic/dm-cmirror-server.c	2007/02/19 16:29:42	1.1.2.22
+++ cluster/cmirror-kernel/src/Attic/dm-cmirror-server.c	2007/02/20 19:35:10	1.1.2.23
@@ -1067,7 +1067,7 @@
 		}
 
 		suspend_on(&_suspend_queue, atomic_read(&_suspend));
-		switch(restart_event_type){
+		switch(atomic_read(&restart_event_type)){
 		case SERVICE_NODE_LEAVE:
 			/* ATTENTION -- may wish to check if regions **
 			** are still in use by this node.  For now,  **
@@ -1076,7 +1076,7 @@
 			** leaving node, it won't hurt anything - and**
 			** if there is, they will be recovered.      */
 		case SERVICE_NODE_FAILED:
-			if (restart_event_type == SERVICE_NODE_FAILED)
+			if (atomic_read(&restart_event_type) == SERVICE_NODE_FAILED)
 				DMINFO("A cluster mirror log member has failed.");
 			
 			list_for_each_entry(lc, &log_list_head, log_list){
@@ -1095,10 +1095,13 @@
 		}
 		
 		
-		if(restart_event_type){
+		if(atomic_read(&restart_event_type)){
 			/* finish the start phase */
 			kcl_start_done(local_id, restart_event_id);
-			restart_event_id = restart_event_type = 0;
+			restart_event_id = 0;
+
+			/* Trigger any waiting starts to proceed */
+			atomic_set(&restart_event_type, 0);
 		} else if (atomic_read(&_do_requests)) {
 			/* ATTENTION -- what to do with error ? */
 			if(process_log_request(sock))



^ permalink raw reply	[flat|nested] 40+ messages in thread

* [Cluster-devel] cluster/cmirror-kernel/src dm-cmirror-client.c ...
@ 2007-10-03 19:02 jbrassow
  0 siblings, 0 replies; 40+ messages in thread
From: jbrassow @ 2007-10-03 19:02 UTC (permalink / raw)
  To: cluster-devel.redhat.com

CVSROOT:	/cvs/cluster
Module name:	cluster
Branch: 	RHEL4
Changes by:	jbrassow at sourceware.org	2007-10-03 19:02:52

Modified files:
	cmirror-kernel/src: dm-cmirror-client.c dm-cmirror-server.c 

Log message:
	Bug 316031: dm-mirror: incorrect order of mirror presuspend ops caus...
	
	With kernel (dm-raid1.c) presuspend changes, we can now tell when
	recovery has been shutdown and when we can allow blocked writes.
	
	This should fix a hang issue when converting from one mirror type
	to another.

Patches:
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cmirror-kernel/src/dm-cmirror-client.c.diff?cvsroot=cluster&only_with_tag=RHEL4&r1=1.1.2.54&r2=1.1.2.55
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cmirror-kernel/src/dm-cmirror-server.c.diff?cvsroot=cluster&only_with_tag=RHEL4&r1=1.1.2.40&r2=1.1.2.41

--- cluster/cmirror-kernel/src/Attic/dm-cmirror-client.c	2007/09/27 20:31:18	1.1.2.54
+++ cluster/cmirror-kernel/src/Attic/dm-cmirror-client.c	2007/10/03 19:02:51	1.1.2.55
@@ -592,6 +592,8 @@
 			 * should start out suspended.
 			 */
 			atomic_set(&lc->suspended, 1);
+			lc->recovery_halted = 1;
+			DMDEBUG("Secondary log... suspended and recovery_halted");
 		}
 	}
 
@@ -764,6 +766,11 @@
 
 static int cluster_presuspend(struct dirty_log *log)
 {
+	struct log_c *lc = (struct log_c *) log->context;
+
+	DMDEBUG("cluster_presuspend: recovery halted on %s(%d)",
+		lc->uuid + (strlen(lc->uuid) - 8), lc->uuid_ref);
+	lc->recovery_halted = 1;
 	return 0;
 }
 
@@ -834,8 +841,9 @@
 	struct log_c *lc = (struct log_c *) log->context;
 
 	lc->sync_search = 0;
-	lc->recovery_halted = 0;
 	resume_server_requests();
+	DMDEBUG("cluster_resume: Setting recovery_halted = 0");
+	lc->recovery_halted = 0;
 	atomic_set(&lc->suspended, 0);
 
 	return 0;
@@ -1253,6 +1261,8 @@
 						(atomic_read(&tmp_lc->in_sync)) ? "YES" : "NO");
 					DMDEBUG("  suspended   : %s",
 						(atomic_read(&tmp_lc->suspended)) ? "YES" : "NO");
+					DMDEBUG("  recovery_halted : %s",
+						(tmp_lc->recovery_halted) ? "YES" : "NO");
 					DMDEBUG("  server_id   : %u", tmp_lc->server_id);
 					DMDEBUG("  server_valid: %s",
 						((tmp_lc->server_id != 0xDEAD) &&
--- cluster/cmirror-kernel/src/Attic/dm-cmirror-server.c	2007/09/27 20:31:18	1.1.2.40
+++ cluster/cmirror-kernel/src/Attic/dm-cmirror-server.c	2007/10/03 19:02:51	1.1.2.41
@@ -468,9 +468,15 @@
 		lr->u.lr_int_rtn = 1;
 
 		/* Try to make this region a priority */
+		/*
 		if ((lr->u.lr_region != lc->recovering_region) &&
 		    (lc->recovering_next == (uint64_t)-1))
 			lc->recovering_next = lr->u.lr_region;
+		*/
+		if ((lr->u.lr_region != lc->recovering_region) &&
+		    ((lc->recovering_next == (uint64_t)-1) ||
+		     (lc->recovering_next > lr->u.lr_region)))
+			lc->recovering_next = lr->u.lr_region;		
 		return 0;
 	}
 
@@ -728,6 +734,7 @@
 	 * failed.  In this case, there will not be a record for
 	 * the region.
 	 */
+	DMDEBUG("server_complete_resync_work - Setting recovery_halted = 1");
 	lc->recovery_halted = 1;
 
 	ru = find_ru(lc, who, lr->u.lr_region);



^ permalink raw reply	[flat|nested] 40+ messages in thread

* [Cluster-devel] cluster/cmirror-kernel/src dm-cmirror-client.c ...
@ 2007-09-27 20:31 jbrassow
  0 siblings, 0 replies; 40+ messages in thread
From: jbrassow @ 2007-09-27 20:31 UTC (permalink / raw)
  To: cluster-devel.redhat.com

CVSROOT:	/cvs/cluster
Module name:	cluster
Branch: 	RHEL4
Changes by:	jbrassow at sourceware.org	2007-09-27 20:31:20

Modified files:
	cmirror-kernel/src: dm-cmirror-client.c dm-cmirror-common.h 
	                    dm-cmirror-server.c 

Log message:
	Bug 290821: cmirror write path appears deadlocked after recovery ...
	
	In some device failure cases, regions must be marked 'out-of-sync' -
	this was causing a following write to block because it thought the
	region had not yet been recovered - when in fact, it had just been
	put out-of-sync due to failing device.

Patches:
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cmirror-kernel/src/dm-cmirror-client.c.diff?cvsroot=cluster&only_with_tag=RHEL4&r1=1.1.2.53&r2=1.1.2.54
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cmirror-kernel/src/dm-cmirror-common.h.diff?cvsroot=cluster&only_with_tag=RHEL4&r1=1.1.2.15&r2=1.1.2.16
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cmirror-kernel/src/dm-cmirror-server.c.diff?cvsroot=cluster&only_with_tag=RHEL4&r1=1.1.2.39&r2=1.1.2.40

--- cluster/cmirror-kernel/src/Attic/dm-cmirror-client.c	2007/09/26 03:15:40	1.1.2.53
+++ cluster/cmirror-kernel/src/Attic/dm-cmirror-client.c	2007/09/27 20:31:18	1.1.2.54
@@ -773,6 +773,7 @@
 	struct region_state *rs, *tmp_rs;
 	struct log_c *lc = (struct log_c *) log->context;
 
+	DMDEBUG("cluster_postsuspend");
 	spin_lock(&lc->state_lock);
 	if (!list_empty(&lc->mark_waiting)) {
 		DMERR("Mark requests remain at postsuspend!");
@@ -833,6 +834,7 @@
 	struct log_c *lc = (struct log_c *) log->context;
 
 	lc->sync_search = 0;
+	lc->recovery_halted = 0;
 	resume_server_requests();
 	atomic_set(&lc->suspended, 0);
 
@@ -861,7 +863,7 @@
 {
 	int rtn;
 	struct log_c *lc = (struct log_c *) log->context;
- 	 
+	 
 	if (atomic_read(&lc->in_sync) == 1) {
 		return 0;
 	}
@@ -1170,6 +1172,10 @@
 	region_t rtn;
 	struct log_c *lc = (struct log_c *) log->context;
 
+	if (atomic_read(&lc->suspended)) {
+		return (atomic_read(&lc->in_sync)) ? lc->region_count : 0;
+	}
+
 	/* Try to get sync count up to five times */
 	for (i = 0; i < 5 && consult_server(lc, 0, LRT_GET_SYNC_COUNT, &rtn); i++);
 	if(i >= 5){
@@ -1226,6 +1232,7 @@
 		DMDEBUG(" ?sync_search : %d", lc->sync_search);
 		DMDEBUG("  in_sync     : %s", (atomic_read(&lc->in_sync)) ? "YES" : "NO");
 		DMDEBUG("  suspended   : %s", (atomic_read(&lc->suspended)) ? "YES" : "NO");
+		DMDEBUG("  recovery_halted : %s", (lc->recovery_halted) ? "YES" : "NO");
 		DMDEBUG("  server_id   : %u", lc->server_id);
 		DMDEBUG("  server_valid: %s",
 			((lc->server_id != 0xDEAD) && lc->server_valid) ? "YES" : "NO");
--- cluster/cmirror-kernel/src/Attic/dm-cmirror-common.h	2007/09/26 03:15:40	1.1.2.15
+++ cluster/cmirror-kernel/src/Attic/dm-cmirror-common.h	2007/09/27 20:31:18	1.1.2.16
@@ -102,6 +102,7 @@
 
 	int sync_pass;          /* number of passes attempting to resync */
 	int sync_search;
+	int recovery_halted;    /* only useful for is_remote_recovering */
 
 	/* Resync flag */
 	enum sync {
--- cluster/cmirror-kernel/src/Attic/dm-cmirror-server.c	2007/09/26 03:15:40	1.1.2.39
+++ cluster/cmirror-kernel/src/Attic/dm-cmirror-server.c	2007/09/27 20:31:18	1.1.2.40
@@ -451,6 +451,14 @@
 	if ((lc->sync_search > lc->region_count) && !lc->sync_pass)
 		return 0;
 
+	if (lc->recovery_halted &&
+	    (lc->recovering_region != lr->u.lr_region)) {
+		DMDEBUG("Recovery halted, allowing client: %Lu/%s",
+			lr->u.lr_region,
+			lc->uuid + (strlen(lc->uuid) - 8));
+		return 0;
+	}
+
 	/*
 	 * If the region hasn't been recovered yet,
 	 * we need to block the write
@@ -598,6 +606,12 @@
 
 	lr->u.lr_int_rtn = 0; /* Default to no work */
 
+	if (lc->recovery_halted) {
+		DMDEBUG("Recovery halted due to error on %s",
+			lc->uuid + (strlen(lc->uuid) - 8));
+		return 0;
+	}
+
 	if (lc->recovering_region != (uint64_t)-1) {
 		DMDEBUG("Someone is already recovering region %Lu/%s",
 			lc->recovering_region, lc->uuid + (strlen(lc->uuid) - 8));
@@ -704,11 +718,18 @@
 	/*
 	 * Recovery failed or mirror is being marked out-of-sync
 	 *
+	 * We need to stop dishing out recovery work.  If we don't
+	 * writes happening to NOSYNC regions can't proceed and the
+	 * mirror won't be able to suspend for reconfiguration - due
+	 * to the return of is_remote_recovering().
+	 *
 	 * We can recieve multiple calls to mark out-of-sync
 	 * if there were several writes to the same region that
 	 * failed.  In this case, there will not be a record for
 	 * the region.
 	 */
+	lc->recovery_halted = 1;
+
 	ru = find_ru(lc, who, lr->u.lr_region);
 
 	if ((lr->u.lr_region == lc->recovering_region) && !ru) {
@@ -873,8 +894,14 @@
 	 * New node joins and needs to know I am the server
 	 * We shortcut the election here and respond directly
 	 * to the inquirer
-	 */
+	 *
 	if((lc->server_id == my_id) && !atomic_read(&lc->suspended)){
+	*/
+	if (lc->server_id == my_id) {
+		if (atomic_read(&lc->suspended)) {
+			DMDEBUG("I'm suspended, but still responding as server: %s",
+				lc->uuid + (strlen(lc->uuid) - 8));
+		}
 		lr->u.lr_coordinator = my_id;
 		if(!(saddr->sin_addr.s_addr = nodeid_to_ipaddr(lr->u.lr_starter))){
 			return -1;



^ permalink raw reply	[flat|nested] 40+ messages in thread

* [Cluster-devel] cluster/cmirror-kernel/src dm-cmirror-client.c ...
@ 2007-09-26  3:15 jbrassow
  0 siblings, 0 replies; 40+ messages in thread
From: jbrassow @ 2007-09-26  3:15 UTC (permalink / raw)
  To: cluster-devel.redhat.com

CVSROOT:	/cvs/cluster
Module name:	cluster
Branch: 	RHEL4
Changes by:	jbrassow at sourceware.org	2007-09-26 03:15:41

Modified files:
	cmirror-kernel/src: dm-cmirror-client.c dm-cmirror-common.h 
	                    dm-cmirror-server.c 

Log message:
	Bug 291521: Cluster mirror can become out-of-sync if nominal I/O overl...
	
	Another touch-up for this bug.
	
	Bad news:
	Because a node can cache the state of a region indefinitely (especially for
	blocks that are used a lot - e.g. a journaling area of a file system), we must
	deny writes to any region of the mirror that is not yet recovered.  This is only
	the case with cluster mirroring.  This means poor performance of nominal I/O
	during recovery - probably really bad performance.  However, this is absolutely
	necessary for mirror reliability.
	
	Good news:
	The time I spent coding different fixes for this bug wasn't a complete waste.
	I've been able to reuse some of that code to optimize the recovery process.
	Now, rather than going through the mirror from front to back, it skips ahead to
	recover regions that have pending writes.  Bottom line: performance will be bad
	during recovery, but it will be better than RHEL4.5.

Patches:
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cmirror-kernel/src/dm-cmirror-client.c.diff?cvsroot=cluster&only_with_tag=RHEL4&r1=1.1.2.52&r2=1.1.2.53
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cmirror-kernel/src/dm-cmirror-common.h.diff?cvsroot=cluster&only_with_tag=RHEL4&r1=1.1.2.14&r2=1.1.2.15
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cmirror-kernel/src/dm-cmirror-server.c.diff?cvsroot=cluster&only_with_tag=RHEL4&r1=1.1.2.38&r2=1.1.2.39

--- cluster/cmirror-kernel/src/Attic/dm-cmirror-client.c	2007/09/21 20:07:37	1.1.2.52
+++ cluster/cmirror-kernel/src/Attic/dm-cmirror-client.c	2007/09/26 03:15:40	1.1.2.53
@@ -142,6 +142,7 @@
 	lc->sync_count = (sync == NOSYNC) ? region_count : 0;
 
 	lc->recovering_region = (uint64_t)-1;
+	lc->recovering_next = (uint64_t)-1;
 	lc->sync_search = 0;
 	log->context = lc;
 	return 0;
--- cluster/cmirror-kernel/src/Attic/dm-cmirror-common.h	2007/04/10 07:12:24	1.1.2.14
+++ cluster/cmirror-kernel/src/Attic/dm-cmirror-common.h	2007/09/26 03:15:40	1.1.2.15
@@ -98,6 +98,7 @@
 	uint32_t *clean_bits;
 	uint32_t *sync_bits;
 	uint64_t recovering_region;
+	uint64_t recovering_next;
 
 	int sync_pass;          /* number of passes attempting to resync */
 	int sync_search;
--- cluster/cmirror-kernel/src/Attic/dm-cmirror-server.c	2007/09/21 20:07:37	1.1.2.38
+++ cluster/cmirror-kernel/src/Attic/dm-cmirror-server.c	2007/09/26 03:15:40	1.1.2.39
@@ -446,18 +446,25 @@
 
 static int server_is_remote_recovering(struct log_c *lc, struct log_request *lr)
 {
-	uint64_t high, low;
+	lr->u.lr_int_rtn = 0;
 
-	high = lc->sync_search + 10;
-	low = (lc->recovering_region != (uint64_t)-1) ?
-		lc->recovering_region :
-		lc->sync_search;
-	if ((lr->u.lr_region >= low) && (lr->u.lr_region <= high)) {
-		DMDEBUG("Remote recovery conflict: %Lu/%s",
-			lr->u.lr_region, lc->uuid + (strlen(lc->uuid) - 8));
+	if ((lc->sync_search > lc->region_count) && !lc->sync_pass)
+		return 0;
+
+	/*
+	 * If the region hasn't been recovered yet,
+	 * we need to block the write
+	 */
+	if (!log_test_bit(lc->sync_bits, lr->u.lr_region) ||
+	    (lc->recovering_region == lr->u.lr_region)) {
 		lr->u.lr_int_rtn = 1;
-	} else
-		lr->u.lr_int_rtn = 0;
+
+		/* Try to make this region a priority */
+		if ((lr->u.lr_region != lc->recovering_region) &&
+		    (lc->recovering_next == (uint64_t)-1))
+			lc->recovering_next = lr->u.lr_region;
+		return 0;
+	}
 
 	return 0;
 }
@@ -542,7 +549,9 @@
 	}
 
 	if (!find_ru_by_region(lc, lr->u.lr_region)) {
-		log_set_bit(lc, lc->clean_bits, lr->u.lr_region);
+		/* Only clear the region if it is also in sync */
+		if (log_test_bit(lc->sync_bits, lr->u.lr_region))
+			log_set_bit(lc, lc->clean_bits, lr->u.lr_region);
 	} else if (check_bug) {
 		DMERR("Multiple marks exist on a region being recovered: %Lu/%s",
 		      lr->u.lr_region, lc->uuid + (strlen(lc->uuid) - 8));
@@ -608,26 +617,45 @@
 		}
 	}
 
-	*region = ext2_find_next_zero_bit((unsigned long *) lc->sync_bits,
-					  lc->region_count,
-					  lc->sync_search);
-	if (find_ru_by_region(lc, *region)) {
-		DMDEBUG("Recovery blocked by outstanding write on region %Lu/%s",
-			*region, lc->uuid + (strlen(lc->uuid) - 8));
-		return 0;
-	}
+	DMDEBUG("Priority recovery region: %Lu/%s",
+		lc->recovering_next, lc->uuid + (strlen(lc->uuid) - 8));
 
-	if (*region >= lc->region_count)
-		return 0;
+	if ((lc->recovering_next != (uint64_t)-1) &&
+	    (!log_test_bit(lc->sync_bits, lc->recovering_next))) {
+		new = mempool_alloc(region_user_pool, GFP_NOFS);
+		if (!new)
+			return -ENOMEM;
+		*region = lc->recovering_region = lc->recovering_next;
+		DMDEBUG("Preempting normal recovery work for preferred region...");
+	} else {
+		*region = ext2_find_next_zero_bit((unsigned long *) lc->sync_bits,
+						  lc->region_count,
+						  lc->sync_search);
+		if (find_ru_by_region(lc, *region)) {
+			/*
+			 * We disallow writes to regions that have not yet been
+			 * recovered via is_remote_recovering(), so this should
+			 * not happen.
+			 */
+			DMERR("Recovery blocked by outstanding write on region %Lu/%s",
+			      *region, lc->uuid + (strlen(lc->uuid) - 8));
+			BUG();
+			return 0;
+		}
 
-	new = mempool_alloc(region_user_pool, GFP_NOFS);
-	if (!new)
-		return -ENOMEM;
+		if (*region >= lc->region_count)
+			return 0;
 
-	lc->sync_search = *region + 1;
+		new = mempool_alloc(region_user_pool, GFP_NOFS);
+		if (!new)
+			return -ENOMEM;
 
-	lc->recovering_region = *region;
+		lc->sync_search = *region + 1;
+
+		lc->recovering_region = *region;
+	}
 
+	lc->recovering_next = (uint64_t)-1;
 	lr->u.lr_int_rtn = 1; /* Assigning work */
 	new->ru_nodeid = who;
 	new->ru_region = *region;



^ permalink raw reply	[flat|nested] 40+ messages in thread

* [Cluster-devel] cluster/cmirror-kernel/src dm-cmirror-client.c ...
@ 2007-09-21 20:07 jbrassow
  0 siblings, 0 replies; 40+ messages in thread
From: jbrassow @ 2007-09-21 20:07 UTC (permalink / raw)
  To: cluster-devel.redhat.com

CVSROOT:	/cvs/cluster
Module name:	cluster
Branch: 	RHEL4
Changes by:	jbrassow at sourceware.org	2007-09-21 20:07:37

Modified files:
	cmirror-kernel/src: dm-cmirror-client.c dm-cmirror-server.c 
	                    dm-cmirror-xfr.h 

Log message:
	Bug 291521: Cluster mirror can become out-of-sync if nominal I/O overla...
	
	It is insufficient to simply delay flush requests that have marks
	pending to a recovering region.  Although a collision between nominal
	I/O and resync I/O can be avoided this way, the state of the region
	changes from RH_NOSYNC to RH_CLEAN in the mean time.  The machine
	being delayed will think the region is still in the RH_NOSYNC state
	and only write to the primary device... leaving the other mirror
	devices out-of-sync.
	
	We must delay writes to remotely recovering regions before the state
	of the region is determined and cached in the region caching code...
	The entry point for this already exists in 'is_remote_recovering'.

Patches:
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cmirror-kernel/src/dm-cmirror-client.c.diff?cvsroot=cluster&only_with_tag=RHEL4&r1=1.1.2.51&r2=1.1.2.52
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cmirror-kernel/src/dm-cmirror-server.c.diff?cvsroot=cluster&only_with_tag=RHEL4&r1=1.1.2.37&r2=1.1.2.38
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cmirror-kernel/src/dm-cmirror-xfr.h.diff?cvsroot=cluster&only_with_tag=RHEL4&r1=1.1.2.6&r2=1.1.2.7

--- cluster/cmirror-kernel/src/Attic/dm-cmirror-client.c	2007/09/13 15:24:20	1.1.2.51
+++ cluster/cmirror-kernel/src/Attic/dm-cmirror-client.c	2007/09/21 20:07:37	1.1.2.52
@@ -858,7 +858,15 @@
 
 static int cluster_is_remote_recovering(struct dirty_log *log, region_t region)
 {
-	return 0;
+	int rtn;
+	struct log_c *lc = (struct log_c *) log->context;
+ 	 
+	if (atomic_read(&lc->in_sync) == 1) {
+		return 0;
+	}
+
+	rtn = consult_server(lc, region, LRT_IS_REMOTE_RECOVERING, NULL);
+	return rtn;
 }
 
 static int cluster_in_sync(struct dirty_log *log, region_t region, int block)
--- cluster/cmirror-kernel/src/Attic/dm-cmirror-server.c	2007/09/13 15:24:20	1.1.2.37
+++ cluster/cmirror-kernel/src/Attic/dm-cmirror-server.c	2007/09/21 20:07:37	1.1.2.38
@@ -444,6 +444,24 @@
 	return 0;
 }
 
+static int server_is_remote_recovering(struct log_c *lc, struct log_request *lr)
+{
+	uint64_t high, low;
+
+	high = lc->sync_search + 10;
+	low = (lc->recovering_region != (uint64_t)-1) ?
+		lc->recovering_region :
+		lc->sync_search;
+	if ((lr->u.lr_region >= low) && (lr->u.lr_region <= high)) {
+		DMDEBUG("Remote recovery conflict: %Lu/%s",
+			lr->u.lr_region, lc->uuid + (strlen(lc->uuid) - 8));
+		lr->u.lr_int_rtn = 1;
+	} else
+		lr->u.lr_int_rtn = 0;
+
+	return 0;
+}
+
 static int server_in_sync(struct log_c *lc, struct log_request *lr)
 {
 	if (lr->u.lr_region > lc->region_count) {
@@ -485,51 +503,28 @@
 		list_add(&new->ru_list, &lc->region_users);
 	} else if (ru->ru_rw == RU_RECOVER) {
 		/*
-		 * The flush will block if a write conflicts with a
-		 * recovering region.  In the meantime, we add this
-		 * entry to the tail of the list so the recovery
-		 * gets cleared first.
-		 */
-		DMDEBUG("Attempt to mark a region " SECTOR_FORMAT
-		      "/%s which is being recovered.",
-		       lr->u.lr_region, lc->uuid + (strlen(lc->uuid) - 8));
-		DMDEBUG("Current recoverer: %u", ru->ru_nodeid);
-		DMDEBUG("Mark requester   : %u", who);
-		log_clear_bit(lc, lc->clean_bits, lr->u.lr_region);
-		list_add_tail(&new->ru_list, &lc->region_users);
+		 * A mark that happens to a region in recovery
+		 * means certain corruption.
+		 */
+		DMERR("Mark attempted to recovering region by %u: %Lu/%s",
+		      who, lr->u.lr_region,
+		      lc->uuid + (strlen(lc->uuid) - 8));
+		DMERR("  lc->recovering_region = %Lu", lc->recovering_region);
+		DMERR("  ru->ru_rw             = %d", ru->ru_rw);
+		DMERR("  ru->ru_nodeid         = %u", ru->ru_nodeid);
+		DMERR("  ru->ru_region         = %Lu", ru->ru_region);
+		BUG();
 	} else {
 		list_add(&new->ru_list, &ru->ru_list);
 	}
 
-	/*
-	if (!(ru = find_ru_by_region(lc, lr->u.lr_region))) {
-		log_clear_bit(lc, lc->clean_bits, lr->u.lr_region);
-		list_add(&new->ru_list, &lc->region_users);
-	} else if (ru->ru_rw == RU_RECOVER) {
-		DMDEBUG("Attempt to mark a region " SECTOR_FORMAT
-		      "/%s which is being recovered.",
-		       lr->u.lr_region, lc->uuid + (strlen(lc->uuid) - 8));
-		DMDEBUG("Current recoverer: %u", ru->ru_nodeid);
-		DMDEBUG("Mark requester   : %u", who);
-		log_clear_bit(lc, lc->clean_bits, lr->u.lr_region);
-		list_add_tail(&new->ru_list, &lc->region_users);
-	} else if (!find_ru(lc, who, lr->u.lr_region)) {
-		list_add(&new->ru_list, &ru->ru_list);
-	} else {
-		DMWARN("Attempt to mark a already marked region (%u,"
-		       SECTOR_FORMAT
-		       "/%s)",
-		       who, lr->u.lr_region, lc->uuid + (strlen(lc->uuid) - 8));
-		mempool_free(new, region_user_pool);
-	}
-	*/
-
 	return 0;
 }
 
 
 static int server_clear_region(struct log_c *lc, struct log_request *lr, uint32_t who)
 {
+	int check_bug = 0;
 	struct region_user *ru;
 
 	ru = find_ru(lc, who, lr->u.lr_region);
@@ -538,13 +533,22 @@
 		       who, lr->u.lr_region);
 		return -EINVAL;
 	} else {
+		if (lc->recovering_region == lr->u.lr_region) {
+			lc->recovering_region = (uint64_t)-1;
+			check_bug = 1;
+		}
 		list_del(&ru->ru_list);
 		mempool_free(ru, region_user_pool);
 	}
 
-	if(!find_ru_by_region(lc, lr->u.lr_region)){
+	if (!find_ru_by_region(lc, lr->u.lr_region)) {
 		log_set_bit(lc, lc->clean_bits, lr->u.lr_region);
+	} else if (check_bug) {
+		DMERR("Multiple marks exist on a region being recovered: %Lu/%s",
+		      lr->u.lr_region, lc->uuid + (strlen(lc->uuid) - 8));
+		BUG();
 	}
+		
 	return 0;
 }
 
@@ -552,37 +556,17 @@
 static int server_flush(struct log_c *lc, uint32_t who)
 {
 	int r = 0;
-	int count = 0;
-	int do_flush = 1;
-	struct region_user *ru, *marker = NULL, *recoverer = NULL;
+	struct region_user *ru;
 
 	if (lc->recovering_region != (uint64_t)-1) {
-		list_for_each_entry(ru, &lc->region_users, ru_list)
-			if (ru->ru_region == lc->recovering_region) {
-				if (ru->ru_rw == RU_RECOVER)
-					recoverer = ru;
-				else if (ru->ru_nodeid == who) {
-					do_flush = 0;
-					marker = ru;
-				} else
-					marker = ru;
-
-				count++;
-			}
-
-		if (marker && recoverer) {
-			DMDEBUG("Flush/recovery collision on %Lu/%s: count = %d, marker = %u, recoverer = %u",
-				marker->ru_region, lc->uuid + (strlen(lc->uuid) - 8),
-				count, marker->ru_nodeid, recoverer->ru_nodeid);
-			DMDEBUG("  Count     = %d", count);
-			DMDEBUG("  Marker    = %u", marker->ru_nodeid);
-			DMDEBUG("  Recoverer = %u", recoverer->ru_nodeid);
-			DMDEBUG("  Flusher   = %u", who);
-			if (!do_flush) {
-				DMDEBUG("Blocking flush");
-				return -EBUSY;
-			} else
-				DMDEBUG("Allowing flush");
+		list_for_each_entry(ru, &lc->region_users, ru_list) {
+			if ((ru->ru_region == lc->recovering_region) &&
+			    (ru->ru_rw != RU_RECOVER)) {
+				DMERR("Flush attempted to recovering region by %u: %Lu/%s",
+				      who, lc->recovering_region,
+				      lc->uuid + (strlen(lc->uuid) - 8));
+				BUG();
+			}
 		}
 	}
 
@@ -601,7 +585,6 @@
 static int server_get_resync_work(struct log_c *lc, struct log_request *lr, uint32_t who)
 {
 	struct region_user *new;
-	int sync_search, conflict = 0;
 	region_t *region = &(lr->u.lr_region_rtn);
 
 	lr->u.lr_int_rtn = 0; /* Default to no work */
@@ -625,19 +608,13 @@
 		}
 	}
 
-	for (sync_search = lc->sync_search;
-	     sync_search < lc->region_count;
-	     sync_search = (*region + 1)) {
-		*region = ext2_find_next_zero_bit((unsigned long *) lc->sync_bits,
-						  lc->region_count,
-						  sync_search);
-		if (find_ru_by_region(lc, *region)) {
-			conflict = 1;
-			DMDEBUG("Recovery blocked by outstanding write on region %Lu/%s",
-				*region, lc->uuid + (strlen(lc->uuid) - 8));
-		} else {
-			break;
-		}
+	*region = ext2_find_next_zero_bit((unsigned long *) lc->sync_bits,
+					  lc->region_count,
+					  lc->sync_search);
+	if (find_ru_by_region(lc, *region)) {
+		DMDEBUG("Recovery blocked by outstanding write on region %Lu/%s",
+			*region, lc->uuid + (strlen(lc->uuid) - 8));
+		return 0;
 	}
 
 	if (*region >= lc->region_count)
@@ -647,8 +624,7 @@
 	if (!new)
 		return -ENOMEM;
 
-	if (!conflict)
-		lc->sync_search = *region + 1;
+	lc->sync_search = *region + 1;
 
 	lc->recovering_region = *region;
 
@@ -678,13 +654,18 @@
 			return -EINVAL;
 		}
 
-		lc->recovering_region = (uint64_t)-1;
-
 		/* We could receive multiple identical request due to network failure */
-		if(!log_test_bit(lc->sync_bits, lr->u.lr_region)) {
+		if (!log_test_bit(lc->sync_bits, lr->u.lr_region)) {
 			log_set_bit(lc, lc->sync_bits, lr->u.lr_region);
 			lc->sync_count++;
 		}
+
+		/*
+		 * We will: 
+		 * lc->recovering_region = (uint64_t)-1;
+		 * in clear_region so we can do extra validation
+		 */
+
 		lc->sync_pass = 0;
 
 		DMDEBUG("Resync work completed by %u: %Lu/%s",
@@ -1064,6 +1045,9 @@
 		case LRT_IS_CLEAN:
 			error = server_is_clean(lc, lr);
 			break;
+		case LRT_IS_REMOTE_RECOVERING:
+			error = server_is_remote_recovering(lc, lr);
+			break;
 		case LRT_IN_SYNC:
 			error = server_in_sync(lc, lr);
 			break;
--- cluster/cmirror-kernel/src/Attic/dm-cmirror-xfr.h	2007/07/11 16:18:03	1.1.2.6
+++ cluster/cmirror-kernel/src/Attic/dm-cmirror-xfr.h	2007/09/21 20:07:37	1.1.2.7
@@ -10,18 +10,19 @@
 #define MAX_NAME_LEN 128
 
 #define LRT_IS_CLEAN			1
-#define LRT_IN_SYNC             	2
-#define LRT_MARK_REGION         	3
-#define LRT_CLEAR_REGION        	4
-#define LRT_FLUSH                       5
-#define LRT_GET_RESYNC_WORK     	6
-#define LRT_COMPLETE_RESYNC_WORK        7
-#define LRT_GET_SYNC_COUNT      	8
-
-#define LRT_ELECTION			9
-#define LRT_SELECTION			10
-#define LRT_MASTER_ASSIGN		11
-#define LRT_MASTER_LEAVING		12
+#define LRT_IS_REMOTE_RECOVERING	2
+#define LRT_IN_SYNC             	3
+#define LRT_MARK_REGION         	4
+#define LRT_CLEAR_REGION        	5
+#define LRT_FLUSH                       6
+#define LRT_GET_RESYNC_WORK     	7
+#define LRT_COMPLETE_RESYNC_WORK        8
+#define LRT_GET_SYNC_COUNT      	9
+
+#define LRT_ELECTION			10
+#define LRT_SELECTION			11
+#define LRT_MASTER_ASSIGN		12
+#define LRT_MASTER_LEAVING		13
 
 #define CLUSTER_LOG_PORT 51005
 



^ permalink raw reply	[flat|nested] 40+ messages in thread

* [Cluster-devel] cluster/cmirror-kernel/src dm-cmirror-client.c ...
@ 2007-09-13 15:24 jbrassow
  0 siblings, 0 replies; 40+ messages in thread
From: jbrassow @ 2007-09-13 15:24 UTC (permalink / raw)
  To: cluster-devel.redhat.com

CVSROOT:	/cvs/cluster
Module name:	cluster
Branch: 	RHEL4
Changes by:	jbrassow at sourceware.org	2007-09-13 15:24:20

Modified files:
	cmirror-kernel/src: dm-cmirror-client.c dm-cmirror-server.c 

Log message:
	Bug 257881: Flush/recovery collision leads to deadlock after leg ...
	
	The procedure for coordinating nominal I/O and recovery I/O, was to
	either:
	1) delay a flush which contained a mark to a region being recovered
	2) skip over regions that are currently marked when assigning recovery
	
	This bug has to do with the way #1 was implemented.
	
	The following scenario would trigger it:
	1) node1 is assigned recovery on region X
	2) node1 also does a mark (write) on region Y
	3) node2 attempts to mark region X
	**) any flush issued here will delay waiting for recovery to complete on X
	4) node1 needs to perform the flush before it can get on with completing
	recovery - but it can't flush, so everyone is delayed *forever*.
	
	The fix was to allow flushes from nodes that are not attempting to mark
	regions that are being recovered.  In the example above, node1 should be
	allowed to complete the flush because it is not trying to write to the
	same region that is being recovered.  node2 would be correctly delayed.
	Since node1 can complete the flush, it can also complete the recovery -
	thus allowing things to proceed.
	
	This bug only affects mirrors that are not in-sync and are doing I/O.
	This bug can occur whether there are device/machine failures or not.
	This bug is most easily reproduced with a number of mirrors, but would
	be possible with just one.
	
	I've also fixed up some debugging output so it is more consistent and
	easier to follow the flow of events.

Patches:
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cmirror-kernel/src/dm-cmirror-client.c.diff?cvsroot=cluster&only_with_tag=RHEL4&r1=1.1.2.50&r2=1.1.2.51
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cmirror-kernel/src/dm-cmirror-server.c.diff?cvsroot=cluster&only_with_tag=RHEL4&r1=1.1.2.36&r2=1.1.2.37

--- cluster/cmirror-kernel/src/Attic/dm-cmirror-client.c	2007/08/23 16:51:39	1.1.2.50
+++ cluster/cmirror-kernel/src/Attic/dm-cmirror-client.c	2007/09/13 15:24:20	1.1.2.51
@@ -946,7 +946,8 @@
 
 	while ((r = consult_server(lc, 0, LRT_FLUSH, NULL))) {
 		if (r == -EBUSY) {
-			DMDEBUG("Delaying flush due to recovery");
+			DMDEBUG("Delaying flush due to recovery (%s)",
+				lc->uuid + (strlen(lc->uuid) - 8));
 			set_current_state(TASK_INTERRUPTIBLE);
 			schedule_timeout(HZ);
 			continue;
@@ -1110,8 +1111,8 @@
 	}
 
 	if (rtn)
-		DMDEBUG("Client received resync work: %Lu/%s",
-			*region, lc->uuid + (strlen(lc->uuid) - 8));
+		DMDEBUG("Received recovery work from %u: %Lu/%s",
+			lc->server_id, *region, lc->uuid + (strlen(lc->uuid) - 8));
 
 	/*
 	 * Check for bug 235039
@@ -1137,12 +1138,19 @@
 	region_t success_tmp = success;
 	struct log_c *lc = (struct log_c *) log->context;
 
+	if (success)
+		DMDEBUG("Client finishing recovery: %Lu/%s",
+			region, lc->uuid + (strlen(lc->uuid) - 8));
+	else
+		DMDEBUG("Notifying server(%u) of sync change: %Lu/%s",
+			lc->server_id, region,
+			lc->uuid + (strlen(lc->uuid) - 8));
 	for (i = 0; i < 5; i++) {
 		if (!consult_server(lc, region,
 				    LRT_COMPLETE_RESYNC_WORK, &success_tmp))
 			break;
 		success_tmp = success;
-		DMWARN("Unable to notify server of completed resync work");
+		DMWARN("Unable to notify server of sync state change");
 	}
 	return;
 }
@@ -1203,6 +1211,7 @@
 		DMDEBUG("LOG INFO:");
 		DMDEBUG("  uuid: %s", lc->uuid);
 		DMDEBUG("  uuid_ref    : %d", lc->uuid_ref);
+		DMDEBUG("  log type    : %s", (lc->log_dev) ? "disk" : "core");
 		DMDEBUG(" ?region_count: %Lu", lc->region_count);
 		DMDEBUG(" ?sync_count  : %Lu", lc->sync_count);
 		DMDEBUG(" ?sync_search : %d", lc->sync_search);
--- cluster/cmirror-kernel/src/Attic/dm-cmirror-server.c	2007/07/11 16:18:03	1.1.2.36
+++ cluster/cmirror-kernel/src/Attic/dm-cmirror-server.c	2007/09/13 15:24:20	1.1.2.37
@@ -549,10 +549,11 @@
 }
 
 
-static int server_flush(struct log_c *lc)
+static int server_flush(struct log_c *lc, uint32_t who)
 {
 	int r = 0;
 	int count = 0;
+	int do_flush = 1;
 	struct region_user *ru, *marker = NULL, *recoverer = NULL;
 
 	if (lc->recovering_region != (uint64_t)-1) {
@@ -560,22 +561,28 @@
 			if (ru->ru_region == lc->recovering_region) {
 				if (ru->ru_rw == RU_RECOVER)
 					recoverer = ru;
-				else
+				else if (ru->ru_nodeid == who) {
+					do_flush = 0;
 					marker = ru;
+				} else
+					marker = ru;
+
 				count++;
 			}
 
 		if (marker && recoverer) {
-			DMDEBUG("Flush/recovery collision (%Lu/%s): count = %d, marker = %u, recoverer = %u",
+			DMDEBUG("Flush/recovery collision on %Lu/%s: count = %d, marker = %u, recoverer = %u",
 				marker->ru_region, lc->uuid + (strlen(lc->uuid) - 8),
 				count, marker->ru_nodeid, recoverer->ru_nodeid);
-			/*
-			DMDEBUG("  sync_bit: %s, clean_bit: %s",
-				log_test_bit(lc->sync_bits, lc->recovering_region) ? "set" : "unset",
-				log_test_bit(lc->clean_bits, lc->recovering_region) ? "set" : "unset");
-			*/
-
-			return -EBUSY;
+			DMDEBUG("  Count     = %d", count);
+			DMDEBUG("  Marker    = %u", marker->ru_nodeid);
+			DMDEBUG("  Recoverer = %u", recoverer->ru_nodeid);
+			DMDEBUG("  Flusher   = %u", who);
+			if (!do_flush) {
+				DMDEBUG("Blocking flush");
+				return -EBUSY;
+			} else
+				DMDEBUG("Allowing flush");
 		}
 	}
 
@@ -650,8 +657,8 @@
 	new->ru_region = *region;
 	new->ru_rw = RU_RECOVER;
 	list_add(&new->ru_list, &lc->region_users);
-	DMDEBUG("Assigning recovery work to %u/%s: %Lu",
-		who, lc->uuid + (strlen(lc->uuid) - 8), new->ru_region);
+	DMDEBUG("Assigning recovery work to %u: %Lu/%s",
+		who, new->ru_region, lc->uuid + (strlen(lc->uuid) - 8));
 
 	return 0;
 }
@@ -680,7 +687,8 @@
 		}
 		lc->sync_pass = 0;
 
-		DMDEBUG("Resync work completed: %Lu", lr->u.lr_region);
+		DMDEBUG("Resync work completed by %u: %Lu/%s",
+			who, lr->u.lr_region, lc->uuid + (strlen(lc->uuid) - 8));
 		return 0;
 	}
 
@@ -1077,7 +1085,12 @@
 			error = server_clear_region(lc, lr, nodeid);
 			break;
 		case LRT_FLUSH:
-			error = server_flush(lc);
+			if(!(nodeid = 
+			     ipaddr_to_nodeid((struct sockaddr *)msg.msg_name))){
+				error = -ENXIO;
+				break;
+			}
+			error = server_flush(lc, nodeid);
 			break;
 		case LRT_GET_RESYNC_WORK:
 			if(!(nodeid = 



^ permalink raw reply	[flat|nested] 40+ messages in thread

* [Cluster-devel] cluster/cmirror-kernel/src dm-cmirror-client.c ...
@ 2007-07-11 16:18 jbrassow
  0 siblings, 0 replies; 40+ messages in thread
From: jbrassow @ 2007-07-11 16:18 UTC (permalink / raw)
  To: cluster-devel.redhat.com

CVSROOT:	/cvs/cluster
Module name:	cluster
Branch: 	RHEL4
Changes by:	jbrassow at sourceware.org	2007-07-11 16:18:03

Modified files:
	cmirror-kernel/src: dm-cmirror-client.c dm-cmirror-server.c 
	                    dm-cmirror-xfr.h 

Log message:
	Bug 238629: dm-cmirror: Remote recovery conflict...
	
	The kernel changes are now in place (marking/clearing the log
	during writes to nosync regions) to allow nominal I/O to
	regions that have yet to be recovered.
	
	Also moved around some debugging messages and removed
	'is_remote_recovering()'.  (is_remote_recovering is obviated by
	the new mechanism for handling recovery/write ordering.)

Patches:
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cmirror-kernel/src/dm-cmirror-client.c.diff?cvsroot=cluster&only_with_tag=RHEL4&r1=1.1.2.48&r2=1.1.2.49
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cmirror-kernel/src/dm-cmirror-server.c.diff?cvsroot=cluster&only_with_tag=RHEL4&r1=1.1.2.35&r2=1.1.2.36
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cmirror-kernel/src/dm-cmirror-xfr.h.diff?cvsroot=cluster&only_with_tag=RHEL4&r1=1.1.2.5&r2=1.1.2.6

--- cluster/cmirror-kernel/src/Attic/dm-cmirror-client.c	2007/05/09 21:44:34	1.1.2.48
+++ cluster/cmirror-kernel/src/Attic/dm-cmirror-client.c	2007/07/11 16:18:03	1.1.2.49
@@ -286,8 +286,8 @@
 		lc->server_id = lr.u.lr_coordinator;
 	} else {
 		/* ATTENTION -- what do we do with this ? */
-		DMWARN("Failed to receive election results from server: (%s,%d)",
-		       lc->uuid + (strlen(lc->uuid) - 8), len);
+		DMWARN("Failed to receive election results from server: (%s/%d,%d)",
+		       lc->uuid + (strlen(lc->uuid) - 8), lc->uuid_ref, len);
 		error = len;
 	}
 
@@ -601,6 +601,7 @@
 	INIT_LIST_HEAD(&lc->mark_logged);
 	spin_lock_init(&lc->state_lock);
 
+	atomic_set(&lc->suspended, 1);
 	lc->server_valid = 0;
 	lc->server_id = 0xDEAD;
 
@@ -853,15 +854,7 @@
 
 static int cluster_is_remote_recovering(struct dirty_log *log, region_t region)
 {
-	int rtn;
-	struct log_c *lc = (struct log_c *) log->context;
-
-	if(atomic_read(&lc->in_sync) == 1){
-		return 0;
-	}
-
-	rtn = consult_server(lc, region, LRT_IS_REMOTE_RECOVERING, NULL);
-	return rtn;
+	return 0;
 }
 
 static int cluster_in_sync(struct dirty_log *log, region_t region, int block)
@@ -977,57 +970,43 @@
 
 	spin_lock(&lc->state_lock);
 
-
 	/*
-	 * It is possible for the following in the mirror code:
-	 *  0) Mark is already logged for a region
-	 *  1) rh_dec, sets region state to RH_CLEAN (asynchronous)
-	 *  2) rh_update_states (DOESN'T FLUSH!!!, bug #235040)
-	 *  3) do_writes, trys to mark region
-	 *
-	 * The following shouldn't have to be handled b/c of the flush
-	 *  0) Region finishes recovery
-	 *  1) rh_update_states clears region (DOES FLUSH)
-	 *  2) do_writes, trys to mark region
-	 *
-	 * This can lead to this next case being valid.
+	 * An item on the clear_waiting list should have been flushed
+	 * before getting this mark_region call.
 	 */
-	list_for_each_entry_safe(rs, tmp_rs, &lc->clear_waiting, rs_list) {
+	list_for_each_entry_safe(rs, tmp_rs, &lc->clear_waiting, rs_list)
 		if (region == rs->rs_region) {
-			if (!rs->rs_mark_logged) {
-				DMERR("Moving region(%Lu/%s) from clear_waiting -> mark_waiting",
-				      region, lc->uuid + (strlen(lc->uuid) - 8));
-			}
-			list_del_init(&rs->rs_list);
-			list_add(&rs->rs_list,
-				 (rs->rs_mark_logged) ?
-				 &lc->mark_logged : &lc->mark_waiting);
-			goto out;
+			DMERR("Region being marked found on clear_waiting list (%Lu/%s)",
+			      region, lc->uuid + (strlen(lc->uuid) - 8));
+			BUG();
 		}
-	}
+
+	/*
+	 * We should never get two marks before a flush, unless the
+	 * region is not in-sync.  One valid scenario would be:
+	 *  0) region not in-sync
+	 *  1) rh_inc (mark region)
+	 *  2) rh_update_states
+	 *  3) rh_dec (dec pending and put on clean_region list)
+	 *  4) do_writes -> rh_inc (second mark)
+	 */
+	list_for_each_entry_safe(rs, tmp_rs, &lc->mark_waiting, rs_list)
+		if (region == rs->rs_region)
+			goto out;
 
 	/*
 	 * It is possible for the following in the mirror code:
 	 *  0) Mark is already logged for a region
-	 *  1) rh_update_states
+	 *  1) rh_update_states (were the clear_region happens)
 	 *  2) rh_dec, sets region state to RH_CLEAN (asynchronous)
 	 *  3) do_writes, trys to mark region
 	 *
 	 * This can lead to this next case being valid.
 	 */
-	list_for_each_entry_safe(rs, tmp_rs, &lc->mark_logged, rs_list){
-		if (region == rs->rs_region) {
+	list_for_each_entry_safe(rs, tmp_rs, &lc->mark_logged, rs_list)
+		if (region == rs->rs_region)
 			goto out;
-		}
-	}
 
-	list_for_each_entry_safe(rs, tmp_rs, &lc->mark_waiting, rs_list){
-		if (region == rs->rs_region) {
-			DMERR("Mark already waiting (%Lu/%s)",
-			      region, lc->uuid + (strlen(lc->uuid) - 8));
-			BUG();
-		}
-	}
 	spin_unlock(&lc->state_lock);
 
 	rs_new = mempool_alloc(region_state_pool, GFP_NOFS);
@@ -1074,14 +1053,13 @@
 	 * 6) we recover the region
 	 * 7) clearing the region after recovery causes us to get here
 	 *
-	 * Once 235040 is cleared, any entries found in this list should
-	 * cause a bug.
+	 * Bug 235040 cleared.  This should no longer happen.
 	 */
 	list_for_each_entry_safe(rs, tmp_rs, &lc->clear_waiting, rs_list){
 		if(region == rs->rs_region){
-			DMERR("%d) Double clear on region ("
-			      SECTOR_FORMAT ")", __LINE__, region);
-			goto out;
+			DMERR("Double clear on region (%Lu/%s)",
+			      region, lc->uuid + (strlen(lc->uuid) - 8));
+			BUG();
 		}
 	}
 
@@ -1140,7 +1118,7 @@
 		list_for_each_entry_safe(rs, tmp_rs, &lc->clear_waiting, rs_list) {
 			if (*region == rs->rs_region) {
 				DMERR("WARNING: Bug 235039/235040 detected!");
-				DMERR("Work-around in place.");
+				BUG();
 			}
 		}
 	}
@@ -1204,16 +1182,6 @@
 
 	switch(status){
 	case STATUSTYPE_INFO:
-		DMDEBUG("LOG INFO:");
-		DMDEBUG("  uuid: %s", lc->uuid);
-		DMDEBUG("  uuid_ref    : %d", lc->uuid_ref);
-		DMDEBUG(" ?region_count: %Lu", lc->region_count);
-		DMDEBUG(" ?sync_count  : %Lu", lc->sync_count);
-		DMDEBUG(" ?sync_search : %d", lc->sync_search);
-		DMDEBUG("  in_sync     : %s", (atomic_read(&lc->in_sync)) ? "YES" : "NO");
-		DMDEBUG("  server_id   : %u", lc->server_id);
-		DMDEBUG("  server_valid: %s",
-			((lc->server_id != 0xDEAD) && lc->server_valid) ? "YES" : "NO");
 		if(lc->sync != DEFAULTSYNC)
 			arg_count++;
 
@@ -1228,6 +1196,42 @@
                 break;
 
         case STATUSTYPE_TABLE:
+		DMDEBUG("LOG INFO:");
+		DMDEBUG("  uuid: %s", lc->uuid);
+		DMDEBUG("  uuid_ref    : %d", lc->uuid_ref);
+		DMDEBUG(" ?region_count: %Lu", lc->region_count);
+		DMDEBUG(" ?sync_count  : %Lu", lc->sync_count);
+		DMDEBUG(" ?sync_search : %d", lc->sync_search);
+		DMDEBUG("  in_sync     : %s", (atomic_read(&lc->in_sync)) ? "YES" : "NO");
+		DMDEBUG("  suspended   : %s", (atomic_read(&lc->suspended)) ? "YES" : "NO");
+		DMDEBUG("  server_id   : %u", lc->server_id);
+		DMDEBUG("  server_valid: %s",
+			((lc->server_id != 0xDEAD) && lc->server_valid) ? "YES" : "NO");
+		{
+			struct log_c *tmp_lc;
+
+			down(&log_list_lock);
+			list_for_each_entry(tmp_lc, &log_list_head, log_list){
+				if(!strncmp(tmp_lc->uuid, lc->uuid, MAX_NAME_LEN) &&
+				   (tmp_lc->uuid_ref != lc->uuid_ref)){
+					DMDEBUG("LOG INFO [COPY FOUND]:");
+					DMDEBUG("  uuid: %s", tmp_lc->uuid);
+					DMDEBUG("  uuid_ref    : %d", tmp_lc->uuid_ref);
+					DMDEBUG(" ?region_count: %Lu", tmp_lc->region_count);
+					DMDEBUG(" ?sync_count  : %Lu", tmp_lc->sync_count);
+					DMDEBUG(" ?sync_search : %d", tmp_lc->sync_search);
+					DMDEBUG("  in_sync     : %s",
+						(atomic_read(&tmp_lc->in_sync)) ? "YES" : "NO");
+					DMDEBUG("  suspended   : %s",
+						(atomic_read(&tmp_lc->suspended)) ? "YES" : "NO");
+					DMDEBUG("  server_id   : %u", tmp_lc->server_id);
+					DMDEBUG("  server_valid: %s",
+						((tmp_lc->server_id != 0xDEAD) &&
+						 tmp_lc->server_valid) ? "YES" : "NO");
+				}
+			}
+			up(&log_list_lock);
+		}
 		if(lc->sync != DEFAULTSYNC)
 			arg_count++;
 
--- cluster/cmirror-kernel/src/Attic/dm-cmirror-server.c	2007/04/26 16:54:49	1.1.2.35
+++ cluster/cmirror-kernel/src/Attic/dm-cmirror-server.c	2007/07/11 16:18:03	1.1.2.36
@@ -221,54 +221,6 @@
 	return count;
 }
 
-struct region_user *find_ru_by_region(struct log_c *lc, region_t region);
-static int _core_get_resync_work(struct log_c *lc, region_t *region)
-{
-	int sync_search, conflict = 0;
-
-	if (lc->recovering_region != (uint64_t)-1) {
-		DMDEBUG("Someone is already recovering region %Lu/%s",
-			lc->recovering_region, lc->uuid + (strlen(lc->uuid) - 8));
-		return 0;
-	}
-
-	if (lc->sync_search >= lc->region_count) {
-		/*
-		 * FIXME: pvmove is not supported yet, but when it is,
-		 * an audit of sync_count changes will need to be made
-		 */
-		if ((lc->sync_count < lc->region_count) && !lc->sync_pass) {
-			lc->sync_search = 0;
-			lc->sync_pass++;
-		} else {
-			return 0;
-		}
-	}
-	for (sync_search = lc->sync_search;
-	     sync_search < lc->region_count;
-	     sync_search = (*region + 1)) {
-		*region = ext2_find_next_zero_bit((unsigned long *) lc->sync_bits,
-						  lc->region_count,
-						  sync_search);
-		if (find_ru_by_region(lc, *region)) {
-			conflict = 1;
-			DMDEBUG("Recovery blocked by outstanding write on region %Lu/%s",
-				*region, lc->uuid + (strlen(lc->uuid) - 8));
-		} else {
-			break;
-		}
-	}
-	if (!conflict)
-		lc->sync_search = *region + 1;
-
-	if (*region >= lc->region_count)
-		return 0;
-
-	lc->recovering_region = *region;
-	return 1;
-}
-
-
 static int print_zero_bits(unsigned char *str, int offset, int bit_count){
 	int i,j;
 	int count=0;
@@ -492,39 +444,6 @@
 	return 0;
 }
 
-static int server_is_remote_recovering(struct log_c *lc, struct log_request *lr)
-{
-	region_t region;
-	struct region_user *ru;
-
-	/*
-	 * This gets a bit complicated.  I wish we didn't have to use this
-	 * function, but because the mirror code doesn't mark regions which
-	 * it writes to that are out-of-sync, we need this function.
-	 *
-	 * Problem is, we don't know how long the user is going to take to
-	 * write to the region after they have called this function.  So,
-	 * we are forced at this point to deny any writes to regions we
-	 * are recovering or might recover in the future.
-	 *
-	 * We can count on the client side to not send us one of these
-	 * requests if the mirror is known to be in-sync.
-	 *
-	 * And yes, it sucks to take this much time off the I/O.
-	 */
-	region = ext2_find_next_zero_bit((unsigned long *) lc->sync_bits,
-					 lc->region_count, 0);
-
-	if (lr->u.lr_region >= region) {
-		DMDEBUG("Remote recovery conflict: (%Lu >= %Lu)/%s",
-			lr->u.lr_region, region, lc->uuid + (strlen(lc->uuid) - 8));
-		lr->u.lr_int_rtn = 1;
-	} else
-		lr->u.lr_int_rtn = 0;
-
-	return 0;
-}
-
 static int server_in_sync(struct log_c *lc, struct log_request *lr)
 {
 	if (lr->u.lr_region > lc->region_count) {
@@ -555,7 +474,13 @@
 	new->ru_region = lr->u.lr_region;
 	new->ru_rw = RU_WRITE;
 
-	if (!(ru = find_ru_by_region(lc, lr->u.lr_region))) {
+	if (find_ru(lc, who, lr->u.lr_region)) {
+		DMWARN("Attempt to mark a already marked region (%u,"
+		       SECTOR_FORMAT
+		       "/%s)",
+		       who, lr->u.lr_region, lc->uuid + (strlen(lc->uuid) - 8));
+		mempool_free(new, region_user_pool);
+	} else if (!(ru = find_ru_by_region(lc, lr->u.lr_region))) {
 		log_clear_bit(lc, lc->clean_bits, lr->u.lr_region);
 		list_add(&new->ru_list, &lc->region_users);
 	} else if (ru->ru_rw == RU_RECOVER) {
@@ -572,6 +497,22 @@
 		DMDEBUG("Mark requester   : %u", who);
 		log_clear_bit(lc, lc->clean_bits, lr->u.lr_region);
 		list_add_tail(&new->ru_list, &lc->region_users);
+	} else {
+		list_add(&new->ru_list, &ru->ru_list);
+	}
+
+	/*
+	if (!(ru = find_ru_by_region(lc, lr->u.lr_region))) {
+		log_clear_bit(lc, lc->clean_bits, lr->u.lr_region);
+		list_add(&new->ru_list, &lc->region_users);
+	} else if (ru->ru_rw == RU_RECOVER) {
+		DMDEBUG("Attempt to mark a region " SECTOR_FORMAT
+		      "/%s which is being recovered.",
+		       lr->u.lr_region, lc->uuid + (strlen(lc->uuid) - 8));
+		DMDEBUG("Current recoverer: %u", ru->ru_nodeid);
+		DMDEBUG("Mark requester   : %u", who);
+		log_clear_bit(lc, lc->clean_bits, lr->u.lr_region);
+		list_add_tail(&new->ru_list, &lc->region_users);
 	} else if (!find_ru(lc, who, lr->u.lr_region)) {
 		list_add(&new->ru_list, &ru->ru_list);
 	} else {
@@ -581,6 +522,7 @@
 		       who, lr->u.lr_region, lc->uuid + (strlen(lc->uuid) - 8));
 		mempool_free(new, region_user_pool);
 	}
+	*/
 
 	return 0;
 }
@@ -611,35 +553,28 @@
 {
 	int r = 0;
 	int count = 0;
-	struct region_user *ru, *ru2;
+	struct region_user *ru, *marker = NULL, *recoverer = NULL;
 
 	if (lc->recovering_region != (uint64_t)-1) {
 		list_for_each_entry(ru, &lc->region_users, ru_list)
-			if (ru->ru_region == lc->recovering_region)
-				count++;
-
-		if (count > 1) {
-			list_for_each_entry(ru, &lc->region_users, ru_list)
+			if (ru->ru_region == lc->recovering_region) {
 				if (ru->ru_rw == RU_RECOVER)
-					break;
+					recoverer = ru;
+				else
+					marker = ru;
+				count++;
+			}
 
-			DMDEBUG("Flush includes region which is being recovered (%u/%Lu).  Delaying...",
-				ru->ru_nodeid, ru->ru_region);
-			DMDEBUG("Recovering region: %Lu", lc->recovering_region);
+		if (marker && recoverer) {
+			DMDEBUG("Flush/recovery collision (%Lu/%s): count = %d, marker = %u, recoverer = %u",
+				marker->ru_region, lc->uuid + (strlen(lc->uuid) - 8),
+				count, marker->ru_nodeid, recoverer->ru_nodeid);
+			/*
 			DMDEBUG("  sync_bit: %s, clean_bit: %s",
 				log_test_bit(lc->sync_bits, lc->recovering_region) ? "set" : "unset",
 				log_test_bit(lc->clean_bits, lc->recovering_region) ? "set" : "unset");
+			*/
 
-			list_for_each_entry(ru2, &lc->region_users, ru_list)
-				if (ru->ru_region == ru2->ru_region)
-					DMDEBUG("  %s", (ru2->ru_rw == RU_RECOVER) ? "recover" :
-						(ru2->ru_rw == RU_WRITE) ? "writer" : "unknown");
-
-			/* FIXME: work-around for bug 235040 */
-			DMDEBUG("Revoking resync work");
-			lc->recovering_region = (uint64_t)-1;
-			list_del(&ru->ru_list);
-			mempool_free(ru, region_user_pool);
 			return -EBUSY;
 		}
 	}
@@ -1121,9 +1056,6 @@
 		case LRT_IS_CLEAN:
 			error = server_is_clean(lc, lr);
 			break;
-		case LRT_IS_REMOTE_RECOVERING:
-			error = server_is_remote_recovering(lc, lr);
-			break;
 		case LRT_IN_SYNC:
 			error = server_in_sync(lc, lr);
 			break;
--- cluster/cmirror-kernel/src/Attic/dm-cmirror-xfr.h	2007/04/24 20:08:57	1.1.2.5
+++ cluster/cmirror-kernel/src/Attic/dm-cmirror-xfr.h	2007/07/11 16:18:03	1.1.2.6
@@ -10,19 +10,18 @@
 #define MAX_NAME_LEN 128
 
 #define LRT_IS_CLEAN			1
-#define LRT_IS_REMOTE_RECOVERING	2
-#define LRT_IN_SYNC             	3
-#define LRT_MARK_REGION         	4
-#define LRT_CLEAR_REGION        	5
-#define LRT_FLUSH                       6
-#define LRT_GET_RESYNC_WORK     	7
-#define LRT_COMPLETE_RESYNC_WORK        8
-#define LRT_GET_SYNC_COUNT      	9
-
-#define LRT_ELECTION			10
-#define LRT_SELECTION			11
-#define LRT_MASTER_ASSIGN		12
-#define LRT_MASTER_LEAVING		13
+#define LRT_IN_SYNC             	2
+#define LRT_MARK_REGION         	3
+#define LRT_CLEAR_REGION        	4
+#define LRT_FLUSH                       5
+#define LRT_GET_RESYNC_WORK     	6
+#define LRT_COMPLETE_RESYNC_WORK        7
+#define LRT_GET_SYNC_COUNT      	8
+
+#define LRT_ELECTION			9
+#define LRT_SELECTION			10
+#define LRT_MASTER_ASSIGN		11
+#define LRT_MASTER_LEAVING		12
 
 #define CLUSTER_LOG_PORT 51005
 



^ permalink raw reply	[flat|nested] 40+ messages in thread

* [Cluster-devel] cluster/cmirror-kernel/src dm-cmirror-client.c ...
@ 2007-04-26 16:55 jbrassow
  0 siblings, 0 replies; 40+ messages in thread
From: jbrassow @ 2007-04-26 16:55 UTC (permalink / raw)
  To: cluster-devel.redhat.com

CVSROOT:	/cvs/cluster
Module name:	cluster
Branch: 	RHEL45
Changes by:	jbrassow at sourceware.org	2007-04-26 17:55:51

Modified files:
	cmirror-kernel/src: dm-cmirror-client.c dm-cmirror-server.c 

Log message:
	Bug 238031: cluster mirrors not handling all recovery/write conflicts
	
	Problem is that the kernel (main mirror code) does not do any marks/clears when
	writing to a region before its recovery.  So, it is not possible for the server
	to detect a conflict.  Basically, we must turn back on the
	'is_remote_recovering' function and disallow any writes to regions that are OR
	WILL BE recovering.
	
	It's really going to cause some pain during writes while mirrors are re-syncing.
	The better fix for the future is to have the writes always mark/clear the
	regions - then we can again remove the 'is_remote_recovering' function.

Patches:
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cmirror-kernel/src/dm-cmirror-client.c.diff?cvsroot=cluster&only_with_tag=RHEL45&r1=1.1.2.41.2.5&r2=1.1.2.41.2.6
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cmirror-kernel/src/dm-cmirror-server.c.diff?cvsroot=cluster&only_with_tag=RHEL45&r1=1.1.2.26.2.7&r2=1.1.2.26.2.8

--- cluster/cmirror-kernel/src/Attic/dm-cmirror-client.c	2007/04/24 20:10:20	1.1.2.41.2.5
+++ cluster/cmirror-kernel/src/Attic/dm-cmirror-client.c	2007/04/26 16:55:51	1.1.2.41.2.6
@@ -861,11 +861,10 @@
 	int rtn;
 	struct log_c *lc = (struct log_c *) log->context;
 
-/* take out optimization
 	if(atomic_read(&lc->in_sync) == 1){
 		return 0;
 	}
-*/
+
 	rtn = consult_server(lc, region, LRT_IS_REMOTE_RECOVERING, NULL);
 	return rtn;
 }
@@ -876,11 +875,11 @@
 	struct log_c *lc = (struct log_c *) log->context;
   
 	/* check known_regions, return if found */
-/* take out optimization
+
 	if(atomic_read(&lc->in_sync) == 1){
 		return 1;
 	}
-*/
+
 	if(!block){
 		return -EWOULDBLOCK;
 	}
@@ -1414,7 +1413,7 @@
 	.resume = cluster_resume,
 	.get_region_size = cluster_get_region_size,
 	.is_clean = cluster_is_clean,
-/*	.is_remote_recovering = cluster_is_remote_recovering,*/
+	.is_remote_recovering = cluster_is_remote_recovering,
 	.in_sync = cluster_in_sync,
 	.flush = cluster_flush,
 	.mark_region = cluster_mark_region,
@@ -1436,7 +1435,7 @@
 	.resume = cluster_resume,
 	.get_region_size = cluster_get_region_size,
 	.is_clean = cluster_is_clean,
-/*	.is_remote_recovering = cluster_is_remote_recovering,*/
+	.is_remote_recovering = cluster_is_remote_recovering,
 	.in_sync = cluster_in_sync,
 	.flush = cluster_flush,
 	.mark_region = cluster_mark_region,
--- cluster/cmirror-kernel/src/Attic/dm-cmirror-server.c	2007/04/24 20:10:20	1.1.2.26.2.7
+++ cluster/cmirror-kernel/src/Attic/dm-cmirror-server.c	2007/04/26 16:55:51	1.1.2.26.2.8
@@ -494,12 +494,32 @@
 
 static int server_is_remote_recovering(struct log_c *lc, struct log_request *lr)
 {
+	region_t region;
 	struct region_user *ru;
 
-	if ((ru = find_ru_by_region(lc, lr->u.lr_region)) && 
-	    (ru->ru_rw == RU_RECOVER))
+	/*
+	 * This gets a bit complicated.  I wish we didn't have to use this
+	 * function, but because the mirror code doesn't mark regions which
+	 * it writes to that are out-of-sync, we need this function.
+	 *
+	 * Problem is, we don't know how long the user is going to take to
+	 * write to the region after they have called this function.  So,
+	 * we are forced at this point to deny any writes to regions we
+	 * are recovering or might recover in the future.
+	 *
+	 * We can count on the client side to not send us one of these
+	 * requests if the mirror is known to be in-sync.
+	 *
+	 * And yes, it sucks to take this much time off the I/O.
+	 */
+	region = ext2_find_next_zero_bit((unsigned long *) lc->sync_bits,
+					 lc->region_count, 0);
+
+	if (lr->u.lr_region >= region) {
+		DMDEBUG("Remote recovery conflict: (%Lu >= %Lu)/%s",
+			lr->u.lr_region, region, lc->uuid + (strlen(lc->uuid) - 8));
 		lr->u.lr_int_rtn = 1;
-	else
+	} else
 		lr->u.lr_int_rtn = 0;
 
 	return 0;
@@ -639,24 +659,65 @@
 static int server_get_resync_work(struct log_c *lc, struct log_request *lr, uint32_t who)
 {
 	struct region_user *new;
+	int sync_search, conflict = 0;
+	region_t *region = &(lr->u.lr_region_rtn);
 
-	new = mempool_alloc(region_user_pool, GFP_NOFS);
-	if(!new){
-		lr->u.lr_int_rtn = 0;
-		return -ENOMEM;
+	lr->u.lr_int_rtn = 0; /* Default to no work */
+
+	if (lc->recovering_region != (uint64_t)-1) {
+		DMDEBUG("Someone is already recovering region %Lu/%s",
+			lc->recovering_region, lc->uuid + (strlen(lc->uuid) - 8));
+		return 0;
 	}
-	
-	if ((lr->u.lr_int_rtn = _core_get_resync_work(lc, &(lr->u.lr_region_rtn)))){
-		new->ru_nodeid = who;
-		new->ru_region = lr->u.lr_region_rtn;
-		new->ru_rw = RU_RECOVER;
-		list_add(&new->ru_list, &lc->region_users);
-		DMDEBUG("Assigning recovery work to %u/%s: %Lu",
-			who, lc->uuid + (strlen(lc->uuid) - 8), new->ru_region);
-	} else {
-		mempool_free(new, region_user_pool);
+
+	if (lc->sync_search >= lc->region_count) {
+		/*
+		 * FIXME: pvmove is not supported yet, but when it is,
+		 * an audit of sync_count changes will need to be made
+		 */
+		if ((lc->sync_count < lc->region_count) && !lc->sync_pass) {
+			lc->sync_search = 0;
+			lc->sync_pass++;
+		} else {
+			return 0;
+		}
+	}
+
+	for (sync_search = lc->sync_search;
+	     sync_search < lc->region_count;
+	     sync_search = (*region + 1)) {
+		*region = ext2_find_next_zero_bit((unsigned long *) lc->sync_bits,
+						  lc->region_count,
+						  sync_search);
+		if (find_ru_by_region(lc, *region)) {
+			conflict = 1;
+			DMDEBUG("Recovery blocked by outstanding write on region %Lu/%s",
+				*region, lc->uuid + (strlen(lc->uuid) - 8));
+		} else {
+			break;
+		}
 	}
 
+	if (*region >= lc->region_count)
+		return 0;
+
+	new = mempool_alloc(region_user_pool, GFP_NOFS);
+	if (!new)
+		return -ENOMEM;
+
+	if (!conflict)
+		lc->sync_search = *region + 1;
+
+	lc->recovering_region = *region;
+
+	lr->u.lr_int_rtn = 1; /* Assigning work */
+	new->ru_nodeid = who;
+	new->ru_region = *region;
+	new->ru_rw = RU_RECOVER;
+	list_add(&new->ru_list, &lc->region_users);
+	DMDEBUG("Assigning recovery work to %u/%s: %Lu",
+		who, lc->uuid + (strlen(lc->uuid) - 8), new->ru_region);
+
 	return 0;
 }
 



^ permalink raw reply	[flat|nested] 40+ messages in thread

* [Cluster-devel] cluster/cmirror-kernel/src dm-cmirror-client.c ...
@ 2007-04-26 16:54 jbrassow
  0 siblings, 0 replies; 40+ messages in thread
From: jbrassow @ 2007-04-26 16:54 UTC (permalink / raw)
  To: cluster-devel.redhat.com

CVSROOT:	/cvs/cluster
Module name:	cluster
Branch: 	RHEL4
Changes by:	jbrassow at sourceware.org	2007-04-26 17:54:49

Modified files:
	cmirror-kernel/src: dm-cmirror-client.c dm-cmirror-server.c 

Log message:
	Bug 238031: cluster mirrors not handling all recovery/write conflicts
	
	Problem is that the kernel (main mirror code) does not do any marks/clears when
	writing to a region before its recovery.  So, it is not possible for the server
	to detect a conflict.  Basically, we must turn back on the
	'is_remote_recovering' function and disallow any writes to regions that are OR
	WILL BE recovering.
	
	It's really going to cause some pain during writes while mirrors are re-syncing.
	The better fix for the future is to have the writes always mark/clear the
	regions - then we can again remove the 'is_remote_recovering' function.

Patches:
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cmirror-kernel/src/dm-cmirror-client.c.diff?cvsroot=cluster&only_with_tag=RHEL4&r1=1.1.2.46&r2=1.1.2.47
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cmirror-kernel/src/dm-cmirror-server.c.diff?cvsroot=cluster&only_with_tag=RHEL4&r1=1.1.2.34&r2=1.1.2.35

--- cluster/cmirror-kernel/src/Attic/dm-cmirror-client.c	2007/04/24 20:08:57	1.1.2.46
+++ cluster/cmirror-kernel/src/Attic/dm-cmirror-client.c	2007/04/26 16:54:49	1.1.2.47
@@ -861,11 +861,10 @@
 	int rtn;
 	struct log_c *lc = (struct log_c *) log->context;
 
-/* take out optimization
 	if(atomic_read(&lc->in_sync) == 1){
 		return 0;
 	}
-*/
+
 	rtn = consult_server(lc, region, LRT_IS_REMOTE_RECOVERING, NULL);
 	return rtn;
 }
@@ -876,11 +875,11 @@
 	struct log_c *lc = (struct log_c *) log->context;
   
 	/* check known_regions, return if found */
-/* take out optimization
+
 	if(atomic_read(&lc->in_sync) == 1){
 		return 1;
 	}
-*/
+
 	if(!block){
 		return -EWOULDBLOCK;
 	}
@@ -1414,7 +1413,7 @@
 	.resume = cluster_resume,
 	.get_region_size = cluster_get_region_size,
 	.is_clean = cluster_is_clean,
-/*	.is_remote_recovering = cluster_is_remote_recovering,*/
+	.is_remote_recovering = cluster_is_remote_recovering,
 	.in_sync = cluster_in_sync,
 	.flush = cluster_flush,
 	.mark_region = cluster_mark_region,
@@ -1436,7 +1435,7 @@
 	.resume = cluster_resume,
 	.get_region_size = cluster_get_region_size,
 	.is_clean = cluster_is_clean,
-/*	.is_remote_recovering = cluster_is_remote_recovering,*/
+	.is_remote_recovering = cluster_is_remote_recovering,
 	.in_sync = cluster_in_sync,
 	.flush = cluster_flush,
 	.mark_region = cluster_mark_region,
--- cluster/cmirror-kernel/src/Attic/dm-cmirror-server.c	2007/04/24 20:08:57	1.1.2.34
+++ cluster/cmirror-kernel/src/Attic/dm-cmirror-server.c	2007/04/26 16:54:49	1.1.2.35
@@ -494,12 +494,32 @@
 
 static int server_is_remote_recovering(struct log_c *lc, struct log_request *lr)
 {
+	region_t region;
 	struct region_user *ru;
 
-	if ((ru = find_ru_by_region(lc, lr->u.lr_region)) && 
-	    (ru->ru_rw == RU_RECOVER))
+	/*
+	 * This gets a bit complicated.  I wish we didn't have to use this
+	 * function, but because the mirror code doesn't mark regions which
+	 * it writes to that are out-of-sync, we need this function.
+	 *
+	 * Problem is, we don't know how long the user is going to take to
+	 * write to the region after they have called this function.  So,
+	 * we are forced at this point to deny any writes to regions we
+	 * are recovering or might recover in the future.
+	 *
+	 * We can count on the client side to not send us one of these
+	 * requests if the mirror is known to be in-sync.
+	 *
+	 * And yes, it sucks to take this much time off the I/O.
+	 */
+	region = ext2_find_next_zero_bit((unsigned long *) lc->sync_bits,
+					 lc->region_count, 0);
+
+	if (lr->u.lr_region >= region) {
+		DMDEBUG("Remote recovery conflict: (%Lu >= %Lu)/%s",
+			lr->u.lr_region, region, lc->uuid + (strlen(lc->uuid) - 8));
 		lr->u.lr_int_rtn = 1;
-	else
+	} else
 		lr->u.lr_int_rtn = 0;
 
 	return 0;
@@ -639,24 +659,65 @@
 static int server_get_resync_work(struct log_c *lc, struct log_request *lr, uint32_t who)
 {
 	struct region_user *new;
+	int sync_search, conflict = 0;
+	region_t *region = &(lr->u.lr_region_rtn);
 
-	new = mempool_alloc(region_user_pool, GFP_NOFS);
-	if(!new){
-		lr->u.lr_int_rtn = 0;
-		return -ENOMEM;
+	lr->u.lr_int_rtn = 0; /* Default to no work */
+
+	if (lc->recovering_region != (uint64_t)-1) {
+		DMDEBUG("Someone is already recovering region %Lu/%s",
+			lc->recovering_region, lc->uuid + (strlen(lc->uuid) - 8));
+		return 0;
 	}
-	
-	if ((lr->u.lr_int_rtn = _core_get_resync_work(lc, &(lr->u.lr_region_rtn)))){
-		new->ru_nodeid = who;
-		new->ru_region = lr->u.lr_region_rtn;
-		new->ru_rw = RU_RECOVER;
-		list_add(&new->ru_list, &lc->region_users);
-		DMDEBUG("Assigning recovery work to %u/%s: %Lu",
-			who, lc->uuid + (strlen(lc->uuid) - 8), new->ru_region);
-	} else {
-		mempool_free(new, region_user_pool);
+
+	if (lc->sync_search >= lc->region_count) {
+		/*
+		 * FIXME: pvmove is not supported yet, but when it is,
+		 * an audit of sync_count changes will need to be made
+		 */
+		if ((lc->sync_count < lc->region_count) && !lc->sync_pass) {
+			lc->sync_search = 0;
+			lc->sync_pass++;
+		} else {
+			return 0;
+		}
+	}
+
+	for (sync_search = lc->sync_search;
+	     sync_search < lc->region_count;
+	     sync_search = (*region + 1)) {
+		*region = ext2_find_next_zero_bit((unsigned long *) lc->sync_bits,
+						  lc->region_count,
+						  sync_search);
+		if (find_ru_by_region(lc, *region)) {
+			conflict = 1;
+			DMDEBUG("Recovery blocked by outstanding write on region %Lu/%s",
+				*region, lc->uuid + (strlen(lc->uuid) - 8));
+		} else {
+			break;
+		}
 	}
 
+	if (*region >= lc->region_count)
+		return 0;
+
+	new = mempool_alloc(region_user_pool, GFP_NOFS);
+	if (!new)
+		return -ENOMEM;
+
+	if (!conflict)
+		lc->sync_search = *region + 1;
+
+	lc->recovering_region = *region;
+
+	lr->u.lr_int_rtn = 1; /* Assigning work */
+	new->ru_nodeid = who;
+	new->ru_region = *region;
+	new->ru_rw = RU_RECOVER;
+	list_add(&new->ru_list, &lc->region_users);
+	DMDEBUG("Assigning recovery work to %u/%s: %Lu",
+		who, lc->uuid + (strlen(lc->uuid) - 8), new->ru_region);
+
 	return 0;
 }
 



^ permalink raw reply	[flat|nested] 40+ messages in thread

* [Cluster-devel] cluster/cmirror-kernel/src dm-cmirror-client.c ...
@ 2007-04-24 20:10 jbrassow
  0 siblings, 0 replies; 40+ messages in thread
From: jbrassow @ 2007-04-24 20:10 UTC (permalink / raw)
  To: cluster-devel.redhat.com

CVSROOT:	/cvs/cluster
Module name:	cluster
Branch: 	RHEL45
Changes by:	jbrassow at sourceware.org	2007-04-24 21:10:20

Modified files:
	cmirror-kernel/src: dm-cmirror-client.c dm-cmirror-server.c 
	                    dm-cmirror-server.h dm-cmirror-xfr.h 

Log message:
	Bug 199433: NULL pointer dereference in cman:process_messages for cmirro...
	- While this isn't a complete fix for 199433, it is most likely the
	cause of the error.  Cluster mirrors were steadily leaking memory
	every time they were deactivated.
	
	Bug 237028: cmirror recovery deadlock due to machine failure + primary l...
	- If there is outstanding resync work remaining when the server gets
	notice to suspend, delay for a moment to wait for it.

Patches:
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cmirror-kernel/src/dm-cmirror-client.c.diff?cvsroot=cluster&only_with_tag=RHEL45&r1=1.1.2.41.2.4&r2=1.1.2.41.2.5
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cmirror-kernel/src/dm-cmirror-server.c.diff?cvsroot=cluster&only_with_tag=RHEL45&r1=1.1.2.26.2.6&r2=1.1.2.26.2.7
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cmirror-kernel/src/dm-cmirror-server.h.diff?cvsroot=cluster&only_with_tag=RHEL45&r1=1.1.2.2.8.1&r2=1.1.2.2.8.2
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cmirror-kernel/src/dm-cmirror-xfr.h.diff?cvsroot=cluster&only_with_tag=RHEL45&r1=1.1.2.2.2.2&r2=1.1.2.2.2.3

--- cluster/cmirror-kernel/src/Attic/dm-cmirror-client.c	2007/04/10 07:13:15	1.1.2.41.2.4
+++ cluster/cmirror-kernel/src/Attic/dm-cmirror-client.c	2007/04/24 20:10:20	1.1.2.41.2.5
@@ -379,7 +379,8 @@
 
 	if(len <= 0){
 		/* ATTENTION -- what do we do with this ? */
-		DMWARN("Error while listening for server response: %d", len);
+		DMWARN("Error listening for server(%u) response for %s: %d",
+		       lc->server_id, lc->uuid + (strlen(lc->uuid) - 8), len);
 		error = len;
 		*retry = 1;
 		seq++;
@@ -767,6 +768,7 @@
 
 static int cluster_postsuspend(struct dirty_log *log)
 {
+	int i;
 	struct region_state *rs, *tmp_rs;
 	struct log_c *lc = (struct log_c *) log->context;
 
@@ -788,10 +790,20 @@
 
 	spin_unlock(&lc->state_lock);
 
+	if(lc->server_id == my_id) {
+		for (i = 0; server_busy(lc) && (i < 10); i++) {
+			DMDEBUG("Server for %s still busy, waiting for others",
+				lc->uuid + (strlen(lc->uuid) - 8));
+			set_current_state(TASK_INTERRUPTIBLE);
+			schedule_timeout(HZ*2);
+		}
+	}
+
 	atomic_set(&lc->suspended, 1);
 	if(lc->server_id == my_id) {
 		while (1) {
-			DMDEBUG("Telling everyone I'm suspending");
+			DMDEBUG("Telling everyone I'm suspending (%s)",
+				lc->uuid + (strlen(lc->uuid) - 8));
 			consult_server(lc, 0, LRT_MASTER_LEAVING, NULL);
 
 			down(&consult_server_lock);
@@ -799,13 +811,15 @@
 			up(&consult_server_lock);
 
 			if ((my_id && (lc->server_id == my_id))) {
-				DMDEBUG("Delaying suspend, work to be done.");
+				DMDEBUG("Delaying suspend, work to be done (%s)",
+					lc->uuid + (strlen(lc->uuid) - 8));
 				atomic_set(&lc->suspended, 0);
 				set_current_state(TASK_INTERRUPTIBLE);
 				schedule_timeout(HZ*2);
 				atomic_set(&lc->suspended, 1);
 			} else {
-				DMDEBUG("Suspending now");
+				DMDEBUG("Suspending now (%s)",
+					lc->uuid + (strlen(lc->uuid) - 8));
 				break;
 			}
 		}
@@ -1196,6 +1210,16 @@
 
 	switch(status){
 	case STATUSTYPE_INFO:
+		DMDEBUG("LOG INFO:");
+		DMDEBUG("  uuid: %s", lc->uuid);
+		DMDEBUG("  uuid_ref    : %d", lc->uuid_ref);
+		DMDEBUG(" ?region_count: %Lu", lc->region_count);
+		DMDEBUG(" ?sync_count  : %Lu", lc->sync_count);
+		DMDEBUG(" ?sync_search : %d", lc->sync_search);
+		DMDEBUG("  in_sync     : %s", (atomic_read(&lc->in_sync)) ? "YES" : "NO");
+		DMDEBUG("  server_id   : %u", lc->server_id);
+		DMDEBUG("  server_valid: %s",
+			((lc->server_id != 0xDEAD) && lc->server_valid) ? "YES" : "NO");
 		if(lc->sync != DEFAULTSYNC)
 			arg_count++;
 
@@ -1254,11 +1278,6 @@
 	}
 	up(&log_list_lock);
 
-	/*
-	if (likely(!shutting_down))
-		suspend_server();
-	*/
-
 	return 0;
 }
 
@@ -1311,9 +1330,7 @@
 		BUG();
 		break;
 	}
-	/*
-	resume_server();
-	*/
+
 	return 0;
 }
 
@@ -1452,6 +1469,7 @@
 	r = dm_register_dirty_log_type(&_clustered_core_type);
 	if (r) {
 		DMWARN("couldn't register clustered_core dirty log type");
+		mempool_destroy(region_state_pool);
 		return r;
 	}
 
@@ -1459,6 +1477,7 @@
 	if (r) {
 		DMWARN("couldn't register clustered_disk dirty log type");
 		dm_unregister_dirty_log_type(&_clustered_core_type);
+		mempool_destroy(region_state_pool);
 		return r;
 	}
 
@@ -1475,6 +1494,7 @@
 	}
 	dm_unregister_dirty_log_type(&_clustered_core_type);
 	dm_unregister_dirty_log_type(&_clustered_disk_type);
+	mempool_destroy(region_state_pool);
         DMINFO("dm-cmirror %s (built %s %s) removed",
                CMIRROR_RELEASE_NAME, __DATE__, __TIME__);
 }
--- cluster/cmirror-kernel/src/Attic/dm-cmirror-server.c	2007/04/10 18:10:42	1.1.2.26.2.6
+++ cluster/cmirror-kernel/src/Attic/dm-cmirror-server.c	2007/04/24 20:10:20	1.1.2.26.2.7
@@ -42,8 +42,6 @@
 static atomic_t server_run;
 static struct completion server_completion;
 
-static wait_queue_head_t _suspend_queue;
-static atomic_t _suspend;
 static atomic_t _do_requests;
 
 static int debug_disk_write = 0;
@@ -706,8 +704,8 @@
 	}
 
 	if (!ru) {
-		DMERR("Unable to find region to be marked out-of-sync: %Lu/%s/%u",
-		      lr->u.lr_region, lc->uuid + (strlen(lc->uuid) - 8), who);
+		DMDEBUG("Unable to find region to be marked out-of-sync: %Lu/%s/%u",
+			lr->u.lr_region, lc->uuid + (strlen(lc->uuid) - 8), who);
 		/*
 		 * This is a valid case, when the following happens:
 		 * 1) a region is recovering and has waiting writes
@@ -798,6 +796,12 @@
 	uint32_t lowest, next;
 	uint32_t node_count=global_count, *nodeids=global_nodeids;
 
+	DMDEBUG("%s(%d): (%s)", RQ_STRING(lr->lr_type), lr->lr_type,
+		(lc) ? lc->uuid + (strlen(lc->uuid) - 8) : "none");
+	DMDEBUG("  starter     : %u", lr->u.lr_starter);
+	DMDEBUG("  co-ordinator: %u", lr->u.lr_coordinator);
+	DMDEBUG("  node_count  : %d", lr->u.lr_node_count);
+
 	/* Record the starter's port number so we can get back to him */
 	if((lr->u.lr_starter == my_id) && (!lr->u.lr_node_count)){
 		lr->u.lr_starter_port = saddr->sin_port;
@@ -1175,12 +1179,12 @@
 
 	complete(&server_completion);
   
+	DMDEBUG("cluster_log_serverd ready for work");
 	for(;;){
 		if(!atomic_read(&server_run)){
 			break;
 		}
 
-		suspend_on(&_suspend_queue, atomic_read(&_suspend));
 		switch(atomic_read(&restart_event_type)){
 		case SERVICE_NODE_LEAVE:
 			/* ATTENTION -- may wish to check if regions **
@@ -1206,6 +1210,9 @@
 			up(&log_list_lock);
 
 			break;
+		case SERVICE_NODE_JOIN:
+			DMDEBUG("Node joining");
+			break;
 		default:
 			/* Someone has joined, or there is no event */
 			break;
@@ -1227,6 +1234,7 @@
 		schedule();
 	}
 
+	DMDEBUG("Closing socket on server side");
 	sock_release(sock);
 	complete(&server_completion);
 	return 0;
@@ -1244,8 +1252,6 @@
 void print_server_status(struct log_c *lc){
 	int i;
 
-	atomic_set(&_suspend, 1);
-
 	DMINFO("SERVER OUTPUT::");
 
 	DMINFO("  Live nodes        :: %d", global_count);
@@ -1267,11 +1273,18 @@
 	i = print_zero_bits((unsigned char *)lc->sync_bits, 0, lc->bitset_uint32_count);
 	DMINFO("  Total = %d", i);
 
-	atomic_set(&_suspend, 0);
-	wake_up_all(&_suspend_queue);
 }
 */
 
+int server_busy(struct log_c *lc)
+{
+	if (!list_empty(&lc->region_users) ||
+	    (lc->recovering_region != (uint64_t)-1))
+		return 1;
+	else
+		return 0;
+}
+
 int server_free_region_users(struct log_c *lc)
 {
 	int i = 0;
@@ -1287,18 +1300,6 @@
 	return 0;
 }
 
-
-int suspend_server(void){
-	atomic_set(&_suspend, 1);
-	return 0;
-}
-
-int resume_server(void){
-	atomic_set(&_suspend, 0);
-	wake_up_all(&_suspend_queue);
-	return 0;
-}
-
 int resume_server_requests(void) {
 	atomic_set(&_do_requests, 1);
 	return 0;
@@ -1307,6 +1308,7 @@
 int start_server(void /* log_devices ? */){
 	int error;
 
+	DMDEBUG("start_server called");
 	region_user_pool = mempool_create(1000, region_user_alloc,
 					  region_user_free, NULL);
 	if(!region_user_pool){
@@ -1314,20 +1316,20 @@
 		return -ENOMEM;
 	}
 
-	init_waitqueue_head(&_suspend_queue);
-
 	atomic_set(&_do_requests, 0);
 	atomic_set(&server_run, 1);
 	init_completion(&server_completion);
 
 	error = kernel_thread(cluster_log_serverd, NULL, 0);
 	if(error < 0){
+		mempool_destroy(region_user_pool);
 		DMWARN("failed to start kernel thread.");
 		return error;
 	}
 	wait_for_completion(&server_completion);
 
 	if(!atomic_read(&server_run)){
+		mempool_destroy(region_user_pool);
 		DMWARN("Cluster mirror log server thread failed to start");
 		return -1;
 	}
@@ -1337,9 +1339,17 @@
 
 
 void stop_server(void){
+	DMDEBUG("stop_server called");
 	atomic_set(&server_run, 0);
 
 	wait_for_completion(&server_completion);
+	down(&log_list_lock);
+	if (!list_empty(&log_list_head)) {
+		DMERR("Log elements remain at cluster log server shutdown");
+	}
+	up(&log_list_lock);
+	mempool_destroy(region_user_pool);
+
 	dm_io_put(32);
 }
 /*
--- cluster/cmirror-kernel/src/Attic/dm-cmirror-server.h	2007/04/10 07:13:15	1.1.2.2.8.1
+++ cluster/cmirror-kernel/src/Attic/dm-cmirror-server.h	2007/04/24 20:10:20	1.1.2.2.8.2
@@ -7,7 +7,7 @@
 #ifndef __DM_CMIRROR_SERVER_H__
 #define __DM_CMIRROR_SERVER_H__
 
-int suspend_server(void);
+int server_busy(struct log_c *lc);
 int resume_server(void);
 int resume_server_requests(void);
 int start_server(void);
--- cluster/cmirror-kernel/src/Attic/dm-cmirror-xfr.h	2007/04/03 18:23:01	1.1.2.2.2.2
+++ cluster/cmirror-kernel/src/Attic/dm-cmirror-xfr.h	2007/04/24 20:10:20	1.1.2.2.2.3
@@ -30,14 +30,15 @@
 	((x) == LRT_IS_CLEAN) ? "LRT_IS_CLEAN": \
 	((x) == LRT_IN_SYNC) ? "LRT_IN_SYNC": \
 	((x) == LRT_MARK_REGION) ? "LRT_MARK_REGION": \
+	((x) == LRT_CLEAR_REGION) ? "LRT_CLEAR_REGION": \
 	((x) == LRT_FLUSH) ? "LRT_FLUSH": \
 	((x) == LRT_GET_RESYNC_WORK) ? "LRT_GET_RESYNC_WORK": \
-	((x) == LRT_GET_SYNC_COUNT) ? "LRT_GET_SYNC_COUNT": \
-	((x) == LRT_CLEAR_REGION) ? "LRT_CLEAR_REGION": \
 	((x) == LRT_COMPLETE_RESYNC_WORK) ? "LRT_COMPLETE_RESYNC_WORK": \
-	((x) == LRT_MASTER_LEAVING) ? "LRT_MASTER_LEAVING": \
+	((x) == LRT_GET_SYNC_COUNT) ? "LRT_GET_SYNC_COUNT": \
 	((x) == LRT_ELECTION) ? "LRT_ELECTION": \
-	((x) == LRT_SELECTION) ? "LRT_SELECTION": "UNKNOWN"
+	((x) == LRT_SELECTION) ? "LRT_SELECTION": \
+	((x) == LRT_MASTER_ASSIGN) ? "LRT_MASTER_ASSIGN": \
+	((x) == LRT_MASTER_LEAVING) ? "LRT_MASTER_LEAVING" : "UNKNOWN"
 
 struct log_request {
 	int lr_type;



^ permalink raw reply	[flat|nested] 40+ messages in thread

* [Cluster-devel] cluster/cmirror-kernel/src dm-cmirror-client.c ...
@ 2007-04-24 20:08 jbrassow
  0 siblings, 0 replies; 40+ messages in thread
From: jbrassow @ 2007-04-24 20:08 UTC (permalink / raw)
  To: cluster-devel.redhat.com

CVSROOT:	/cvs/cluster
Module name:	cluster
Branch: 	RHEL4
Changes by:	jbrassow at sourceware.org	2007-04-24 21:08:57

Modified files:
	cmirror-kernel/src: dm-cmirror-client.c dm-cmirror-server.c 
	                    dm-cmirror-server.h dm-cmirror-xfr.h 

Log message:
	Bug 199433: NULL pointer dereference in cman:process_messages for cmirro...
	- While this isn't a complete fix for 199433, it is most likely the
	cause of the error.  Cluster mirrors were steadily leaking memory
	every time they were deactivated.
	
	Bug 237028: cmirror recovery deadlock due to machine failure + primary l...
	- If there is outstanding resync work remaining when the server gets
	notice to suspend, delay for a moment to wait for it.

Patches:
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cmirror-kernel/src/dm-cmirror-client.c.diff?cvsroot=cluster&only_with_tag=RHEL4&r1=1.1.2.45&r2=1.1.2.46
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cmirror-kernel/src/dm-cmirror-server.c.diff?cvsroot=cluster&only_with_tag=RHEL4&r1=1.1.2.33&r2=1.1.2.34
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cmirror-kernel/src/dm-cmirror-server.h.diff?cvsroot=cluster&only_with_tag=RHEL4&r1=1.1.2.3&r2=1.1.2.4
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cmirror-kernel/src/dm-cmirror-xfr.h.diff?cvsroot=cluster&only_with_tag=RHEL4&r1=1.1.2.4&r2=1.1.2.5

--- cluster/cmirror-kernel/src/Attic/dm-cmirror-client.c	2007/04/10 07:12:24	1.1.2.45
+++ cluster/cmirror-kernel/src/Attic/dm-cmirror-client.c	2007/04/24 20:08:57	1.1.2.46
@@ -379,7 +379,8 @@
 
 	if(len <= 0){
 		/* ATTENTION -- what do we do with this ? */
-		DMWARN("Error while listening for server response: %d", len);
+		DMWARN("Error listening for server(%u) response for %s: %d",
+		       lc->server_id, lc->uuid + (strlen(lc->uuid) - 8), len);
 		error = len;
 		*retry = 1;
 		seq++;
@@ -767,6 +768,7 @@
 
 static int cluster_postsuspend(struct dirty_log *log)
 {
+	int i;
 	struct region_state *rs, *tmp_rs;
 	struct log_c *lc = (struct log_c *) log->context;
 
@@ -788,10 +790,20 @@
 
 	spin_unlock(&lc->state_lock);
 
+	if(lc->server_id == my_id) {
+		for (i = 0; server_busy(lc) && (i < 10); i++) {
+			DMDEBUG("Server for %s still busy, waiting for others",
+				lc->uuid + (strlen(lc->uuid) - 8));
+			set_current_state(TASK_INTERRUPTIBLE);
+			schedule_timeout(HZ*2);
+		}
+	}
+
 	atomic_set(&lc->suspended, 1);
 	if(lc->server_id == my_id) {
 		while (1) {
-			DMDEBUG("Telling everyone I'm suspending");
+			DMDEBUG("Telling everyone I'm suspending (%s)",
+				lc->uuid + (strlen(lc->uuid) - 8));
 			consult_server(lc, 0, LRT_MASTER_LEAVING, NULL);
 
 			down(&consult_server_lock);
@@ -799,13 +811,15 @@
 			up(&consult_server_lock);
 
 			if ((my_id && (lc->server_id == my_id))) {
-				DMDEBUG("Delaying suspend, work to be done.");
+				DMDEBUG("Delaying suspend, work to be done (%s)",
+					lc->uuid + (strlen(lc->uuid) - 8));
 				atomic_set(&lc->suspended, 0);
 				set_current_state(TASK_INTERRUPTIBLE);
 				schedule_timeout(HZ*2);
 				atomic_set(&lc->suspended, 1);
 			} else {
-				DMDEBUG("Suspending now");
+				DMDEBUG("Suspending now (%s)",
+					lc->uuid + (strlen(lc->uuid) - 8));
 				break;
 			}
 		}
@@ -1196,6 +1210,16 @@
 
 	switch(status){
 	case STATUSTYPE_INFO:
+		DMDEBUG("LOG INFO:");
+		DMDEBUG("  uuid: %s", lc->uuid);
+		DMDEBUG("  uuid_ref    : %d", lc->uuid_ref);
+		DMDEBUG(" ?region_count: %Lu", lc->region_count);
+		DMDEBUG(" ?sync_count  : %Lu", lc->sync_count);
+		DMDEBUG(" ?sync_search : %d", lc->sync_search);
+		DMDEBUG("  in_sync     : %s", (atomic_read(&lc->in_sync)) ? "YES" : "NO");
+		DMDEBUG("  server_id   : %u", lc->server_id);
+		DMDEBUG("  server_valid: %s",
+			((lc->server_id != 0xDEAD) && lc->server_valid) ? "YES" : "NO");
 		if(lc->sync != DEFAULTSYNC)
 			arg_count++;
 
@@ -1254,11 +1278,6 @@
 	}
 	up(&log_list_lock);
 
-	/*
-	if (likely(!shutting_down))
-		suspend_server();
-	*/
-
 	return 0;
 }
 
@@ -1311,9 +1330,7 @@
 		BUG();
 		break;
 	}
-	/*
-	resume_server();
-	*/
+
 	return 0;
 }
 
@@ -1452,6 +1469,7 @@
 	r = dm_register_dirty_log_type(&_clustered_core_type);
 	if (r) {
 		DMWARN("couldn't register clustered_core dirty log type");
+		mempool_destroy(region_state_pool);
 		return r;
 	}
 
@@ -1459,6 +1477,7 @@
 	if (r) {
 		DMWARN("couldn't register clustered_disk dirty log type");
 		dm_unregister_dirty_log_type(&_clustered_core_type);
+		mempool_destroy(region_state_pool);
 		return r;
 	}
 
@@ -1475,6 +1494,7 @@
 	}
 	dm_unregister_dirty_log_type(&_clustered_core_type);
 	dm_unregister_dirty_log_type(&_clustered_disk_type);
+	mempool_destroy(region_state_pool);
         DMINFO("dm-cmirror %s (built %s %s) removed",
                CMIRROR_RELEASE_NAME, __DATE__, __TIME__);
 }
--- cluster/cmirror-kernel/src/Attic/dm-cmirror-server.c	2007/04/17 19:49:11	1.1.2.33
+++ cluster/cmirror-kernel/src/Attic/dm-cmirror-server.c	2007/04/24 20:08:57	1.1.2.34
@@ -42,8 +42,6 @@
 static atomic_t server_run;
 static struct completion server_completion;
 
-static wait_queue_head_t _suspend_queue;
-static atomic_t _suspend;
 static atomic_t _do_requests;
 
 static int debug_disk_write = 0;
@@ -798,6 +796,12 @@
 	uint32_t lowest, next;
 	uint32_t node_count=global_count, *nodeids=global_nodeids;
 
+	DMDEBUG("%s(%d): (%s)", RQ_STRING(lr->lr_type), lr->lr_type,
+		(lc) ? lc->uuid + (strlen(lc->uuid) - 8) : "none");
+	DMDEBUG("  starter     : %u", lr->u.lr_starter);
+	DMDEBUG("  co-ordinator: %u", lr->u.lr_coordinator);
+	DMDEBUG("  node_count  : %d", lr->u.lr_node_count);
+
 	/* Record the starter's port number so we can get back to him */
 	if((lr->u.lr_starter == my_id) && (!lr->u.lr_node_count)){
 		lr->u.lr_starter_port = saddr->sin_port;
@@ -1175,12 +1179,12 @@
 
 	complete(&server_completion);
   
+	DMDEBUG("cluster_log_serverd ready for work");
 	for(;;){
 		if(!atomic_read(&server_run)){
 			break;
 		}
 
-		suspend_on(&_suspend_queue, atomic_read(&_suspend));
 		switch(atomic_read(&restart_event_type)){
 		case SERVICE_NODE_LEAVE:
 			/* ATTENTION -- may wish to check if regions **
@@ -1206,6 +1210,9 @@
 			up(&log_list_lock);
 
 			break;
+		case SERVICE_NODE_JOIN:
+			DMDEBUG("Node joining");
+			break;
 		default:
 			/* Someone has joined, or there is no event */
 			break;
@@ -1227,6 +1234,7 @@
 		schedule();
 	}
 
+	DMDEBUG("Closing socket on server side");
 	sock_release(sock);
 	complete(&server_completion);
 	return 0;
@@ -1244,8 +1252,6 @@
 void print_server_status(struct log_c *lc){
 	int i;
 
-	atomic_set(&_suspend, 1);
-
 	DMINFO("SERVER OUTPUT::");
 
 	DMINFO("  Live nodes        :: %d", global_count);
@@ -1267,11 +1273,18 @@
 	i = print_zero_bits((unsigned char *)lc->sync_bits, 0, lc->bitset_uint32_count);
 	DMINFO("  Total = %d", i);
 
-	atomic_set(&_suspend, 0);
-	wake_up_all(&_suspend_queue);
 }
 */
 
+int server_busy(struct log_c *lc)
+{
+	if (!list_empty(&lc->region_users) ||
+	    (lc->recovering_region != (uint64_t)-1))
+		return 1;
+	else
+		return 0;
+}
+
 int server_free_region_users(struct log_c *lc)
 {
 	int i = 0;
@@ -1287,18 +1300,6 @@
 	return 0;
 }
 
-
-int suspend_server(void){
-	atomic_set(&_suspend, 1);
-	return 0;
-}
-
-int resume_server(void){
-	atomic_set(&_suspend, 0);
-	wake_up_all(&_suspend_queue);
-	return 0;
-}
-
 int resume_server_requests(void) {
 	atomic_set(&_do_requests, 1);
 	return 0;
@@ -1307,6 +1308,7 @@
 int start_server(void /* log_devices ? */){
 	int error;
 
+	DMDEBUG("start_server called");
 	region_user_pool = mempool_create(1000, region_user_alloc,
 					  region_user_free, NULL);
 	if(!region_user_pool){
@@ -1314,20 +1316,20 @@
 		return -ENOMEM;
 	}
 
-	init_waitqueue_head(&_suspend_queue);
-
 	atomic_set(&_do_requests, 0);
 	atomic_set(&server_run, 1);
 	init_completion(&server_completion);
 
 	error = kernel_thread(cluster_log_serverd, NULL, 0);
 	if(error < 0){
+		mempool_destroy(region_user_pool);
 		DMWARN("failed to start kernel thread.");
 		return error;
 	}
 	wait_for_completion(&server_completion);
 
 	if(!atomic_read(&server_run)){
+		mempool_destroy(region_user_pool);
 		DMWARN("Cluster mirror log server thread failed to start");
 		return -1;
 	}
@@ -1337,9 +1339,17 @@
 
 
 void stop_server(void){
+	DMDEBUG("stop_server called");
 	atomic_set(&server_run, 0);
 
 	wait_for_completion(&server_completion);
+	down(&log_list_lock);
+	if (!list_empty(&log_list_head)) {
+		DMERR("Log elements remain at cluster log server shutdown");
+	}
+	up(&log_list_lock);
+	mempool_destroy(region_user_pool);
+
 	dm_io_put(32);
 }
 /*
--- cluster/cmirror-kernel/src/Attic/dm-cmirror-server.h	2007/04/10 07:12:24	1.1.2.3
+++ cluster/cmirror-kernel/src/Attic/dm-cmirror-server.h	2007/04/24 20:08:57	1.1.2.4
@@ -7,7 +7,7 @@
 #ifndef __DM_CMIRROR_SERVER_H__
 #define __DM_CMIRROR_SERVER_H__
 
-int suspend_server(void);
+int server_busy(struct log_c *lc);
 int resume_server(void);
 int resume_server_requests(void);
 int start_server(void);
--- cluster/cmirror-kernel/src/Attic/dm-cmirror-xfr.h	2007/04/03 18:21:10	1.1.2.4
+++ cluster/cmirror-kernel/src/Attic/dm-cmirror-xfr.h	2007/04/24 20:08:57	1.1.2.5
@@ -30,14 +30,15 @@
 	((x) == LRT_IS_CLEAN) ? "LRT_IS_CLEAN": \
 	((x) == LRT_IN_SYNC) ? "LRT_IN_SYNC": \
 	((x) == LRT_MARK_REGION) ? "LRT_MARK_REGION": \
+	((x) == LRT_CLEAR_REGION) ? "LRT_CLEAR_REGION": \
 	((x) == LRT_FLUSH) ? "LRT_FLUSH": \
 	((x) == LRT_GET_RESYNC_WORK) ? "LRT_GET_RESYNC_WORK": \
-	((x) == LRT_GET_SYNC_COUNT) ? "LRT_GET_SYNC_COUNT": \
-	((x) == LRT_CLEAR_REGION) ? "LRT_CLEAR_REGION": \
 	((x) == LRT_COMPLETE_RESYNC_WORK) ? "LRT_COMPLETE_RESYNC_WORK": \
-	((x) == LRT_MASTER_LEAVING) ? "LRT_MASTER_LEAVING": \
+	((x) == LRT_GET_SYNC_COUNT) ? "LRT_GET_SYNC_COUNT": \
 	((x) == LRT_ELECTION) ? "LRT_ELECTION": \
-	((x) == LRT_SELECTION) ? "LRT_SELECTION": "UNKNOWN"
+	((x) == LRT_SELECTION) ? "LRT_SELECTION": \
+	((x) == LRT_MASTER_ASSIGN) ? "LRT_MASTER_ASSIGN": \
+	((x) == LRT_MASTER_LEAVING) ? "LRT_MASTER_LEAVING" : "UNKNOWN"
 
 struct log_request {
 	int lr_type;



^ permalink raw reply	[flat|nested] 40+ messages in thread

* [Cluster-devel] cluster/cmirror-kernel/src dm-cmirror-client.c ...
@ 2007-04-10  7:13 jbrassow
  0 siblings, 0 replies; 40+ messages in thread
From: jbrassow @ 2007-04-10  7:13 UTC (permalink / raw)
  To: cluster-devel.redhat.com

CVSROOT:	/cvs/cluster
Module name:	cluster
Branch: 	RHEL45
Changes by:	jbrassow at sourceware.org	2007-04-10 08:13:15

Modified files:
	cmirror-kernel/src: dm-cmirror-client.c dm-cmirror-common.h 
	                    dm-cmirror-server.c dm-cmirror-server.h 

Log message:
	Bug 235686: Kernel BUG at dm_cmirror_server while recovering region
	
	Several fixes have gone in to fix the handling of this bug:
	1) During server relocation (which can happen due to machine failure
	or normal mirror suspension), the server value could get set before
	the client had a chance to clean-up.  This caused the server to
	become confused and issue a BUG().
	
	2) perform a flush of the log before suspending.  This ensures
	that regions which are in-sync get correctly flushed to the disk
	log.  Without this, there will always be recovery work to be done
	when a mirror starts up - even if it was properly in-sync during
	shutdown.
	
	3) clean-up memory used to record region users when a mirror is
	shutdown.  It was possible for some regions to be left over
	(causing a memory leak) during certain fault scenarios.
	
	4) properly initialize the state field (ru_rw) in the region
	user structure when a mark occurs.  Without the initialization,
	it was sometimes possible for the region to be misinterpretted
	as recovering instead of marked.
	
	5) resolve and unhandled case in server_complete_resync_work
	
	6) reset a variable in cluster_complete_resync_work.  Failure
	to do so was causing a retry to include the wrong value for
	the completion of the resync work - confusing the server.

Patches:
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cmirror-kernel/src/dm-cmirror-client.c.diff?cvsroot=cluster&only_with_tag=RHEL45&r1=1.1.2.41.2.3&r2=1.1.2.41.2.4
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cmirror-kernel/src/dm-cmirror-common.h.diff?cvsroot=cluster&only_with_tag=RHEL45&r1=1.1.2.12.2.1&r2=1.1.2.12.2.2
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cmirror-kernel/src/dm-cmirror-server.c.diff?cvsroot=cluster&only_with_tag=RHEL45&r1=1.1.2.26.2.4&r2=1.1.2.26.2.5
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cmirror-kernel/src/dm-cmirror-server.h.diff?cvsroot=cluster&only_with_tag=RHEL45&r1=1.1.2.2&r2=1.1.2.2.8.1

--- cluster/cmirror-kernel/src/Attic/dm-cmirror-client.c	2007/04/05 21:33:36	1.1.2.41.2.3
+++ cluster/cmirror-kernel/src/Attic/dm-cmirror-client.c	2007/04/10 07:13:15	1.1.2.41.2.4
@@ -373,13 +373,8 @@
 	fs = get_fs();
 	set_fs(get_ds());
 
-	if(type == LRT_MASTER_LEAVING){
-		len = my_recvmsg(lc->client_sock, &msg, sizeof(struct log_request),
-				 0, 10);
-	} else {
-		len = my_recvmsg(lc->client_sock, &msg, sizeof(struct log_request),
-				 0, 5);
-	}
+	len = my_recvmsg(lc->client_sock, &msg,
+			 sizeof(struct log_request), 0, 15);
 	set_fs(fs);
 
 	if(len <= 0){
@@ -492,9 +487,14 @@
 			goto out;
 		}
 	election:
-		while(lc->server_id == 0xDEAD){
+		while ((lc->server_id == 0xDEAD) || (!lc->server_valid)) {
+			DMDEBUG("server_id=%x, server_valid=%u, %s",
+				lc->server_id, lc->server_valid,
+				lc->uuid + (strlen(lc->uuid) - 8));
+			DMDEBUG("trigger = %s", RQ_STRING(type));
 			run_election(lc, my_id);
 			new_server = 1;
+			lc->server_valid = 1;
 		}
 
 		spin_lock(&lc->state_lock);
@@ -600,6 +600,7 @@
 	INIT_LIST_HEAD(&lc->mark_logged);
 	spin_lock_init(&lc->state_lock);
 
+	lc->server_valid = 0;
 	lc->server_id = 0xDEAD;
 
 	if ((error = cluster_connect())) {
@@ -731,19 +732,20 @@
 	}
 
 	if (!list_empty(&lc->mark_logged)) {
-		DMERR("Mark requests remain at cluster log deactivation");
-		/*
-		 * Should I BUG() this?
-		 * No.  In the worst case, they will get cleaned up later
-		 */
-	}
-	list_for_each_entry_safe(rs, tmp_rs, &lc->mark_logged, rs_list) {
-		list_del_init(&rs->rs_list);
-		mempool_free(rs, region_state_pool);
+		int i = 0;
+		
+		list_for_each_entry_safe(rs, tmp_rs, &lc->mark_logged, rs_list) {
+			i++;
+			list_del_init(&rs->rs_list);
+			mempool_free(rs, region_state_pool);
+		}
+		DMDEBUG("%d mark requests remain at cluster log deactivation", i);
 	}
 
 	spin_unlock(&lc->state_lock);
 
+	server_free_region_users(lc);
+
 	if (lc->log_dev)
 		disk_dtr(log);
 	else
@@ -753,8 +755,13 @@
 		DMERR("Unable to disconnect from cluster infrastructure.\n");
 }
 
+static int cluster_flush(struct dirty_log *log);
 static int cluster_presuspend(struct dirty_log *log)
 {
+	/* FIXME: flush is work-around for bug 235040 */
+	DMDEBUG("Performing flush to work around bug 235040");
+	cluster_flush(log);
+	DMDEBUG("Log flush complete");
 	return 0;
 }
 
@@ -784,6 +791,7 @@
 	atomic_set(&lc->suspended, 1);
 	if(lc->server_id == my_id) {
 		while (1) {
+			DMDEBUG("Telling everyone I'm suspending");
 			consult_server(lc, 0, LRT_MASTER_LEAVING, NULL);
 
 			down(&consult_server_lock);
@@ -791,11 +799,13 @@
 			up(&consult_server_lock);
 
 			if ((my_id && (lc->server_id == my_id))) {
+				DMDEBUG("Delaying suspend, work to be done.");
 				atomic_set(&lc->suspended, 0);
 				set_current_state(TASK_INTERRUPTIBLE);
 				schedule_timeout(HZ*2);
 				atomic_set(&lc->suspended, 1);
 			} else {
+				DMDEBUG("Suspending now");
 				break;
 			}
 		}
@@ -907,7 +917,7 @@
 				DMDEBUG("Delaying mark to region %Lu, due to recovery",
 					rs->rs_region);
 				set_current_state(TASK_INTERRUPTIBLE);
-				schedule_timeout(HZ/2);
+				schedule_timeout(HZ);
 				continue;
 			}
 
@@ -933,7 +943,7 @@
 		if (r == -EBUSY) {
 			DMDEBUG("Delaying flush due to recovery");
 			set_current_state(TASK_INTERRUPTIBLE);
-			schedule_timeout(HZ/2);
+			schedule_timeout(HZ);
 			continue;
 		}
 
@@ -1109,6 +1119,10 @@
 		rtn = 0;
 	}
 
+	if (rtn)
+		DMDEBUG("Client received resync work: %Lu/%s",
+			*region, lc->uuid + (strlen(lc->uuid) - 8));
+
 	/*
 	 * Check for bug 235039
 	 * Note the changes in cluser_clear_region
@@ -1129,10 +1143,16 @@
 static void cluster_complete_resync_work(struct dirty_log *log,
 					 region_t region, int success)
 {
+	int i;
 	region_t success_tmp = success;
 	struct log_c *lc = (struct log_c *) log->context;
-	while(consult_server(lc, region, LRT_COMPLETE_RESYNC_WORK, &success_tmp)){
-		DMWARN("unable to notify server of completed resync work");
+
+	for (i = 0; i < 5; i++) {
+		if (!consult_server(lc, region,
+				    LRT_COMPLETE_RESYNC_WORK, &success_tmp))
+			break;
+		success_tmp = success;
+		DMWARN("Unable to notify server of completed resync work");
 	}
 	return;
 }
--- cluster/cmirror-kernel/src/Attic/dm-cmirror-common.h	2007/04/03 18:23:01	1.1.2.12.2.1
+++ cluster/cmirror-kernel/src/Attic/dm-cmirror-common.h	2007/04/10 07:13:15	1.1.2.12.2.2
@@ -141,6 +141,7 @@
 	struct list_head mark_waiting;
 	struct list_head mark_logged;
 
+	uint32_t server_valid;
 	uint32_t server_id;
 	struct socket *client_sock;
 };
--- cluster/cmirror-kernel/src/Attic/dm-cmirror-server.c	2007/04/05 21:33:36	1.1.2.26.2.4
+++ cluster/cmirror-kernel/src/Attic/dm-cmirror-server.c	2007/04/10 07:13:15	1.1.2.26.2.5
@@ -229,7 +229,8 @@
 	int sync_search, conflict = 0;
 
 	if (lc->recovering_region != (uint64_t)-1) {
-		DMDEBUG("Someone is already recovering region %Lu", lc->recovering_region);
+		DMDEBUG("Someone is already recovering region %Lu/%s",
+			lc->recovering_region, lc->uuid + (strlen(lc->uuid) - 8));
 		return 0;
 	}
 
@@ -253,8 +254,8 @@
 						  sync_search);
 		if (find_ru_by_region(lc, *region)) {
 			conflict = 1;
-			DMDEBUG("Recovery blocked by outstanding write on region %Lu",
-			      *region);
+			DMDEBUG("Recovery blocked by outstanding write on region %Lu/%s",
+				*region, lc->uuid + (strlen(lc->uuid) - 8));
 		} else {
 			break;
 		}
@@ -534,7 +535,8 @@
 
 	new->ru_nodeid = who;
 	new->ru_region = lr->u.lr_region;
-    
+	new->ru_rw = RU_WRITE;
+
 	if (!(ru = find_ru_by_region(lc, lr->u.lr_region))) {
 		log_clear_bit(lc, lc->clean_bits, lr->u.lr_region);
 		list_add(&new->ru_list, &lc->region_users);
@@ -615,6 +617,11 @@
 					DMDEBUG("  %s", (ru2->ru_rw == RU_RECOVER) ? "recover" :
 						(ru2->ru_rw == RU_WRITE) ? "writer" : "unknown");
 
+			/* FIXME: work-around for bug 235040 */
+			DMDEBUG("Revoking resync work");
+			lc->recovering_region = (uint64_t)-1;
+			list_del(&ru->ru_list);
+			mempool_free(ru, region_user_pool);
 			return -EBUSY;
 		}
 	}
@@ -646,7 +653,8 @@
 		new->ru_region = lr->u.lr_region_rtn;
 		new->ru_rw = RU_RECOVER;
 		list_add(&new->ru_list, &lc->region_users);
-		DMDEBUG("Assigning recovery work to %u: %Lu", who, new->ru_region);
+		DMDEBUG("Assigning recovery work to %u/%s: %Lu",
+			who, lc->uuid + (strlen(lc->uuid) - 8), new->ru_region);
 	} else {
 		mempool_free(new, region_user_pool);
 	}
@@ -654,8 +662,8 @@
 	return 0;
 }
 
-
-static int server_complete_resync_work(struct log_c *lc, struct log_request *lr, int success){
+static int server_complete_resync_work(struct log_c *lc, struct log_request *lr,
+				       int success, uint32_t who){
 	struct region_user *ru;
 
 	if (lr->u.lr_region > lc->region_count) {
@@ -679,51 +687,61 @@
 		lc->sync_pass = 0;
 
 		DMDEBUG("Resync work completed: %Lu", lr->u.lr_region);
-	} else if (log_test_bit(lc->sync_bits, lr->u.lr_region)) {
-		ru = find_ru_by_region(lc, lr->u.lr_region);
+		return 0;
+	}
 
-		/*
-		 * The following condition can never happen unless we have
-		 * a corrupted list or we had a communication error.
-		 *
-		 * If a write failed to one of the mirror devices, the ru
-		 * should be RU_WRITE.  If a recovery failed, it should be
-		 * RU_RECOVER
-		 */
-		if (!ru) {
-			DMERR("Unable to find region being marked out-of-sync: %Lu",
-			      lr->u.lr_region);
-			return -EINVAL;
-		}
+	/*
+	 * Recovery failed or mirror is being marked out-of-sync
+	 *
+	 * We can recieve multiple calls to mark out-of-sync
+	 * if there were several writes to the same region that
+	 * failed.  In this case, there will not be a record for
+	 * the region.
+	 */
+	ru = find_ru(lc, who, lr->u.lr_region);
 
-		if (ru->ru_rw == RU_RECOVER) {
-			if (lr->u.lr_region != lc->recovering_region) {
-				DMERR("Recovering region mismatch: (%Lu/%Lu)",
-				      lr->u.lr_region, lc->recovering_region);
-				BUG();
-			}
-			/*
-			 * Clear the recovery
-			 */
-			lc->recovering_region = (uint64_t)-1;
-			list_del(&ru->ru_list);
-			mempool_free(ru, region_user_pool);
-		} else {  /* ru->ru_rw == RU_WRITE */
-			/*
-			 * Mirror has place the region into RH_NOSYNC
-			 * It is safe to pull the ru
-			 */
-			list_del(&ru->ru_list);
-			mempool_free(ru, region_user_pool);			
+	if ((lr->u.lr_region == lc->recovering_region) && !ru) {
+		DMERR("Unable to locate record of recovery");
+		BUG();
+	}
+
+	if (!ru) {
+		DMERR("Unable to find region to be marked out-of-sync: %Lu/%s/%u",
+		      lr->u.lr_region, lc->uuid + (strlen(lc->uuid) - 8), who);
+		BUG();
+	} else 	if (ru->ru_rw == RU_RECOVER) {
+		if (lr->u.lr_region != lc->recovering_region) {
+			DMERR("Recovering region mismatch from node %u: (%Lu/%Lu)",
+			      who, lr->u.lr_region, lc->recovering_region);
+			BUG();
 		}
-		/* gone again: lc->sync_count--;*/
-		log_clear_bit(lc, lc->sync_bits, lr->u.lr_region);
+		/*
+		 * Clear the recovery
+		 */
+		DMDEBUG("Setting recovering region out-of-sync: %Lu/%s/%u",
+			lr->u.lr_region, lc->uuid + (strlen(lc->uuid) - 8), who);
+		lc->recovering_region = (uint64_t)-1;
+		list_del(&ru->ru_list);
+		mempool_free(ru, region_user_pool);
 	}
 
+	/* else if (ru->ru_rw == RU_WRITE)
+	 * Mirror has place the region into RH_NOSYNC
+	 * We will leave the record in place.  It is
+	 * likely there are more requests coming to
+	 * mark this region out-of-sync, due to the
+	 * way mirror handles the situation.
+	 *
+	 DMDEBUG("Setting marked region out-of-sync: %Lu/%s",
+                 lr->u.lr_region, lc->uuid + (strlen(lc->uuid) - 8));
+	 */
+
+	/* gone again: lc->sync_count--;*/
+	log_clear_bit(lc, lc->sync_bits, lr->u.lr_region);
+
 	return 0;
 }
 
-
 static int server_get_sync_count(struct log_c *lc, struct log_request *lr){
 	if (lc->sync_count > lc->region_count) {
 		DMERR("sync_count (" SECTOR_FORMAT ") > region_count (" SECTOR_FORMAT ") in %s!",
@@ -901,6 +919,7 @@
 		 * If we are the server, assign it
 		 */
 		if(lr->u.lr_coordinator == my_id){
+			lc->server_valid = 0;
 			lc->server_id = my_id;
 		}
 
@@ -988,7 +1007,8 @@
 				DMDEBUG("I'm the cluster mirror log server for %s",
 				       lc->uuid + (strlen(lc->uuid) - 8));
 				if (atomic_read(&lc->suspended)) {
-					DMDEBUG("Not reading disk log because I'm suspended.");
+					DMDEBUG("Not reading disk log because I'm suspended (%s)",
+						lc->uuid + (strlen(lc->uuid) - 8));
 					
 				} else if (disk_resume(lc) == -EDEADLK) {
 					DMDEBUG("Unable to read disk log - deadlock potential.");
@@ -1060,7 +1080,12 @@
 			error = server_get_resync_work(lc, lr, nodeid);
 			break;
 		case LRT_COMPLETE_RESYNC_WORK:
-			error = server_complete_resync_work(lc, lr, lr->u.lr_int_rtn);
+			if(!(nodeid =
+			     ipaddr_to_nodeid((struct sockaddr *)msg.msg_name))){
+				error = -ENXIO;
+				break;
+			}
+			error = server_complete_resync_work(lc, lr, lr->u.lr_int_rtn, nodeid);
 			lr->u.lr_int_rtn = 0;
 			break;
 		case LRT_GET_SYNC_COUNT:
@@ -1236,6 +1261,22 @@
 }
 */
 
+int server_free_region_users(struct log_c *lc)
+{
+	int i = 0;
+	struct region_user *ru, *tmp_ru;
+
+	list_for_each_entry_safe(ru, tmp_ru, &lc->region_users, ru_list) {
+		i++;
+		list_del(&ru->ru_list);
+		mempool_free(ru, region_user_pool);
+	}
+
+	DMDEBUG("%d region user structures freed", i);
+	return 0;
+}
+
+
 int suspend_server(void){
 	atomic_set(&_suspend, 1);
 	return 0;
--- cluster/cmirror-kernel/src/Attic/dm-cmirror-server.h	2005/08/11 18:26:19	1.1.2.2
+++ cluster/cmirror-kernel/src/Attic/dm-cmirror-server.h	2007/04/10 07:13:15	1.1.2.2.8.1
@@ -13,5 +13,6 @@
 int start_server(void);
 void stop_server(void);
 void print_server_status(struct log_c *lc);
+int server_free_region_users(struct log_c *lc);
 
 #endif /* __DM_CMIRROR_SERVER_H__ */



^ permalink raw reply	[flat|nested] 40+ messages in thread

* [Cluster-devel] cluster/cmirror-kernel/src dm-cmirror-client.c ...
@ 2007-04-10  7:12 jbrassow
  0 siblings, 0 replies; 40+ messages in thread
From: jbrassow @ 2007-04-10  7:12 UTC (permalink / raw)
  To: cluster-devel.redhat.com

CVSROOT:	/cvs/cluster
Module name:	cluster
Branch: 	RHEL4
Changes by:	jbrassow at sourceware.org	2007-04-10 08:12:24

Modified files:
	cmirror-kernel/src: dm-cmirror-client.c dm-cmirror-common.h 
	                    dm-cmirror-server.c dm-cmirror-server.h 

Log message:
	Bug 235686: Kernel BUG at dm_cmirror_server while recovering region
	
	Several fixes have gone in to fix the handling of this bug:
	1) During server relocation (which can happen due to machine failure
	or normal mirror suspension), the server value could get set before
	the client had a chance to clean-up.  This caused the server to
	become confused and issue a BUG().
	
	2) perform a flush of the log before suspending.  This ensures
	that regions which are in-sync get correctly flushed to the disk
	log.  Without this, there will always be recovery work to be done
	when a mirror starts up - even if it was properly in-sync during
	shutdown.
	
	3) clean-up memory used to record region users when a mirror is
	shutdown.  It was possible for some regions to be left over
	(causing a memory leak) during certain fault scenarios.
	
	4) properly initialize the state field (ru_rw) in the region
	user structure when a mark occurs.  Without the initialization,
	it was sometimes possible for the region to be misinterpreted
	as recovering instead of marked.
	
	5) resolve an unhandled case in server_complete_resync_work
	
	6) reset a variable in cluster_complete_resync_work.  Failure
	to do so was causing a retry to include the wrong value for
	the completion of the resync work - confusing the server.

Patches:
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cmirror-kernel/src/dm-cmirror-client.c.diff?cvsroot=cluster&only_with_tag=RHEL4&r1=1.1.2.44&r2=1.1.2.45
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cmirror-kernel/src/dm-cmirror-common.h.diff?cvsroot=cluster&only_with_tag=RHEL4&r1=1.1.2.13&r2=1.1.2.14
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cmirror-kernel/src/dm-cmirror-server.c.diff?cvsroot=cluster&only_with_tag=RHEL4&r1=1.1.2.30&r2=1.1.2.31
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cmirror-kernel/src/dm-cmirror-server.h.diff?cvsroot=cluster&only_with_tag=RHEL4&r1=1.1.2.2&r2=1.1.2.3

--- cluster/cmirror-kernel/src/Attic/dm-cmirror-client.c	2007/04/05 21:32:25	1.1.2.44
+++ cluster/cmirror-kernel/src/Attic/dm-cmirror-client.c	2007/04/10 07:12:24	1.1.2.45
@@ -373,13 +373,8 @@
 	fs = get_fs();
 	set_fs(get_ds());
 
-	if(type == LRT_MASTER_LEAVING){
-		len = my_recvmsg(lc->client_sock, &msg, sizeof(struct log_request),
-				 0, 10);
-	} else {
-		len = my_recvmsg(lc->client_sock, &msg, sizeof(struct log_request),
-				 0, 5);
-	}
+	len = my_recvmsg(lc->client_sock, &msg,
+			 sizeof(struct log_request), 0, 15);
 	set_fs(fs);
 
 	if(len <= 0){
@@ -492,9 +487,14 @@
 			goto out;
 		}
 	election:
-		while(lc->server_id == 0xDEAD){
+		while ((lc->server_id == 0xDEAD) || (!lc->server_valid)) {
+			DMDEBUG("server_id=%x, server_valid=%u, %s",
+				lc->server_id, lc->server_valid,
+				lc->uuid + (strlen(lc->uuid) - 8));
+			DMDEBUG("trigger = %s", RQ_STRING(type));
 			run_election(lc, my_id);
 			new_server = 1;
+			lc->server_valid = 1;
 		}
 
 		spin_lock(&lc->state_lock);
@@ -600,6 +600,7 @@
 	INIT_LIST_HEAD(&lc->mark_logged);
 	spin_lock_init(&lc->state_lock);
 
+	lc->server_valid = 0;
 	lc->server_id = 0xDEAD;
 
 	if ((error = cluster_connect())) {
@@ -731,19 +732,20 @@
 	}
 
 	if (!list_empty(&lc->mark_logged)) {
-		DMERR("Mark requests remain at cluster log deactivation");
-		/*
-		 * Should I BUG() this?
-		 * No.  In the worst case, they will get cleaned up later
-		 */
-	}
-	list_for_each_entry_safe(rs, tmp_rs, &lc->mark_logged, rs_list) {
-		list_del_init(&rs->rs_list);
-		mempool_free(rs, region_state_pool);
+		int i = 0;
+		
+		list_for_each_entry_safe(rs, tmp_rs, &lc->mark_logged, rs_list) {
+			i++;
+			list_del_init(&rs->rs_list);
+			mempool_free(rs, region_state_pool);
+		}
+		DMDEBUG("%d mark requests remain at cluster log deactivation", i);
 	}
 
 	spin_unlock(&lc->state_lock);
 
+	server_free_region_users(lc);
+
 	if (lc->log_dev)
 		disk_dtr(log);
 	else
@@ -753,8 +755,13 @@
 		DMERR("Unable to disconnect from cluster infrastructure.\n");
 }
 
+static int cluster_flush(struct dirty_log *log);
 static int cluster_presuspend(struct dirty_log *log)
 {
+	/* FIXME: flush is work-around for bug 235040 */
+	DMDEBUG("Performing flush to work around bug 235040");
+	cluster_flush(log);
+	DMDEBUG("Log flush complete");
 	return 0;
 }
 
@@ -784,6 +791,7 @@
 	atomic_set(&lc->suspended, 1);
 	if(lc->server_id == my_id) {
 		while (1) {
+			DMDEBUG("Telling everyone I'm suspending");
 			consult_server(lc, 0, LRT_MASTER_LEAVING, NULL);
 
 			down(&consult_server_lock);
@@ -791,11 +799,13 @@
 			up(&consult_server_lock);
 
 			if ((my_id && (lc->server_id == my_id))) {
+				DMDEBUG("Delaying suspend, work to be done.");
 				atomic_set(&lc->suspended, 0);
 				set_current_state(TASK_INTERRUPTIBLE);
 				schedule_timeout(HZ*2);
 				atomic_set(&lc->suspended, 1);
 			} else {
+				DMDEBUG("Suspending now");
 				break;
 			}
 		}
@@ -907,7 +917,7 @@
 				DMDEBUG("Delaying mark to region %Lu, due to recovery",
 					rs->rs_region);
 				set_current_state(TASK_INTERRUPTIBLE);
-				schedule_timeout(HZ/2);
+				schedule_timeout(HZ);
 				continue;
 			}
 
@@ -933,7 +943,7 @@
 		if (r == -EBUSY) {
 			DMDEBUG("Delaying flush due to recovery");
 			set_current_state(TASK_INTERRUPTIBLE);
-			schedule_timeout(HZ/2);
+			schedule_timeout(HZ);
 			continue;
 		}
 
@@ -1109,6 +1119,10 @@
 		rtn = 0;
 	}
 
+	if (rtn)
+		DMDEBUG("Client received resync work: %Lu/%s",
+			*region, lc->uuid + (strlen(lc->uuid) - 8));
+
 	/*
 	 * Check for bug 235039
 	 * Note the changes in cluser_clear_region
@@ -1129,10 +1143,16 @@
 static void cluster_complete_resync_work(struct dirty_log *log,
 					 region_t region, int success)
 {
+	int i;
 	region_t success_tmp = success;
 	struct log_c *lc = (struct log_c *) log->context;
-	while(consult_server(lc, region, LRT_COMPLETE_RESYNC_WORK, &success_tmp)){
-		DMWARN("unable to notify server of completed resync work");
+
+	for (i = 0; i < 5; i++) {
+		if (!consult_server(lc, region,
+				    LRT_COMPLETE_RESYNC_WORK, &success_tmp))
+			break;
+		success_tmp = success;
+		DMWARN("Unable to notify server of completed resync work");
 	}
 	return;
 }
--- cluster/cmirror-kernel/src/Attic/dm-cmirror-common.h	2007/04/03 18:21:10	1.1.2.13
+++ cluster/cmirror-kernel/src/Attic/dm-cmirror-common.h	2007/04/10 07:12:24	1.1.2.14
@@ -141,6 +141,7 @@
 	struct list_head mark_waiting;
 	struct list_head mark_logged;
 
+	uint32_t server_valid;
 	uint32_t server_id;
 	struct socket *client_sock;
 };
--- cluster/cmirror-kernel/src/Attic/dm-cmirror-server.c	2007/04/05 21:32:25	1.1.2.30
+++ cluster/cmirror-kernel/src/Attic/dm-cmirror-server.c	2007/04/10 07:12:24	1.1.2.31
@@ -229,7 +229,8 @@
 	int sync_search, conflict = 0;
 
 	if (lc->recovering_region != (uint64_t)-1) {
-		DMDEBUG("Someone is already recovering region %Lu", lc->recovering_region);
+		DMDEBUG("Someone is already recovering region %Lu/%s",
+			lc->recovering_region, lc->uuid + (strlen(lc->uuid) - 8));
 		return 0;
 	}
 
@@ -253,8 +254,8 @@
 						  sync_search);
 		if (find_ru_by_region(lc, *region)) {
 			conflict = 1;
-			DMDEBUG("Recovery blocked by outstanding write on region %Lu",
-			      *region);
+			DMDEBUG("Recovery blocked by outstanding write on region %Lu/%s",
+				*region, lc->uuid + (strlen(lc->uuid) - 8));
 		} else {
 			break;
 		}
@@ -534,7 +535,8 @@
 
 	new->ru_nodeid = who;
 	new->ru_region = lr->u.lr_region;
-    
+	new->ru_rw = RU_WRITE;
+
 	if (!(ru = find_ru_by_region(lc, lr->u.lr_region))) {
 		log_clear_bit(lc, lc->clean_bits, lr->u.lr_region);
 		list_add(&new->ru_list, &lc->region_users);
@@ -615,6 +617,11 @@
 					DMDEBUG("  %s", (ru2->ru_rw == RU_RECOVER) ? "recover" :
 						(ru2->ru_rw == RU_WRITE) ? "writer" : "unknown");
 
+			/* FIXME: work-around for bug 235040 */
+			DMDEBUG("Revoking resync work");
+			lc->recovering_region = (uint64_t)-1;
+			list_del(&ru->ru_list);
+			mempool_free(ru, region_user_pool);
 			return -EBUSY;
 		}
 	}
@@ -646,7 +653,8 @@
 		new->ru_region = lr->u.lr_region_rtn;
 		new->ru_rw = RU_RECOVER;
 		list_add(&new->ru_list, &lc->region_users);
-		DMDEBUG("Assigning recovery work to %u: %Lu", who, new->ru_region);
+		DMDEBUG("Assigning recovery work to %u/%s: %Lu",
+			who, lc->uuid + (strlen(lc->uuid) - 8), new->ru_region);
 	} else {
 		mempool_free(new, region_user_pool);
 	}
@@ -654,8 +662,8 @@
 	return 0;
 }
 
-
-static int server_complete_resync_work(struct log_c *lc, struct log_request *lr, int success){
+static int server_complete_resync_work(struct log_c *lc, struct log_request *lr,
+				       int success, uint32_t who){
 	struct region_user *ru;
 
 	if (lr->u.lr_region > lc->region_count) {
@@ -679,51 +687,61 @@
 		lc->sync_pass = 0;
 
 		DMDEBUG("Resync work completed: %Lu", lr->u.lr_region);
-	} else if (log_test_bit(lc->sync_bits, lr->u.lr_region)) {
-		ru = find_ru_by_region(lc, lr->u.lr_region);
+		return 0;
+	}
 
-		/*
-		 * The following condition can never happen unless we have
-		 * a corrupted list or we had a communication error.
-		 *
-		 * If a write failed to one of the mirror devices, the ru
-		 * should be RU_WRITE.  If a recovery failed, it should be
-		 * RU_RECOVER
-		 */
-		if (!ru) {
-			DMERR("Unable to find region being marked out-of-sync: %Lu",
-			      lr->u.lr_region);
-			return -EINVAL;
-		}
+	/*
+	 * Recovery failed or mirror is being marked out-of-sync
+	 *
+	 * We can recieve multiple calls to mark out-of-sync
+	 * if there were several writes to the same region that
+	 * failed.  In this case, there will not be a record for
+	 * the region.
+	 */
+	ru = find_ru(lc, who, lr->u.lr_region);
 
-		if (ru->ru_rw == RU_RECOVER) {
-			if (lr->u.lr_region != lc->recovering_region) {
-				DMERR("Recovering region mismatch: (%Lu/%Lu)",
-				      lr->u.lr_region, lc->recovering_region);
-				BUG();
-			}
-			/*
-			 * Clear the recovery
-			 */
-			lc->recovering_region = (uint64_t)-1;
-			list_del(&ru->ru_list);
-			mempool_free(ru, region_user_pool);
-		} else {  /* ru->ru_rw == RU_WRITE */
-			/*
-			 * Mirror has place the region into RH_NOSYNC
-			 * It is safe to pull the ru
-			 */
-			list_del(&ru->ru_list);
-			mempool_free(ru, region_user_pool);			
+	if ((lr->u.lr_region == lc->recovering_region) && !ru) {
+		DMERR("Unable to locate record of recovery");
+		BUG();
+	}
+
+	if (!ru) {
+		DMERR("Unable to find region to be marked out-of-sync: %Lu/%s/%u",
+		      lr->u.lr_region, lc->uuid + (strlen(lc->uuid) - 8), who);
+		BUG();
+	} else 	if (ru->ru_rw == RU_RECOVER) {
+		if (lr->u.lr_region != lc->recovering_region) {
+			DMERR("Recovering region mismatch from node %u: (%Lu/%Lu)",
+			      who, lr->u.lr_region, lc->recovering_region);
+			BUG();
 		}
-		/* gone again: lc->sync_count--;*/
-		log_clear_bit(lc, lc->sync_bits, lr->u.lr_region);
+		/*
+		 * Clear the recovery
+		 */
+		DMDEBUG("Setting recovering region out-of-sync: %Lu/%s/%u",
+			lr->u.lr_region, lc->uuid + (strlen(lc->uuid) - 8), who);
+		lc->recovering_region = (uint64_t)-1;
+		list_del(&ru->ru_list);
+		mempool_free(ru, region_user_pool);
 	}
 
+	/* else if (ru->ru_rw == RU_WRITE)
+	 * Mirror has place the region into RH_NOSYNC
+	 * We will leave the record in place.  It is
+	 * likely there are more requests coming to
+	 * mark this region out-of-sync, due to the
+	 * way mirror handles the situation.
+	 *
+	 DMDEBUG("Setting marked region out-of-sync: %Lu/%s",
+                 lr->u.lr_region, lc->uuid + (strlen(lc->uuid) - 8));
+	 */
+
+	/* gone again: lc->sync_count--;*/
+	log_clear_bit(lc, lc->sync_bits, lr->u.lr_region);
+
 	return 0;
 }
 
-
 static int server_get_sync_count(struct log_c *lc, struct log_request *lr){
 	if (lc->sync_count > lc->region_count) {
 		DMERR("sync_count (" SECTOR_FORMAT ") > region_count (" SECTOR_FORMAT ") in %s!",
@@ -901,6 +919,7 @@
 		 * If we are the server, assign it
 		 */
 		if(lr->u.lr_coordinator == my_id){
+			lc->server_valid = 0;
 			lc->server_id = my_id;
 		}
 
@@ -988,7 +1007,8 @@
 				DMDEBUG("I'm the cluster mirror log server for %s",
 				       lc->uuid + (strlen(lc->uuid) - 8));
 				if (atomic_read(&lc->suspended)) {
-					DMDEBUG("Not reading disk log because I'm suspended.");
+					DMDEBUG("Not reading disk log because I'm suspended (%s)",
+						lc->uuid + (strlen(lc->uuid) - 8));
 					
 				} else if (disk_resume(lc) == -EDEADLK) {
 					DMDEBUG("Unable to read disk log - deadlock potential.");
@@ -1060,7 +1080,12 @@
 			error = server_get_resync_work(lc, lr, nodeid);
 			break;
 		case LRT_COMPLETE_RESYNC_WORK:
-			error = server_complete_resync_work(lc, lr, lr->u.lr_int_rtn);
+			if(!(nodeid =
+			     ipaddr_to_nodeid((struct sockaddr *)msg.msg_name))){
+				error = -ENXIO;
+				break;
+			}
+			error = server_complete_resync_work(lc, lr, lr->u.lr_int_rtn, nodeid);
 			lr->u.lr_int_rtn = 0;
 			break;
 		case LRT_GET_SYNC_COUNT:
@@ -1236,6 +1261,22 @@
 }
 */
 
+int server_free_region_users(struct log_c *lc)
+{
+	int i = 0;
+	struct region_user *ru, *tmp_ru;
+
+	list_for_each_entry_safe(ru, tmp_ru, &lc->region_users, ru_list) {
+		i++;
+		list_del(&ru->ru_list);
+		mempool_free(ru, region_user_pool);
+	}
+
+	DMDEBUG("%d region user structures freed", i);
+	return 0;
+}
+
+
 int suspend_server(void){
 	atomic_set(&_suspend, 1);
 	return 0;
--- cluster/cmirror-kernel/src/Attic/dm-cmirror-server.h	2005/08/11 18:26:19	1.1.2.2
+++ cluster/cmirror-kernel/src/Attic/dm-cmirror-server.h	2007/04/10 07:12:24	1.1.2.3
@@ -13,5 +13,6 @@
 int start_server(void);
 void stop_server(void);
 void print_server_status(struct log_c *lc);
+int server_free_region_users(struct log_c *lc);
 
 #endif /* __DM_CMIRROR_SERVER_H__ */



^ permalink raw reply	[flat|nested] 40+ messages in thread

* [Cluster-devel] cluster/cmirror-kernel/src dm-cmirror-client.c ...
@ 2007-04-05 21:33 jbrassow
  0 siblings, 0 replies; 40+ messages in thread
From: jbrassow @ 2007-04-05 21:33 UTC (permalink / raw)
  To: cluster-devel.redhat.com

CVSROOT:	/cvs/cluster
Module name:	cluster
Branch: 	RHEL45
Changes by:	jbrassow at sourceware.org	2007-04-05 22:33:36

Modified files:
	cmirror-kernel/src: dm-cmirror-client.c dm-cmirror-server.c 

Log message:
	Bug 234918 Processed: NMI Watchdog detected LOCKUP while running proces...
	Bug 217438: scrolling kernel requests to mark mirror regions
	
	Item 1:
	I needed to check for marked regions when getting resync work, not
	just check for resyncing regions when a mark/flush happens.
	
	Item 2:
	There is a corner case that allows two calls to clear the same
	region.  The second does not need to be logged.

Patches:
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cmirror-kernel/src/dm-cmirror-client.c.diff?cvsroot=cluster&only_with_tag=RHEL45&r1=1.1.2.41.2.2&r2=1.1.2.41.2.3
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cmirror-kernel/src/dm-cmirror-server.c.diff?cvsroot=cluster&only_with_tag=RHEL45&r1=1.1.2.26.2.3&r2=1.1.2.26.2.4

--- cluster/cmirror-kernel/src/Attic/dm-cmirror-client.c	2007/04/03 18:23:01	1.1.2.41.2.2
+++ cluster/cmirror-kernel/src/Attic/dm-cmirror-client.c	2007/04/05 21:33:36	1.1.2.41.2.3
@@ -1034,7 +1034,9 @@
 
 	spin_lock(&lc->state_lock);
 
-	/* Should find match in this list, or no lists at all */
+	/*
+	 * The nominal case is to find the region in the marked list
+	 */
 	list_for_each_entry_safe(rs, tmp_rs, &lc->mark_logged, rs_list){
 		if(region == rs->rs_region){
 			list_del_init(&rs->rs_list);
@@ -1043,28 +1045,46 @@
 		}
 	}
 
-
-	list_for_each_entry_safe(rs, tmp_rs, &lc->mark_waiting, rs_list){
+	/*
+	 * It is possible, but unlikely to get to this case. It requires
+	 * the following to happen:
+	 * 1) mark the region for writing
+	 * 2) clear the region
+	 * 3) clear doesn't get flushed because of bug 235040
+	 * 4) suspend due to server relocation
+	 * 5) on-disk log says we need to recover (because it hasn't been cleared)
+	 * 6) we recover the region
+	 * 7) clearing the region after recovery causes us to get here
+	 *
+	 * Once 235040 is cleared, any entries found in this list should
+	 * cause a bug.
+	 */
+	list_for_each_entry_safe(rs, tmp_rs, &lc->clear_waiting, rs_list){
 		if(region == rs->rs_region){
-			DMERR("Clear pre-empting mark (%Lu/%s)",
-			       region, lc->uuid + (strlen(lc->uuid) - 8));
-			BUG();
+			DMERR("%d) Double clear on region ("
+			      SECTOR_FORMAT ")", __LINE__, region);
+			goto out;
 		}
 	}
 
-	list_for_each_entry_safe(rs, tmp_rs, &lc->clear_waiting, rs_list){
+	list_for_each_entry_safe(rs, tmp_rs, &lc->mark_waiting, rs_list){
 		if(region == rs->rs_region){
-			DMERR("%d) Double clear on region ("
-			      SECTOR_FORMAT ")", __LINE__, region);
+			DMERR("Clear pre-empting mark (%Lu/%s)",
+			       region, lc->uuid + (strlen(lc->uuid) - 8));
 			BUG();
 		}
 	}
+	
 	/* We can get here because we may be doing resync_work, and therefore,**
 	** clearing without ever marking..................................... */
 
 	/* Don't need to spin_unlock, because allocation is non-blocking */
 	rs_new = mempool_alloc(region_state_pool, GFP_ATOMIC);
-	BUG_ON(!rs_new);
+	if (!rs_new) {
+		DMERR("Failed to allocate space for clear region request: %Lu",
+		      region);
+		BUG();
+	}
 	memset(rs_new, 0, sizeof(struct region_state));
 
 	rs_new->rs_region = region;
@@ -1088,6 +1108,21 @@
 		DMWARN("Error while getting resync work: bad region");
 		rtn = 0;
 	}
+
+	/*
+	 * Check for bug 235039
+	 * Note the changes in cluser_clear_region
+	 */
+	if (rtn == 1) {
+		struct region_state *rs, *tmp_rs;
+		list_for_each_entry_safe(rs, tmp_rs, &lc->clear_waiting, rs_list) {
+			if (*region == rs->rs_region) {
+				DMERR("WARNING: Bug 235039/235040 detected!");
+				DMERR("Work-around in place.");
+			}
+		}
+	}
+
 	return rtn;
 }
 
--- cluster/cmirror-kernel/src/Attic/dm-cmirror-server.c	2007/04/04 21:36:01	1.1.2.26.2.3
+++ cluster/cmirror-kernel/src/Attic/dm-cmirror-server.c	2007/04/05 21:33:36	1.1.2.26.2.4
@@ -656,6 +656,8 @@
 
 
 static int server_complete_resync_work(struct log_c *lc, struct log_request *lr, int success){
+	struct region_user *ru;
+
 	if (lr->u.lr_region > lc->region_count) {
 		return -EINVAL;
 	}
@@ -678,6 +680,42 @@
 
 		DMDEBUG("Resync work completed: %Lu", lr->u.lr_region);
 	} else if (log_test_bit(lc->sync_bits, lr->u.lr_region)) {
+		ru = find_ru_by_region(lc, lr->u.lr_region);
+
+		/*
+		 * The following condition can never happen unless we have
+		 * a corrupted list or we had a communication error.
+		 *
+		 * If a write failed to one of the mirror devices, the ru
+		 * should be RU_WRITE.  If a recovery failed, it should be
+		 * RU_RECOVER
+		 */
+		if (!ru) {
+			DMERR("Unable to find region being marked out-of-sync: %Lu",
+			      lr->u.lr_region);
+			return -EINVAL;
+		}
+
+		if (ru->ru_rw == RU_RECOVER) {
+			if (lr->u.lr_region != lc->recovering_region) {
+				DMERR("Recovering region mismatch: (%Lu/%Lu)",
+				      lr->u.lr_region, lc->recovering_region);
+				BUG();
+			}
+			/*
+			 * Clear the recovery
+			 */
+			lc->recovering_region = (uint64_t)-1;
+			list_del(&ru->ru_list);
+			mempool_free(ru, region_user_pool);
+		} else {  /* ru->ru_rw == RU_WRITE */
+			/*
+			 * Mirror has place the region into RH_NOSYNC
+			 * It is safe to pull the ru
+			 */
+			list_del(&ru->ru_list);
+			mempool_free(ru, region_user_pool);			
+		}
 		/* gone again: lc->sync_count--;*/
 		log_clear_bit(lc, lc->sync_bits, lr->u.lr_region);
 	}



^ permalink raw reply	[flat|nested] 40+ messages in thread

* [Cluster-devel] cluster/cmirror-kernel/src dm-cmirror-client.c ...
@ 2007-04-05 21:32 jbrassow
  0 siblings, 0 replies; 40+ messages in thread
From: jbrassow @ 2007-04-05 21:32 UTC (permalink / raw)
  To: cluster-devel.redhat.com

CVSROOT:	/cvs/cluster
Module name:	cluster
Branch: 	RHEL4
Changes by:	jbrassow at sourceware.org	2007-04-05 22:32:26

Modified files:
	cmirror-kernel/src: dm-cmirror-client.c dm-cmirror-server.c 

Log message:
	Bug 234918 Processed: NMI Watchdog detected LOCKUP while running proces...
	Bug 217438: scrolling kernel requests to mark mirror regions
	
	Item 1:
	I needed to check for marked regions when getting resync work, not
	just check for resyncing regions when a mark/flush happens.
	
	Item 2:
	There is a corner case that allows two calls to clear the same
	region.  The second does not need to be logged.

Patches:
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cmirror-kernel/src/dm-cmirror-client.c.diff?cvsroot=cluster&only_with_tag=RHEL4&r1=1.1.2.43&r2=1.1.2.44
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cmirror-kernel/src/dm-cmirror-server.c.diff?cvsroot=cluster&only_with_tag=RHEL4&r1=1.1.2.29&r2=1.1.2.30

--- cluster/cmirror-kernel/src/Attic/dm-cmirror-client.c	2007/04/03 18:21:10	1.1.2.43
+++ cluster/cmirror-kernel/src/Attic/dm-cmirror-client.c	2007/04/05 21:32:25	1.1.2.44
@@ -1034,7 +1034,9 @@
 
 	spin_lock(&lc->state_lock);
 
-	/* Should find match in this list, or no lists at all */
+	/*
+	 * The nominal case is to find the region in the marked list
+	 */
 	list_for_each_entry_safe(rs, tmp_rs, &lc->mark_logged, rs_list){
 		if(region == rs->rs_region){
 			list_del_init(&rs->rs_list);
@@ -1043,28 +1045,46 @@
 		}
 	}
 
-
-	list_for_each_entry_safe(rs, tmp_rs, &lc->mark_waiting, rs_list){
+	/*
+	 * It is possible, but unlikely to get to this case. It requires
+	 * the following to happen:
+	 * 1) mark the region for writing
+	 * 2) clear the region
+	 * 3) clear doesn't get flushed because of bug 235040
+	 * 4) suspend due to server relocation
+	 * 5) on-disk log says we need to recover (because it hasn't been cleared)
+	 * 6) we recover the region
+	 * 7) clearing the region after recovery causes us to get here
+	 *
+	 * Once 235040 is cleared, any entries found in this list should
+	 * cause a bug.
+	 */
+	list_for_each_entry_safe(rs, tmp_rs, &lc->clear_waiting, rs_list){
 		if(region == rs->rs_region){
-			DMERR("Clear pre-empting mark (%Lu/%s)",
-			       region, lc->uuid + (strlen(lc->uuid) - 8));
-			BUG();
+			DMERR("%d) Double clear on region ("
+			      SECTOR_FORMAT ")", __LINE__, region);
+			goto out;
 		}
 	}
 
-	list_for_each_entry_safe(rs, tmp_rs, &lc->clear_waiting, rs_list){
+	list_for_each_entry_safe(rs, tmp_rs, &lc->mark_waiting, rs_list){
 		if(region == rs->rs_region){
-			DMERR("%d) Double clear on region ("
-			      SECTOR_FORMAT ")", __LINE__, region);
+			DMERR("Clear pre-empting mark (%Lu/%s)",
+			       region, lc->uuid + (strlen(lc->uuid) - 8));
 			BUG();
 		}
 	}
+	
 	/* We can get here because we may be doing resync_work, and therefore,**
 	** clearing without ever marking..................................... */
 
 	/* Don't need to spin_unlock, because allocation is non-blocking */
 	rs_new = mempool_alloc(region_state_pool, GFP_ATOMIC);
-	BUG_ON(!rs_new);
+	if (!rs_new) {
+		DMERR("Failed to allocate space for clear region request: %Lu",
+		      region);
+		BUG();
+	}
 	memset(rs_new, 0, sizeof(struct region_state));
 
 	rs_new->rs_region = region;
@@ -1088,6 +1108,21 @@
 		DMWARN("Error while getting resync work: bad region");
 		rtn = 0;
 	}
+
+	/*
+	 * Check for bug 235039
+	 * Note the changes in cluser_clear_region
+	 */
+	if (rtn == 1) {
+		struct region_state *rs, *tmp_rs;
+		list_for_each_entry_safe(rs, tmp_rs, &lc->clear_waiting, rs_list) {
+			if (*region == rs->rs_region) {
+				DMERR("WARNING: Bug 235039/235040 detected!");
+				DMERR("Work-around in place.");
+			}
+		}
+	}
+
 	return rtn;
 }
 
--- cluster/cmirror-kernel/src/Attic/dm-cmirror-server.c	2007/04/04 21:35:23	1.1.2.29
+++ cluster/cmirror-kernel/src/Attic/dm-cmirror-server.c	2007/04/05 21:32:25	1.1.2.30
@@ -656,6 +656,8 @@
 
 
 static int server_complete_resync_work(struct log_c *lc, struct log_request *lr, int success){
+	struct region_user *ru;
+
 	if (lr->u.lr_region > lc->region_count) {
 		return -EINVAL;
 	}
@@ -678,6 +680,42 @@
 
 		DMDEBUG("Resync work completed: %Lu", lr->u.lr_region);
 	} else if (log_test_bit(lc->sync_bits, lr->u.lr_region)) {
+		ru = find_ru_by_region(lc, lr->u.lr_region);
+
+		/*
+		 * The following condition can never happen unless we have
+		 * a corrupted list or we had a communication error.
+		 *
+		 * If a write failed to one of the mirror devices, the ru
+		 * should be RU_WRITE.  If a recovery failed, it should be
+		 * RU_RECOVER
+		 */
+		if (!ru) {
+			DMERR("Unable to find region being marked out-of-sync: %Lu",
+			      lr->u.lr_region);
+			return -EINVAL;
+		}
+
+		if (ru->ru_rw == RU_RECOVER) {
+			if (lr->u.lr_region != lc->recovering_region) {
+				DMERR("Recovering region mismatch: (%Lu/%Lu)",
+				      lr->u.lr_region, lc->recovering_region);
+				BUG();
+			}
+			/*
+			 * Clear the recovery
+			 */
+			lc->recovering_region = (uint64_t)-1;
+			list_del(&ru->ru_list);
+			mempool_free(ru, region_user_pool);
+		} else {  /* ru->ru_rw == RU_WRITE */
+			/*
+			 * Mirror has place the region into RH_NOSYNC
+			 * It is safe to pull the ru
+			 */
+			list_del(&ru->ru_list);
+			mempool_free(ru, region_user_pool);			
+		}
 		/* gone again: lc->sync_count--;*/
 		log_clear_bit(lc, lc->sync_bits, lr->u.lr_region);
 	}



^ permalink raw reply	[flat|nested] 40+ messages in thread

* [Cluster-devel] cluster/cmirror-kernel/src dm-cmirror-client.c ...
@ 2007-04-03 18:23 jbrassow
  0 siblings, 0 replies; 40+ messages in thread
From: jbrassow @ 2007-04-03 18:23 UTC (permalink / raw)
  To: cluster-devel.redhat.com

CVSROOT:	/cvs/cluster
Module name:	cluster
Branch: 	RHEL45
Changes by:	jbrassow at sourceware.org	2007-04-03 19:23:01

Modified files:
	cmirror-kernel/src: dm-cmirror-client.c dm-cmirror-common.h 
	                    dm-cmirror-server.c dm-cmirror-xfr.h 

Log message:
	Bug 234539: multiple streams of I/O can cause system to lock up
	
	This bug provoked an audit of the communications exchange, locking,
	and memory allocations/stack usage.
	
	Communication fixes include:
	1) Added sequence numbers to ensure that replies from the server
	correctly correspond to client requests.  It was found that if
	a client timed out waiting for a server to respond, it would send
	the request again.  However, the server may have simply been too
	busy to respond in a timely fashion.  It ends up responding to
	both the original request and the resent request - causing the
	client and server to become out-of-sync WRT log requests.
	
	Locking fixes include:
	1) A semaphore was being "up"ed twice in some cases, rendering
	the lock impotent.
	
	2) A spin lock controlling region status lists was being held
	across blocking operations - sometimes causing deadlocks.  The
	spin lock was changed to a per-log lock, and some logging
	operations were restructured to better suit the way locking
	needed to be done.  A side-effect of this fix is a 20%
	improvement in write operations.
	
	3) The log list protection lock needed to change from a spin lock
	to a semaphore to allow blocking operations.
	
	Memory allocation fixes include:
	1) Wrong flags to kmalloc could cause deadlock.  Use NOFS instead
	of KERNEL.
	
	2) Mempools needed more reserves for low memory conditions.
	
	3) Server now allocates a communication structure instead of having
	it on the stack.  This reduces the likelihood of stack corruption.

Patches:
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cmirror-kernel/src/dm-cmirror-client.c.diff?cvsroot=cluster&only_with_tag=RHEL45&r1=1.1.2.41.2.1&r2=1.1.2.41.2.2
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cmirror-kernel/src/dm-cmirror-common.h.diff?cvsroot=cluster&only_with_tag=RHEL45&r1=1.1.2.12&r2=1.1.2.12.2.1
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cmirror-kernel/src/dm-cmirror-server.c.diff?cvsroot=cluster&only_with_tag=RHEL45&r1=1.1.2.26.2.1&r2=1.1.2.26.2.2
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cmirror-kernel/src/dm-cmirror-xfr.h.diff?cvsroot=cluster&only_with_tag=RHEL45&r1=1.1.2.2.2.1&r2=1.1.2.2.2.2

--- cluster/cmirror-kernel/src/Attic/dm-cmirror-client.c	2007/03/22 22:34:44	1.1.2.41.2.1
+++ cluster/cmirror-kernel/src/Attic/dm-cmirror-client.c	2007/04/03 18:23:01	1.1.2.41.2.2
@@ -28,20 +28,16 @@
 #include "dm-cmirror-server.h"
 #include "dm-cmirror-cman.h"
 
-spinlock_t log_list_lock;
+DECLARE_MUTEX(log_list_lock);
 LIST_HEAD(log_list_head);
 
 struct region_state {
-	struct log_c *rs_lc;
+	int rs_mark_logged;
 	region_t rs_region;
 	struct list_head rs_list;
 };
 
 static mempool_t *region_state_pool = NULL;
-static spinlock_t region_state_lock;
-static int clear_region_count=0;
-static struct list_head clear_region_list;
-static struct list_head marked_region_list;
 
 static int shutting_down=0;
 static atomic_t suspend_client;
@@ -145,15 +141,7 @@
 	memset(lc->sync_bits, (sync == NOSYNC) ? -1 : 0, bitset_size);
 	lc->sync_count = (sync == NOSYNC) ? region_count : 0;
 
-	lc->recovering_bits = vmalloc(bitset_size);
-	if (!lc->recovering_bits) {
-		DMWARN("couldn't allocate sync bitset");
-		vfree(lc->sync_bits);
-		vfree(lc->clean_bits);
-		kfree(lc);
-		return -ENOMEM;
-	}
-	memset(lc->recovering_bits, 0, bitset_size);
+	lc->recovering_region = (uint64_t)-1;
 	lc->sync_search = 0;
 	log->context = lc;
 	return 0;
@@ -164,7 +152,6 @@
 	struct log_c *lc = (struct log_c *) log->context;
 	vfree(lc->clean_bits);
 	vfree(lc->sync_bits);
-	vfree(lc->recovering_bits);
 	kfree(lc);
 }
 
@@ -321,8 +308,9 @@
 
 	request_count++;
 
-	lr = kmalloc(sizeof(struct log_request), GFP_KERNEL);
+	lr = kmalloc(sizeof(struct log_request), GFP_NOFS);
 	if(!lr){
+		BUG();
 		error = -ENOMEM;
 		*retry = 1;
 		goto fail;
@@ -404,15 +392,15 @@
 	}
     
 	if (seq != lr->lr_seq) {
-		DMERR("Message sequence number mismatch: %d/%d",
+		DMDEBUG("Message sequence number mismatch: %d/%d",
 		      seq, lr->lr_seq);
 		if (seq > lr->lr_seq) {
-			DMERR(" Skipping.  Listening again for response to %s",
+			DMDEBUG(" Skipping.  Listening again for response to %s",
 			      RQ_STRING(type));
 			memset(lr, 0, sizeof(struct log_request));
 			goto rerecv;
 		}
-		DMERR(" Must try to resend request, %s", RQ_STRING(type));
+		DMERR(" Seq# mismatch: Must try to resend request, %s", RQ_STRING(type));
 		error = -EBADE;
 		*retry = 1;
 		seq++;
@@ -509,91 +497,43 @@
 			new_server = 1;
 		}
 
-		spin_lock(&region_state_lock);
+		spin_lock(&lc->state_lock);
 		if(new_server && 
-		   (!list_empty(&clear_region_list) ||
-		    !list_empty(&marked_region_list))){
+		   !list_empty(&lc->mark_logged)){
 			int i=0;
-			struct region_state *tmp_rs;
+			LIST_HEAD(mark);
 
 			DMINFO("Clean-up required due to new server");
-			DMINFO(" - Wiping clear region list");
-			list_for_each_entry_safe(rs, tmp_rs,
-						 &clear_region_list, rs_list){
-				/* Remove only those associated with referenced log */
-				if (rs->rs_lc != lc)
-					continue;
-				i++;
-				list_del_init(&rs->rs_list);
-				mempool_free(rs, region_state_pool);
-			}
-			clear_region_count -= i;
-			DMINFO(" - %d clear region requests wiped", i);
-			i=0;
 			DMINFO(" - Resending all mark region requests");
-			list_for_each_entry(rs, &marked_region_list, rs_list){
-				/* Resend only those associated with referenced log */
-				if (rs->rs_lc != lc)
-					continue;
+			list_splice_init(&lc->mark_logged, &mark);
+
+			spin_unlock(&lc->state_lock);
+
+			list_for_each_entry(rs, &mark, rs_list){
 				do {
 					retry = 0;
-					i++;
-					rtn = _consult_server(rs->rs_lc, rs->rs_region,
+					rtn = _consult_server(lc, rs->rs_region,
 							      LRT_MARK_REGION, NULL, &retry);
 					if (lc->server_id == 0xDEAD) {
-						spin_unlock(&region_state_lock);
 						goto election;
 					}
 				} while(retry);
+				i++;
 			}
+
+			spin_lock(&lc->state_lock);
+			list_splice_init(&mark, &lc->mark_logged);
+
 			DMINFO(" - %d mark region requests resent", i);
 			DMINFO("Clean-up complete");
-			if(type == LRT_MARK_REGION){
-				/* we just handled all marks */
-				DMWARN("Mark request ignored.\n");
-				spin_unlock(&region_state_lock);
-				goto out;
-			} else {
-				DMINFO("Continuing request type, %d (%s)", type,
-				       RQ_STRING(type));
-			}
+			DMINFO("Continuing request type, %d (%s)", type,
+			       RQ_STRING(type));
 			new_server = 0;
 		}
-
-		rs = NULL;
-
-		if(!list_empty(&clear_region_list)){
-			rs = list_entry(clear_region_list.next,
-					struct region_state, rs_list);
-			list_del_init(&rs->rs_list);
-			clear_region_count--;
-		}
-
-		spin_unlock(&region_state_lock);
-		
-		/* ATTENTION -- it may be possible to remove a clear region **
-		** request from the list.  Then, have a mark region happen  **
-		** while we are here.  If the clear region request fails, it**
-		** would be re-added - perhaps prematurely clearing the bit */
+		spin_unlock(&lc->state_lock);
 		
-		if(rs && !rs->rs_lc->log_dev_failed){
-			_consult_server(rs->rs_lc, rs->rs_region,
-					LRT_CLEAR_REGION, NULL, &retry);
-
-			if(retry){
-				spin_lock(&region_state_lock);
-				list_add(&rs->rs_list, &clear_region_list);
-				clear_region_count++;
-				spin_unlock(&region_state_lock);
-
-			} else {
-				mempool_free(rs, region_state_pool);
-			}
-		}
 		retry = 0;
-		
 		rtn = _consult_server(lc, region, type, result, &retry);
-		schedule();
 	} while(retry);
 out:
 	up(&consult_server_lock);
@@ -640,7 +580,7 @@
 	atomic_set(&lc->in_sync, -1);
 	lc->uuid_ref = 1;
 
-	spin_lock(&log_list_lock);
+	down(&log_list_lock);
 	list_for_each_entry(tmp_lc, &log_list_head, log_list){
 		if(!strncmp(tmp_lc->uuid, lc->uuid, MAX_NAME_LEN)){
 			lc->uuid_ref = (lc->uuid_ref > tmp_lc->uuid_ref) ?
@@ -649,12 +589,16 @@
 	}
 
 	list_add(&lc->log_list, &log_list_head);
-	spin_unlock(&log_list_lock);
+	up(&log_list_lock);
 	DMDEBUG("Creating %s (%d)",
 	       lc->uuid + (strlen(lc->uuid) - 8),
 	       lc->uuid_ref);
 
 	INIT_LIST_HEAD(&lc->region_users);
+	INIT_LIST_HEAD(&lc->clear_waiting);
+	INIT_LIST_HEAD(&lc->mark_waiting);
+	INIT_LIST_HEAD(&lc->mark_logged);
+	spin_lock_init(&lc->state_lock);
 
 	lc->server_id = 0xDEAD;
 
@@ -761,31 +705,44 @@
 	       lc->uuid + (strlen(lc->uuid) - 8),
 	       lc->uuid_ref);
 
-	if (!list_empty(&clear_region_list))
-		DMINFO("Leaving while clear region requests remain.");
-
-	spin_lock(&log_list_lock);
+	down(&log_list_lock);
 	list_del_init(&lc->log_list);
-	spin_unlock(&log_list_lock);
+	up(&log_list_lock);
 
 	if ((lc->server_id == my_id) && !atomic_read(&lc->suspended))
 		consult_server(lc, 0, LRT_MASTER_LEAVING, NULL);
 
 	sock_release(lc->client_sock);
 
-	spin_lock(&region_state_lock);
+	spin_lock(&lc->state_lock);
 
-	list_for_each_entry_safe(rs, tmp_rs, &clear_region_list, rs_list) {
-		if (lc == rs->rs_lc)
+	if (!list_empty(&lc->clear_waiting)) {
+		DMINFO("Clear requests remain at cluster log deactivation");
+		list_for_each_entry_safe(rs, tmp_rs, &lc->clear_waiting, rs_list) {
 			list_del_init(&rs->rs_list);
+			DMINFO(" - Ignoring clear request: %Lu", rs->rs_region);
+			mempool_free(rs, region_state_pool);
+		}
 	}
 
-	list_for_each_entry_safe(rs, tmp_rs, &marked_region_list, rs_list) {
-		if (lc == rs->rs_lc)
-			list_del_init(&rs->rs_list);
+	if (!list_empty(&lc->mark_waiting)) {
+		DMERR("Pending mark requests remain at cluster_dtr");
+		BUG();
+	}
+
+	if (!list_empty(&lc->mark_logged)) {
+		DMERR("Mark requests remain at cluster log deactivation");
+		/*
+		 * Should I BUG() this?
+		 * No.  In the worst case, they will get cleaned up later
+		 */
+	}
+	list_for_each_entry_safe(rs, tmp_rs, &lc->mark_logged, rs_list) {
+		list_del_init(&rs->rs_list);
+		mempool_free(rs, region_state_pool);
 	}
 
-	spin_unlock(&region_state_lock);
+	spin_unlock(&lc->state_lock);
 
 	if (lc->log_dev)
 		disk_dtr(log);
@@ -803,19 +760,27 @@
 
 static int cluster_postsuspend(struct dirty_log *log)
 {
+	struct region_state *rs, *tmp_rs;
 	struct log_c *lc = (struct log_c *) log->context;
 
-	while (1) {
-		spin_lock(&region_state_lock);
-		if (list_empty(&clear_region_list)) {
-			spin_unlock(&region_state_lock);
-			break;
-		}
-		spin_unlock(&region_state_lock);
+	spin_lock(&lc->state_lock);
+	if (!list_empty(&lc->mark_waiting)) {
+		DMERR("Mark requests remain at postsuspend!");
+		BUG();
+	}
 
-		/* Just an unnessesary call to clear out regions */
-		consult_server(lc, 0, LRT_IN_SYNC, NULL);
+	if (!list_empty(&lc->clear_waiting)) {
+		DMERR("Clear requests remain at postsuspend!");
+
+		list_for_each_entry_safe(rs, tmp_rs, &lc->clear_waiting, rs_list) {
+			list_del_init(&rs->rs_list);
+			DMERR(" - Ignoring clear request: %Lu", rs->rs_region);
+			mempool_free(rs, region_state_pool);
+		}
 	}
+
+	spin_unlock(&lc->state_lock);
+
 	atomic_set(&lc->suspended, 1);
 	if(lc->server_id == my_id) {
 		while (1) {
@@ -903,103 +868,162 @@
 
 static int cluster_flush(struct dirty_log *log)
 {
+	int r = 0;
+	int clear_count = 0;
+	int mark_count = 0;
 	struct log_c *lc = (struct log_c *) log->context;
+	struct region_state *rs, *tmp_rs;
+	LIST_HEAD(mark);
+	LIST_HEAD(clear);
+
+	/*
+	 * It should never be a problem to temporarily have
+	 * the mark requests in limbo.  The only functions
+	 * that call cluster_flush are rh_update_states and
+	 * do_writes, and they are in the same thread as
+	 * those changing the region states
+	 */
+	spin_lock(&lc->state_lock);
+	list_splice_init(&lc->clear_waiting, &clear);
+	list_splice_init(&lc->mark_waiting, &mark);
+	spin_unlock(&lc->state_lock);
+
+	list_for_each_entry_safe(rs, tmp_rs, &clear, rs_list) {
+		/* don't really care if LRT_CLEAR_REGION fails */
+		consult_server(lc, rs->rs_region, LRT_CLEAR_REGION, NULL);
+		list_del_init(&rs->rs_list);
+		mempool_free(rs, region_state_pool);
+		clear_count++;
+	}
+
+	list_for_each_entry_safe(rs, tmp_rs, &mark, rs_list) {
+		while (1) {
+			r = consult_server(lc, rs->rs_region,
+					   LRT_MARK_REGION, NULL);
+			if (!r)
+				break;
+
+			if (r == -EBUSY) {
+				DMDEBUG("Delaying mark to region %Lu, due to recovery",
+					rs->rs_region);
+				set_current_state(TASK_INTERRUPTIBLE);
+				schedule_timeout(HZ/2);
+				continue;
+			}
 
-	/* FIXME:  flush all clear_region requests to server */
-	return (lc->log_dev_failed) ? -EIO : 0;
+			if (r == -EIO)
+				goto fail;
+
+			DMWARN("unable to get server (%u) to mark region (%Lu)",
+			       lc->server_id, rs->rs_region);
+			DMWARN("Reason :: %d", r);
+		}
+		mark_count++;
+	}
+
+	/* No flush work? */
+	if (!clear_count && !mark_count)
+		return 0;
+
+	spin_lock(&lc->state_lock);
+	list_splice_init(&mark, &lc->mark_logged);
+	spin_unlock(&lc->state_lock);
+
+	while ((r = consult_server(lc, 0, LRT_FLUSH, NULL))) {
+		if (r == -EBUSY) {
+			DMDEBUG("Delaying flush due to recovery");
+			set_current_state(TASK_INTERRUPTIBLE);
+			schedule_timeout(HZ/2);
+			continue;
+		}
+
+		if (r == -EIO)
+			break;
+	}
+
+fail:
+	if (r) {
+		DMERR("Log flush failure: %d%s", r,
+		      (r == -EIO) ? " -EIO" : "");
+		dm_table_event(lc->ti->table);
+		lc->log_dev_failed = 1;
+	}
+
+	return r;
 }
 
 static void cluster_mark_region(struct dirty_log *log, region_t region)
 {
-	int error = 0;
 	struct region_state *rs, *tmp_rs, *rs_new;
 	struct log_c *lc = (struct log_c *) log->context;
 
-	rs_new = mempool_alloc(region_state_pool, GFP_KERNEL);
+	spin_lock(&lc->state_lock);
 
-	memset(rs_new, 0, sizeof(struct region_state));
 
-	spin_lock(&region_state_lock);
-	list_for_each_entry_safe(rs, tmp_rs, &clear_region_list, rs_list){
-		if(lc == rs->rs_lc && region == rs->rs_region){
-			/*
-			DMDEBUG("Mark pre-empting clear (%Lu/%s)",
-				region, lc->uuid + (strlen(lc->uuid) - 8));
-			*/
+	/*
+	 * It is possible for the following in the mirror code:
+	 *  0) Mark is already logged for a region
+	 *  1) rh_dec, sets region state to RH_CLEAN (asynchronous)
+	 *  2) rh_update_states (DOESN'T FLUSH!!!, bug #235040)
+	 *  3) do_writes, trys to mark region
+	 *
+	 * The following shouldn't have to be handled b/c of the flush
+	 *  0) Region finishes recovery
+	 *  1) rh_update_states clears region (DOES FLUSH)
+	 *  2) do_writes, trys to mark region
+	 *
+	 * This can lead to this next case being valid.
+	 */
+	list_for_each_entry_safe(rs, tmp_rs, &lc->clear_waiting, rs_list) {
+		if (region == rs->rs_region) {
+			if (!rs->rs_mark_logged) {
+				DMERR("Moving region(%Lu/%s) from clear_waiting -> mark_waiting",
+				      region, lc->uuid + (strlen(lc->uuid) - 8));
+			}
 			list_del_init(&rs->rs_list);
-			list_add(&rs->rs_list, &marked_region_list);
-			clear_region_count--;
-			spin_unlock(&region_state_lock);
-			if (rs_new)
-				mempool_free(rs_new, region_state_pool);
-
-			return;
+			list_add(&rs->rs_list,
+				 (rs->rs_mark_logged) ?
+				 &lc->mark_logged : &lc->mark_waiting);
+			goto out;
 		}
 	}
+
 	/*
-	 * In the mirroring code, it is possible for a write
-	 * to complete and call rh_dec - putting the region on
-	 * the clear_region list.  However, before the actual
-	 * clear request is issued to the log (rh_update_states)
-	 * another mark happens.  So, we check for and remove
-	 * duplicates.
+	 * It is possible for the following in the mirror code:
+	 *  0) Mark is already logged for a region
+	 *  1) rh_update_states
+	 *  2) rh_dec, sets region state to RH_CLEAN (asynchronous)
+	 *  3) do_writes, trys to mark region
+	 *
+	 * This can lead to this next case being valid.
 	 */
-	list_for_each_entry(rs, &marked_region_list, rs_list){
-		if(lc == rs->rs_lc && region == rs->rs_region){
-#ifdef DEBUG
-			DMINFO("Double mark on region ("
-			       SECTOR_FORMAT ")", region);
-#endif
-			spin_unlock(&region_state_lock);
-			if (rs_new)
-				mempool_free(rs_new, region_state_pool);
-
-			return;
+	list_for_each_entry_safe(rs, tmp_rs, &lc->mark_logged, rs_list){
+		if (region == rs->rs_region) {
+			goto out;
 		}
 	}
 
-	if(!rs_new){
-		DMERR("Unable to allocate region_state for mark.");
-		BUG();
+	list_for_each_entry_safe(rs, tmp_rs, &lc->mark_waiting, rs_list){
+		if (region == rs->rs_region) {
+			DMERR("Mark already waiting (%Lu/%s)",
+			      region, lc->uuid + (strlen(lc->uuid) - 8));
+			BUG();
+		}
 	}
+	spin_unlock(&lc->state_lock);
 
-	rs_new->rs_lc = lc;
+	rs_new = mempool_alloc(region_state_pool, GFP_NOFS);
+	BUG_ON(!rs_new);
+	memset(rs_new, 0, sizeof(struct region_state));
+
+	spin_lock(&lc->state_lock);
+	rs_new->rs_mark_logged = 1;
 	rs_new->rs_region = region;
 	INIT_LIST_HEAD(&rs_new->rs_list);
-	list_add(&rs_new->rs_list, &marked_region_list);
-
-	spin_unlock(&region_state_lock);
-
-	if (!lc->log_dev_failed) {
-		while((error = consult_server(lc, region, LRT_MARK_REGION, NULL))){
-			if (error == -EBUSY) {
-				/* Remote recovering delay and try again */
-				DMDEBUG("Delaying mark to region %Lu, due to recovery",
-					region);
-				set_current_state(TASK_INTERRUPTIBLE);
-				schedule_timeout(HZ/2);
-				continue;
-			}
-
-			if (error == -EIO) {
-				lc->log_dev_failed = 1;
-				break;
-			}
-			DMWARN("unable to get server (%u) to mark region (%Lu)",
-			       lc->server_id, region);
-			DMWARN("Reason :: %d", error);
-		}
+	list_add(&rs_new->rs_list, &lc->mark_waiting);
+out:
+	spin_unlock(&lc->state_lock);
 
-		if (lc->log_dev_failed) {
-			dm_table_event(lc->ti->table);
-			/*
-			  DMERR("Write failed on mirror log device, %s",
-			  lc->log_dev->name);
-			  if (!atomic_read(&lc->suspended))
-			  wait_for_completion(&lc->failure_completion);
-			*/
-		}
-	}
 	return;
 }
 
@@ -1008,53 +1032,48 @@
 	struct log_c *lc = (struct log_c *) log->context;
 	struct region_state *rs, *tmp_rs, *rs_new;
 
-	rs_new = mempool_alloc(region_state_pool, GFP_ATOMIC);
+	spin_lock(&lc->state_lock);
 
-	memset(rs_new, 0, sizeof(struct region_state));
+	/* Should find match in this list, or no lists at all */
+	list_for_each_entry_safe(rs, tmp_rs, &lc->mark_logged, rs_list){
+		if(region == rs->rs_region){
+			list_del_init(&rs->rs_list);
+			list_add(&rs->rs_list, &lc->clear_waiting);
+			goto out;
+		}
+	}
 
-	spin_lock(&region_state_lock);
 
-	list_for_each_entry_safe(rs, tmp_rs, &clear_region_list, rs_list){
-		if(lc == rs->rs_lc && region == rs->rs_region){
-			DMINFO("%d) Double clear on region ("
-			      SECTOR_FORMAT ")", __LINE__, region);
-			spin_unlock(&region_state_lock);
-			if (rs_new)
-				mempool_free(rs_new, region_state_pool);
-			return;
+	list_for_each_entry_safe(rs, tmp_rs, &lc->mark_waiting, rs_list){
+		if(region == rs->rs_region){
+			DMERR("Clear pre-empting mark (%Lu/%s)",
+			       region, lc->uuid + (strlen(lc->uuid) - 8));
+			BUG();
 		}
 	}
 
-	list_for_each_entry_safe(rs, tmp_rs, &marked_region_list, rs_list){
-		if(lc == rs->rs_lc && region == rs->rs_region){
-			list_del_init(&rs->rs_list);
-			list_add(&rs->rs_list, &clear_region_list);
-			clear_region_count++;
-			spin_unlock(&region_state_lock);
-			if (rs_new)
-				mempool_free(rs_new, region_state_pool);
-			return;
+	list_for_each_entry_safe(rs, tmp_rs, &lc->clear_waiting, rs_list){
+		if(region == rs->rs_region){
+			DMERR("%d) Double clear on region ("
+			      SECTOR_FORMAT ")", __LINE__, region);
+			BUG();
 		}
 	}
-
-	/* We can get here because we my be doing resync_work, and therefore, **
+	/* We can get here because we may be doing resync_work, and therefore,**
 	** clearing without ever marking..................................... */
 
-	if(!rs_new){
-		DMERR("Unable to allocate region_state for clear.");
-		BUG();
-	}
+	/* Don't need to spin_unlock, because allocation is non-blocking */
+	rs_new = mempool_alloc(region_state_pool, GFP_ATOMIC);
+	BUG_ON(!rs_new);
+	memset(rs_new, 0, sizeof(struct region_state));
 
-	rs_new->rs_lc = lc;
 	rs_new->rs_region = region;
 	INIT_LIST_HEAD(&rs_new->rs_list);
-	list_add(&rs_new->rs_list, &clear_region_list);
-	clear_region_count++;
-	if(!(clear_region_count & 0x7F)){
-		DMINFO("clear_region_count :: %d", clear_region_count);
-	}
+	list_add(&rs_new->rs_list, &lc->clear_waiting);
+
+out:
+	spin_unlock(&lc->state_lock);
 
-	spin_unlock(&region_state_lock);
 	return;
 }
 
@@ -1122,27 +1141,6 @@
 
 	switch(status){
 	case STATUSTYPE_INFO:
-/*
-		spin_lock(&region_state_lock);
-		i = clear_region_count;
-		list_for_each_entry(rs, &marked_region_list, rs_list){
-			j++;
-		}
-		spin_unlock(&region_state_lock);
-
-		DMINFO("CLIENT OUTPUT::");
-		DMINFO("  My ID            : %u", my_id);
-		DMINFO("  Server ID        : %u", lc->server_id);
-
-		DMINFO("  In-sync          : %s", (atomic_read(&lc->in_sync)>0)?
-		       "YES" : "NO");
-		DMINFO("  Regions marked   : %d", j);
-		DMINFO("  Regions clearing : %d", i);
-
-		if(lc->server_id == my_id){
-			print_server_status(lc);
-		}
-*/
 		if(lc->sync != DEFAULTSYNC)
 			arg_count++;
 
@@ -1195,11 +1193,11 @@
 
 	atomic_set(&suspend_client, 1);
 
-	spin_lock(&log_list_lock);
+	down(&log_list_lock);
 	list_for_each_entry(lc, &log_list_head, log_list) {
 		atomic_set(&lc->in_sync, 0);
 	}
-	spin_unlock(&log_list_lock);
+	up(&log_list_lock);
 
 	/*
 	if (likely(!shutting_down))
@@ -1221,7 +1219,12 @@
 	global_nodeids = nodeids;
 	global_count = count;
 
-	kcl_get_node_by_nodeid(0, &node);
+	for (i = 0; kcl_get_node_by_nodeid(0, &node); i++) {
+		if (i > 10)
+			BUG();
+		else
+			DMERR("Bad call to kcl_get_node_by_nodeid");
+	}
 	my_id = node.node_id;
 
 	/* Wait for any outstanding starts to complete */
@@ -1233,7 +1236,7 @@
 	switch(type){
 	case SERVICE_NODE_LEAVE:
 	case SERVICE_NODE_FAILED:
-		spin_lock(&log_list_lock);
+		down(&log_list_lock);
 		list_for_each_entry(lc, &log_list_head, log_list){
 			for(i=0, server = 0xDEAD; i < count; i++){
 				if(lc->server_id == nodeids[i]){
@@ -1243,7 +1246,7 @@
 			/* ATTENTION -- need locking around this ? */
 			lc->server_id = server;
 		}
-		spin_unlock(&log_list_lock);
+		up(&log_list_lock);
 
 		break;
 	case SERVICE_NODE_JOIN:
@@ -1279,10 +1282,8 @@
 
 	down(&cmirror_register_lock);
 
-	if (mirror_set_count++) {
-		up(&cmirror_register_lock);
+	if (mirror_set_count++)
 		goto out;
-	}
 
 	r = kcl_register_service("clustered_log", 13, SERVICE_LEVEL_GDLM, &clog_ops,
 				 1, NULL, &local_id);
@@ -1383,12 +1384,7 @@
         DMINFO("dm-cmirror %s (built %s %s) installed",
                CMIRROR_RELEASE_NAME, __DATE__, __TIME__);
 
-	INIT_LIST_HEAD(&clear_region_list);
-	INIT_LIST_HEAD(&marked_region_list);
-
-	spin_lock_init(&region_state_lock);
-	spin_lock_init(&log_list_lock);
-	region_state_pool = mempool_create(20, region_state_alloc,
+	region_state_pool = mempool_create(500, region_state_alloc,
 					   region_state_free, NULL);
 	if(!region_state_pool){
 		DMWARN("couldn't create region state pool");
@@ -1424,6 +1420,8 @@
 	}
 	dm_unregister_dirty_log_type(&_clustered_core_type);
 	dm_unregister_dirty_log_type(&_clustered_disk_type);
+        DMINFO("dm-cmirror %s (built %s %s) removed",
+               CMIRROR_RELEASE_NAME, __DATE__, __TIME__);
 }
 
 module_init(cluster_dirty_log_init);
--- cluster/cmirror-kernel/src/Attic/dm-cmirror-common.h	2007/02/21 17:14:44	1.1.2.12
+++ cluster/cmirror-kernel/src/Attic/dm-cmirror-common.h	2007/04/03 18:23:01	1.1.2.12.2.1
@@ -97,7 +97,7 @@
 	unsigned bitset_uint32_count;
 	uint32_t *clean_bits;
 	uint32_t *sync_bits;
-	uint32_t *recovering_bits;	/* FIXME: this seems excessive */
+	uint64_t recovering_region;
 
 	int sync_pass;          /* number of passes attempting to resync */
 	int sync_search;
@@ -134,7 +134,12 @@
 	atomic_t in_sync;  /* like sync_count, except all or nothing */
 
 	struct list_head log_list;
-	struct list_head region_users;
+	struct list_head region_users;  /* Used by Server */
+
+	spinlock_t state_lock;
+	struct list_head clear_waiting;
+	struct list_head mark_waiting;
+	struct list_head mark_logged;
 
 	uint32_t server_id;
 	struct socket *client_sock;
--- cluster/cmirror-kernel/src/Attic/dm-cmirror-server.c	2007/03/22 22:34:44	1.1.2.26.2.1
+++ cluster/cmirror-kernel/src/Attic/dm-cmirror-server.c	2007/04/03 18:23:01	1.1.2.26.2.2
@@ -47,7 +47,7 @@
 static atomic_t _do_requests;
 
 static int debug_disk_write = 0;
-extern spinlock_t log_list_lock;
+extern struct semaphore log_list_lock;
 extern struct list_head log_list_head;
 
 static void *region_user_alloc(int gfp_mask, void *pool_data){
@@ -225,6 +225,11 @@
 
 static int _core_get_resync_work(struct log_c *lc, region_t *region)
 {
+	if (lc->recovering_region != (uint64_t)-1) {
+		DMDEBUG("Someone is already recovering (%Lu)", lc->recovering_region);
+		return 0;
+	}
+
 	if (lc->sync_search >= lc->region_count) {
 		/*
 		 * FIXME: pvmove is not supported yet, but when it is,
@@ -237,18 +242,16 @@
 			return 0;
 		}
 	}
-	do {
-		*region = ext2_find_next_zero_bit((unsigned long *) lc->sync_bits,
-						  lc->region_count,
-						  lc->sync_search);
-		lc->sync_search = *region + 1;
-
-		if (*region >= lc->region_count)
-			return 0;
+	*region = ext2_find_next_zero_bit((unsigned long *) lc->sync_bits,
+					  lc->region_count,
+					  lc->sync_search);
+	lc->sync_search = *region + 1;
 
-	} while (log_test_bit(lc->recovering_bits, *region));
+	if (*region >= lc->region_count)
+		return 0;
 
-	log_set_bit(lc, lc->recovering_bits, *region);
+	lc->recovering_region = *region;
+	DMDEBUG("Assigning recovery work: %Lu", *region);
 	return 1;
 }
 
@@ -371,7 +374,7 @@
 			bad_count++;
 			log_clear_bit(lc, lc->sync_bits, ru->ru_region);
 			if (ru->ru_rw == RU_RECOVER) {
-				log_clear_bit(lc, lc->recovering_bits, ru->ru_region);
+				lc->recovering_region = (uint64_t)-1;
 			}
 			list_del(&ru->ru_list);
 			mempool_free(ru, region_user_pool);
@@ -506,10 +509,9 @@
 
 static int server_mark_region(struct log_c *lc, struct log_request *lr, uint32_t who)
 {
-	int r = 0;
 	struct region_user *ru, *new;
 
-	new = mempool_alloc(region_user_pool, GFP_KERNEL);
+	new = mempool_alloc(region_user_pool, GFP_NOFS);
 	if(!new){
 		return -ENOMEM;
 	}
@@ -519,21 +521,13 @@
     
 	if (!(ru = find_ru_by_region(lc, lr->u.lr_region))) {
 		log_clear_bit(lc, lc->clean_bits, lr->u.lr_region);
-		r = write_bits(lc);
-
 		list_add(&new->ru_list, &lc->region_users);
-		if (!r) {
-			lc->touched = 0;
-			lc->log_dev_failed = 0;
-		} else {
-			lc->log_dev_failed = 1;
-		}
 	} else if (ru->ru_rw == RU_RECOVER) {
-		DMINFO("Attempt to mark a region " SECTOR_FORMAT 
+		DMDEBUG("Attempt to mark a region " SECTOR_FORMAT
 		      "/%s which is being recovered.",
 		       lr->u.lr_region, lc->uuid + (strlen(lc->uuid) - 8));
-		DMINFO("Current recoverer: %u", ru->ru_nodeid);
-		DMINFO("Mark requester   : %u", who);
+		DMDEBUG("Current recoverer: %u", ru->ru_nodeid);
+		DMDEBUG("Mark requester   : %u", who);
 
 		mempool_free(new, region_user_pool);
 		return -EBUSY;
@@ -547,7 +541,7 @@
 		mempool_free(new, region_user_pool);
 	}
 
-	return (lc->log_dev_failed) ? -EIO : 0;
+	return 0;
 }
 
 
@@ -567,28 +561,34 @@
 
 	if(!find_ru_by_region(lc, lr->u.lr_region)){
 		log_set_bit(lc, lc->clean_bits, lr->u.lr_region);
-		write_bits(lc);
-		/*
-		if (write_bits(lc))
-			DMERR("Write bits failed on mirror log device, %s",
-			      lc->log_dev->name);
-		*/
 	}
 	return 0;
 }
 
 
+static int server_flush(struct log_c *lc)
+{
+	int r = 0;
+
+	r = write_bits(lc);
+	if (!r) {
+		lc->touched = 0;
+		lc->log_dev_failed = 0;
+	} else {
+		lc->log_dev_failed = 1;
+	}
+
+	return (lc->log_dev_failed) ? -EIO : 0;
+}
+
+
 static int server_get_resync_work(struct log_c *lc, struct log_request *lr, uint32_t who)
 {
 	struct region_user *new;
 
-/* We now have the ability to use remote_recovering
-	if (my_id != who)
-		return 0;
-*/
-
-	new = mempool_alloc(region_user_pool, GFP_KERNEL);
+	new = mempool_alloc(region_user_pool, GFP_NOFS);
 	if(!new){
+		lr->u.lr_int_rtn = 0;
 		return -ENOMEM;
 	}
 	
@@ -610,9 +610,15 @@
 		return -EINVAL;
 	}
 
-	log_clear_bit(lc, lc->recovering_bits, lr->u.lr_region);
-
 	if (success) {
+		if (lr->u.lr_region != lc->recovering_region) {
+			DMERR("Told to clear recovery on wrong region %Lu/%Lu",
+			      lr->u.lr_region, lc->recovering_region);
+			return -EINVAL;
+		}
+
+		lc->recovering_region = (uint64_t)-1;
+
 		/* We could receive multiple identical request due to network failure */
 		if(!log_test_bit(lc->sync_bits, lr->u.lr_region)) {
 			log_set_bit(lc, lc->sync_bits, lr->u.lr_region);
@@ -650,7 +656,7 @@
 static struct log_c *get_log_context(char *uuid, int uuid_ref){
 	struct log_c *lc, *r = NULL;
 
-	spin_lock(&log_list_lock);
+	down(&log_list_lock);
 	list_for_each_entry(lc, &log_list_head, log_list){
 		if (!strncmp(lc->uuid, uuid, MAX_NAME_LEN) &&
 		    (uuid_ref == lc->uuid_ref)) {
@@ -660,7 +666,7 @@
 				r = lc;
 		}
 	}
-	spin_unlock(&log_list_lock);
+	up(&log_list_lock);
 
 	return r;
 }
@@ -838,6 +844,7 @@
  * Returns: 0 on success, -1 on error
  */
 static int process_log_request(struct socket *sock){
+	static struct log_request *lr = NULL;
 	int error;
 	uint32_t nodeid;
 	struct msghdr msg;
@@ -845,9 +852,13 @@
 	struct sockaddr_in saddr_in;
 	mm_segment_t fs;
 	struct log_c *lc;
-	struct log_request lr; /* ATTENTION -- could be too much on the stack */
 
-	memset(&lr, 0, sizeof(struct log_request));
+	if (unlikely(!lr))
+		lr = kmalloc(sizeof(*lr), GFP_KERNEL);
+	if (!lr)
+		return -1;
+
+	memset(lr, 0, sizeof(struct log_request));
 	memset(&saddr_in, 0, sizeof(saddr_in));
 		
 	msg.msg_control = NULL;
@@ -858,7 +869,7 @@
 	msg.msg_name = &saddr_in;
 	msg.msg_namelen = sizeof(saddr_in);
 	iov.iov_len = sizeof(struct log_request);
-	iov.iov_base = &lr;
+	iov.iov_base = lr;
 		
 	fs = get_fs();
 	set_fs(get_ds());
@@ -871,14 +882,14 @@
 		if(error < sizeof(struct log_request)){
 			DMERR("Cluster mirror log server received incomplete message.");
 		}
-		lc = get_log_context(lr.lr_uuid, lr.lr_uuid_ref);
+		lc = get_log_context(lr->lr_uuid, lr->lr_uuid_ref);
 
-		if(lr.lr_type == LRT_ELECTION ||
-		   lr.lr_type == LRT_SELECTION ||
-		   lr.lr_type == LRT_MASTER_ASSIGN ||
-		   lr.lr_type == LRT_MASTER_LEAVING){
+		if(lr->lr_type == LRT_ELECTION ||
+		   lr->lr_type == LRT_SELECTION ||
+		   lr->lr_type == LRT_MASTER_ASSIGN ||
+		   lr->lr_type == LRT_MASTER_LEAVING){
 			uint32_t old = (lc)?lc->server_id: 0xDEAD;
-			if(process_election(&lr, lc, &saddr_in)){
+			if(process_election(lr, lc, &saddr_in)){
 				DMERR("Election processing failed.");
 				return -1;
 			}
@@ -896,12 +907,12 @@
 		}
 
 		if(!lc){
-			lr.u.lr_int_rtn = -ENXIO;
+			lr->u.lr_int_rtn = -ENXIO;
 			goto reply;
 		}
 
 		if (lc->server_id != my_id) {
-			lr.u.lr_int_rtn = -ENXIO;
+			lr->u.lr_int_rtn = -ENXIO;
 			goto reply;
 		}
 
@@ -911,23 +922,23 @@
 			DMDEBUG("Getting request while server (%u) is suspended:", my_id);
 			DMDEBUG(" - Requester :: %u", nodeid);
 			DMDEBUG(" - log uuid  :: %s", lc->uuid + (strlen(lc->uuid) - 8));
-			DMDEBUG(" - req type  :: %s", RQ_STRING(lr.lr_type));
+			DMDEBUG(" - req type  :: %s", RQ_STRING(lr->lr_type));
 			*/
 			if (my_id != nodeid) {
-				lr.u.lr_int_rtn = -ENXIO;
+				lr->u.lr_int_rtn = -ENXIO;
 				goto reply;
 			}
 		}			
 
-		switch(lr.lr_type){
+		switch(lr->lr_type){
 		case LRT_IS_CLEAN:
-			error = server_is_clean(lc, &lr);
+			error = server_is_clean(lc, lr);
 			break;
 		case LRT_IS_REMOTE_RECOVERING:
-			error = server_is_remote_recovering(lc, &lr);
+			error = server_is_remote_recovering(lc, lr);
 			break;
 		case LRT_IN_SYNC:
-			error = server_in_sync(lc, &lr);
+			error = server_in_sync(lc, lr);
 			break;
 		case LRT_MARK_REGION:
 			if(!(nodeid = 
@@ -935,8 +946,8 @@
 				error = -ENXIO;
 				break;
 			}
-			error = server_mark_region(lc, &lr, nodeid);
-			lr.u.lr_int_rtn = 0;
+			error = server_mark_region(lc, lr, nodeid);
+			lr->u.lr_int_rtn = 0;
 			break;
 		case LRT_CLEAR_REGION:
 			if(!(nodeid = 
@@ -944,7 +955,10 @@
 				error = -ENXIO;
 				break;
 			}
-			error = server_clear_region(lc, &lr, nodeid);
+			error = server_clear_region(lc, lr, nodeid);
+			break;
+		case LRT_FLUSH:
+			error = server_flush(lc);
 			break;
 		case LRT_GET_RESYNC_WORK:
 			if(!(nodeid = 
@@ -952,14 +966,14 @@
 				error = -ENXIO;
 				break;
 			}
-			error = server_get_resync_work(lc, &lr, nodeid);
+			error = server_get_resync_work(lc, lr, nodeid);
 			break;
 		case LRT_COMPLETE_RESYNC_WORK:
-			error = server_complete_resync_work(lc, &lr, lr.u.lr_int_rtn);
-			lr.u.lr_int_rtn = 0;
+			error = server_complete_resync_work(lc, lr, lr->u.lr_int_rtn);
+			lr->u.lr_int_rtn = 0;
 			break;
 		case LRT_GET_SYNC_COUNT:
-			error = server_get_sync_count(lc, &lr);
+			error = server_get_sync_count(lc, lr);
 			break;
 		default:
 			DMWARN("unknown request type received");
@@ -971,15 +985,15 @@
 		if(error){
 /*
 			DMWARN("Error (%d) while processing request (%s)",
-			       error, RQ_STRING(lr.lr_type));
+			       error, RQ_STRING(lr->lr_type));
 */
-			lr.u.lr_int_rtn = error;
+			lr->u.lr_int_rtn = error;
 		}
 	reply:
     
 		/* Why do we need to reset this? */
 		iov.iov_len = sizeof(struct log_request);
-		iov.iov_base = &lr;
+		iov.iov_base = lr;
 		msg.msg_name = &saddr_in;
 		msg.msg_namelen = sizeof(saddr_in);
 
@@ -991,7 +1005,7 @@
 		set_fs(fs);
 		if(error < 0){
 			DMWARN("unable to sendmsg to client (type = %s, error = %d)",
-			       RQ_STRING(lr.lr_type), error);
+			       RQ_STRING(lr->lr_type), error);
 			return error;
 		}
 	} else if(error == -EAGAIN || error == -ETIMEDOUT){
@@ -1052,7 +1066,7 @@
 			if (atomic_read(&restart_event_type) == SERVICE_NODE_FAILED)
 				DMINFO("A cluster mirror log member has failed.");
 			
-			spin_lock(&log_list_lock);
+			down(&log_list_lock);
 			list_for_each_entry(lc, &log_list_head, log_list){
 				if(lc->server_id == my_id){
 					if (atomic_read(&lc->suspended)) {
@@ -1062,7 +1076,7 @@
 					}
 				}
 			}
-			spin_unlock(&log_list_lock);
+			up(&log_list_lock);
 
 			break;
 		default:
@@ -1150,7 +1164,7 @@
 int start_server(void /* log_devices ? */){
 	int error;
 
-	region_user_pool = mempool_create(100, region_user_alloc,
+	region_user_pool = mempool_create(1000, region_user_alloc,
 					  region_user_free, NULL);
 	if(!region_user_pool){
 		DMWARN("unable to allocate region user pool for server");
--- cluster/cmirror-kernel/src/Attic/dm-cmirror-xfr.h	2007/03/22 22:34:44	1.1.2.2.2.1
+++ cluster/cmirror-kernel/src/Attic/dm-cmirror-xfr.h	2007/04/03 18:23:01	1.1.2.2.2.2
@@ -14,14 +14,15 @@
 #define LRT_IN_SYNC             	3
 #define LRT_MARK_REGION         	4
 #define LRT_CLEAR_REGION        	5
-#define LRT_GET_RESYNC_WORK     	6
-#define LRT_COMPLETE_RESYNC_WORK        7
-#define LRT_GET_SYNC_COUNT      	8
-
-#define LRT_ELECTION			9
-#define LRT_SELECTION			10
-#define LRT_MASTER_ASSIGN		11
-#define LRT_MASTER_LEAVING		12
+#define LRT_FLUSH                       6
+#define LRT_GET_RESYNC_WORK     	7
+#define LRT_COMPLETE_RESYNC_WORK        8
+#define LRT_GET_SYNC_COUNT      	9
+
+#define LRT_ELECTION			10
+#define LRT_SELECTION			11
+#define LRT_MASTER_ASSIGN		12
+#define LRT_MASTER_LEAVING		13
 
 #define CLUSTER_LOG_PORT 51005
 
@@ -29,6 +30,7 @@
 	((x) == LRT_IS_CLEAN) ? "LRT_IS_CLEAN": \
 	((x) == LRT_IN_SYNC) ? "LRT_IN_SYNC": \
 	((x) == LRT_MARK_REGION) ? "LRT_MARK_REGION": \
+	((x) == LRT_FLUSH) ? "LRT_FLUSH": \
 	((x) == LRT_GET_RESYNC_WORK) ? "LRT_GET_RESYNC_WORK": \
 	((x) == LRT_GET_SYNC_COUNT) ? "LRT_GET_SYNC_COUNT": \
 	((x) == LRT_CLEAR_REGION) ? "LRT_CLEAR_REGION": \



^ permalink raw reply	[flat|nested] 40+ messages in thread

* [Cluster-devel] cluster/cmirror-kernel/src dm-cmirror-client.c ...
@ 2007-04-03 18:21 jbrassow
  0 siblings, 0 replies; 40+ messages in thread
From: jbrassow @ 2007-04-03 18:21 UTC (permalink / raw)
  To: cluster-devel.redhat.com

CVSROOT:	/cvs/cluster
Module name:	cluster
Branch: 	RHEL4
Changes by:	jbrassow at sourceware.org	2007-04-03 19:21:10

Modified files:
	cmirror-kernel/src: dm-cmirror-client.c dm-cmirror-common.h 
	                    dm-cmirror-server.c dm-cmirror-xfr.h 

Log message:
	Bug 234539: multiple streams of I/O can cause system to lock up
	
	This bug provoked an audit of the communications exchange, locking,
	and memory allocations/stack usage.
	
	Communication fixes include:
	1) Added sequence numbers to ensure that replies from the server
	correctly correspond to client requests.  It was found that if
	a client timed out waiting for a server to respond, it would send
	the request again.  However, the server may have simply been too
	busy to respond in a timely fashion.  It ends up responding to
	both the original request and the resent request - causing the
	client and server to become out-of-sync WRT log requests.
	
	Locking fixes include:
	1) A semaphore was being "up"ed twice in some cases, rendering
	the lock impotent.
	
	2) A spin lock controlling region status lists was being held
	across blocking operations - sometimes causing deadlocks.  The
	spin lock was changed to a per-log lock, and some logging
	operations were restructured to better suit the way locking
	needed to be done.  A side-effect of this fix is a 20%
	improvement in write operations.
	
	3) The log list protection lock needed to change from a spin lock
	to a semaphore to allow blocking operations.
	
	Memory allocation fixes include:
	1) Wrong flags to kmalloc could cause deadlock.  Use NOFS instead
	of KERNEL.
	
	2) Mempools needed more reserves for low memory conditions.
	
	3) Server now allocates a communication structure instead of having
	it on the stack.  This reduces the likelihood of stack corruption.

Patches:
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cmirror-kernel/src/dm-cmirror-client.c.diff?cvsroot=cluster&only_with_tag=RHEL4&r1=1.1.2.42&r2=1.1.2.43
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cmirror-kernel/src/dm-cmirror-common.h.diff?cvsroot=cluster&only_with_tag=RHEL4&r1=1.1.2.12&r2=1.1.2.13
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cmirror-kernel/src/dm-cmirror-server.c.diff?cvsroot=cluster&only_with_tag=RHEL4&r1=1.1.2.27&r2=1.1.2.28
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cmirror-kernel/src/dm-cmirror-xfr.h.diff?cvsroot=cluster&only_with_tag=RHEL4&r1=1.1.2.3&r2=1.1.2.4

--- cluster/cmirror-kernel/src/Attic/dm-cmirror-client.c	2007/03/22 22:21:59	1.1.2.42
+++ cluster/cmirror-kernel/src/Attic/dm-cmirror-client.c	2007/04/03 18:21:10	1.1.2.43
@@ -28,20 +28,16 @@
 #include "dm-cmirror-server.h"
 #include "dm-cmirror-cman.h"
 
-spinlock_t log_list_lock;
+DECLARE_MUTEX(log_list_lock);
 LIST_HEAD(log_list_head);
 
 struct region_state {
-	struct log_c *rs_lc;
+	int rs_mark_logged;
 	region_t rs_region;
 	struct list_head rs_list;
 };
 
 static mempool_t *region_state_pool = NULL;
-static spinlock_t region_state_lock;
-static int clear_region_count=0;
-static struct list_head clear_region_list;
-static struct list_head marked_region_list;
 
 static int shutting_down=0;
 static atomic_t suspend_client;
@@ -145,15 +141,7 @@
 	memset(lc->sync_bits, (sync == NOSYNC) ? -1 : 0, bitset_size);
 	lc->sync_count = (sync == NOSYNC) ? region_count : 0;
 
-	lc->recovering_bits = vmalloc(bitset_size);
-	if (!lc->recovering_bits) {
-		DMWARN("couldn't allocate sync bitset");
-		vfree(lc->sync_bits);
-		vfree(lc->clean_bits);
-		kfree(lc);
-		return -ENOMEM;
-	}
-	memset(lc->recovering_bits, 0, bitset_size);
+	lc->recovering_region = (uint64_t)-1;
 	lc->sync_search = 0;
 	log->context = lc;
 	return 0;
@@ -164,7 +152,6 @@
 	struct log_c *lc = (struct log_c *) log->context;
 	vfree(lc->clean_bits);
 	vfree(lc->sync_bits);
-	vfree(lc->recovering_bits);
 	kfree(lc);
 }
 
@@ -321,8 +308,9 @@
 
 	request_count++;
 
-	lr = kmalloc(sizeof(struct log_request), GFP_KERNEL);
+	lr = kmalloc(sizeof(struct log_request), GFP_NOFS);
 	if(!lr){
+		BUG();
 		error = -ENOMEM;
 		*retry = 1;
 		goto fail;
@@ -404,15 +392,15 @@
 	}
     
 	if (seq != lr->lr_seq) {
-		DMERR("Message sequence number mismatch: %d/%d",
+		DMDEBUG("Message sequence number mismatch: %d/%d",
 		      seq, lr->lr_seq);
 		if (seq > lr->lr_seq) {
-			DMERR(" Skipping.  Listening again for response to %s",
+			DMDEBUG(" Skipping.  Listening again for response to %s",
 			      RQ_STRING(type));
 			memset(lr, 0, sizeof(struct log_request));
 			goto rerecv;
 		}
-		DMERR(" Must try to resend request, %s", RQ_STRING(type));
+		DMERR(" Seq# mismatch: Must try to resend request, %s", RQ_STRING(type));
 		error = -EBADE;
 		*retry = 1;
 		seq++;
@@ -509,91 +497,43 @@
 			new_server = 1;
 		}
 
-		spin_lock(&region_state_lock);
+		spin_lock(&lc->state_lock);
 		if(new_server && 
-		   (!list_empty(&clear_region_list) ||
-		    !list_empty(&marked_region_list))){
+		   !list_empty(&lc->mark_logged)){
 			int i=0;
-			struct region_state *tmp_rs;
+			LIST_HEAD(mark);
 
 			DMINFO("Clean-up required due to new server");
-			DMINFO(" - Wiping clear region list");
-			list_for_each_entry_safe(rs, tmp_rs,
-						 &clear_region_list, rs_list){
-				/* Remove only those associated with referenced log */
-				if (rs->rs_lc != lc)
-					continue;
-				i++;
-				list_del_init(&rs->rs_list);
-				mempool_free(rs, region_state_pool);
-			}
-			clear_region_count -= i;
-			DMINFO(" - %d clear region requests wiped", i);
-			i=0;
 			DMINFO(" - Resending all mark region requests");
-			list_for_each_entry(rs, &marked_region_list, rs_list){
-				/* Resend only those associated with referenced log */
-				if (rs->rs_lc != lc)
-					continue;
+			list_splice_init(&lc->mark_logged, &mark);
+
+			spin_unlock(&lc->state_lock);
+
+			list_for_each_entry(rs, &mark, rs_list){
 				do {
 					retry = 0;
-					i++;
-					rtn = _consult_server(rs->rs_lc, rs->rs_region,
+					rtn = _consult_server(lc, rs->rs_region,
 							      LRT_MARK_REGION, NULL, &retry);
 					if (lc->server_id == 0xDEAD) {
-						spin_unlock(&region_state_lock);
 						goto election;
 					}
 				} while(retry);
+				i++;
 			}
+
+			spin_lock(&lc->state_lock);
+			list_splice_init(&mark, &lc->mark_logged);
+
 			DMINFO(" - %d mark region requests resent", i);
 			DMINFO("Clean-up complete");
-			if(type == LRT_MARK_REGION){
-				/* we just handled all marks */
-				DMWARN("Mark request ignored.\n");
-				spin_unlock(&region_state_lock);
-				goto out;
-			} else {
-				DMINFO("Continuing request type, %d (%s)", type,
-				       RQ_STRING(type));
-			}
+			DMINFO("Continuing request type, %d (%s)", type,
+			       RQ_STRING(type));
 			new_server = 0;
 		}
-
-		rs = NULL;
-
-		if(!list_empty(&clear_region_list)){
-			rs = list_entry(clear_region_list.next,
-					struct region_state, rs_list);
-			list_del_init(&rs->rs_list);
-			clear_region_count--;
-		}
-
-		spin_unlock(&region_state_lock);
-		
-		/* ATTENTION -- it may be possible to remove a clear region **
-		** request from the list.  Then, have a mark region happen  **
-		** while we are here.  If the clear region request fails, it**
-		** would be re-added - perhaps prematurely clearing the bit */
+		spin_unlock(&lc->state_lock);
 		
-		if(rs && !rs->rs_lc->log_dev_failed){
-			_consult_server(rs->rs_lc, rs->rs_region,
-					LRT_CLEAR_REGION, NULL, &retry);
-
-			if(retry){
-				spin_lock(&region_state_lock);
-				list_add(&rs->rs_list, &clear_region_list);
-				clear_region_count++;
-				spin_unlock(&region_state_lock);
-
-			} else {
-				mempool_free(rs, region_state_pool);
-			}
-		}
 		retry = 0;
-		
 		rtn = _consult_server(lc, region, type, result, &retry);
-		schedule();
 	} while(retry);
 out:
 	up(&consult_server_lock);
@@ -640,7 +580,7 @@
 	atomic_set(&lc->in_sync, -1);
 	lc->uuid_ref = 1;
 
-	spin_lock(&log_list_lock);
+	down(&log_list_lock);
 	list_for_each_entry(tmp_lc, &log_list_head, log_list){
 		if(!strncmp(tmp_lc->uuid, lc->uuid, MAX_NAME_LEN)){
 			lc->uuid_ref = (lc->uuid_ref > tmp_lc->uuid_ref) ?
@@ -649,12 +589,16 @@
 	}
 
 	list_add(&lc->log_list, &log_list_head);
-	spin_unlock(&log_list_lock);
+	up(&log_list_lock);
 	DMDEBUG("Creating %s (%d)",
 	       lc->uuid + (strlen(lc->uuid) - 8),
 	       lc->uuid_ref);
 
 	INIT_LIST_HEAD(&lc->region_users);
+	INIT_LIST_HEAD(&lc->clear_waiting);
+	INIT_LIST_HEAD(&lc->mark_waiting);
+	INIT_LIST_HEAD(&lc->mark_logged);
+	spin_lock_init(&lc->state_lock);
 
 	lc->server_id = 0xDEAD;
 
@@ -761,31 +705,44 @@
 	       lc->uuid + (strlen(lc->uuid) - 8),
 	       lc->uuid_ref);
 
-	if (!list_empty(&clear_region_list))
-		DMINFO("Leaving while clear region requests remain.");
-
-	spin_lock(&log_list_lock);
+	down(&log_list_lock);
 	list_del_init(&lc->log_list);
-	spin_unlock(&log_list_lock);
+	up(&log_list_lock);
 
 	if ((lc->server_id == my_id) && !atomic_read(&lc->suspended))
 		consult_server(lc, 0, LRT_MASTER_LEAVING, NULL);
 
 	sock_release(lc->client_sock);
 
-	spin_lock(&region_state_lock);
+	spin_lock(&lc->state_lock);
 
-	list_for_each_entry_safe(rs, tmp_rs, &clear_region_list, rs_list) {
-		if (lc == rs->rs_lc)
+	if (!list_empty(&lc->clear_waiting)) {
+		DMINFO("Clear requests remain at cluster log deactivation");
+		list_for_each_entry_safe(rs, tmp_rs, &lc->clear_waiting, rs_list) {
 			list_del_init(&rs->rs_list);
+			DMINFO(" - Ignoring clear request: %Lu", rs->rs_region);
+			mempool_free(rs, region_state_pool);
+		}
 	}
 
-	list_for_each_entry_safe(rs, tmp_rs, &marked_region_list, rs_list) {
-		if (lc == rs->rs_lc)
-			list_del_init(&rs->rs_list);
+	if (!list_empty(&lc->mark_waiting)) {
+		DMERR("Pending mark requests remain at cluster_dtr");
+		BUG();
+	}
+
+	if (!list_empty(&lc->mark_logged)) {
+		DMERR("Mark requests remain at cluster log deactivation");
+		/*
+		 * Should I BUG() this?
+		 * No.  In the worst case, they will get cleaned up later
+		 */
+	}
+	list_for_each_entry_safe(rs, tmp_rs, &lc->mark_logged, rs_list) {
+		list_del_init(&rs->rs_list);
+		mempool_free(rs, region_state_pool);
 	}
 
-	spin_unlock(&region_state_lock);
+	spin_unlock(&lc->state_lock);
 
 	if (lc->log_dev)
 		disk_dtr(log);
@@ -803,19 +760,27 @@
 
 static int cluster_postsuspend(struct dirty_log *log)
 {
+	struct region_state *rs, *tmp_rs;
 	struct log_c *lc = (struct log_c *) log->context;
 
-	while (1) {
-		spin_lock(&region_state_lock);
-		if (list_empty(&clear_region_list)) {
-			spin_unlock(&region_state_lock);
-			break;
-		}
-		spin_unlock(&region_state_lock);
+	spin_lock(&lc->state_lock);
+	if (!list_empty(&lc->mark_waiting)) {
+		DMERR("Mark requests remain at postsuspend!");
+		BUG();
+	}
 
-		/* Just an unnessesary call to clear out regions */
-		consult_server(lc, 0, LRT_IN_SYNC, NULL);
+	if (!list_empty(&lc->clear_waiting)) {
+		DMERR("Clear requests remain at postsuspend!");
+
+		list_for_each_entry_safe(rs, tmp_rs, &lc->clear_waiting, rs_list) {
+			list_del_init(&rs->rs_list);
+			DMERR(" - Ignoring clear request: %Lu", rs->rs_region);
+			mempool_free(rs, region_state_pool);
+		}
 	}
+
+	spin_unlock(&lc->state_lock);
+
 	atomic_set(&lc->suspended, 1);
 	if(lc->server_id == my_id) {
 		while (1) {
@@ -903,103 +868,162 @@
 
 static int cluster_flush(struct dirty_log *log)
 {
+	int r = 0;
+	int clear_count = 0;
+	int mark_count = 0;
 	struct log_c *lc = (struct log_c *) log->context;
+	struct region_state *rs, *tmp_rs;
+	LIST_HEAD(mark);
+	LIST_HEAD(clear);
+
+	/*
+	 * It should never be a problem to temporarily have
+	 * the mark requests in limbo.  The only functions
+	 * that call cluster_flush are rh_update_states and
+	 * do_writes, and they are in the same thread as
+	 * those changing the region states
+	 */
+	spin_lock(&lc->state_lock);
+	list_splice_init(&lc->clear_waiting, &clear);
+	list_splice_init(&lc->mark_waiting, &mark);
+	spin_unlock(&lc->state_lock);
+
+	list_for_each_entry_safe(rs, tmp_rs, &clear, rs_list) {
+		/* don't really care if LRT_CLEAR_REGION fails */
+		consult_server(lc, rs->rs_region, LRT_CLEAR_REGION, NULL);
+		list_del_init(&rs->rs_list);
+		mempool_free(rs, region_state_pool);
+		clear_count++;
+	}
+
+	list_for_each_entry_safe(rs, tmp_rs, &mark, rs_list) {
+		while (1) {
+			r = consult_server(lc, rs->rs_region,
+					   LRT_MARK_REGION, NULL);
+			if (!r)
+				break;
+
+			if (r == -EBUSY) {
+				DMDEBUG("Delaying mark to region %Lu, due to recovery",
+					rs->rs_region);
+				set_current_state(TASK_INTERRUPTIBLE);
+				schedule_timeout(HZ/2);
+				continue;
+			}
 
-	/* FIXME:  flush all clear_region requests to server */
-	return (lc->log_dev_failed) ? -EIO : 0;
+			if (r == -EIO)
+				goto fail;
+
+			DMWARN("unable to get server (%u) to mark region (%Lu)",
+			       lc->server_id, rs->rs_region);
+			DMWARN("Reason :: %d", r);
+		}
+		mark_count++;
+	}
+
+	/* No flush work? */
+	if (!clear_count && !mark_count)
+		return 0;
+
+	spin_lock(&lc->state_lock);
+	list_splice_init(&mark, &lc->mark_logged);
+	spin_unlock(&lc->state_lock);
+
+	while ((r = consult_server(lc, 0, LRT_FLUSH, NULL))) {
+		if (r == -EBUSY) {
+			DMDEBUG("Delaying flush due to recovery");
+			set_current_state(TASK_INTERRUPTIBLE);
+			schedule_timeout(HZ/2);
+			continue;
+		}
+
+		if (r == -EIO)
+			break;
+	}
+
+fail:
+	if (r) {
+		DMERR("Log flush failure: %d%s", r,
+		      (r == -EIO) ? " -EIO" : "");
+		dm_table_event(lc->ti->table);
+		lc->log_dev_failed = 1;
+	}
+
+	return r;
 }
 
 static void cluster_mark_region(struct dirty_log *log, region_t region)
 {
-	int error = 0;
 	struct region_state *rs, *tmp_rs, *rs_new;
 	struct log_c *lc = (struct log_c *) log->context;
 
-	rs_new = mempool_alloc(region_state_pool, GFP_KERNEL);
+	spin_lock(&lc->state_lock);
 
-	memset(rs_new, 0, sizeof(struct region_state));
 
-	spin_lock(&region_state_lock);
-	list_for_each_entry_safe(rs, tmp_rs, &clear_region_list, rs_list){
-		if(lc == rs->rs_lc && region == rs->rs_region){
-			/*
-			DMDEBUG("Mark pre-empting clear (%Lu/%s)",
-				region, lc->uuid + (strlen(lc->uuid) - 8));
-			*/
+	/*
+	 * It is possible for the following in the mirror code:
+	 *  0) Mark is already logged for a region
+	 *  1) rh_dec, sets region state to RH_CLEAN (asynchronous)
+	 *  2) rh_update_states (DOESN'T FLUSH!!!, bug #235040)
+	 *  3) do_writes, trys to mark region
+	 *
+	 * The following shouldn't have to be handled b/c of the flush
+	 *  0) Region finishes recovery
+	 *  1) rh_update_states clears region (DOES FLUSH)
+	 *  2) do_writes, trys to mark region
+	 *
+	 * This can lead to this next case being valid.
+	 */
+	list_for_each_entry_safe(rs, tmp_rs, &lc->clear_waiting, rs_list) {
+		if (region == rs->rs_region) {
+			if (!rs->rs_mark_logged) {
+				DMERR("Moving region(%Lu/%s) from clear_waiting -> mark_waiting",
+				      region, lc->uuid + (strlen(lc->uuid) - 8));
+			}
 			list_del_init(&rs->rs_list);
-			list_add(&rs->rs_list, &marked_region_list);
-			clear_region_count--;
-			spin_unlock(&region_state_lock);
-			if (rs_new)
-				mempool_free(rs_new, region_state_pool);
-
-			return;
+			list_add(&rs->rs_list,
+				 (rs->rs_mark_logged) ?
+				 &lc->mark_logged : &lc->mark_waiting);
+			goto out;
 		}
 	}
+
 	/*
-	 * In the mirroring code, it is possible for a write
-	 * to complete and call rh_dec - putting the region on
-	 * the clear_region list.  However, before the actual
-	 * clear request is issued to the log (rh_update_states)
-	 * another mark happens.  So, we check for and remove
-	 * duplicates.
+	 * It is possible for the following in the mirror code:
+	 *  0) Mark is already logged for a region
+	 *  1) rh_update_states
+	 *  2) rh_dec, sets region state to RH_CLEAN (asynchronous)
+	 *  3) do_writes, trys to mark region
+	 *
+	 * This can lead to this next case being valid.
 	 */
-	list_for_each_entry(rs, &marked_region_list, rs_list){
-		if(lc == rs->rs_lc && region == rs->rs_region){
-#ifdef DEBUG
-			DMINFO("Double mark on region ("
-			       SECTOR_FORMAT ")", region);
-#endif
-			spin_unlock(&region_state_lock);
-			if (rs_new)
-				mempool_free(rs_new, region_state_pool);
-
-			return;
+	list_for_each_entry_safe(rs, tmp_rs, &lc->mark_logged, rs_list){
+		if (region == rs->rs_region) {
+			goto out;
 		}
 	}
 
-	if(!rs_new){
-		DMERR("Unable to allocate region_state for mark.");
-		BUG();
+	list_for_each_entry_safe(rs, tmp_rs, &lc->mark_waiting, rs_list){
+		if (region == rs->rs_region) {
+			DMERR("Mark already waiting (%Lu/%s)",
+			      region, lc->uuid + (strlen(lc->uuid) - 8));
+			BUG();
+		}
 	}
+	spin_unlock(&lc->state_lock);
 
-	rs_new->rs_lc = lc;
+	rs_new = mempool_alloc(region_state_pool, GFP_NOFS);
+	BUG_ON(!rs_new);
+	memset(rs_new, 0, sizeof(struct region_state));
+
+	spin_lock(&lc->state_lock);
+	rs_new->rs_mark_logged = 1;
 	rs_new->rs_region = region;
 	INIT_LIST_HEAD(&rs_new->rs_list);
-	list_add(&rs_new->rs_list, &marked_region_list);
-
-	spin_unlock(&region_state_lock);
-
-	if (!lc->log_dev_failed) {
-		while((error = consult_server(lc, region, LRT_MARK_REGION, NULL))){
-			if (error == -EBUSY) {
-				/* Remote recovering delay and try again */
-				DMDEBUG("Delaying mark to region %Lu, due to recovery",
-					region);
-				set_current_state(TASK_INTERRUPTIBLE);
-				schedule_timeout(HZ/2);
-				continue;
-			}
-
-			if (error == -EIO) {
-				lc->log_dev_failed = 1;
-				break;
-			}
-			DMWARN("unable to get server (%u) to mark region (%Lu)",
-			       lc->server_id, region);
-			DMWARN("Reason :: %d", error);
-		}
+	list_add(&rs_new->rs_list, &lc->mark_waiting);
+out:
+	spin_unlock(&lc->state_lock);
 
-		if (lc->log_dev_failed) {
-			dm_table_event(lc->ti->table);
-			/*
-			  DMERR("Write failed on mirror log device, %s",
-			  lc->log_dev->name);
-			  if (!atomic_read(&lc->suspended))
-			  wait_for_completion(&lc->failure_completion);
-			*/
-		}
-	}
 	return;
 }
 
@@ -1008,53 +1032,48 @@
 	struct log_c *lc = (struct log_c *) log->context;
 	struct region_state *rs, *tmp_rs, *rs_new;
 
-	rs_new = mempool_alloc(region_state_pool, GFP_ATOMIC);
+	spin_lock(&lc->state_lock);
 
-	memset(rs_new, 0, sizeof(struct region_state));
+	/* Should find match in this list, or no lists at all */
+	list_for_each_entry_safe(rs, tmp_rs, &lc->mark_logged, rs_list){
+		if(region == rs->rs_region){
+			list_del_init(&rs->rs_list);
+			list_add(&rs->rs_list, &lc->clear_waiting);
+			goto out;
+		}
+	}
 
-	spin_lock(&region_state_lock);
 
-	list_for_each_entry_safe(rs, tmp_rs, &clear_region_list, rs_list){
-		if(lc == rs->rs_lc && region == rs->rs_region){
-			DMINFO("%d) Double clear on region ("
-			      SECTOR_FORMAT ")", __LINE__, region);
-			spin_unlock(&region_state_lock);
-			if (rs_new)
-				mempool_free(rs_new, region_state_pool);
-			return;
+	list_for_each_entry_safe(rs, tmp_rs, &lc->mark_waiting, rs_list){
+		if(region == rs->rs_region){
+			DMERR("Clear pre-empting mark (%Lu/%s)",
+			       region, lc->uuid + (strlen(lc->uuid) - 8));
+			BUG();
 		}
 	}
 
-	list_for_each_entry_safe(rs, tmp_rs, &marked_region_list, rs_list){
-		if(lc == rs->rs_lc && region == rs->rs_region){
-			list_del_init(&rs->rs_list);
-			list_add(&rs->rs_list, &clear_region_list);
-			clear_region_count++;
-			spin_unlock(&region_state_lock);
-			if (rs_new)
-				mempool_free(rs_new, region_state_pool);
-			return;
+	list_for_each_entry_safe(rs, tmp_rs, &lc->clear_waiting, rs_list){
+		if(region == rs->rs_region){
+			DMERR("%d) Double clear on region ("
+			      SECTOR_FORMAT ")", __LINE__, region);
+			BUG();
 		}
 	}
-
-	/* We can get here because we my be doing resync_work, and therefore, **
+	/* We can get here because we may be doing resync_work, and therefore,**
 	** clearing without ever marking..................................... */
 
-	if(!rs_new){
-		DMERR("Unable to allocate region_state for clear.");
-		BUG();
-	}
+	/* Don't need to spin_unlock, because allocation is non-blocking */
+	rs_new = mempool_alloc(region_state_pool, GFP_ATOMIC);
+	BUG_ON(!rs_new);
+	memset(rs_new, 0, sizeof(struct region_state));
 
-	rs_new->rs_lc = lc;
 	rs_new->rs_region = region;
 	INIT_LIST_HEAD(&rs_new->rs_list);
-	list_add(&rs_new->rs_list, &clear_region_list);
-	clear_region_count++;
-	if(!(clear_region_count & 0x7F)){
-		DMINFO("clear_region_count :: %d", clear_region_count);
-	}
+	list_add(&rs_new->rs_list, &lc->clear_waiting);
+
+out:
+	spin_unlock(&lc->state_lock);
 
-	spin_unlock(&region_state_lock);
 	return;
 }
 
@@ -1122,27 +1141,6 @@
 
 	switch(status){
 	case STATUSTYPE_INFO:
-/*
-		spin_lock(&region_state_lock);
-		i = clear_region_count;
-		list_for_each_entry(rs, &marked_region_list, rs_list){
-			j++;
-		}
-		spin_unlock(&region_state_lock);
-
-		DMINFO("CLIENT OUTPUT::");
-		DMINFO("  My ID            : %u", my_id);
-		DMINFO("  Server ID        : %u", lc->server_id);
-
-		DMINFO("  In-sync          : %s", (atomic_read(&lc->in_sync)>0)?
-		       "YES" : "NO");
-		DMINFO("  Regions marked   : %d", j);
-		DMINFO("  Regions clearing : %d", i);
-
-		if(lc->server_id == my_id){
-			print_server_status(lc);
-		}
-*/
 		if(lc->sync != DEFAULTSYNC)
 			arg_count++;
 
@@ -1195,11 +1193,11 @@
 
 	atomic_set(&suspend_client, 1);
 
-	spin_lock(&log_list_lock);
+	down(&log_list_lock);
 	list_for_each_entry(lc, &log_list_head, log_list) {
 		atomic_set(&lc->in_sync, 0);
 	}
-	spin_unlock(&log_list_lock);
+	up(&log_list_lock);
 
 	/*
 	if (likely(!shutting_down))
@@ -1221,7 +1219,12 @@
 	global_nodeids = nodeids;
 	global_count = count;
 
-	kcl_get_node_by_nodeid(0, &node);
+	for (i = 0; kcl_get_node_by_nodeid(0, &node); i++) {
+		if (i > 10)
+			BUG();
+		else
+			DMERR("Bad call to kcl_get_node_by_nodeid");
+	}
 	my_id = node.node_id;
 
 	/* Wait for any outstanding starts to complete */
@@ -1233,7 +1236,7 @@
 	switch(type){
 	case SERVICE_NODE_LEAVE:
 	case SERVICE_NODE_FAILED:
-		spin_lock(&log_list_lock);
+		down(&log_list_lock);
 		list_for_each_entry(lc, &log_list_head, log_list){
 			for(i=0, server = 0xDEAD; i < count; i++){
 				if(lc->server_id == nodeids[i]){
@@ -1243,7 +1246,7 @@
 			/* ATTENTION -- need locking around this ? */
 			lc->server_id = server;
 		}
-		spin_unlock(&log_list_lock);
+		up(&log_list_lock);
 
 		break;
 	case SERVICE_NODE_JOIN:
@@ -1279,10 +1282,8 @@
 
 	down(&cmirror_register_lock);
 
-	if (mirror_set_count++) {
-		up(&cmirror_register_lock);
+	if (mirror_set_count++)
 		goto out;
-	}
 
 	r = kcl_register_service("clustered_log", 13, SERVICE_LEVEL_GDLM, &clog_ops,
 				 1, NULL, &local_id);
@@ -1383,12 +1384,7 @@
         DMINFO("dm-cmirror %s (built %s %s) installed",
                CMIRROR_RELEASE_NAME, __DATE__, __TIME__);
 
-	INIT_LIST_HEAD(&clear_region_list);
-	INIT_LIST_HEAD(&marked_region_list);
-
-	spin_lock_init(&region_state_lock);
-	spin_lock_init(&log_list_lock);
-	region_state_pool = mempool_create(20, region_state_alloc,
+	region_state_pool = mempool_create(500, region_state_alloc,
 					   region_state_free, NULL);
 	if(!region_state_pool){
 		DMWARN("couldn't create region state pool");
@@ -1424,6 +1420,8 @@
 	}
 	dm_unregister_dirty_log_type(&_clustered_core_type);
 	dm_unregister_dirty_log_type(&_clustered_disk_type);
+        DMINFO("dm-cmirror %s (built %s %s) removed",
+               CMIRROR_RELEASE_NAME, __DATE__, __TIME__);
 }
 
 module_init(cluster_dirty_log_init);
--- cluster/cmirror-kernel/src/Attic/dm-cmirror-common.h	2007/02/21 17:14:44	1.1.2.12
+++ cluster/cmirror-kernel/src/Attic/dm-cmirror-common.h	2007/04/03 18:21:10	1.1.2.13
@@ -97,7 +97,7 @@
 	unsigned bitset_uint32_count;
 	uint32_t *clean_bits;
 	uint32_t *sync_bits;
-	uint32_t *recovering_bits;	/* FIXME: this seems excessive */
+	uint64_t recovering_region;
 
 	int sync_pass;          /* number of passes attempting to resync */
 	int sync_search;
@@ -134,7 +134,12 @@
 	atomic_t in_sync;  /* like sync_count, except all or nothing */
 
 	struct list_head log_list;
-	struct list_head region_users;
+	struct list_head region_users;  /* Used by Server */
+
+	spinlock_t state_lock;
+	struct list_head clear_waiting;
+	struct list_head mark_waiting;
+	struct list_head mark_logged;
 
 	uint32_t server_id;
 	struct socket *client_sock;
--- cluster/cmirror-kernel/src/Attic/dm-cmirror-server.c	2007/03/22 22:21:59	1.1.2.27
+++ cluster/cmirror-kernel/src/Attic/dm-cmirror-server.c	2007/04/03 18:21:10	1.1.2.28
@@ -47,7 +47,7 @@
 static atomic_t _do_requests;
 
 static int debug_disk_write = 0;
-extern spinlock_t log_list_lock;
+extern struct semaphore log_list_lock;
 extern struct list_head log_list_head;
 
 static void *region_user_alloc(int gfp_mask, void *pool_data){
@@ -225,6 +225,11 @@
 
 static int _core_get_resync_work(struct log_c *lc, region_t *region)
 {
+	if (lc->recovering_region != (uint64_t)-1) {
+		DMDEBUG("Someone is already recovering (%Lu)", lc->recovering_region);
+		return 0;
+	}
+
 	if (lc->sync_search >= lc->region_count) {
 		/*
 		 * FIXME: pvmove is not supported yet, but when it is,
@@ -237,18 +242,16 @@
 			return 0;
 		}
 	}
-	do {
-		*region = ext2_find_next_zero_bit((unsigned long *) lc->sync_bits,
-						  lc->region_count,
-						  lc->sync_search);
-		lc->sync_search = *region + 1;
-
-		if (*region >= lc->region_count)
-			return 0;
+	*region = ext2_find_next_zero_bit((unsigned long *) lc->sync_bits,
+					  lc->region_count,
+					  lc->sync_search);
+	lc->sync_search = *region + 1;
 
-	} while (log_test_bit(lc->recovering_bits, *region));
+	if (*region >= lc->region_count)
+		return 0;
 
-	log_set_bit(lc, lc->recovering_bits, *region);
+	lc->recovering_region = *region;
+	DMDEBUG("Assigning recovery work: %Lu", *region);
 	return 1;
 }
 
@@ -371,7 +374,7 @@
 			bad_count++;
 			log_clear_bit(lc, lc->sync_bits, ru->ru_region);
 			if (ru->ru_rw == RU_RECOVER) {
-				log_clear_bit(lc, lc->recovering_bits, ru->ru_region);
+				lc->recovering_region = (uint64_t)-1;
 			}
 			list_del(&ru->ru_list);
 			mempool_free(ru, region_user_pool);
@@ -506,10 +509,9 @@
 
 static int server_mark_region(struct log_c *lc, struct log_request *lr, uint32_t who)
 {
-	int r = 0;
 	struct region_user *ru, *new;
 
-	new = mempool_alloc(region_user_pool, GFP_KERNEL);
+	new = mempool_alloc(region_user_pool, GFP_NOFS);
 	if(!new){
 		return -ENOMEM;
 	}
@@ -519,21 +521,13 @@
     
 	if (!(ru = find_ru_by_region(lc, lr->u.lr_region))) {
 		log_clear_bit(lc, lc->clean_bits, lr->u.lr_region);
-		r = write_bits(lc);
-
 		list_add(&new->ru_list, &lc->region_users);
-		if (!r) {
-			lc->touched = 0;
-			lc->log_dev_failed = 0;
-		} else {
-			lc->log_dev_failed = 1;
-		}
 	} else if (ru->ru_rw == RU_RECOVER) {
-		DMINFO("Attempt to mark a region " SECTOR_FORMAT 
+		DMDEBUG("Attempt to mark a region " SECTOR_FORMAT
 		      "/%s which is being recovered.",
 		       lr->u.lr_region, lc->uuid + (strlen(lc->uuid) - 8));
-		DMINFO("Current recoverer: %u", ru->ru_nodeid);
-		DMINFO("Mark requester   : %u", who);
+		DMDEBUG("Current recoverer: %u", ru->ru_nodeid);
+		DMDEBUG("Mark requester   : %u", who);
 
 		mempool_free(new, region_user_pool);
 		return -EBUSY;
@@ -547,7 +541,7 @@
 		mempool_free(new, region_user_pool);
 	}
 
-	return (lc->log_dev_failed) ? -EIO : 0;
+	return 0;
 }
 
 
@@ -567,28 +561,34 @@
 
 	if(!find_ru_by_region(lc, lr->u.lr_region)){
 		log_set_bit(lc, lc->clean_bits, lr->u.lr_region);
-		write_bits(lc);
-		/*
-		if (write_bits(lc))
-			DMERR("Write bits failed on mirror log device, %s",
-			      lc->log_dev->name);
-		*/
 	}
 	return 0;
 }
 
 
+static int server_flush(struct log_c *lc)
+{
+	int r = 0;
+
+	r = write_bits(lc);
+	if (!r) {
+		lc->touched = 0;
+		lc->log_dev_failed = 0;
+	} else {
+		lc->log_dev_failed = 1;
+	}
+
+	return (lc->log_dev_failed) ? -EIO : 0;
+}
+
+
 static int server_get_resync_work(struct log_c *lc, struct log_request *lr, uint32_t who)
 {
 	struct region_user *new;
 
-/* We now have the ability to use remote_recovering
-	if (my_id != who)
-		return 0;
-*/
-
-	new = mempool_alloc(region_user_pool, GFP_KERNEL);
+	new = mempool_alloc(region_user_pool, GFP_NOFS);
 	if(!new){
+		lr->u.lr_int_rtn = 0;
 		return -ENOMEM;
 	}
 	
@@ -610,9 +610,15 @@
 		return -EINVAL;
 	}
 
-	log_clear_bit(lc, lc->recovering_bits, lr->u.lr_region);
-
 	if (success) {
+		if (lr->u.lr_region != lc->recovering_region) {
+			DMERR("Told to clear recovery on wrong region %Lu/%Lu",
+			      lr->u.lr_region, lc->recovering_region);
+			return -EINVAL;
+		}
+
+		lc->recovering_region = (uint64_t)-1;
+
 		/* We could receive multiple identical request due to network failure */
 		if(!log_test_bit(lc->sync_bits, lr->u.lr_region)) {
 			log_set_bit(lc, lc->sync_bits, lr->u.lr_region);
@@ -650,7 +656,7 @@
 static struct log_c *get_log_context(char *uuid, int uuid_ref){
 	struct log_c *lc, *r = NULL;
 
-	spin_lock(&log_list_lock);
+	down(&log_list_lock);
 	list_for_each_entry(lc, &log_list_head, log_list){
 		if (!strncmp(lc->uuid, uuid, MAX_NAME_LEN) &&
 		    (uuid_ref == lc->uuid_ref)) {
@@ -660,7 +666,7 @@
 				r = lc;
 		}
 	}
-	spin_unlock(&log_list_lock);
+	up(&log_list_lock);
 
 	return r;
 }
@@ -838,6 +844,7 @@
  * Returns: 0 on success, -1 on error
  */
 static int process_log_request(struct socket *sock){
+	static struct log_request *lr = NULL;
 	int error;
 	uint32_t nodeid;
 	struct msghdr msg;
@@ -845,9 +852,13 @@
 	struct sockaddr_in saddr_in;
 	mm_segment_t fs;
 	struct log_c *lc;
-	struct log_request lr; /* ATTENTION -- could be too much on the stack */
 
-	memset(&lr, 0, sizeof(struct log_request));
+	if (unlikely(!lr))
+		lr = kmalloc(sizeof(*lr), GFP_KERNEL);
+	if (!lr)
+		return -1;
+
+	memset(lr, 0, sizeof(struct log_request));
 	memset(&saddr_in, 0, sizeof(saddr_in));
 		
 	msg.msg_control = NULL;
@@ -858,7 +869,7 @@
 	msg.msg_name = &saddr_in;
 	msg.msg_namelen = sizeof(saddr_in);
 	iov.iov_len = sizeof(struct log_request);
-	iov.iov_base = &lr;
+	iov.iov_base = lr;
 		
 	fs = get_fs();
 	set_fs(get_ds());
@@ -871,14 +882,14 @@
 		if(error < sizeof(struct log_request)){
 			DMERR("Cluster mirror log server received incomplete message.");
 		}
-		lc = get_log_context(lr.lr_uuid, lr.lr_uuid_ref);
+		lc = get_log_context(lr->lr_uuid, lr->lr_uuid_ref);
 
-		if(lr.lr_type == LRT_ELECTION ||
-		   lr.lr_type == LRT_SELECTION ||
-		   lr.lr_type == LRT_MASTER_ASSIGN ||
-		   lr.lr_type == LRT_MASTER_LEAVING){
+		if(lr->lr_type == LRT_ELECTION ||
+		   lr->lr_type == LRT_SELECTION ||
+		   lr->lr_type == LRT_MASTER_ASSIGN ||
+		   lr->lr_type == LRT_MASTER_LEAVING){
 			uint32_t old = (lc)?lc->server_id: 0xDEAD;
-			if(process_election(&lr, lc, &saddr_in)){
+			if(process_election(lr, lc, &saddr_in)){
 				DMERR("Election processing failed.");
 				return -1;
 			}
@@ -896,12 +907,12 @@
 		}
 
 		if(!lc){
-			lr.u.lr_int_rtn = -ENXIO;
+			lr->u.lr_int_rtn = -ENXIO;
 			goto reply;
 		}
 
 		if (lc->server_id != my_id) {
-			lr.u.lr_int_rtn = -ENXIO;
+			lr->u.lr_int_rtn = -ENXIO;
 			goto reply;
 		}
 
@@ -911,23 +922,23 @@
 			DMDEBUG("Getting request while server (%u) is suspended:", my_id);
 			DMDEBUG(" - Requester :: %u", nodeid);
 			DMDEBUG(" - log uuid  :: %s", lc->uuid + (strlen(lc->uuid) - 8));
-			DMDEBUG(" - req type  :: %s", RQ_STRING(lr.lr_type));
+			DMDEBUG(" - req type  :: %s", RQ_STRING(lr->lr_type));
 			*/
 			if (my_id != nodeid) {
-				lr.u.lr_int_rtn = -ENXIO;
+				lr->u.lr_int_rtn = -ENXIO;
 				goto reply;
 			}
 		}			
 
-		switch(lr.lr_type){
+		switch(lr->lr_type){
 		case LRT_IS_CLEAN:
-			error = server_is_clean(lc, &lr);
+			error = server_is_clean(lc, lr);
 			break;
 		case LRT_IS_REMOTE_RECOVERING:
-			error = server_is_remote_recovering(lc, &lr);
+			error = server_is_remote_recovering(lc, lr);
 			break;
 		case LRT_IN_SYNC:
-			error = server_in_sync(lc, &lr);
+			error = server_in_sync(lc, lr);
 			break;
 		case LRT_MARK_REGION:
 			if(!(nodeid = 
@@ -935,8 +946,8 @@
 				error = -ENXIO;
 				break;
 			}
-			error = server_mark_region(lc, &lr, nodeid);
-			lr.u.lr_int_rtn = 0;
+			error = server_mark_region(lc, lr, nodeid);
+			lr->u.lr_int_rtn = 0;
 			break;
 		case LRT_CLEAR_REGION:
 			if(!(nodeid = 
@@ -944,7 +955,10 @@
 				error = -ENXIO;
 				break;
 			}
-			error = server_clear_region(lc, &lr, nodeid);
+			error = server_clear_region(lc, lr, nodeid);
+			break;
+		case LRT_FLUSH:
+			error = server_flush(lc);
 			break;
 		case LRT_GET_RESYNC_WORK:
 			if(!(nodeid = 
@@ -952,14 +966,14 @@
 				error = -ENXIO;
 				break;
 			}
-			error = server_get_resync_work(lc, &lr, nodeid);
+			error = server_get_resync_work(lc, lr, nodeid);
 			break;
 		case LRT_COMPLETE_RESYNC_WORK:
-			error = server_complete_resync_work(lc, &lr, lr.u.lr_int_rtn);
-			lr.u.lr_int_rtn = 0;
+			error = server_complete_resync_work(lc, lr, lr->u.lr_int_rtn);
+			lr->u.lr_int_rtn = 0;
 			break;
 		case LRT_GET_SYNC_COUNT:
-			error = server_get_sync_count(lc, &lr);
+			error = server_get_sync_count(lc, lr);
 			break;
 		default:
 			DMWARN("unknown request type received");
@@ -971,15 +985,15 @@
 		if(error){
 /*
 			DMWARN("Error (%d) while processing request (%s)",
-			       error, RQ_STRING(lr.lr_type));
+			       error, RQ_STRING(lr->lr_type));
 */
-			lr.u.lr_int_rtn = error;
+			lr->u.lr_int_rtn = error;
 		}
 	reply:
     
 		/* Why do we need to reset this? */
 		iov.iov_len = sizeof(struct log_request);
-		iov.iov_base = &lr;
+		iov.iov_base = lr;
 		msg.msg_name = &saddr_in;
 		msg.msg_namelen = sizeof(saddr_in);
 
@@ -991,7 +1005,7 @@
 		set_fs(fs);
 		if(error < 0){
 			DMWARN("unable to sendmsg to client (type = %s, error = %d)",
-			       RQ_STRING(lr.lr_type), error);
+			       RQ_STRING(lr->lr_type), error);
 			return error;
 		}
 	} else if(error == -EAGAIN || error == -ETIMEDOUT){
@@ -1052,7 +1066,7 @@
 			if (atomic_read(&restart_event_type) == SERVICE_NODE_FAILED)
 				DMINFO("A cluster mirror log member has failed.");
 			
-			spin_lock(&log_list_lock);
+			down(&log_list_lock);
 			list_for_each_entry(lc, &log_list_head, log_list){
 				if(lc->server_id == my_id){
 					if (atomic_read(&lc->suspended)) {
@@ -1062,7 +1076,7 @@
 					}
 				}
 			}
-			spin_unlock(&log_list_lock);
+			up(&log_list_lock);
 
 			break;
 		default:
@@ -1150,7 +1164,7 @@
 int start_server(void /* log_devices ? */){
 	int error;
 
-	region_user_pool = mempool_create(100, region_user_alloc,
+	region_user_pool = mempool_create(1000, region_user_alloc,
 					  region_user_free, NULL);
 	if(!region_user_pool){
 		DMWARN("unable to allocate region user pool for server");
--- cluster/cmirror-kernel/src/Attic/dm-cmirror-xfr.h	2007/03/22 22:21:59	1.1.2.3
+++ cluster/cmirror-kernel/src/Attic/dm-cmirror-xfr.h	2007/04/03 18:21:10	1.1.2.4
@@ -14,14 +14,15 @@
 #define LRT_IN_SYNC             	3
 #define LRT_MARK_REGION         	4
 #define LRT_CLEAR_REGION        	5
-#define LRT_GET_RESYNC_WORK     	6
-#define LRT_COMPLETE_RESYNC_WORK        7
-#define LRT_GET_SYNC_COUNT      	8
-
-#define LRT_ELECTION			9
-#define LRT_SELECTION			10
-#define LRT_MASTER_ASSIGN		11
-#define LRT_MASTER_LEAVING		12
+#define LRT_FLUSH                       6
+#define LRT_GET_RESYNC_WORK     	7
+#define LRT_COMPLETE_RESYNC_WORK        8
+#define LRT_GET_SYNC_COUNT      	9
+
+#define LRT_ELECTION			10
+#define LRT_SELECTION			11
+#define LRT_MASTER_ASSIGN		12
+#define LRT_MASTER_LEAVING		13
 
 #define CLUSTER_LOG_PORT 51005
 
@@ -29,6 +30,7 @@
 	((x) == LRT_IS_CLEAN) ? "LRT_IS_CLEAN": \
 	((x) == LRT_IN_SYNC) ? "LRT_IN_SYNC": \
 	((x) == LRT_MARK_REGION) ? "LRT_MARK_REGION": \
+	((x) == LRT_FLUSH) ? "LRT_FLUSH": \
 	((x) == LRT_GET_RESYNC_WORK) ? "LRT_GET_RESYNC_WORK": \
 	((x) == LRT_GET_SYNC_COUNT) ? "LRT_GET_SYNC_COUNT": \
 	((x) == LRT_CLEAR_REGION) ? "LRT_CLEAR_REGION": \



^ permalink raw reply	[flat|nested] 40+ messages in thread

* [Cluster-devel] cluster/cmirror-kernel/src dm-cmirror-client.c ...
@ 2007-03-22 22:34 jbrassow
  0 siblings, 0 replies; 40+ messages in thread
From: jbrassow @ 2007-03-22 22:34 UTC (permalink / raw)
  To: cluster-devel.redhat.com

CVSROOT:	/cvs/cluster
Module name:	cluster
Branch: 	RHEL45
Changes by:	jbrassow at sourceware.org	2007-03-22 22:34:44

Modified files:
	cmirror-kernel/src: dm-cmirror-client.c dm-cmirror-server.c 
	                    dm-cmirror-xfr.h 

Log message:
	Bug 233034: cmirror server failure/migration during GFS I/O causes metad...
	
	Add sequence number to messages to ensure
	that cmirror clients get the response they expect.

Patches:
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cmirror-kernel/src/dm-cmirror-client.c.diff?cvsroot=cluster&only_with_tag=RHEL45&r1=1.1.2.41&r2=1.1.2.41.2.1
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cmirror-kernel/src/dm-cmirror-server.c.diff?cvsroot=cluster&only_with_tag=RHEL45&r1=1.1.2.26&r2=1.1.2.26.2.1
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cmirror-kernel/src/dm-cmirror-xfr.h.diff?cvsroot=cluster&only_with_tag=RHEL45&r1=1.1.2.2&r2=1.1.2.2.2.1

--- cluster/cmirror-kernel/src/Attic/dm-cmirror-client.c	2007/03/14 04:28:32	1.1.2.41
+++ cluster/cmirror-kernel/src/Attic/dm-cmirror-client.c	2007/03/22 22:34:44	1.1.2.41.2.1
@@ -53,12 +53,6 @@
 /* These vars are just for stats, and will be removed */
 static uint32_t request_count=0;
 static uint32_t request_retry_count=0;
-static int clear_req=0;
-static int mark_req=0;
-static int insync_req=0;
-static int clear_req2ser=0;
-static int mark_req2ser=0;
-static int insync_req2ser=0;
 
 static void *region_state_alloc(int gfp_mask, void *pool_data){
 	return kmalloc(sizeof(struct region_state), gfp_mask);
@@ -316,6 +310,7 @@
 
 static int _consult_server(struct log_c *lc, region_t region,
 			  int type, region_t *result, int *retry){
+	static int seq = 0;
 	int len;
 	int error=0;
 	struct sockaddr_in saddr_in;
@@ -336,6 +331,7 @@
 	memset(lr, 0, sizeof(struct log_request));
 	
 	lr->lr_type = type;
+	lr->lr_seq = seq;
 	if(type == LRT_MASTER_LEAVING){
 		lr->u.lr_starter = my_id;
 	} else {
@@ -369,18 +365,6 @@
 	iov.iov_len = sizeof(struct log_request);
 	iov.iov_base = lr;
 
-	if(lr->lr_type == LRT_MARK_REGION){
-		mark_req2ser++;
-	}
-
-	if(lr->lr_type == LRT_CLEAR_REGION){
-		clear_req2ser++;
-	}
-	
-	if(lr->lr_type == LRT_IN_SYNC){
-		insync_req2ser++;
-	}
-	
 	fs = get_fs();
 	set_fs(get_ds());
   
@@ -394,6 +378,7 @@
 		goto fail;
 	}
 
+rerecv:
 	iov.iov_len = sizeof(struct log_request);
 	iov.iov_base = lr;
 
@@ -414,9 +399,44 @@
 		DMWARN("Error while listening for server response: %d", len);
 		error = len;
 		*retry = 1;
+		seq++;
 		goto fail;
 	}
     
+	if (seq != lr->lr_seq) {
+		DMERR("Message sequence number mismatch: %d/%d",
+		      seq, lr->lr_seq);
+		if (seq > lr->lr_seq) {
+			DMERR(" Skipping.  Listening again for response to %s",
+			      RQ_STRING(type));
+			memset(lr, 0, sizeof(struct log_request));
+			goto rerecv;
+		}
+		DMERR(" Must try to resend request, %s", RQ_STRING(type));
+		error = -EBADE;
+		*retry = 1;
+		seq++;
+		goto fail;
+	}
+	seq++;
+
+	if (type != lr->lr_type) {
+		DMERR("Got incorrect message type back: %s/%s",
+		      RQ_STRING(type), RQ_STRING(lr->lr_type));
+		error = -EBADE;
+		*retry = 1;
+		goto fail;
+	}
+
+	if (memcmp(lc->uuid, lr->lr_uuid, MAX_NAME_LEN)) {
+		DMERR("Got reply from server for wrong log:");
+		DMERR(" Expected UUID: %s", lc->uuid);
+		DMERR(" Recieved UUID: %s", lr->lr_uuid);
+		error = -EBADE;
+		*retry = 1;
+		goto fail;
+	}
+
 	if(lr->u.lr_int_rtn == -EAGAIN){
 		DMWARN("Server (%u), request type %d, -EAGAIN."
 		       "  Mirror suspended?",
@@ -453,17 +473,7 @@
 			DMDEBUG(" - log uuid:: %s (%s)",
 			       lc->uuid + (strlen(lc->uuid) - 8),
 			       atomic_read(&lc->suspended) ? "suspended" : "active");
-			DMDEBUG(" - request :: %s",
-			       (type == LRT_IS_CLEAN)? "LRT_IS_CLEAN":
-			       (type == LRT_IN_SYNC)? "LRT_IN_SYNC":
-			       (type == LRT_MARK_REGION)? "LRT_MARK_REGION":
-			       (type == LRT_GET_RESYNC_WORK)? "LRT_GET_RESYNC_WORK":
-			       (type == LRT_GET_SYNC_COUNT)? "LRT_GET_SYNC_COUNT":
-			       (type == LRT_CLEAR_REGION)? "LRT_CLEAR_REGION":
-			       (type == LRT_COMPLETE_RESYNC_WORK)? "LRT_COMPLETE_RESYNC_WORK":
-			       (type == LRT_MASTER_LEAVING)? "LRT_MASTER_LEAVING":
-			       (type == LRT_ELECTION)? "LRT_ELECTION":
-			       (type == LRT_SELECTION)? "LRT_SELECTION": "UNKNOWN");
+			DMDEBUG(" - request :: %s", RQ_STRING(type));
 			DMDEBUG(" - error   :: %d", error);
 			DMINFO("Too many retries, attempting to re-establish server connection.");
 			lc->server_id = 0xDEAD;
@@ -519,7 +529,7 @@
 			}
 			clear_region_count -= i;
 			DMINFO(" - %d clear region requests wiped", i);
-
+			i=0;
 			DMINFO(" - Resending all mark region requests");
 			list_for_each_entry(rs, &marked_region_list, rs_list){
 				/* Resend only those associated with referenced log */
@@ -527,7 +537,7 @@
 					continue;
 				do {
 					retry = 0;
-					DMINFO("   - " SECTOR_FORMAT, rs->rs_region);
+					i++;
 					rtn = _consult_server(rs->rs_lc, rs->rs_region,
 							      LRT_MARK_REGION, NULL, &retry);
 					if (lc->server_id == 0xDEAD) {
@@ -536,6 +546,7 @@
 					}
 				} while(retry);
 			}
+			DMINFO(" - %d mark region requests resent", i);
 			DMINFO("Clean-up complete");
 			if(type == LRT_MARK_REGION){
 				/* we just handled all marks */
@@ -544,17 +555,7 @@
 				goto out;
 			} else {
 				DMINFO("Continuing request type, %d (%s)", type,
-				      (type == LRT_IS_CLEAN)? "LRT_IS_CLEAN":
-				      (type == LRT_IN_SYNC)? "LRT_IN_SYNC":
-				      (type == LRT_MARK_REGION)? "LRT_MARK_REGION":
-				      (type == LRT_GET_RESYNC_WORK)? "LRT_GET_RESYNC_WORK":
-				      (type == LRT_GET_SYNC_COUNT)? "LRT_GET_SYNC_COUNT":
-				      (type == LRT_CLEAR_REGION)? "LRT_CLEAR_REGION":
-				      (type == LRT_COMPLETE_RESYNC_WORK)? "LRT_COMPLETE_RESYNC_WORK":
-				      (type == LRT_MASTER_LEAVING)? "LRT_MASTER_LEAVING":
-				      (type == LRT_ELECTION)? "LRT_ELECTION":
-				      (type == LRT_SELECTION)? "LRT_SELECTION": "UNKNOWN"
-					);
+				       RQ_STRING(type));
 			}
 			new_server = 0;
 		}
@@ -886,7 +887,6 @@
 	struct log_c *lc = (struct log_c *) log->context;
   
 	/* check known_regions, return if found */
-	insync_req++;
 /* take out optimization
 	if(atomic_read(&lc->in_sync) == 1){
 		return 1;
@@ -915,8 +915,6 @@
 	struct region_state *rs, *tmp_rs, *rs_new;
 	struct log_c *lc = (struct log_c *) log->context;
 
-	mark_req++;
-
 	rs_new = mempool_alloc(region_state_pool, GFP_KERNEL);
 
 	memset(rs_new, 0, sizeof(struct region_state));
@@ -924,8 +922,10 @@
 	spin_lock(&region_state_lock);
 	list_for_each_entry_safe(rs, tmp_rs, &clear_region_list, rs_list){
 		if(lc == rs->rs_lc && region == rs->rs_region){
+			/*
 			DMDEBUG("Mark pre-empting clear (%Lu/%s)",
 				region, lc->uuid + (strlen(lc->uuid) - 8));
+			*/
 			list_del_init(&rs->rs_list);
 			list_add(&rs->rs_list, &marked_region_list);
 			clear_region_count--;
@@ -1007,7 +1007,6 @@
 {
 	struct log_c *lc = (struct log_c *) log->context;
 	struct region_state *rs, *tmp_rs, *rs_new;
-	clear_req++;
 
 	rs_new = mempool_alloc(region_state_pool, GFP_ATOMIC);
 
@@ -1140,21 +1139,6 @@
 		DMINFO("  Regions marked   : %d", j);
 		DMINFO("  Regions clearing : %d", i);
 
-		DMINFO("  Mark requests    : %d", mark_req);
-		if(mark_req)
-			DMINFO("  Mark req to serv : %d (%d%%)", mark_req2ser,
-			       (mark_req2ser*100)/mark_req);
-
-		DMINFO("  Clear requests   : %d", clear_req);
-		if(clear_req)
-			DMINFO("  Clear req to serv: %d (%d%%)", clear_req2ser,
-			       (clear_req2ser*100)/clear_req);
-
-		DMINFO("  Sync  requests   : %d", insync_req);
-		if(insync_req)
-			DMINFO("  Sync req to serv : %d (%d%%)", insync_req2ser,
-			       (insync_req2ser*100)/insync_req);
-
 		if(lc->server_id == my_id){
 			print_server_status(lc);
 		}
@@ -1216,9 +1200,11 @@
 		atomic_set(&lc->in_sync, 0);
 	}
 	spin_unlock(&log_list_lock);
-	
+
+	/*
 	if (likely(!shutting_down))
 		suspend_server();
+	*/
 
 	return 0;
 }
@@ -1267,7 +1253,9 @@
 		BUG();
 		break;
 	}
+	/*
 	resume_server();
+	*/
 	return 0;
 }
 
--- cluster/cmirror-kernel/src/Attic/dm-cmirror-server.c	2007/03/14 04:28:32	1.1.2.26
+++ cluster/cmirror-kernel/src/Attic/dm-cmirror-server.c	2007/03/22 22:34:44	1.1.2.26.2.1
@@ -911,17 +911,7 @@
 			DMDEBUG("Getting request while server (%u) is suspended:", my_id);
 			DMDEBUG(" - Requester :: %u", nodeid);
 			DMDEBUG(" - log uuid  :: %s", lc->uuid + (strlen(lc->uuid) - 8));
-			DMDEBUG(" - req type  :: %s",
-				(lr.lr_type == LRT_IS_CLEAN)? "LRT_IS_C	LEAN":
-				(lr.lr_type == LRT_IN_SYNC)? "LRT_IN_SYNC":
-				(lr.lr_type == LRT_MARK_REGION)? "LRT_MARK_REGION":
-				(lr.lr_type == LRT_GET_RESYNC_WORK)? "LRT_GET_RESYNC_WORK":
-				(lr.lr_type == LRT_GET_SYNC_COUNT)? "LRT_GET_SYNC_COUNT":
-				(lr.lr_type == LRT_CLEAR_REGION)? "LRT_CLEAR_REGION":
-				(lr.lr_type == LRT_COMPLETE_RESYNC_WORK)? "LRT_COMPLETE_RESYNC_WORK":
-				(lr.lr_type == LRT_MASTER_LEAVING)? "LRT_MASTER_LEAVING":
-				(lr.lr_type == LRT_ELECTION)? "LRT_ELECTION":
-				(lr.lr_type == LRT_SELECTION)? "LRT_SELECTION": "UNKNOWN");
+			DMDEBUG(" - req type  :: %s", RQ_STRING(lr.lr_type));
 			*/
 			if (my_id != nodeid) {
 				lr.u.lr_int_rtn = -ENXIO;
@@ -981,17 +971,7 @@
 		if(error){
 /*
 			DMWARN("Error (%d) while processing request (%s)",
-			       error,
-			       (lr.lr_type == LRT_IS_CLEAN)? "LRT_IS_CLEAN":
-			       (lr.lr_type == LRT_IN_SYNC)? "LRT_IN_SYNC":
-			       (lr.lr_type == LRT_MARK_REGION)? "LRT_MARK_REGION":
-			       (lr.lr_type == LRT_GET_RESYNC_WORK)? "LRT_GET_RESYNC_WORK":
-			       (lr.lr_type == LRT_GET_SYNC_COUNT)? "LRT_GET_SYNC_COUNT":
-			       (lr.lr_type == LRT_CLEAR_REGION)? "LRT_CLEAR_REGION":
-			       (lr.lr_type == LRT_COMPLETE_RESYNC_WORK)? "LRT_COMPLETE_RESYNC_WORK":
-			       (lr.lr_type == LRT_MASTER_LEAVING)? "LRT_MASTER_LEAVING":
-			       (lr.lr_type == LRT_ELECTION)? "LRT_ELECTION":
-			       (lr.lr_type == LRT_SELECTION)? "LRT_SELECTION": "UNKNOWN");
+			       error, RQ_STRING(lr.lr_type));
 */
 			lr.u.lr_int_rtn = error;
 		}
@@ -1011,17 +991,7 @@
 		set_fs(fs);
 		if(error < 0){
 			DMWARN("unable to sendmsg to client (type = %s, error = %d)",
-			       (lr.lr_type == LRT_IS_CLEAN)? "LRT_IS_CLEAN":
-			       (lr.lr_type == LRT_IN_SYNC)? "LRT_IN_SYNC":
-			       (lr.lr_type == LRT_MARK_REGION)? "LRT_MARK_REGION":
-			       (lr.lr_type == LRT_GET_RESYNC_WORK)? "LRT_GET_RESYNC_WORK":
-			       (lr.lr_type == LRT_GET_SYNC_COUNT)? "LRT_GET_SYNC_COUNT":
-			       (lr.lr_type == LRT_CLEAR_REGION)? "LRT_CLEAR_REGION":
-			       (lr.lr_type == LRT_COMPLETE_RESYNC_WORK)? "LRT_COMPLETE_RESYNC_WORK":
-			       (lr.lr_type == LRT_MASTER_LEAVING)? "LRT_MASTER_LEAVING":
-			       (lr.lr_type == LRT_ELECTION)? "LRT_ELECTION":
-			       (lr.lr_type == LRT_SELECTION)? "LRT_SELECTION": "UNKNOWN",
-			       error);
+			       RQ_STRING(lr.lr_type), error);
 			return error;
 		}
 	} else if(error == -EAGAIN || error == -ETIMEDOUT){
--- cluster/cmirror-kernel/src/Attic/dm-cmirror-xfr.h	2007/02/14 17:44:07	1.1.2.2
+++ cluster/cmirror-kernel/src/Attic/dm-cmirror-xfr.h	2007/03/22 22:34:44	1.1.2.2.2.1
@@ -25,8 +25,21 @@
 
 #define CLUSTER_LOG_PORT 51005
 
+#define RQ_STRING(x) \
+	((x) == LRT_IS_CLEAN) ? "LRT_IS_CLEAN": \
+	((x) == LRT_IN_SYNC) ? "LRT_IN_SYNC": \
+	((x) == LRT_MARK_REGION) ? "LRT_MARK_REGION": \
+	((x) == LRT_GET_RESYNC_WORK) ? "LRT_GET_RESYNC_WORK": \
+	((x) == LRT_GET_SYNC_COUNT) ? "LRT_GET_SYNC_COUNT": \
+	((x) == LRT_CLEAR_REGION) ? "LRT_CLEAR_REGION": \
+	((x) == LRT_COMPLETE_RESYNC_WORK) ? "LRT_COMPLETE_RESYNC_WORK": \
+	((x) == LRT_MASTER_LEAVING) ? "LRT_MASTER_LEAVING": \
+	((x) == LRT_ELECTION) ? "LRT_ELECTION": \
+	((x) == LRT_SELECTION) ? "LRT_SELECTION": "UNKNOWN"
+
 struct log_request {
 	int lr_type;
+	int lr_seq;
 	union {
 		struct {
 			uint32_t lr_starter;



^ permalink raw reply	[flat|nested] 40+ messages in thread

* [Cluster-devel] cluster/cmirror-kernel/src dm-cmirror-client.c ...
@ 2007-03-22 22:22 jbrassow
  0 siblings, 0 replies; 40+ messages in thread
From: jbrassow @ 2007-03-22 22:22 UTC (permalink / raw)
  To: cluster-devel.redhat.com

CVSROOT:	/cvs/cluster
Module name:	cluster
Branch: 	RHEL4
Changes by:	jbrassow at sourceware.org	2007-03-22 22:21:59

Modified files:
	cmirror-kernel/src: dm-cmirror-client.c dm-cmirror-server.c 
	                    dm-cmirror-xfr.h 

Log message:
	Bug 233034: cmirror server failure/migration during GFS I/O causes metad...
	(Likely fixes other bugs as well.)
	
	When a cmirror client timed out waiting for a response from the server,
	it would send the request again.  Sometimes, the server simply took to
	long to get back to the client... it would then develop an off-by-one
	error - responding to the first _and_ the second request.  The client
	could then be asking to mark a region, and recieve a response for
	a previous request.  This has the potential to cause many problems.
	
	Sequence numbers have been added to fix the problem.

Patches:
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cmirror-kernel/src/dm-cmirror-client.c.diff?cvsroot=cluster&only_with_tag=RHEL4&r1=1.1.2.41&r2=1.1.2.42
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cmirror-kernel/src/dm-cmirror-server.c.diff?cvsroot=cluster&only_with_tag=RHEL4&r1=1.1.2.26&r2=1.1.2.27
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cmirror-kernel/src/dm-cmirror-xfr.h.diff?cvsroot=cluster&only_with_tag=RHEL4&r1=1.1.2.2&r2=1.1.2.3

--- cluster/cmirror-kernel/src/Attic/dm-cmirror-client.c	2007/03/14 04:28:32	1.1.2.41
+++ cluster/cmirror-kernel/src/Attic/dm-cmirror-client.c	2007/03/22 22:21:59	1.1.2.42
@@ -53,12 +53,6 @@
 /* These vars are just for stats, and will be removed */
 static uint32_t request_count=0;
 static uint32_t request_retry_count=0;
-static int clear_req=0;
-static int mark_req=0;
-static int insync_req=0;
-static int clear_req2ser=0;
-static int mark_req2ser=0;
-static int insync_req2ser=0;
 
 static void *region_state_alloc(int gfp_mask, void *pool_data){
 	return kmalloc(sizeof(struct region_state), gfp_mask);
@@ -316,6 +310,7 @@
 
 static int _consult_server(struct log_c *lc, region_t region,
 			  int type, region_t *result, int *retry){
+	static int seq = 0;
 	int len;
 	int error=0;
 	struct sockaddr_in saddr_in;
@@ -336,6 +331,7 @@
 	memset(lr, 0, sizeof(struct log_request));
 	
 	lr->lr_type = type;
+	lr->lr_seq = seq;
 	if(type == LRT_MASTER_LEAVING){
 		lr->u.lr_starter = my_id;
 	} else {
@@ -369,18 +365,6 @@
 	iov.iov_len = sizeof(struct log_request);
 	iov.iov_base = lr;
 
-	if(lr->lr_type == LRT_MARK_REGION){
-		mark_req2ser++;
-	}
-
-	if(lr->lr_type == LRT_CLEAR_REGION){
-		clear_req2ser++;
-	}
-	
-	if(lr->lr_type == LRT_IN_SYNC){
-		insync_req2ser++;
-	}
-	
 	fs = get_fs();
 	set_fs(get_ds());
   
@@ -394,6 +378,7 @@
 		goto fail;
 	}
 
+rerecv:
 	iov.iov_len = sizeof(struct log_request);
 	iov.iov_base = lr;
 
@@ -414,9 +399,44 @@
 		DMWARN("Error while listening for server response: %d", len);
 		error = len;
 		*retry = 1;
+		seq++;
 		goto fail;
 	}
     
+	if (seq != lr->lr_seq) {
+		DMERR("Message sequence number mismatch: %d/%d",
+		      seq, lr->lr_seq);
+		if (seq > lr->lr_seq) {
+			DMERR(" Skipping.  Listening again for response to %s",
+			      RQ_STRING(type));
+			memset(lr, 0, sizeof(struct log_request));
+			goto rerecv;
+		}
+		DMERR(" Must try to resend request, %s", RQ_STRING(type));
+		error = -EBADE;
+		*retry = 1;
+		seq++;
+		goto fail;
+	}
+	seq++;
+
+	if (type != lr->lr_type) {
+		DMERR("Got incorrect message type back: %s/%s",
+		      RQ_STRING(type), RQ_STRING(lr->lr_type));
+		error = -EBADE;
+		*retry = 1;
+		goto fail;
+	}
+
+	if (memcmp(lc->uuid, lr->lr_uuid, MAX_NAME_LEN)) {
+		DMERR("Got reply from server for wrong log:");
+		DMERR(" Expected UUID: %s", lc->uuid);
+		DMERR(" Recieved UUID: %s", lr->lr_uuid);
+		error = -EBADE;
+		*retry = 1;
+		goto fail;
+	}
+
 	if(lr->u.lr_int_rtn == -EAGAIN){
 		DMWARN("Server (%u), request type %d, -EAGAIN."
 		       "  Mirror suspended?",
@@ -453,17 +473,7 @@
 			DMDEBUG(" - log uuid:: %s (%s)",
 			       lc->uuid + (strlen(lc->uuid) - 8),
 			       atomic_read(&lc->suspended) ? "suspended" : "active");
-			DMDEBUG(" - request :: %s",
-			       (type == LRT_IS_CLEAN)? "LRT_IS_CLEAN":
-			       (type == LRT_IN_SYNC)? "LRT_IN_SYNC":
-			       (type == LRT_MARK_REGION)? "LRT_MARK_REGION":
-			       (type == LRT_GET_RESYNC_WORK)? "LRT_GET_RESYNC_WORK":
-			       (type == LRT_GET_SYNC_COUNT)? "LRT_GET_SYNC_COUNT":
-			       (type == LRT_CLEAR_REGION)? "LRT_CLEAR_REGION":
-			       (type == LRT_COMPLETE_RESYNC_WORK)? "LRT_COMPLETE_RESYNC_WORK":
-			       (type == LRT_MASTER_LEAVING)? "LRT_MASTER_LEAVING":
-			       (type == LRT_ELECTION)? "LRT_ELECTION":
-			       (type == LRT_SELECTION)? "LRT_SELECTION": "UNKNOWN");
+			DMDEBUG(" - request :: %s", RQ_STRING(type));
 			DMDEBUG(" - error   :: %d", error);
 			DMINFO("Too many retries, attempting to re-establish server connection.");
 			lc->server_id = 0xDEAD;
@@ -519,7 +529,7 @@
 			}
 			clear_region_count -= i;
 			DMINFO(" - %d clear region requests wiped", i);
-
+			i=0;
 			DMINFO(" - Resending all mark region requests");
 			list_for_each_entry(rs, &marked_region_list, rs_list){
 				/* Resend only those associated with referenced log */
@@ -527,7 +537,7 @@
 					continue;
 				do {
 					retry = 0;
-					DMINFO("   - " SECTOR_FORMAT, rs->rs_region);
+					i++;
 					rtn = _consult_server(rs->rs_lc, rs->rs_region,
 							      LRT_MARK_REGION, NULL, &retry);
 					if (lc->server_id == 0xDEAD) {
@@ -536,6 +546,7 @@
 					}
 				} while(retry);
 			}
+			DMINFO(" - %d mark region requests resent", i);
 			DMINFO("Clean-up complete");
 			if(type == LRT_MARK_REGION){
 				/* we just handled all marks */
@@ -544,17 +555,7 @@
 				goto out;
 			} else {
 				DMINFO("Continuing request type, %d (%s)", type,
-				      (type == LRT_IS_CLEAN)? "LRT_IS_CLEAN":
-				      (type == LRT_IN_SYNC)? "LRT_IN_SYNC":
-				      (type == LRT_MARK_REGION)? "LRT_MARK_REGION":
-				      (type == LRT_GET_RESYNC_WORK)? "LRT_GET_RESYNC_WORK":
-				      (type == LRT_GET_SYNC_COUNT)? "LRT_GET_SYNC_COUNT":
-				      (type == LRT_CLEAR_REGION)? "LRT_CLEAR_REGION":
-				      (type == LRT_COMPLETE_RESYNC_WORK)? "LRT_COMPLETE_RESYNC_WORK":
-				      (type == LRT_MASTER_LEAVING)? "LRT_MASTER_LEAVING":
-				      (type == LRT_ELECTION)? "LRT_ELECTION":
-				      (type == LRT_SELECTION)? "LRT_SELECTION": "UNKNOWN"
-					);
+				       RQ_STRING(type));
 			}
 			new_server = 0;
 		}
@@ -886,7 +887,6 @@
 	struct log_c *lc = (struct log_c *) log->context;
   
 	/* check known_regions, return if found */
-	insync_req++;
 /* take out optimization
 	if(atomic_read(&lc->in_sync) == 1){
 		return 1;
@@ -915,8 +915,6 @@
 	struct region_state *rs, *tmp_rs, *rs_new;
 	struct log_c *lc = (struct log_c *) log->context;
 
-	mark_req++;
-
 	rs_new = mempool_alloc(region_state_pool, GFP_KERNEL);
 
 	memset(rs_new, 0, sizeof(struct region_state));
@@ -924,8 +922,10 @@
 	spin_lock(&region_state_lock);
 	list_for_each_entry_safe(rs, tmp_rs, &clear_region_list, rs_list){
 		if(lc == rs->rs_lc && region == rs->rs_region){
+			/*
 			DMDEBUG("Mark pre-empting clear (%Lu/%s)",
 				region, lc->uuid + (strlen(lc->uuid) - 8));
+			*/
 			list_del_init(&rs->rs_list);
 			list_add(&rs->rs_list, &marked_region_list);
 			clear_region_count--;
@@ -1007,7 +1007,6 @@
 {
 	struct log_c *lc = (struct log_c *) log->context;
 	struct region_state *rs, *tmp_rs, *rs_new;
-	clear_req++;
 
 	rs_new = mempool_alloc(region_state_pool, GFP_ATOMIC);
 
@@ -1140,21 +1139,6 @@
 		DMINFO("  Regions marked   : %d", j);
 		DMINFO("  Regions clearing : %d", i);
 
-		DMINFO("  Mark requests    : %d", mark_req);
-		if(mark_req)
-			DMINFO("  Mark req to serv : %d (%d%%)", mark_req2ser,
-			       (mark_req2ser*100)/mark_req);
-
-		DMINFO("  Clear requests   : %d", clear_req);
-		if(clear_req)
-			DMINFO("  Clear req to serv: %d (%d%%)", clear_req2ser,
-			       (clear_req2ser*100)/clear_req);
-
-		DMINFO("  Sync  requests   : %d", insync_req);
-		if(insync_req)
-			DMINFO("  Sync req to serv : %d (%d%%)", insync_req2ser,
-			       (insync_req2ser*100)/insync_req);
-
 		if(lc->server_id == my_id){
 			print_server_status(lc);
 		}
@@ -1216,9 +1200,11 @@
 		atomic_set(&lc->in_sync, 0);
 	}
 	spin_unlock(&log_list_lock);
-	
+
+	/*
 	if (likely(!shutting_down))
 		suspend_server();
+	*/
 
 	return 0;
 }
@@ -1267,7 +1253,9 @@
 		BUG();
 		break;
 	}
+	/*
 	resume_server();
+	*/
 	return 0;
 }
 
--- cluster/cmirror-kernel/src/Attic/dm-cmirror-server.c	2007/03/14 04:28:32	1.1.2.26
+++ cluster/cmirror-kernel/src/Attic/dm-cmirror-server.c	2007/03/22 22:21:59	1.1.2.27
@@ -911,17 +911,7 @@
 			DMDEBUG("Getting request while server (%u) is suspended:", my_id);
 			DMDEBUG(" - Requester :: %u", nodeid);
 			DMDEBUG(" - log uuid  :: %s", lc->uuid + (strlen(lc->uuid) - 8));
-			DMDEBUG(" - req type  :: %s",
-				(lr.lr_type == LRT_IS_CLEAN)? "LRT_IS_C	LEAN":
-				(lr.lr_type == LRT_IN_SYNC)? "LRT_IN_SYNC":
-				(lr.lr_type == LRT_MARK_REGION)? "LRT_MARK_REGION":
-				(lr.lr_type == LRT_GET_RESYNC_WORK)? "LRT_GET_RESYNC_WORK":
-				(lr.lr_type == LRT_GET_SYNC_COUNT)? "LRT_GET_SYNC_COUNT":
-				(lr.lr_type == LRT_CLEAR_REGION)? "LRT_CLEAR_REGION":
-				(lr.lr_type == LRT_COMPLETE_RESYNC_WORK)? "LRT_COMPLETE_RESYNC_WORK":
-				(lr.lr_type == LRT_MASTER_LEAVING)? "LRT_MASTER_LEAVING":
-				(lr.lr_type == LRT_ELECTION)? "LRT_ELECTION":
-				(lr.lr_type == LRT_SELECTION)? "LRT_SELECTION": "UNKNOWN");
+			DMDEBUG(" - req type  :: %s", RQ_STRING(lr.lr_type));
 			*/
 			if (my_id != nodeid) {
 				lr.u.lr_int_rtn = -ENXIO;
@@ -981,17 +971,7 @@
 		if(error){
 /*
 			DMWARN("Error (%d) while processing request (%s)",
-			       error,
-			       (lr.lr_type == LRT_IS_CLEAN)? "LRT_IS_CLEAN":
-			       (lr.lr_type == LRT_IN_SYNC)? "LRT_IN_SYNC":
-			       (lr.lr_type == LRT_MARK_REGION)? "LRT_MARK_REGION":
-			       (lr.lr_type == LRT_GET_RESYNC_WORK)? "LRT_GET_RESYNC_WORK":
-			       (lr.lr_type == LRT_GET_SYNC_COUNT)? "LRT_GET_SYNC_COUNT":
-			       (lr.lr_type == LRT_CLEAR_REGION)? "LRT_CLEAR_REGION":
-			       (lr.lr_type == LRT_COMPLETE_RESYNC_WORK)? "LRT_COMPLETE_RESYNC_WORK":
-			       (lr.lr_type == LRT_MASTER_LEAVING)? "LRT_MASTER_LEAVING":
-			       (lr.lr_type == LRT_ELECTION)? "LRT_ELECTION":
-			       (lr.lr_type == LRT_SELECTION)? "LRT_SELECTION": "UNKNOWN");
+			       error, RQ_STRING(lr.lr_type));
 */
 			lr.u.lr_int_rtn = error;
 		}
@@ -1011,17 +991,7 @@
 		set_fs(fs);
 		if(error < 0){
 			DMWARN("unable to sendmsg to client (type = %s, error = %d)",
-			       (lr.lr_type == LRT_IS_CLEAN)? "LRT_IS_CLEAN":
-			       (lr.lr_type == LRT_IN_SYNC)? "LRT_IN_SYNC":
-			       (lr.lr_type == LRT_MARK_REGION)? "LRT_MARK_REGION":
-			       (lr.lr_type == LRT_GET_RESYNC_WORK)? "LRT_GET_RESYNC_WORK":
-			       (lr.lr_type == LRT_GET_SYNC_COUNT)? "LRT_GET_SYNC_COUNT":
-			       (lr.lr_type == LRT_CLEAR_REGION)? "LRT_CLEAR_REGION":
-			       (lr.lr_type == LRT_COMPLETE_RESYNC_WORK)? "LRT_COMPLETE_RESYNC_WORK":
-			       (lr.lr_type == LRT_MASTER_LEAVING)? "LRT_MASTER_LEAVING":
-			       (lr.lr_type == LRT_ELECTION)? "LRT_ELECTION":
-			       (lr.lr_type == LRT_SELECTION)? "LRT_SELECTION": "UNKNOWN",
-			       error);
+			       RQ_STRING(lr.lr_type), error);
 			return error;
 		}
 	} else if(error == -EAGAIN || error == -ETIMEDOUT){
--- cluster/cmirror-kernel/src/Attic/dm-cmirror-xfr.h	2007/02/14 17:44:07	1.1.2.2
+++ cluster/cmirror-kernel/src/Attic/dm-cmirror-xfr.h	2007/03/22 22:21:59	1.1.2.3
@@ -25,8 +25,21 @@
 
 #define CLUSTER_LOG_PORT 51005
 
+#define RQ_STRING(x) \
+	((x) == LRT_IS_CLEAN) ? "LRT_IS_CLEAN": \
+	((x) == LRT_IN_SYNC) ? "LRT_IN_SYNC": \
+	((x) == LRT_MARK_REGION) ? "LRT_MARK_REGION": \
+	((x) == LRT_GET_RESYNC_WORK) ? "LRT_GET_RESYNC_WORK": \
+	((x) == LRT_GET_SYNC_COUNT) ? "LRT_GET_SYNC_COUNT": \
+	((x) == LRT_CLEAR_REGION) ? "LRT_CLEAR_REGION": \
+	((x) == LRT_COMPLETE_RESYNC_WORK) ? "LRT_COMPLETE_RESYNC_WORK": \
+	((x) == LRT_MASTER_LEAVING) ? "LRT_MASTER_LEAVING": \
+	((x) == LRT_ELECTION) ? "LRT_ELECTION": \
+	((x) == LRT_SELECTION) ? "LRT_SELECTION": "UNKNOWN"
+
 struct log_request {
 	int lr_type;
+	int lr_seq;
 	union {
 		struct {
 			uint32_t lr_starter;



^ permalink raw reply	[flat|nested] 40+ messages in thread

* [Cluster-devel] cluster/cmirror-kernel/src dm-cmirror-client.c ...
@ 2007-03-14  4:28 jbrassow
  0 siblings, 0 replies; 40+ messages in thread
From: jbrassow @ 2007-03-14  4:28 UTC (permalink / raw)
  To: cluster-devel.redhat.com

CVSROOT:	/cvs/cluster
Module name:	cluster
Branch: 	RHEL4
Changes by:	jbrassow at sourceware.org	2007-03-14 04:28:32

Modified files:
	cmirror-kernel/src: dm-cmirror-client.c dm-cmirror-server.c 

Log message:
	Bug 231230: leg failure on cmirrors causes devices to be stuck in SUSPE...
	
	The problem here appears to be timeouts related to clvmd.
	During failures under heavy load, clvmd commands (suspend/resume/
	activate/deactivate) can take a long time.  Clvmd assumes too quickly
	that they have failed.  This results in the fault handling being left
	half done.  Further calls to vgreduce (by hand or by dmeventd) will
	not help because the _on-disk_ version of the meta-data is consistent -
	that is, the faulty device has been removed.
	
	The most significant change in this patch is the removal of the
	'is_remote_recovering' function.  This function was designed to check
	if a remote node was recovering a region so that writes to the region
	could be delayed.  However, even with this function, it was possible
	for a remote node to begin recovery on a region _after_ the function
	was called, but before the write (mark request) took place.  Because
	of this, checking is done during the mark request stage - rendering
	the call to 'is_remote_recovering' meaningless.  Given the useless
	nature of this function, it has been pulled.  The benefits of its
	removal are increased performance and much faster (more than an
	order of magnitude) response during the mirror suspend process.
	
	The faster suspend process leads to less clvmd timeouts and
	reduced probability that bug 231230 will be triggered.
	
	However, when a mirror device is reconfigured, the mirror sub-devices
	are removed.  This is done by activating them cluster-wide before
	their removal.  With high enough load during recovery, these operations
	can still take a long time - even though they are linear devices.
	This too has the potential for causing clvmd to timeout and trigger
	bug 231230.  There is no cluster logging fix for this issue.  The
	delay on the linear devices must be determined.  A temporary
	work-around would be to increase the timeout of clvmd (e.g. clvmd -t #).

Patches:
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cmirror-kernel/src/dm-cmirror-client.c.diff?cvsroot=cluster&only_with_tag=RHEL4&r1=1.1.2.40&r2=1.1.2.41
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cmirror-kernel/src/dm-cmirror-server.c.diff?cvsroot=cluster&only_with_tag=RHEL4&r1=1.1.2.25&r2=1.1.2.26

--- cluster/cmirror-kernel/src/Attic/dm-cmirror-client.c	2007/03/02 22:31:14	1.1.2.40
+++ cluster/cmirror-kernel/src/Attic/dm-cmirror-client.c	2007/03/14 04:28:32	1.1.2.41
@@ -647,12 +647,12 @@
 		}
 	}
 
+	list_add(&lc->log_list, &log_list_head);
+	spin_unlock(&log_list_lock);
 	DMDEBUG("Creating %s (%d)",
 	       lc->uuid + (strlen(lc->uuid) - 8),
 	       lc->uuid_ref);
 
-	list_add(&lc->log_list, &log_list_head);
-	spin_unlock(&log_list_lock);
 	INIT_LIST_HEAD(&lc->region_users);
 
 	lc->server_id = 0xDEAD;
@@ -767,6 +767,11 @@
 	list_del_init(&lc->log_list);
 	spin_unlock(&log_list_lock);
 
+	if ((lc->server_id == my_id) && !atomic_read(&lc->suspended))
+		consult_server(lc, 0, LRT_MASTER_LEAVING, NULL);
+
+	sock_release(lc->client_sock);
+
 	spin_lock(&region_state_lock);
 
 	list_for_each_entry_safe(rs, tmp_rs, &clear_region_list, rs_list) {
@@ -781,11 +786,6 @@
 
 	spin_unlock(&region_state_lock);
 
-	if ((lc->server_id == my_id) && !atomic_read(&lc->suspended))
-		consult_server(lc, 0, LRT_MASTER_LEAVING, NULL);
-
-	sock_release(lc->client_sock);
-
 	if (lc->log_dev)
 		disk_dtr(log);
 	else
@@ -844,7 +844,6 @@
 	lc->sync_search = 0;
 	resume_server_requests();
 	atomic_set(&lc->suspended, 0);
-	consult_server(lc, 0, LRT_IN_SYNC, NULL);
 
 	return 0;
 }
@@ -1354,7 +1353,7 @@
 	.resume = cluster_resume,
 	.get_region_size = cluster_get_region_size,
 	.is_clean = cluster_is_clean,
-	.is_remote_recovering = cluster_is_remote_recovering,
+/*	.is_remote_recovering = cluster_is_remote_recovering,*/
 	.in_sync = cluster_in_sync,
 	.flush = cluster_flush,
 	.mark_region = cluster_mark_region,
@@ -1376,7 +1375,7 @@
 	.resume = cluster_resume,
 	.get_region_size = cluster_get_region_size,
 	.is_clean = cluster_is_clean,
-	.is_remote_recovering = cluster_is_remote_recovering,
+/*	.is_remote_recovering = cluster_is_remote_recovering,*/
 	.in_sync = cluster_in_sync,
 	.flush = cluster_flush,
 	.mark_region = cluster_mark_region,
--- cluster/cmirror-kernel/src/Attic/dm-cmirror-server.c	2007/02/26 17:38:06	1.1.2.25
+++ cluster/cmirror-kernel/src/Attic/dm-cmirror-server.c	2007/03/14 04:28:32	1.1.2.26
@@ -619,7 +619,7 @@
 			lc->sync_count++;
 		}
 	} else if (log_test_bit(lc->sync_bits, lr->u.lr_region)) {
-		lc->sync_count--;
+		/* gone again: lc->sync_count--;*/
 		log_clear_bit(lc, lc->sync_bits, lr->u.lr_region);
 	}
 



^ permalink raw reply	[flat|nested] 40+ messages in thread

* [Cluster-devel] cluster/cmirror-kernel/src dm-cmirror-client.c ...
@ 2007-02-26 17:38 jbrassow
  0 siblings, 0 replies; 40+ messages in thread
From: jbrassow @ 2007-02-26 17:38 UTC (permalink / raw)
  To: cluster-devel.redhat.com

CVSROOT:	/cvs/cluster
Module name:	cluster
Branch: 	RHEL4
Changes by:	jbrassow at sourceware.org	2007-02-26 17:38:06

Modified files:
	cmirror-kernel/src: dm-cmirror-client.c dm-cmirror-server.c 

Log message:
	add locking around the log list.  There was a small window of opportunity
	for the log server to look up a log in the list while another entry was
	being deleted (bad for the server).
	
	Bug 229715 Processed: cmirror panic in dm_cmirror:cluster_log_serverd

Patches:
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cmirror-kernel/src/dm-cmirror-client.c.diff?cvsroot=cluster&only_with_tag=RHEL4&r1=1.1.2.38&r2=1.1.2.39
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cmirror-kernel/src/dm-cmirror-server.c.diff?cvsroot=cluster&only_with_tag=RHEL4&r1=1.1.2.24&r2=1.1.2.25

--- cluster/cmirror-kernel/src/Attic/dm-cmirror-client.c	2007/02/20 19:35:10	1.1.2.38
+++ cluster/cmirror-kernel/src/Attic/dm-cmirror-client.c	2007/02/26 17:38:06	1.1.2.39
@@ -28,6 +28,7 @@
 #include "dm-cmirror-server.h"
 #include "dm-cmirror-cman.h"
 
+spinlock_t log_list_lock;
 LIST_HEAD(log_list_head);
 
 struct region_state {
@@ -635,6 +636,7 @@
 	atomic_set(&lc->in_sync, -1);
 	lc->uuid_ref = 1;
 
+	spin_lock(&log_list_lock);
 	list_for_each_entry(tmp_lc, &log_list_head, log_list){
 		if(!strncmp(tmp_lc->uuid, lc->uuid, MAX_NAME_LEN)){
 			lc->uuid_ref = (lc->uuid_ref > tmp_lc->uuid_ref) ?
@@ -647,6 +649,7 @@
 	       lc->uuid_ref);
 
 	list_add(&lc->log_list, &log_list_head);
+	spin_unlock(&log_list_lock);
 	INIT_LIST_HEAD(&lc->region_users);
 
 	lc->server_id = 0xDEAD;
@@ -757,7 +760,9 @@
 	if (!list_empty(&clear_region_list))
 		DMINFO("Leaving while clear region requests remain.");
 
+	spin_lock(&log_list_lock);
 	list_del_init(&lc->log_list);
+	spin_unlock(&log_list_lock);
 
 	if ((lc->server_id == my_id) && !atomic_read(&lc->suspended))
 		consult_server(lc, 0, LRT_MASTER_LEAVING, NULL);
@@ -1204,9 +1209,11 @@
 
 	atomic_set(&suspend_client, 1);
 
+	spin_lock(&log_list_lock);
 	list_for_each_entry(lc, &log_list_head, log_list) {
 		atomic_set(&lc->in_sync, 0);
 	}
+	spin_unlock(&log_list_lock);
 	
 	if (likely(!shutting_down))
 		suspend_server();
@@ -1238,6 +1245,7 @@
 	switch(type){
 	case SERVICE_NODE_LEAVE:
 	case SERVICE_NODE_FAILED:
+		spin_lock(&log_list_lock);
 		list_for_each_entry(lc, &log_list_head, log_list){
 			for(i=0, server = 0xDEAD; i < count; i++){
 				if(lc->server_id == nodeids[i]){
@@ -1247,6 +1255,8 @@
 			/* ATTENTION -- need locking around this ? */
 			lc->server_id = server;
 		}
+		spin_unlock(&log_list_lock);
+
 		break;
 	case SERVICE_NODE_JOIN:
 		break;
@@ -1387,6 +1397,7 @@
 	INIT_LIST_HEAD(&marked_region_list);
 
 	spin_lock_init(&region_state_lock);
+	spin_lock_init(&log_list_lock);
 	region_state_pool = mempool_create(20, region_state_alloc,
 					   region_state_free, NULL);
 	if(!region_state_pool){
--- cluster/cmirror-kernel/src/Attic/dm-cmirror-server.c	2007/02/21 17:14:44	1.1.2.24
+++ cluster/cmirror-kernel/src/Attic/dm-cmirror-server.c	2007/02/26 17:38:06	1.1.2.25
@@ -47,6 +47,7 @@
 static atomic_t _do_requests;
 
 static int debug_disk_write = 0;
+extern spinlock_t log_list_lock;
 extern struct list_head log_list_head;
 
 static void *region_user_alloc(int gfp_mask, void *pool_data){
@@ -649,6 +650,7 @@
 static struct log_c *get_log_context(char *uuid, int uuid_ref){
 	struct log_c *lc, *r = NULL;
 
+	spin_lock(&log_list_lock);
 	list_for_each_entry(lc, &log_list_head, log_list){
 		if (!strncmp(lc->uuid, uuid, MAX_NAME_LEN) &&
 		    (uuid_ref == lc->uuid_ref)) {
@@ -658,6 +660,7 @@
 				r = lc;
 		}
 	}
+	spin_unlock(&log_list_lock);
 
 	return r;
 }
@@ -1079,6 +1082,7 @@
 			if (atomic_read(&restart_event_type) == SERVICE_NODE_FAILED)
 				DMINFO("A cluster mirror log member has failed.");
 			
+			spin_lock(&log_list_lock);
 			list_for_each_entry(lc, &log_list_head, log_list){
 				if(lc->server_id == my_id){
 					if (atomic_read(&lc->suspended)) {
@@ -1088,6 +1092,8 @@
 					}
 				}
 			}
+			spin_unlock(&log_list_lock);
+
 			break;
 		default:
 			/* Someone has joined, or there is no event */



^ permalink raw reply	[flat|nested] 40+ messages in thread

* [Cluster-devel] cluster/cmirror-kernel/src dm-cmirror-client.c ...
@ 2007-02-19 16:29 jbrassow
  0 siblings, 0 replies; 40+ messages in thread
From: jbrassow @ 2007-02-19 16:29 UTC (permalink / raw)
  To: cluster-devel.redhat.com

CVSROOT:	/cvs/cluster
Module name:	cluster
Branch: 	RHEL4
Changes by:	jbrassow at sourceware.org	2007-02-19 16:29:43

Modified files:
	cmirror-kernel/src: dm-cmirror-client.c dm-cmirror-server.c 

Log message:
	Bug 228070: DLM assertion when running GFS I/O during cmirror leg failure
	
	When a log server drops out of the cluster, it ignores any requests -
	forcing the clients to retry.  Unfortunately, the clients never ran
	another election - causing operations to stall.  The server now replies
	that it cannot handle the requests, which causes proper initiation of
	elections.

Patches:
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cmirror-kernel/src/dm-cmirror-client.c.diff?cvsroot=cluster&only_with_tag=RHEL4&r1=1.1.2.36&r2=1.1.2.37
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cmirror-kernel/src/dm-cmirror-server.c.diff?cvsroot=cluster&only_with_tag=RHEL4&r1=1.1.2.21&r2=1.1.2.22

--- cluster/cmirror-kernel/src/Attic/dm-cmirror-client.c	2007/02/14 17:44:07	1.1.2.36
+++ cluster/cmirror-kernel/src/Attic/dm-cmirror-client.c	2007/02/19 16:29:42	1.1.2.37
@@ -1058,6 +1058,10 @@
 
 	rtn = consult_server(lc, 0, LRT_GET_RESYNC_WORK, region);
 
+	if (*region > lc->region_count) {
+		DMWARN("Error while getting resync work: bad region");
+		rtn = 0;
+	}
 	return rtn;
 }
 
--- cluster/cmirror-kernel/src/Attic/dm-cmirror-server.c	2007/02/14 17:44:07	1.1.2.21
+++ cluster/cmirror-kernel/src/Attic/dm-cmirror-server.c	2007/02/19 16:29:42	1.1.2.22
@@ -938,7 +938,7 @@
 		case LRT_MARK_REGION:
 			if(!(nodeid = 
 			     ipaddr_to_nodeid((struct sockaddr *)msg.msg_name))){
-				return -EINVAL;
+				error = -ENXIO;
 				break;
 			}
 			error = server_mark_region(lc, &lr, nodeid);
@@ -947,7 +947,7 @@
 		case LRT_CLEAR_REGION:
 			if(!(nodeid = 
 			     ipaddr_to_nodeid((struct sockaddr *)msg.msg_name))){
-				return -EINVAL;
+				error = -ENXIO;
 				break;
 			}
 			error = server_clear_region(lc, &lr, nodeid);
@@ -955,7 +955,7 @@
 		case LRT_GET_RESYNC_WORK:
 			if(!(nodeid = 
 			     ipaddr_to_nodeid((struct sockaddr *)msg.msg_name))){
-				return -EINVAL;
+				error = -ENXIO;
 				break;
 			}
 			error = server_get_resync_work(lc, &lr, nodeid);



^ permalink raw reply	[flat|nested] 40+ messages in thread

* [Cluster-devel] cluster/cmirror-kernel/src dm-cmirror-client.c ...
@ 2007-02-14 17:44 jbrassow
  0 siblings, 0 replies; 40+ messages in thread
From: jbrassow @ 2007-02-14 17:44 UTC (permalink / raw)
  To: cluster-devel.redhat.com

CVSROOT:	/cvs/cluster
Module name:	cluster
Branch: 	RHEL4
Changes by:	jbrassow at sourceware.org	2007-02-14 17:44:08

Modified files:
	cmirror-kernel/src: dm-cmirror-client.c dm-cmirror-common.h 
	                    dm-cmirror-server.c dm-cmirror-xfr.h 

Log message:
	Changes to fix the following bugs:
	Bug 228104: greater than 2 legged cluster mirrors do not down ...
	Bug 228056: lvconvert should give warning if we don't support ...
	
	When converting from 3-way to 2-way mirror, the UUID for the mirror
	stays the same.  This creates conflicting entries in the cluster
	logging code.  I've added an additional identifier to allow for
	unique identification in these cases.

Patches:
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cmirror-kernel/src/dm-cmirror-client.c.diff?cvsroot=cluster&only_with_tag=RHEL4&r1=1.1.2.35&r2=1.1.2.36
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cmirror-kernel/src/dm-cmirror-common.h.diff?cvsroot=cluster&only_with_tag=RHEL4&r1=1.1.2.10&r2=1.1.2.11
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cmirror-kernel/src/dm-cmirror-server.c.diff?cvsroot=cluster&only_with_tag=RHEL4&r1=1.1.2.20&r2=1.1.2.21
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cmirror-kernel/src/dm-cmirror-xfr.h.diff?cvsroot=cluster&only_with_tag=RHEL4&r1=1.1.2.1&r2=1.1.2.2

--- cluster/cmirror-kernel/src/Attic/dm-cmirror-client.c	2007/02/02 17:22:55	1.1.2.35
+++ cluster/cmirror-kernel/src/Attic/dm-cmirror-client.c	2007/02/14 17:44:07	1.1.2.36
@@ -257,6 +257,7 @@
 	lr.u.lr_starter = my_id;
 	lr.u.lr_coordinator = initial_server;
 	memcpy(lr.lr_uuid, lc->uuid, MAX_NAME_LEN);
+	lr.lr_uuid_ref = lc->uuid_ref;
 
 	memset(&saddr_in, 0, sizeof(struct sockaddr_cl));
 
@@ -342,6 +343,7 @@
 		lr->u.lr_int_rtn = (*result) ? 1 : 0;
 
 	memcpy(lr->lr_uuid, lc->uuid, MAX_NAME_LEN);
+	lr->lr_uuid_ref = lc->uuid_ref;
 
 	memset(&saddr_in, 0, sizeof(struct sockaddr_in));
 
@@ -630,16 +632,19 @@
 	}
 
 	atomic_set(&lc->in_sync, -1);
+	lc->uuid_ref = 1;
 
 	list_for_each_entry(tmp_lc, &log_list_head, log_list){
 		if(!strncmp(tmp_lc->uuid, lc->uuid, MAX_NAME_LEN)){
-			DMERR("Log already exists with uuid, %s",
-			      lc->uuid + (strlen(lc->uuid) - 8));
-			error = -EINVAL;
-			goto fail;
+			lc->uuid_ref = (lc->uuid_ref > tmp_lc->uuid_ref) ?
+				lc->uuid_ref : tmp_lc->uuid_ref + 1;
 		}
 	}
 
+	DMDEBUG("Creating %s (%d)",
+	       lc->uuid + (strlen(lc->uuid) - 8),
+	       lc->uuid_ref);
+
 	list_add(&lc->log_list, &log_list_head);
 	INIT_LIST_HEAD(&lc->region_users);
 
@@ -744,10 +749,15 @@
 	struct log_c *lc = (struct log_c *) log->context;
 	struct region_state *rs, *tmp_rs;
 
+	DMDEBUG("Removing %s (%d)",
+	       lc->uuid + (strlen(lc->uuid) - 8),
+	       lc->uuid_ref);
+
 	if (!list_empty(&clear_region_list))
 		DMINFO("Leaving while clear region requests remain.");
 
 	list_del_init(&lc->log_list);
+
 	if ((lc->server_id == my_id) && !atomic_read(&lc->suspended))
 		consult_server(lc, 0, LRT_MASTER_LEAVING, NULL);
 
@@ -767,6 +777,7 @@
 
 	spin_unlock(&region_state_lock);
 
+
 	if (lc->log_dev)
 		disk_dtr(log);
 	else
--- cluster/cmirror-kernel/src/Attic/dm-cmirror-common.h	2006/07/27 23:11:55	1.1.2.10
+++ cluster/cmirror-kernel/src/Attic/dm-cmirror-common.h	2007/02/14 17:44:07	1.1.2.11
@@ -129,6 +129,7 @@
 	 * Cluster log fields
 	 */
 	char uuid[MAX_NAME_LEN];
+	int uuid_ref;
 	atomic_t in_sync;  /* like sync_count, except all or nothing */
 
 	struct list_head log_list;
--- cluster/cmirror-kernel/src/Attic/dm-cmirror-server.c	2007/02/02 17:22:55	1.1.2.20
+++ cluster/cmirror-kernel/src/Attic/dm-cmirror-server.c	2007/02/14 17:44:07	1.1.2.21
@@ -645,11 +645,12 @@
 	}
 }
 
-static struct log_c *get_log_context(char *uuid){
+static struct log_c *get_log_context(char *uuid, int uuid_ref){
 	struct log_c *lc, *r = NULL;
 
 	list_for_each_entry(lc, &log_list_head, log_list){
-		if(!strncmp(lc->uuid, uuid, MAX_NAME_LEN)){
+		if (!strncmp(lc->uuid, uuid, MAX_NAME_LEN) &&
+		    (uuid_ref == lc->uuid_ref)) {
 			if (r)
 				report_duplicate_log(lc);
 			else
@@ -866,7 +867,7 @@
 		if(error < sizeof(struct log_request)){
 			DMERR("Cluster mirror log server received incomplete message.");
 		}
-		lc = get_log_context(lr.lr_uuid);
+		lc = get_log_context(lr.lr_uuid, lr.lr_uuid_ref);
 
 		if(lr.lr_type == LRT_ELECTION ||
 		   lr.lr_type == LRT_SELECTION ||
--- cluster/cmirror-kernel/src/Attic/dm-cmirror-xfr.h	2005/07/27 16:09:31	1.1.2.1
+++ cluster/cmirror-kernel/src/Attic/dm-cmirror-xfr.h	2007/02/14 17:44:07	1.1.2.2
@@ -41,6 +41,7 @@
 		};
 	} u;
 	char lr_uuid[MAX_NAME_LEN];
+	int lr_uuid_ref;
 };
 
 int my_recvmsg(struct socket *sock, struct msghdr *msg,



^ permalink raw reply	[flat|nested] 40+ messages in thread

* [Cluster-devel] cluster/cmirror-kernel/src dm-cmirror-client.c ...
@ 2007-02-02 17:22 jbrassow
  0 siblings, 0 replies; 40+ messages in thread
From: jbrassow @ 2007-02-02 17:22 UTC (permalink / raw)
  To: cluster-devel.redhat.com

CVSROOT:	/cvs/cluster
Module name:	cluster
Branch: 	RHEL4
Changes by:	jbrassow at sourceware.org	2007-02-02 17:22:55

Modified files:
	cmirror-kernel/src: dm-cmirror-client.c dm-cmirror-server.c 

Log message:
	- Fix for bug #225337
	
	Reset 'sync_search' if (lc->sync_search >= lc->region_count) &&
	(lc->sync_count < lc->region_count).  It indicates that a failure
	during recovery has taken place, and we are likely able to handle
	it.
	
	Also, do not issue clear/mark region requests if it is already
	known that the log device has failed.

Patches:
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cmirror-kernel/src/dm-cmirror-client.c.diff?cvsroot=cluster&only_with_tag=RHEL4&r1=1.1.2.34&r2=1.1.2.35
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cmirror-kernel/src/dm-cmirror-server.c.diff?cvsroot=cluster&only_with_tag=RHEL4&r1=1.1.2.19&r2=1.1.2.20

--- cluster/cmirror-kernel/src/Attic/dm-cmirror-client.c	2007/01/08 19:28:26	1.1.2.34
+++ cluster/cmirror-kernel/src/Attic/dm-cmirror-client.c	2007/02/02 17:22:55	1.1.2.35
@@ -568,7 +568,7 @@
 		** while we are here.  If the clear region request fails, it**
 		** would be re-added - perhaps prematurely clearing the bit */
 		
-		if(rs){
+		if(rs && !rs->rs_lc->log_dev_failed){
 			_consult_server(rs->rs_lc, rs->rs_region,
 					LRT_CLEAR_REGION, NULL, &retry);
 
@@ -951,33 +951,35 @@
 
 	spin_unlock(&region_state_lock);
 
-	while((error = consult_server(lc, region, LRT_MARK_REGION, NULL))){
-		if (error == -EBUSY) {
-			/* Remote recovering delay and try again */
-			DMDEBUG("Delaying mark to region %Lu, due to recovery",
-				region);
-			set_current_state(TASK_INTERRUPTIBLE);
-			schedule_timeout(HZ/2);
-			continue;
-		}
+	if (!lc->log_dev_failed) {
+		while((error = consult_server(lc, region, LRT_MARK_REGION, NULL))){
+			if (error == -EBUSY) {
+				/* Remote recovering delay and try again */
+				DMDEBUG("Delaying mark to region %Lu, due to recovery",
+					region);
+				set_current_state(TASK_INTERRUPTIBLE);
+				schedule_timeout(HZ/2);
+				continue;
+			}
 
-		if (error == -EIO) {
-			lc->log_dev_failed = 1;
-			break;
+			if (error == -EIO) {
+				lc->log_dev_failed = 1;
+				break;
+			}
+			DMWARN("unable to get server (%u) to mark region (%Lu)",
+			       lc->server_id, region);
+			DMWARN("Reason :: %d", error);
 		}
-		DMWARN("unable to get server (%u) to mark region (%Lu)",
-		       lc->server_id, region);
-		DMWARN("Reason :: %d", error);
-	}
 
-	if (lc->log_dev_failed) {
-		dm_table_event(lc->ti->table);
-		/*
-		DMERR("Write failed on mirror log device, %s",
-		      lc->log_dev->name);
-		if (!atomic_read(&lc->suspended))
-			wait_for_completion(&lc->failure_completion);
-		*/
+		if (lc->log_dev_failed) {
+			dm_table_event(lc->ti->table);
+			/*
+			  DMERR("Write failed on mirror log device, %s",
+			  lc->log_dev->name);
+			  if (!atomic_read(&lc->suspended))
+			  wait_for_completion(&lc->failure_completion);
+			*/
+		}
 	}
 	return;
 }
--- cluster/cmirror-kernel/src/Attic/dm-cmirror-server.c	2007/01/08 19:28:26	1.1.2.19
+++ cluster/cmirror-kernel/src/Attic/dm-cmirror-server.c	2007/02/02 17:22:55	1.1.2.20
@@ -224,8 +224,16 @@
 
 static int _core_get_resync_work(struct log_c *lc, region_t *region)
 {
-	if (lc->sync_search >= lc->region_count){
-		return 0;
+	if (lc->sync_search >= lc->region_count) {
+		/*
+		 * FIXME: pvmove is not supported yet, but when it is,
+		 * an audit of sync_count changes will need to be made
+		 */
+		if (lc->sync_count < lc->region_count) {
+			lc->sync_search = 0;
+		} else {
+			return 0;
+		}
 	}
 	do {
 		*region = ext2_find_next_zero_bit((unsigned long *) lc->sync_bits,
@@ -557,9 +565,12 @@
 
 	if(!find_ru_by_region(lc, lr->u.lr_region)){
 		log_set_bit(lc, lc->clean_bits, lr->u.lr_region);
+		write_bits(lc);
+		/*
 		if (write_bits(lc))
 			DMERR("Write bits failed on mirror log device, %s",
 			      lc->log_dev->name);
+		*/
 	}
 	return 0;
 }



^ permalink raw reply	[flat|nested] 40+ messages in thread

* [Cluster-devel] cluster/cmirror-kernel/src dm-cmirror-client.c ...
@ 2007-01-08 19:28 jbrassow
  0 siblings, 0 replies; 40+ messages in thread
From: jbrassow @ 2007-01-08 19:28 UTC (permalink / raw)
  To: cluster-devel.redhat.com

CVSROOT:	/cvs/cluster
Module name:	cluster
Branch: 	RHEL4
Changes by:	jbrassow at sourceware.org	2007-01-08 19:28:26

Modified files:
	cmirror-kernel/src: dm-cmirror-client.c dm-cmirror-server.c 

Log message:
	Now that the kernel is correctly handling sync state change, we can
	remove the workaround in cmirror (keeping sync_count vs. decrementing
	it).
	
	Also moved some print statements to reduce console output.

Patches:
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cmirror-kernel/src/dm-cmirror-client.c.diff?cvsroot=cluster&only_with_tag=RHEL4&r1=1.1.2.33&r2=1.1.2.34
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cmirror-kernel/src/dm-cmirror-server.c.diff?cvsroot=cluster&only_with_tag=RHEL4&r1=1.1.2.18&r2=1.1.2.19

--- cluster/cmirror-kernel/src/Attic/dm-cmirror-client.c	2006/12/07 18:58:32	1.1.2.33
+++ cluster/cmirror-kernel/src/Attic/dm-cmirror-client.c	2007/01/08 19:28:26	1.1.2.34
@@ -961,21 +961,20 @@
 			continue;
 		}
 
-		DMWARN("unable to get server (%u) to mark region (%Lu)",
-		       lc->server_id, region);
-		DMWARN("Reason :: %d", error);
-
 		if (error == -EIO) {
 			lc->log_dev_failed = 1;
 			break;
 		}
+		DMWARN("unable to get server (%u) to mark region (%Lu)",
+		       lc->server_id, region);
+		DMWARN("Reason :: %d", error);
 	}
 
 	if (lc->log_dev_failed) {
-		DMERR("Write failed on mirror log device, %s",
-		      lc->log_dev->name);
 		dm_table_event(lc->ti->table);
 		/*
+		DMERR("Write failed on mirror log device, %s",
+		      lc->log_dev->name);
 		if (!atomic_read(&lc->suspended))
 			wait_for_completion(&lc->failure_completion);
 		*/
--- cluster/cmirror-kernel/src/Attic/dm-cmirror-server.c	2006/12/07 18:58:32	1.1.2.18
+++ cluster/cmirror-kernel/src/Attic/dm-cmirror-server.c	2007/01/08 19:28:26	1.1.2.19
@@ -516,8 +516,6 @@
 			lc->touched = 0;
 			lc->log_dev_failed = 0;
 		} else {
-			DMERR("Mark region failed (%d) on mirror log device, %s",
-			      r, lc->log_dev->name);
 			lc->log_dev_failed = 1;
 		}
 	} else if (ru->ru_rw == RU_RECOVER) {
@@ -608,7 +606,7 @@
 			lc->sync_count++;
 		}
 	} else if (log_test_bit(lc->sync_bits, lr->u.lr_region)) {
-		/* gone for now: lc->sync_count--; */
+		lc->sync_count--;
 		log_clear_bit(lc, lc->sync_bits, lr->u.lr_region);
 	}
 



^ permalink raw reply	[flat|nested] 40+ messages in thread

* [Cluster-devel] cluster/cmirror-kernel/src dm-cmirror-client.c ...
@ 2006-12-07 18:58 jbrassow
  0 siblings, 0 replies; 40+ messages in thread
From: jbrassow @ 2006-12-07 18:58 UTC (permalink / raw)
  To: cluster-devel.redhat.com

CVSROOT:	/cvs/cluster
Module name:	cluster
Branch: 	RHEL4
Changes by:	jbrassow at sourceware.org	2006-12-07 18:58:32

Modified files:
	cmirror-kernel/src: dm-cmirror-client.c dm-cmirror-server.c 

Log message:
	Get rid of a number of unnecessary messages, which spray to the console
	during errors and cause the mirror reconfiguration to take a long time.
	(This has even been seen to cause machines to be fenced if the load is too great.)

Patches:
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cmirror-kernel/src/dm-cmirror-client.c.diff?cvsroot=cluster&only_with_tag=RHEL4&r1=1.1.2.32&r2=1.1.2.33
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cmirror-kernel/src/dm-cmirror-server.c.diff?cvsroot=cluster&only_with_tag=RHEL4&r1=1.1.2.17&r2=1.1.2.18

--- cluster/cmirror-kernel/src/Attic/dm-cmirror-client.c	2006/12/05 17:49:08	1.1.2.32
+++ cluster/cmirror-kernel/src/Attic/dm-cmirror-client.c	2006/12/07 18:58:32	1.1.2.33
@@ -905,9 +905,8 @@
 	spin_lock(&region_state_lock);
 	list_for_each_entry_safe(rs, tmp_rs, &clear_region_list, rs_list){
 		if(lc == rs->rs_lc && region == rs->rs_region){
-#ifdef DEBUG
-			DMINFO("Mark pre-empting clear of region %Lu", region);
-#endif
+			DMDEBUG("Mark pre-empting clear (%Lu/%s)",
+				region, lc->uuid + (strlen(lc->uuid) - 8));
 			list_del_init(&rs->rs_list);
 			list_add(&rs->rs_list, &marked_region_list);
 			clear_region_count--;
@@ -1023,7 +1022,7 @@
 	** clearing without ever marking..................................... */
 
 	if(!rs_new){
-		DMERR("Unable to allocate region_state for mark.");
+		DMERR("Unable to allocate region_state for clear.");
 		BUG();
 	}
 
@@ -1058,9 +1057,6 @@
 	while(consult_server(lc, region, LRT_COMPLETE_RESYNC_WORK, &success_tmp)){
 		DMWARN("unable to notify server of completed resync work");
 	}
-	if (!success)
-		DMERR("Attempting to revert sync status of region #%llu", region);
-
 	return;
 }
 
@@ -1069,11 +1065,7 @@
 	int i;
 	region_t rtn;
 	struct log_c *lc = (struct log_c *) log->context;
-/* take out optimization
-	if(atomic_read(&lc->in_sync) == 1){
-		return lc->region_count;
-	}
-*/
+
 	/* Try to get sync count up to five times */
 	for (i = 0; i < 5 && consult_server(lc, 0, LRT_GET_SYNC_COUNT, &rtn); i++);
 	if(i >= 5){
--- cluster/cmirror-kernel/src/Attic/dm-cmirror-server.c	2006/09/05 17:50:11	1.1.2.17
+++ cluster/cmirror-kernel/src/Attic/dm-cmirror-server.c	2006/12/07 18:58:32	1.1.2.18
@@ -522,8 +522,8 @@
 		}
 	} else if (ru->ru_rw == RU_RECOVER) {
 		DMINFO("Attempt to mark a region " SECTOR_FORMAT 
-		      ", which is being recovered.",
-		      lr->u.lr_region);
+		      "/%s which is being recovered.",
+		       lr->u.lr_region, lc->uuid + (strlen(lc->uuid) - 8));
 		DMINFO("Current recoverer: %u", ru->ru_nodeid);
 		DMINFO("Mark requester   : %u", who);
 
@@ -534,8 +534,8 @@
 	} else {
 		DMWARN("Attempt to mark a already marked region (%u,"
 		       SECTOR_FORMAT
-		       ")",
-		       who, lr->u.lr_region);
+		       "/%s)",
+		       who, lr->u.lr_region, lc->uuid + (strlen(lc->uuid) - 8));
 		mempool_free(new, region_user_pool);
 	}
 
@@ -595,8 +595,6 @@
 
 
 static int server_complete_resync_work(struct log_c *lc, struct log_request *lr, int success){
-	uint32_t info;
-
 	if (lr->u.lr_region > lc->region_count) {
 		return -EINVAL;
 	}
@@ -610,30 +608,33 @@
 			lc->sync_count++;
 		}
 	} else if (log_test_bit(lc->sync_bits, lr->u.lr_region)) {
-		DMERR("complete_resync_work region going out-of-sync: disk failure");
 		/* gone for now: lc->sync_count--; */
 		log_clear_bit(lc, lc->sync_bits, lr->u.lr_region);
 	}
 
-	info = (uint32_t)(lc->region_count - lc->sync_count);
-
-	if((info < 10001 && !(info%1000)) ||
-	   (info < 1000 && !(info%100)) ||
-	   (info < 200 && !(info%25)) ||
-	   (info < 6)){
-		DMDEBUG(SECTOR_FORMAT " out-of-sync regions remaining for %s.",
-		       lc->region_count - lc->sync_count,
-		       lc->uuid + (strlen(lc->uuid) - 8));
-	}
 	return 0;
 }
 
 
 static int server_get_sync_count(struct log_c *lc, struct log_request *lr){
+	if (lc->sync_count > lc->region_count) {
+		DMERR("sync_count (" SECTOR_FORMAT ") > region_count (" SECTOR_FORMAT ") in %s!",
+		      lc->sync_count, lc->region_count, lc->uuid + (strlen(lc->uuid) - 8));
+		disk_resume(lc);
+	}
+
 	lr->u.lr_region_rtn = lc->sync_count;
 	return 0;
 }
 
+static void report_duplicate_log(struct log_c *lc)
+{
+	DMERR("HEY!!! There are two matches for %s",
+	      lc->uuid + (strlen(lc->uuid) - 8));
+	list_for_each_entry(lc, &log_list_head, log_list) {
+		DMERR("  %s", lc->uuid + (strlen(lc->uuid) - 8));
+	}
+}
 
 static struct log_c *get_log_context(char *uuid){
 	struct log_c *lc, *r = NULL;
@@ -641,8 +642,7 @@
 	list_for_each_entry(lc, &log_list_head, log_list){
 		if(!strncmp(lc->uuid, uuid, MAX_NAME_LEN)){
 			if (r)
-				DMERR("HEY!!! There are two matches for %s",
-				      lc->uuid + (strlen(lc->uuid) - 8));
+				report_duplicate_log(lc);
 			else
 				r = lc;
 		}
@@ -932,6 +932,7 @@
 				break;
 			}
 			error = server_mark_region(lc, &lr, nodeid);
+			lr.u.lr_int_rtn = 0;
 			break;
 		case LRT_CLEAR_REGION:
 			if(!(nodeid = 



^ permalink raw reply	[flat|nested] 40+ messages in thread

* [Cluster-devel] cluster/cmirror-kernel/src dm-cmirror-client.c ...
@ 2006-09-05 17:50 jbrassow
  0 siblings, 0 replies; 40+ messages in thread
From: jbrassow @ 2006-09-05 17:50 UTC (permalink / raw)
  To: cluster-devel.redhat.com

CVSROOT:	/cvs/cluster
Module name:	cluster
Branch: 	RHEL4
Changes by:	jbrassow at sourceware.org	2006-09-05 17:50:11

Modified files:
	cmirror-kernel/src: dm-cmirror-client.c dm-cmirror-server.c 

Log message:
	- fix the bugs I've seen so far - mostly related to the recently added
	ability to migrate the log server on suspension - that cause hangs
	during combinations of create/delete/convert of mirrors

Patches:
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cmirror-kernel/src/dm-cmirror-client.c.diff?cvsroot=cluster&only_with_tag=RHEL4&r1=1.1.2.23&r2=1.1.2.24
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cmirror-kernel/src/dm-cmirror-server.c.diff?cvsroot=cluster&only_with_tag=RHEL4&r1=1.1.2.16&r2=1.1.2.17

--- cluster/cmirror-kernel/src/Attic/dm-cmirror-client.c	2006/07/27 23:11:55	1.1.2.23
+++ cluster/cmirror-kernel/src/Attic/dm-cmirror-client.c	2006/09/05 17:50:11	1.1.2.24
@@ -396,8 +396,8 @@
 	set_fs(get_ds());
 
 	if(type == LRT_MASTER_LEAVING){
-		len = sock_recvmsg(lc->client_sock, &msg, sizeof(struct log_request),
-				   /* WAIT for it */0);
+		len = my_recvmsg(lc->client_sock, &msg, sizeof(struct log_request),
+				 0, 10);
 	} else {
 		len = my_recvmsg(lc->client_sock, &msg, sizeof(struct log_request),
 				 0, 5);
@@ -419,7 +419,7 @@
 		goto fail;
 	}
 
-	if(lr->u.lr_int_rtn == -ENXIO){
+	if (lr->u.lr_int_rtn == -ENXIO) {
 		lc->server_id = 0xDEAD;
 		*retry = 1;
 		goto fail;
@@ -591,7 +591,7 @@
 		       unsigned int argc, char **argv, int disk)
 {
 	int error = 0;
-	struct log_c *lc;
+	struct log_c *lc, *tmp_lc;
 	struct sockaddr_in saddr_in;
 
 	if (!disk) {
@@ -621,6 +621,15 @@
 
 	atomic_set(&lc->in_sync, -1);
 
+	list_for_each_entry(tmp_lc, &log_list_head, log_list){
+		if(!strncmp(tmp_lc->uuid, lc->uuid, MAX_NAME_LEN)){
+			DMERR("Log already exists with uuid, %s",
+			      lc->uuid + (strlen(lc->uuid) - 8));
+			error = -EINVAL;
+			goto fail;
+		}
+	}
+
 	list_add(&lc->log_list, &log_list_head);
 	INIT_LIST_HEAD(&lc->region_users);
 
@@ -730,6 +739,7 @@
 	list_del_init(&lc->log_list);
 	if ((lc->server_id == my_id) && !atomic_read(&lc->suspended))
 		consult_server(lc, 0, LRT_MASTER_LEAVING, NULL);
+
 	sock_release(lc->client_sock);
 
 	if (lc->log_dev)
@@ -748,6 +758,7 @@
 
 static int cluster_postsuspend(struct dirty_log *log)
 {
+	int r;
 	struct log_c *lc = (struct log_c *) log->context;
 
 	while (1) {
@@ -765,12 +776,16 @@
 	if(lc->server_id == my_id) {
 		while (1) {
 			consult_server(lc, 0, LRT_MASTER_LEAVING, NULL);
+
 			down(&consult_server_lock);
 			run_election(lc, 0xDEAD);
 			up(&consult_server_lock);
-			if (lc->server_id == my_id) {
+
+			if ((my_id && (lc->server_id == my_id))) {
+				atomic_set(&lc->suspended, 0);
 				set_current_state(TASK_INTERRUPTIBLE);
-				schedule_timeout(HZ/4);
+				schedule_timeout(HZ*2);
+				atomic_set(&lc->suspended, 1);
 			} else {
 				break;
 			}
@@ -1005,7 +1020,7 @@
 	if (!success) {
 		DMERR("Attempting to revert sync status of region #%llu", region);
 		set_current_state(TASK_INTERRUPTIBLE);
-		schedule_timeout(HZ/50);
+		schedule_timeout(HZ/5);
 	}
 
 	return;
--- cluster/cmirror-kernel/src/Attic/dm-cmirror-server.c	2006/07/27 23:11:55	1.1.2.16
+++ cluster/cmirror-kernel/src/Attic/dm-cmirror-server.c	2006/09/05 17:50:11	1.1.2.17
@@ -107,7 +107,9 @@
 	if (!log->log_dev)
 		return 0;
 
-	BUG_ON(atomic_read(&log->suspended));
+	if (atomic_read(&log->suspended))
+		return -EDEADLK;
+
 	r = dm_io_sync_vm(1, &log->header_location, READ,
 			  log->disk_header, &ebits);
 	if (unlikely(r))
@@ -138,7 +140,9 @@
 	if (!log->log_dev)
 		return 0;
 
-	BUG_ON(atomic_read(&log->suspended));
+	if (atomic_read(&log->suspended))
+		return -EDEADLK;
+
 	header_to_disk(&log->header, log->disk_header);
 	return dm_io_sync_vm(1, &log->header_location, WRITE,
 			     log->disk_header, &ebits);
@@ -182,7 +186,9 @@
 	if (!log->log_dev)
 		return 0;
 
-	BUG_ON(atomic_read(&log->suspended));
+	if (atomic_read(&log->suspended))
+		return -EDEADLK;
+
 	r = dm_io_sync_vm(1, &log->bits_location, READ,
 			  log->clean_bits, &ebits);
 
@@ -199,7 +205,9 @@
 	if (!log->log_dev)
 		return 0;
 
-	BUG_ON(atomic_read(&log->suspended));
+	if (atomic_read(&log->suspended))
+		return -EDEADLK;
+
 	return dm_io_sync_vm(1, &log->bits_location, WRITE,
 			     log->clean_bits, &ebits);
 }
@@ -252,9 +260,9 @@
 			continue;
 		} else if(str[i] == 0xFF){
 			if(range_count==1){
-				DMINFO("  %d", region - 1);
+				DMDEBUG("  %d", region - 1);
 			} else if(range_count){
-				DMINFO("  %d - %d", region-range_count, region-1);
+				DMDEBUG("  %d - %d", region-range_count, region-1);
 			}
 			range_count = 0;
 			region+=(bit_count < 8)? bit_count: 8;      
@@ -272,9 +280,9 @@
 				count++;
 			} else {
 				if(range_count==1){
-					DMINFO("  %d", region - 1);
+					DMDEBUG("  %d", region - 1);
 				} else if(range_count){
-					DMINFO("  %d - %d", region-range_count, region-1);
+					DMDEBUG("  %d - %d", region-range_count, region-1);
 				}
 				range_count = 0;
 				region++;
@@ -283,9 +291,9 @@
 	}
 
 	if(range_count==1){
-		DMINFO("  %d", region - 1);
+		DMDEBUG("  %d", region - 1);
 	} else if(range_count){
-		DMINFO("  %d - %d", region-range_count, region);
+		DMDEBUG("  %d - %d", region-range_count, region);
 	}
 	return count;
 }
@@ -312,7 +320,7 @@
 	i = 1;
 	if (!lc->log_dev_failed &&
 	    ((r = read_header(lc)) || (i = 0) || (r = read_bits(lc)))) {
-		if (r == -EINVAL)
+		if (r == -EINVAL || r == -EDEADLK)
 			return r;
 
 		DMWARN("Read %s failed on mirror log device, %s",
@@ -416,9 +424,11 @@
 
 	i = 1;
 	if ((r = write_bits(lc)) || (i = 0) || (r = write_header(lc))) {
-		DMWARN("Write %s failed on mirror log device, %s.",
-		       i ? "bits" : "header", lc->log_dev->name);
-		lc->log_dev_failed = 1;
+		if (r != -EDEADLK) {
+			DMWARN("Write %s failed on mirror log device, %s.",
+			       i ? "bits" : "header", lc->log_dev->name);
+			lc->log_dev_failed = 1;
+		}
 	} else 
 		lc->log_dev_failed = 0;
 
@@ -469,6 +479,11 @@
 
 static int server_in_sync(struct log_c *lc, struct log_request *lr)
 {
+	if (lr->u.lr_region > lc->region_count) {
+		lr->u.lr_int_rtn = 0;
+		return -EINVAL;
+	}
+
 	if(likely(log_test_bit(lc->sync_bits, lr->u.lr_region)))
 		/* in-sync */
 		lr->u.lr_int_rtn = 1;
@@ -581,6 +596,11 @@
 
 static int server_complete_resync_work(struct log_c *lc, struct log_request *lr, int success){
 	uint32_t info;
+
+	if (lr->u.lr_region > lc->region_count) {
+		return -EINVAL;
+	}
+
 	log_clear_bit(lc, lc->recovering_bits, lr->u.lr_region);
 
 	if (success) {
@@ -678,15 +698,16 @@
 
 	/*
 	 * Check if we have access to the log.  We may not
-	 * get have loaded this device.
+	 * yet have loaded this device.
 	 */
-	if(!lc){
+	if (!lc) {
 		lr->u.lr_node_count++;
 		return 0;
 	}
 
 	if(lr->lr_type == LRT_MASTER_LEAVING){
-		lc->server_id = 0xDEAD;
+		if (lr->u.lr_starter == lc->server_id)
+			lc->server_id = 0xDEAD;
 		lr->u.lr_node_count++;
 		return 0;
 	}
@@ -696,7 +717,7 @@
 	 * We shortcut the election here and respond directly
 	 * to the inquirer
 	 */
-	if(lc->server_id == my_id){
+	if((lc->server_id == my_id) && !atomic_read(&lc->suspended)){
 		lr->u.lr_coordinator = my_id;
 		if(!(saddr->sin_addr.s_addr = nodeid_to_ipaddr(lr->u.lr_starter))){
 			return -1;
@@ -850,10 +871,12 @@
 			if(lc && (old != lc->server_id) && (my_id == lc->server_id)){
 				DMDEBUG("I'm the cluster mirror log server for %s",
 				       lc->uuid + (strlen(lc->uuid) - 8));
-				if (!atomic_read(&lc->suspended))
-					disk_resume(lc);
-				else
+				if (atomic_read(&lc->suspended)) {
 					DMDEBUG("Not reading disk log because I'm suspended.");
+					
+				} else if (disk_resume(lc) == -EDEADLK) {
+					DMDEBUG("Unable to read disk log - deadlock potential.");
+				}
 			}
 			goto reply;
 		}
@@ -944,7 +967,7 @@
 /*
 			DMWARN("Error (%d) while processing request (%s)",
 			       error,
-			       (lr.lr_type == LRT_IS_CLEAN)? "LRT_IS_C	LEAN":
+			       (lr.lr_type == LRT_IS_CLEAN)? "LRT_IS_CLEAN":
 			       (lr.lr_type == LRT_IN_SYNC)? "LRT_IN_SYNC":
 			       (lr.lr_type == LRT_MARK_REGION)? "LRT_MARK_REGION":
 			       (lr.lr_type == LRT_GET_RESYNC_WORK)? "LRT_GET_RESYNC_WORK":
@@ -972,7 +995,18 @@
 			
 		set_fs(fs);
 		if(error < 0){
-			DMWARN("unable to sendmsg to client (error = %d)", error);
+			DMWARN("unable to sendmsg to client (type = %s, error = %d)",
+			       (lr.lr_type == LRT_IS_CLEAN)? "LRT_IS_CLEAN":
+			       (lr.lr_type == LRT_IN_SYNC)? "LRT_IN_SYNC":
+			       (lr.lr_type == LRT_MARK_REGION)? "LRT_MARK_REGION":
+			       (lr.lr_type == LRT_GET_RESYNC_WORK)? "LRT_GET_RESYNC_WORK":
+			       (lr.lr_type == LRT_GET_SYNC_COUNT)? "LRT_GET_SYNC_COUNT":
+			       (lr.lr_type == LRT_CLEAR_REGION)? "LRT_CLEAR_REGION":
+			       (lr.lr_type == LRT_COMPLETE_RESYNC_WORK)? "LRT_COMPLETE_RESYNC_WORK":
+			       (lr.lr_type == LRT_MASTER_LEAVING)? "LRT_MASTER_LEAVING":
+			       (lr.lr_type == LRT_ELECTION)? "LRT_ELECTION":
+			       (lr.lr_type == LRT_SELECTION)? "LRT_SELECTION": "UNKNOWN",
+			       error);
 			return error;
 		}
 	} else if(error == -EAGAIN || error == -ETIMEDOUT){
@@ -1036,10 +1070,11 @@
 			
 			list_for_each_entry(lc, &log_list_head, log_list){
 				if(lc->server_id == my_id){
-					if (!atomic_read(&lc->suspended))
-						disk_resume(lc);
-					else
+					if (atomic_read(&lc->suspended)) {
 						DMDEBUG("Not reading disk log because I'm suspended.");
+					} else if (disk_resume(lc) == -EDEADLK) {
+						DMDEBUG("Unable to read disk log - deadlock potential.");
+					}
 				}
 			}
 			break;



^ permalink raw reply	[flat|nested] 40+ messages in thread

* [Cluster-devel] cluster/cmirror-kernel/src dm-cmirror-client.c ...
@ 2006-09-05 17:48 jbrassow
  0 siblings, 0 replies; 40+ messages in thread
From: jbrassow @ 2006-09-05 17:48 UTC (permalink / raw)
  To: cluster-devel.redhat.com

CVSROOT:	/cvs/cluster
Module name:	cluster
Branch: 	RHEL4U4
Changes by:	jbrassow at sourceware.org	2006-09-05 17:48:02

Modified files:
	cmirror-kernel/src: dm-cmirror-client.c dm-cmirror-server.c 

Log message:
	- fix the bugs I've seen so far - mostly related to the recently added
	ability to migrate the log server on suspension - which cause hangs
	during combinations of create/delete/convert operations on mirrors

Patches:
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cmirror-kernel/src/dm-cmirror-client.c.diff?cvsroot=cluster&only_with_tag=RHEL4U4&r1=1.1.2.19.2.4&r2=1.1.2.19.2.5
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cmirror-kernel/src/dm-cmirror-server.c.diff?cvsroot=cluster&only_with_tag=RHEL4U4&r1=1.1.2.9.2.7&r2=1.1.2.9.2.8

--- cluster/cmirror-kernel/src/Attic/dm-cmirror-client.c	2006/07/27 23:10:58	1.1.2.19.2.4
+++ cluster/cmirror-kernel/src/Attic/dm-cmirror-client.c	2006/09/05 17:48:02	1.1.2.19.2.5
@@ -396,8 +396,8 @@
 	set_fs(get_ds());
 
 	if(type == LRT_MASTER_LEAVING){
-		len = sock_recvmsg(lc->client_sock, &msg, sizeof(struct log_request),
-				   /* WAIT for it */0);
+		len = my_recvmsg(lc->client_sock, &msg, sizeof(struct log_request),
+				 0, 10);
 	} else {
 		len = my_recvmsg(lc->client_sock, &msg, sizeof(struct log_request),
 				 0, 5);
@@ -419,7 +419,7 @@
 		goto fail;
 	}
 
-	if(lr->u.lr_int_rtn == -ENXIO){
+	if (lr->u.lr_int_rtn == -ENXIO) {
 		lc->server_id = 0xDEAD;
 		*retry = 1;
 		goto fail;
@@ -591,7 +591,7 @@
 		       unsigned int argc, char **argv, int disk)
 {
 	int error = 0;
-	struct log_c *lc;
+	struct log_c *lc, *tmp_lc;
 	struct sockaddr_in saddr_in;
 
 	if (!disk) {
@@ -621,6 +621,15 @@
 
 	atomic_set(&lc->in_sync, -1);
 
+	list_for_each_entry(tmp_lc, &log_list_head, log_list){
+		if(!strncmp(tmp_lc->uuid, lc->uuid, MAX_NAME_LEN)){
+			DMERR("Log already exists with uuid, %s",
+			      lc->uuid + (strlen(lc->uuid) - 8));
+			error = -EINVAL;
+			goto fail;
+		}
+	}
+
 	list_add(&lc->log_list, &log_list_head);
 	INIT_LIST_HEAD(&lc->region_users);
 
@@ -730,6 +739,7 @@
 	list_del_init(&lc->log_list);
 	if ((lc->server_id == my_id) && !atomic_read(&lc->suspended))
 		consult_server(lc, 0, LRT_MASTER_LEAVING, NULL);
+
 	sock_release(lc->client_sock);
 
 	if (lc->log_dev)
@@ -748,6 +758,7 @@
 
 static int cluster_postsuspend(struct dirty_log *log)
 {
+	int r;
 	struct log_c *lc = (struct log_c *) log->context;
 
 	while (1) {
@@ -765,12 +776,16 @@
 	if(lc->server_id == my_id) {
 		while (1) {
 			consult_server(lc, 0, LRT_MASTER_LEAVING, NULL);
+
 			down(&consult_server_lock);
 			run_election(lc, 0xDEAD);
 			up(&consult_server_lock);
-			if (lc->server_id == my_id) {
+
+			if ((my_id && (lc->server_id == my_id))) {
+				atomic_set(&lc->suspended, 0);
 				set_current_state(TASK_INTERRUPTIBLE);
-				schedule_timeout(HZ/4);
+				schedule_timeout(HZ*2);
+				atomic_set(&lc->suspended, 1);
 			} else {
 				break;
 			}
@@ -1005,7 +1020,7 @@
 	if (!success) {
 		DMERR("Attempting to revert sync status of region #%llu", region);
 		set_current_state(TASK_INTERRUPTIBLE);
-		schedule_timeout(HZ/50);
+		schedule_timeout(HZ/5);
 	}
 
 	return;
--- cluster/cmirror-kernel/src/Attic/dm-cmirror-server.c	2006/07/27 23:10:58	1.1.2.9.2.7
+++ cluster/cmirror-kernel/src/Attic/dm-cmirror-server.c	2006/09/05 17:48:02	1.1.2.9.2.8
@@ -107,7 +107,9 @@
 	if (!log->log_dev)
 		return 0;
 
-	BUG_ON(atomic_read(&log->suspended));
+	if (atomic_read(&log->suspended))
+		return -EDEADLK;
+
 	r = dm_io_sync_vm(1, &log->header_location, READ,
 			  log->disk_header, &ebits);
 	if (unlikely(r))
@@ -138,7 +140,9 @@
 	if (!log->log_dev)
 		return 0;
 
-	BUG_ON(atomic_read(&log->suspended));
+	if (atomic_read(&log->suspended))
+		return -EDEADLK;
+
 	header_to_disk(&log->header, log->disk_header);
 	return dm_io_sync_vm(1, &log->header_location, WRITE,
 			     log->disk_header, &ebits);
@@ -182,7 +186,9 @@
 	if (!log->log_dev)
 		return 0;
 
-	BUG_ON(atomic_read(&log->suspended));
+	if (atomic_read(&log->suspended))
+		return -EDEADLK;
+
 	r = dm_io_sync_vm(1, &log->bits_location, READ,
 			  log->clean_bits, &ebits);
 
@@ -199,7 +205,9 @@
 	if (!log->log_dev)
 		return 0;
 
-	BUG_ON(atomic_read(&log->suspended));
+	if (atomic_read(&log->suspended))
+		return -EDEADLK;
+
 	return dm_io_sync_vm(1, &log->bits_location, WRITE,
 			     log->clean_bits, &ebits);
 }
@@ -252,9 +260,9 @@
 			continue;
 		} else if(str[i] == 0xFF){
 			if(range_count==1){
-				DMINFO("  %d", region - 1);
+				DMDEBUG("  %d", region - 1);
 			} else if(range_count){
-				DMINFO("  %d - %d", region-range_count, region-1);
+				DMDEBUG("  %d - %d", region-range_count, region-1);
 			}
 			range_count = 0;
 			region+=(bit_count < 8)? bit_count: 8;      
@@ -272,9 +280,9 @@
 				count++;
 			} else {
 				if(range_count==1){
-					DMINFO("  %d", region - 1);
+					DMDEBUG("  %d", region - 1);
 				} else if(range_count){
-					DMINFO("  %d - %d", region-range_count, region-1);
+					DMDEBUG("  %d - %d", region-range_count, region-1);
 				}
 				range_count = 0;
 				region++;
@@ -283,9 +291,9 @@
 	}
 
 	if(range_count==1){
-		DMINFO("  %d", region - 1);
+		DMDEBUG("  %d", region - 1);
 	} else if(range_count){
-		DMINFO("  %d - %d", region-range_count, region);
+		DMDEBUG("  %d - %d", region-range_count, region);
 	}
 	return count;
 }
@@ -312,7 +320,7 @@
 	i = 1;
 	if (!lc->log_dev_failed &&
 	    ((r = read_header(lc)) || (i = 0) || (r = read_bits(lc)))) {
-		if (r == -EINVAL)
+		if (r == -EINVAL || r == -EDEADLK)
 			return r;
 
 		DMWARN("Read %s failed on mirror log device, %s",
@@ -416,9 +424,11 @@
 
 	i = 1;
 	if ((r = write_bits(lc)) || (i = 0) || (r = write_header(lc))) {
-		DMWARN("Write %s failed on mirror log device, %s.",
-		       i ? "bits" : "header", lc->log_dev->name);
-		lc->log_dev_failed = 1;
+		if (r != -EDEADLK) {
+			DMWARN("Write %s failed on mirror log device, %s.",
+			       i ? "bits" : "header", lc->log_dev->name);
+			lc->log_dev_failed = 1;
+		}
 	} else 
 		lc->log_dev_failed = 0;
 
@@ -469,6 +479,11 @@
 
 static int server_in_sync(struct log_c *lc, struct log_request *lr)
 {
+	if (lr->u.lr_region > lc->region_count) {
+		lr->u.lr_int_rtn = 0;
+		return -EINVAL;
+	}
+
 	if(likely(log_test_bit(lc->sync_bits, lr->u.lr_region)))
 		/* in-sync */
 		lr->u.lr_int_rtn = 1;
@@ -581,6 +596,11 @@
 
 static int server_complete_resync_work(struct log_c *lc, struct log_request *lr, int success){
 	uint32_t info;
+
+	if (lr->u.lr_region > lc->region_count) {
+		return -EINVAL;
+	}
+
 	log_clear_bit(lc, lc->recovering_bits, lr->u.lr_region);
 
 	if (success) {
@@ -678,15 +698,16 @@
 
 	/*
 	 * Check if we have access to the log.  We may not
-	 * get have loaded this device.
+	 * yet have loaded this device.
 	 */
-	if(!lc){
+	if (!lc) {
 		lr->u.lr_node_count++;
 		return 0;
 	}
 
 	if(lr->lr_type == LRT_MASTER_LEAVING){
-		lc->server_id = 0xDEAD;
+		if (lr->u.lr_starter == lc->server_id)
+			lc->server_id = 0xDEAD;
 		lr->u.lr_node_count++;
 		return 0;
 	}
@@ -696,7 +717,7 @@
 	 * We shortcut the election here and respond directly
 	 * to the inquirer
 	 */
-	if(lc->server_id == my_id){
+	if((lc->server_id == my_id) && !atomic_read(&lc->suspended)){
 		lr->u.lr_coordinator = my_id;
 		if(!(saddr->sin_addr.s_addr = nodeid_to_ipaddr(lr->u.lr_starter))){
 			return -1;
@@ -850,10 +871,12 @@
 			if(lc && (old != lc->server_id) && (my_id == lc->server_id)){
 				DMDEBUG("I'm the cluster mirror log server for %s",
 				       lc->uuid + (strlen(lc->uuid) - 8));
-				if (!atomic_read(&lc->suspended))
-					disk_resume(lc);
-				else
+				if (atomic_read(&lc->suspended)) {
 					DMDEBUG("Not reading disk log because I'm suspended.");
+					
+				} else if (disk_resume(lc) == -EDEADLK) {
+					DMDEBUG("Unable to read disk log - deadlock potential.");
+				}
 			}
 			goto reply;
 		}
@@ -944,7 +967,7 @@
 /*
 			DMWARN("Error (%d) while processing request (%s)",
 			       error,
-			       (lr.lr_type == LRT_IS_CLEAN)? "LRT_IS_C	LEAN":
+			       (lr.lr_type == LRT_IS_CLEAN)? "LRT_IS_CLEAN":
 			       (lr.lr_type == LRT_IN_SYNC)? "LRT_IN_SYNC":
 			       (lr.lr_type == LRT_MARK_REGION)? "LRT_MARK_REGION":
 			       (lr.lr_type == LRT_GET_RESYNC_WORK)? "LRT_GET_RESYNC_WORK":
@@ -972,7 +995,18 @@
 			
 		set_fs(fs);
 		if(error < 0){
-			DMWARN("unable to sendmsg to client (error = %d)", error);
+			DMWARN("unable to sendmsg to client (type = %s, error = %d)",
+			       (lr.lr_type == LRT_IS_CLEAN)? "LRT_IS_CLEAN":
+			       (lr.lr_type == LRT_IN_SYNC)? "LRT_IN_SYNC":
+			       (lr.lr_type == LRT_MARK_REGION)? "LRT_MARK_REGION":
+			       (lr.lr_type == LRT_GET_RESYNC_WORK)? "LRT_GET_RESYNC_WORK":
+			       (lr.lr_type == LRT_GET_SYNC_COUNT)? "LRT_GET_SYNC_COUNT":
+			       (lr.lr_type == LRT_CLEAR_REGION)? "LRT_CLEAR_REGION":
+			       (lr.lr_type == LRT_COMPLETE_RESYNC_WORK)? "LRT_COMPLETE_RESYNC_WORK":
+			       (lr.lr_type == LRT_MASTER_LEAVING)? "LRT_MASTER_LEAVING":
+			       (lr.lr_type == LRT_ELECTION)? "LRT_ELECTION":
+			       (lr.lr_type == LRT_SELECTION)? "LRT_SELECTION": "UNKNOWN",
+			       error);
 			return error;
 		}
 	} else if(error == -EAGAIN || error == -ETIMEDOUT){
@@ -1036,10 +1070,11 @@
 			
 			list_for_each_entry(lc, &log_list_head, log_list){
 				if(lc->server_id == my_id){
-					if (!atomic_read(&lc->suspended))
-						disk_resume(lc);
-					else
+					if (atomic_read(&lc->suspended)) {
 						DMDEBUG("Not reading disk log because I'm suspended.");
+					} else if (disk_resume(lc) == -EDEADLK) {
+						DMDEBUG("Unable to read disk log - deadlock potential.");
+					}
 				}
 			}
 			break;



^ permalink raw reply	[flat|nested] 40+ messages in thread

* [Cluster-devel] cluster/cmirror-kernel/src dm-cmirror-client.c ...
@ 2006-07-27 23:11 jbrassow
  0 siblings, 0 replies; 40+ messages in thread
From: jbrassow @ 2006-07-27 23:11 UTC (permalink / raw)
  To: cluster-devel.redhat.com

CVSROOT:	/cvs/cluster
Module name:	cluster
Branch: 	RHEL4
Changes by:	jbrassow at sourceware.org	2006-07-27 23:11:55

Modified files:
	cmirror-kernel/src: dm-cmirror-client.c dm-cmirror-common.h 
	                    dm-cmirror-server.c 

Log message:
	- further tightening for 199826
	
	We now refuse to read/write the disk log if we are suspended.  We also
	add BUG_ON(<suspended>) to operations that do I/O to the log device.
	
	The reason for the BUG_ON() is that it is better to drop the machine
	than to have it hang the cluster while it attempts to read/write from
	a suspended device.  That being said, it should now be impossible to
	get to those functions which would perform I/O operations during
	suspension.
	
	I have still seen cases where the mirror will stall.  However, I think
	this is due to LVM (clvmd), because it happens when a mirror is created
	while the log device is suspended - which must not happen.  I've only
	seen this when doing simultaneous create/convert/remove from all nodes
	in the cluster.

Patches:
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cmirror-kernel/src/dm-cmirror-client.c.diff?cvsroot=cluster&only_with_tag=RHEL4&r1=1.1.2.22&r2=1.1.2.23
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cmirror-kernel/src/dm-cmirror-common.h.diff?cvsroot=cluster&only_with_tag=RHEL4&r1=1.1.2.9&r2=1.1.2.10
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cmirror-kernel/src/dm-cmirror-server.c.diff?cvsroot=cluster&only_with_tag=RHEL4&r1=1.1.2.15&r2=1.1.2.16

--- cluster/cmirror-kernel/src/Attic/dm-cmirror-client.c	2006/07/22 22:19:34	1.1.2.22
+++ cluster/cmirror-kernel/src/Attic/dm-cmirror-client.c	2006/07/27 23:11:55	1.1.2.23
@@ -302,7 +302,7 @@
 		lc->server_id = lr.u.lr_coordinator;
 	} else {
 		/* ATTENTION -- what do we do with this ? */
-		DMWARN("Failed to receive election results from server");
+		DMWARN("Failed to receive election results from server: %d", len);
 		error = len;
 	}
 
@@ -363,21 +363,7 @@
 
 	iov.iov_len = sizeof(struct log_request);
 	iov.iov_base = lr;
-/*
-	DMERR("To  :: 0x%x, %s", 
-	       saddr_in.sin_addr.s_addr,
-	       (lr->lr_type == LRT_IS_CLEAN)? "LRT_IS_CLEAN":
-	       (lr->lr_type == LRT_IN_SYNC)? "LRT_IN_SYNC":
-	       (lr->lr_type == LRT_MARK_REGION)? "LRT_MARK_REGION":
-	       (lr->lr_type == LRT_GET_RESYNC_WORK)? "LRT_GET_RESYNC_WORK":
-	       (lr->lr_type == LRT_GET_SYNC_COUNT)? "LRT_GET_SYNC_COUNT":
-	       (lr->lr_type == LRT_CLEAR_REGION)? "LRT_CLEAR_REGION":
-	       (lr->lr_type == LRT_COMPLETE_RESYNC_WORK)? "LRT_COMPLETE_RESYNC_WORK":
-	       (lr->lr_type == LRT_MASTER_LEAVING)? "LRT_MASTER_LEAVING":
-	       (lr->lr_type == LRT_ELECTION)? "LRT_ELECTION":
-	       (lr->lr_type == LRT_SELECTION)? "LRT_SELECTION": "UNKNOWN"
-		);
-*/
+
 	if(lr->lr_type == LRT_MARK_REGION){
 		mark_req2ser++;
 	}
@@ -453,25 +439,28 @@
 			       request_retry_count,
 			       request_count,
 			       dm_div_up(request_retry_count*100, request_count));
+			DMDEBUG("Last request:");
+			DMDEBUG(" - my_id   :: %u", my_id);
+			DMDEBUG(" - server  :: %u", lc->server_id);
+			DMDEBUG(" - log uuid:: %s (%s)",
+			       lc->uuid + (strlen(lc->uuid) - 8),
+			       atomic_read(&lc->suspended) ? "suspended" : "active");
+			DMDEBUG(" - request :: %s",
+			       (type == LRT_IS_CLEAN)? "LRT_IS_CLEAN":
+			       (type == LRT_IN_SYNC)? "LRT_IN_SYNC":
+			       (type == LRT_MARK_REGION)? "LRT_MARK_REGION":
+			       (type == LRT_GET_RESYNC_WORK)? "LRT_GET_RESYNC_WORK":
+			       (type == LRT_GET_SYNC_COUNT)? "LRT_GET_SYNC_COUNT":
+			       (type == LRT_CLEAR_REGION)? "LRT_CLEAR_REGION":
+			       (type == LRT_COMPLETE_RESYNC_WORK)? "LRT_COMPLETE_RESYNC_WORK":
+			       (type == LRT_MASTER_LEAVING)? "LRT_MASTER_LEAVING":
+			       (type == LRT_ELECTION)? "LRT_ELECTION":
+			       (type == LRT_SELECTION)? "LRT_SELECTION": "UNKNOWN");
+			DMDEBUG(" - error   :: %d", error);
 		}
 	}
 
 	if(lr) kfree(lr);
-#if 0
-	DMINFO("My (%u) request (%s) to server (%u) failed :: %d",
-	       my_id,
-	       (type == LRT_IS_CLEAN)? "LRT_IS_CLEAN":
-	       (type == LRT_IN_SYNC)? "LRT_IN_SYNC":
-	       (type == LRT_MARK_REGION)? "LRT_MARK_REGION":
-	       (type == LRT_GET_RESYNC_WORK)? "LRT_GET_RESYNC_WORK":
-	       (type == LRT_GET_SYNC_COUNT)? "LRT_GET_SYNC_COUNT":
-	       (type == LRT_CLEAR_REGION)? "LRT_CLEAR_REGION":
-	       (type == LRT_COMPLETE_RESYNC_WORK)? "LRT_COMPLETE_RESYNC_WORK":
-	       (type == LRT_MASTER_LEAVING)? "LRT_MASTER_LEAVING":
-	       (type == LRT_ELECTION)? "LRT_ELECTION":
-	       (type == LRT_SELECTION)? "LRT_SELECTION": "UNKNOWN",
-	       lc->server_id, error);
-#endif
 	return error;
 }
 
@@ -739,7 +728,7 @@
 		DMINFO("Leaving while clear region requests remain.");
 
 	list_del_init(&lc->log_list);
-	if(lc->server_id == my_id)
+	if ((lc->server_id == my_id) && !atomic_read(&lc->suspended))
 		consult_server(lc, 0, LRT_MASTER_LEAVING, NULL);
 	sock_release(lc->client_sock);
 
--- cluster/cmirror-kernel/src/Attic/dm-cmirror-common.h	2006/07/22 22:19:34	1.1.2.9
+++ cluster/cmirror-kernel/src/Attic/dm-cmirror-common.h	2006/07/27 23:11:55	1.1.2.10
@@ -22,6 +22,7 @@
 #define DMWARN(f, x...) printk(KERN_WARNING DM_NAME ": " f "\n" , ## x)
 #define DMERR(f, x...) printk(KERN_ERR DM_NAME ": " f "\n" , ## x)
 #define DMINFO(f, x...) printk(KERN_INFO DM_NAME ": " f "\n" , ## x)
+#define DMDEBUG(f, x...) printk(KERN_DEBUG DM_NAME ": " f "\n" , ## x)
 #define DMEMIT(x...) sz += ((sz >= maxlen) ? \
 	  0 : scnprintf(result + sz, maxlen - sz, x))
 
--- cluster/cmirror-kernel/src/Attic/dm-cmirror-server.c	2006/07/22 22:50:38	1.1.2.15
+++ cluster/cmirror-kernel/src/Attic/dm-cmirror-server.c	2006/07/27 23:11:55	1.1.2.16
@@ -107,6 +107,7 @@
 	if (!log->log_dev)
 		return 0;
 
+	BUG_ON(atomic_read(&log->suspended));
 	r = dm_io_sync_vm(1, &log->header_location, READ,
 			  log->disk_header, &ebits);
 	if (unlikely(r))
@@ -137,6 +138,7 @@
 	if (!log->log_dev)
 		return 0;
 
+	BUG_ON(atomic_read(&log->suspended));
 	header_to_disk(&log->header, log->disk_header);
 	return dm_io_sync_vm(1, &log->header_location, WRITE,
 			     log->disk_header, &ebits);
@@ -180,6 +182,7 @@
 	if (!log->log_dev)
 		return 0;
 
+	BUG_ON(atomic_read(&log->suspended));
 	r = dm_io_sync_vm(1, &log->bits_location, READ,
 			  log->clean_bits, &ebits);
 
@@ -196,6 +199,7 @@
 	if (!log->log_dev)
 		return 0;
 
+	BUG_ON(atomic_read(&log->suspended));
 	return dm_io_sync_vm(1, &log->bits_location, WRITE,
 			     log->clean_bits, &ebits);
 }
@@ -295,7 +299,8 @@
 	struct region_user *tmp_ru, *ru;
 	unsigned char live_nodes[16]; /* Attention -- max of 128 nodes... */
 
-	DMINFO("Disk Resume::");
+	DMDEBUG("Disk Resume::  %s (%s)", lc->uuid + (strlen(lc->uuid) - 8),
+		atomic_read(&lc->suspended) ? "suspended" : "active");
 
 	debug_disk_write = 1;
 	memset(live_nodes, 0, sizeof(live_nodes));
@@ -355,20 +360,20 @@
 		}
 	}
 
-	DMINFO("  Live nodes        :: %d", global_count);
-	DMINFO("  In-Use Regions    :: %d", good_count+bad_count);
-	DMINFO("  Good IUR's        :: %d", good_count);
-	DMINFO("  Bad IUR's         :: %d", bad_count);
+	DMDEBUG("  Live nodes        :: %d", global_count);
+	DMDEBUG("  In-Use Regions    :: %d", good_count+bad_count);
+	DMDEBUG("  Good IUR's        :: %d", good_count);
+	DMDEBUG("  Bad IUR's         :: %d", bad_count);
 
 	lc->sync_count = count_bits32(lc->sync_bits, lc->bitset_uint32_count);
 	lc->sync_search = 0;
 
-	DMINFO("  Sync count        :: %Lu", lc->sync_count);
-	DMINFO("  Disk Region count :: %Lu", lc->header.nr_regions);
-	DMINFO("  Region count      :: %Lu", lc->region_count);
+	DMDEBUG("  Sync count        :: %Lu", lc->sync_count);
+	DMDEBUG("  Disk Region count :: %Lu", lc->header.nr_regions);
+	DMDEBUG("  Region count      :: %Lu", lc->region_count);
 
 	if(lc->header.nr_regions != lc->region_count){
-		DMINFO("  NOTE:  Mapping has changed.");
+		DMDEBUG("  NOTE:  Mapping has changed.");
 	}
 /* Take this out for now.
 	if(list_empty(&lc->region_users) && (lc->sync_count != lc->header.nr_regions)){
@@ -398,13 +403,13 @@
 	}			
 
 */
-	DMINFO("Marked regions::");
+	DMDEBUG("Marked regions::");
 	i = print_zero_bits((unsigned char *)lc->clean_bits, 0, lc->region_count);
-	DMINFO("  Total = %d", i);
+	DMDEBUG("  Total = %d", i);
 
-	DMINFO("Out-of-sync regions::");
+	DMDEBUG("Out-of-sync regions::");
 	i = print_zero_bits((unsigned char *)lc->sync_bits, 0, lc->region_count);
-	DMINFO("  Total = %d", i);
+	DMDEBUG("  Total = %d", i);
 
 	/* set the correct number of regions in the header */
 	lc->header.nr_regions = lc->region_count;
@@ -529,7 +534,7 @@
 
 	ru = find_ru(lc, who, lr->u.lr_region);
 	if(!ru){
-		DMINFO("Request to remove unrecorded region user (%u/%Lu)",
+		DMDEBUG("Request to remove unrecorded region user (%u/%Lu)",
 		       who, lr->u.lr_region);
 		return -EINVAL;
 	} else {
@@ -596,7 +601,7 @@
 	   (info < 1000 && !(info%100)) ||
 	   (info < 200 && !(info%25)) ||
 	   (info < 6)){
-		DMINFO(SECTOR_FORMAT " out-of-sync regions remaining for %s.",
+		DMDEBUG(SECTOR_FORMAT " out-of-sync regions remaining for %s.",
 		       lc->region_count - lc->sync_count,
 		       lc->uuid + (strlen(lc->uuid) - 8));
 	}
@@ -843,9 +848,12 @@
 				return -1;
 			}
 			if(lc && (old != lc->server_id) && (my_id == lc->server_id)){
-				DMINFO("I'm the cluster mirror log server for %s",
+				DMDEBUG("I'm the cluster mirror log server for %s",
 				       lc->uuid + (strlen(lc->uuid) - 8));
-				disk_resume(lc);
+				if (!atomic_read(&lc->suspended))
+					disk_resume(lc);
+				else
+					DMDEBUG("Not reading disk log because I'm suspended.");
 			}
 			goto reply;
 		}
@@ -860,6 +868,30 @@
 			goto reply;
 		}
 
+		if (atomic_read(&lc->suspended)) {
+			nodeid = ipaddr_to_nodeid((struct sockaddr *)msg.msg_name);
+			/*
+			DMDEBUG("Getting request while server (%u) is suspended:", my_id);
+			DMDEBUG(" - Requester :: %u", nodeid);
+			DMDEBUG(" - log uuid  :: %s", lc->uuid + (strlen(lc->uuid) - 8));
+			DMDEBUG(" - req type  :: %s",
+				(lr.lr_type == LRT_IS_CLEAN)? "LRT_IS_C	LEAN":
+				(lr.lr_type == LRT_IN_SYNC)? "LRT_IN_SYNC":
+				(lr.lr_type == LRT_MARK_REGION)? "LRT_MARK_REGION":
+				(lr.lr_type == LRT_GET_RESYNC_WORK)? "LRT_GET_RESYNC_WORK":
+				(lr.lr_type == LRT_GET_SYNC_COUNT)? "LRT_GET_SYNC_COUNT":
+				(lr.lr_type == LRT_CLEAR_REGION)? "LRT_CLEAR_REGION":
+				(lr.lr_type == LRT_COMPLETE_RESYNC_WORK)? "LRT_COMPLETE_RESYNC_WORK":
+				(lr.lr_type == LRT_MASTER_LEAVING)? "LRT_MASTER_LEAVING":
+				(lr.lr_type == LRT_ELECTION)? "LRT_ELECTION":
+				(lr.lr_type == LRT_SELECTION)? "LRT_SELECTION": "UNKNOWN");
+			*/
+			if (my_id != nodeid) {
+				lr.u.lr_int_rtn = -ENXIO;
+				goto reply;
+			}
+		}			
+
 		switch(lr.lr_type){
 		case LRT_IS_CLEAN:
 			error = server_is_clean(lc, &lr);
@@ -1004,7 +1036,10 @@
 			
 			list_for_each_entry(lc, &log_list_head, log_list){
 				if(lc->server_id == my_id){
-					disk_resume(lc);
+					if (!atomic_read(&lc->suspended))
+						disk_resume(lc);
+					else
+						DMDEBUG("Not reading disk log because I'm suspended.");
 				}
 			}
 			break;



^ permalink raw reply	[flat|nested] 40+ messages in thread

* [Cluster-devel] cluster/cmirror-kernel/src dm-cmirror-client.c ...
@ 2006-07-27 23:11 jbrassow
  0 siblings, 0 replies; 40+ messages in thread
From: jbrassow @ 2006-07-27 23:11 UTC (permalink / raw)
  To: cluster-devel.redhat.com

CVSROOT:	/cvs/cluster
Module name:	cluster
Branch: 	RHEL4U4
Changes by:	jbrassow at sourceware.org	2006-07-27 23:10:58

Modified files:
	cmirror-kernel/src: dm-cmirror-client.c dm-cmirror-common.h 
	                    dm-cmirror-server.c 

Log message:
	- further tightening for 199826
	
	We now refuse to read/write the disk log if we are suspended.  We also
	add BUG_ON(<suspended>) to operations that do I/O to the log device.
	
	The reason for the BUG_ON() is that it is better to drop the machine
	than to have it hang the cluster while it attempts to read/write from
	a suspended device.  That being said, it should now be impossible to
	get to those functions which would perform I/O operations during
	suspension.
	
	I have still seen cases where the mirror will stall.  However, I think
	this is due to LVM (clvmd), because it happens when a mirror is created
	while the log device is suspended - which must not happen.  I've only
	seen this when doing simultaneous create/convert/remove from all nodes
	in the cluster.

Patches:
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cmirror-kernel/src/dm-cmirror-client.c.diff?cvsroot=cluster&only_with_tag=RHEL4U4&r1=1.1.2.19.2.3&r2=1.1.2.19.2.4
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cmirror-kernel/src/dm-cmirror-common.h.diff?cvsroot=cluster&only_with_tag=RHEL4U4&r1=1.1.2.7.2.2&r2=1.1.2.7.2.3
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cmirror-kernel/src/dm-cmirror-server.c.diff?cvsroot=cluster&only_with_tag=RHEL4U4&r1=1.1.2.9.2.6&r2=1.1.2.9.2.7

--- cluster/cmirror-kernel/src/Attic/dm-cmirror-client.c	2006/07/22 22:12:32	1.1.2.19.2.3
+++ cluster/cmirror-kernel/src/Attic/dm-cmirror-client.c	2006/07/27 23:10:58	1.1.2.19.2.4
@@ -302,7 +302,7 @@
 		lc->server_id = lr.u.lr_coordinator;
 	} else {
 		/* ATTENTION -- what do we do with this ? */
-		DMWARN("Failed to receive election results from server");
+		DMWARN("Failed to receive election results from server: %d", len);
 		error = len;
 	}
 
@@ -363,21 +363,7 @@
 
 	iov.iov_len = sizeof(struct log_request);
 	iov.iov_base = lr;
-/*
-	DMERR("To  :: 0x%x, %s", 
-	       saddr_in.sin_addr.s_addr,
-	       (lr->lr_type == LRT_IS_CLEAN)? "LRT_IS_CLEAN":
-	       (lr->lr_type == LRT_IN_SYNC)? "LRT_IN_SYNC":
-	       (lr->lr_type == LRT_MARK_REGION)? "LRT_MARK_REGION":
-	       (lr->lr_type == LRT_GET_RESYNC_WORK)? "LRT_GET_RESYNC_WORK":
-	       (lr->lr_type == LRT_GET_SYNC_COUNT)? "LRT_GET_SYNC_COUNT":
-	       (lr->lr_type == LRT_CLEAR_REGION)? "LRT_CLEAR_REGION":
-	       (lr->lr_type == LRT_COMPLETE_RESYNC_WORK)? "LRT_COMPLETE_RESYNC_WORK":
-	       (lr->lr_type == LRT_MASTER_LEAVING)? "LRT_MASTER_LEAVING":
-	       (lr->lr_type == LRT_ELECTION)? "LRT_ELECTION":
-	       (lr->lr_type == LRT_SELECTION)? "LRT_SELECTION": "UNKNOWN"
-		);
-*/
+
 	if(lr->lr_type == LRT_MARK_REGION){
 		mark_req2ser++;
 	}
@@ -453,25 +439,28 @@
 			       request_retry_count,
 			       request_count,
 			       dm_div_up(request_retry_count*100, request_count));
+			DMDEBUG("Last request:");
+			DMDEBUG(" - my_id   :: %u", my_id);
+			DMDEBUG(" - server  :: %u", lc->server_id);
+			DMDEBUG(" - log uuid:: %s (%s)",
+			       lc->uuid + (strlen(lc->uuid) - 8),
+			       atomic_read(&lc->suspended) ? "suspended" : "active");
+			DMDEBUG(" - request :: %s",
+			       (type == LRT_IS_CLEAN)? "LRT_IS_CLEAN":
+			       (type == LRT_IN_SYNC)? "LRT_IN_SYNC":
+			       (type == LRT_MARK_REGION)? "LRT_MARK_REGION":
+			       (type == LRT_GET_RESYNC_WORK)? "LRT_GET_RESYNC_WORK":
+			       (type == LRT_GET_SYNC_COUNT)? "LRT_GET_SYNC_COUNT":
+			       (type == LRT_CLEAR_REGION)? "LRT_CLEAR_REGION":
+			       (type == LRT_COMPLETE_RESYNC_WORK)? "LRT_COMPLETE_RESYNC_WORK":
+			       (type == LRT_MASTER_LEAVING)? "LRT_MASTER_LEAVING":
+			       (type == LRT_ELECTION)? "LRT_ELECTION":
+			       (type == LRT_SELECTION)? "LRT_SELECTION": "UNKNOWN");
+			DMDEBUG(" - error   :: %d", error);
 		}
 	}
 
 	if(lr) kfree(lr);
-#if 0
-	DMINFO("My (%u) request (%s) to server (%u) failed :: %d",
-	       my_id,
-	       (type == LRT_IS_CLEAN)? "LRT_IS_CLEAN":
-	       (type == LRT_IN_SYNC)? "LRT_IN_SYNC":
-	       (type == LRT_MARK_REGION)? "LRT_MARK_REGION":
-	       (type == LRT_GET_RESYNC_WORK)? "LRT_GET_RESYNC_WORK":
-	       (type == LRT_GET_SYNC_COUNT)? "LRT_GET_SYNC_COUNT":
-	       (type == LRT_CLEAR_REGION)? "LRT_CLEAR_REGION":
-	       (type == LRT_COMPLETE_RESYNC_WORK)? "LRT_COMPLETE_RESYNC_WORK":
-	       (type == LRT_MASTER_LEAVING)? "LRT_MASTER_LEAVING":
-	       (type == LRT_ELECTION)? "LRT_ELECTION":
-	       (type == LRT_SELECTION)? "LRT_SELECTION": "UNKNOWN",
-	       lc->server_id, error);
-#endif
 	return error;
 }
 
@@ -739,7 +728,7 @@
 		DMINFO("Leaving while clear region requests remain.");
 
 	list_del_init(&lc->log_list);
-	if(lc->server_id == my_id)
+	if ((lc->server_id == my_id) && !atomic_read(&lc->suspended))
 		consult_server(lc, 0, LRT_MASTER_LEAVING, NULL);
 	sock_release(lc->client_sock);
 
--- cluster/cmirror-kernel/src/Attic/dm-cmirror-common.h	2006/07/22 22:12:32	1.1.2.7.2.2
+++ cluster/cmirror-kernel/src/Attic/dm-cmirror-common.h	2006/07/27 23:10:58	1.1.2.7.2.3
@@ -22,6 +22,7 @@
 #define DMWARN(f, x...) printk(KERN_WARNING DM_NAME ": " f "\n" , ## x)
 #define DMERR(f, x...) printk(KERN_ERR DM_NAME ": " f "\n" , ## x)
 #define DMINFO(f, x...) printk(KERN_INFO DM_NAME ": " f "\n" , ## x)
+#define DMDEBUG(f, x...) printk(KERN_DEBUG DM_NAME ": " f "\n" , ## x)
 #define DMEMIT(x...) sz += ((sz >= maxlen) ? \
 	  0 : scnprintf(result + sz, maxlen - sz, x))
 
--- cluster/cmirror-kernel/src/Attic/dm-cmirror-server.c	2006/07/22 22:49:49	1.1.2.9.2.6
+++ cluster/cmirror-kernel/src/Attic/dm-cmirror-server.c	2006/07/27 23:10:58	1.1.2.9.2.7
@@ -107,6 +107,7 @@
 	if (!log->log_dev)
 		return 0;
 
+	BUG_ON(atomic_read(&log->suspended));
 	r = dm_io_sync_vm(1, &log->header_location, READ,
 			  log->disk_header, &ebits);
 	if (unlikely(r))
@@ -137,6 +138,7 @@
 	if (!log->log_dev)
 		return 0;
 
+	BUG_ON(atomic_read(&log->suspended));
 	header_to_disk(&log->header, log->disk_header);
 	return dm_io_sync_vm(1, &log->header_location, WRITE,
 			     log->disk_header, &ebits);
@@ -180,6 +182,7 @@
 	if (!log->log_dev)
 		return 0;
 
+	BUG_ON(atomic_read(&log->suspended));
 	r = dm_io_sync_vm(1, &log->bits_location, READ,
 			  log->clean_bits, &ebits);
 
@@ -196,6 +199,7 @@
 	if (!log->log_dev)
 		return 0;
 
+	BUG_ON(atomic_read(&log->suspended));
 	return dm_io_sync_vm(1, &log->bits_location, WRITE,
 			     log->clean_bits, &ebits);
 }
@@ -295,7 +299,8 @@
 	struct region_user *tmp_ru, *ru;
 	unsigned char live_nodes[16]; /* Attention -- max of 128 nodes... */
 
-	DMINFO("Disk Resume::");
+	DMDEBUG("Disk Resume::  %s (%s)", lc->uuid + (strlen(lc->uuid) - 8),
+		atomic_read(&lc->suspended) ? "suspended" : "active");
 
 	debug_disk_write = 1;
 	memset(live_nodes, 0, sizeof(live_nodes));
@@ -355,20 +360,20 @@
 		}
 	}
 
-	DMINFO("  Live nodes        :: %d", global_count);
-	DMINFO("  In-Use Regions    :: %d", good_count+bad_count);
-	DMINFO("  Good IUR's        :: %d", good_count);
-	DMINFO("  Bad IUR's         :: %d", bad_count);
+	DMDEBUG("  Live nodes        :: %d", global_count);
+	DMDEBUG("  In-Use Regions    :: %d", good_count+bad_count);
+	DMDEBUG("  Good IUR's        :: %d", good_count);
+	DMDEBUG("  Bad IUR's         :: %d", bad_count);
 
 	lc->sync_count = count_bits32(lc->sync_bits, lc->bitset_uint32_count);
 	lc->sync_search = 0;
 
-	DMINFO("  Sync count        :: %Lu", lc->sync_count);
-	DMINFO("  Disk Region count :: %Lu", lc->header.nr_regions);
-	DMINFO("  Region count      :: %Lu", lc->region_count);
+	DMDEBUG("  Sync count        :: %Lu", lc->sync_count);
+	DMDEBUG("  Disk Region count :: %Lu", lc->header.nr_regions);
+	DMDEBUG("  Region count      :: %Lu", lc->region_count);
 
 	if(lc->header.nr_regions != lc->region_count){
-		DMINFO("  NOTE:  Mapping has changed.");
+		DMDEBUG("  NOTE:  Mapping has changed.");
 	}
 /* Take this out for now.
 	if(list_empty(&lc->region_users) && (lc->sync_count != lc->header.nr_regions)){
@@ -398,13 +403,13 @@
 	}			
 
 */
-	DMINFO("Marked regions::");
+	DMDEBUG("Marked regions::");
 	i = print_zero_bits((unsigned char *)lc->clean_bits, 0, lc->region_count);
-	DMINFO("  Total = %d", i);
+	DMDEBUG("  Total = %d", i);
 
-	DMINFO("Out-of-sync regions::");
+	DMDEBUG("Out-of-sync regions::");
 	i = print_zero_bits((unsigned char *)lc->sync_bits, 0, lc->region_count);
-	DMINFO("  Total = %d", i);
+	DMDEBUG("  Total = %d", i);
 
 	/* set the correct number of regions in the header */
 	lc->header.nr_regions = lc->region_count;
@@ -529,7 +534,7 @@
 
 	ru = find_ru(lc, who, lr->u.lr_region);
 	if(!ru){
-		DMINFO("Request to remove unrecorded region user (%u/%Lu)",
+		DMDEBUG("Request to remove unrecorded region user (%u/%Lu)",
 		       who, lr->u.lr_region);
 		return -EINVAL;
 	} else {
@@ -596,7 +601,7 @@
 	   (info < 1000 && !(info%100)) ||
 	   (info < 200 && !(info%25)) ||
 	   (info < 6)){
-		DMINFO(SECTOR_FORMAT " out-of-sync regions remaining for %s.",
+		DMDEBUG(SECTOR_FORMAT " out-of-sync regions remaining for %s.",
 		       lc->region_count - lc->sync_count,
 		       lc->uuid + (strlen(lc->uuid) - 8));
 	}
@@ -843,9 +848,12 @@
 				return -1;
 			}
 			if(lc && (old != lc->server_id) && (my_id == lc->server_id)){
-				DMINFO("I'm the cluster mirror log server for %s",
+				DMDEBUG("I'm the cluster mirror log server for %s",
 				       lc->uuid + (strlen(lc->uuid) - 8));
-				disk_resume(lc);
+				if (!atomic_read(&lc->suspended))
+					disk_resume(lc);
+				else
+					DMDEBUG("Not reading disk log because I'm suspended.");
 			}
 			goto reply;
 		}
@@ -860,6 +868,30 @@
 			goto reply;
 		}
 
+		if (atomic_read(&lc->suspended)) {
+			nodeid = ipaddr_to_nodeid((struct sockaddr *)msg.msg_name);
+			/*
+			DMDEBUG("Getting request while server (%u) is suspended:", my_id);
+			DMDEBUG(" - Requester :: %u", nodeid);
+			DMDEBUG(" - log uuid  :: %s", lc->uuid + (strlen(lc->uuid) - 8));
+			DMDEBUG(" - req type  :: %s",
+				(lr.lr_type == LRT_IS_CLEAN)? "LRT_IS_C	LEAN":
+				(lr.lr_type == LRT_IN_SYNC)? "LRT_IN_SYNC":
+				(lr.lr_type == LRT_MARK_REGION)? "LRT_MARK_REGION":
+				(lr.lr_type == LRT_GET_RESYNC_WORK)? "LRT_GET_RESYNC_WORK":
+				(lr.lr_type == LRT_GET_SYNC_COUNT)? "LRT_GET_SYNC_COUNT":
+				(lr.lr_type == LRT_CLEAR_REGION)? "LRT_CLEAR_REGION":
+				(lr.lr_type == LRT_COMPLETE_RESYNC_WORK)? "LRT_COMPLETE_RESYNC_WORK":
+				(lr.lr_type == LRT_MASTER_LEAVING)? "LRT_MASTER_LEAVING":
+				(lr.lr_type == LRT_ELECTION)? "LRT_ELECTION":
+				(lr.lr_type == LRT_SELECTION)? "LRT_SELECTION": "UNKNOWN");
+			*/
+			if (my_id != nodeid) {
+				lr.u.lr_int_rtn = -ENXIO;
+				goto reply;
+			}
+		}			
+
 		switch(lr.lr_type){
 		case LRT_IS_CLEAN:
 			error = server_is_clean(lc, &lr);
@@ -1004,7 +1036,10 @@
 			
 			list_for_each_entry(lc, &log_list_head, log_list){
 				if(lc->server_id == my_id){
-					disk_resume(lc);
+					if (!atomic_read(&lc->suspended))
+						disk_resume(lc);
+					else
+						DMDEBUG("Not reading disk log because I'm suspended.");
 				}
 			}
 			break;



^ permalink raw reply	[flat|nested] 40+ messages in thread

* [Cluster-devel] cluster/cmirror-kernel/src dm-cmirror-client.c ...
@ 2006-07-22 22:19 jbrassow
  0 siblings, 0 replies; 40+ messages in thread
From: jbrassow @ 2006-07-22 22:19 UTC (permalink / raw)
  To: cluster-devel.redhat.com

CVSROOT:	/cvs/cluster
Module name:	cluster
Branch: 	RHEL4
Changes by:	jbrassow at sourceware.org	2006-07-22 22:19:34

Modified files:
	cmirror-kernel/src: dm-cmirror-client.c dm-cmirror-common.h 
	                    dm-cmirror-server.c 

Log message:
	Fix for bug:
	199862 - Suspending cluster mirrors can cause indefinite hang
	And is part of a fix for:
	199185 - 'lvconvert' fails to remove device-mapper devices ...
	198555 - mirror log not getting cleared causes new mirror ...
	And is likely to fix:
	199334 - cmirror removal attempt hangs and caused locking ...
	And will certainly help for:
	199498
	198821
	194137
	194125
	199635
	
	All of the above bugs will need to be reexamined when the packages
	are rebuilt.
	
	This fix allows the log server to migrate to other nodes during
	suspension.  This prevents the situation where the log server may
	has its devices suspended when it receives a request.  Trying to
	fulfill a log request while devices are suspended will lead to an
	indefinite hang, because I/O will not complete until the devices
	are unsuspended.

Patches:
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cmirror-kernel/src/dm-cmirror-client.c.diff?cvsroot=cluster&only_with_tag=RHEL4&r1=1.1.2.21&r2=1.1.2.22
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cmirror-kernel/src/dm-cmirror-common.h.diff?cvsroot=cluster&only_with_tag=RHEL4&r1=1.1.2.8&r2=1.1.2.9
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cmirror-kernel/src/dm-cmirror-server.c.diff?cvsroot=cluster&only_with_tag=RHEL4&r1=1.1.2.13&r2=1.1.2.14

--- cluster/cmirror-kernel/src/Attic/dm-cmirror-client.c	2006/07/07 17:08:56	1.1.2.21
+++ cluster/cmirror-kernel/src/Attic/dm-cmirror-client.c	2006/07/22 22:19:34	1.1.2.22
@@ -243,7 +243,7 @@
 
 
 
-static int run_election(struct log_c *lc){
+static int run_election(struct log_c *lc, uint32_t initial_server){
 	int error=0, len;
 	struct sockaddr_in saddr_in;
 	struct msghdr msg;
@@ -255,7 +255,7 @@
 
 	lr.lr_type = LRT_ELECTION;
 	lr.u.lr_starter = my_id;
-	lr.u.lr_coordinator = my_id;
+	lr.u.lr_coordinator = initial_server;
 	memcpy(lr.lr_uuid, lc->uuid, MAX_NAME_LEN);
 
 	memset(&saddr_in, 0, sizeof(struct sockaddr_cl));
@@ -420,7 +420,6 @@
 
 	if(len <= 0){
 		/* ATTENTION -- what do we do with this ? */
-//		DMWARN("Failed to recvmsg from clustered log server");
 		error = len;
 		*retry = 1;
 		goto fail;
@@ -435,16 +434,11 @@
 	}
 
 	if(lr->u.lr_int_rtn == -ENXIO){
-		DMWARN("server tells us it no longer controls the log");
 		lc->server_id = 0xDEAD;
 		*retry = 1;
 		goto fail;
 	}
 
-	if(lr->u.lr_int_rtn < 0){
-		DMWARN("an error occured on the server while processing our request");
-	}
-
 	if(result)
 		*result = lr->u.lr_region_rtn;
 
@@ -463,8 +457,9 @@
 	}
 
 	if(lr) kfree(lr);
-#ifdef DEBUG
-	DMWARN("Request (%s) to server failed :: %d",
+#if 0
+	DMINFO("My (%u) request (%s) to server (%u) failed :: %d",
+	       my_id,
 	       (type == LRT_IS_CLEAN)? "LRT_IS_CLEAN":
 	       (type == LRT_IN_SYNC)? "LRT_IN_SYNC":
 	       (type == LRT_MARK_REGION)? "LRT_MARK_REGION":
@@ -475,7 +470,7 @@
 	       (type == LRT_MASTER_LEAVING)? "LRT_MASTER_LEAVING":
 	       (type == LRT_ELECTION)? "LRT_ELECTION":
 	       (type == LRT_SELECTION)? "LRT_SELECTION": "UNKNOWN",
-	       error);
+	       lc->server_id, error);
 #endif
 	return error;
 }
@@ -495,9 +490,13 @@
 	do{
 		retry = 0;
 		suspend_on(&suspend_client_queue, atomic_read(&suspend_client));
+		if ((type == LRT_MASTER_LEAVING) && (lc->server_id == 0xDEAD)) {
+			/* Nothing to do */
+			goto out;
+		}
 	election:
 		while(lc->server_id == 0xDEAD){
-			run_election(lc);
+			run_election(lc, my_id);
 			new_server = 1;
 		}
 
@@ -539,7 +538,7 @@
 				spin_unlock(&region_state_lock);
 				goto out;
 			} else {
-				DMWARN("Continuing request:: %s", 
+				DMINFO("Continuing request:: %s",
 				      (type == LRT_IS_CLEAN)? "LRT_IS_C	LEAN":
 				      (type == LRT_IN_SYNC)? "LRT_IN_SYNC":
 				      (type == LRT_MARK_REGION)? "LRT_MARK_REGION":
@@ -737,14 +736,14 @@
 	struct log_c *lc = (struct log_c *) log->context;
 
 	if (!list_empty(&clear_region_list))
-		DMERR("LEAVING WHILE REGION REQUESTS REMAIN.");
+		DMINFO("Leaving while clear region requests remain.");
 
 	list_del_init(&lc->log_list);
 	if(lc->server_id == my_id)
 		consult_server(lc, 0, LRT_MASTER_LEAVING, NULL);
 	sock_release(lc->client_sock);
 
-	if (lc->log_dev) 
+	if (lc->log_dev)
 		disk_dtr(log);
 	else
 		core_dtr(log);
@@ -755,15 +754,13 @@
 
 static int cluster_presuspend(struct dirty_log *log)
 {
-	struct log_c *lc = (struct log_c *) log->context;
+	return 0;
+}
 
-	/*
-	atomic_set(&lc->suspended, 1);
+static int cluster_postsuspend(struct dirty_log *log)
+{
+	struct log_c *lc = (struct log_c *) log->context;
 
-	if (lc->log_dev && lc->log_dev_failed)
-		complete(&lc->failure_completion);
-	else
-	*/
 	while (1) {
 		spin_lock(&region_state_lock);
 		if (list_empty(&clear_region_list)) {
@@ -775,20 +772,31 @@
 		/* Just an unnessesary call to clear out regions */
 		consult_server(lc, 0, LRT_IN_SYNC, NULL);
 	}
+	atomic_set(&lc->suspended, 1);
+	if(lc->server_id == my_id) {
+		while (1) {
+			consult_server(lc, 0, LRT_MASTER_LEAVING, NULL);
+			down(&consult_server_lock);
+			run_election(lc, 0xDEAD);
+			up(&consult_server_lock);
+			if (lc->server_id == my_id) {
+				set_current_state(TASK_INTERRUPTIBLE);
+				schedule_timeout(HZ/4);
+			} else {
+				break;
+			}
+		}
+	}
 
 	return 0;
 }
 
-static int cluster_postsuspend(struct dirty_log *log){
-	return 0;
-}
-
 static int cluster_resume(struct dirty_log *log){
 	struct log_c *lc = (struct log_c *) log->context;
 
 	lc->sync_search = 0;
 	resume_server_requests();
-	/* atomic_set(&lc->suspended, 0); */
+	atomic_set(&lc->suspended, 0);
 
 	return 0;
 }
@@ -1310,7 +1318,7 @@
 	.get_failure_response = cluster_get_failure_response,
 };
 
-#define CMIRROR_RELEASE_NAME "0.1.0"
+#define CMIRROR_RELEASE_NAME "0.2.0"
 static int __init cluster_dirty_log_init(void)
 {
 	int r = 0;
--- cluster/cmirror-kernel/src/Attic/dm-cmirror-common.h	2006/06/29 19:48:01	1.1.2.8
+++ cluster/cmirror-kernel/src/Attic/dm-cmirror-common.h	2006/07/22 22:19:34	1.1.2.9
@@ -13,10 +13,12 @@
 	sector_t sector;
 	sector_t count;
 };
+int dm_io_get(unsigned int num_pages);
+void dm_io_put(unsigned int num_pages);
 int dm_io_sync_vm(unsigned int num_regions, struct io_region *where, int rw,
                   void *data, unsigned long *error_bits);
 /* from dm.h */
-#define DM_NAME "device-mapper"
+#define DM_NAME "dm-cmirror"
 #define DMWARN(f, x...) printk(KERN_WARNING DM_NAME ": " f "\n" , ## x)
 #define DMERR(f, x...) printk(KERN_ERR DM_NAME ": " f "\n" , ## x)
 #define DMINFO(f, x...) printk(KERN_INFO DM_NAME ": " f "\n" , ## x)
@@ -110,8 +112,8 @@
 	 * Disk log fields
 	 */
 	int log_dev_failed;
-	/*
 	atomic_t suspended;
+	/*
 	struct completion failure_completion;
 	*/
 	struct dm_dev *log_dev;
--- cluster/cmirror-kernel/src/Attic/dm-cmirror-server.c	2006/07/19 14:39:12	1.1.2.13
+++ cluster/cmirror-kernel/src/Attic/dm-cmirror-server.c	2006/07/22 22:19:34	1.1.2.14
@@ -197,7 +197,7 @@
 		return 0;
 
 	return dm_io_sync_vm(1, &log->bits_location, WRITE,
-			      log->clean_bits, &ebits);
+			     log->clean_bits, &ebits);
 }
 
 static int count_bits32(uint32_t *addr, unsigned size)
@@ -656,23 +656,41 @@
 		return -1;
 	}
 
-	
-	if((lr->lr_type == LRT_MASTER_LEAVING) && 
-	   (lr->u.lr_starter == my_id) &&
-	   lr->u.lr_node_count){
-		lr->u.lr_coordinator = 0xDEAD;
-		if(!(saddr->sin_addr.s_addr = nodeid_to_ipaddr(lr->u.lr_starter))){
-			return -1;
+	if(lr->lr_type == LRT_MASTER_LEAVING) {
+		/*
+		 * if we started this and (lr->u.lr_node_count != 0),
+		 * then we have told everyone that we are leaving
+		 */
+		if ((lr->u.lr_starter == my_id) && lr->u.lr_node_count){
+			lr->u.lr_coordinator = 0xDEAD;
+			if(!(saddr->sin_addr.s_addr = nodeid_to_ipaddr(lr->u.lr_starter))){
+				return -1;
+			}
+			saddr->sin_port = lr->u.lr_starter_port;
+			return 0;
 		}
-		saddr->sin_port = lr->u.lr_starter_port;
-		return 0;
 	}
-	
+
+	/*
+	 * Check if we have access to the log.  We may not
+	 * get have loaded this device.
+	 */
 	if(!lc){
 		lr->u.lr_node_count++;
 		return 0;
 	}
-	
+
+	if(lr->lr_type == LRT_MASTER_LEAVING){
+		lc->server_id = 0xDEAD;
+		lr->u.lr_node_count++;
+		return 0;
+	}
+
+	/*
+	 * New node joins and needs to know I am the server
+	 * We shortcut the election here and respond directly
+	 * to the inquirer
+	 */
 	if(lc->server_id == my_id){
 		lr->u.lr_coordinator = my_id;
 		if(!(saddr->sin_addr.s_addr = nodeid_to_ipaddr(lr->u.lr_starter))){
@@ -681,16 +699,16 @@
 		saddr->sin_port = lr->u.lr_starter_port;
 		return 0;
 	}
-	
-	
-	if(lr->lr_type == LRT_MASTER_LEAVING){
-		lc->server_id = 0xDEAD;
-		lr->u.lr_node_count++;
-		return 0;
-	}
-	
+
 	if(lr->lr_type == LRT_ELECTION){
 		if((lr->u.lr_starter == my_id) && (lr->u.lr_node_count)){
+			/*
+			 * We started this election, and we've been
+			 * around the loop.  If the node count hasn't
+			 * changed since we started, we can proceed to
+			 * selection.  Otherwise, go again setting
+			 * ourself as the leader to start.
+			 */
 			if(node_count == lr->u.lr_node_count){
 				lr->lr_type = LRT_SELECTION;
 			} else {
@@ -700,9 +718,23 @@
 			return 0;
 		}
 
+		/*
+		 * We are in the election phase, so
+		 * if we have the lowest ID so far,
+		 * we elect ourselves for server.
+		 *
+		 * However, if the mirror is being suspended
+		 * (lc->suspended), then we leave the current
+		 * coordinator in place.
+		 *
+		 * The client must not set lc->suspended until
+		 * it has completed sending all requests.  That
+		 * way, everyone is done sending requests when
+		 * the last server is stuck holding the ball.
+		 */
 		lr->u.lr_node_count++;
 		
-		if(my_id < lr->u.lr_coordinator){
+		if((my_id < lr->u.lr_coordinator) && !atomic_read(&lc->suspended)){
 			lr->u.lr_coordinator = my_id;
 		}
 		return 0;
@@ -712,6 +744,13 @@
 			return 0;
 		}
 		
+		/*
+		 * Need to restart election if someone
+		 * has joined since we started.
+		 *
+		 * Here, we are the started, so set
+		 * node_count = 1
+		 */
 		if(lr->u.lr_node_count == node_count){
 			lr->lr_type = LRT_MASTER_ASSIGN;
 		} else {
@@ -721,12 +760,24 @@
 		}
 		lr->u.lr_node_count = 1;
 	} else if(lr->lr_type == LRT_MASTER_ASSIGN){
+		/*
+		 * If we are the server, assign it
+		 */
 		if(lr->u.lr_coordinator == my_id){
 			lc->server_id = my_id;
 		}
+
+		/*
+		 * Continue around the loop
+		 */
 		if(lr->u.lr_starter != my_id){
 			return 0;
 		}
+
+		/*
+		 * If I was the one who asked for the election,
+		 * the send the results back to the client
+		 */
 		if(!(saddr->sin_addr.s_addr = nodeid_to_ipaddr(lr->u.lr_starter))){
 			return -1;
 		}
@@ -800,18 +851,15 @@
 		}
 
 		if(!lc){
-			DMWARN("Log context can not be found for request");
 			lr.u.lr_int_rtn = -ENXIO;
 			goto reply;
 		}
 
-/*
-  if(lc->server_id != my_id){
-  DMWARN("I am not the server for this request");
-  lr.u.lr_int_rtn = -ENXIO;
-  goto reply;
-  }
-*/
+		if (lc->server_id != my_id) {
+			lr.u.lr_int_rtn = -ENXIO;
+			goto reply;
+		}
+
 		switch(lr.lr_type){
 		case LRT_IS_CLEAN:
 			error = server_is_clean(lc, &lr);
@@ -860,12 +908,23 @@
 		}
 
 		/* ATTENTION -- if error? */
+/*
 		if(error){
-			DMWARN("Error (%d) while processing request (type = %d)",
-			       error, lr.lr_type);
+			DMWARN("Error (%d) while processing request (%s)",
+			       error,
+			       (lr.lr_type == LRT_IS_CLEAN)? "LRT_IS_C	LEAN":
+			       (lr.lr_type == LRT_IN_SYNC)? "LRT_IN_SYNC":
+			       (lr.lr_type == LRT_MARK_REGION)? "LRT_MARK_REGION":
+			       (lr.lr_type == LRT_GET_RESYNC_WORK)? "LRT_GET_RESYNC_WORK":
+			       (lr.lr_type == LRT_GET_SYNC_COUNT)? "LRT_GET_SYNC_COUNT":
+			       (lr.lr_type == LRT_CLEAR_REGION)? "LRT_CLEAR_REGION":
+			       (lr.lr_type == LRT_COMPLETE_RESYNC_WORK)? "LRT_COMPLETE_RESYNC_WORK":
+			       (lr.lr_type == LRT_MASTER_LEAVING)? "LRT_MASTER_LEAVING":
+			       (lr.lr_type == LRT_ELECTION)? "LRT_ELECTION":
+			       (lr.lr_type == LRT_SELECTION)? "LRT_SELECTION": "UNKNOWN");
 			lr.u.lr_int_rtn = error;
 		}
-
+*/
 	reply:
     
 		/* Why do we need to reset this? */
@@ -940,9 +999,8 @@
 			** leaving node, it won't hurt anything - and**
 			** if there is, they will be recovered.      */
 		case SERVICE_NODE_FAILED:
-			DMINFO("A cluster mirror log member has %s",
-			       (restart_event_type == SERVICE_NODE_FAILED) ?
-			       "failed." : "left.");
+			if (restart_event_type == SERVICE_NODE_FAILED)
+				DMINFO("A cluster mirror log member has failed.");
 			
 			list_for_each_entry(lc, &log_list_head, log_list){
 				if(lc->server_id == my_id){
@@ -968,8 +1026,6 @@
 		schedule();
 	}
 
-	DMINFO("Cluster mirror log server is shutting down.");
-
 	sock_release(sock);
 	complete(&server_completion);
 	return 0;



^ permalink raw reply	[flat|nested] 40+ messages in thread

* [Cluster-devel] cluster/cmirror-kernel/src dm-cmirror-client.c ...
@ 2006-07-22 22:19 jbrassow
  0 siblings, 0 replies; 40+ messages in thread
From: jbrassow @ 2006-07-22 22:19 UTC (permalink / raw)
  To: cluster-devel.redhat.com

CVSROOT:	/cvs/cluster
Module name:	cluster
Branch: 	STABLE
Changes by:	jbrassow at sourceware.org	2006-07-22 22:19:04

Modified files:
	cmirror-kernel/src: dm-cmirror-client.c dm-cmirror-common.h 
	                    dm-cmirror-server.c 

Log message:
	Fix for bug:
	199862 - Suspending cluster mirrors can cause indefinite hang
	And is part of a fix for:
	199185 - 'lvconvert' fails to remove device-mapper devices ...
	198555 - mirror log not getting cleared causes new mirror ...
	And is likely to fix:
	199334 - cmirror removal attempt hangs and caused locking ...
	And will certainly help for:
	199498
	198821
	194137
	194125
	199635
	
	All of the above bugs will need to be reexamined when the packages
	are rebuilt.
	
	This fix allows the log server to migrate to other nodes during
	suspension.  This prevents the situation where the log server may
	has its devices suspended when it receives a request.  Trying to
	fulfill a log request while devices are suspended will lead to an
	indefinite hang, because I/O will not complete until the devices
	are unsuspended.

Patches:
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cmirror-kernel/src/dm-cmirror-client.c.diff?cvsroot=cluster&only_with_tag=STABLE&r1=1.1.4.4&r2=1.1.4.5
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cmirror-kernel/src/dm-cmirror-common.h.diff?cvsroot=cluster&only_with_tag=STABLE&r1=1.1.4.3&r2=1.1.4.4
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cmirror-kernel/src/dm-cmirror-server.c.diff?cvsroot=cluster&only_with_tag=STABLE&r1=1.1.4.4&r2=1.1.4.5

--- cluster/cmirror-kernel/src/Attic/dm-cmirror-client.c	2006/07/07 17:09:54	1.1.4.4
+++ cluster/cmirror-kernel/src/Attic/dm-cmirror-client.c	2006/07/22 22:19:04	1.1.4.5
@@ -243,7 +243,7 @@
 
 
 
-static int run_election(struct log_c *lc){
+static int run_election(struct log_c *lc, uint32_t initial_server){
 	int error=0, len;
 	struct sockaddr_in saddr_in;
 	struct msghdr msg;
@@ -255,7 +255,7 @@
 
 	lr.lr_type = LRT_ELECTION;
 	lr.u.lr_starter = my_id;
-	lr.u.lr_coordinator = my_id;
+	lr.u.lr_coordinator = initial_server;
 	memcpy(lr.lr_uuid, lc->uuid, MAX_NAME_LEN);
 
 	memset(&saddr_in, 0, sizeof(struct sockaddr_cl));
@@ -420,7 +420,6 @@
 
 	if(len <= 0){
 		/* ATTENTION -- what do we do with this ? */
-//		DMWARN("Failed to recvmsg from clustered log server");
 		error = len;
 		*retry = 1;
 		goto fail;
@@ -435,16 +434,11 @@
 	}
 
 	if(lr->u.lr_int_rtn == -ENXIO){
-		DMWARN("server tells us it no longer controls the log");
 		lc->server_id = 0xDEAD;
 		*retry = 1;
 		goto fail;
 	}
 
-	if(lr->u.lr_int_rtn < 0){
-		DMWARN("an error occured on the server while processing our request");
-	}
-
 	if(result)
 		*result = lr->u.lr_region_rtn;
 
@@ -463,8 +457,9 @@
 	}
 
 	if(lr) kfree(lr);
-#ifdef DEBUG
-	DMWARN("Request (%s) to server failed :: %d",
+#if 0
+	DMINFO("My (%u) request (%s) to server (%u) failed :: %d",
+	       my_id,
 	       (type == LRT_IS_CLEAN)? "LRT_IS_CLEAN":
 	       (type == LRT_IN_SYNC)? "LRT_IN_SYNC":
 	       (type == LRT_MARK_REGION)? "LRT_MARK_REGION":
@@ -475,7 +470,7 @@
 	       (type == LRT_MASTER_LEAVING)? "LRT_MASTER_LEAVING":
 	       (type == LRT_ELECTION)? "LRT_ELECTION":
 	       (type == LRT_SELECTION)? "LRT_SELECTION": "UNKNOWN",
-	       error);
+	       lc->server_id, error);
 #endif
 	return error;
 }
@@ -495,9 +490,13 @@
 	do{
 		retry = 0;
 		suspend_on(&suspend_client_queue, atomic_read(&suspend_client));
+		if ((type == LRT_MASTER_LEAVING) && (lc->server_id == 0xDEAD)) {
+			/* Nothing to do */
+			goto out;
+		}
 	election:
 		while(lc->server_id == 0xDEAD){
-			run_election(lc);
+			run_election(lc, my_id);
 			new_server = 1;
 		}
 
@@ -539,7 +538,7 @@
 				spin_unlock(&region_state_lock);
 				goto out;
 			} else {
-				DMWARN("Continuing request:: %s", 
+				DMINFO("Continuing request:: %s",
 				      (type == LRT_IS_CLEAN)? "LRT_IS_C	LEAN":
 				      (type == LRT_IN_SYNC)? "LRT_IN_SYNC":
 				      (type == LRT_MARK_REGION)? "LRT_MARK_REGION":
@@ -737,14 +736,14 @@
 	struct log_c *lc = (struct log_c *) log->context;
 
 	if (!list_empty(&clear_region_list))
-		DMERR("LEAVING WHILE REGION REQUESTS REMAIN.");
+		DMINFO("Leaving while clear region requests remain.");
 
 	list_del_init(&lc->log_list);
 	if(lc->server_id == my_id)
 		consult_server(lc, 0, LRT_MASTER_LEAVING, NULL);
 	sock_release(lc->client_sock);
 
-	if (lc->log_dev) 
+	if (lc->log_dev)
 		disk_dtr(log);
 	else
 		core_dtr(log);
@@ -755,15 +754,13 @@
 
 static int cluster_presuspend(struct dirty_log *log)
 {
-	struct log_c *lc = (struct log_c *) log->context;
+	return 0;
+}
 
-	/*
-	atomic_set(&lc->suspended, 1);
+static int cluster_postsuspend(struct dirty_log *log)
+{
+	struct log_c *lc = (struct log_c *) log->context;
 
-	if (lc->log_dev && lc->log_dev_failed)
-		complete(&lc->failure_completion);
-	else
-	*/
 	while (1) {
 		spin_lock(&region_state_lock);
 		if (list_empty(&clear_region_list)) {
@@ -775,20 +772,31 @@
 		/* Just an unnessesary call to clear out regions */
 		consult_server(lc, 0, LRT_IN_SYNC, NULL);
 	}
+	atomic_set(&lc->suspended, 1);
+	if(lc->server_id == my_id) {
+		while (1) {
+			consult_server(lc, 0, LRT_MASTER_LEAVING, NULL);
+			down(&consult_server_lock);
+			run_election(lc, 0xDEAD);
+			up(&consult_server_lock);
+			if (lc->server_id == my_id) {
+				set_current_state(TASK_INTERRUPTIBLE);
+				schedule_timeout(HZ/4);
+			} else {
+				break;
+			}
+		}
+	}
 
 	return 0;
 }
 
-static int cluster_postsuspend(struct dirty_log *log){
-	return 0;
-}
-
 static int cluster_resume(struct dirty_log *log){
 	struct log_c *lc = (struct log_c *) log->context;
 
 	lc->sync_search = 0;
 	resume_server_requests();
-	/* atomic_set(&lc->suspended, 0); */
+	atomic_set(&lc->suspended, 0);
 
 	return 0;
 }
@@ -1330,7 +1338,7 @@
 	.get_default_mirror = cluster_get_default_mirror,
 };
 
-#define CMIRROR_RELEASE_NAME "0.1.0"
+#define CMIRROR_RELEASE_NAME "0.2.0"
 static int __init cluster_dirty_log_init(void)
 {
 	int r = 0;
--- cluster/cmirror-kernel/src/Attic/dm-cmirror-common.h	2006/06/29 19:49:32	1.1.4.3
+++ cluster/cmirror-kernel/src/Attic/dm-cmirror-common.h	2006/07/22 22:19:04	1.1.4.4
@@ -13,10 +13,12 @@
 	sector_t sector;
 	sector_t count;
 };
+int dm_io_get(unsigned int num_pages);
+void dm_io_put(unsigned int num_pages);
 int dm_io_sync_vm(unsigned int num_regions, struct io_region *where, int rw,
                   void *data, unsigned long *error_bits);
 /* from dm.h */
-#define DM_NAME "device-mapper"
+#define DM_NAME "dm-cmirror"
 #define DMWARN(f, x...) printk(KERN_WARNING DM_NAME ": " f "\n" , ## x)
 #define DMERR(f, x...) printk(KERN_ERR DM_NAME ": " f "\n" , ## x)
 #define DMINFO(f, x...) printk(KERN_INFO DM_NAME ": " f "\n" , ## x)
@@ -111,8 +113,8 @@
 	 * Disk log fields
 	 */
 	int log_dev_failed;
-	/*
 	atomic_t suspended;
+	/*
 	struct completion failure_completion;
 	*/
 	struct dm_dev *log_dev;
--- cluster/cmirror-kernel/src/Attic/dm-cmirror-server.c	2006/07/19 14:40:15	1.1.4.4
+++ cluster/cmirror-kernel/src/Attic/dm-cmirror-server.c	2006/07/22 22:19:04	1.1.4.5
@@ -199,7 +199,7 @@
 		return 0;
 
 	return dm_io_sync_vm(1, &log->bits_location, WRITE,
-			      log->clean_bits, &ebits);
+			     log->clean_bits, &ebits);
 }
 
 static int count_bits32(uint32_t *addr, unsigned size)
@@ -676,23 +676,41 @@
 		return -1;
 	}
 
-	
-	if((lr->lr_type == LRT_MASTER_LEAVING) && 
-	   (lr->u.lr_starter == my_id) &&
-	   lr->u.lr_node_count){
-		lr->u.lr_coordinator = 0xDEAD;
-		if(!(saddr->sin_addr.s_addr = nodeid_to_ipaddr(lr->u.lr_starter))){
-			return -1;
+	if(lr->lr_type == LRT_MASTER_LEAVING) {
+		/*
+		 * if we started this and (lr->u.lr_node_count != 0),
+		 * then we have told everyone that we are leaving
+		 */
+		if ((lr->u.lr_starter == my_id) && lr->u.lr_node_count){
+			lr->u.lr_coordinator = 0xDEAD;
+			if(!(saddr->sin_addr.s_addr = nodeid_to_ipaddr(lr->u.lr_starter))){
+				return -1;
+			}
+			saddr->sin_port = lr->u.lr_starter_port;
+			return 0;
 		}
-		saddr->sin_port = lr->u.lr_starter_port;
-		return 0;
 	}
-	
+
+	/*
+	 * Check if we have access to the log.  We may not
+	 * get have loaded this device.
+	 */
 	if(!lc){
 		lr->u.lr_node_count++;
 		return 0;
 	}
-	
+
+	if(lr->lr_type == LRT_MASTER_LEAVING){
+		lc->server_id = 0xDEAD;
+		lr->u.lr_node_count++;
+		return 0;
+	}
+
+	/*
+	 * New node joins and needs to know I am the server
+	 * We shortcut the election here and respond directly
+	 * to the inquirer
+	 */
 	if(lc->server_id == my_id){
 		lr->u.lr_coordinator = my_id;
 		if(!(saddr->sin_addr.s_addr = nodeid_to_ipaddr(lr->u.lr_starter))){
@@ -701,16 +719,16 @@
 		saddr->sin_port = lr->u.lr_starter_port;
 		return 0;
 	}
-	
-	
-	if(lr->lr_type == LRT_MASTER_LEAVING){
-		lc->server_id = 0xDEAD;
-		lr->u.lr_node_count++;
-		return 0;
-	}
-	
+
 	if(lr->lr_type == LRT_ELECTION){
 		if((lr->u.lr_starter == my_id) && (lr->u.lr_node_count)){
+			/*
+			 * We started this election, and we've been
+			 * around the loop.  If the node count hasn't
+			 * changed since we started, we can proceed to
+			 * selection.  Otherwise, go again setting
+			 * ourself as the leader to start.
+			 */
 			if(node_count == lr->u.lr_node_count){
 				lr->lr_type = LRT_SELECTION;
 			} else {
@@ -720,9 +738,23 @@
 			return 0;
 		}
 
+		/*
+		 * We are in the election phase, so
+		 * if we have the lowest ID so far,
+		 * we elect ourselves for server.
+		 *
+		 * However, if the mirror is being suspended
+		 * (lc->suspended), then we leave the current
+		 * coordinator in place.
+		 *
+		 * The client must not set lc->suspended until
+		 * it has completed sending all requests.  That
+		 * way, everyone is done sending requests when
+		 * the last server is stuck holding the ball.
+		 */
 		lr->u.lr_node_count++;
 		
-		if(my_id < lr->u.lr_coordinator){
+		if((my_id < lr->u.lr_coordinator) && !atomic_read(&lc->suspended)){
 			lr->u.lr_coordinator = my_id;
 		}
 		return 0;
@@ -732,6 +764,13 @@
 			return 0;
 		}
 		
+		/*
+		 * Need to restart election if someone
+		 * has joined since we started.
+		 *
+		 * Here, we are the started, so set
+		 * node_count = 1
+		 */
 		if(lr->u.lr_node_count == node_count){
 			lr->lr_type = LRT_MASTER_ASSIGN;
 		} else {
@@ -741,12 +780,24 @@
 		}
 		lr->u.lr_node_count = 1;
 	} else if(lr->lr_type == LRT_MASTER_ASSIGN){
+		/*
+		 * If we are the server, assign it
+		 */
 		if(lr->u.lr_coordinator == my_id){
 			lc->server_id = my_id;
 		}
+
+		/*
+		 * Continue around the loop
+		 */
 		if(lr->u.lr_starter != my_id){
 			return 0;
 		}
+
+		/*
+		 * If I was the one who asked for the election,
+		 * the send the results back to the client
+		 */
 		if(!(saddr->sin_addr.s_addr = nodeid_to_ipaddr(lr->u.lr_starter))){
 			return -1;
 		}
@@ -820,18 +871,15 @@
 		}
 
 		if(!lc){
-			DMWARN("Log context can not be found for request");
 			lr.u.lr_int_rtn = -ENXIO;
 			goto reply;
 		}
 
-/*
-  if(lc->server_id != my_id){
-  DMWARN("I am not the server for this request");
-  lr.u.lr_int_rtn = -ENXIO;
-  goto reply;
-  }
-*/
+		if (lc->server_id != my_id) {
+			lr.u.lr_int_rtn = -ENXIO;
+			goto reply;
+		}
+
 		switch(lr.lr_type){
 		case LRT_IS_CLEAN:
 			error = server_is_clean(lc, &lr);
@@ -886,12 +934,23 @@
 		}
 
 		/* ATTENTION -- if error? */
+/*
 		if(error){
-			DMWARN("Error (%d) while processing request (type = %d)",
-			       error, lr.lr_type);
+			DMWARN("Error (%d) while processing request (%s)",
+			       error,
+			       (lr.lr_type == LRT_IS_CLEAN)? "LRT_IS_C	LEAN":
+			       (lr.lr_type == LRT_IN_SYNC)? "LRT_IN_SYNC":
+			       (lr.lr_type == LRT_MARK_REGION)? "LRT_MARK_REGION":
+			       (lr.lr_type == LRT_GET_RESYNC_WORK)? "LRT_GET_RESYNC_WORK":
+			       (lr.lr_type == LRT_GET_SYNC_COUNT)? "LRT_GET_SYNC_COUNT":
+			       (lr.lr_type == LRT_CLEAR_REGION)? "LRT_CLEAR_REGION":
+			       (lr.lr_type == LRT_COMPLETE_RESYNC_WORK)? "LRT_COMPLETE_RESYNC_WORK":
+			       (lr.lr_type == LRT_MASTER_LEAVING)? "LRT_MASTER_LEAVING":
+			       (lr.lr_type == LRT_ELECTION)? "LRT_ELECTION":
+			       (lr.lr_type == LRT_SELECTION)? "LRT_SELECTION": "UNKNOWN");
 			lr.u.lr_int_rtn = error;
 		}
-
+*/
 	reply:
     
 		/* Why do we need to reset this? */
@@ -966,9 +1025,8 @@
 			** leaving node, it won't hurt anything - and**
 			** if there is, they will be recovered.      */
 		case SERVICE_NODE_FAILED:
-			DMINFO("A cluster mirror log member has %s",
-			       (restart_event_type == SERVICE_NODE_FAILED) ?
-			       "failed." : "left.");
+			if (restart_event_type == SERVICE_NODE_FAILED)
+				DMINFO("A cluster mirror log member has failed.");
 			
 			list_for_each_entry(lc, &log_list_head, log_list){
 				if(lc->server_id == my_id){
@@ -994,8 +1052,6 @@
 		schedule();
 	}
 
-	DMINFO("Cluster mirror log server is shutting down.");
-
 	sock_release(sock);
 	complete(&server_completion);
 	return 0;



^ permalink raw reply	[flat|nested] 40+ messages in thread

* [Cluster-devel] cluster/cmirror-kernel/src dm-cmirror-client.c ...
@ 2006-07-22 22:12 jbrassow
  0 siblings, 0 replies; 40+ messages in thread
From: jbrassow @ 2006-07-22 22:12 UTC (permalink / raw)
  To: cluster-devel.redhat.com

CVSROOT:	/cvs/cluster
Module name:	cluster
Branch: 	RHEL4U4
Changes by:	jbrassow at sourceware.org	2006-07-22 22:12:33

Modified files:
	cmirror-kernel/src: dm-cmirror-client.c dm-cmirror-common.h 
	                    dm-cmirror-server.c 

Log message:
	Fix for bug:
	199862 - Suspending cluster mirrors can cause indefinite hang
	And is part of a fix for:
	199185 - 'lvconvert' fails to remove device-mapper devices ...
	198555 - mirror log not getting cleared causes new mirror ...
	And is likely to fix:
	199334 - cmirror removal attempt hangs and caused locking ...
	And will certainly help for:
	199498
	198821
	194137
	194125
	199635
	
	All of the above bugs will need to be reexamined when the packages
	are rebuilt.
	
	This fix allows the log server to migrate to other nodes during
	suspension.  This prevents the situation where the log server may
	has its devices suspended when it receives a request.  Trying to
	fulfill a log request while devices are suspended will lead to an
	indefinite hang, because I/O will not complete until the devices
	are unsuspended.

Patches:
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cmirror-kernel/src/dm-cmirror-client.c.diff?cvsroot=cluster&only_with_tag=RHEL4U4&r1=1.1.2.19.2.2&r2=1.1.2.19.2.3
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cmirror-kernel/src/dm-cmirror-common.h.diff?cvsroot=cluster&only_with_tag=RHEL4U4&r1=1.1.2.7.2.1&r2=1.1.2.7.2.2
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cmirror-kernel/src/dm-cmirror-server.c.diff?cvsroot=cluster&only_with_tag=RHEL4U4&r1=1.1.2.9.2.4&r2=1.1.2.9.2.5

--- cluster/cmirror-kernel/src/Attic/dm-cmirror-client.c	2006/07/07 17:12:22	1.1.2.19.2.2
+++ cluster/cmirror-kernel/src/Attic/dm-cmirror-client.c	2006/07/22 22:12:32	1.1.2.19.2.3
@@ -243,7 +243,7 @@
 
 
 
-static int run_election(struct log_c *lc){
+static int run_election(struct log_c *lc, uint32_t initial_server){
 	int error=0, len;
 	struct sockaddr_in saddr_in;
 	struct msghdr msg;
@@ -255,7 +255,7 @@
 
 	lr.lr_type = LRT_ELECTION;
 	lr.u.lr_starter = my_id;
-	lr.u.lr_coordinator = my_id;
+	lr.u.lr_coordinator = initial_server;
 	memcpy(lr.lr_uuid, lc->uuid, MAX_NAME_LEN);
 
 	memset(&saddr_in, 0, sizeof(struct sockaddr_cl));
@@ -420,7 +420,6 @@
 
 	if(len <= 0){
 		/* ATTENTION -- what do we do with this ? */
-//		DMWARN("Failed to recvmsg from clustered log server");
 		error = len;
 		*retry = 1;
 		goto fail;
@@ -435,16 +434,11 @@
 	}
 
 	if(lr->u.lr_int_rtn == -ENXIO){
-		DMWARN("server tells us it no longer controls the log");
 		lc->server_id = 0xDEAD;
 		*retry = 1;
 		goto fail;
 	}
 
-	if(lr->u.lr_int_rtn < 0){
-		DMWARN("an error occured on the server while processing our request");
-	}
-
 	if(result)
 		*result = lr->u.lr_region_rtn;
 
@@ -463,8 +457,9 @@
 	}
 
 	if(lr) kfree(lr);
-#ifdef DEBUG
-	DMWARN("Request (%s) to server failed :: %d",
+#if 0
+	DMINFO("My (%u) request (%s) to server (%u) failed :: %d",
+	       my_id,
 	       (type == LRT_IS_CLEAN)? "LRT_IS_CLEAN":
 	       (type == LRT_IN_SYNC)? "LRT_IN_SYNC":
 	       (type == LRT_MARK_REGION)? "LRT_MARK_REGION":
@@ -475,7 +470,7 @@
 	       (type == LRT_MASTER_LEAVING)? "LRT_MASTER_LEAVING":
 	       (type == LRT_ELECTION)? "LRT_ELECTION":
 	       (type == LRT_SELECTION)? "LRT_SELECTION": "UNKNOWN",
-	       error);
+	       lc->server_id, error);
 #endif
 	return error;
 }
@@ -495,9 +490,13 @@
 	do{
 		retry = 0;
 		suspend_on(&suspend_client_queue, atomic_read(&suspend_client));
+		if ((type == LRT_MASTER_LEAVING) && (lc->server_id == 0xDEAD)) {
+			/* Nothing to do */
+			goto out;
+		}
 	election:
 		while(lc->server_id == 0xDEAD){
-			run_election(lc);
+			run_election(lc, my_id);
 			new_server = 1;
 		}
 
@@ -539,7 +538,7 @@
 				spin_unlock(&region_state_lock);
 				goto out;
 			} else {
-				DMWARN("Continuing request:: %s", 
+				DMINFO("Continuing request:: %s",
 				      (type == LRT_IS_CLEAN)? "LRT_IS_C	LEAN":
 				      (type == LRT_IN_SYNC)? "LRT_IN_SYNC":
 				      (type == LRT_MARK_REGION)? "LRT_MARK_REGION":
@@ -737,14 +736,14 @@
 	struct log_c *lc = (struct log_c *) log->context;
 
 	if (!list_empty(&clear_region_list))
-		DMERR("LEAVING WHILE REGION REQUESTS REMAIN.");
+		DMINFO("Leaving while clear region requests remain.");
 
 	list_del_init(&lc->log_list);
 	if(lc->server_id == my_id)
 		consult_server(lc, 0, LRT_MASTER_LEAVING, NULL);
 	sock_release(lc->client_sock);
 
-	if (lc->log_dev) 
+	if (lc->log_dev)
 		disk_dtr(log);
 	else
 		core_dtr(log);
@@ -755,15 +754,13 @@
 
 static int cluster_presuspend(struct dirty_log *log)
 {
-	struct log_c *lc = (struct log_c *) log->context;
+	return 0;
+}
 
-	/*
-	atomic_set(&lc->suspended, 1);
+static int cluster_postsuspend(struct dirty_log *log)
+{
+	struct log_c *lc = (struct log_c *) log->context;
 
-	if (lc->log_dev && lc->log_dev_failed)
-		complete(&lc->failure_completion);
-	else
-	*/
 	while (1) {
 		spin_lock(&region_state_lock);
 		if (list_empty(&clear_region_list)) {
@@ -775,20 +772,31 @@
 		/* Just an unnessesary call to clear out regions */
 		consult_server(lc, 0, LRT_IN_SYNC, NULL);
 	}
+	atomic_set(&lc->suspended, 1);
+	if(lc->server_id == my_id) {
+		while (1) {
+			consult_server(lc, 0, LRT_MASTER_LEAVING, NULL);
+			down(&consult_server_lock);
+			run_election(lc, 0xDEAD);
+			up(&consult_server_lock);
+			if (lc->server_id == my_id) {
+				set_current_state(TASK_INTERRUPTIBLE);
+				schedule_timeout(HZ/4);
+			} else {
+				break;
+			}
+		}
+	}
 
 	return 0;
 }
 
-static int cluster_postsuspend(struct dirty_log *log){
-	return 0;
-}
-
 static int cluster_resume(struct dirty_log *log){
 	struct log_c *lc = (struct log_c *) log->context;
 
 	lc->sync_search = 0;
 	resume_server_requests();
-	/* atomic_set(&lc->suspended, 0); */
+	atomic_set(&lc->suspended, 0);
 
 	return 0;
 }
@@ -1310,7 +1318,7 @@
 	.get_failure_response = cluster_get_failure_response,
 };
 
-#define CMIRROR_RELEASE_NAME "0.1.0"
+#define CMIRROR_RELEASE_NAME "0.2.0"
 static int __init cluster_dirty_log_init(void)
 {
 	int r = 0;
--- cluster/cmirror-kernel/src/Attic/dm-cmirror-common.h	2006/06/29 19:46:37	1.1.2.7.2.1
+++ cluster/cmirror-kernel/src/Attic/dm-cmirror-common.h	2006/07/22 22:12:32	1.1.2.7.2.2
@@ -13,10 +13,12 @@
 	sector_t sector;
 	sector_t count;
 };
+int dm_io_get(unsigned int num_pages);
+void dm_io_put(unsigned int num_pages);
 int dm_io_sync_vm(unsigned int num_regions, struct io_region *where, int rw,
                   void *data, unsigned long *error_bits);
 /* from dm.h */
-#define DM_NAME "device-mapper"
+#define DM_NAME "dm-cmirror"
 #define DMWARN(f, x...) printk(KERN_WARNING DM_NAME ": " f "\n" , ## x)
 #define DMERR(f, x...) printk(KERN_ERR DM_NAME ": " f "\n" , ## x)
 #define DMINFO(f, x...) printk(KERN_INFO DM_NAME ": " f "\n" , ## x)
@@ -110,8 +112,8 @@
 	 * Disk log fields
 	 */
 	int log_dev_failed;
-	/*
 	atomic_t suspended;
+	/*
 	struct completion failure_completion;
 	*/
 	struct dm_dev *log_dev;
--- cluster/cmirror-kernel/src/Attic/dm-cmirror-server.c	2006/07/19 14:38:20	1.1.2.9.2.4
+++ cluster/cmirror-kernel/src/Attic/dm-cmirror-server.c	2006/07/22 22:12:32	1.1.2.9.2.5
@@ -197,7 +197,7 @@
 		return 0;
 
 	return dm_io_sync_vm(1, &log->bits_location, WRITE,
-			      log->clean_bits, &ebits);
+			     log->clean_bits, &ebits);
 }
 
 static int count_bits32(uint32_t *addr, unsigned size)
@@ -656,23 +656,41 @@
 		return -1;
 	}
 
-	
-	if((lr->lr_type == LRT_MASTER_LEAVING) && 
-	   (lr->u.lr_starter == my_id) &&
-	   lr->u.lr_node_count){
-		lr->u.lr_coordinator = 0xDEAD;
-		if(!(saddr->sin_addr.s_addr = nodeid_to_ipaddr(lr->u.lr_starter))){
-			return -1;
+	if(lr->lr_type == LRT_MASTER_LEAVING) {
+		/*
+		 * if we started this and (lr->u.lr_node_count != 0),
+		 * then we have told everyone that we are leaving
+		 */
+		if ((lr->u.lr_starter == my_id) && lr->u.lr_node_count){
+			lr->u.lr_coordinator = 0xDEAD;
+			if(!(saddr->sin_addr.s_addr = nodeid_to_ipaddr(lr->u.lr_starter))){
+				return -1;
+			}
+			saddr->sin_port = lr->u.lr_starter_port;
+			return 0;
 		}
-		saddr->sin_port = lr->u.lr_starter_port;
-		return 0;
 	}
-	
+
+	/*
+	 * Check if we have access to the log.  We may not
+	 * get have loaded this device.
+	 */
 	if(!lc){
 		lr->u.lr_node_count++;
 		return 0;
 	}
-	
+
+	if(lr->lr_type == LRT_MASTER_LEAVING){
+		lc->server_id = 0xDEAD;
+		lr->u.lr_node_count++;
+		return 0;
+	}
+
+	/*
+	 * New node joins and needs to know I am the server
+	 * We shortcut the election here and respond directly
+	 * to the inquirer
+	 */
 	if(lc->server_id == my_id){
 		lr->u.lr_coordinator = my_id;
 		if(!(saddr->sin_addr.s_addr = nodeid_to_ipaddr(lr->u.lr_starter))){
@@ -681,16 +699,16 @@
 		saddr->sin_port = lr->u.lr_starter_port;
 		return 0;
 	}
-	
-	
-	if(lr->lr_type == LRT_MASTER_LEAVING){
-		lc->server_id = 0xDEAD;
-		lr->u.lr_node_count++;
-		return 0;
-	}
-	
+
 	if(lr->lr_type == LRT_ELECTION){
 		if((lr->u.lr_starter == my_id) && (lr->u.lr_node_count)){
+			/*
+			 * We started this election, and we've been
+			 * around the loop.  If the node count hasn't
+			 * changed since we started, we can proceed to
+			 * selection.  Otherwise, go again setting
+			 * ourself as the leader to start.
+			 */
 			if(node_count == lr->u.lr_node_count){
 				lr->lr_type = LRT_SELECTION;
 			} else {
@@ -700,9 +718,23 @@
 			return 0;
 		}
 
+		/*
+		 * We are in the election phase, so
+		 * if we have the lowest ID so far,
+		 * we elect ourselves for server.
+		 *
+		 * However, if the mirror is being suspended
+		 * (lc->suspended), then we leave the current
+		 * coordinator in place.
+		 *
+		 * The client must not set lc->suspended until
+		 * it has completed sending all requests.  That
+		 * way, everyone is done sending requests when
+		 * the last server is stuck holding the ball.
+		 */
 		lr->u.lr_node_count++;
 		
-		if(my_id < lr->u.lr_coordinator){
+		if((my_id < lr->u.lr_coordinator) && !atomic_read(&lc->suspended)){
 			lr->u.lr_coordinator = my_id;
 		}
 		return 0;
@@ -712,6 +744,13 @@
 			return 0;
 		}
 		
+		/*
+		 * Need to restart election if someone
+		 * has joined since we started.
+		 *
+		 * Here, we are the started, so set
+		 * node_count = 1
+		 */
 		if(lr->u.lr_node_count == node_count){
 			lr->lr_type = LRT_MASTER_ASSIGN;
 		} else {
@@ -721,12 +760,24 @@
 		}
 		lr->u.lr_node_count = 1;
 	} else if(lr->lr_type == LRT_MASTER_ASSIGN){
+		/*
+		 * If we are the server, assign it
+		 */
 		if(lr->u.lr_coordinator == my_id){
 			lc->server_id = my_id;
 		}
+
+		/*
+		 * Continue around the loop
+		 */
 		if(lr->u.lr_starter != my_id){
 			return 0;
 		}
+
+		/*
+		 * If I was the one who asked for the election,
+		 * the send the results back to the client
+		 */
 		if(!(saddr->sin_addr.s_addr = nodeid_to_ipaddr(lr->u.lr_starter))){
 			return -1;
 		}
@@ -800,18 +851,15 @@
 		}
 
 		if(!lc){
-			DMWARN("Log context can not be found for request");
 			lr.u.lr_int_rtn = -ENXIO;
 			goto reply;
 		}
 
-/*
-  if(lc->server_id != my_id){
-  DMWARN("I am not the server for this request");
-  lr.u.lr_int_rtn = -ENXIO;
-  goto reply;
-  }
-*/
+		if (lc->server_id != my_id) {
+			lr.u.lr_int_rtn = -ENXIO;
+			goto reply;
+		}
+
 		switch(lr.lr_type){
 		case LRT_IS_CLEAN:
 			error = server_is_clean(lc, &lr);
@@ -860,12 +908,23 @@
 		}
 
 		/* ATTENTION -- if error? */
+/*
 		if(error){
-			DMWARN("Error (%d) while processing request (type = %d)",
-			       error, lr.lr_type);
+			DMWARN("Error (%d) while processing request (%s)",
+			       error,
+			       (lr.lr_type == LRT_IS_CLEAN)? "LRT_IS_C	LEAN":
+			       (lr.lr_type == LRT_IN_SYNC)? "LRT_IN_SYNC":
+			       (lr.lr_type == LRT_MARK_REGION)? "LRT_MARK_REGION":
+			       (lr.lr_type == LRT_GET_RESYNC_WORK)? "LRT_GET_RESYNC_WORK":
+			       (lr.lr_type == LRT_GET_SYNC_COUNT)? "LRT_GET_SYNC_COUNT":
+			       (lr.lr_type == LRT_CLEAR_REGION)? "LRT_CLEAR_REGION":
+			       (lr.lr_type == LRT_COMPLETE_RESYNC_WORK)? "LRT_COMPLETE_RESYNC_WORK":
+			       (lr.lr_type == LRT_MASTER_LEAVING)? "LRT_MASTER_LEAVING":
+			       (lr.lr_type == LRT_ELECTION)? "LRT_ELECTION":
+			       (lr.lr_type == LRT_SELECTION)? "LRT_SELECTION": "UNKNOWN");
 			lr.u.lr_int_rtn = error;
 		}
-
+*/
 	reply:
     
 		/* Why do we need to reset this? */
@@ -940,9 +999,8 @@
 			** leaving node, it won't hurt anything - and**
 			** if there is, they will be recovered.      */
 		case SERVICE_NODE_FAILED:
-			DMINFO("A cluster mirror log member has %s",
-			       (restart_event_type == SERVICE_NODE_FAILED) ?
-			       "failed." : "left.");
+			if (restart_event_type == SERVICE_NODE_FAILED)
+				DMINFO("A cluster mirror log member has failed.");
 			
 			list_for_each_entry(lc, &log_list_head, log_list){
 				if(lc->server_id == my_id){
@@ -968,8 +1026,6 @@
 		schedule();
 	}
 
-	DMINFO("Cluster mirror log server is shutting down.");
-
 	sock_release(sock);
 	complete(&server_completion);
 	return 0;



^ permalink raw reply	[flat|nested] 40+ messages in thread

* [Cluster-devel] cluster/cmirror-kernel/src dm-cmirror-client.c ...
@ 2006-06-29 19:49 jbrassow
  0 siblings, 0 replies; 40+ messages in thread
From: jbrassow @ 2006-06-29 19:49 UTC (permalink / raw)
  To: cluster-devel.redhat.com

CVSROOT:	/cvs/cluster
Module name:	cluster
Branch: 	STABLE
Changes by:	jbrassow at sourceware.org	2006-06-29 19:49:32

Modified files:
	cmirror-kernel/src: dm-cmirror-client.c dm-cmirror-common.h 
	                    dm-cmirror-server.c 

Log message:
	- fix for bug 197263
	
	The cluster mirror 'flush' logging function was still behaving the
	way it used to before the changes to the kernel that allowed it to
	receive the status of a log flush from 'flush'.  This could result
	in an indefinite suspension of the mirror on which a log device had
	failed.

Patches:
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cmirror-kernel/src/dm-cmirror-client.c.diff?cvsroot=cluster&only_with_tag=STABLE&r1=1.1.4.2&r2=1.1.4.3
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cmirror-kernel/src/dm-cmirror-common.h.diff?cvsroot=cluster&only_with_tag=STABLE&r1=1.1.4.2&r2=1.1.4.3
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cmirror-kernel/src/dm-cmirror-server.c.diff?cvsroot=cluster&only_with_tag=STABLE&r1=1.1.4.2&r2=1.1.4.3

--- cluster/cmirror-kernel/src/Attic/dm-cmirror-client.c	2006/06/27 20:19:53	1.1.4.2
+++ cluster/cmirror-kernel/src/Attic/dm-cmirror-client.c	2006/06/29 19:49:32	1.1.4.3
@@ -202,7 +202,7 @@
 	lc = (struct log_c *) log->context;
 	lc->log_dev = dev;
 	lc->log_dev_failed = 0;
-	init_completion(&lc->failure_completion);
+	/* init_completion(&lc->failure_completion); */
 
 	/* setup the disk header fields */
 	lc->header_location.bdev = lc->log_dev->bdev;
@@ -757,22 +757,23 @@
 {
 	struct log_c *lc = (struct log_c *) log->context;
 
+	/*
 	atomic_set(&lc->suspended, 1);
-	
+
 	if (lc->log_dev && lc->log_dev_failed)
 		complete(&lc->failure_completion);
-	else {
-		while (1) {
-			spin_lock(&region_state_lock);
-			if (list_empty(&clear_region_list)) {
-				spin_unlock(&region_state_lock);
-				break;
-			}
+	else
+	*/
+	while (1) {
+		spin_lock(&region_state_lock);
+		if (list_empty(&clear_region_list)) {
 			spin_unlock(&region_state_lock);
-
-			/* Just an unnessesary call to clear out regions */
-			consult_server(lc, 0, LRT_IN_SYNC, NULL);
+			break;
 		}
+		spin_unlock(&region_state_lock);
+
+		/* Just an unnessesary call to clear out regions */
+		consult_server(lc, 0, LRT_IN_SYNC, NULL);
 	}
 
 	return 0;
@@ -787,7 +788,7 @@
 
 	lc->sync_search = 0;
 	resume_server_requests();
-	atomic_set(&lc->suspended, 0);
+	/* atomic_set(&lc->suspended, 0); */
 
 	return 0;
 }
@@ -833,8 +834,10 @@
 
 static int cluster_flush(struct dirty_log *log)
 {
+	struct log_c *lc = (struct log_c *) log->context;
+
 	/* FIXME:  flush all clear_region requests to server */
-	return 0;
+	return (lc->log_dev_failed) ? -EIO : 0;
 }
 
 static void cluster_mark_region(struct dirty_log *log, region_t region)
@@ -895,14 +898,20 @@
 		DMWARN("unable to get server (%u) to mark region (%Lu)",
 		       lc->server_id, region);
 		DMWARN("Reason :: %d", error);
+		if (error == -EIO) {
+			lc->log_dev_failed = 1;
+			break;
+		}
 	}
 
 	if (lc->log_dev_failed) {
 		DMERR("Write failed on mirror log device, %s",
 		      lc->log_dev->name);
 		dm_table_event(lc->ti->table);
+		/*
 		if (!atomic_read(&lc->suspended))
 			wait_for_completion(&lc->failure_completion);
+		*/
 	}
 	return;
 }
--- cluster/cmirror-kernel/src/Attic/dm-cmirror-common.h	2006/06/27 20:19:53	1.1.4.2
+++ cluster/cmirror-kernel/src/Attic/dm-cmirror-common.h	2006/06/29 19:49:32	1.1.4.3
@@ -111,8 +111,10 @@
 	 * Disk log fields
 	 */
 	int log_dev_failed;
+	/*
 	atomic_t suspended;
 	struct completion failure_completion;
+	*/
 	struct dm_dev *log_dev;
 	struct log_header header;
 
--- cluster/cmirror-kernel/src/Attic/dm-cmirror-server.c	2006/06/27 20:19:53	1.1.4.2
+++ cluster/cmirror-kernel/src/Attic/dm-cmirror-server.c	2006/06/29 19:49:32	1.1.4.3
@@ -508,7 +508,7 @@
 		mempool_free(new, region_user_pool);
 	}
 
-	return 0;
+	return (lc->log_dev_failed) ? -EIO : 0;
 }
 
 



^ permalink raw reply	[flat|nested] 40+ messages in thread

* [Cluster-devel] cluster/cmirror-kernel/src dm-cmirror-client.c ...
@ 2006-06-29 19:48 jbrassow
  0 siblings, 0 replies; 40+ messages in thread
From: jbrassow @ 2006-06-29 19:48 UTC (permalink / raw)
  To: cluster-devel.redhat.com

CVSROOT:	/cvs/cluster
Module name:	cluster
Branch: 	RHEL4
Changes by:	jbrassow at sourceware.org	2006-06-29 19:48:01

Modified files:
	cmirror-kernel/src: dm-cmirror-client.c dm-cmirror-common.h 
	                    dm-cmirror-server.c 

Log message:
	- fix for bug 197263
	
	The cluster mirror 'flush' logging function was still behaving the
	way it used to before the changes to the kernel that allowed it to
	receive the status of a log flush from 'flush'.  This could result
	in an indefinite suspension of the mirror on which a log device had
	failed.

Patches:
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cmirror-kernel/src/dm-cmirror-client.c.diff?cvsroot=cluster&only_with_tag=RHEL4&r1=1.1.2.19&r2=1.1.2.20
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cmirror-kernel/src/dm-cmirror-common.h.diff?cvsroot=cluster&only_with_tag=RHEL4&r1=1.1.2.7&r2=1.1.2.8
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cmirror-kernel/src/dm-cmirror-server.c.diff?cvsroot=cluster&only_with_tag=RHEL4&r1=1.1.2.11&r2=1.1.2.12

--- cluster/cmirror-kernel/src/Attic/dm-cmirror-client.c	2006/06/15 19:48:00	1.1.2.19
+++ cluster/cmirror-kernel/src/Attic/dm-cmirror-client.c	2006/06/29 19:48:01	1.1.2.20
@@ -202,7 +202,7 @@
 	lc = (struct log_c *) log->context;
 	lc->log_dev = dev;
 	lc->log_dev_failed = 0;
-	init_completion(&lc->failure_completion);
+	/* init_completion(&lc->failure_completion); */
 
 	/* setup the disk header fields */
 	lc->header_location.bdev = lc->log_dev->bdev;
@@ -757,22 +757,23 @@
 {
 	struct log_c *lc = (struct log_c *) log->context;
 
+	/*
 	atomic_set(&lc->suspended, 1);
-	
+
 	if (lc->log_dev && lc->log_dev_failed)
 		complete(&lc->failure_completion);
-	else {
-		while (1) {
-			spin_lock(&region_state_lock);
-			if (list_empty(&clear_region_list)) {
-				spin_unlock(&region_state_lock);
-				break;
-			}
+	else
+	*/
+	while (1) {
+		spin_lock(&region_state_lock);
+		if (list_empty(&clear_region_list)) {
 			spin_unlock(&region_state_lock);
-
-			/* Just an unnessesary call to clear out regions */
-			consult_server(lc, 0, LRT_IN_SYNC, NULL);
+			break;
 		}
+		spin_unlock(&region_state_lock);
+
+		/* Just an unnessesary call to clear out regions */
+		consult_server(lc, 0, LRT_IN_SYNC, NULL);
 	}
 
 	return 0;
@@ -787,7 +788,7 @@
 
 	lc->sync_search = 0;
 	resume_server_requests();
-	atomic_set(&lc->suspended, 0);
+	/* atomic_set(&lc->suspended, 0); */
 
 	return 0;
 }
@@ -847,8 +848,10 @@
 
 static int cluster_flush(struct dirty_log *log)
 {
+	struct log_c *lc = (struct log_c *) log->context;
+
 	/* FIXME:  flush all clear_region requests to server */
-	return 0;
+	return (lc->log_dev_failed) ? -EIO : 0;
 }
 
 static void cluster_mark_region(struct dirty_log *log, region_t region)
@@ -909,14 +912,20 @@
 		DMWARN("unable to get server (%u) to mark region (%Lu)",
 		       lc->server_id, region);
 		DMWARN("Reason :: %d", error);
+		if (error == -EIO) {
+			lc->log_dev_failed = 1;
+			break;
+		}
 	}
 
 	if (lc->log_dev_failed) {
 		DMERR("Write failed on mirror log device, %s",
 		      lc->log_dev->name);
 		dm_table_event(lc->ti->table);
+		/*
 		if (!atomic_read(&lc->suspended))
 			wait_for_completion(&lc->failure_completion);
+		*/
 	}
 	return;
 }
--- cluster/cmirror-kernel/src/Attic/dm-cmirror-common.h	2006/02/16 18:34:05	1.1.2.7
+++ cluster/cmirror-kernel/src/Attic/dm-cmirror-common.h	2006/06/29 19:48:01	1.1.2.8
@@ -110,8 +110,10 @@
 	 * Disk log fields
 	 */
 	int log_dev_failed;
+	/*
 	atomic_t suspended;
 	struct completion failure_completion;
+	*/
 	struct dm_dev *log_dev;
 	struct log_header header;
 
--- cluster/cmirror-kernel/src/Attic/dm-cmirror-server.c	2006/06/27 20:24:59	1.1.2.11
+++ cluster/cmirror-kernel/src/Attic/dm-cmirror-server.c	2006/06/29 19:48:01	1.1.2.12
@@ -519,7 +519,7 @@
 		mempool_free(new, region_user_pool);
 	}
 
-	return 0;
+	return (lc->log_dev_failed) ? -EIO : 0;
 }
 
 



^ permalink raw reply	[flat|nested] 40+ messages in thread

* [Cluster-devel] cluster/cmirror-kernel/src dm-cmirror-client.c ...
@ 2006-06-29 19:46 jbrassow
  0 siblings, 0 replies; 40+ messages in thread
From: jbrassow @ 2006-06-29 19:46 UTC (permalink / raw)
  To: cluster-devel.redhat.com

CVSROOT:	/cvs/cluster
Module name:	cluster
Branch: 	RHEL4U4
Changes by:	jbrassow at sourceware.org	2006-06-29 19:46:37

Modified files:
	cmirror-kernel/src: dm-cmirror-client.c dm-cmirror-common.h 
	                    dm-cmirror-server.c 

Log message:
	- fix for bug 197263
	
	The cluster mirror 'flush' logging function was still behaving the
	way it used to before the changes to the kernel that allowed it to
	receive the status of a log flush from 'flush'.  This could result
	in an indefinite suspension of the mirror on which a log device had
	failed.

Patches:
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cmirror-kernel/src/dm-cmirror-client.c.diff?cvsroot=cluster&only_with_tag=RHEL4U4&r1=1.1.2.19&r2=1.1.2.19.2.1
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cmirror-kernel/src/dm-cmirror-common.h.diff?cvsroot=cluster&only_with_tag=RHEL4U4&r1=1.1.2.7&r2=1.1.2.7.2.1
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cmirror-kernel/src/dm-cmirror-server.c.diff?cvsroot=cluster&only_with_tag=RHEL4U4&r1=1.1.2.9.2.2&r2=1.1.2.9.2.3

--- cluster/cmirror-kernel/src/Attic/dm-cmirror-client.c	2006/06/15 19:48:00	1.1.2.19
+++ cluster/cmirror-kernel/src/Attic/dm-cmirror-client.c	2006/06/29 19:46:37	1.1.2.19.2.1
@@ -202,7 +202,7 @@
 	lc = (struct log_c *) log->context;
 	lc->log_dev = dev;
 	lc->log_dev_failed = 0;
-	init_completion(&lc->failure_completion);
+	/* init_completion(&lc->failure_completion); */
 
 	/* setup the disk header fields */
 	lc->header_location.bdev = lc->log_dev->bdev;
@@ -757,22 +757,23 @@
 {
 	struct log_c *lc = (struct log_c *) log->context;
 
+	/*
 	atomic_set(&lc->suspended, 1);
-	
+
 	if (lc->log_dev && lc->log_dev_failed)
 		complete(&lc->failure_completion);
-	else {
-		while (1) {
-			spin_lock(&region_state_lock);
-			if (list_empty(&clear_region_list)) {
-				spin_unlock(&region_state_lock);
-				break;
-			}
+	else
+	*/
+	while (1) {
+		spin_lock(&region_state_lock);
+		if (list_empty(&clear_region_list)) {
 			spin_unlock(&region_state_lock);
-
-			/* Just an unnessesary call to clear out regions */
-			consult_server(lc, 0, LRT_IN_SYNC, NULL);
+			break;
 		}
+		spin_unlock(&region_state_lock);
+
+		/* Just an unnessesary call to clear out regions */
+		consult_server(lc, 0, LRT_IN_SYNC, NULL);
 	}
 
 	return 0;
@@ -787,7 +788,7 @@
 
 	lc->sync_search = 0;
 	resume_server_requests();
-	atomic_set(&lc->suspended, 0);
+	/* atomic_set(&lc->suspended, 0); */
 
 	return 0;
 }
@@ -847,8 +848,10 @@
 
 static int cluster_flush(struct dirty_log *log)
 {
+	struct log_c *lc = (struct log_c *) log->context;
+
 	/* FIXME:  flush all clear_region requests to server */
-	return 0;
+	return (lc->log_dev_failed) ? -EIO : 0;
 }
 
 static void cluster_mark_region(struct dirty_log *log, region_t region)
@@ -909,14 +912,20 @@
 		DMWARN("unable to get server (%u) to mark region (%Lu)",
 		       lc->server_id, region);
 		DMWARN("Reason :: %d", error);
+		if (error == -EIO) {
+			lc->log_dev_failed = 1;
+			break;
+		}
 	}
 
 	if (lc->log_dev_failed) {
 		DMERR("Write failed on mirror log device, %s",
 		      lc->log_dev->name);
 		dm_table_event(lc->ti->table);
+		/*
 		if (!atomic_read(&lc->suspended))
 			wait_for_completion(&lc->failure_completion);
+		*/
 	}
 	return;
 }
--- cluster/cmirror-kernel/src/Attic/dm-cmirror-common.h	2006/02/16 18:34:05	1.1.2.7
+++ cluster/cmirror-kernel/src/Attic/dm-cmirror-common.h	2006/06/29 19:46:37	1.1.2.7.2.1
@@ -110,8 +110,10 @@
 	 * Disk log fields
 	 */
 	int log_dev_failed;
+	/*
 	atomic_t suspended;
 	struct completion failure_completion;
+	*/
 	struct dm_dev *log_dev;
 	struct log_header header;
 
--- cluster/cmirror-kernel/src/Attic/dm-cmirror-server.c	2006/06/27 20:26:02	1.1.2.9.2.2
+++ cluster/cmirror-kernel/src/Attic/dm-cmirror-server.c	2006/06/29 19:46:37	1.1.2.9.2.3
@@ -519,7 +519,7 @@
 		mempool_free(new, region_user_pool);
 	}
 
-	return 0;
+	return (lc->log_dev_failed) ? -EIO : 0;
 }
 
 



^ permalink raw reply	[flat|nested] 40+ messages in thread

* [Cluster-devel] cluster/cmirror-kernel/src dm-cmirror-client.c ...
@ 2006-06-27 20:19 jbrassow
  0 siblings, 0 replies; 40+ messages in thread
From: jbrassow @ 2006-06-27 20:19 UTC (permalink / raw)
  To: cluster-devel.redhat.com

CVSROOT:	/cvs/cluster
Module name:	cluster
Branch: 	STABLE
Changes by:	jbrassow at sourceware.org	2006-06-27 20:19:53

Modified files:
	cmirror-kernel/src: dm-cmirror-client.c dm-cmirror-common.h 
	                    dm-cmirror-server.c dm-cmirror-xfr.h 
	                    dm-log.h 

Log message:
	- bring logging functions inline with upstream API

Patches:
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cmirror-kernel/src/dm-cmirror-client.c.diff?cvsroot=cluster&only_with_tag=STABLE&r1=1.1.4.1&r2=1.1.4.2
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cmirror-kernel/src/dm-cmirror-common.h.diff?cvsroot=cluster&only_with_tag=STABLE&r1=1.1.4.1&r2=1.1.4.2
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cmirror-kernel/src/dm-cmirror-server.c.diff?cvsroot=cluster&only_with_tag=STABLE&r1=1.1.4.1&r2=1.1.4.2
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cmirror-kernel/src/dm-cmirror-xfr.h.diff?cvsroot=cluster&only_with_tag=STABLE&r1=1.1.4.1&r2=1.1.4.2
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cmirror-kernel/src/dm-log.h.diff?cvsroot=cluster&only_with_tag=STABLE&r1=1.1.4.1&r2=1.1.4.2

--- cluster/cmirror-kernel/src/Attic/dm-cmirror-client.c	2006/06/27 18:36:09	1.1.4.1
+++ cluster/cmirror-kernel/src/Attic/dm-cmirror-client.c	2006/06/27 20:19:53	1.1.4.2
@@ -58,7 +58,7 @@
 static int mark_req2ser=0;
 static int insync_req2ser=0;
 
-static void *region_state_alloc(int gfp_mask, void *pool_data){
+static void *region_state_alloc(gfp_t gfp_mask, void *pool_data){
 	return kmalloc(sizeof(struct region_state), gfp_mask);
 }
 
@@ -810,20 +810,6 @@
 	return rtn;
 }
 
-static int cluster_is_remote_recovering(struct dirty_log *log, region_t region)
-{
-	int rtn;
-	struct log_c *lc = (struct log_c *) log->context;
-
-/* take out optimization
-	if(atomic_read(&lc->in_sync) == 1){
-		return 0;
-	}
-*/
-	rtn = consult_server(lc, region, LRT_IS_REMOTE_RECOVERING, NULL);
-	return rtn;
-}
-
 static int cluster_in_sync(struct dirty_log *log, region_t region, int block)
 {
 	int rtn;
@@ -1128,6 +1114,36 @@
 	return lc->failure_response;
 }
 
+static int cluster_is_remote_recovering(struct dirty_log *log, region_t region)
+{
+	int rtn;
+	struct log_c *lc = (struct log_c *) log->context;
+
+/* take out optimization
+	if(atomic_read(&lc->in_sync) == 1){
+		return 0;
+	}
+*/
+	rtn = consult_server(lc, region, LRT_IS_REMOTE_RECOVERING, NULL);
+	return rtn;
+}
+
+static int cluster_set_default_mirror(struct dirty_log *log, int mirror)
+{
+	int rtn;
+	struct log_c *lc = (struct log_c *) log->context;
+	rtn = consult_server(lc, (region_t)mirror, LRT_SET_DEFAULT_MIRROR, NULL);
+	return rtn;
+}
+
+static int cluster_get_default_mirror(struct dirty_log *log)
+{
+	int rtn;
+	struct log_c *lc = (struct log_c *) log->context;
+	rtn = consult_server(lc, 0, LRT_GET_DEFAULT_MIRROR, NULL);
+	return rtn;
+}
+
 static int clog_stop(void *data){
 	struct log_c *lc;
 
@@ -1255,16 +1271,18 @@
 	.resume = cluster_resume,
 	.get_region_size = cluster_get_region_size,
 	.is_clean = cluster_is_clean,
-	.is_remote_recovering = cluster_is_remote_recovering,
 	.in_sync = cluster_in_sync,
 	.flush = cluster_flush,
 	.mark_region = cluster_mark_region,
 	.clear_region = cluster_clear_region,
 	.get_resync_work = cluster_get_resync_work,
-	.complete_resync_work = cluster_complete_resync_work,
+	.set_region_sync = cluster_complete_resync_work,
 	.get_sync_count = cluster_get_sync_count,
 	.status = cluster_status,
 	.get_failure_response = cluster_get_failure_response,
+	.is_remote_recovering = cluster_is_remote_recovering,
+	.set_default_mirror = cluster_set_default_mirror,
+	.get_default_mirror = cluster_get_default_mirror,
 };
 
 static struct dirty_log_type _clustered_disk_type = {
@@ -1277,16 +1295,18 @@
 	.resume = cluster_resume,
 	.get_region_size = cluster_get_region_size,
 	.is_clean = cluster_is_clean,
-	.is_remote_recovering = cluster_is_remote_recovering,
 	.in_sync = cluster_in_sync,
 	.flush = cluster_flush,
 	.mark_region = cluster_mark_region,
 	.clear_region = cluster_clear_region,
 	.get_resync_work = cluster_get_resync_work,
-	.complete_resync_work = cluster_complete_resync_work,
+	.set_region_sync = cluster_complete_resync_work,
 	.get_sync_count = cluster_get_sync_count,
 	.status = cluster_status,
 	.get_failure_response = cluster_get_failure_response,
+	.is_remote_recovering = cluster_is_remote_recovering,
+	.set_default_mirror = cluster_set_default_mirror,
+	.get_default_mirror = cluster_get_default_mirror,
 };
 
 #define CMIRROR_RELEASE_NAME "0.1.0"
--- cluster/cmirror-kernel/src/Attic/dm-cmirror-common.h	2006/06/27 18:36:09	1.1.4.1
+++ cluster/cmirror-kernel/src/Attic/dm-cmirror-common.h	2006/06/27 20:19:53	1.1.4.2
@@ -82,6 +82,7 @@
 	uint32_t version;
 	sector_t nr_regions;
 	char uuid[MAX_NAME_LEN];
+	int default_mirror;
 };
 
 struct log_c {
--- cluster/cmirror-kernel/src/Attic/dm-cmirror-server.c	2006/06/27 18:36:09	1.1.4.1
+++ cluster/cmirror-kernel/src/Attic/dm-cmirror-server.c	2006/06/27 20:19:53	1.1.4.2
@@ -49,7 +49,7 @@
 static int debug_disk_write = 0;
 extern struct list_head log_list_head;
 
-static void *region_user_alloc(int gfp_mask, void *pool_data){
+static void *region_user_alloc(gfp_t gfp_mask, void *pool_data){
 	return kmalloc(sizeof(struct region_user), gfp_mask);
 }
 
@@ -89,6 +89,7 @@
 	disk->version = cpu_to_le32(core->version);
 	disk->nr_regions = cpu_to_le64(core->nr_regions);
 	memcpy(disk->uuid, core->uuid, MAX_NAME_LEN);
+	disk->default_mirror = cpu_to_le32(core->default_mirror);
 }
 
 static void header_from_disk(struct log_header *core, struct log_header *disk)
@@ -97,6 +98,7 @@
 	core->version = le32_to_cpu(disk->version);
 	core->nr_regions = le64_to_cpu(disk->nr_regions);
 	memcpy(core->uuid, disk->uuid, MAX_NAME_LEN);
+	core->default_mirror = cpu_to_le32(disk->default_mirror);
 }
 
 int read_header(struct log_c *log)
@@ -449,19 +451,6 @@
 	return 0;
 }
 
-static int server_is_remote_recovering(struct log_c *lc, struct log_request *lr)
-{
-	struct region_user *ru;
-
-	if ((ru = find_ru_by_region(lc, lr->u.lr_region)) && 
-	    (ru->ru_rw == RU_RECOVER))
-		lr->u.lr_int_rtn = 1;
-	else
-		lr->u.lr_int_rtn = 0;
-
-	return 0;
-}
-
 static int server_in_sync(struct log_c *lc, struct log_request *lr)
 {
 	if(likely(log_test_bit(lc->sync_bits, lr->u.lr_region)))
@@ -586,7 +575,7 @@
 		}
 	} else if (log_test_bit(lc->sync_bits, lr->u.lr_region)) {
 		DMERR("complete_resync_work region going out-of-sync: disk failure");
-		/* gone for now: lc->sync_count--; */
+		lc->sync_count--;
 		log_clear_bit(lc, lc->sync_bits, lr->u.lr_region);
 	}
 
@@ -610,6 +599,37 @@
 }
 
 
+static int server_is_remote_recovering(struct log_c *lc, struct log_request *lr)
+{
+	struct region_user *ru;
+
+	if ((ru = find_ru_by_region(lc, lr->u.lr_region)) && 
+	    (ru->ru_rw == RU_RECOVER))
+		lr->u.lr_int_rtn = 1;
+	else
+		lr->u.lr_int_rtn = 0;
+
+	return 0;
+}
+
+static int server_set_default_mirror(struct log_c *lc, struct log_request *lr)
+{
+	lc->disk_header->default_mirror = (int)lr->u.lr_region;
+	if (write_header(lc))
+		DMERR("Failed to commit default mirror to disk log");
+
+	lr->u.lr_int_rtn = 0;
+
+	return 0;
+}
+
+static int server_get_default_mirror(struct log_c *lc, struct log_request *lr)
+{
+	lr->u.lr_int_rtn = lc->disk_header->default_mirror;
+
+	return 0;
+}
+
 static struct log_c *get_log_context(char *uuid){
 	struct log_c *lc, *r = NULL;
 
@@ -816,9 +836,6 @@
 		case LRT_IS_CLEAN:
 			error = server_is_clean(lc, &lr);
 			break;
-		case LRT_IS_REMOTE_RECOVERING:
-			error = server_is_remote_recovering(lc, &lr);
-			break;
 		case LRT_IN_SYNC:
 			error = server_in_sync(lc, &lr);
 			break;
@@ -853,6 +870,15 @@
 		case LRT_GET_SYNC_COUNT:
 			error = server_get_sync_count(lc, &lr);
 			break;
+		case LRT_IS_REMOTE_RECOVERING:
+			error = server_is_remote_recovering(lc, &lr);
+			break;
+		case LRT_SET_DEFAULT_MIRROR:
+			error = server_set_default_mirror(lc, &lr);
+			break;
+		case LRT_GET_DEFAULT_MIRROR:
+			error = server_get_default_mirror(lc, &lr);
+			break;
 		default:
 			DMWARN("unknown request type received");
 			return 0;  /* do not send a reply */
--- cluster/cmirror-kernel/src/Attic/dm-cmirror-xfr.h	2006/06/27 18:36:09	1.1.4.1
+++ cluster/cmirror-kernel/src/Attic/dm-cmirror-xfr.h	2006/06/27 20:19:53	1.1.4.2
@@ -23,6 +23,9 @@
 #define LRT_MASTER_ASSIGN		11
 #define LRT_MASTER_LEAVING		12
 
+#define LRT_SET_DEFAULT_MIRROR          13
+#define LRT_GET_DEFAULT_MIRROR          14
+
 #define CLUSTER_LOG_PORT 51005
 
 struct log_request {
--- cluster/cmirror-kernel/src/Attic/dm-log.h	2006/06/27 18:36:09	1.1.4.1
+++ cluster/cmirror-kernel/src/Attic/dm-log.h	2006/06/27 20:19:53	1.1.4.2
@@ -56,16 +56,6 @@
 	int (*is_clean)(struct dirty_log *log, region_t region);
 
 	/*
-	 * Returns: 0, 1
-	 *
-	 * This is necessary for cluster mirroring. It provides
-	 * a way to detect recovery on another node, so we
-	 * aren't writing concurrently.  This function is likely
-	 * to block (when a cluster log is used).
-	 */
-	int (*is_remote_recovering)(struct dirty_log *log, region_t region);
-
-	/*
 	 *  Returns: 0, 1, -EWOULDBLOCK, < 0
 	 *
 	 * A predicate function to check the area given by
@@ -108,12 +98,12 @@
 	int (*get_resync_work)(struct dirty_log *log, region_t *region);
 
 	/*
-	 * This notifies the log that the resync of an area has
-	 * been completed.  The log should then mark this region
-	 * as CLEAN.
+	 * This notifies the log that the resync status of a region
+	 * has changed.  It also clears the region from the recovering
+	 * list (if present).
 	 */
-	void (*complete_resync_work)(struct dirty_log *log,
-				     region_t region, int success);
+	void (*set_region_sync)(struct dirty_log *log,
+				region_t region, int in_sync);
 
         /*
 	 * Returns the number of regions that are in sync.
@@ -131,6 +121,31 @@
 	 * of a device failure.
 	 */
 	int (*get_failure_response)(struct dirty_log *log);
+
+	/*
+	 * Returns: 0, 1
+	 *
+	 * This is necessary for cluster mirroring. It provides
+	 * a way to detect recovery on another node, so we
+	 * aren't writing concurrently.  This function is likely
+	 * to block (when a cluster log is used).
+	 */
+	int (*is_remote_recovering)(struct dirty_log *log, region_t region);
+
+	/*
+	 * Returns: 0 on success, <0 on failure
+	 *
+	 * This function is necessary for cluster mirroring.
+	 * If a node detects the primary device has failed,
+	 * the others must have a way of knowing what its
+	 * successor is.
+	 */
+	int (*set_default_mirror)(struct dirty_log *log, int mirror);
+
+	/*
+	 * Returns: >=0 on success, <0 on failure
+	 */
+	int (*get_default_mirror)(struct dirty_log *log);
 };
 
 int dm_register_dirty_log_type(struct dirty_log_type *type);



^ permalink raw reply	[flat|nested] 40+ messages in thread

* [Cluster-devel] cluster/cmirror-kernel/src dm-cmirror-client.c ...
@ 2006-06-15 19:48 jbrassow
  0 siblings, 0 replies; 40+ messages in thread
From: jbrassow @ 2006-06-15 19:48 UTC (permalink / raw)
  To: cluster-devel.redhat.com

CVSROOT:	/cvs/cluster
Module name:	cluster
Branch: 	RHEL4
Changes by:	jbrassow at sourceware.org	2006-06-15 19:48:00

Modified files:
	cmirror-kernel/src: dm-cmirror-client.c dm-cmirror-cman.c 

Log message:
	- pull some more unnecessary prints (and test cvs commit)

Patches:
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cmirror-kernel/src/dm-cmirror-client.c.diff?cvsroot=cluster&only_with_tag=RHEL4&r1=1.1.2.18&r2=1.1.2.19
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cmirror-kernel/src/dm-cmirror-cman.c.diff?cvsroot=cluster&only_with_tag=RHEL4&r1=1.1.2.4&r2=1.1.2.5

--- cluster/cmirror-kernel/src/Attic/dm-cmirror-client.c	2005/07/27 16:09:31	1.1
+++ cluster/cmirror-kernel/src/Attic/dm-cmirror-client.c	2006/06/15 19:34:41	1.1.2.18
@@ -0,0 +1,1359 @@
+/*
+ * Copyright (C) 2005 Red Hat, Inc.
+ *
+ * This file is released under the LGPL.
+ */
+
+#include <linux/init.h>
+#include <linux/slab.h>
+#include <linux/module.h>
+#include <linux/vmalloc.h>
+#include <linux/list.h>
+#include <linux/net.h>
+#include <linux/in.h>
+#include <linux/socket.h>
+#include <linux/signal.h>
+#include <linux/mempool.h>
+#include <linux/bio.h>
+#include <linux/blkdev.h>
+#include <linux/device-mapper.h>
+#include <linux/delay.h>
+#include <cluster/service.h>
+#include <cluster/cnxman.h>
+#include <cluster/cnxman-socket.h>
+
+#include "dm-log.h"
+#include "dm-cmirror-xfr.h"
+#include "dm-cmirror-common.h"
+#include "dm-cmirror-server.h"
+#include "dm-cmirror-cman.h"
+
+LIST_HEAD(log_list_head);
+
+struct region_state {
+	struct log_c *rs_lc;
+	region_t rs_region;
+	struct list_head rs_list;
+};
+
+static mempool_t *region_state_pool = NULL;
+static spinlock_t region_state_lock;
+static int clear_region_count=0;
+static struct list_head clear_region_list;
+static struct list_head marked_region_list;
+
+static int shutting_down=0;
+static atomic_t suspend_client;
+static wait_queue_head_t suspend_client_queue;
+
+static DECLARE_MUTEX(consult_server_lock);
+
+/* These vars are just for stats, and will be removed */
+static uint32_t request_count=0;
+static uint32_t request_retry_count=0;
+static int clear_req=0;
+static int mark_req=0;
+static int insync_req=0;
+static int clear_req2ser=0;
+static int mark_req2ser=0;
+static int insync_req2ser=0;
+
+static void *region_state_alloc(int gfp_mask, void *pool_data){
+	return kmalloc(sizeof(struct region_state), gfp_mask);
+}
+
+static void region_state_free(void *element, void *pool_data){
+	kfree(element);
+}
+
+#define BYTE_SHIFT 3
+/*
+ *   <region_size> <uuid> [[no]sync] [block_on_error]
+ */
+static int core_ctr(struct dirty_log *log, struct dm_target *ti,
+		    unsigned int argc, char **argv)
+{
+	enum sync sync = DEFAULTSYNC;
+	int failure_response = FR_NONBLOCK;
+
+	struct log_c *lc;
+	sector_t region_size;
+	unsigned int region_count;
+	size_t bitset_size;
+	int uuid = 0;
+	int i;
+
+	/* Already checked argument count */
+
+	for (i = 1; i < argc; i++) {
+		if (!strcmp(argv[i], "sync"))
+			sync = FORCESYNC;
+		else if (!strcmp(argv[i], "nosync"))
+			sync = NOSYNC;
+		else if (!strcmp(argv[i], "block_on_error"))
+			failure_response = FR_BLOCK;
+		else if (!uuid)
+			uuid = i;
+		else {
+			DMWARN("unrecognised argument to clustered mirror log: %s",
+			       argv[i]);
+			return -EINVAL;
+		}
+	}
+
+	if (sscanf(argv[0], SECTOR_FORMAT, &region_size) != 1) {
+		DMWARN("invalid region size string");
+		return -EINVAL;
+	}
+
+	region_count = dm_sector_div_up(ti->len, region_size);
+
+	lc = kmalloc(sizeof(*lc), GFP_KERNEL);
+	if (!lc) {
+		DMWARN("Couldn't allocate core log");
+		return -ENOMEM;
+	}
+	memset(lc, 0, sizeof(*lc));
+
+	lc->ti = ti;
+	lc->region_size = region_size;
+	lc->region_count = region_count;
+	lc->sync = sync;
+	lc->failure_response = failure_response;
+	strncpy(lc->uuid, argv[uuid], MAX_NAME_LEN);
+
+	/*
+	 * Work out how many words we need to hold the bitset.
+	 */
+	bitset_size = dm_round_up(region_count,
+				  sizeof(*lc->clean_bits) << BYTE_SHIFT);
+
+	bitset_size >>= BYTE_SHIFT;
+
+	lc->bitset_uint32_count = bitset_size / sizeof(*lc->clean_bits);
+	lc->clean_bits = vmalloc(bitset_size);
+	if (!lc->clean_bits) {
+		DMWARN("couldn't allocate clean bitset");
+		kfree(lc);
+		return -ENOMEM;
+	}
+	memset(lc->clean_bits, -1, bitset_size);
+
+	lc->sync_bits = vmalloc(bitset_size);
+	if (!lc->sync_bits) {
+		DMWARN("couldn't allocate sync bitset");
+		vfree(lc->clean_bits);
+		kfree(lc);
+		return -ENOMEM;
+	}
+	memset(lc->sync_bits, (sync == NOSYNC) ? -1 : 0, bitset_size);
+	lc->sync_count = (sync == NOSYNC) ? region_count : 0;
+
+	lc->recovering_bits = vmalloc(bitset_size);
+	if (!lc->recovering_bits) {
+		DMWARN("couldn't allocate sync bitset");
+		vfree(lc->sync_bits);
+		vfree(lc->clean_bits);
+		kfree(lc);
+		return -ENOMEM;
+	}
+	memset(lc->recovering_bits, 0, bitset_size);
+	lc->sync_search = 0;
+	log->context = lc;
+	return 0;
+}
+
+static void core_dtr(struct dirty_log *log)
+{
+	struct log_c *lc = (struct log_c *) log->context;
+	vfree(lc->clean_bits);
+	vfree(lc->sync_bits);
+	vfree(lc->recovering_bits);
+	kfree(lc);
+}
+
+/*----------------------------------------------------------------
+ * disk log constructor/destructor
+ *
+ *   <device> <region_size> <uuid> [[no]sync] [block_on_error]
+ *--------------------------------------------------------------*/
+static int disk_ctr(struct dirty_log *log, struct dm_target *ti,
+		    unsigned int argc, char **argv)
+{
+	int r;
+	size_t size;
+	struct log_c *lc;
+	struct dm_dev *dev;
+
+	/* already checked argument count */
+	r = dm_get_device(ti, argv[0], 0, 0 /* FIXME */,
+			  FMODE_READ | FMODE_WRITE, &dev);
+	if (r){
+		DMWARN("Unable to get device %s", argv[0]);
+		return r;
+	}
+
+	r = core_ctr(log, ti, argc - 1, argv + 1);
+	if (r) {
+		dm_put_device(ti, dev);
+		return r;
+	}
+
+	lc = (struct log_c *) log->context;
+	lc->log_dev = dev;
+	lc->log_dev_failed = 0;
+	init_completion(&lc->failure_completion);
+
+	/* setup the disk header fields */
+	lc->header_location.bdev = lc->log_dev->bdev;
+	lc->header_location.sector = 0;
+	lc->header_location.count = 1;
+
+	/*
+	 * We can't read less than this amount, even though we'll
+	 * not be using most of this space.
+	 */
+	lc->disk_header = vmalloc(1 << SECTOR_SHIFT);
+	if (!lc->disk_header)
+		goto bad;
+
+	/* setup the disk bitset fields */
+	lc->bits_location.bdev = lc->log_dev->bdev;
+	lc->bits_location.sector = LOG_OFFSET;
+
+	size = dm_round_up(lc->bitset_uint32_count * sizeof(uint32_t),
+			   1 << SECTOR_SHIFT);
+	lc->bits_location.count = size >> SECTOR_SHIFT;
+
+	return 0;
+
+ bad:
+	dm_put_device(ti, lc->log_dev);
+	core_dtr(log);
+	return -ENOMEM;
+}
+
+static void disk_dtr(struct dirty_log *log)
+{
+	struct log_c *lc = (struct log_c *) log->context;
+	dm_put_device(lc->ti, lc->log_dev);
+	vfree(lc->disk_header);
+	core_dtr(log);
+}
+
+
+
+static int run_election(struct log_c *lc){
+	int error=0, len;
+	struct sockaddr_in saddr_in;
+	struct msghdr msg;
+	struct iovec iov;
+	mm_segment_t fs;
+	struct log_request lr;  /* ATTENTION -- could be too much on the stack */
+  
+	memset(&lr, 0, sizeof(lr));
+
+	lr.lr_type = LRT_ELECTION;
+	lr.u.lr_starter = my_id;
+	lr.u.lr_coordinator = my_id;
+	memcpy(lr.lr_uuid, lc->uuid, MAX_NAME_LEN);
+
+	memset(&saddr_in, 0, sizeof(struct sockaddr_cl));
+
+	msg.msg_control = NULL;
+	msg.msg_controllen = 0;
+	msg.msg_iovlen = 1;
+	msg.msg_iov = &iov;
+	msg.msg_flags = 0;
+  
+	saddr_in.sin_family = AF_INET;
+	saddr_in.sin_port = CLUSTER_LOG_PORT;
+	if(!(saddr_in.sin_addr.s_addr = nodeid_to_ipaddr(my_id))){
+		DMERR("Unable to convert nodeid_to_ipaddr in run_election");
+	}
+	msg.msg_name = &saddr_in;
+	msg.msg_namelen = sizeof(saddr_in);
+
+	iov.iov_len = sizeof(struct log_request);
+	iov.iov_base = &lr;
+
+	fs = get_fs();
+	set_fs(get_ds());
+
+	len = sock_sendmsg(lc->client_sock, &msg, sizeof(struct log_request));
+
+	if(len < 0){
+		DMERR("unable to send election notice to server (error = %d)", len);
+		error = len;
+		set_fs(fs);
+		goto fail;
+	}
+
+  
+	/* why do we need to reset this? */
+	iov.iov_len = sizeof(struct log_request);
+	iov.iov_base = &lr;
+
+	len = my_recvmsg(lc->client_sock, &msg, sizeof(struct log_request),
+			 0, 20);
+	set_fs(fs);
+  
+	if(len > 0){
+		lc->server_id = lr.u.lr_coordinator;
+	} else {
+		/* ATTENTION -- what do we do with this ? */
+		DMWARN("Failed to receive election results from server");
+		error = len;
+	}
+
+ fail:
+	return error;
+}
+
+static int _consult_server(struct log_c *lc, region_t region,
+			  int type, region_t *result, int *retry){
+	int len;
+	int error=0;
+	struct sockaddr_in saddr_in;
+	struct msghdr msg;
+	struct iovec iov;
+	mm_segment_t fs;
+	struct log_request *lr;
+
+	request_count++;
+
+	lr = kmalloc(sizeof(struct log_request), GFP_KERNEL);
+	if(!lr){
+		error = -ENOMEM;
+		*retry = 1;
+		goto fail;
+	}
+
+	memset(lr, 0, sizeof(struct log_request));
+	
+	lr->lr_type = type;
+	if(type == LRT_MASTER_LEAVING){
+		lr->u.lr_starter = my_id;
+	} else {
+		lr->u.lr_region = region;
+	}
+
+	if (type == LRT_COMPLETE_RESYNC_WORK)
+		lr->u.lr_int_rtn = (*result) ? 1 : 0;
+
+	memcpy(lr->lr_uuid, lc->uuid, MAX_NAME_LEN);
+
+	memset(&saddr_in, 0, sizeof(struct sockaddr_in));
+
+	msg.msg_control = NULL;
+	msg.msg_controllen = 0;
+	msg.msg_iovlen = 1;
+	msg.msg_iov = &iov;
+	msg.msg_flags = 0;
+  
+	saddr_in.sin_family = AF_INET;
+	saddr_in.sin_port = CLUSTER_LOG_PORT;
+	if(!(saddr_in.sin_addr.s_addr = nodeid_to_ipaddr(lc->server_id))){
+		DMERR("Unable to convert nodeid_to_ipaddr(0x%x) in _consult_server",
+			lc->server_id);
+		error = -ENXIO;
+		*retry = 1;
+		goto fail;
+	}
+	msg.msg_name = &saddr_in;
+	msg.msg_namelen = sizeof(saddr_in);
+
+	iov.iov_len = sizeof(struct log_request);
+	iov.iov_base = lr;
+/*
+	DMERR("To  :: 0x%x, %s", 
+	       saddr_in.sin_addr.s_addr,
+	       (lr->lr_type == LRT_IS_CLEAN)? "LRT_IS_CLEAN":
+	       (lr->lr_type == LRT_IN_SYNC)? "LRT_IN_SYNC":
+	       (lr->lr_type == LRT_MARK_REGION)? "LRT_MARK_REGION":
+	       (lr->lr_type == LRT_GET_RESYNC_WORK)? "LRT_GET_RESYNC_WORK":
+	       (lr->lr_type == LRT_GET_SYNC_COUNT)? "LRT_GET_SYNC_COUNT":
+	       (lr->lr_type == LRT_CLEAR_REGION)? "LRT_CLEAR_REGION":
+	       (lr->lr_type == LRT_COMPLETE_RESYNC_WORK)? "LRT_COMPLETE_RESYNC_WORK":
+	       (lr->lr_type == LRT_MASTER_LEAVING)? "LRT_MASTER_LEAVING":
+	       (lr->lr_type == LRT_ELECTION)? "LRT_ELECTION":
+	       (lr->lr_type == LRT_SELECTION)? "LRT_SELECTION": "UNKNOWN"
+		);
+*/
+	if(lr->lr_type == LRT_MARK_REGION){
+		mark_req2ser++;
+	}
+
+	if(lr->lr_type == LRT_CLEAR_REGION){
+		clear_req2ser++;
+	}
+	
+	if(lr->lr_type == LRT_IN_SYNC){
+		insync_req2ser++;
+	}
+	
+	fs = get_fs();
+	set_fs(get_ds());
+  
+	len = sock_sendmsg(lc->client_sock, &msg, sizeof(struct log_request));
+
+	set_fs(fs);
+
+	if(len < sizeof(struct log_request)){
+		DMWARN("unable to send log request to server");
+		error = -EBADE;
+		goto fail;
+	}
+
+	iov.iov_len = sizeof(struct log_request);
+	iov.iov_base = lr;
+
+	fs = get_fs();
+	set_fs(get_ds());
+
+	if(type == LRT_MASTER_LEAVING){
+		len = sock_recvmsg(lc->client_sock, &msg, sizeof(struct log_request),
+				   /* WAIT for it */0);
+	} else {
+		len = my_recvmsg(lc->client_sock, &msg, sizeof(struct log_request),
+				 0, 5);
+	}
+	set_fs(fs);
+
+	if(len <= 0){
+		/* ATTENTION -- what do we do with this ? */
+//		DMWARN("Failed to recvmsg from clustered log server");
+		error = len;
+		*retry = 1;
+		goto fail;
+	}
+    
+	if(lr->u.lr_int_rtn == -EAGAIN){
+		DMWARN("Server (%u), request type %d, -EAGAIN."
+		       "  Mirror suspended?",
+		       lc->server_id, lr->lr_type);
+		*retry = 1;
+		goto fail;
+	}
+
+	if(lr->u.lr_int_rtn == -ENXIO){
+		DMWARN("server tells us it no longer controls the log");
+		lc->server_id = 0xDEAD;
+		*retry = 1;
+		goto fail;
+	}
+
+	if(lr->u.lr_int_rtn < 0){
+		DMWARN("an error occured on the server while processing our request");
+	}
+
+	if(result)
+		*result = lr->u.lr_region_rtn;
+
+	error = lr->u.lr_int_rtn;
+	kfree(lr);
+	return error;
+ fail:
+	if(*retry){
+		request_retry_count++;
+		if(!(request_retry_count & 0x1F)){
+			DMINFO("Clustered mirror retried requests :: %u of %u (%u%%)",
+			       request_retry_count,
+			       request_count,
+			       dm_div_up(request_retry_count*100, request_count));
+		}
+	}
+
+	if(lr) kfree(lr);
+#ifdef DEBUG
+	DMWARN("Request (%s) to server failed :: %d",
+	       (type == LRT_IS_CLEAN)? "LRT_IS_CLEAN":
+	       (type == LRT_IN_SYNC)? "LRT_IN_SYNC":
+	       (type == LRT_MARK_REGION)? "LRT_MARK_REGION":
+	       (type == LRT_GET_RESYNC_WORK)? "LRT_GET_RESYNC_WORK":
+	       (type == LRT_GET_SYNC_COUNT)? "LRT_GET_SYNC_COUNT":
+	       (type == LRT_CLEAR_REGION)? "LRT_CLEAR_REGION":
+	       (type == LRT_COMPLETE_RESYNC_WORK)? "LRT_COMPLETE_RESYNC_WORK":
+	       (type == LRT_MASTER_LEAVING)? "LRT_MASTER_LEAVING":
+	       (type == LRT_ELECTION)? "LRT_ELECTION":
+	       (type == LRT_SELECTION)? "LRT_SELECTION": "UNKNOWN",
+	       error);
+#endif
+	return error;
+}
+
+static int consult_server(struct log_c *lc, region_t region,
+			  int type, region_t *result){
+	int rtn=0;
+	int retry=0;
+	int new_server=0;
+	struct region_state *rs=NULL;
+
+	/* ATTENTION -- need to change this, the server could fail at any point **
+	** we do not want to send requests to the wrong place, or fail to run  **
+	** an election when needed */
+	down(&consult_server_lock);
+
+	do{
+		retry = 0;
+		suspend_on(&suspend_client_queue, atomic_read(&suspend_client));
+	election:
+		while(lc->server_id == 0xDEAD){
+			run_election(lc);
+			new_server = 1;
+		}
+
+		spin_lock(&region_state_lock);
+		if(new_server && 
+		   (!list_empty(&clear_region_list) ||
+		    !list_empty(&marked_region_list))){
+			int i=0;
+			struct region_state *tmp_rs;
+
+			DMINFO("Clean-up required due to new server");
+			DMINFO(" - Wiping clear region list");
+			list_for_each_entry_safe(rs, tmp_rs,
+						 &clear_region_list, rs_list){
+				i++;
+				list_del_init(&rs->rs_list);
+				mempool_free(rs, region_state_pool);
+			}
+			clear_region_count=0;
+			DMINFO(" - %d clear region requests wiped", i);
+
+			DMINFO(" - Resending all mark region requests");
+			list_for_each_entry(rs, &marked_region_list, rs_list){
+				do {
+					retry = 0;
+					DMINFO("   - " SECTOR_FORMAT, rs->rs_region);
+					rtn = _consult_server(rs->rs_lc, rs->rs_region,
+							      LRT_MARK_REGION, NULL, &retry);
+					if (lc->server_id == 0xDEAD) {
+						spin_unlock(&region_state_lock);
+						goto election;
+					}
+				} while(retry);
+			}
+			DMINFO("Clean-up complete");
+			if(type == LRT_MARK_REGION){
+				/* we just handled all marks */
+				DMWARN("Mark request ignored.\n");
+				spin_unlock(&region_state_lock);
+				goto out;
+			} else {
+				DMWARN("Continuing request:: %s", 
+				      (type == LRT_IS_CLEAN)? "LRT_IS_C	LEAN":
+				      (type == LRT_IN_SYNC)? "LRT_IN_SYNC":
+				      (type == LRT_MARK_REGION)? "LRT_MARK_REGION":
+				      (type == LRT_GET_RESYNC_WORK)? "LRT_GET_RESYNC_WORK":
+				      (type == LRT_GET_SYNC_COUNT)? "LRT_GET_SYNC_COUNT":
+				      (type == LRT_CLEAR_REGION)? "LRT_CLEAR_REGION":
+				      (type == LRT_COMPLETE_RESYNC_WORK)? "LRT_COMPLETE_RESYNC_WORK":
+				      (type == LRT_MASTER_LEAVING)? "LRT_MASTER_LEAVING":
+				      (type == LRT_ELECTION)? "LRT_ELECTION":
+				      (type == LRT_SELECTION)? "LRT_SELECTION": "UNKNOWN"
+					);
+			}
+		}
+
+		rs = NULL;
+
+		if(!list_empty(&clear_region_list)){
+			rs = list_entry(clear_region_list.next,
+					struct region_state, rs_list);
+			list_del_init(&rs->rs_list);
+			clear_region_count--;
+		}
+
+		spin_unlock(&region_state_lock);
+		
+		/* ATTENTION -- it may be possible to remove a clear region **
+		** request from the list.  Then, have a mark region happen  **
+		** while we are here.  If the clear region request fails, it**
+		** would be re-added - perhaps prematurely clearing the bit */
+		
+		if(rs){
+			_consult_server(rs->rs_lc, rs->rs_region,
+					LRT_CLEAR_REGION, NULL, &retry);
+
+			if(retry){
+				spin_lock(&region_state_lock);
+				list_add(&rs->rs_list, &clear_region_list);
+				clear_region_count++;
+				spin_unlock(&region_state_lock);
+
+			} else {
+				mempool_free(rs, region_state_pool);
+			}
+		}
+		retry = 0;
+		
+		rtn = _consult_server(lc, region, type, result, &retry);
+		schedule();
+	} while(retry);
+out:
+	up(&consult_server_lock);
+
+	return rtn;
+}
+
+
+static int cluster_connect(void);
+static int cluster_disconnect(void);
+
+static int cluster_ctr(struct dirty_log *log, struct dm_target *ti,
+		       unsigned int argc, char **argv, int disk)
+{
+	int error = 0;
+	struct log_c *lc;
+	struct sockaddr_in saddr_in;
+
+	if (!disk) {
+		if ((error = core_ctr(log, ti, argc, argv))) {
+			DMWARN("Clustered mirror:: core_ctr failed");
+			return error;
+		}
+	} else {
+		/* NOTE -- we take advantage of the fact that disk_ctr does **
+		** not actually read the disk.  I suppose, however, that if **
+		** it does in the future, we will simply reread it when a   **
+		** server is started here.                                  */
+
+		if((error = disk_ctr(log, ti, argc, argv))) {
+			DMWARN("Clustered mirror:: disk_ctr failed");
+			return error;
+		}
+	}
+
+	lc = log->context;
+
+	if (lc->failure_response != FR_BLOCK) {
+		DMERR("Clustered mirror requires \"block_on_error\" parameter");
+		error = -EINVAL;
+		goto fail;
+	}
+
+	atomic_set(&lc->in_sync, -1);
+
+	list_add(&lc->log_list, &log_list_head);
+	INIT_LIST_HEAD(&lc->region_users);
+
+	lc->server_id = 0xDEAD;
+
+	if ((error = cluster_connect())) {
+		DMWARN("Unable to connect to cluster infrastructure.");
+		goto fail;
+	}
+
+	error = sock_create(AF_INET, SOCK_DGRAM,
+			    0,
+			    &lc->client_sock);
+
+	if(error){
+		DMWARN("unable to create clustered log client socket");
+		goto fail;
+	}
+
+	saddr_in.sin_family = AF_INET;
+	saddr_in.sin_port = CLUSTER_LOG_PORT+1;
+	if(!(saddr_in.sin_addr.s_addr = nodeid_to_ipaddr(my_id))){
+		DMERR("Unable to convert nodeid_to_ipaddr in cluster_ctr");
+	}
+	error = lc->client_sock->ops->bind(lc->client_sock,
+					   (struct sockaddr *)&saddr_in,
+					   sizeof(struct sockaddr_in));
+	while(error == -EADDRINUSE){
+		saddr_in.sin_port++;
+		error = lc->client_sock->ops->bind(lc->client_sock,
+						   (struct sockaddr *)&saddr_in,
+						   sizeof(struct sockaddr_in));
+	}
+
+	if(error){
+		DMWARN("unable to bind clustered log client socket");
+		sock_release(lc->client_sock);
+		goto fail;
+	}
+
+	return 0;
+
+ fail:
+	if (lc->log_dev)
+		disk_dtr(log);
+	else
+		core_dtr(log);
+
+	return error;
+}
+
+/*------------------------------------------------------------------
+ * clustered_core log constructor
+ * (preceding args::  <start> <len> mirror clustered_core <log_args>
+ *
+ * Right now, 3 <= argc <= 4.  "block_on_error" is required.
+ *
+ * argv contains:
+ *   <region_size> <uuid> [[no]sync] [block_on_error]
+ *--------------------------------------------------------------*/
+static int cluster_core_ctr(struct dirty_log *log, struct dm_target *ti,
+		       unsigned int argc, char **argv) {
+	int i;
+	if ((argc < 3) || (argc > 4)) {
+		DMERR("Too %s arguments to clustered_core mirror log type.",
+		      (argc < 3) ? "few" : "many");
+		DMERR("  %d arguments supplied:", argc);
+		for (i = 0; i < argc; i++)
+			DMERR("    %s", argv[i]);
+		return -EINVAL;
+	}
+
+	return cluster_ctr(log, ti, argc, argv, 0);
+}
+
+/*------------------------------------------------------------------
+ * clustered_disk log constructor
+ * (preceding args::  <start> <len> mirror clustered_disk <log_args>
+ *
+ * Right now, 4 <= argc <= 5.  "block_on_error" is required.
+ *
+ * argv contains:
+ *   <disk> <region_size> <uuid> [[no]sync] [block_on_error]
+ *--------------------------------------------------------------*/
+static int cluster_disk_ctr(struct dirty_log *log, struct dm_target *ti,
+		       unsigned int argc, char **argv) {
+	int i;
+	if ((argc < 4) || (argc > 5)) {
+		DMERR("Too %s arguments to clustered_disk mirror log type.",
+		      (argc < 4) ? "few" : "many");
+		DMERR("  %d arguments supplied:", argc);
+		for (i = 0; i < argc; i++)
+			DMERR("    %s", argv[i]);
+		return -EINVAL;
+	}
+
+	return cluster_ctr(log, ti, argc, argv, 1);
+}
+
+static void cluster_dtr(struct dirty_log *log)
+{
+	struct log_c *lc = (struct log_c *) log->context;
+
+	if (!list_empty(&clear_region_list))
+		DMERR("LEAVING WHILE REGION REQUESTS REMAIN.");
+
+	list_del_init(&lc->log_list);
+	if(lc->server_id == my_id)
+		consult_server(lc, 0, LRT_MASTER_LEAVING, NULL);
+	sock_release(lc->client_sock);
+
+	if (lc->log_dev) 
+		disk_dtr(log);
+	else
+		core_dtr(log);
+
+	if (cluster_disconnect())
+		DMERR("Unable to disconnect from cluster infrastructure.\n");
+}
+
+static int cluster_presuspend(struct dirty_log *log)
+{
+	struct log_c *lc = (struct log_c *) log->context;
+
+	atomic_set(&lc->suspended, 1);
+	
+	if (lc->log_dev && lc->log_dev_failed)
+		complete(&lc->failure_completion);
+	else {
+		while (1) {
+			spin_lock(&region_state_lock);
+			if (list_empty(&clear_region_list)) {
+				spin_unlock(&region_state_lock);
+				break;
+			}
+			spin_unlock(&region_state_lock);
+
+			/* Just an unnecessary call to clear out regions */
+			consult_server(lc, 0, LRT_IN_SYNC, NULL);
+		}
+	}
+
+	return 0;
+}
+
+static int cluster_postsuspend(struct dirty_log *log){
+	return 0;
+}
+
+static int cluster_resume(struct dirty_log *log){
+	struct log_c *lc = (struct log_c *) log->context;
+
+	lc->sync_search = 0;
+	resume_server_requests();
+	atomic_set(&lc->suspended, 0);
+
+	return 0;
+}
+
+static uint32_t cluster_get_region_size(struct dirty_log *log)
+{
+	struct log_c *lc = (struct log_c *) log->context;
+
+	return lc->region_size;
+}
+
+
+static int cluster_is_clean(struct dirty_log *log, region_t region)
+{
+	int rtn;
+	struct log_c *lc = (struct log_c *) log->context;
+
+	rtn = consult_server(lc, region, LRT_IS_CLEAN, NULL);
+
+	return rtn;
+}
+
+static int cluster_is_remote_recovering(struct dirty_log *log, region_t region)
+{
+	int rtn;
+	struct log_c *lc = (struct log_c *) log->context;
+
+/* take out optimization
+	if(atomic_read(&lc->in_sync) == 1){
+		return 0;
+	}
+*/
+	rtn = consult_server(lc, region, LRT_IS_REMOTE_RECOVERING, NULL);
+	return rtn;
+}
+
+static int cluster_in_sync(struct dirty_log *log, region_t region, int block)
+{
+	int rtn;
+	struct log_c *lc = (struct log_c *) log->context;
+  
+	/* check known_regions, return if found */
+	insync_req++;
+/* take out optimization
+	if(atomic_read(&lc->in_sync) == 1){
+		return 1;
+	}
+*/
+	if(!block){
+		return -EWOULDBLOCK;
+	}
+
+	rtn = consult_server(lc, region, LRT_IN_SYNC, NULL);
+
+	return rtn;
+}
+
+static int cluster_flush(struct dirty_log *log)
+{
+	/* FIXME:  flush all clear_region requests to server */
+	return 0;
+}
+
+static void cluster_mark_region(struct dirty_log *log, region_t region)
+{
+	int error = 0;
+	struct region_state *rs, *tmp_rs, *rs_new;
+	struct log_c *lc = (struct log_c *) log->context;
+
+	mark_req++;
+
+	rs_new = mempool_alloc(region_state_pool, GFP_KERNEL);
+
+	spin_lock(&region_state_lock);
+	list_for_each_entry_safe(rs, tmp_rs, &clear_region_list, rs_list){
+		if(lc == rs->rs_lc && region == rs->rs_region){
+#ifdef DEBUG
+			DMINFO("Mark pre-empting clear of region %Lu", region);
+#endif
+			list_del_init(&rs->rs_list);
+			list_add(&rs->rs_list, &marked_region_list);
+			clear_region_count--;
+			spin_unlock(&region_state_lock);
+			if (rs_new)
+				mempool_free(rs_new, region_state_pool);
+
+			return;
+		}
+	}
+	/* ATTENTION -- this check should not be necessary.   **
+	** Why are regions being marked again before a clear? */
+	list_for_each_entry(rs, &marked_region_list, rs_list){
+		if(lc == rs->rs_lc && region == rs->rs_region){
+#ifdef DEBUG
+			DMINFO("Double mark on region ("
+			       SECTOR_FORMAT ")", region);
+#endif
+			spin_unlock(&region_state_lock);
+			if (rs_new)
+				mempool_free(rs_new, region_state_pool);
+
+			return;
+		}
+	}
+
+	if(!rs_new){
+		DMERR("Unable to allocate region_state for mark.");
+		BUG();
+	}
+
+	rs_new->rs_lc = lc;
+	rs_new->rs_region = region;
+	INIT_LIST_HEAD(&rs_new->rs_list);
+	list_add(&rs_new->rs_list, &marked_region_list);
+
+	spin_unlock(&region_state_lock);
+
+	while((error = consult_server(lc, region, LRT_MARK_REGION, NULL))){
+		DMWARN("unable to get server (%u) to mark region (%Lu)",
+		       lc->server_id, region);
+		DMWARN("Reason :: %d", error);
+	}
+
+	if (lc->log_dev_failed) {
+		DMERR("Write failed on mirror log device, %s",
+		      lc->log_dev->name);
+		dm_table_event(lc->ti->table);
+		if (!atomic_read(&lc->suspended))
+			wait_for_completion(&lc->failure_completion);
+	}
+	return;
+}
+
+static void cluster_clear_region(struct dirty_log *log, region_t region)
+{
+	struct log_c *lc = (struct log_c *) log->context;
+	struct region_state *rs, *tmp_rs, *rs_new;
+	clear_req++;
+
+	rs_new = mempool_alloc(region_state_pool, GFP_ATOMIC);
+
+	spin_lock(&region_state_lock);
+
+	list_for_each_entry_safe(rs, tmp_rs, &clear_region_list, rs_list){
+		if(lc == rs->rs_lc && region == rs->rs_region){
+			DMINFO("%d) Double clear on region ("
+			      SECTOR_FORMAT ")", __LINE__, region);
+			spin_unlock(&region_state_lock);
+			if (rs_new)
+				mempool_free(rs_new, region_state_pool);
+			return;
+		}
+	}
+
+	list_for_each_entry_safe(rs, tmp_rs, &marked_region_list, rs_list){
+		if(lc == rs->rs_lc && region == rs->rs_region){
+			list_del_init(&rs->rs_list);
+			list_add(&rs->rs_list, &clear_region_list);
+			clear_region_count++;
+			if(!(clear_region_count & 0x7F)){
+				DMINFO("clear_region_count :: %d", clear_region_count);
+			}
+			spin_unlock(&region_state_lock);
+			if (rs_new)
+				mempool_free(rs_new, region_state_pool);
+			return;
+		}
+	}
+
+	/* We can get here because we may be doing resync_work, and therefore, **
+	** clearing without ever marking..................................... */
+
+	if(!rs_new){
+		DMERR("Unable to allocate region_state for mark.");
+		BUG();
+	}
+
+	rs_new->rs_lc = lc;
+	rs_new->rs_region = region;
+	INIT_LIST_HEAD(&rs_new->rs_list);
+	list_add(&rs_new->rs_list, &clear_region_list);
+	clear_region_count++;
+	if(!(clear_region_count & 0x7F)){
+		DMINFO("clear_region_count :: %d", clear_region_count);
+	}
+
+	spin_unlock(&region_state_lock);
+	return;
+}
+
+static int cluster_get_resync_work(struct dirty_log *log, region_t *region)
+{
+	int rtn;
+	struct log_c *lc = (struct log_c *) log->context;
+
+	rtn = consult_server(lc, 0, LRT_GET_RESYNC_WORK, region);
+
+	return rtn;
+}
+
+static void cluster_complete_resync_work(struct dirty_log *log,
+					 region_t region, int success)
+{
+	region_t success_tmp = success;
+	struct log_c *lc = (struct log_c *) log->context;
+	while(consult_server(lc, region, LRT_COMPLETE_RESYNC_WORK, &success_tmp)){
+		DMWARN("unable to notify server of completed resync work");
+	}
+	if (!success) {
+		DMERR("Attempting to revert sync status of region #%llu", region);
+		set_current_state(TASK_INTERRUPTIBLE);
+		schedule_timeout(HZ/50);
+	}
+
+	return;
+}
+
+static region_t cluster_get_sync_count(struct dirty_log *log)
+{
+	region_t rtn;
+	struct log_c *lc = (struct log_c *) log->context;
+/* take out optimization
+	if(atomic_read(&lc->in_sync) == 1){
+		return lc->region_count;
+	}
+*/
+	if(consult_server(lc, 0, LRT_GET_SYNC_COUNT, &rtn)){
+		return 0;
+	}
+
+	if(rtn > lc->region_count){
+		DMERR("sync_count ("
+		      SECTOR_FORMAT
+		      ") > region_count ("
+		      SECTOR_FORMAT
+		      ") - (%s)!",
+		      rtn, lc->region_count, lc->uuid + (strlen(lc->uuid) - 8));
+	}
+
+	if(rtn >= lc->region_count){
+		atomic_set(&lc->in_sync, 1);
+	} else if(unlikely(atomic_read(&lc->in_sync) < 0)){
+		atomic_set(&lc->in_sync, 0);
+	}
+
+	return rtn;
+}
+
+/*
+ * cluster_status - report log state through the device-mapper status interface.
+ *
+ * @log:    dirty log instance being queried
+ * @status: STATUSTYPE_INFO for runtime state, STATUSTYPE_TABLE for the
+ *          constructor-style table line
+ * @result: output buffer; the DMEMIT macro appends here and advances 'sz'
+ * @maxlen: capacity of @result
+ *
+ * Returns the number of bytes written into @result.
+ */
+static int cluster_status(struct dirty_log *log, status_type_t status,
+			  char *result, unsigned int maxlen)
+{
+	int sz = 0;
+	int arg_count=3;
+	struct log_c *lc = (struct log_c *) log->context;
+
+	switch(status){
+	case STATUSTYPE_INFO:
+/*
+		spin_lock(&region_state_lock);
+		i = clear_region_count;
+		list_for_each_entry(rs, &marked_region_list, rs_list){
+			j++;
+		}
+		spin_unlock(&region_state_lock);
+
+		DMINFO("CLIENT OUTPUT::");
+		DMINFO("  My ID            : %u", my_id);
+		DMINFO("  Server ID        : %u", lc->server_id);
+
+		DMINFO("  In-sync          : %s", (atomic_read(&lc->in_sync)>0)?
+		       "YES" : "NO");
+		DMINFO("  Regions marked   : %d", j);
+		DMINFO("  Regions clearing : %d", i);
+
+		DMINFO("  Mark requests    : %d", mark_req);
+		if(mark_req)
+			DMINFO("  Mark req to serv : %d (%d%%)", mark_req2ser,
+			       (mark_req2ser*100)/mark_req);
+
+		DMINFO("  Clear requests   : %d", clear_req);
+		if(clear_req)
+			DMINFO("  Clear req to serv: %d (%d%%)", clear_req2ser,
+			       (clear_req2ser*100)/clear_req);
+
+		DMINFO("  Sync  requests   : %d", insync_req);
+		if(insync_req)
+			DMINFO("  Sync req to serv : %d (%d%%)", insync_req2ser,
+			       (insync_req2ser*100)/insync_req);
+
+		if(lc->server_id == my_id){
+			print_server_status(lc);
+		}
+*/
+		/* NOTE(review): this increment is dead code in the INFO branch;
+		** arg_count is never read below -- both DMEMITs hard-code the
+		** argument count ("3" / "1") instead. */
+		if(lc->sync != DEFAULTSYNC)
+			arg_count++;
+
+		/* 'D' = log device has failed, 'A' = alive */
+		if (lc->log_dev)
+			DMEMIT("3 %s %s %c",
+			       log->type->name,                  /* NAME */
+			       lc->log_dev->name,                /* THE LOG DEVICE */
+			       (lc->log_dev_failed)? 'D' : 'A'); /* LOG DEVICE LIVENESS */
+		else
+			DMEMIT("1 %s", log->type->name);
+
+                break;
+
+        case STATUSTYPE_TABLE:
+		if(lc->sync != DEFAULTSYNC)
+			arg_count++;
+
+		if (lc->log_dev) {
+			arg_count++;
+
+			DMEMIT("%s %u %s " SECTOR_FORMAT " %s ",
+			       log->type->name,                 /* NAME */
+			       arg_count,                       /* # OF ARGS */
+			       lc->log_dev->name,               /* THE LOG DEVICE */
+			       lc->region_size,                 /* REGION SIZE */
+			       lc->uuid);                       /* UUID */
+		} else {
+			DMEMIT("%s %u " SECTOR_FORMAT " %s ",
+			       log->type->name,                 /* NAME */
+			       arg_count,                       /* # OF ARGS */
+			       lc->region_size,                 /* REGION SIZE */
+			       lc->uuid);                       /* UUID */
+		}
+		if (lc->sync != DEFAULTSYNC)
+			DMEMIT("%ssync ", lc->sync == NOSYNC ? "no" : "");
+		DMEMIT("block_on_error ");
+        }
+
+	return sz;
+}
+
+/* Report how this log wants the mirror core to react to a device failure. */
+static int cluster_get_failure_response(struct dirty_log *log)
+{
+	const struct log_c *lc = (struct log_c *) log->context;
+
+	return lc->failure_response;
+}
+
+/*
+ * Cluster service 'stop' callback: quiesce client activity for the duration
+ * of a membership transition.
+ */
+static int clog_stop(void *data){
+	struct log_c *log_ctx;
+
+	/* Block new client requests until the transition finishes. */
+	atomic_set(&suspend_client, 1);
+
+	/* Every log must re-establish its sync state after the transition. */
+	list_for_each_entry(log_ctx, &log_list_head, log_list)
+		atomic_set(&log_ctx->in_sync, 0);
+
+	/* During module teardown the server is being stopped outright,
+	** so skip suspending it. */
+	if (likely(!shutting_down))
+		suspend_server();
+
+	return 0;
+}
+
+/*
+ * clog_start - cluster service 'start' callback for a membership transition.
+ *
+ * @nodeids:  new membership list; ownership transfers to us (the previous
+ *            list is kfree'd and this one is cached in global_nodeids)
+ * @count:    number of entries in @nodeids
+ * @event_id: transition id, saved for the later kcl_start_done()
+ * @type:     SERVICE_NODE_JOIN / SERVICE_NODE_LEAVE / SERVICE_NODE_FAILED
+ *
+ * NOTE(review): restart_event_id/type are written here in client context
+ * but consumed by the server; no synchronization is visible at this point
+ * -- confirm against the server-side code (this is the race the 2007-02-20
+ * commit message describes).
+ */
+static int clog_start(void *data, uint32_t *nodeids, int count, int event_id, int type){
+	int i;
+	uint32_t server;
+	struct log_c *lc;
+	struct kcl_cluster_node node;
+
+	if(global_nodeids){
+		kfree(global_nodeids);
+	}
+	global_nodeids = nodeids;
+	global_count = count;
+
+	/* presumably nodeid 0 means "look up the local node" -- TODO confirm */
+	kcl_get_node_by_nodeid(0, &node);
+	my_id = node.node_id;
+
+	restart_event_id = event_id;
+	restart_event_type = type;
+
+	switch(type){
+	case SERVICE_NODE_LEAVE:
+	case SERVICE_NODE_FAILED:
+		/* Invalidate the server for any log whose server left the
+		** cluster: 0xDEAD marks "no server, re-election needed". */
+		list_for_each_entry(lc, &log_list_head, log_list){
+			for(i=0, server = 0xDEAD; i < count; i++){
+				if(lc->server_id == nodeids[i]){
+					server = nodeids[i];
+				}
+			}
+			/* ATTENTION -- need locking around this ? */
+			lc->server_id = server;
+		}
+		break;
+	case SERVICE_NODE_JOIN:
+		break;
+	default:
+		DMERR("Invalid service event type received");
+		BUG();
+		break;
+	}
+	resume_server();
+	return 0;
+}
+
+/*
+ * clog_finish - cluster service 'finish' callback: the transition identified
+ * by @event_id has completed cluster-wide, so release any client requests
+ * that blocked while suspend_client was set.
+ */
+static void clog_finish(void *data, int event_id){
+	atomic_set(&suspend_client, 0);
+	wake_up_all(&suspend_client_queue);
+}
+
+/* Cluster service callbacks; cman invokes these around membership changes
+** (stop -> start -> finish). */
+static struct kcl_service_ops clog_ops = {
+	.stop = clog_stop,
+	.start = clog_start,
+	.finish = clog_finish,
+};
+
+/* Reference count of active mirror sets; only the first connect and the
+** last disconnect touch the cluster infrastructure. */
+static int mirror_set_count = 0; /* used to prevent multiple cluster [dis]connects */
+
+/*
+ * cluster_connect - reference-counted attachment to the cluster.
+ *
+ * The first caller registers the "clustered_log" service, starts the log
+ * server daemon and joins the service group; later callers only bump the
+ * reference count.
+ *
+ * Returns 0 on success, or the error from the failing step.
+ *
+ * Fix: the original left mirror_set_count incremented on failure, so a
+ * subsequent connect attempt would return 0 without ever registering or
+ * joining the service.  The count is now dropped again on every error path.
+ */
+static int cluster_connect(void)
+{
+	int r;
+
+	if (mirror_set_count++)
+		return 0;
+
+	/* 13 == strlen("clustered_log") */
+	r = kcl_register_service("clustered_log", 13, SERVICE_LEVEL_GDLM, &clog_ops,
+				 1, NULL, &local_id);
+	if (r) {
+		DMWARN("Couldn't register clustered_log service");
+		goto fail;
+	}
+
+	r = start_server();
+	if (r) {
+		DMWARN("Unable to start clustered log server daemon");
+		kcl_unregister_service(local_id);
+		goto fail;
+	}
+
+	r = kcl_join_service(local_id);
+	if (r) {
+		DMWARN("couldn't join service group");
+		stop_server();
+		kcl_unregister_service(local_id);
+		goto fail;
+	}
+
+	return 0;
+
+ fail:
+	/* Undo the reference taken above so a later connect retries setup. */
+	mirror_set_count--;
+	return r;
+}
+
+/*
+ * cluster_disconnect - drop one mirror-set reference; the last one tears
+ * down the server and leaves the service group.
+ *
+ * NOTE(review): assumes it is always paired with a successful
+ * cluster_connect(); an unmatched call would underflow mirror_set_count.
+ */
+static int cluster_disconnect(void)
+{
+	if (--mirror_set_count)
+		return 0;
+
+	/* By setting 'shutting_down', the server will not be suspended **
+	** when a stop is received */
+	shutting_down = 1;
+	kcl_leave_service(local_id);
+	stop_server();
+	kcl_unregister_service(local_id);
+
+	return 0;
+}
+
+/* In-memory (non-persistent) cluster-aware dirty log type. */
+static struct dirty_log_type _clustered_core_type = {
+	.name = "clustered_core",
+	.module = THIS_MODULE,
+	.ctr = cluster_core_ctr,
+	.dtr = cluster_dtr,
+	.presuspend = cluster_presuspend,
+	.postsuspend = cluster_postsuspend,
+	.resume = cluster_resume,
+	.get_region_size = cluster_get_region_size,
+	.is_clean = cluster_is_clean,
+	.is_remote_recovering = cluster_is_remote_recovering,
+	.in_sync = cluster_in_sync,
+	.flush = cluster_flush,
+	.mark_region = cluster_mark_region,
+	.clear_region = cluster_clear_region,
+	.get_resync_work = cluster_get_resync_work,
+	.complete_resync_work = cluster_complete_resync_work,
+	.get_sync_count = cluster_get_sync_count,
+	.status = cluster_status,
+	.get_failure_response = cluster_get_failure_response,
+};
+
+/* Disk-backed cluster-aware dirty log type; identical to the core type
+** except for its constructor. */
+static struct dirty_log_type _clustered_disk_type = {
+	.name = "clustered_disk",
+	.module = THIS_MODULE,
+	.ctr = cluster_disk_ctr,
+	.dtr = cluster_dtr,
+	.presuspend = cluster_presuspend,
+	.postsuspend = cluster_postsuspend,
+	.resume = cluster_resume,
+	.get_region_size = cluster_get_region_size,
+	.is_clean = cluster_is_clean,
+	.is_remote_recovering = cluster_is_remote_recovering,
+	.in_sync = cluster_in_sync,
+	.flush = cluster_flush,
+	.mark_region = cluster_mark_region,
+	.clear_region = cluster_clear_region,
+	.get_resync_work = cluster_get_resync_work,
+	.complete_resync_work = cluster_complete_resync_work,
+	.get_sync_count = cluster_get_sync_count,
+	.status = cluster_status,
+	.get_failure_response = cluster_get_failure_response,
+};
+
+#define CMIRROR_RELEASE_NAME "0.1.0"
+
+/*
+ * cluster_dirty_log_init - module entry point.
+ *
+ * Creates the region-state mempool and registers both cluster-aware dirty
+ * log types with device-mapper.
+ *
+ * Returns 0 on success or a negative errno; all resources acquired here
+ * are released on failure, so a failed insmod leaves nothing behind.
+ *
+ * Fix: the original leaked region_state_pool when either
+ * dm_register_dirty_log_type() call failed.
+ */
+static int __init cluster_dirty_log_init(void)
+{
+	int r;
+
+	DMINFO("dm-cmirror %s (built %s %s) installed",
+	       CMIRROR_RELEASE_NAME, __DATE__, __TIME__);
+
+	INIT_LIST_HEAD(&clear_region_list);
+	INIT_LIST_HEAD(&marked_region_list);
+
+	spin_lock_init(&region_state_lock);
+	region_state_pool = mempool_create(20, region_state_alloc,
+					   region_state_free, NULL);
+	if (!region_state_pool) {
+		DMWARN("couldn't create region state pool");
+		return -ENOMEM;
+	}
+
+	init_waitqueue_head(&suspend_client_queue);
+
+	r = dm_register_dirty_log_type(&_clustered_core_type);
+	if (r) {
+		DMWARN("couldn't register clustered_core dirty log type");
+		goto fail_core;
+	}
+
+	r = dm_register_dirty_log_type(&_clustered_disk_type);
+	if (r) {
+		DMWARN("couldn't register clustered_disk dirty log type");
+		goto fail_disk;
+	}
+
+	return 0;
+
+ fail_disk:
+	dm_unregister_dirty_log_type(&_clustered_core_type);
+ fail_core:
+	mempool_destroy(region_state_pool);
+	region_state_pool = NULL;
+	return r;
+}
+
+/*
+ * cluster_dirty_log_exit - module teardown.
+ *
+ * Refuses (BUG) to unload while any dirty log instance still exists, then
+ * unregisters both log types and releases the region-state mempool.
+ *
+ * Fix: the original never destroyed region_state_pool, leaking it on
+ * every module unload.
+ */
+static void __exit cluster_dirty_log_exit(void)
+{
+	if (!list_empty(&log_list_head)) {
+		DMERR("attempt to remove module, but dirty logs are still in place!");
+		DMERR("this is a fatal error");
+		BUG();
+	}
+	dm_unregister_dirty_log_type(&_clustered_core_type);
+	dm_unregister_dirty_log_type(&_clustered_disk_type);
+	mempool_destroy(region_state_pool);
+}
+
+/* Module entry points and metadata. */
+module_init(cluster_dirty_log_init);
+module_exit(cluster_dirty_log_exit);
+
+MODULE_DESCRIPTION(DM_NAME " cluster capable mirror logs (clustered mirroring)");
+MODULE_AUTHOR("Jonathan Brassow");
+MODULE_LICENSE("GPL");
+
+/*
+ * Overrides for Emacs so that we follow Linus's tabbing style.
+ * Emacs will notice this stuff at the end of the file and automatically
+ * adjust the settings for this buffer only.  This must remain at the end
+ * of the file.
+ * ---------------------------------------------------------------------------
+ * Local variables:
+ * c-file-style: "linux"
+ * End:
+ */
--- cluster/cmirror-kernel/src/Attic/dm-cmirror-cman.c	2005/07/27 16:09:31	1.1
+++ cluster/cmirror-kernel/src/Attic/dm-cmirror-cman.c	2006/06/15 19:34:41	1.1.2.4
@@ -0,0 +1,109 @@
+/*
+ * Copyright (C) 2005 Red Hat, Inc.
+ *
+ * This file is released under the LGPL.
+ */
+#include <linux/init.h>
+#include <linux/slab.h>
+#include <linux/module.h>
+#include <linux/vmalloc.h>
+#include <linux/list.h>
+#include <linux/net.h>
+#include <linux/in.h>
+#include <linux/socket.h>
+#include <linux/signal.h>
+#include <linux/bio.h>
+#include <linux/blkdev.h>
+#include <linux/device-mapper.h>
+#include <cluster/service.h>
+#include <cluster/cnxman.h>
+#include <cluster/cnxman-socket.h>
+
+#include "dm-log.h"
+#include "dm-cmirror-common.h"
+
+uint32_t local_id;		/* service id returned by kcl_register_service() */
+uint32_t my_id=0;		/* this node's cluster nodeid */
+int global_count=0;		/* number of entries in global_nodeids */
+uint32_t *global_nodeids=NULL;	/* membership list from the latest 'start' (owned here) */
+
+/* Bookkeeping for the in-progress cluster service transition. */
+int restart_event_type=0;
+int restart_event_id=0;
+
+/*
+ * nodeid_to_ipaddr - resolve a cluster nodeid to an IPv4 address.
+ *
+ * Returns the s_addr of the FIRST address on the node's address list
+ * (the loop below returns on its first iteration), or 0 on failure.
+ * NOTE(review): the returned value is sin_addr.s_addr, i.e. presumably
+ * network byte order -- callers must expect that.
+ */
+uint32_t nodeid_to_ipaddr(uint32_t nodeid){
+	struct cluster_node_addr *cna;
+	struct sockaddr_in *saddr;
+	struct list_head *list = kcl_get_node_addresses(nodeid);
+	uint32_t buff[8];
+	int i, memb_count;
+
+	if(!list){
+		/* Dump membership state to aid debugging, then back off
+		** briefly (HZ/4) -- presumably to ride out a membership
+		** transition before the caller retries. */
+		DMERR("No address list for nodeid %u", nodeid);
+		DMERR(" - Cluster is %squorate", kcl_is_quorate() ? "" : "not ");
+		memb_count = kcl_get_member_ids(buff, 8);
+		DMERR(" - There are %d members.", memb_count);
+		for (i = 0; i < memb_count; i++)
+			DMERR(" - %u", buff[i]);
+		set_current_state(TASK_INTERRUPTIBLE);
+		schedule_timeout(HZ/4);
+		return 0;
+	}
+		
+
+	list_for_each_entry(cna, list, list){
+		saddr = (struct sockaddr_in *)(&cna->addr);
+		return (uint32_t)(saddr->sin_addr.s_addr);
+	}
+	return 0;
+}
+
+/*
+ * ipaddr_to_nodeid - resolve a peer's IP address to its cluster nodeid.
+ *
+ * Builds a lookup key by copying one of the LOCAL node's addresses of the
+ * matching family (reusing its non-address fields as a template) and
+ * substituting the peer's IP, then asks cman for the owning node.
+ *
+ * Returns the nodeid, or 0 if no match was found.
+ *
+ * NOTE(review): 'tmp->addr' is used here without '&', while the sibling
+ * nodeid_to_ipaddr() uses '&cna->addr' -- both only work if addr is an
+ * array member; confirm against struct cluster_node_addr.
+ */
+uint32_t ipaddr_to_nodeid(struct sockaddr *addr){
+	struct list_head *addr_list;
+	struct kcl_cluster_node node;
+	struct cluster_node_addr *tmp;
+
+	if(!(addr_list = kcl_get_node_addresses(my_id))){
+		DMWARN("No address list available for %u\n", my_id);
+		goto fail;
+	}
+
+	if(addr->sa_family == AF_INET){
+		struct sockaddr_in a4;
+		struct sockaddr_in *tmp_addr;
+		list_for_each_entry(tmp, addr_list, list){
+			tmp_addr = (struct sockaddr_in *)tmp->addr;
+			if(tmp_addr->sin_family == AF_INET){
+				/* local address as template, peer's IP swapped in */
+				memcpy(&a4, tmp_addr, sizeof(a4));
+				memcpy(&a4.sin_addr,
+				       &((struct sockaddr_in *)addr)->sin_addr,
+				       sizeof(a4.sin_addr));
+				if(!kcl_get_node_by_addr((char *)&a4,
+							 sizeof(a4),
+							 &node)){
+					return node.node_id;
+				}
+			}
+		}
+	} else if(addr->sa_family == AF_INET6){
+		struct sockaddr_in6 a6;
+		struct sockaddr_in6 *tmp_addr;
+		list_for_each_entry(tmp, addr_list, list){
+			tmp_addr = (struct sockaddr_in6 *)tmp->addr;
+			if(tmp_addr->sin6_family == AF_INET6){
+				/* same template trick for IPv6 */
+				memcpy(&a6, tmp_addr, sizeof(a6));
+				memcpy(&a6.sin6_addr,
+				       &((struct sockaddr_in6 *)addr)->sin6_addr,
+				       sizeof(a6.sin6_addr));
+				if(!kcl_get_node_by_addr((char *)&a6,
+							 sizeof(a6),
+							 &node)){
+					return node.node_id;
+				}
+			}
+		}
+	}
+
+ fail:
+	DMWARN("Failed to convert IP address to nodeid.");
+	return 0;
+}



^ permalink raw reply	[flat|nested] 40+ messages in thread

* [Cluster-devel] cluster/cmirror-kernel/src dm-cmirror-client.c ...
@ 2006-06-15 19:34 jbrassow
  0 siblings, 0 replies; 40+ messages in thread
From: jbrassow @ 2006-06-15 19:34 UTC (permalink / raw)
  To: cluster-devel.redhat.com

CVSROOT:	/cvs/cluster
Module name:	cluster
Branch: 	RHEL4
Changes by:	jbrassow at sourceware.org	2006-06-15 19:34:41

Modified files:
	cmirror-kernel/src: dm-cmirror-client.c dm-cmirror-cman.c 

Log message:
	- last typo and a message clean-up

Patches:
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cmirror-kernel/src/dm-cmirror-client.c.diff?cvsroot=cluster&only_with_tag=RHEL4&r1=1.1.2.17&r2=1.1.2.18
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cmirror-kernel/src/dm-cmirror-cman.c.diff?cvsroot=cluster&only_with_tag=RHEL4&r1=1.1.2.3&r2=1.1.2.4



^ permalink raw reply	[flat|nested] 40+ messages in thread

* [Cluster-devel] cluster/cmirror-kernel/src dm-cmirror-client.c ...
@ 2006-06-13 16:26 jbrassow
  0 siblings, 0 replies; 40+ messages in thread
From: jbrassow @ 2006-06-13 16:26 UTC (permalink / raw)
  To: cluster-devel.redhat.com

CVSROOT:	/cvs/cluster
Module name:	cluster
Branch: 	RHEL4
Changes by:	jbrassow at sourceware.org	2006-06-13 16:26:15

Modified files:
	cmirror-kernel/src: dm-cmirror-client.c dm-cmirror-cman.c 
	                    dm-cmirror-server.c 

Log message:
	- downgrade some messages to INFO
	- serialize client requests to prevent getting wrong response
	- properly clear regions before shutting down a mirror
	- do not allocate memory while holding a spin lock
	- do not set sync bit multiple times if network failures occur

Patches:
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cmirror-kernel/src/dm-cmirror-client.c.diff?cvsroot=cluster&only_with_tag=RHEL4&r1=1.1.2.14&r2=1.1.2.15
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cmirror-kernel/src/dm-cmirror-cman.c.diff?cvsroot=cluster&only_with_tag=RHEL4&r1=1.1.2.2&r2=1.1.2.3
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cmirror-kernel/src/dm-cmirror-server.c.diff?cvsroot=cluster&only_with_tag=RHEL4&r1=1.1.2.7&r2=1.1.2.8



^ permalink raw reply	[flat|nested] 40+ messages in thread

end of thread, other threads:[~2007-10-03 19:02 UTC | newest]

Thread overview: 40+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2007-02-20 19:35 [Cluster-devel] cluster/cmirror-kernel/src dm-cmirror-client.c jbrassow
  -- strict thread matches above, loose matches on Subject: below --
2007-10-03 19:02 jbrassow
2007-09-27 20:31 jbrassow
2007-09-26  3:15 jbrassow
2007-09-21 20:07 jbrassow
2007-09-13 15:24 jbrassow
2007-07-11 16:18 jbrassow
2007-04-26 16:55 jbrassow
2007-04-26 16:54 jbrassow
2007-04-24 20:10 jbrassow
2007-04-24 20:08 jbrassow
2007-04-10  7:13 jbrassow
2007-04-10  7:12 jbrassow
2007-04-05 21:33 jbrassow
2007-04-05 21:32 jbrassow
2007-04-03 18:23 jbrassow
2007-04-03 18:21 jbrassow
2007-03-22 22:34 jbrassow
2007-03-22 22:22 jbrassow
2007-03-14  4:28 jbrassow
2007-02-26 17:38 jbrassow
2007-02-19 16:29 jbrassow
2007-02-14 17:44 jbrassow
2007-02-02 17:22 jbrassow
2007-01-08 19:28 jbrassow
2006-12-07 18:58 jbrassow
2006-09-05 17:50 jbrassow
2006-09-05 17:48 jbrassow
2006-07-27 23:11 jbrassow
2006-07-27 23:11 jbrassow
2006-07-22 22:19 jbrassow
2006-07-22 22:19 jbrassow
2006-07-22 22:12 jbrassow
2006-06-29 19:49 jbrassow
2006-06-29 19:48 jbrassow
2006-06-29 19:46 jbrassow
2006-06-27 20:19 jbrassow
2006-06-15 19:48 jbrassow
2006-06-15 19:34 jbrassow
2006-06-13 16:26 jbrassow

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.