All of lore.kernel.org
 help / color / mirror / Atom feed
* [Cluster-devel] cluster/cmirror-kernel/src dm-cmirror-client.c ...
@ 2007-02-20 19:35 jbrassow
  0 siblings, 0 replies; 40+ messages in thread
From: jbrassow @ 2007-02-20 19:35 UTC (permalink / raw)
  To: cluster-devel.redhat.com

CVSROOT:	/cvs/cluster
Module name:	cluster
Branch: 	RHEL4
Changes by:	jbrassow at sourceware.org	2007-02-20 19:35:10

Modified files:
	cmirror-kernel/src: dm-cmirror-client.c dm-cmirror-cman.c 
	                    dm-cmirror-cman.h dm-cmirror-server.c 

Log message:
	Bug 217895: lost election results from cmirror server cause mirror ...
	
	There was a race happening as a result of simultaneous cman issued
	'starts'.  The client receives the start requests, but the server
	processes them.  So, it was possible for the server to reset the
	event id/type while the client was trying to set them.  This would
	cause the next kcl_start_done command issued by the server to fail.
	
	The bug can be interpreted many different ways depending on which
	machine in the cluster you are looking at when it happens.
	
	The fix was to have the client wait to set the event id/type until
	it knows the server has completed the previous request.
	
	This fix may resolve other bugs as well, but I will test them
	individually.

Patches:
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cmirror-kernel/src/dm-cmirror-client.c.diff?cvsroot=cluster&only_with_tag=RHEL4&r1=1.1.2.37&r2=1.1.2.38
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cmirror-kernel/src/dm-cmirror-cman.c.diff?cvsroot=cluster&only_with_tag=RHEL4&r1=1.1.2.5&r2=1.1.2.6
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cmirror-kernel/src/dm-cmirror-cman.h.diff?cvsroot=cluster&only_with_tag=RHEL4&r1=1.1.2.1&r2=1.1.2.2
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cmirror-kernel/src/dm-cmirror-server.c.diff?cvsroot=cluster&only_with_tag=RHEL4&r1=1.1.2.22&r2=1.1.2.23

--- cluster/cmirror-kernel/src/Attic/dm-cmirror-client.c	2007/02/19 16:29:42	1.1.2.37
+++ cluster/cmirror-kernel/src/Attic/dm-cmirror-client.c	2007/02/20 19:35:10	1.1.2.38
@@ -45,6 +45,7 @@
 static int shutting_down=0;
 static atomic_t suspend_client;
 static wait_queue_head_t suspend_client_queue;
+static wait_queue_head_t event_queue;
 
 static DECLARE_MUTEX(consult_server_lock);
 
@@ -1228,8 +1229,11 @@
 	kcl_get_node_by_nodeid(0, &node);
 	my_id = node.node_id;
 
+	/* Wait for any outstanding starts to complete */
+	suspend_on(&event_queue, atomic_read(&restart_event_type));
+
 	restart_event_id = event_id;
-	restart_event_type = type;
+	atomic_set(&restart_event_type, type);
 
 	switch(type){
 	case SERVICE_NODE_LEAVE:
@@ -1391,6 +1395,7 @@
 	}
 
 	init_waitqueue_head(&suspend_client_queue);
+	init_waitqueue_head(&event_queue);
 
 	r = dm_register_dirty_log_type(&_clustered_core_type);
 	if (r) {
--- cluster/cmirror-kernel/src/Attic/dm-cmirror-cman.c	2006/06/15 19:48:00	1.1.2.5
+++ cluster/cmirror-kernel/src/Attic/dm-cmirror-cman.c	2007/02/20 19:35:10	1.1.2.6
@@ -27,7 +27,7 @@
 int global_count=0;
 uint32_t *global_nodeids=NULL;
 
-int restart_event_type=0;
+atomic_t restart_event_type = ATOMIC_INIT(0);
 int restart_event_id=0;
 
 uint32_t nodeid_to_ipaddr(uint32_t nodeid){
--- cluster/cmirror-kernel/src/Attic/dm-cmirror-cman.h	2005/07/27 16:09:31	1.1.2.1
+++ cluster/cmirror-kernel/src/Attic/dm-cmirror-cman.h	2007/02/20 19:35:10	1.1.2.2
@@ -12,7 +12,7 @@
 extern int global_count;
 extern uint32_t *global_nodeids;
 
-extern int restart_event_type;
+extern atomic_t restart_event_type;
 extern int restart_event_id;
 
 uint32_t nodeid_to_ipaddr(uint32_t nodeid);
--- cluster/cmirror-kernel/src/Attic/dm-cmirror-server.c	2007/02/19 16:29:42	1.1.2.22
+++ cluster/cmirror-kernel/src/Attic/dm-cmirror-server.c	2007/02/20 19:35:10	1.1.2.23
@@ -1067,7 +1067,7 @@
 		}
 
 		suspend_on(&_suspend_queue, atomic_read(&_suspend));
-		switch(restart_event_type){
+		switch(atomic_read(&restart_event_type)){
 		case SERVICE_NODE_LEAVE:
 			/* ATTENTION -- may wish to check if regions **
 			** are still in use by this node.  For now,  **
@@ -1076,7 +1076,7 @@
 			** leaving node, it won't hurt anything - and**
 			** if there is, they will be recovered.      */
 		case SERVICE_NODE_FAILED:
-			if (restart_event_type == SERVICE_NODE_FAILED)
+			if (atomic_read(&restart_event_type) == SERVICE_NODE_FAILED)
 				DMINFO("A cluster mirror log member has failed.");
 			
 			list_for_each_entry(lc, &log_list_head, log_list){
@@ -1095,10 +1095,13 @@
 		}
 		
 		
-		if(restart_event_type){
+		if(atomic_read(&restart_event_type)){
 			/* finish the start phase */
 			kcl_start_done(local_id, restart_event_id);
-			restart_event_id = restart_event_type = 0;
+			restart_event_id = 0;
+
+			/* Trigger any waiting starts to proceed */
+			atomic_set(&restart_event_type, 0);
 		} else if (atomic_read(&_do_requests)) {
 			/* ATTENTION -- what to do with error ? */
 			if(process_log_request(sock))



^ permalink raw reply	[flat|nested] 40+ messages in thread

* [Cluster-devel] cluster/cmirror-kernel/src dm-cmirror-client.c ...
@ 2007-10-03 19:02 jbrassow
  0 siblings, 0 replies; 40+ messages in thread
From: jbrassow @ 2007-10-03 19:02 UTC (permalink / raw)
  To: cluster-devel.redhat.com

CVSROOT:	/cvs/cluster
Module name:	cluster
Branch: 	RHEL4
Changes by:	jbrassow at sourceware.org	2007-10-03 19:02:52

Modified files:
	cmirror-kernel/src: dm-cmirror-client.c dm-cmirror-server.c 

Log message:
	Bug 316031: dm-mirror: incorrect order of mirror presuspend ops caus...
	
	With kernel (dm-raid1.c) presuspend changes, we can now tell when
	recovery has been shutdown and when we can allow blocked writes.
	
	This should fix a hang issue when converting from one mirror type
	to another.

Patches:
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cmirror-kernel/src/dm-cmirror-client.c.diff?cvsroot=cluster&only_with_tag=RHEL4&r1=1.1.2.54&r2=1.1.2.55
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cmirror-kernel/src/dm-cmirror-server.c.diff?cvsroot=cluster&only_with_tag=RHEL4&r1=1.1.2.40&r2=1.1.2.41

--- cluster/cmirror-kernel/src/Attic/dm-cmirror-client.c	2007/09/27 20:31:18	1.1.2.54
+++ cluster/cmirror-kernel/src/Attic/dm-cmirror-client.c	2007/10/03 19:02:51	1.1.2.55
@@ -592,6 +592,8 @@
 			 * should start out suspended.
 			 */
 			atomic_set(&lc->suspended, 1);
+			lc->recovery_halted = 1;
+			DMDEBUG("Secondary log... suspended and recovery_halted");
 		}
 	}
 
@@ -764,6 +766,11 @@
 
 static int cluster_presuspend(struct dirty_log *log)
 {
+	struct log_c *lc = (struct log_c *) log->context;
+
+	DMDEBUG("cluster_presuspend: recovery halted on %s(%d)",
+		lc->uuid + (strlen(lc->uuid) - 8), lc->uuid_ref);
+	lc->recovery_halted = 1;
 	return 0;
 }
 
@@ -834,8 +841,9 @@
 	struct log_c *lc = (struct log_c *) log->context;
 
 	lc->sync_search = 0;
-	lc->recovery_halted = 0;
 	resume_server_requests();
+	DMDEBUG("cluster_resume: Setting recovery_halted = 0");
+	lc->recovery_halted = 0;
 	atomic_set(&lc->suspended, 0);
 
 	return 0;
@@ -1253,6 +1261,8 @@
 						(atomic_read(&tmp_lc->in_sync)) ? "YES" : "NO");
 					DMDEBUG("  suspended   : %s",
 						(atomic_read(&tmp_lc->suspended)) ? "YES" : "NO");
+					DMDEBUG("  recovery_halted : %s",
+						(tmp_lc->recovery_halted) ? "YES" : "NO");
 					DMDEBUG("  server_id   : %u", tmp_lc->server_id);
 					DMDEBUG("  server_valid: %s",
 						((tmp_lc->server_id != 0xDEAD) &&
--- cluster/cmirror-kernel/src/Attic/dm-cmirror-server.c	2007/09/27 20:31:18	1.1.2.40
+++ cluster/cmirror-kernel/src/Attic/dm-cmirror-server.c	2007/10/03 19:02:51	1.1.2.41
@@ -468,9 +468,15 @@
 		lr->u.lr_int_rtn = 1;
 
 		/* Try to make this region a priority */
+		/*
 		if ((lr->u.lr_region != lc->recovering_region) &&
 		    (lc->recovering_next == (uint64_t)-1))
 			lc->recovering_next = lr->u.lr_region;
+		*/
+		if ((lr->u.lr_region != lc->recovering_region) &&
+		    ((lc->recovering_next == (uint64_t)-1) ||
+		     (lc->recovering_next > lr->u.lr_region)))
+			lc->recovering_next = lr->u.lr_region;		
 		return 0;
 	}
 
@@ -728,6 +734,7 @@
 	 * failed.  In this case, there will not be a record for
 	 * the region.
 	 */
+	DMDEBUG("server_complete_resync_work - Setting recovery_halted = 1");
 	lc->recovery_halted = 1;
 
 	ru = find_ru(lc, who, lr->u.lr_region);



^ permalink raw reply	[flat|nested] 40+ messages in thread

* [Cluster-devel] cluster/cmirror-kernel/src dm-cmirror-client.c ...
@ 2007-09-27 20:31 jbrassow
  0 siblings, 0 replies; 40+ messages in thread
From: jbrassow @ 2007-09-27 20:31 UTC (permalink / raw)
  To: cluster-devel.redhat.com

CVSROOT:	/cvs/cluster
Module name:	cluster
Branch: 	RHEL4
Changes by:	jbrassow at sourceware.org	2007-09-27 20:31:20

Modified files:
	cmirror-kernel/src: dm-cmirror-client.c dm-cmirror-common.h 
	                    dm-cmirror-server.c 

Log message:
	Bug 290821: cmirror write path appears deadlocked after recovery ...
	
	In some device failure cases, regions must be marked 'out-of-sync' -
	this was causing a following write to block because it thought the
	region had not yet been recovered - when in fact, it had just been
	put out-of-sync due to failing device.

Patches:
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cmirror-kernel/src/dm-cmirror-client.c.diff?cvsroot=cluster&only_with_tag=RHEL4&r1=1.1.2.53&r2=1.1.2.54
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cmirror-kernel/src/dm-cmirror-common.h.diff?cvsroot=cluster&only_with_tag=RHEL4&r1=1.1.2.15&r2=1.1.2.16
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cmirror-kernel/src/dm-cmirror-server.c.diff?cvsroot=cluster&only_with_tag=RHEL4&r1=1.1.2.39&r2=1.1.2.40

--- cluster/cmirror-kernel/src/Attic/dm-cmirror-client.c	2007/09/26 03:15:40	1.1.2.53
+++ cluster/cmirror-kernel/src/Attic/dm-cmirror-client.c	2007/09/27 20:31:18	1.1.2.54
@@ -773,6 +773,7 @@
 	struct region_state *rs, *tmp_rs;
 	struct log_c *lc = (struct log_c *) log->context;
 
+	DMDEBUG("cluster_postsuspend");
 	spin_lock(&lc->state_lock);
 	if (!list_empty(&lc->mark_waiting)) {
 		DMERR("Mark requests remain at postsuspend!");
@@ -833,6 +834,7 @@
 	struct log_c *lc = (struct log_c *) log->context;
 
 	lc->sync_search = 0;
+	lc->recovery_halted = 0;
 	resume_server_requests();
 	atomic_set(&lc->suspended, 0);
 
@@ -861,7 +863,7 @@
 {
 	int rtn;
 	struct log_c *lc = (struct log_c *) log->context;
- 	 
+	 
 	if (atomic_read(&lc->in_sync) == 1) {
 		return 0;
 	}
@@ -1170,6 +1172,10 @@
 	region_t rtn;
 	struct log_c *lc = (struct log_c *) log->context;
 
+	if (atomic_read(&lc->suspended)) {
+		return (atomic_read(&lc->in_sync)) ? lc->region_count : 0;
+	}
+
 	/* Try to get sync count up to five times */
 	for (i = 0; i < 5 && consult_server(lc, 0, LRT_GET_SYNC_COUNT, &rtn); i++);
 	if(i >= 5){
@@ -1226,6 +1232,7 @@
 		DMDEBUG(" ?sync_search : %d", lc->sync_search);
 		DMDEBUG("  in_sync     : %s", (atomic_read(&lc->in_sync)) ? "YES" : "NO");
 		DMDEBUG("  suspended   : %s", (atomic_read(&lc->suspended)) ? "YES" : "NO");
+		DMDEBUG("  recovery_halted : %s", (lc->recovery_halted) ? "YES" : "NO");
 		DMDEBUG("  server_id   : %u", lc->server_id);
 		DMDEBUG("  server_valid: %s",
 			((lc->server_id != 0xDEAD) && lc->server_valid) ? "YES" : "NO");
--- cluster/cmirror-kernel/src/Attic/dm-cmirror-common.h	2007/09/26 03:15:40	1.1.2.15
+++ cluster/cmirror-kernel/src/Attic/dm-cmirror-common.h	2007/09/27 20:31:18	1.1.2.16
@@ -102,6 +102,7 @@
 
 	int sync_pass;          /* number of passes attempting to resync */
 	int sync_search;
+	int recovery_halted;    /* only useful for is_remote_recovering */
 
 	/* Resync flag */
 	enum sync {
--- cluster/cmirror-kernel/src/Attic/dm-cmirror-server.c	2007/09/26 03:15:40	1.1.2.39
+++ cluster/cmirror-kernel/src/Attic/dm-cmirror-server.c	2007/09/27 20:31:18	1.1.2.40
@@ -451,6 +451,14 @@
 	if ((lc->sync_search > lc->region_count) && !lc->sync_pass)
 		return 0;
 
+	if (lc->recovery_halted &&
+	    (lc->recovering_region != lr->u.lr_region)) {
+		DMDEBUG("Recovery halted, allowing client: %Lu/%s",
+			lr->u.lr_region,
+			lc->uuid + (strlen(lc->uuid) - 8));
+		return 0;
+	}
+
 	/*
 	 * If the region hasn't been recovered yet,
 	 * we need to block the write
@@ -598,6 +606,12 @@
 
 	lr->u.lr_int_rtn = 0; /* Default to no work */
 
+	if (lc->recovery_halted) {
+		DMDEBUG("Recovery halted due to error on %s",
+			lc->uuid + (strlen(lc->uuid) - 8));
+		return 0;
+	}
+
 	if (lc->recovering_region != (uint64_t)-1) {
 		DMDEBUG("Someone is already recovering region %Lu/%s",
 			lc->recovering_region, lc->uuid + (strlen(lc->uuid) - 8));
@@ -704,11 +718,18 @@
 	/*
 	 * Recovery failed or mirror is being marked out-of-sync
 	 *
+	 * We need to stop dishing out recovery work.  If we don't
+	 * writes happening to NOSYNC regions can't proceed and the
+	 * mirror won't be able to suspend for reconfiguration - due
+	 * to the return of is_remote_recovering().
+	 *
 	 * We can recieve multiple calls to mark out-of-sync
 	 * if there were several writes to the same region that
 	 * failed.  In this case, there will not be a record for
 	 * the region.
 	 */
+	lc->recovery_halted = 1;
+
 	ru = find_ru(lc, who, lr->u.lr_region);
 
 	if ((lr->u.lr_region == lc->recovering_region) && !ru) {
@@ -873,8 +894,14 @@
 	 * New node joins and needs to know I am the server
 	 * We shortcut the election here and respond directly
 	 * to the inquirer
-	 */
+	 *
 	if((lc->server_id == my_id) && !atomic_read(&lc->suspended)){
+	*/
+	if (lc->server_id == my_id) {
+		if (atomic_read(&lc->suspended)) {
+			DMDEBUG("I'm suspended, but still responding as server: %s",
+				lc->uuid + (strlen(lc->uuid) - 8));
+		}
 		lr->u.lr_coordinator = my_id;
 		if(!(saddr->sin_addr.s_addr = nodeid_to_ipaddr(lr->u.lr_starter))){
 			return -1;



^ permalink raw reply	[flat|nested] 40+ messages in thread

* [Cluster-devel] cluster/cmirror-kernel/src dm-cmirror-client.c ...
@ 2007-09-26  3:15 jbrassow
  0 siblings, 0 replies; 40+ messages in thread
From: jbrassow @ 2007-09-26  3:15 UTC (permalink / raw)
  To: cluster-devel.redhat.com

CVSROOT:	/cvs/cluster
Module name:	cluster
Branch: 	RHEL4
Changes by:	jbrassow at sourceware.org	2007-09-26 03:15:41

Modified files:
	cmirror-kernel/src: dm-cmirror-client.c dm-cmirror-common.h 
	                    dm-cmirror-server.c 

Log message:
	Bug 291521: Cluster mirror can become out-of-sync if nominal I/O overl...
	
	Another touch-up for this bug.
	
	Bad news:
	Because a node can cache the state of a region indefinitely (especially for
	blocks that are used a lot - e.g. a journaling area of a file system), we must
	deny writes to any region of the mirror that is not yet recovered.  This is only
	the case with cluster mirroring.  This means poor performance of nominal I/O
	during recovery - probably really bad performance.  However, this is absolutely
	necessary for mirror reliability.
	
	Good news:
	The time I spent coding different fixes for this bug wasn't a complete waste.
	I've been able to reuse some of that code to optimize the recovery process.
	Now, rather than going through the mirror from front to back, it skips ahead to
	recover regions that have pending writes.  Bottom line: performance will be bad
	during recovery, but it will be better than RHEL4.5.

Patches:
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cmirror-kernel/src/dm-cmirror-client.c.diff?cvsroot=cluster&only_with_tag=RHEL4&r1=1.1.2.52&r2=1.1.2.53
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cmirror-kernel/src/dm-cmirror-common.h.diff?cvsroot=cluster&only_with_tag=RHEL4&r1=1.1.2.14&r2=1.1.2.15
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cmirror-kernel/src/dm-cmirror-server.c.diff?cvsroot=cluster&only_with_tag=RHEL4&r1=1.1.2.38&r2=1.1.2.39

--- cluster/cmirror-kernel/src/Attic/dm-cmirror-client.c	2007/09/21 20:07:37	1.1.2.52
+++ cluster/cmirror-kernel/src/Attic/dm-cmirror-client.c	2007/09/26 03:15:40	1.1.2.53
@@ -142,6 +142,7 @@
 	lc->sync_count = (sync == NOSYNC) ? region_count : 0;
 
 	lc->recovering_region = (uint64_t)-1;
+	lc->recovering_next = (uint64_t)-1;
 	lc->sync_search = 0;
 	log->context = lc;
 	return 0;
--- cluster/cmirror-kernel/src/Attic/dm-cmirror-common.h	2007/04/10 07:12:24	1.1.2.14
+++ cluster/cmirror-kernel/src/Attic/dm-cmirror-common.h	2007/09/26 03:15:40	1.1.2.15
@@ -98,6 +98,7 @@
 	uint32_t *clean_bits;
 	uint32_t *sync_bits;
 	uint64_t recovering_region;
+	uint64_t recovering_next;
 
 	int sync_pass;          /* number of passes attempting to resync */
 	int sync_search;
--- cluster/cmirror-kernel/src/Attic/dm-cmirror-server.c	2007/09/21 20:07:37	1.1.2.38
+++ cluster/cmirror-kernel/src/Attic/dm-cmirror-server.c	2007/09/26 03:15:40	1.1.2.39
@@ -446,18 +446,25 @@
 
 static int server_is_remote_recovering(struct log_c *lc, struct log_request *lr)
 {
-	uint64_t high, low;
+	lr->u.lr_int_rtn = 0;
 
-	high = lc->sync_search + 10;
-	low = (lc->recovering_region != (uint64_t)-1) ?
-		lc->recovering_region :
-		lc->sync_search;
-	if ((lr->u.lr_region >= low) && (lr->u.lr_region <= high)) {
-		DMDEBUG("Remote recovery conflict: %Lu/%s",
-			lr->u.lr_region, lc->uuid + (strlen(lc->uuid) - 8));
+	if ((lc->sync_search > lc->region_count) && !lc->sync_pass)
+		return 0;
+
+	/*
+	 * If the region hasn't been recovered yet,
+	 * we need to block the write
+	 */
+	if (!log_test_bit(lc->sync_bits, lr->u.lr_region) ||
+	    (lc->recovering_region == lr->u.lr_region)) {
 		lr->u.lr_int_rtn = 1;
-	} else
-		lr->u.lr_int_rtn = 0;
+
+		/* Try to make this region a priority */
+		if ((lr->u.lr_region != lc->recovering_region) &&
+		    (lc->recovering_next == (uint64_t)-1))
+			lc->recovering_next = lr->u.lr_region;
+		return 0;
+	}
 
 	return 0;
 }
@@ -542,7 +549,9 @@
 	}
 
 	if (!find_ru_by_region(lc, lr->u.lr_region)) {
-		log_set_bit(lc, lc->clean_bits, lr->u.lr_region);
+		/* Only clear the region if it is also in sync */
+		if (log_test_bit(lc->sync_bits, lr->u.lr_region))
+			log_set_bit(lc, lc->clean_bits, lr->u.lr_region);
 	} else if (check_bug) {
 		DMERR("Multiple marks exist on a region being recovered: %Lu/%s",
 		      lr->u.lr_region, lc->uuid + (strlen(lc->uuid) - 8));
@@ -608,26 +617,45 @@
 		}
 	}
 
-	*region = ext2_find_next_zero_bit((unsigned long *) lc->sync_bits,
-					  lc->region_count,
-					  lc->sync_search);
-	if (find_ru_by_region(lc, *region)) {
-		DMDEBUG("Recovery blocked by outstanding write on region %Lu/%s",
-			*region, lc->uuid + (strlen(lc->uuid) - 8));
-		return 0;
-	}
+	DMDEBUG("Priority recovery region: %Lu/%s",
+		lc->recovering_next, lc->uuid + (strlen(lc->uuid) - 8));
 
-	if (*region >= lc->region_count)
-		return 0;
+	if ((lc->recovering_next != (uint64_t)-1) &&
+	    (!log_test_bit(lc->sync_bits, lc->recovering_next))) {
+		new = mempool_alloc(region_user_pool, GFP_NOFS);
+		if (!new)
+			return -ENOMEM;
+		*region = lc->recovering_region = lc->recovering_next;
+		DMDEBUG("Preempting normal recovery work for preferred region...");
+	} else {
+		*region = ext2_find_next_zero_bit((unsigned long *) lc->sync_bits,
+						  lc->region_count,
+						  lc->sync_search);
+		if (find_ru_by_region(lc, *region)) {
+			/*
+			 * We disallow writes to regions that have not yet been
+			 * recovered via is_remote_recovering(), so this should
+			 * not happen.
+			 */
+			DMERR("Recovery blocked by outstanding write on region %Lu/%s",
+			      *region, lc->uuid + (strlen(lc->uuid) - 8));
+			BUG();
+			return 0;
+		}
 
-	new = mempool_alloc(region_user_pool, GFP_NOFS);
-	if (!new)
-		return -ENOMEM;
+		if (*region >= lc->region_count)
+			return 0;
 
-	lc->sync_search = *region + 1;
+		new = mempool_alloc(region_user_pool, GFP_NOFS);
+		if (!new)
+			return -ENOMEM;
 
-	lc->recovering_region = *region;
+		lc->sync_search = *region + 1;
+
+		lc->recovering_region = *region;
+	}
 
+	lc->recovering_next = (uint64_t)-1;
 	lr->u.lr_int_rtn = 1; /* Assigning work */
 	new->ru_nodeid = who;
 	new->ru_region = *region;



^ permalink raw reply	[flat|nested] 40+ messages in thread

* [Cluster-devel] cluster/cmirror-kernel/src dm-cmirror-client.c ...
@ 2007-09-21 20:07 jbrassow
  0 siblings, 0 replies; 40+ messages in thread
From: jbrassow @ 2007-09-21 20:07 UTC (permalink / raw)
  To: cluster-devel.redhat.com

CVSROOT:	/cvs/cluster
Module name:	cluster
Branch: 	RHEL4
Changes by:	jbrassow at sourceware.org	2007-09-21 20:07:37

Modified files:
	cmirror-kernel/src: dm-cmirror-client.c dm-cmirror-server.c 
	                    dm-cmirror-xfr.h 

Log message:
	Bug 291521: Cluster mirror can become out-of-sync if nominal I/O overla...
	
	It is insufficient to simply delay flush requests that have marks
	pending to a recovering region.  Although a collision between nominal
	I/O and resync I/O can be avoided this way, the state of the region
	changes from RH_NOSYNC to RH_CLEAN in the mean time.  The machine
	being delayed will think the region is still in the RH_NOSYNC state
	and only write to the primary device... leaving the other mirror
	devices out-of-sync.
	
	We must delay writes to remotely recovering regions before the state
	of the region is determined and cached in the region caching code...
	The entry point for this already exists in 'is_remote_recovering'.

Patches:
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cmirror-kernel/src/dm-cmirror-client.c.diff?cvsroot=cluster&only_with_tag=RHEL4&r1=1.1.2.51&r2=1.1.2.52
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cmirror-kernel/src/dm-cmirror-server.c.diff?cvsroot=cluster&only_with_tag=RHEL4&r1=1.1.2.37&r2=1.1.2.38
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cmirror-kernel/src/dm-cmirror-xfr.h.diff?cvsroot=cluster&only_with_tag=RHEL4&r1=1.1.2.6&r2=1.1.2.7

--- cluster/cmirror-kernel/src/Attic/dm-cmirror-client.c	2007/09/13 15:24:20	1.1.2.51
+++ cluster/cmirror-kernel/src/Attic/dm-cmirror-client.c	2007/09/21 20:07:37	1.1.2.52
@@ -858,7 +858,15 @@
 
 static int cluster_is_remote_recovering(struct dirty_log *log, region_t region)
 {
-	return 0;
+	int rtn;
+	struct log_c *lc = (struct log_c *) log->context;
+ 	 
+	if (atomic_read(&lc->in_sync) == 1) {
+		return 0;
+	}
+
+	rtn = consult_server(lc, region, LRT_IS_REMOTE_RECOVERING, NULL);
+	return rtn;
 }
 
 static int cluster_in_sync(struct dirty_log *log, region_t region, int block)
--- cluster/cmirror-kernel/src/Attic/dm-cmirror-server.c	2007/09/13 15:24:20	1.1.2.37
+++ cluster/cmirror-kernel/src/Attic/dm-cmirror-server.c	2007/09/21 20:07:37	1.1.2.38
@@ -444,6 +444,24 @@
 	return 0;
 }
 
+static int server_is_remote_recovering(struct log_c *lc, struct log_request *lr)
+{
+	uint64_t high, low;
+
+	high = lc->sync_search + 10;
+	low = (lc->recovering_region != (uint64_t)-1) ?
+		lc->recovering_region :
+		lc->sync_search;
+	if ((lr->u.lr_region >= low) && (lr->u.lr_region <= high)) {
+		DMDEBUG("Remote recovery conflict: %Lu/%s",
+			lr->u.lr_region, lc->uuid + (strlen(lc->uuid) - 8));
+		lr->u.lr_int_rtn = 1;
+	} else
+		lr->u.lr_int_rtn = 0;
+
+	return 0;
+}
+
 static int server_in_sync(struct log_c *lc, struct log_request *lr)
 {
 	if (lr->u.lr_region > lc->region_count) {
@@ -485,51 +503,28 @@
 		list_add(&new->ru_list, &lc->region_users);
 	} else if (ru->ru_rw == RU_RECOVER) {
 		/*
-		 * The flush will block if a write conflicts with a
-		 * recovering region.  In the meantime, we add this
-		 * entry to the tail of the list so the recovery
-		 * gets cleared first.
-		 */
-		DMDEBUG("Attempt to mark a region " SECTOR_FORMAT
-		      "/%s which is being recovered.",
-		       lr->u.lr_region, lc->uuid + (strlen(lc->uuid) - 8));
-		DMDEBUG("Current recoverer: %u", ru->ru_nodeid);
-		DMDEBUG("Mark requester   : %u", who);
-		log_clear_bit(lc, lc->clean_bits, lr->u.lr_region);
-		list_add_tail(&new->ru_list, &lc->region_users);
+		 * A mark that happens to a region in recovery
+		 * means certain corruption.
+		 */
+		DMERR("Mark attempted to recovering region by %u: %Lu/%s",
+		      who, lr->u.lr_region,
+		      lc->uuid + (strlen(lc->uuid) - 8));
+		DMERR("  lc->recovering_region = %Lu", lc->recovering_region);
+		DMERR("  ru->ru_rw             = %d", ru->ru_rw);
+		DMERR("  ru->ru_nodeid         = %u", ru->ru_nodeid);
+		DMERR("  ru->ru_region         = %Lu", ru->ru_region);
+		BUG();
 	} else {
 		list_add(&new->ru_list, &ru->ru_list);
 	}
 
-	/*
-	if (!(ru = find_ru_by_region(lc, lr->u.lr_region))) {
-		log_clear_bit(lc, lc->clean_bits, lr->u.lr_region);
-		list_add(&new->ru_list, &lc->region_users);
-	} else if (ru->ru_rw == RU_RECOVER) {
-		DMDEBUG("Attempt to mark a region " SECTOR_FORMAT
-		      "/%s which is being recovered.",
-		       lr->u.lr_region, lc->uuid + (strlen(lc->uuid) - 8));
-		DMDEBUG("Current recoverer: %u", ru->ru_nodeid);
-		DMDEBUG("Mark requester   : %u", who);
-		log_clear_bit(lc, lc->clean_bits, lr->u.lr_region);
-		list_add_tail(&new->ru_list, &lc->region_users);
-	} else if (!find_ru(lc, who, lr->u.lr_region)) {
-		list_add(&new->ru_list, &ru->ru_list);
-	} else {
-		DMWARN("Attempt to mark a already marked region (%u,"
-		       SECTOR_FORMAT
-		       "/%s)",
-		       who, lr->u.lr_region, lc->uuid + (strlen(lc->uuid) - 8));
-		mempool_free(new, region_user_pool);
-	}
-	*/
-
 	return 0;
 }
 
 
 static int server_clear_region(struct log_c *lc, struct log_request *lr, uint32_t who)
 {
+	int check_bug = 0;
 	struct region_user *ru;
 
 	ru = find_ru(lc, who, lr->u.lr_region);
@@ -538,13 +533,22 @@
 		       who, lr->u.lr_region);
 		return -EINVAL;
 	} else {
+		if (lc->recovering_region == lr->u.lr_region) {
+			lc->recovering_region = (uint64_t)-1;
+			check_bug = 1;
+		}
 		list_del(&ru->ru_list);
 		mempool_free(ru, region_user_pool);
 	}
 
-	if(!find_ru_by_region(lc, lr->u.lr_region)){
+	if (!find_ru_by_region(lc, lr->u.lr_region)) {
 		log_set_bit(lc, lc->clean_bits, lr->u.lr_region);
+	} else if (check_bug) {
+		DMERR("Multiple marks exist on a region being recovered: %Lu/%s",
+		      lr->u.lr_region, lc->uuid + (strlen(lc->uuid) - 8));
+		BUG();
 	}
+		
 	return 0;
 }
 
@@ -552,37 +556,17 @@
 static int server_flush(struct log_c *lc, uint32_t who)
 {
 	int r = 0;
-	int count = 0;
-	int do_flush = 1;
-	struct region_user *ru, *marker = NULL, *recoverer = NULL;
+	struct region_user *ru;
 
 	if (lc->recovering_region != (uint64_t)-1) {
-		list_for_each_entry(ru, &lc->region_users, ru_list)
-			if (ru->ru_region == lc->recovering_region) {
-				if (ru->ru_rw == RU_RECOVER)
-					recoverer = ru;
-				else if (ru->ru_nodeid == who) {
-					do_flush = 0;
-					marker = ru;
-				} else
-					marker = ru;
-
-				count++;
-			}
-
-		if (marker && recoverer) {
-			DMDEBUG("Flush/recovery collision on %Lu/%s: count = %d, marker = %u, recoverer = %u",
-				marker->ru_region, lc->uuid + (strlen(lc->uuid) - 8),
-				count, marker->ru_nodeid, recoverer->ru_nodeid);
-			DMDEBUG("  Count     = %d", count);
-			DMDEBUG("  Marker    = %u", marker->ru_nodeid);
-			DMDEBUG("  Recoverer = %u", recoverer->ru_nodeid);
-			DMDEBUG("  Flusher   = %u", who);
-			if (!do_flush) {
-				DMDEBUG("Blocking flush");
-				return -EBUSY;
-			} else
-				DMDEBUG("Allowing flush");
+		list_for_each_entry(ru, &lc->region_users, ru_list) {
+			if ((ru->ru_region == lc->recovering_region) &&
+			    (ru->ru_rw != RU_RECOVER)) {
+				DMERR("Flush attempted to recovering region by %u: %Lu/%s",
+				      who, lc->recovering_region,
+				      lc->uuid + (strlen(lc->uuid) - 8));
+				BUG();
+			}
 		}
 	}
 
@@ -601,7 +585,6 @@
 static int server_get_resync_work(struct log_c *lc, struct log_request *lr, uint32_t who)
 {
 	struct region_user *new;
-	int sync_search, conflict = 0;
 	region_t *region = &(lr->u.lr_region_rtn);
 
 	lr->u.lr_int_rtn = 0; /* Default to no work */
@@ -625,19 +608,13 @@
 		}
 	}
 
-	for (sync_search = lc->sync_search;
-	     sync_search < lc->region_count;
-	     sync_search = (*region + 1)) {
-		*region = ext2_find_next_zero_bit((unsigned long *) lc->sync_bits,
-						  lc->region_count,
-						  sync_search);
-		if (find_ru_by_region(lc, *region)) {
-			conflict = 1;
-			DMDEBUG("Recovery blocked by outstanding write on region %Lu/%s",
-				*region, lc->uuid + (strlen(lc->uuid) - 8));
-		} else {
-			break;
-		}
+	*region = ext2_find_next_zero_bit((unsigned long *) lc->sync_bits,
+					  lc->region_count,
+					  lc->sync_search);
+	if (find_ru_by_region(lc, *region)) {
+		DMDEBUG("Recovery blocked by outstanding write on region %Lu/%s",
+			*region, lc->uuid + (strlen(lc->uuid) - 8));
+		return 0;
 	}
 
 	if (*region >= lc->region_count)
@@ -647,8 +624,7 @@
 	if (!new)
 		return -ENOMEM;
 
-	if (!conflict)
-		lc->sync_search = *region + 1;
+	lc->sync_search = *region + 1;
 
 	lc->recovering_region = *region;
 
@@ -678,13 +654,18 @@
 			return -EINVAL;
 		}
 
-		lc->recovering_region = (uint64_t)-1;
-
 		/* We could receive multiple identical request due to network failure */
-		if(!log_test_bit(lc->sync_bits, lr->u.lr_region)) {
+		if (!log_test_bit(lc->sync_bits, lr->u.lr_region)) {
 			log_set_bit(lc, lc->sync_bits, lr->u.lr_region);
 			lc->sync_count++;
 		}
+
+		/*
+		 * We will: 
+		 * lc->recovering_region = (uint64_t)-1;
+		 * in clear_region so we can do extra validation
+		 */
+
 		lc->sync_pass = 0;
 
 		DMDEBUG("Resync work completed by %u: %Lu/%s",
@@ -1064,6 +1045,9 @@
 		case LRT_IS_CLEAN:
 			error = server_is_clean(lc, lr);
 			break;
+		case LRT_IS_REMOTE_RECOVERING:
+			error = server_is_remote_recovering(lc, lr);
+			break;
 		case LRT_IN_SYNC:
 			error = server_in_sync(lc, lr);
 			break;
--- cluster/cmirror-kernel/src/Attic/dm-cmirror-xfr.h	2007/07/11 16:18:03	1.1.2.6
+++ cluster/cmirror-kernel/src/Attic/dm-cmirror-xfr.h	2007/09/21 20:07:37	1.1.2.7
@@ -10,18 +10,19 @@
 #define MAX_NAME_LEN 128
 
 #define LRT_IS_CLEAN			1
-#define LRT_IN_SYNC             	2
-#define LRT_MARK_REGION         	3
-#define LRT_CLEAR_REGION        	4
-#define LRT_FLUSH                       5
-#define LRT_GET_RESYNC_WORK     	6
-#define LRT_COMPLETE_RESYNC_WORK        7
-#define LRT_GET_SYNC_COUNT      	8
-
-#define LRT_ELECTION			9
-#define LRT_SELECTION			10
-#define LRT_MASTER_ASSIGN		11
-#define LRT_MASTER_LEAVING		12
+#define LRT_IS_REMOTE_RECOVERING	2
+#define LRT_IN_SYNC             	3
+#define LRT_MARK_REGION         	4
+#define LRT_CLEAR_REGION        	5
+#define LRT_FLUSH                       6
+#define LRT_GET_RESYNC_WORK     	7
+#define LRT_COMPLETE_RESYNC_WORK        8
+#define LRT_GET_SYNC_COUNT      	9
+
+#define LRT_ELECTION			10
+#define LRT_SELECTION			11
+#define LRT_MASTER_ASSIGN		12
+#define LRT_MASTER_LEAVING		13
 
 #define CLUSTER_LOG_PORT 51005
 



^ permalink raw reply	[flat|nested] 40+ messages in thread

* [Cluster-devel] cluster/cmirror-kernel/src dm-cmirror-client.c ...
@ 2007-09-13 15:24 jbrassow
  0 siblings, 0 replies; 40+ messages in thread
From: jbrassow @ 2007-09-13 15:24 UTC (permalink / raw)
  To: cluster-devel.redhat.com

CVSROOT:	/cvs/cluster
Module name:	cluster
Branch: 	RHEL4
Changes by:	jbrassow at sourceware.org	2007-09-13 15:24:20

Modified files:
	cmirror-kernel/src: dm-cmirror-client.c dm-cmirror-server.c 

Log message:
	Bug 257881: Flush/recovery collision leads to deadlock after leg ...
	
	The procedure for coordinating nominal I/O and recovery I/O, was to
	either:
	1) delay a flush which contained a mark to a region being recovered
	2) skip over regions that are currently marked when assigning recovery
	
	This bug has to do with the way #1 was implemented.
	
	The following scenario would trigger it:
	1) node1 is assigned recovery on region X
	2) node1 also does a mark (write) on region Y
	3) node2 attempts to mark region X
	**) any flush issued here will delay waiting for recovery to complete on X
	4) node1 needs to perform the flush before it can get on with completing
	recovery - but it can't flush, so everyone is delayed *forever*.
	
	The fix was to allow flushes from nodes that are not attempting to mark
	regions that are being recovered.  In the example above, node1 should be
	allowed to complete the flush because it is not trying to write to the
	same region that is being recovered.  node2 would be correctly delayed.
	Since node1 can complete the flush, it can also complete the recovery -
	thus allowing things to proceed.
	
	This bug only affects mirrors that are not in-sync and are doing I/O.
	This bug can occur whether there are device/machine failures or not.
	This bug is most easily reproduced with a number of mirrors, but would
	be possible with just one.
	
	I've also fixed up some debugging output so it is more consistent and
	easier to follow the flow of events.

Patches:
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cmirror-kernel/src/dm-cmirror-client.c.diff?cvsroot=cluster&only_with_tag=RHEL4&r1=1.1.2.50&r2=1.1.2.51
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cmirror-kernel/src/dm-cmirror-server.c.diff?cvsroot=cluster&only_with_tag=RHEL4&r1=1.1.2.36&r2=1.1.2.37

--- cluster/cmirror-kernel/src/Attic/dm-cmirror-client.c	2007/08/23 16:51:39	1.1.2.50
+++ cluster/cmirror-kernel/src/Attic/dm-cmirror-client.c	2007/09/13 15:24:20	1.1.2.51
@@ -946,7 +946,8 @@
 
 	while ((r = consult_server(lc, 0, LRT_FLUSH, NULL))) {
 		if (r == -EBUSY) {
-			DMDEBUG("Delaying flush due to recovery");
+			DMDEBUG("Delaying flush due to recovery (%s)",
+				lc->uuid + (strlen(lc->uuid) - 8));
 			set_current_state(TASK_INTERRUPTIBLE);
 			schedule_timeout(HZ);
 			continue;
@@ -1110,8 +1111,8 @@
 	}
 
 	if (rtn)
-		DMDEBUG("Client received resync work: %Lu/%s",
-			*region, lc->uuid + (strlen(lc->uuid) - 8));
+		DMDEBUG("Received recovery work from %u: %Lu/%s",
+			lc->server_id, *region, lc->uuid + (strlen(lc->uuid) - 8));
 
 	/*
 	 * Check for bug 235039
@@ -1137,12 +1138,19 @@
 	region_t success_tmp = success;
 	struct log_c *lc = (struct log_c *) log->context;
 
+	if (success)
+		DMDEBUG("Client finishing recovery: %Lu/%s",
+			region, lc->uuid + (strlen(lc->uuid) - 8));
+	else
+		DMDEBUG("Notifying server(%u) of sync change: %Lu/%s",
+			lc->server_id, region,
+			lc->uuid + (strlen(lc->uuid) - 8));
 	for (i = 0; i < 5; i++) {
 		if (!consult_server(lc, region,
 				    LRT_COMPLETE_RESYNC_WORK, &success_tmp))
 			break;
 		success_tmp = success;
-		DMWARN("Unable to notify server of completed resync work");
+		DMWARN("Unable to notify server of sync state change");
 	}
 	return;
 }
@@ -1203,6 +1211,7 @@
 		DMDEBUG("LOG INFO:");
 		DMDEBUG("  uuid: %s", lc->uuid);
 		DMDEBUG("  uuid_ref    : %d", lc->uuid_ref);
+		DMDEBUG("  log type    : %s", (lc->log_dev) ? "disk" : "core");
 		DMDEBUG(" ?region_count: %Lu", lc->region_count);
 		DMDEBUG(" ?sync_count  : %Lu", lc->sync_count);
 		DMDEBUG(" ?sync_search : %d", lc->sync_search);
--- cluster/cmirror-kernel/src/Attic/dm-cmirror-server.c	2007/07/11 16:18:03	1.1.2.36
+++ cluster/cmirror-kernel/src/Attic/dm-cmirror-server.c	2007/09/13 15:24:20	1.1.2.37
@@ -549,10 +549,11 @@
 }
 
 
-static int server_flush(struct log_c *lc)
+static int server_flush(struct log_c *lc, uint32_t who)
 {
 	int r = 0;
 	int count = 0;
+	int do_flush = 1;
 	struct region_user *ru, *marker = NULL, *recoverer = NULL;
 
 	if (lc->recovering_region != (uint64_t)-1) {
@@ -560,22 +561,28 @@
 			if (ru->ru_region == lc->recovering_region) {
 				if (ru->ru_rw == RU_RECOVER)
 					recoverer = ru;
-				else
+				else if (ru->ru_nodeid == who) {
+					do_flush = 0;
 					marker = ru;
+				} else
+					marker = ru;
+
 				count++;
 			}
 
 		if (marker && recoverer) {
-			DMDEBUG("Flush/recovery collision (%Lu/%s): count = %d, marker = %u, recoverer = %u",
+			DMDEBUG("Flush/recovery collision on %Lu/%s: count = %d, marker = %u, recoverer = %u",
 				marker->ru_region, lc->uuid + (strlen(lc->uuid) - 8),
 				count, marker->ru_nodeid, recoverer->ru_nodeid);
-			/*
-			DMDEBUG("  sync_bit: %s, clean_bit: %s",
-				log_test_bit(lc->sync_bits, lc->recovering_region) ? "set" : "unset",
-				log_test_bit(lc->clean_bits, lc->recovering_region) ? "set" : "unset");
-			*/
-
-			return -EBUSY;
+			DMDEBUG("  Count     = %d", count);
+			DMDEBUG("  Marker    = %u", marker->ru_nodeid);
+			DMDEBUG("  Recoverer = %u", recoverer->ru_nodeid);
+			DMDEBUG("  Flusher   = %u", who);
+			if (!do_flush) {
+				DMDEBUG("Blocking flush");
+				return -EBUSY;
+			} else
+				DMDEBUG("Allowing flush");
 		}
 	}
 
@@ -650,8 +657,8 @@
 	new->ru_region = *region;
 	new->ru_rw = RU_RECOVER;
 	list_add(&new->ru_list, &lc->region_users);
-	DMDEBUG("Assigning recovery work to %u/%s: %Lu",
-		who, lc->uuid + (strlen(lc->uuid) - 8), new->ru_region);
+	DMDEBUG("Assigning recovery work to %u: %Lu/%s",
+		who, new->ru_region, lc->uuid + (strlen(lc->uuid) - 8));
 
 	return 0;
 }
@@ -680,7 +687,8 @@
 		}
 		lc->sync_pass = 0;
 
-		DMDEBUG("Resync work completed: %Lu", lr->u.lr_region);
+		DMDEBUG("Resync work completed by %u: %Lu/%s",
+			who, lr->u.lr_region, lc->uuid + (strlen(lc->uuid) - 8));
 		return 0;
 	}
 
@@ -1077,7 +1085,12 @@
 			error = server_clear_region(lc, lr, nodeid);
 			break;
 		case LRT_FLUSH:
-			error = server_flush(lc);
+			if(!(nodeid = 
+			     ipaddr_to_nodeid((struct sockaddr *)msg.msg_name))){
+				error = -ENXIO;
+				break;
+			}
+			error = server_flush(lc, nodeid);
 			break;
 		case LRT_GET_RESYNC_WORK:
 			if(!(nodeid = 



^ permalink raw reply	[flat|nested] 40+ messages in thread

* [Cluster-devel] cluster/cmirror-kernel/src dm-cmirror-client.c ...
@ 2007-07-11 16:18 jbrassow
  0 siblings, 0 replies; 40+ messages in thread
From: jbrassow @ 2007-07-11 16:18 UTC (permalink / raw)
  To: cluster-devel.redhat.com

CVSROOT:	/cvs/cluster
Module name:	cluster
Branch: 	RHEL4
Changes by:	jbrassow at sourceware.org	2007-07-11 16:18:03

Modified files:
	cmirror-kernel/src: dm-cmirror-client.c dm-cmirror-server.c 
	                    dm-cmirror-xfr.h 

Log message:
	Bug 238629: dm-cmirror: Remote recovery conflict...
	
	The kernel changes are now in place (marking/clearing the log
	during writes to nosync regions) to allow nominal I/O to
	regions that have yet to be recovered.
	
	Also moved around some debugging messages and removed
	'is_remote_recovering()'.  (is_remote_recovering is obviated by
	the new mechanism for handling recovery/write ordering.)

Patches:
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cmirror-kernel/src/dm-cmirror-client.c.diff?cvsroot=cluster&only_with_tag=RHEL4&r1=1.1.2.48&r2=1.1.2.49
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cmirror-kernel/src/dm-cmirror-server.c.diff?cvsroot=cluster&only_with_tag=RHEL4&r1=1.1.2.35&r2=1.1.2.36
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cmirror-kernel/src/dm-cmirror-xfr.h.diff?cvsroot=cluster&only_with_tag=RHEL4&r1=1.1.2.5&r2=1.1.2.6

--- cluster/cmirror-kernel/src/Attic/dm-cmirror-client.c	2007/05/09 21:44:34	1.1.2.48
+++ cluster/cmirror-kernel/src/Attic/dm-cmirror-client.c	2007/07/11 16:18:03	1.1.2.49
@@ -286,8 +286,8 @@
 		lc->server_id = lr.u.lr_coordinator;
 	} else {
 		/* ATTENTION -- what do we do with this ? */
-		DMWARN("Failed to receive election results from server: (%s,%d)",
-		       lc->uuid + (strlen(lc->uuid) - 8), len);
+		DMWARN("Failed to receive election results from server: (%s/%d,%d)",
+		       lc->uuid + (strlen(lc->uuid) - 8), lc->uuid_ref, len);
 		error = len;
 	}
 
@@ -601,6 +601,7 @@
 	INIT_LIST_HEAD(&lc->mark_logged);
 	spin_lock_init(&lc->state_lock);
 
+	atomic_set(&lc->suspended, 1);
 	lc->server_valid = 0;
 	lc->server_id = 0xDEAD;
 
@@ -853,15 +854,7 @@
 
 static int cluster_is_remote_recovering(struct dirty_log *log, region_t region)
 {
-	int rtn;
-	struct log_c *lc = (struct log_c *) log->context;
-
-	if(atomic_read(&lc->in_sync) == 1){
-		return 0;
-	}
-
-	rtn = consult_server(lc, region, LRT_IS_REMOTE_RECOVERING, NULL);
-	return rtn;
+	return 0;
 }
 
 static int cluster_in_sync(struct dirty_log *log, region_t region, int block)
@@ -977,57 +970,43 @@
 
 	spin_lock(&lc->state_lock);
 
-
 	/*
-	 * It is possible for the following in the mirror code:
-	 *  0) Mark is already logged for a region
-	 *  1) rh_dec, sets region state to RH_CLEAN (asynchronous)
-	 *  2) rh_update_states (DOESN'T FLUSH!!!, bug #235040)
-	 *  3) do_writes, trys to mark region
-	 *
-	 * The following shouldn't have to be handled b/c of the flush
-	 *  0) Region finishes recovery
-	 *  1) rh_update_states clears region (DOES FLUSH)
-	 *  2) do_writes, trys to mark region
-	 *
-	 * This can lead to this next case being valid.
+	 * An item on the clear_waiting list should have been flushed
+	 * before getting this mark_region call.
 	 */
-	list_for_each_entry_safe(rs, tmp_rs, &lc->clear_waiting, rs_list) {
+	list_for_each_entry_safe(rs, tmp_rs, &lc->clear_waiting, rs_list)
 		if (region == rs->rs_region) {
-			if (!rs->rs_mark_logged) {
-				DMERR("Moving region(%Lu/%s) from clear_waiting -> mark_waiting",
-				      region, lc->uuid + (strlen(lc->uuid) - 8));
-			}
-			list_del_init(&rs->rs_list);
-			list_add(&rs->rs_list,
-				 (rs->rs_mark_logged) ?
-				 &lc->mark_logged : &lc->mark_waiting);
-			goto out;
+			DMERR("Region being marked found on clear_waiting list (%Lu/%s)",
+			      region, lc->uuid + (strlen(lc->uuid) - 8));
+			BUG();
 		}
-	}
+
+	/*
+	 * We should never get two marks before a flush, unless the
+	 * region is not in-sync.  One valid scenario would be:
+	 *  0) region not in-sync
+	 *  1) rh_inc (mark region)
+	 *  2) rh_update_states
+	 *  3) rh_dec (dec pending and put on clean_region list)
+	 *  4) do_writes -> rh_inc (second mark)
+	 */
+	list_for_each_entry_safe(rs, tmp_rs, &lc->mark_waiting, rs_list)
+		if (region == rs->rs_region)
+			goto out;
 
 	/*
 	 * It is possible for the following in the mirror code:
 	 *  0) Mark is already logged for a region
-	 *  1) rh_update_states
+	 *  1) rh_update_states (were the clear_region happens)
 	 *  2) rh_dec, sets region state to RH_CLEAN (asynchronous)
 	 *  3) do_writes, trys to mark region
 	 *
 	 * This can lead to this next case being valid.
 	 */
-	list_for_each_entry_safe(rs, tmp_rs, &lc->mark_logged, rs_list){
-		if (region == rs->rs_region) {
+	list_for_each_entry_safe(rs, tmp_rs, &lc->mark_logged, rs_list)
+		if (region == rs->rs_region)
 			goto out;
-		}
-	}
 
-	list_for_each_entry_safe(rs, tmp_rs, &lc->mark_waiting, rs_list){
-		if (region == rs->rs_region) {
-			DMERR("Mark already waiting (%Lu/%s)",
-			      region, lc->uuid + (strlen(lc->uuid) - 8));
-			BUG();
-		}
-	}
 	spin_unlock(&lc->state_lock);
 
 	rs_new = mempool_alloc(region_state_pool, GFP_NOFS);
@@ -1074,14 +1053,13 @@
 	 * 6) we recover the region
 	 * 7) clearing the region after recovery causes us to get here
 	 *
-	 * Once 235040 is cleared, any entries found in this list should
-	 * cause a bug.
+	 * Bug 235040 cleared.  This should no longer happen.
 	 */
 	list_for_each_entry_safe(rs, tmp_rs, &lc->clear_waiting, rs_list){
 		if(region == rs->rs_region){
-			DMERR("%d) Double clear on region ("
-			      SECTOR_FORMAT ")", __LINE__, region);
-			goto out;
+			DMERR("Double clear on region (%Lu/%s)",
+			      region, lc->uuid + (strlen(lc->uuid) - 8));
+			BUG();
 		}
 	}
 
@@ -1140,7 +1118,7 @@
 		list_for_each_entry_safe(rs, tmp_rs, &lc->clear_waiting, rs_list) {
 			if (*region == rs->rs_region) {
 				DMERR("WARNING: Bug 235039/235040 detected!");
-				DMERR("Work-around in place.");
+				BUG();
 			}
 		}
 	}
@@ -1204,16 +1182,6 @@
 
 	switch(status){
 	case STATUSTYPE_INFO:
-		DMDEBUG("LOG INFO:");
-		DMDEBUG("  uuid: %s", lc->uuid);
-		DMDEBUG("  uuid_ref    : %d", lc->uuid_ref);
-		DMDEBUG(" ?region_count: %Lu", lc->region_count);
-		DMDEBUG(" ?sync_count  : %Lu", lc->sync_count);
-		DMDEBUG(" ?sync_search : %d", lc->sync_search);
-		DMDEBUG("  in_sync     : %s", (atomic_read(&lc->in_sync)) ? "YES" : "NO");
-		DMDEBUG("  server_id   : %u", lc->server_id);
-		DMDEBUG("  server_valid: %s",
-			((lc->server_id != 0xDEAD) && lc->server_valid) ? "YES" : "NO");
 		if(lc->sync != DEFAULTSYNC)
 			arg_count++;
 
@@ -1228,6 +1196,42 @@
                 break;
 
         case STATUSTYPE_TABLE:
+		DMDEBUG("LOG INFO:");
+		DMDEBUG("  uuid: %s", lc->uuid);
+		DMDEBUG("  uuid_ref    : %d", lc->uuid_ref);
+		DMDEBUG(" ?region_count: %Lu", lc->region_count);
+		DMDEBUG(" ?sync_count  : %Lu", lc->sync_count);
+		DMDEBUG(" ?sync_search : %d", lc->sync_search);
+		DMDEBUG("  in_sync     : %s", (atomic_read(&lc->in_sync)) ? "YES" : "NO");
+		DMDEBUG("  suspended   : %s", (atomic_read(&lc->suspended)) ? "YES" : "NO");
+		DMDEBUG("  server_id   : %u", lc->server_id);
+		DMDEBUG("  server_valid: %s",
+			((lc->server_id != 0xDEAD) && lc->server_valid) ? "YES" : "NO");
+		{
+			struct log_c *tmp_lc;
+
+			down(&log_list_lock);
+			list_for_each_entry(tmp_lc, &log_list_head, log_list){
+				if(!strncmp(tmp_lc->uuid, lc->uuid, MAX_NAME_LEN) &&
+				   (tmp_lc->uuid_ref != lc->uuid_ref)){
+					DMDEBUG("LOG INFO [COPY FOUND]:");
+					DMDEBUG("  uuid: %s", tmp_lc->uuid);
+					DMDEBUG("  uuid_ref    : %d", tmp_lc->uuid_ref);
+					DMDEBUG(" ?region_count: %Lu", tmp_lc->region_count);
+					DMDEBUG(" ?sync_count  : %Lu", tmp_lc->sync_count);
+					DMDEBUG(" ?sync_search : %d", tmp_lc->sync_search);
+					DMDEBUG("  in_sync     : %s",
+						(atomic_read(&tmp_lc->in_sync)) ? "YES" : "NO");
+					DMDEBUG("  suspended   : %s",
+						(atomic_read(&tmp_lc->suspended)) ? "YES" : "NO");
+					DMDEBUG("  server_id   : %u", tmp_lc->server_id);
+					DMDEBUG("  server_valid: %s",
+						((tmp_lc->server_id != 0xDEAD) &&
+						 tmp_lc->server_valid) ? "YES" : "NO");
+				}
+			}
+			up(&log_list_lock);
+		}
 		if(lc->sync != DEFAULTSYNC)
 			arg_count++;
 
--- cluster/cmirror-kernel/src/Attic/dm-cmirror-server.c	2007/04/26 16:54:49	1.1.2.35
+++ cluster/cmirror-kernel/src/Attic/dm-cmirror-server.c	2007/07/11 16:18:03	1.1.2.36
@@ -221,54 +221,6 @@
 	return count;
 }
 
-struct region_user *find_ru_by_region(struct log_c *lc, region_t region);
-static int _core_get_resync_work(struct log_c *lc, region_t *region)
-{
-	int sync_search, conflict = 0;
-
-	if (lc->recovering_region != (uint64_t)-1) {
-		DMDEBUG("Someone is already recovering region %Lu/%s",
-			lc->recovering_region, lc->uuid + (strlen(lc->uuid) - 8));
-		return 0;
-	}
-
-	if (lc->sync_search >= lc->region_count) {
-		/*
-		 * FIXME: pvmove is not supported yet, but when it is,
-		 * an audit of sync_count changes will need to be made
-		 */
-		if ((lc->sync_count < lc->region_count) && !lc->sync_pass) {
-			lc->sync_search = 0;
-			lc->sync_pass++;
-		} else {
-			return 0;
-		}
-	}
-	for (sync_search = lc->sync_search;
-	     sync_search < lc->region_count;
-	     sync_search = (*region + 1)) {
-		*region = ext2_find_next_zero_bit((unsigned long *) lc->sync_bits,
-						  lc->region_count,
-						  sync_search);
-		if (find_ru_by_region(lc, *region)) {
-			conflict = 1;
-			DMDEBUG("Recovery blocked by outstanding write on region %Lu/%s",
-				*region, lc->uuid + (strlen(lc->uuid) - 8));
-		} else {
-			break;
-		}
-	}
-	if (!conflict)
-		lc->sync_search = *region + 1;
-
-	if (*region >= lc->region_count)
-		return 0;
-
-	lc->recovering_region = *region;
-	return 1;
-}
-
-
 static int print_zero_bits(unsigned char *str, int offset, int bit_count){
 	int i,j;
 	int count=0;
@@ -492,39 +444,6 @@
 	return 0;
 }
 
-static int server_is_remote_recovering(struct log_c *lc, struct log_request *lr)
-{
-	region_t region;
-	struct region_user *ru;
-
-	/*
-	 * This gets a bit complicated.  I wish we didn't have to use this
-	 * function, but because the mirror code doesn't mark regions which
-	 * it writes to that are out-of-sync, we need this function.
-	 *
-	 * Problem is, we don't know how long the user is going to take to
-	 * write to the region after they have called this function.  So,
-	 * we are forced at this point to deny any writes to regions we
-	 * are recovering or might recover in the future.
-	 *
-	 * We can count on the client side to not send us one of these
-	 * requests if the mirror is known to be in-sync.
-	 *
-	 * And yes, it sucks to take this much time off the I/O.
-	 */
-	region = ext2_find_next_zero_bit((unsigned long *) lc->sync_bits,
-					 lc->region_count, 0);
-
-	if (lr->u.lr_region >= region) {
-		DMDEBUG("Remote recovery conflict: (%Lu >= %Lu)/%s",
-			lr->u.lr_region, region, lc->uuid + (strlen(lc->uuid) - 8));
-		lr->u.lr_int_rtn = 1;
-	} else
-		lr->u.lr_int_rtn = 0;
-
-	return 0;
-}
-
 static int server_in_sync(struct log_c *lc, struct log_request *lr)
 {
 	if (lr->u.lr_region > lc->region_count) {
@@ -555,7 +474,13 @@
 	new->ru_region = lr->u.lr_region;
 	new->ru_rw = RU_WRITE;
 
-	if (!(ru = find_ru_by_region(lc, lr->u.lr_region))) {
+	if (find_ru(lc, who, lr->u.lr_region)) {
+		DMWARN("Attempt to mark a already marked region (%u,"
+		       SECTOR_FORMAT
+		       "/%s)",
+		       who, lr->u.lr_region, lc->uuid + (strlen(lc->uuid) - 8));
+		mempool_free(new, region_user_pool);
+	} else if (!(ru = find_ru_by_region(lc, lr->u.lr_region))) {
 		log_clear_bit(lc, lc->clean_bits, lr->u.lr_region);
 		list_add(&new->ru_list, &lc->region_users);
 	} else if (ru->ru_rw == RU_RECOVER) {
@@ -572,6 +497,22 @@
 		DMDEBUG("Mark requester   : %u", who);
 		log_clear_bit(lc, lc->clean_bits, lr->u.lr_region);
 		list_add_tail(&new->ru_list, &lc->region_users);
+	} else {
+		list_add(&new->ru_list, &ru->ru_list);
+	}
+
+	/*
+	if (!(ru = find_ru_by_region(lc, lr->u.lr_region))) {
+		log_clear_bit(lc, lc->clean_bits, lr->u.lr_region);
+		list_add(&new->ru_list, &lc->region_users);
+	} else if (ru->ru_rw == RU_RECOVER) {
+		DMDEBUG("Attempt to mark a region " SECTOR_FORMAT
+		      "/%s which is being recovered.",
+		       lr->u.lr_region, lc->uuid + (strlen(lc->uuid) - 8));
+		DMDEBUG("Current recoverer: %u", ru->ru_nodeid);
+		DMDEBUG("Mark requester   : %u", who);
+		log_clear_bit(lc, lc->clean_bits, lr->u.lr_region);
+		list_add_tail(&new->ru_list, &lc->region_users);
 	} else if (!find_ru(lc, who, lr->u.lr_region)) {
 		list_add(&new->ru_list, &ru->ru_list);
 	} else {
@@ -581,6 +522,7 @@
 		       who, lr->u.lr_region, lc->uuid + (strlen(lc->uuid) - 8));
 		mempool_free(new, region_user_pool);
 	}
+	*/
 
 	return 0;
 }
@@ -611,35 +553,28 @@
 {
 	int r = 0;
 	int count = 0;
-	struct region_user *ru, *ru2;
+	struct region_user *ru, *marker = NULL, *recoverer = NULL;
 
 	if (lc->recovering_region != (uint64_t)-1) {
 		list_for_each_entry(ru, &lc->region_users, ru_list)
-			if (ru->ru_region == lc->recovering_region)
-				count++;
-
-		if (count > 1) {
-			list_for_each_entry(ru, &lc->region_users, ru_list)
+			if (ru->ru_region == lc->recovering_region) {
 				if (ru->ru_rw == RU_RECOVER)
-					break;
+					recoverer = ru;
+				else
+					marker = ru;
+				count++;
+			}
 
-			DMDEBUG("Flush includes region which is being recovered (%u/%Lu).  Delaying...",
-				ru->ru_nodeid, ru->ru_region);
-			DMDEBUG("Recovering region: %Lu", lc->recovering_region);
+		if (marker && recoverer) {
+			DMDEBUG("Flush/recovery collision (%Lu/%s): count = %d, marker = %u, recoverer = %u",
+				marker->ru_region, lc->uuid + (strlen(lc->uuid) - 8),
+				count, marker->ru_nodeid, recoverer->ru_nodeid);
+			/*
 			DMDEBUG("  sync_bit: %s, clean_bit: %s",
 				log_test_bit(lc->sync_bits, lc->recovering_region) ? "set" : "unset",
 				log_test_bit(lc->clean_bits, lc->recovering_region) ? "set" : "unset");
+			*/
 
-			list_for_each_entry(ru2, &lc->region_users, ru_list)
-				if (ru->ru_region == ru2->ru_region)
-					DMDEBUG("  %s", (ru2->ru_rw == RU_RECOVER) ? "recover" :
-						(ru2->ru_rw == RU_WRITE) ? "writer" : "unknown");
-
-			/* FIXME: work-around for bug 235040 */
-			DMDEBUG("Revoking resync work");
-			lc->recovering_region = (uint64_t)-1;
-			list_del(&ru->ru_list);
-			mempool_free(ru, region_user_pool);
 			return -EBUSY;
 		}
 	}
@@ -1121,9 +1056,6 @@
 		case LRT_IS_CLEAN:
 			error = server_is_clean(lc, lr);
 			break;
-		case LRT_IS_REMOTE_RECOVERING:
-			error = server_is_remote_recovering(lc, lr);
-			break;
 		case LRT_IN_SYNC:
 			error = server_in_sync(lc, lr);
 			break;
--- cluster/cmirror-kernel/src/Attic/dm-cmirror-xfr.h	2007/04/24 20:08:57	1.1.2.5
+++ cluster/cmirror-kernel/src/Attic/dm-cmirror-xfr.h	2007/07/11 16:18:03	1.1.2.6
@@ -10,19 +10,18 @@
 #define MAX_NAME_LEN 128
 
 #define LRT_IS_CLEAN			1
-#define LRT_IS_REMOTE_RECOVERING	2
-#define LRT_IN_SYNC             	3
-#define LRT_MARK_REGION         	4
-#define LRT_CLEAR_REGION        	5
-#define LRT_FLUSH                       6
-#define LRT_GET_RESYNC_WORK     	7
-#define LRT_COMPLETE_RESYNC_WORK        8
-#define LRT_GET_SYNC_COUNT      	9
-
-#define LRT_ELECTION			10
-#define LRT_SELECTION			11
-#define LRT_MASTER_ASSIGN		12
-#define LRT_MASTER_LEAVING		13
+#define LRT_IN_SYNC             	2
+#define LRT_MARK_REGION         	3
+#define LRT_CLEAR_REGION        	4
+#define LRT_FLUSH                       5
+#define LRT_GET_RESYNC_WORK     	6
+#define LRT_COMPLETE_RESYNC_WORK        7
+#define LRT_GET_SYNC_COUNT      	8
+
+#define LRT_ELECTION			9
+#define LRT_SELECTION			10
+#define LRT_MASTER_ASSIGN		11
+#define LRT_MASTER_LEAVING		12
 
 #define CLUSTER_LOG_PORT 51005
 



^ permalink raw reply	[flat|nested] 40+ messages in thread

* [Cluster-devel] cluster/cmirror-kernel/src dm-cmirror-client.c ...
@ 2007-04-26 16:55 jbrassow
  0 siblings, 0 replies; 40+ messages in thread
From: jbrassow @ 2007-04-26 16:55 UTC (permalink / raw)
  To: cluster-devel.redhat.com

CVSROOT:	/cvs/cluster
Module name:	cluster
Branch: 	RHEL45
Changes by:	jbrassow at sourceware.org	2007-04-26 17:55:51

Modified files:
	cmirror-kernel/src: dm-cmirror-client.c dm-cmirror-server.c 

Log message:
	Bug 238031: cluster mirrors not handling all recovery/write conflicts
	
	Problem is that the kernel (main mirror code) does not do any marks/clears when
	writing to a region before its recovery.  So, it is not possible for the server
	to detect a conflict.  Basically, we must turn back on the
	'is_remote_recovering' function and disallow any writes to regions that are OR
	WILL BE recovering.
	
	It's really going to cause some pain during writes while mirrors are re-syncing.
	The better fix for the future is to have the writes always mark/clear the
	regions - then we can again remove the 'is_remote_recovering' function.

Patches:
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cmirror-kernel/src/dm-cmirror-client.c.diff?cvsroot=cluster&only_with_tag=RHEL45&r1=1.1.2.41.2.5&r2=1.1.2.41.2.6
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cmirror-kernel/src/dm-cmirror-server.c.diff?cvsroot=cluster&only_with_tag=RHEL45&r1=1.1.2.26.2.7&r2=1.1.2.26.2.8

--- cluster/cmirror-kernel/src/Attic/dm-cmirror-client.c	2007/04/24 20:10:20	1.1.2.41.2.5
+++ cluster/cmirror-kernel/src/Attic/dm-cmirror-client.c	2007/04/26 16:55:51	1.1.2.41.2.6
@@ -861,11 +861,10 @@
 	int rtn;
 	struct log_c *lc = (struct log_c *) log->context;
 
-/* take out optimization
 	if(atomic_read(&lc->in_sync) == 1){
 		return 0;
 	}
-*/
+
 	rtn = consult_server(lc, region, LRT_IS_REMOTE_RECOVERING, NULL);
 	return rtn;
 }
@@ -876,11 +875,11 @@
 	struct log_c *lc = (struct log_c *) log->context;
   
 	/* check known_regions, return if found */
-/* take out optimization
+
 	if(atomic_read(&lc->in_sync) == 1){
 		return 1;
 	}
-*/
+
 	if(!block){
 		return -EWOULDBLOCK;
 	}
@@ -1414,7 +1413,7 @@
 	.resume = cluster_resume,
 	.get_region_size = cluster_get_region_size,
 	.is_clean = cluster_is_clean,
-/*	.is_remote_recovering = cluster_is_remote_recovering,*/
+	.is_remote_recovering = cluster_is_remote_recovering,
 	.in_sync = cluster_in_sync,
 	.flush = cluster_flush,
 	.mark_region = cluster_mark_region,
@@ -1436,7 +1435,7 @@
 	.resume = cluster_resume,
 	.get_region_size = cluster_get_region_size,
 	.is_clean = cluster_is_clean,
-/*	.is_remote_recovering = cluster_is_remote_recovering,*/
+	.is_remote_recovering = cluster_is_remote_recovering,
 	.in_sync = cluster_in_sync,
 	.flush = cluster_flush,
 	.mark_region = cluster_mark_region,
--- cluster/cmirror-kernel/src/Attic/dm-cmirror-server.c	2007/04/24 20:10:20	1.1.2.26.2.7
+++ cluster/cmirror-kernel/src/Attic/dm-cmirror-server.c	2007/04/26 16:55:51	1.1.2.26.2.8
@@ -494,12 +494,32 @@
 
 static int server_is_remote_recovering(struct log_c *lc, struct log_request *lr)
 {
+	region_t region;
 	struct region_user *ru;
 
-	if ((ru = find_ru_by_region(lc, lr->u.lr_region)) && 
-	    (ru->ru_rw == RU_RECOVER))
+	/*
+	 * This gets a bit complicated.  I wish we didn't have to use this
+	 * function, but because the mirror code doesn't mark regions which
+	 * it writes to that are out-of-sync, we need this function.
+	 *
+	 * Problem is, we don't know how long the user is going to take to
+	 * write to the region after they have called this function.  So,
+	 * we are forced at this point to deny any writes to regions we
+	 * are recovering or might recover in the future.
+	 *
+	 * We can count on the client side to not send us one of these
+	 * requests if the mirror is known to be in-sync.
+	 *
+	 * And yes, it sucks to take this much time off the I/O.
+	 */
+	region = ext2_find_next_zero_bit((unsigned long *) lc->sync_bits,
+					 lc->region_count, 0);
+
+	if (lr->u.lr_region >= region) {
+		DMDEBUG("Remote recovery conflict: (%Lu >= %Lu)/%s",
+			lr->u.lr_region, region, lc->uuid + (strlen(lc->uuid) - 8));
 		lr->u.lr_int_rtn = 1;
-	else
+	} else
 		lr->u.lr_int_rtn = 0;
 
 	return 0;
@@ -639,24 +659,65 @@
 static int server_get_resync_work(struct log_c *lc, struct log_request *lr, uint32_t who)
 {
 	struct region_user *new;
+	int sync_search, conflict = 0;
+	region_t *region = &(lr->u.lr_region_rtn);
 
-	new = mempool_alloc(region_user_pool, GFP_NOFS);
-	if(!new){
-		lr->u.lr_int_rtn = 0;
-		return -ENOMEM;
+	lr->u.lr_int_rtn = 0; /* Default to no work */
+
+	if (lc->recovering_region != (uint64_t)-1) {
+		DMDEBUG("Someone is already recovering region %Lu/%s",
+			lc->recovering_region, lc->uuid + (strlen(lc->uuid) - 8));
+		return 0;
 	}
-	
-	if ((lr->u.lr_int_rtn = _core_get_resync_work(lc, &(lr->u.lr_region_rtn)))){
-		new->ru_nodeid = who;
-		new->ru_region = lr->u.lr_region_rtn;
-		new->ru_rw = RU_RECOVER;
-		list_add(&new->ru_list, &lc->region_users);
-		DMDEBUG("Assigning recovery work to %u/%s: %Lu",
-			who, lc->uuid + (strlen(lc->uuid) - 8), new->ru_region);
-	} else {
-		mempool_free(new, region_user_pool);
+
+	if (lc->sync_search >= lc->region_count) {
+		/*
+		 * FIXME: pvmove is not supported yet, but when it is,
+		 * an audit of sync_count changes will need to be made
+		 */
+		if ((lc->sync_count < lc->region_count) && !lc->sync_pass) {
+			lc->sync_search = 0;
+			lc->sync_pass++;
+		} else {
+			return 0;
+		}
+	}
+
+	for (sync_search = lc->sync_search;
+	     sync_search < lc->region_count;
+	     sync_search = (*region + 1)) {
+		*region = ext2_find_next_zero_bit((unsigned long *) lc->sync_bits,
+						  lc->region_count,
+						  sync_search);
+		if (find_ru_by_region(lc, *region)) {
+			conflict = 1;
+			DMDEBUG("Recovery blocked by outstanding write on region %Lu/%s",
+				*region, lc->uuid + (strlen(lc->uuid) - 8));
+		} else {
+			break;
+		}
 	}
 
+	if (*region >= lc->region_count)
+		return 0;
+
+	new = mempool_alloc(region_user_pool, GFP_NOFS);
+	if (!new)
+		return -ENOMEM;
+
+	if (!conflict)
+		lc->sync_search = *region + 1;
+
+	lc->recovering_region = *region;
+
+	lr->u.lr_int_rtn = 1; /* Assigning work */
+	new->ru_nodeid = who;
+	new->ru_region = *region;
+	new->ru_rw = RU_RECOVER;
+	list_add(&new->ru_list, &lc->region_users);
+	DMDEBUG("Assigning recovery work to %u/%s: %Lu",
+		who, lc->uuid + (strlen(lc->uuid) - 8), new->ru_region);
+
 	return 0;
 }
 



^ permalink raw reply	[flat|nested] 40+ messages in thread

* [Cluster-devel] cluster/cmirror-kernel/src dm-cmirror-client.c ...
@ 2007-04-26 16:54 jbrassow
  0 siblings, 0 replies; 40+ messages in thread
From: jbrassow @ 2007-04-26 16:54 UTC (permalink / raw)
  To: cluster-devel.redhat.com

CVSROOT:	/cvs/cluster
Module name:	cluster
Branch: 	RHEL4
Changes by:	jbrassow at sourceware.org	2007-04-26 17:54:49

Modified files:
	cmirror-kernel/src: dm-cmirror-client.c dm-cmirror-server.c 

Log message:
	Bug 238031: cluster mirrors not handling all recovery/write conflicts
	
	Problem is that the kernel (main mirror code) does not do any marks/clears when
	writing to a region before its recovery.  So, it is not possible for the server
	to detect a conflict.  Basically, we must turn back on the
	'is_remote_recovering' function and disallow any writes to regions that are OR
	WILL BE recovering.
	
	It's really going to cause some pain during writes while mirrors are re-syncing.
	The better fix for the future is to have the writes always mark/clear the
	regions - then we can again remove the 'is_remote_recovering' function.

Patches:
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cmirror-kernel/src/dm-cmirror-client.c.diff?cvsroot=cluster&only_with_tag=RHEL4&r1=1.1.2.46&r2=1.1.2.47
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cmirror-kernel/src/dm-cmirror-server.c.diff?cvsroot=cluster&only_with_tag=RHEL4&r1=1.1.2.34&r2=1.1.2.35

--- cluster/cmirror-kernel/src/Attic/dm-cmirror-client.c	2007/04/24 20:08:57	1.1.2.46
+++ cluster/cmirror-kernel/src/Attic/dm-cmirror-client.c	2007/04/26 16:54:49	1.1.2.47
@@ -861,11 +861,10 @@
 	int rtn;
 	struct log_c *lc = (struct log_c *) log->context;
 
-/* take out optimization
 	if(atomic_read(&lc->in_sync) == 1){
 		return 0;
 	}
-*/
+
 	rtn = consult_server(lc, region, LRT_IS_REMOTE_RECOVERING, NULL);
 	return rtn;
 }
@@ -876,11 +875,11 @@
 	struct log_c *lc = (struct log_c *) log->context;
   
 	/* check known_regions, return if found */
-/* take out optimization
+
 	if(atomic_read(&lc->in_sync) == 1){
 		return 1;
 	}
-*/
+
 	if(!block){
 		return -EWOULDBLOCK;
 	}
@@ -1414,7 +1413,7 @@
 	.resume = cluster_resume,
 	.get_region_size = cluster_get_region_size,
 	.is_clean = cluster_is_clean,
-/*	.is_remote_recovering = cluster_is_remote_recovering,*/
+	.is_remote_recovering = cluster_is_remote_recovering,
 	.in_sync = cluster_in_sync,
 	.flush = cluster_flush,
 	.mark_region = cluster_mark_region,
@@ -1436,7 +1435,7 @@
 	.resume = cluster_resume,
 	.get_region_size = cluster_get_region_size,
 	.is_clean = cluster_is_clean,
-/*	.is_remote_recovering = cluster_is_remote_recovering,*/
+	.is_remote_recovering = cluster_is_remote_recovering,
 	.in_sync = cluster_in_sync,
 	.flush = cluster_flush,
 	.mark_region = cluster_mark_region,
--- cluster/cmirror-kernel/src/Attic/dm-cmirror-server.c	2007/04/24 20:08:57	1.1.2.34
+++ cluster/cmirror-kernel/src/Attic/dm-cmirror-server.c	2007/04/26 16:54:49	1.1.2.35
@@ -494,12 +494,32 @@
 
 static int server_is_remote_recovering(struct log_c *lc, struct log_request *lr)
 {
+	region_t region;
 	struct region_user *ru;
 
-	if ((ru = find_ru_by_region(lc, lr->u.lr_region)) && 
-	    (ru->ru_rw == RU_RECOVER))
+	/*
+	 * This gets a bit complicated.  I wish we didn't have to use this
+	 * function, but because the mirror code doesn't mark regions which
+	 * it writes to that are out-of-sync, we need this function.
+	 *
+	 * Problem is, we don't know how long the user is going to take to
+	 * write to the region after they have called this function.  So,
+	 * we are forced at this point to deny any writes to regions we
+	 * are recovering or might recover in the future.
+	 *
+	 * We can count on the client side to not send us one of these
+	 * requests if the mirror is known to be in-sync.
+	 *
+	 * And yes, it sucks to take this much time off the I/O.
+	 */
+	region = ext2_find_next_zero_bit((unsigned long *) lc->sync_bits,
+					 lc->region_count, 0);
+
+	if (lr->u.lr_region >= region) {
+		DMDEBUG("Remote recovery conflict: (%Lu >= %Lu)/%s",
+			lr->u.lr_region, region, lc->uuid + (strlen(lc->uuid) - 8));
 		lr->u.lr_int_rtn = 1;
-	else
+	} else
 		lr->u.lr_int_rtn = 0;
 
 	return 0;
@@ -639,24 +659,65 @@
 static int server_get_resync_work(struct log_c *lc, struct log_request *lr, uint32_t who)
 {
 	struct region_user *new;
+	int sync_search, conflict = 0;
+	region_t *region = &(lr->u.lr_region_rtn);
 
-	new = mempool_alloc(region_user_pool, GFP_NOFS);
-	if(!new){
-		lr->u.lr_int_rtn = 0;
-		return -ENOMEM;
+	lr->u.lr_int_rtn = 0; /* Default to no work */
+
+	if (lc->recovering_region != (uint64_t)-1) {
+		DMDEBUG("Someone is already recovering region %Lu/%s",
+			lc->recovering_region, lc->uuid + (strlen(lc->uuid) - 8));
+		return 0;
 	}
-	
-	if ((lr->u.lr_int_rtn = _core_get_resync_work(lc, &(lr->u.lr_region_rtn)))){
-		new->ru_nodeid = who;
-		new->ru_region = lr->u.lr_region_rtn;
-		new->ru_rw = RU_RECOVER;
-		list_add(&new->ru_list, &lc->region_users);
-		DMDEBUG("Assigning recovery work to %u/%s: %Lu",
-			who, lc->uuid + (strlen(lc->uuid) - 8), new->ru_region);
-	} else {
-		mempool_free(new, region_user_pool);
+
+	if (lc->sync_search >= lc->region_count) {
+		/*
+		 * FIXME: pvmove is not supported yet, but when it is,
+		 * an audit of sync_count changes will need to be made
+		 */
+		if ((lc->sync_count < lc->region_count) && !lc->sync_pass) {
+			lc->sync_search = 0;
+			lc->sync_pass++;
+		} else {
+			return 0;
+		}
+	}
+
+	for (sync_search = lc->sync_search;
+	     sync_search < lc->region_count;
+	     sync_search = (*region + 1)) {
+		*region = ext2_find_next_zero_bit((unsigned long *) lc->sync_bits,
+						  lc->region_count,
+						  sync_search);
+		if (find_ru_by_region(lc, *region)) {
+			conflict = 1;
+			DMDEBUG("Recovery blocked by outstanding write on region %Lu/%s",
+				*region, lc->uuid + (strlen(lc->uuid) - 8));
+		} else {
+			break;
+		}
 	}
 
+	if (*region >= lc->region_count)
+		return 0;
+
+	new = mempool_alloc(region_user_pool, GFP_NOFS);
+	if (!new)
+		return -ENOMEM;
+
+	if (!conflict)
+		lc->sync_search = *region + 1;
+
+	lc->recovering_region = *region;
+
+	lr->u.lr_int_rtn = 1; /* Assigning work */
+	new->ru_nodeid = who;
+	new->ru_region = *region;
+	new->ru_rw = RU_RECOVER;
+	list_add(&new->ru_list, &lc->region_users);
+	DMDEBUG("Assigning recovery work to %u/%s: %Lu",
+		who, lc->uuid + (strlen(lc->uuid) - 8), new->ru_region);
+
 	return 0;
 }
 



^ permalink raw reply	[flat|nested] 40+ messages in thread

* [Cluster-devel] cluster/cmirror-kernel/src dm-cmirror-client.c ...
@ 2007-04-24 20:10 jbrassow
  0 siblings, 0 replies; 40+ messages in thread
From: jbrassow @ 2007-04-24 20:10 UTC (permalink / raw)
  To: cluster-devel.redhat.com

CVSROOT:	/cvs/cluster
Module name:	cluster
Branch: 	RHEL45
Changes by:	jbrassow at sourceware.org	2007-04-24 21:10:20

Modified files:
	cmirror-kernel/src: dm-cmirror-client.c dm-cmirror-server.c 
	                    dm-cmirror-server.h dm-cmirror-xfr.h 

Log message:
	Bug 199433: NULL pointer dereference in cman:process_messages for cmirro...
	- While this isn't a complete fix for 199433, it is most likely the
	cause of the error.  Cluster mirrors were steadily leaking memory
	every time they were deactivated.
	
	Bug 237028: cmirror recovery deadlock due to machine failure + primary l...
	- If there is outstanding resync work remaining when the server gets
	notice to suspend, delay for a moment to wait for it.

Patches:
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cmirror-kernel/src/dm-cmirror-client.c.diff?cvsroot=cluster&only_with_tag=RHEL45&r1=1.1.2.41.2.4&r2=1.1.2.41.2.5
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cmirror-kernel/src/dm-cmirror-server.c.diff?cvsroot=cluster&only_with_tag=RHEL45&r1=1.1.2.26.2.6&r2=1.1.2.26.2.7
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cmirror-kernel/src/dm-cmirror-server.h.diff?cvsroot=cluster&only_with_tag=RHEL45&r1=1.1.2.2.8.1&r2=1.1.2.2.8.2
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cmirror-kernel/src/dm-cmirror-xfr.h.diff?cvsroot=cluster&only_with_tag=RHEL45&r1=1.1.2.2.2.2&r2=1.1.2.2.2.3

--- cluster/cmirror-kernel/src/Attic/dm-cmirror-client.c	2007/04/10 07:13:15	1.1.2.41.2.4
+++ cluster/cmirror-kernel/src/Attic/dm-cmirror-client.c	2007/04/24 20:10:20	1.1.2.41.2.5
@@ -379,7 +379,8 @@
 
 	if(len <= 0){
 		/* ATTENTION -- what do we do with this ? */
-		DMWARN("Error while listening for server response: %d", len);
+		DMWARN("Error listening for server(%u) response for %s: %d",
+		       lc->server_id, lc->uuid + (strlen(lc->uuid) - 8), len);
 		error = len;
 		*retry = 1;
 		seq++;
@@ -767,6 +768,7 @@
 
 static int cluster_postsuspend(struct dirty_log *log)
 {
+	int i;
 	struct region_state *rs, *tmp_rs;
 	struct log_c *lc = (struct log_c *) log->context;
 
@@ -788,10 +790,20 @@
 
 	spin_unlock(&lc->state_lock);
 
+	if(lc->server_id == my_id) {
+		for (i = 0; server_busy(lc) && (i < 10); i++) {
+			DMDEBUG("Server for %s still busy, waiting for others",
+				lc->uuid + (strlen(lc->uuid) - 8));
+			set_current_state(TASK_INTERRUPTIBLE);
+			schedule_timeout(HZ*2);
+		}
+	}
+
 	atomic_set(&lc->suspended, 1);
 	if(lc->server_id == my_id) {
 		while (1) {
-			DMDEBUG("Telling everyone I'm suspending");
+			DMDEBUG("Telling everyone I'm suspending (%s)",
+				lc->uuid + (strlen(lc->uuid) - 8));
 			consult_server(lc, 0, LRT_MASTER_LEAVING, NULL);
 
 			down(&consult_server_lock);
@@ -799,13 +811,15 @@
 			up(&consult_server_lock);
 
 			if ((my_id && (lc->server_id == my_id))) {
-				DMDEBUG("Delaying suspend, work to be done.");
+				DMDEBUG("Delaying suspend, work to be done (%s)",
+					lc->uuid + (strlen(lc->uuid) - 8));
 				atomic_set(&lc->suspended, 0);
 				set_current_state(TASK_INTERRUPTIBLE);
 				schedule_timeout(HZ*2);
 				atomic_set(&lc->suspended, 1);
 			} else {
-				DMDEBUG("Suspending now");
+				DMDEBUG("Suspending now (%s)",
+					lc->uuid + (strlen(lc->uuid) - 8));
 				break;
 			}
 		}
@@ -1196,6 +1210,16 @@
 
 	switch(status){
 	case STATUSTYPE_INFO:
+		DMDEBUG("LOG INFO:");
+		DMDEBUG("  uuid: %s", lc->uuid);
+		DMDEBUG("  uuid_ref    : %d", lc->uuid_ref);
+		DMDEBUG(" ?region_count: %Lu", lc->region_count);
+		DMDEBUG(" ?sync_count  : %Lu", lc->sync_count);
+		DMDEBUG(" ?sync_search : %d", lc->sync_search);
+		DMDEBUG("  in_sync     : %s", (atomic_read(&lc->in_sync)) ? "YES" : "NO");
+		DMDEBUG("  server_id   : %u", lc->server_id);
+		DMDEBUG("  server_valid: %s",
+			((lc->server_id != 0xDEAD) && lc->server_valid) ? "YES" : "NO");
 		if(lc->sync != DEFAULTSYNC)
 			arg_count++;
 
@@ -1254,11 +1278,6 @@
 	}
 	up(&log_list_lock);
 
-	/*
-	if (likely(!shutting_down))
-		suspend_server();
-	*/
-
 	return 0;
 }
 
@@ -1311,9 +1330,7 @@
 		BUG();
 		break;
 	}
-	/*
-	resume_server();
-	*/
+
 	return 0;
 }
 
@@ -1452,6 +1469,7 @@
 	r = dm_register_dirty_log_type(&_clustered_core_type);
 	if (r) {
 		DMWARN("couldn't register clustered_core dirty log type");
+		mempool_destroy(region_state_pool);
 		return r;
 	}
 
@@ -1459,6 +1477,7 @@
 	if (r) {
 		DMWARN("couldn't register clustered_disk dirty log type");
 		dm_unregister_dirty_log_type(&_clustered_core_type);
+		mempool_destroy(region_state_pool);
 		return r;
 	}
 
@@ -1475,6 +1494,7 @@
 	}
 	dm_unregister_dirty_log_type(&_clustered_core_type);
 	dm_unregister_dirty_log_type(&_clustered_disk_type);
+	mempool_destroy(region_state_pool);
         DMINFO("dm-cmirror %s (built %s %s) removed",
                CMIRROR_RELEASE_NAME, __DATE__, __TIME__);
 }
--- cluster/cmirror-kernel/src/Attic/dm-cmirror-server.c	2007/04/10 18:10:42	1.1.2.26.2.6
+++ cluster/cmirror-kernel/src/Attic/dm-cmirror-server.c	2007/04/24 20:10:20	1.1.2.26.2.7
@@ -42,8 +42,6 @@
 static atomic_t server_run;
 static struct completion server_completion;
 
-static wait_queue_head_t _suspend_queue;
-static atomic_t _suspend;
 static atomic_t _do_requests;
 
 static int debug_disk_write = 0;
@@ -706,8 +704,8 @@
 	}
 
 	if (!ru) {
-		DMERR("Unable to find region to be marked out-of-sync: %Lu/%s/%u",
-		      lr->u.lr_region, lc->uuid + (strlen(lc->uuid) - 8), who);
+		DMDEBUG("Unable to find region to be marked out-of-sync: %Lu/%s/%u",
+			lr->u.lr_region, lc->uuid + (strlen(lc->uuid) - 8), who);
 		/*
 		 * This is a valid case, when the following happens:
 		 * 1) a region is recovering and has waiting writes
@@ -798,6 +796,12 @@
 	uint32_t lowest, next;
 	uint32_t node_count=global_count, *nodeids=global_nodeids;
 
+	DMDEBUG("%s(%d): (%s)", RQ_STRING(lr->lr_type), lr->lr_type,
+		(lc) ? lc->uuid + (strlen(lc->uuid) - 8) : "none");
+	DMDEBUG("  starter     : %u", lr->u.lr_starter);
+	DMDEBUG("  co-ordinator: %u", lr->u.lr_coordinator);
+	DMDEBUG("  node_count  : %d", lr->u.lr_node_count);
+
 	/* Record the starter's port number so we can get back to him */
 	if((lr->u.lr_starter == my_id) && (!lr->u.lr_node_count)){
 		lr->u.lr_starter_port = saddr->sin_port;
@@ -1175,12 +1179,12 @@
 
 	complete(&server_completion);
   
+	DMDEBUG("cluster_log_serverd ready for work");
 	for(;;){
 		if(!atomic_read(&server_run)){
 			break;
 		}
 
-		suspend_on(&_suspend_queue, atomic_read(&_suspend));
 		switch(atomic_read(&restart_event_type)){
 		case SERVICE_NODE_LEAVE:
 			/* ATTENTION -- may wish to check if regions **
@@ -1206,6 +1210,9 @@
 			up(&log_list_lock);
 
 			break;
+		case SERVICE_NODE_JOIN:
+			DMDEBUG("Node joining");
+			break;
 		default:
 			/* Someone has joined, or there is no event */
 			break;
@@ -1227,6 +1234,7 @@
 		schedule();
 	}
 
+	DMDEBUG("Closing socket on server side");
 	sock_release(sock);
 	complete(&server_completion);
 	return 0;
@@ -1244,8 +1252,6 @@
 void print_server_status(struct log_c *lc){
 	int i;
 
-	atomic_set(&_suspend, 1);
-
 	DMINFO("SERVER OUTPUT::");
 
 	DMINFO("  Live nodes        :: %d", global_count);
@@ -1267,11 +1273,18 @@
 	i = print_zero_bits((unsigned char *)lc->sync_bits, 0, lc->bitset_uint32_count);
 	DMINFO("  Total = %d", i);
 
-	atomic_set(&_suspend, 0);
-	wake_up_all(&_suspend_queue);
 }
 */
 
+int server_busy(struct log_c *lc)
+{
+	if (!list_empty(&lc->region_users) ||
+	    (lc->recovering_region != (uint64_t)-1))
+		return 1;
+	else
+		return 0;
+}
+
 int server_free_region_users(struct log_c *lc)
 {
 	int i = 0;
@@ -1287,18 +1300,6 @@
 	return 0;
 }
 
-
-int suspend_server(void){
-	atomic_set(&_suspend, 1);
-	return 0;
-}
-
-int resume_server(void){
-	atomic_set(&_suspend, 0);
-	wake_up_all(&_suspend_queue);
-	return 0;
-}
-
 int resume_server_requests(void) {
 	atomic_set(&_do_requests, 1);
 	return 0;
@@ -1307,6 +1308,7 @@
 int start_server(void /* log_devices ? */){
 	int error;
 
+	DMDEBUG("start_server called");
 	region_user_pool = mempool_create(1000, region_user_alloc,
 					  region_user_free, NULL);
 	if(!region_user_pool){
@@ -1314,20 +1316,20 @@
 		return -ENOMEM;
 	}
 
-	init_waitqueue_head(&_suspend_queue);
-
 	atomic_set(&_do_requests, 0);
 	atomic_set(&server_run, 1);
 	init_completion(&server_completion);
 
 	error = kernel_thread(cluster_log_serverd, NULL, 0);
 	if(error < 0){
+		mempool_destroy(region_user_pool);
 		DMWARN("failed to start kernel thread.");
 		return error;
 	}
 	wait_for_completion(&server_completion);
 
 	if(!atomic_read(&server_run)){
+		mempool_destroy(region_user_pool);
 		DMWARN("Cluster mirror log server thread failed to start");
 		return -1;
 	}
@@ -1337,9 +1339,17 @@
 
 
 void stop_server(void){
+	DMDEBUG("stop_server called");
 	atomic_set(&server_run, 0);
 
 	wait_for_completion(&server_completion);
+	down(&log_list_lock);
+	if (!list_empty(&log_list_head)) {
+		DMERR("Log elements remain at cluster log server shutdown");
+	}
+	up(&log_list_lock);
+	mempool_destroy(region_user_pool);
+
 	dm_io_put(32);
 }
 /*
--- cluster/cmirror-kernel/src/Attic/dm-cmirror-server.h	2007/04/10 07:13:15	1.1.2.2.8.1
+++ cluster/cmirror-kernel/src/Attic/dm-cmirror-server.h	2007/04/24 20:10:20	1.1.2.2.8.2
@@ -7,7 +7,7 @@
 #ifndef __DM_CMIRROR_SERVER_H__
 #define __DM_CMIRROR_SERVER_H__
 
-int suspend_server(void);
+int server_busy(struct log_c *lc);
 int resume_server(void);
 int resume_server_requests(void);
 int start_server(void);
--- cluster/cmirror-kernel/src/Attic/dm-cmirror-xfr.h	2007/04/03 18:23:01	1.1.2.2.2.2
+++ cluster/cmirror-kernel/src/Attic/dm-cmirror-xfr.h	2007/04/24 20:10:20	1.1.2.2.2.3
@@ -30,14 +30,15 @@
 	((x) == LRT_IS_CLEAN) ? "LRT_IS_CLEAN": \
 	((x) == LRT_IN_SYNC) ? "LRT_IN_SYNC": \
 	((x) == LRT_MARK_REGION) ? "LRT_MARK_REGION": \
+	((x) == LRT_CLEAR_REGION) ? "LRT_CLEAR_REGION": \
 	((x) == LRT_FLUSH) ? "LRT_FLUSH": \
 	((x) == LRT_GET_RESYNC_WORK) ? "LRT_GET_RESYNC_WORK": \
-	((x) == LRT_GET_SYNC_COUNT) ? "LRT_GET_SYNC_COUNT": \
-	((x) == LRT_CLEAR_REGION) ? "LRT_CLEAR_REGION": \
 	((x) == LRT_COMPLETE_RESYNC_WORK) ? "LRT_COMPLETE_RESYNC_WORK": \
-	((x) == LRT_MASTER_LEAVING) ? "LRT_MASTER_LEAVING": \
+	((x) == LRT_GET_SYNC_COUNT) ? "LRT_GET_SYNC_COUNT": \
 	((x) == LRT_ELECTION) ? "LRT_ELECTION": \
-	((x) == LRT_SELECTION) ? "LRT_SELECTION": "UNKNOWN"
+	((x) == LRT_SELECTION) ? "LRT_SELECTION": \
+	((x) == LRT_MASTER_ASSIGN) ? "LRT_MASTER_ASSIGN": \
+	((x) == LRT_MASTER_LEAVING) ? "LRT_MASTER_LEAVING" : "UNKNOWN"
 
 struct log_request {
 	int lr_type;



^ permalink raw reply	[flat|nested] 40+ messages in thread

* [Cluster-devel] cluster/cmirror-kernel/src dm-cmirror-client.c ...
@ 2007-04-24 20:08 jbrassow
  0 siblings, 0 replies; 40+ messages in thread
From: jbrassow @ 2007-04-24 20:08 UTC (permalink / raw)
  To: cluster-devel.redhat.com

CVSROOT:	/cvs/cluster
Module name:	cluster
Branch: 	RHEL4
Changes by:	jbrassow at sourceware.org	2007-04-24 21:08:57

Modified files:
	cmirror-kernel/src: dm-cmirror-client.c dm-cmirror-server.c 
	                    dm-cmirror-server.h dm-cmirror-xfr.h 

Log message:
	Bug 199433: NULL pointer dereference in cman:process_messages for cmirro...
	- While this isn't a complete fix for 199433, it is most likely the
	cause of the error.  Cluster mirrors were steadily leaking memory
	every time they were deactivated.
	
	Bug 237028: cmirror recovery deadlock due to machine failure + primary l...
	- If there is outstanding resync work remaining when the server gets
	notice to suspend, delay for a moment to wait for it.

Patches:
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cmirror-kernel/src/dm-cmirror-client.c.diff?cvsroot=cluster&only_with_tag=RHEL4&r1=1.1.2.45&r2=1.1.2.46
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cmirror-kernel/src/dm-cmirror-server.c.diff?cvsroot=cluster&only_with_tag=RHEL4&r1=1.1.2.33&r2=1.1.2.34
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cmirror-kernel/src/dm-cmirror-server.h.diff?cvsroot=cluster&only_with_tag=RHEL4&r1=1.1.2.3&r2=1.1.2.4
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cmirror-kernel/src/dm-cmirror-xfr.h.diff?cvsroot=cluster&only_with_tag=RHEL4&r1=1.1.2.4&r2=1.1.2.5

--- cluster/cmirror-kernel/src/Attic/dm-cmirror-client.c	2007/04/10 07:12:24	1.1.2.45
+++ cluster/cmirror-kernel/src/Attic/dm-cmirror-client.c	2007/04/24 20:08:57	1.1.2.46
@@ -379,7 +379,8 @@
 
 	if(len <= 0){
 		/* ATTENTION -- what do we do with this ? */
-		DMWARN("Error while listening for server response: %d", len);
+		DMWARN("Error listening for server(%u) response for %s: %d",
+		       lc->server_id, lc->uuid + (strlen(lc->uuid) - 8), len);
 		error = len;
 		*retry = 1;
 		seq++;
@@ -767,6 +768,7 @@
 
 static int cluster_postsuspend(struct dirty_log *log)
 {
+	int i;
 	struct region_state *rs, *tmp_rs;
 	struct log_c *lc = (struct log_c *) log->context;
 
@@ -788,10 +790,20 @@
 
 	spin_unlock(&lc->state_lock);
 
+	if(lc->server_id == my_id) {
+		for (i = 0; server_busy(lc) && (i < 10); i++) {
+			DMDEBUG("Server for %s still busy, waiting for others",
+				lc->uuid + (strlen(lc->uuid) - 8));
+			set_current_state(TASK_INTERRUPTIBLE);
+			schedule_timeout(HZ*2);
+		}
+	}
+
 	atomic_set(&lc->suspended, 1);
 	if(lc->server_id == my_id) {
 		while (1) {
-			DMDEBUG("Telling everyone I'm suspending");
+			DMDEBUG("Telling everyone I'm suspending (%s)",
+				lc->uuid + (strlen(lc->uuid) - 8));
 			consult_server(lc, 0, LRT_MASTER_LEAVING, NULL);
 
 			down(&consult_server_lock);
@@ -799,13 +811,15 @@
 			up(&consult_server_lock);
 
 			if ((my_id && (lc->server_id == my_id))) {
-				DMDEBUG("Delaying suspend, work to be done.");
+				DMDEBUG("Delaying suspend, work to be done (%s)",
+					lc->uuid + (strlen(lc->uuid) - 8));
 				atomic_set(&lc->suspended, 0);
 				set_current_state(TASK_INTERRUPTIBLE);
 				schedule_timeout(HZ*2);
 				atomic_set(&lc->suspended, 1);
 			} else {
-				DMDEBUG("Suspending now");
+				DMDEBUG("Suspending now (%s)",
+					lc->uuid + (strlen(lc->uuid) - 8));
 				break;
 			}
 		}
@@ -1196,6 +1210,16 @@
 
 	switch(status){
 	case STATUSTYPE_INFO:
+		DMDEBUG("LOG INFO:");
+		DMDEBUG("  uuid: %s", lc->uuid);
+		DMDEBUG("  uuid_ref    : %d", lc->uuid_ref);
+		DMDEBUG(" ?region_count: %Lu", lc->region_count);
+		DMDEBUG(" ?sync_count  : %Lu", lc->sync_count);
+		DMDEBUG(" ?sync_search : %d", lc->sync_search);
+		DMDEBUG("  in_sync     : %s", (atomic_read(&lc->in_sync)) ? "YES" : "NO");
+		DMDEBUG("  server_id   : %u", lc->server_id);
+		DMDEBUG("  server_valid: %s",
+			((lc->server_id != 0xDEAD) && lc->server_valid) ? "YES" : "NO");
 		if(lc->sync != DEFAULTSYNC)
 			arg_count++;
 
@@ -1254,11 +1278,6 @@
 	}
 	up(&log_list_lock);
 
-	/*
-	if (likely(!shutting_down))
-		suspend_server();
-	*/
-
 	return 0;
 }
 
@@ -1311,9 +1330,7 @@
 		BUG();
 		break;
 	}
-	/*
-	resume_server();
-	*/
+
 	return 0;
 }
 
@@ -1452,6 +1469,7 @@
 	r = dm_register_dirty_log_type(&_clustered_core_type);
 	if (r) {
 		DMWARN("couldn't register clustered_core dirty log type");
+		mempool_destroy(region_state_pool);
 		return r;
 	}
 
@@ -1459,6 +1477,7 @@
 	if (r) {
 		DMWARN("couldn't register clustered_disk dirty log type");
 		dm_unregister_dirty_log_type(&_clustered_core_type);
+		mempool_destroy(region_state_pool);
 		return r;
 	}
 
@@ -1475,6 +1494,7 @@
 	}
 	dm_unregister_dirty_log_type(&_clustered_core_type);
 	dm_unregister_dirty_log_type(&_clustered_disk_type);
+	mempool_destroy(region_state_pool);
         DMINFO("dm-cmirror %s (built %s %s) removed",
                CMIRROR_RELEASE_NAME, __DATE__, __TIME__);
 }
--- cluster/cmirror-kernel/src/Attic/dm-cmirror-server.c	2007/04/17 19:49:11	1.1.2.33
+++ cluster/cmirror-kernel/src/Attic/dm-cmirror-server.c	2007/04/24 20:08:57	1.1.2.34
@@ -42,8 +42,6 @@
 static atomic_t server_run;
 static struct completion server_completion;
 
-static wait_queue_head_t _suspend_queue;
-static atomic_t _suspend;
 static atomic_t _do_requests;
 
 static int debug_disk_write = 0;
@@ -798,6 +796,12 @@
 	uint32_t lowest, next;
 	uint32_t node_count=global_count, *nodeids=global_nodeids;
 
+	DMDEBUG("%s(%d): (%s)", RQ_STRING(lr->lr_type), lr->lr_type,
+		(lc) ? lc->uuid + (strlen(lc->uuid) - 8) : "none");
+	DMDEBUG("  starter     : %u", lr->u.lr_starter);
+	DMDEBUG("  co-ordinator: %u", lr->u.lr_coordinator);
+	DMDEBUG("  node_count  : %d", lr->u.lr_node_count);
+
 	/* Record the starter's port number so we can get back to him */
 	if((lr->u.lr_starter == my_id) && (!lr->u.lr_node_count)){
 		lr->u.lr_starter_port = saddr->sin_port;
@@ -1175,12 +1179,12 @@
 
 	complete(&server_completion);
   
+	DMDEBUG("cluster_log_serverd ready for work");
 	for(;;){
 		if(!atomic_read(&server_run)){
 			break;
 		}
 
-		suspend_on(&_suspend_queue, atomic_read(&_suspend));
 		switch(atomic_read(&restart_event_type)){
 		case SERVICE_NODE_LEAVE:
 			/* ATTENTION -- may wish to check if regions **
@@ -1206,6 +1210,9 @@
 			up(&log_list_lock);
 
 			break;
+		case SERVICE_NODE_JOIN:
+			DMDEBUG("Node joining");
+			break;
 		default:
 			/* Someone has joined, or there is no event */
 			break;
@@ -1227,6 +1234,7 @@
 		schedule();
 	}
 
+	DMDEBUG("Closing socket on server side");
 	sock_release(sock);
 	complete(&server_completion);
 	return 0;
@@ -1244,8 +1252,6 @@
 void print_server_status(struct log_c *lc){
 	int i;
 
-	atomic_set(&_suspend, 1);
-
 	DMINFO("SERVER OUTPUT::");
 
 	DMINFO("  Live nodes        :: %d", global_count);
@@ -1267,11 +1273,18 @@
 	i = print_zero_bits((unsigned char *)lc->sync_bits, 0, lc->bitset_uint32_count);
 	DMINFO("  Total = %d", i);
 
-	atomic_set(&_suspend, 0);
-	wake_up_all(&_suspend_queue);
 }
 */
 
+int server_busy(struct log_c *lc)
+{
+	if (!list_empty(&lc->region_users) ||
+	    (lc->recovering_region != (uint64_t)-1))
+		return 1;
+	else
+		return 0;
+}
+
 int server_free_region_users(struct log_c *lc)
 {
 	int i = 0;
@@ -1287,18 +1300,6 @@
 	return 0;
 }
 
-
-int suspend_server(void){
-	atomic_set(&_suspend, 1);
-	return 0;
-}
-
-int resume_server(void){
-	atomic_set(&_suspend, 0);
-	wake_up_all(&_suspend_queue);
-	return 0;
-}
-
 int resume_server_requests(void) {
 	atomic_set(&_do_requests, 1);
 	return 0;
@@ -1307,6 +1308,7 @@
 int start_server(void /* log_devices ? */){
 	int error;
 
+	DMDEBUG("start_server called");
 	region_user_pool = mempool_create(1000, region_user_alloc,
 					  region_user_free, NULL);
 	if(!region_user_pool){
@@ -1314,20 +1316,20 @@
 		return -ENOMEM;
 	}
 
-	init_waitqueue_head(&_suspend_queue);
-
 	atomic_set(&_do_requests, 0);
 	atomic_set(&server_run, 1);
 	init_completion(&server_completion);
 
 	error = kernel_thread(cluster_log_serverd, NULL, 0);
 	if(error < 0){
+		mempool_destroy(region_user_pool);
 		DMWARN("failed to start kernel thread.");
 		return error;
 	}
 	wait_for_completion(&server_completion);
 
 	if(!atomic_read(&server_run)){
+		mempool_destroy(region_user_pool);
 		DMWARN("Cluster mirror log server thread failed to start");
 		return -1;
 	}
@@ -1337,9 +1339,17 @@
 
 
 void stop_server(void){
+	DMDEBUG("stop_server called");
 	atomic_set(&server_run, 0);
 
 	wait_for_completion(&server_completion);
+	down(&log_list_lock);
+	if (!list_empty(&log_list_head)) {
+		DMERR("Log elements remain at cluster log server shutdown");
+	}
+	up(&log_list_lock);
+	mempool_destroy(region_user_pool);
+
 	dm_io_put(32);
 }
 /*
--- cluster/cmirror-kernel/src/Attic/dm-cmirror-server.h	2007/04/10 07:12:24	1.1.2.3
+++ cluster/cmirror-kernel/src/Attic/dm-cmirror-server.h	2007/04/24 20:08:57	1.1.2.4
@@ -7,7 +7,7 @@
 #ifndef __DM_CMIRROR_SERVER_H__
 #define __DM_CMIRROR_SERVER_H__
 
-int suspend_server(void);
+int server_busy(struct log_c *lc);
 int resume_server(void);
 int resume_server_requests(void);
 int start_server(void);
--- cluster/cmirror-kernel/src/Attic/dm-cmirror-xfr.h	2007/04/03 18:21:10	1.1.2.4
+++ cluster/cmirror-kernel/src/Attic/dm-cmirror-xfr.h	2007/04/24 20:08:57	1.1.2.5
@@ -30,14 +30,15 @@
 	((x) == LRT_IS_CLEAN) ? "LRT_IS_CLEAN": \
 	((x) == LRT_IN_SYNC) ? "LRT_IN_SYNC": \
 	((x) == LRT_MARK_REGION) ? "LRT_MARK_REGION": \
+	((x) == LRT_CLEAR_REGION) ? "LRT_CLEAR_REGION": \
 	((x) == LRT_FLUSH) ? "LRT_FLUSH": \
 	((x) == LRT_GET_RESYNC_WORK) ? "LRT_GET_RESYNC_WORK": \
-	((x) == LRT_GET_SYNC_COUNT) ? "LRT_GET_SYNC_COUNT": \
-	((x) == LRT_CLEAR_REGION) ? "LRT_CLEAR_REGION": \
 	((x) == LRT_COMPLETE_RESYNC_WORK) ? "LRT_COMPLETE_RESYNC_WORK": \
-	((x) == LRT_MASTER_LEAVING) ? "LRT_MASTER_LEAVING": \
+	((x) == LRT_GET_SYNC_COUNT) ? "LRT_GET_SYNC_COUNT": \
 	((x) == LRT_ELECTION) ? "LRT_ELECTION": \
-	((x) == LRT_SELECTION) ? "LRT_SELECTION": "UNKNOWN"
+	((x) == LRT_SELECTION) ? "LRT_SELECTION": \
+	((x) == LRT_MASTER_ASSIGN) ? "LRT_MASTER_ASSIGN": \
+	((x) == LRT_MASTER_LEAVING) ? "LRT_MASTER_LEAVING" : "UNKNOWN"
 
 struct log_request {
 	int lr_type;



^ permalink raw reply	[flat|nested] 40+ messages in thread

* [Cluster-devel] cluster/cmirror-kernel/src dm-cmirror-client.c ...
@ 2007-04-10  7:13 jbrassow
  0 siblings, 0 replies; 40+ messages in thread
From: jbrassow @ 2007-04-10  7:13 UTC (permalink / raw)
  To: cluster-devel.redhat.com

CVSROOT:	/cvs/cluster
Module name:	cluster
Branch: 	RHEL45
Changes by:	jbrassow at sourceware.org	2007-04-10 08:13:15

Modified files:
	cmirror-kernel/src: dm-cmirror-client.c dm-cmirror-common.h 
	                    dm-cmirror-server.c dm-cmirror-server.h 

Log message:
	Bug 235686: Kernel BUG at dm_cmirror_server while recovering region
	
	Several fixes have gone in to fix the handling of this bug:
	1) During server relocation (which can happen due to machine failure
	or normal mirror suspension), the server value could get set before
	the client had a chance to clean-up.  This caused the server to
	become confused and issue a BUG().
	
	2) perform a flush of the log before suspending.  This ensures
	that regions which are in-sync get correctly flushed to the disk
	log.  Without this, there will always be recovery work to be done
	when a mirror starts up - even if it was properly in-sync during
	shutdown.
	
	3) clean-up memory used to record region users when a mirror is
	shutdown.  It was possible for some regions to be left over
	(causing a memory leak) during certain fault scenarios.
	
	4) properly initialize the state field (ru_rw) in the region
	user structure when a mark occurs.  Without the initialization,
	it was sometimes possible for the region to be misinterpretted
	as recovering instead of marked.
	
	5) resolve and unhandled case in server_complete_resync_work
	
	6) reset a variable in cluster_complete_resync_work.  Failure
	to do so was causing a retry to include the wrong value for
	the completion of the resync work - confusing the server.

Patches:
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cmirror-kernel/src/dm-cmirror-client.c.diff?cvsroot=cluster&only_with_tag=RHEL45&r1=1.1.2.41.2.3&r2=1.1.2.41.2.4
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cmirror-kernel/src/dm-cmirror-common.h.diff?cvsroot=cluster&only_with_tag=RHEL45&r1=1.1.2.12.2.1&r2=1.1.2.12.2.2
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cmirror-kernel/src/dm-cmirror-server.c.diff?cvsroot=cluster&only_with_tag=RHEL45&r1=1.1.2.26.2.4&r2=1.1.2.26.2.5
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cmirror-kernel/src/dm-cmirror-server.h.diff?cvsroot=cluster&only_with_tag=RHEL45&r1=1.1.2.2&r2=1.1.2.2.8.1

--- cluster/cmirror-kernel/src/Attic/dm-cmirror-client.c	2007/04/05 21:33:36	1.1.2.41.2.3
+++ cluster/cmirror-kernel/src/Attic/dm-cmirror-client.c	2007/04/10 07:13:15	1.1.2.41.2.4
@@ -373,13 +373,8 @@
 	fs = get_fs();
 	set_fs(get_ds());
 
-	if(type == LRT_MASTER_LEAVING){
-		len = my_recvmsg(lc->client_sock, &msg, sizeof(struct log_request),
-				 0, 10);
-	} else {
-		len = my_recvmsg(lc->client_sock, &msg, sizeof(struct log_request),
-				 0, 5);
-	}
+	len = my_recvmsg(lc->client_sock, &msg,
+			 sizeof(struct log_request), 0, 15);
 	set_fs(fs);
 
 	if(len <= 0){
@@ -492,9 +487,14 @@
 			goto out;
 		}
 	election:
-		while(lc->server_id == 0xDEAD){
+		while ((lc->server_id == 0xDEAD) || (!lc->server_valid)) {
+			DMDEBUG("server_id=%x, server_valid=%u, %s",
+				lc->server_id, lc->server_valid,
+				lc->uuid + (strlen(lc->uuid) - 8));
+			DMDEBUG("trigger = %s", RQ_STRING(type));
 			run_election(lc, my_id);
 			new_server = 1;
+			lc->server_valid = 1;
 		}
 
 		spin_lock(&lc->state_lock);
@@ -600,6 +600,7 @@
 	INIT_LIST_HEAD(&lc->mark_logged);
 	spin_lock_init(&lc->state_lock);
 
+	lc->server_valid = 0;
 	lc->server_id = 0xDEAD;
 
 	if ((error = cluster_connect())) {
@@ -731,19 +732,20 @@
 	}
 
 	if (!list_empty(&lc->mark_logged)) {
-		DMERR("Mark requests remain at cluster log deactivation");
-		/*
-		 * Should I BUG() this?
-		 * No.  In the worst case, they will get cleaned up later
-		 */
-	}
-	list_for_each_entry_safe(rs, tmp_rs, &lc->mark_logged, rs_list) {
-		list_del_init(&rs->rs_list);
-		mempool_free(rs, region_state_pool);
+		int i = 0;
+		
+		list_for_each_entry_safe(rs, tmp_rs, &lc->mark_logged, rs_list) {
+			i++;
+			list_del_init(&rs->rs_list);
+			mempool_free(rs, region_state_pool);
+		}
+		DMDEBUG("%d mark requests remain at cluster log deactivation", i);
 	}
 
 	spin_unlock(&lc->state_lock);
 
+	server_free_region_users(lc);
+
 	if (lc->log_dev)
 		disk_dtr(log);
 	else
@@ -753,8 +755,13 @@
 		DMERR("Unable to disconnect from cluster infrastructure.\n");
 }
 
+static int cluster_flush(struct dirty_log *log);
 static int cluster_presuspend(struct dirty_log *log)
 {
+	/* FIXME: flush is work-around for bug 235040 */
+	DMDEBUG("Performing flush to work around bug 235040");
+	cluster_flush(log);
+	DMDEBUG("Log flush complete");
 	return 0;
 }
 
@@ -784,6 +791,7 @@
 	atomic_set(&lc->suspended, 1);
 	if(lc->server_id == my_id) {
 		while (1) {
+			DMDEBUG("Telling everyone I'm suspending");
 			consult_server(lc, 0, LRT_MASTER_LEAVING, NULL);
 
 			down(&consult_server_lock);
@@ -791,11 +799,13 @@
 			up(&consult_server_lock);
 
 			if ((my_id && (lc->server_id == my_id))) {
+				DMDEBUG("Delaying suspend, work to be done.");
 				atomic_set(&lc->suspended, 0);
 				set_current_state(TASK_INTERRUPTIBLE);
 				schedule_timeout(HZ*2);
 				atomic_set(&lc->suspended, 1);
 			} else {
+				DMDEBUG("Suspending now");
 				break;
 			}
 		}
@@ -907,7 +917,7 @@
 				DMDEBUG("Delaying mark to region %Lu, due to recovery",
 					rs->rs_region);
 				set_current_state(TASK_INTERRUPTIBLE);
-				schedule_timeout(HZ/2);
+				schedule_timeout(HZ);
 				continue;
 			}
 
@@ -933,7 +943,7 @@
 		if (r == -EBUSY) {
 			DMDEBUG("Delaying flush due to recovery");
 			set_current_state(TASK_INTERRUPTIBLE);
-			schedule_timeout(HZ/2);
+			schedule_timeout(HZ);
 			continue;
 		}
 
@@ -1109,6 +1119,10 @@
 		rtn = 0;
 	}
 
+	if (rtn)
+		DMDEBUG("Client received resync work: %Lu/%s",
+			*region, lc->uuid + (strlen(lc->uuid) - 8));
+
 	/*
 	 * Check for bug 235039
 	 * Note the changes in cluser_clear_region
@@ -1129,10 +1143,16 @@
 static void cluster_complete_resync_work(struct dirty_log *log,
 					 region_t region, int success)
 {
+	int i;
 	region_t success_tmp = success;
 	struct log_c *lc = (struct log_c *) log->context;
-	while(consult_server(lc, region, LRT_COMPLETE_RESYNC_WORK, &success_tmp)){
-		DMWARN("unable to notify server of completed resync work");
+
+	for (i = 0; i < 5; i++) {
+		if (!consult_server(lc, region,
+				    LRT_COMPLETE_RESYNC_WORK, &success_tmp))
+			break;
+		success_tmp = success;
+		DMWARN("Unable to notify server of completed resync work");
 	}
 	return;
 }
--- cluster/cmirror-kernel/src/Attic/dm-cmirror-common.h	2007/04/03 18:23:01	1.1.2.12.2.1
+++ cluster/cmirror-kernel/src/Attic/dm-cmirror-common.h	2007/04/10 07:13:15	1.1.2.12.2.2
@@ -141,6 +141,7 @@
 	struct list_head mark_waiting;
 	struct list_head mark_logged;
 
+	uint32_t server_valid;
 	uint32_t server_id;
 	struct socket *client_sock;
 };
--- cluster/cmirror-kernel/src/Attic/dm-cmirror-server.c	2007/04/05 21:33:36	1.1.2.26.2.4
+++ cluster/cmirror-kernel/src/Attic/dm-cmirror-server.c	2007/04/10 07:13:15	1.1.2.26.2.5
@@ -229,7 +229,8 @@
 	int sync_search, conflict = 0;
 
 	if (lc->recovering_region != (uint64_t)-1) {
-		DMDEBUG("Someone is already recovering region %Lu", lc->recovering_region);
+		DMDEBUG("Someone is already recovering region %Lu/%s",
+			lc->recovering_region, lc->uuid + (strlen(lc->uuid) - 8));
 		return 0;
 	}
 
@@ -253,8 +254,8 @@
 						  sync_search);
 		if (find_ru_by_region(lc, *region)) {
 			conflict = 1;
-			DMDEBUG("Recovery blocked by outstanding write on region %Lu",
-			      *region);
+			DMDEBUG("Recovery blocked by outstanding write on region %Lu/%s",
+				*region, lc->uuid + (strlen(lc->uuid) - 8));
 		} else {
 			break;
 		}
@@ -534,7 +535,8 @@
 
 	new->ru_nodeid = who;
 	new->ru_region = lr->u.lr_region;
-    
+	new->ru_rw = RU_WRITE;
+
 	if (!(ru = find_ru_by_region(lc, lr->u.lr_region))) {
 		log_clear_bit(lc, lc->clean_bits, lr->u.lr_region);
 		list_add(&new->ru_list, &lc->region_users);
@@ -615,6 +617,11 @@
 					DMDEBUG("  %s", (ru2->ru_rw == RU_RECOVER) ? "recover" :
 						(ru2->ru_rw == RU_WRITE) ? "writer" : "unknown");
 
+			/* FIXME: work-around for bug 235040 */
+			DMDEBUG("Revoking resync work");
+			lc->recovering_region = (uint64_t)-1;
+			list_del(&ru->ru_list);
+			mempool_free(ru, region_user_pool);
 			return -EBUSY;
 		}
 	}
@@ -646,7 +653,8 @@
 		new->ru_region = lr->u.lr_region_rtn;
 		new->ru_rw = RU_RECOVER;
 		list_add(&new->ru_list, &lc->region_users);
-		DMDEBUG("Assigning recovery work to %u: %Lu", who, new->ru_region);
+		DMDEBUG("Assigning recovery work to %u/%s: %Lu",
+			who, lc->uuid + (strlen(lc->uuid) - 8), new->ru_region);
 	} else {
 		mempool_free(new, region_user_pool);
 	}
@@ -654,8 +662,8 @@
 	return 0;
 }
 
-
-static int server_complete_resync_work(struct log_c *lc, struct log_request *lr, int success){
+static int server_complete_resync_work(struct log_c *lc, struct log_request *lr,
+				       int success, uint32_t who){
 	struct region_user *ru;
 
 	if (lr->u.lr_region > lc->region_count) {
@@ -679,51 +687,61 @@
 		lc->sync_pass = 0;
 
 		DMDEBUG("Resync work completed: %Lu", lr->u.lr_region);
-	} else if (log_test_bit(lc->sync_bits, lr->u.lr_region)) {
-		ru = find_ru_by_region(lc, lr->u.lr_region);
+		return 0;
+	}
 
-		/*
-		 * The following condition can never happen unless we have
-		 * a corrupted list or we had a communication error.
-		 *
-		 * If a write failed to one of the mirror devices, the ru
-		 * should be RU_WRITE.  If a recovery failed, it should be
-		 * RU_RECOVER
-		 */
-		if (!ru) {
-			DMERR("Unable to find region being marked out-of-sync: %Lu",
-			      lr->u.lr_region);
-			return -EINVAL;
-		}
+	/*
+	 * Recovery failed or mirror is being marked out-of-sync
+	 *
+	 * We can recieve multiple calls to mark out-of-sync
+	 * if there were several writes to the same region that
+	 * failed.  In this case, there will not be a record for
+	 * the region.
+	 */
+	ru = find_ru(lc, who, lr->u.lr_region);
 
-		if (ru->ru_rw == RU_RECOVER) {
-			if (lr->u.lr_region != lc->recovering_region) {
-				DMERR("Recovering region mismatch: (%Lu/%Lu)",
-				      lr->u.lr_region, lc->recovering_region);
-				BUG();
-			}
-			/*
-			 * Clear the recovery
-			 */
-			lc->recovering_region = (uint64_t)-1;
-			list_del(&ru->ru_list);
-			mempool_free(ru, region_user_pool);
-		} else {  /* ru->ru_rw == RU_WRITE */
-			/*
-			 * Mirror has place the region into RH_NOSYNC
-			 * It is safe to pull the ru
-			 */
-			list_del(&ru->ru_list);
-			mempool_free(ru, region_user_pool);			
+	if ((lr->u.lr_region == lc->recovering_region) && !ru) {
+		DMERR("Unable to locate record of recovery");
+		BUG();
+	}
+
+	if (!ru) {
+		DMERR("Unable to find region to be marked out-of-sync: %Lu/%s/%u",
+		      lr->u.lr_region, lc->uuid + (strlen(lc->uuid) - 8), who);
+		BUG();
+	} else 	if (ru->ru_rw == RU_RECOVER) {
+		if (lr->u.lr_region != lc->recovering_region) {
+			DMERR("Recovering region mismatch from node %u: (%Lu/%Lu)",
+			      who, lr->u.lr_region, lc->recovering_region);
+			BUG();
 		}
-		/* gone again: lc->sync_count--;*/
-		log_clear_bit(lc, lc->sync_bits, lr->u.lr_region);
+		/*
+		 * Clear the recovery
+		 */
+		DMDEBUG("Setting recovering region out-of-sync: %Lu/%s/%u",
+			lr->u.lr_region, lc->uuid + (strlen(lc->uuid) - 8), who);
+		lc->recovering_region = (uint64_t)-1;
+		list_del(&ru->ru_list);
+		mempool_free(ru, region_user_pool);
 	}
 
+	/* else if (ru->ru_rw == RU_WRITE)
+	 * Mirror has place the region into RH_NOSYNC
+	 * We will leave the record in place.  It is
+	 * likely there are more requests coming to
+	 * mark this region out-of-sync, due to the
+	 * way mirror handles the situation.
+	 *
+	 DMDEBUG("Setting marked region out-of-sync: %Lu/%s",
+                 lr->u.lr_region, lc->uuid + (strlen(lc->uuid) - 8));
+	 */
+
+	/* gone again: lc->sync_count--;*/
+	log_clear_bit(lc, lc->sync_bits, lr->u.lr_region);
+
 	return 0;
 }
 
-
 static int server_get_sync_count(struct log_c *lc, struct log_request *lr){
 	if (lc->sync_count > lc->region_count) {
 		DMERR("sync_count (" SECTOR_FORMAT ") > region_count (" SECTOR_FORMAT ") in %s!",
@@ -901,6 +919,7 @@
 		 * If we are the server, assign it
 		 */
 		if(lr->u.lr_coordinator == my_id){
+			lc->server_valid = 0;
 			lc->server_id = my_id;
 		}
 
@@ -988,7 +1007,8 @@
 				DMDEBUG("I'm the cluster mirror log server for %s",
 				       lc->uuid + (strlen(lc->uuid) - 8));
 				if (atomic_read(&lc->suspended)) {
-					DMDEBUG("Not reading disk log because I'm suspended.");
+					DMDEBUG("Not reading disk log because I'm suspended (%s)",
+						lc->uuid + (strlen(lc->uuid) - 8));
 					
 				} else if (disk_resume(lc) == -EDEADLK) {
 					DMDEBUG("Unable to read disk log - deadlock potential.");
@@ -1060,7 +1080,12 @@
 			error = server_get_resync_work(lc, lr, nodeid);
 			break;
 		case LRT_COMPLETE_RESYNC_WORK:
-			error = server_complete_resync_work(lc, lr, lr->u.lr_int_rtn);
+			if(!(nodeid =
+			     ipaddr_to_nodeid((struct sockaddr *)msg.msg_name))){
+				error = -ENXIO;
+				break;
+			}
+			error = server_complete_resync_work(lc, lr, lr->u.lr_int_rtn, nodeid);
 			lr->u.lr_int_rtn = 0;
 			break;
 		case LRT_GET_SYNC_COUNT:
@@ -1236,6 +1261,22 @@
 }
 */
 
+int server_free_region_users(struct log_c *lc)
+{
+	int i = 0;
+	struct region_user *ru, *tmp_ru;
+
+	list_for_each_entry_safe(ru, tmp_ru, &lc->region_users, ru_list) {
+		i++;
+		list_del(&ru->ru_list);
+		mempool_free(ru, region_user_pool);
+	}
+
+	DMDEBUG("%d region user structures freed", i);
+	return 0;
+}
+
+
 int suspend_server(void){
 	atomic_set(&_suspend, 1);
 	return 0;
--- cluster/cmirror-kernel/src/Attic/dm-cmirror-server.h	2005/08/11 18:26:19	1.1.2.2
+++ cluster/cmirror-kernel/src/Attic/dm-cmirror-server.h	2007/04/10 07:13:15	1.1.2.2.8.1
@@ -13,5 +13,6 @@
 int start_server(void);
 void stop_server(void);
 void print_server_status(struct log_c *lc);
+int server_free_region_users(struct log_c *lc);
 
 #endif /* __DM_CMIRROR_SERVER_H__ */



^ permalink raw reply	[flat|nested] 40+ messages in thread

* [Cluster-devel] cluster/cmirror-kernel/src dm-cmirror-client.c ...
@ 2007-04-10  7:12 jbrassow
  0 siblings, 0 replies; 40+ messages in thread
From: jbrassow @ 2007-04-10  7:12 UTC (permalink / raw)
  To: cluster-devel.redhat.com

CVSROOT:	/cvs/cluster
Module name:	cluster
Branch: 	RHEL4
Changes by:	jbrassow at sourceware.org	2007-04-10 08:12:24

Modified files:
	cmirror-kernel/src: dm-cmirror-client.c dm-cmirror-common.h 
	                    dm-cmirror-server.c dm-cmirror-server.h 

Log message:
	Bug 235686: Kernel BUG at dm_cmirror_server while recovering region
	
	Several fixes have gone in to fix the handling of this bug:
	1) During server relocation (which can happen due to machine failure
	or normal mirror suspension), the server value could get set before
	the client had a chance to clean-up.  This caused the server to
	become confused and issue a BUG().
	
	2) perform a flush of the log before suspending.  This ensures
	that regions which are in-sync get correctly flushed to the disk
	log.  Without this, there will always be recovery work to be done
	when a mirror starts up - even if it was properly in-sync during
	shutdown.
	
	3) clean-up memory used to record region users when a mirror is
	shutdown.  It was possible for some regions to be left over
	(causing a memory leak) during certain fault scenarios.
	
	4) properly initialize the state field (ru_rw) in the region
	user structure when a mark occurs.  Without the initialization,
	it was sometimes possible for the region to be misinterpreted
	as recovering instead of marked.
	
	5) resolve an unhandled case in server_complete_resync_work
	
	6) reset a variable in cluster_complete_resync_work.  Failure
	to do so was causing a retry to include the wrong value for
	the completion of the resync work - confusing the server.

Patches:
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cmirror-kernel/src/dm-cmirror-client.c.diff?cvsroot=cluster&only_with_tag=RHEL4&r1=1.1.2.44&r2=1.1.2.45
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cmirror-kernel/src/dm-cmirror-common.h.diff?cvsroot=cluster&only_with_tag=RHEL4&r1=1.1.2.13&r2=1.1.2.14
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cmirror-kernel/src/dm-cmirror-server.c.diff?cvsroot=cluster&only_with_tag=RHEL4&r1=1.1.2.30&r2=1.1.2.31
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cmirror-kernel/src/dm-cmirror-server.h.diff?cvsroot=cluster&only_with_tag=RHEL4&r1=1.1.2.2&r2=1.1.2.3

--- cluster/cmirror-kernel/src/Attic/dm-cmirror-client.c	2007/04/05 21:32:25	1.1.2.44
+++ cluster/cmirror-kernel/src/Attic/dm-cmirror-client.c	2007/04/10 07:12:24	1.1.2.45
@@ -373,13 +373,8 @@
 	fs = get_fs();
 	set_fs(get_ds());
 
-	if(type == LRT_MASTER_LEAVING){
-		len = my_recvmsg(lc->client_sock, &msg, sizeof(struct log_request),
-				 0, 10);
-	} else {
-		len = my_recvmsg(lc->client_sock, &msg, sizeof(struct log_request),
-				 0, 5);
-	}
+	len = my_recvmsg(lc->client_sock, &msg,
+			 sizeof(struct log_request), 0, 15);
 	set_fs(fs);
 
 	if(len <= 0){
@@ -492,9 +487,14 @@
 			goto out;
 		}
 	election:
-		while(lc->server_id == 0xDEAD){
+		while ((lc->server_id == 0xDEAD) || (!lc->server_valid)) {
+			DMDEBUG("server_id=%x, server_valid=%u, %s",
+				lc->server_id, lc->server_valid,
+				lc->uuid + (strlen(lc->uuid) - 8));
+			DMDEBUG("trigger = %s", RQ_STRING(type));
 			run_election(lc, my_id);
 			new_server = 1;
+			lc->server_valid = 1;
 		}
 
 		spin_lock(&lc->state_lock);
@@ -600,6 +600,7 @@
 	INIT_LIST_HEAD(&lc->mark_logged);
 	spin_lock_init(&lc->state_lock);
 
+	lc->server_valid = 0;
 	lc->server_id = 0xDEAD;
 
 	if ((error = cluster_connect())) {
@@ -731,19 +732,20 @@
 	}
 
 	if (!list_empty(&lc->mark_logged)) {
-		DMERR("Mark requests remain at cluster log deactivation");
-		/*
-		 * Should I BUG() this?
-		 * No.  In the worst case, they will get cleaned up later
-		 */
-	}
-	list_for_each_entry_safe(rs, tmp_rs, &lc->mark_logged, rs_list) {
-		list_del_init(&rs->rs_list);
-		mempool_free(rs, region_state_pool);
+		int i = 0;
+		
+		list_for_each_entry_safe(rs, tmp_rs, &lc->mark_logged, rs_list) {
+			i++;
+			list_del_init(&rs->rs_list);
+			mempool_free(rs, region_state_pool);
+		}
+		DMDEBUG("%d mark requests remain at cluster log deactivation", i);
 	}
 
 	spin_unlock(&lc->state_lock);
 
+	server_free_region_users(lc);
+
 	if (lc->log_dev)
 		disk_dtr(log);
 	else
@@ -753,8 +755,13 @@
 		DMERR("Unable to disconnect from cluster infrastructure.\n");
 }
 
+static int cluster_flush(struct dirty_log *log);
 static int cluster_presuspend(struct dirty_log *log)
 {
+	/* FIXME: flush is work-around for bug 235040 */
+	DMDEBUG("Performing flush to work around bug 235040");
+	cluster_flush(log);
+	DMDEBUG("Log flush complete");
 	return 0;
 }
 
@@ -784,6 +791,7 @@
 	atomic_set(&lc->suspended, 1);
 	if(lc->server_id == my_id) {
 		while (1) {
+			DMDEBUG("Telling everyone I'm suspending");
 			consult_server(lc, 0, LRT_MASTER_LEAVING, NULL);
 
 			down(&consult_server_lock);
@@ -791,11 +799,13 @@
 			up(&consult_server_lock);
 
 			if ((my_id && (lc->server_id == my_id))) {
+				DMDEBUG("Delaying suspend, work to be done.");
 				atomic_set(&lc->suspended, 0);
 				set_current_state(TASK_INTERRUPTIBLE);
 				schedule_timeout(HZ*2);
 				atomic_set(&lc->suspended, 1);
 			} else {
+				DMDEBUG("Suspending now");
 				break;
 			}
 		}
@@ -907,7 +917,7 @@
 				DMDEBUG("Delaying mark to region %Lu, due to recovery",
 					rs->rs_region);
 				set_current_state(TASK_INTERRUPTIBLE);
-				schedule_timeout(HZ/2);
+				schedule_timeout(HZ);
 				continue;
 			}
 
@@ -933,7 +943,7 @@
 		if (r == -EBUSY) {
 			DMDEBUG("Delaying flush due to recovery");
 			set_current_state(TASK_INTERRUPTIBLE);
-			schedule_timeout(HZ/2);
+			schedule_timeout(HZ);
 			continue;
 		}
 
@@ -1109,6 +1119,10 @@
 		rtn = 0;
 	}
 
+	if (rtn)
+		DMDEBUG("Client received resync work: %Lu/%s",
+			*region, lc->uuid + (strlen(lc->uuid) - 8));
+
 	/*
 	 * Check for bug 235039
 	 * Note the changes in cluser_clear_region
@@ -1129,10 +1143,16 @@
 static void cluster_complete_resync_work(struct dirty_log *log,
 					 region_t region, int success)
 {
+	int i;
 	region_t success_tmp = success;
 	struct log_c *lc = (struct log_c *) log->context;
-	while(consult_server(lc, region, LRT_COMPLETE_RESYNC_WORK, &success_tmp)){
-		DMWARN("unable to notify server of completed resync work");
+
+	for (i = 0; i < 5; i++) {
+		if (!consult_server(lc, region,
+				    LRT_COMPLETE_RESYNC_WORK, &success_tmp))
+			break;
+		success_tmp = success;
+		DMWARN("Unable to notify server of completed resync work");
 	}
 	return;
 }
--- cluster/cmirror-kernel/src/Attic/dm-cmirror-common.h	2007/04/03 18:21:10	1.1.2.13
+++ cluster/cmirror-kernel/src/Attic/dm-cmirror-common.h	2007/04/10 07:12:24	1.1.2.14
@@ -141,6 +141,7 @@
 	struct list_head mark_waiting;
 	struct list_head mark_logged;
 
+	uint32_t server_valid;
 	uint32_t server_id;
 	struct socket *client_sock;
 };
--- cluster/cmirror-kernel/src/Attic/dm-cmirror-server.c	2007/04/05 21:32:25	1.1.2.30
+++ cluster/cmirror-kernel/src/Attic/dm-cmirror-server.c	2007/04/10 07:12:24	1.1.2.31
@@ -229,7 +229,8 @@
 	int sync_search, conflict = 0;
 
 	if (lc->recovering_region != (uint64_t)-1) {
-		DMDEBUG("Someone is already recovering region %Lu", lc->recovering_region);
+		DMDEBUG("Someone is already recovering region %Lu/%s",
+			lc->recovering_region, lc->uuid + (strlen(lc->uuid) - 8));
 		return 0;
 	}
 
@@ -253,8 +254,8 @@
 						  sync_search);
 		if (find_ru_by_region(lc, *region)) {
 			conflict = 1;
-			DMDEBUG("Recovery blocked by outstanding write on region %Lu",
-			      *region);
+			DMDEBUG("Recovery blocked by outstanding write on region %Lu/%s",
+				*region, lc->uuid + (strlen(lc->uuid) - 8));
 		} else {
 			break;
 		}
@@ -534,7 +535,8 @@
 
 	new->ru_nodeid = who;
 	new->ru_region = lr->u.lr_region;
-    
+	new->ru_rw = RU_WRITE;
+
 	if (!(ru = find_ru_by_region(lc, lr->u.lr_region))) {
 		log_clear_bit(lc, lc->clean_bits, lr->u.lr_region);
 		list_add(&new->ru_list, &lc->region_users);
@@ -615,6 +617,11 @@
 					DMDEBUG("  %s", (ru2->ru_rw == RU_RECOVER) ? "recover" :
 						(ru2->ru_rw == RU_WRITE) ? "writer" : "unknown");
 
+			/* FIXME: work-around for bug 235040 */
+			DMDEBUG("Revoking resync work");
+			lc->recovering_region = (uint64_t)-1;
+			list_del(&ru->ru_list);
+			mempool_free(ru, region_user_pool);
 			return -EBUSY;
 		}
 	}
@@ -646,7 +653,8 @@
 		new->ru_region = lr->u.lr_region_rtn;
 		new->ru_rw = RU_RECOVER;
 		list_add(&new->ru_list, &lc->region_users);
-		DMDEBUG("Assigning recovery work to %u: %Lu", who, new->ru_region);
+		DMDEBUG("Assigning recovery work to %u/%s: %Lu",
+			who, lc->uuid + (strlen(lc->uuid) - 8), new->ru_region);
 	} else {
 		mempool_free(new, region_user_pool);
 	}
@@ -654,8 +662,8 @@
 	return 0;
 }
 
-
-static int server_complete_resync_work(struct log_c *lc, struct log_request *lr, int success){
+static int server_complete_resync_work(struct log_c *lc, struct log_request *lr,
+				       int success, uint32_t who){
 	struct region_user *ru;
 
 	if (lr->u.lr_region > lc->region_count) {
@@ -679,51 +687,61 @@
 		lc->sync_pass = 0;
 
 		DMDEBUG("Resync work completed: %Lu", lr->u.lr_region);
-	} else if (log_test_bit(lc->sync_bits, lr->u.lr_region)) {
-		ru = find_ru_by_region(lc, lr->u.lr_region);
+		return 0;
+	}
 
-		/*
-		 * The following condition can never happen unless we have
-		 * a corrupted list or we had a communication error.
-		 *
-		 * If a write failed to one of the mirror devices, the ru
-		 * should be RU_WRITE.  If a recovery failed, it should be
-		 * RU_RECOVER
-		 */
-		if (!ru) {
-			DMERR("Unable to find region being marked out-of-sync: %Lu",
-			      lr->u.lr_region);
-			return -EINVAL;
-		}
+	/*
+	 * Recovery failed or mirror is being marked out-of-sync
+	 *
+	 * We can recieve multiple calls to mark out-of-sync
+	 * if there were several writes to the same region that
+	 * failed.  In this case, there will not be a record for
+	 * the region.
+	 */
+	ru = find_ru(lc, who, lr->u.lr_region);
 
-		if (ru->ru_rw == RU_RECOVER) {
-			if (lr->u.lr_region != lc->recovering_region) {
-				DMERR("Recovering region mismatch: (%Lu/%Lu)",
-				      lr->u.lr_region, lc->recovering_region);
-				BUG();
-			}
-			/*
-			 * Clear the recovery
-			 */
-			lc->recovering_region = (uint64_t)-1;
-			list_del(&ru->ru_list);
-			mempool_free(ru, region_user_pool);
-		} else {  /* ru->ru_rw == RU_WRITE */
-			/*
-			 * Mirror has place the region into RH_NOSYNC
-			 * It is safe to pull the ru
-			 */
-			list_del(&ru->ru_list);
-			mempool_free(ru, region_user_pool);			
+	if ((lr->u.lr_region == lc->recovering_region) && !ru) {
+		DMERR("Unable to locate record of recovery");
+		BUG();
+	}
+
+	if (!ru) {
+		DMERR("Unable to find region to be marked out-of-sync: %Lu/%s/%u",
+		      lr->u.lr_region, lc->uuid + (strlen(lc->uuid) - 8), who);
+		BUG();
+	} else 	if (ru->ru_rw == RU_RECOVER) {
+		if (lr->u.lr_region != lc->recovering_region) {
+			DMERR("Recovering region mismatch from node %u: (%Lu/%Lu)",
+			      who, lr->u.lr_region, lc->recovering_region);
+			BUG();
 		}
-		/* gone again: lc->sync_count--;*/
-		log_clear_bit(lc, lc->sync_bits, lr->u.lr_region);
+		/*
+		 * Clear the recovery
+		 */
+		DMDEBUG("Setting recovering region out-of-sync: %Lu/%s/%u",
+			lr->u.lr_region, lc->uuid + (strlen(lc->uuid) - 8), who);
+		lc->recovering_region = (uint64_t)-1;
+		list_del(&ru->ru_list);
+		mempool_free(ru, region_user_pool);
 	}
 
+	/* else if (ru->ru_rw == RU_WRITE)
+	 * Mirror has place the region into RH_NOSYNC
+	 * We will leave the record in place.  It is
+	 * likely there are more requests coming to
+	 * mark this region out-of-sync, due to the
+	 * way mirror handles the situation.
+	 *
+	 DMDEBUG("Setting marked region out-of-sync: %Lu/%s",
+                 lr->u.lr_region, lc->uuid + (strlen(lc->uuid) - 8));
+	 */
+
+	/* gone again: lc->sync_count--;*/
+	log_clear_bit(lc, lc->sync_bits, lr->u.lr_region);
+
 	return 0;
 }
 
-
 static int server_get_sync_count(struct log_c *lc, struct log_request *lr){
 	if (lc->sync_count > lc->region_count) {
 		DMERR("sync_count (" SECTOR_FORMAT ") > region_count (" SECTOR_FORMAT ") in %s!",
@@ -901,6 +919,7 @@
 		 * If we are the server, assign it
 		 */
 		if(lr->u.lr_coordinator == my_id){
+			lc->server_valid = 0;
 			lc->server_id = my_id;
 		}
 
@@ -988,7 +1007,8 @@
 				DMDEBUG("I'm the cluster mirror log server for %s",
 				       lc->uuid + (strlen(lc->uuid) - 8));
 				if (atomic_read(&lc->suspended)) {
-					DMDEBUG("Not reading disk log because I'm suspended.");
+					DMDEBUG("Not reading disk log because I'm suspended (%s)",
+						lc->uuid + (strlen(lc->uuid) - 8));
 					
 				} else if (disk_resume(lc) == -EDEADLK) {
 					DMDEBUG("Unable to read disk log - deadlock potential.");
@@ -1060,7 +1080,12 @@
 			error = server_get_resync_work(lc, lr, nodeid);
 			break;
 		case LRT_COMPLETE_RESYNC_WORK:
-			error = server_complete_resync_work(lc, lr, lr->u.lr_int_rtn);
+			if(!(nodeid =
+			     ipaddr_to_nodeid((struct sockaddr *)msg.msg_name))){
+				error = -ENXIO;
+				break;
+			}
+			error = server_complete_resync_work(lc, lr, lr->u.lr_int_rtn, nodeid);
 			lr->u.lr_int_rtn = 0;
 			break;
 		case LRT_GET_SYNC_COUNT:
@@ -1236,6 +1261,22 @@
 }
 */
 
+int server_free_region_users(struct log_c *lc)
+{
+	int i = 0;
+	struct region_user *ru, *tmp_ru;
+
+	list_for_each_entry_safe(ru, tmp_ru, &lc->region_users, ru_list) {
+		i++;
+		list_del(&ru->ru_list);
+		mempool_free(ru, region_user_pool);
+	}
+
+	DMDEBUG("%d region user structures freed", i);
+	return 0;
+}
+
+
 int suspend_server(void){
 	atomic_set(&_suspend, 1);
 	return 0;
--- cluster/cmirror-kernel/src/Attic/dm-cmirror-server.h	2005/08/11 18:26:19	1.1.2.2
+++ cluster/cmirror-kernel/src/Attic/dm-cmirror-server.h	2007/04/10 07:12:24	1.1.2.3
@@ -13,5 +13,6 @@
 int start_server(void);
 void stop_server(void);
 void print_server_status(struct log_c *lc);
+int server_free_region_users(struct log_c *lc);
 
 #endif /* __DM_CMIRROR_SERVER_H__ */



^ permalink raw reply	[flat|nested] 40+ messages in thread

* [Cluster-devel] cluster/cmirror-kernel/src dm-cmirror-client.c ...
@ 2007-04-05 21:33 jbrassow
  0 siblings, 0 replies; 40+ messages in thread
From: jbrassow @ 2007-04-05 21:33 UTC (permalink / raw)
  To: cluster-devel.redhat.com

CVSROOT:	/cvs/cluster
Module name:	cluster
Branch: 	RHEL45
Changes by:	jbrassow at sourceware.org	2007-04-05 22:33:36

Modified files:
	cmirror-kernel/src: dm-cmirror-client.c dm-cmirror-server.c 

Log message:
	Bug 234918 Processed: NMI Watchdog detected LOCKUP while running proces...
	Bug 217438: scrolling kernel requests to mark mirror regions
	
	Item 1:
	I needed to check for marked regions when getting resync work, not
	just check for resyncing regions when a mark/flush happens.
	
	Item 2:
	There is a corner case that allows two calls to clear the same
	region.  The second does not need to be logged.

Patches:
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cmirror-kernel/src/dm-cmirror-client.c.diff?cvsroot=cluster&only_with_tag=RHEL45&r1=1.1.2.41.2.2&r2=1.1.2.41.2.3
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cmirror-kernel/src/dm-cmirror-server.c.diff?cvsroot=cluster&only_with_tag=RHEL45&r1=1.1.2.26.2.3&r2=1.1.2.26.2.4

--- cluster/cmirror-kernel/src/Attic/dm-cmirror-client.c	2007/04/03 18:23:01	1.1.2.41.2.2
+++ cluster/cmirror-kernel/src/Attic/dm-cmirror-client.c	2007/04/05 21:33:36	1.1.2.41.2.3
@@ -1034,7 +1034,9 @@
 
 	spin_lock(&lc->state_lock);
 
-	/* Should find match in this list, or no lists at all */
+	/*
+	 * The nominal case is to find the region in the marked list
+	 */
 	list_for_each_entry_safe(rs, tmp_rs, &lc->mark_logged, rs_list){
 		if(region == rs->rs_region){
 			list_del_init(&rs->rs_list);
@@ -1043,28 +1045,46 @@
 		}
 	}
 
-
-	list_for_each_entry_safe(rs, tmp_rs, &lc->mark_waiting, rs_list){
+	/*
+	 * It is possible, but unlikely to get to this case. It requires
+	 * the following to happen:
+	 * 1) mark the region for writing
+	 * 2) clear the region
+	 * 3) clear doesn't get flushed because of bug 235040
+	 * 4) suspend due to server relocation
+	 * 5) on-disk log says we need to recover (because it hasn't been cleared)
+	 * 6) we recover the region
+	 * 7) clearing the region after recovery causes us to get here
+	 *
+	 * Once 235040 is cleared, any entries found in this list should
+	 * cause a bug.
+	 */
+	list_for_each_entry_safe(rs, tmp_rs, &lc->clear_waiting, rs_list){
 		if(region == rs->rs_region){
-			DMERR("Clear pre-empting mark (%Lu/%s)",
-			       region, lc->uuid + (strlen(lc->uuid) - 8));
-			BUG();
+			DMERR("%d) Double clear on region ("
+			      SECTOR_FORMAT ")", __LINE__, region);
+			goto out;
 		}
 	}
 
-	list_for_each_entry_safe(rs, tmp_rs, &lc->clear_waiting, rs_list){
+	list_for_each_entry_safe(rs, tmp_rs, &lc->mark_waiting, rs_list){
 		if(region == rs->rs_region){
-			DMERR("%d) Double clear on region ("
-			      SECTOR_FORMAT ")", __LINE__, region);
+			DMERR("Clear pre-empting mark (%Lu/%s)",
+			       region, lc->uuid + (strlen(lc->uuid) - 8));
 			BUG();
 		}
 	}
+	
 	/* We can get here because we may be doing resync_work, and therefore,**
 	** clearing without ever marking..................................... */
 
 	/* Don't need to spin_unlock, because allocation is non-blocking */
 	rs_new = mempool_alloc(region_state_pool, GFP_ATOMIC);
-	BUG_ON(!rs_new);
+	if (!rs_new) {
+		DMERR("Failed to allocate space for clear region request: %Lu",
+		      region);
+		BUG();
+	}
 	memset(rs_new, 0, sizeof(struct region_state));
 
 	rs_new->rs_region = region;
@@ -1088,6 +1108,21 @@
 		DMWARN("Error while getting resync work: bad region");
 		rtn = 0;
 	}
+
+	/*
+	 * Check for bug 235039
+	 * Note the changes in cluser_clear_region
+	 */
+	if (rtn == 1) {
+		struct region_state *rs, *tmp_rs;
+		list_for_each_entry_safe(rs, tmp_rs, &lc->clear_waiting, rs_list) {
+			if (*region == rs->rs_region) {
+				DMERR("WARNING: Bug 235039/235040 detected!");
+				DMERR("Work-around in place.");
+			}
+		}
+	}
+
 	return rtn;
 }
 
--- cluster/cmirror-kernel/src/Attic/dm-cmirror-server.c	2007/04/04 21:36:01	1.1.2.26.2.3
+++ cluster/cmirror-kernel/src/Attic/dm-cmirror-server.c	2007/04/05 21:33:36	1.1.2.26.2.4
@@ -656,6 +656,8 @@
 
 
 static int server_complete_resync_work(struct log_c *lc, struct log_request *lr, int success){
+	struct region_user *ru;
+
 	if (lr->u.lr_region > lc->region_count) {
 		return -EINVAL;
 	}
@@ -678,6 +680,42 @@
 
 		DMDEBUG("Resync work completed: %Lu", lr->u.lr_region);
 	} else if (log_test_bit(lc->sync_bits, lr->u.lr_region)) {
+		ru = find_ru_by_region(lc, lr->u.lr_region);
+
+		/*
+		 * The following condition can never happen unless we have
+		 * a corrupted list or we had a communication error.
+		 *
+		 * If a write failed to one of the mirror devices, the ru
+		 * should be RU_WRITE.  If a recovery failed, it should be
+		 * RU_RECOVER
+		 */
+		if (!ru) {
+			DMERR("Unable to find region being marked out-of-sync: %Lu",
+			      lr->u.lr_region);
+			return -EINVAL;
+		}
+
+		if (ru->ru_rw == RU_RECOVER) {
+			if (lr->u.lr_region != lc->recovering_region) {
+				DMERR("Recovering region mismatch: (%Lu/%Lu)",
+				      lr->u.lr_region, lc->recovering_region);
+				BUG();
+			}
+			/*
+			 * Clear the recovery
+			 */
+			lc->recovering_region = (uint64_t)-1;
+			list_del(&ru->ru_list);
+			mempool_free(ru, region_user_pool);
+		} else {  /* ru->ru_rw == RU_WRITE */
+			/*
+			 * Mirror has place the region into RH_NOSYNC
+			 * It is safe to pull the ru
+			 */
+			list_del(&ru->ru_list);
+			mempool_free(ru, region_user_pool);			
+		}
 		/* gone again: lc->sync_count--;*/
 		log_clear_bit(lc, lc->sync_bits, lr->u.lr_region);
 	}



^ permalink raw reply	[flat|nested] 40+ messages in thread

* [Cluster-devel] cluster/cmirror-kernel/src dm-cmirror-client.c ...
@ 2007-04-05 21:32 jbrassow
  0 siblings, 0 replies; 40+ messages in thread
From: jbrassow @ 2007-04-05 21:32 UTC (permalink / raw)
  To: cluster-devel.redhat.com

CVSROOT:	/cvs/cluster
Module name:	cluster
Branch: 	RHEL4
Changes by:	jbrassow at sourceware.org	2007-04-05 22:32:26

Modified files:
	cmirror-kernel/src: dm-cmirror-client.c dm-cmirror-server.c 

Log message:
	Bug 234918 Processed: NMI Watchdog detected LOCKUP while running proces...
	Bug 217438: scrolling kernel requests to mark mirror regions
	
	Item 1:
	I needed to check for marked regions when getting resync work, not
	just check for resyncing regions when a mark/flush happens.
	
	Item 2:
	There is a corner case that allows two calls to clear the same
	region.  The second does not need to be logged.

Patches:
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cmirror-kernel/src/dm-cmirror-client.c.diff?cvsroot=cluster&only_with_tag=RHEL4&r1=1.1.2.43&r2=1.1.2.44
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cmirror-kernel/src/dm-cmirror-server.c.diff?cvsroot=cluster&only_with_tag=RHEL4&r1=1.1.2.29&r2=1.1.2.30

--- cluster/cmirror-kernel/src/Attic/dm-cmirror-client.c	2007/04/03 18:21:10	1.1.2.43
+++ cluster/cmirror-kernel/src/Attic/dm-cmirror-client.c	2007/04/05 21:32:25	1.1.2.44
@@ -1034,7 +1034,9 @@
 
 	spin_lock(&lc->state_lock);
 
-	/* Should find match in this list, or no lists at all */
+	/*
+	 * The nominal case is to find the region in the marked list
+	 */
 	list_for_each_entry_safe(rs, tmp_rs, &lc->mark_logged, rs_list){
 		if(region == rs->rs_region){
 			list_del_init(&rs->rs_list);
@@ -1043,28 +1045,46 @@
 		}
 	}
 
-
-	list_for_each_entry_safe(rs, tmp_rs, &lc->mark_waiting, rs_list){
+	/*
+	 * It is possible, but unlikely to get to this case. It requires
+	 * the following to happen:
+	 * 1) mark the region for writing
+	 * 2) clear the region
+	 * 3) clear doesn't get flushed because of bug 235040
+	 * 4) suspend due to server relocation
+	 * 5) on-disk log says we need to recover (because it hasn't been cleared)
+	 * 6) we recover the region
+	 * 7) clearing the region after recovery causes us to get here
+	 *
+	 * Once 235040 is cleared, any entries found in this list should
+	 * cause a bug.
+	 */
+	list_for_each_entry_safe(rs, tmp_rs, &lc->clear_waiting, rs_list){
 		if(region == rs->rs_region){
-			DMERR("Clear pre-empting mark (%Lu/%s)",
-			       region, lc->uuid + (strlen(lc->uuid) - 8));
-			BUG();
+			DMERR("%d) Double clear on region ("
+			      SECTOR_FORMAT ")", __LINE__, region);
+			goto out;
 		}
 	}
 
-	list_for_each_entry_safe(rs, tmp_rs, &lc->clear_waiting, rs_list){
+	list_for_each_entry_safe(rs, tmp_rs, &lc->mark_waiting, rs_list){
 		if(region == rs->rs_region){
-			DMERR("%d) Double clear on region ("
-			      SECTOR_FORMAT ")", __LINE__, region);
+			DMERR("Clear pre-empting mark (%Lu/%s)",
+			       region, lc->uuid + (strlen(lc->uuid) - 8));
 			BUG();
 		}
 	}
+	
 	/* We can get here because we may be doing resync_work, and therefore,**
 	** clearing without ever marking..................................... */
 
 	/* Don't need to spin_unlock, because allocation is non-blocking */
 	rs_new = mempool_alloc(region_state_pool, GFP_ATOMIC);
-	BUG_ON(!rs_new);
+	if (!rs_new) {
+		DMERR("Failed to allocate space for clear region request: %Lu",
+		      region);
+		BUG();
+	}
 	memset(rs_new, 0, sizeof(struct region_state));
 
 	rs_new->rs_region = region;
@@ -1088,6 +1108,21 @@
 		DMWARN("Error while getting resync work: bad region");
 		rtn = 0;
 	}
+
+	/*
+	 * Check for bug 235039
+	 * Note the changes in cluser_clear_region
+	 */
+	if (rtn == 1) {
+		struct region_state *rs, *tmp_rs;
+		list_for_each_entry_safe(rs, tmp_rs, &lc->clear_waiting, rs_list) {
+			if (*region == rs->rs_region) {
+				DMERR("WARNING: Bug 235039/235040 detected!");
+				DMERR("Work-around in place.");
+			}
+		}
+	}
+
 	return rtn;
 }
 
--- cluster/cmirror-kernel/src/Attic/dm-cmirror-server.c	2007/04/04 21:35:23	1.1.2.29
+++ cluster/cmirror-kernel/src/Attic/dm-cmirror-server.c	2007/04/05 21:32:25	1.1.2.30
@@ -656,6 +656,8 @@
 
 
 static int server_complete_resync_work(struct log_c *lc, struct log_request *lr, int success){
+	struct region_user *ru;
+
 	if (lr->u.lr_region > lc->region_count) {
 		return -EINVAL;
 	}
@@ -678,6 +680,42 @@
 
 		DMDEBUG("Resync work completed: %Lu", lr->u.lr_region);
 	} else if (log_test_bit(lc->sync_bits, lr->u.lr_region)) {
+		ru = find_ru_by_region(lc, lr->u.lr_region);
+
+		/*
+		 * The following condition can never happen unless we have
+		 * a corrupted list or we had a communication error.
+		 *
+		 * If a write failed to one of the mirror devices, the ru
+		 * should be RU_WRITE.  If a recovery failed, it should be
+		 * RU_RECOVER
+		 */
+		if (!ru) {
+			DMERR("Unable to find region being marked out-of-sync: %Lu",
+			      lr->u.lr_region);
+			return -EINVAL;
+		}
+
+		if (ru->ru_rw == RU_RECOVER) {
+			if (lr->u.lr_region != lc->recovering_region) {
+				DMERR("Recovering region mismatch: (%Lu/%Lu)",
+				      lr->u.lr_region, lc->recovering_region);
+				BUG();
+			}
+			/*
+			 * Clear the recovery
+			 */
+			lc->recovering_region = (uint64_t)-1;
+			list_del(&ru->ru_list);
+			mempool_free(ru, region_user_pool);
+		} else {  /* ru->ru_rw == RU_WRITE */
+			/*
+			 * Mirror has place the region into RH_NOSYNC
+			 * It is safe to pull the ru
+			 */
+			list_del(&ru->ru_list);
+			mempool_free(ru, region_user_pool);			
+		}
 		/* gone again: lc->sync_count--;*/
 		log_clear_bit(lc, lc->sync_bits, lr->u.lr_region);
 	}



^ permalink raw reply	[flat|nested] 40+ messages in thread

* [Cluster-devel] cluster/cmirror-kernel/src dm-cmirror-client.c ...
@ 2007-04-03 18:23 jbrassow
  0 siblings, 0 replies; 40+ messages in thread
From: jbrassow @ 2007-04-03 18:23 UTC (permalink / raw)
  To: cluster-devel.redhat.com

CVSROOT:	/cvs/cluster
Module name:	cluster
Branch: 	RHEL45
Changes by:	jbrassow at sourceware.org	2007-04-03 19:23:01

Modified files:
	cmirror-kernel/src: dm-cmirror-client.c dm-cmirror-common.h 
	                    dm-cmirror-server.c dm-cmirror-xfr.h 

Log message:
	Bug 234539: multiple streams of I/O can cause system to lock up
	
	This bug provoked an audit of the communications exchange, locking,
	and memory allocations/stack usage.
	
	Communication fixes include:
	1) Added sequence numbers to ensure that replies from the server
	correctly correspond to client requests.  It was found that if
	a client timed out waiting for a server to respond, it would send
	the request again.  However, the server may have simply been too
	busy to respond in a timely fashion.  It ends up responding to
	both the original request and the resent request - causing the
	client and server to become out-of-sync WRT log requests.
	
	Locking fixes include:
	1) A semaphore was being "up"ed twice in some cases, rendering
	the lock impotent.
	
	2) A spin lock controlling region status lists was being held
	across blocking operations - sometimes causing deadlocks.  The
	spin lock was changed to a per-log lock, and some logging
	operations were restructured to better suit the way locking
	needed to be done.  A side-effect of this fix is a 20%
	improvement in write operations.
	
	3) The log list protection lock needed to change from a spin lock
	to a semaphore to allow blocking operations.
	
	Memory allocation fixes include:
	1) Wrong flags to kmalloc could cause deadlock.  Use NOFS instead
	of KERNEL.
	
	2) Mempools needed more reserves for low memory conditions.
	
	3) Server now allocates a communication structure instead of having
	it on the stack.  This reduces the likelihood of stack corruption.

Patches:
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cmirror-kernel/src/dm-cmirror-client.c.diff?cvsroot=cluster&only_with_tag=RHEL45&r1=1.1.2.41.2.1&r2=1.1.2.41.2.2
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cmirror-kernel/src/dm-cmirror-common.h.diff?cvsroot=cluster&only_with_tag=RHEL45&r1=1.1.2.12&r2=1.1.2.12.2.1
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cmirror-kernel/src/dm-cmirror-server.c.diff?cvsroot=cluster&only_with_tag=RHEL45&r1=1.1.2.26.2.1&r2=1.1.2.26.2.2
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cmirror-kernel/src/dm-cmirror-xfr.h.diff?cvsroot=cluster&only_with_tag=RHEL45&r1=1.1.2.2.2.1&r2=1.1.2.2.2.2

--- cluster/cmirror-kernel/src/Attic/dm-cmirror-client.c	2007/03/22 22:34:44	1.1.2.41.2.1
+++ cluster/cmirror-kernel/src/Attic/dm-cmirror-client.c	2007/04/03 18:23:01	1.1.2.41.2.2
@@ -28,20 +28,16 @@
 #include "dm-cmirror-server.h"
 #include "dm-cmirror-cman.h"
 
-spinlock_t log_list_lock;
+DECLARE_MUTEX(log_list_lock);
 LIST_HEAD(log_list_head);
 
 struct region_state {
-	struct log_c *rs_lc;
+	int rs_mark_logged;
 	region_t rs_region;
 	struct list_head rs_list;
 };
 
 static mempool_t *region_state_pool = NULL;
-static spinlock_t region_state_lock;
-static int clear_region_count=0;
-static struct list_head clear_region_list;
-static struct list_head marked_region_list;
 
 static int shutting_down=0;
 static atomic_t suspend_client;
@@ -145,15 +141,7 @@
 	memset(lc->sync_bits, (sync == NOSYNC) ? -1 : 0, bitset_size);
 	lc->sync_count = (sync == NOSYNC) ? region_count : 0;
 
-	lc->recovering_bits = vmalloc(bitset_size);
-	if (!lc->recovering_bits) {
-		DMWARN("couldn't allocate sync bitset");
-		vfree(lc->sync_bits);
-		vfree(lc->clean_bits);
-		kfree(lc);
-		return -ENOMEM;
-	}
-	memset(lc->recovering_bits, 0, bitset_size);
+	lc->recovering_region = (uint64_t)-1;
 	lc->sync_search = 0;
 	log->context = lc;
 	return 0;
@@ -164,7 +152,6 @@
 	struct log_c *lc = (struct log_c *) log->context;
 	vfree(lc->clean_bits);
 	vfree(lc->sync_bits);
-	vfree(lc->recovering_bits);
 	kfree(lc);
 }
 
@@ -321,8 +308,9 @@
 
 	request_count++;
 
-	lr = kmalloc(sizeof(struct log_request), GFP_KERNEL);
+	lr = kmalloc(sizeof(struct log_request), GFP_NOFS);
 	if(!lr){
+		BUG();
 		error = -ENOMEM;
 		*retry = 1;
 		goto fail;
@@ -404,15 +392,15 @@
 	}
     
 	if (seq != lr->lr_seq) {
-		DMERR("Message sequence number mismatch: %d/%d",
+		DMDEBUG("Message sequence number mismatch: %d/%d",
 		      seq, lr->lr_seq);
 		if (seq > lr->lr_seq) {
-			DMERR(" Skipping.  Listening again for response to %s",
+			DMDEBUG(" Skipping.  Listening again for response to %s",
 			      RQ_STRING(type));
 			memset(lr, 0, sizeof(struct log_request));
 			goto rerecv;
 		}
-		DMERR(" Must try to resend request, %s", RQ_STRING(type));
+		DMERR(" Seq# mismatch: Must try to resend request, %s", RQ_STRING(type));
 		error = -EBADE;
 		*retry = 1;
 		seq++;
@@ -509,91 +497,43 @@
 			new_server = 1;
 		}
 
-		spin_lock(&region_state_lock);
+		spin_lock(&lc->state_lock);
 		if(new_server && 
-		   (!list_empty(&clear_region_list) ||
-		    !list_empty(&marked_region_list))){
+		   !list_empty(&lc->mark_logged)){
 			int i=0;
-			struct region_state *tmp_rs;
+			LIST_HEAD(mark);
 
 			DMINFO("Clean-up required due to new server");
-			DMINFO(" - Wiping clear region list");
-			list_for_each_entry_safe(rs, tmp_rs,
-						 &clear_region_list, rs_list){
-				/* Remove only those associated with referenced log */
-				if (rs->rs_lc != lc)
-					continue;
-				i++;
-				list_del_init(&rs->rs_list);
-				mempool_free(rs, region_state_pool);
-			}
-			clear_region_count -= i;
-			DMINFO(" - %d clear region requests wiped", i);
-			i=0;
 			DMINFO(" - Resending all mark region requests");
-			list_for_each_entry(rs, &marked_region_list, rs_list){
-				/* Resend only those associated with referenced log */
-				if (rs->rs_lc != lc)
-					continue;
+			list_splice_init(&lc->mark_logged, &mark);
+
+			spin_unlock(&lc->state_lock);
+
+			list_for_each_entry(rs, &mark, rs_list){
 				do {
 					retry = 0;
-					i++;
-					rtn = _consult_server(rs->rs_lc, rs->rs_region,
+					rtn = _consult_server(lc, rs->rs_region,
 							      LRT_MARK_REGION, NULL, &retry);
 					if (lc->server_id == 0xDEAD) {
-						spin_unlock(&region_state_lock);
 						goto election;
 					}
 				} while(retry);
+				i++;
 			}
+
+			spin_lock(&lc->state_lock);
+			list_splice_init(&mark, &lc->mark_logged);
+
 			DMINFO(" - %d mark region requests resent", i);
 			DMINFO("Clean-up complete");
-			if(type == LRT_MARK_REGION){
-				/* we just handled all marks */
-				DMWARN("Mark request ignored.\n");
-				spin_unlock(&region_state_lock);
-				goto out;
-			} else {
-				DMINFO("Continuing request type, %d (%s)", type,
-				       RQ_STRING(type));
-			}
+			DMINFO("Continuing request type, %d (%s)", type,
+			       RQ_STRING(type));
 			new_server = 0;
 		}
-
-		rs = NULL;
-
-		if(!list_empty(&clear_region_list)){
-			rs = list_entry(clear_region_list.next,
-					struct region_state, rs_list);
-			list_del_init(&rs->rs_list);
-			clear_region_count--;
-		}
-
-		spin_unlock(&region_state_lock);
-		
-		/* ATTENTION -- it may be possible to remove a clear region **
-		** request from the list.  Then, have a mark region happen  **
-		** while we are here.  If the clear region request fails, it**
-		** would be re-added - perhaps prematurely clearing the bit */
+		spin_unlock(&lc->state_lock);
 		
-		if(rs && !rs->rs_lc->log_dev_failed){
-			_consult_server(rs->rs_lc, rs->rs_region,
-					LRT_CLEAR_REGION, NULL, &retry);
-
-			if(retry){
-				spin_lock(&region_state_lock);
-				list_add(&rs->rs_list, &clear_region_list);
-				clear_region_count++;
-				spin_unlock(&region_state_lock);
-
-			} else {
-				mempool_free(rs, region_state_pool);
-			}
-		}
 		retry = 0;
-		
 		rtn = _consult_server(lc, region, type, result, &retry);
-		schedule();
 	} while(retry);
 out:
 	up(&consult_server_lock);
@@ -640,7 +580,7 @@
 	atomic_set(&lc->in_sync, -1);
 	lc->uuid_ref = 1;
 
-	spin_lock(&log_list_lock);
+	down(&log_list_lock);
 	list_for_each_entry(tmp_lc, &log_list_head, log_list){
 		if(!strncmp(tmp_lc->uuid, lc->uuid, MAX_NAME_LEN)){
 			lc->uuid_ref = (lc->uuid_ref > tmp_lc->uuid_ref) ?
@@ -649,12 +589,16 @@
 	}
 
 	list_add(&lc->log_list, &log_list_head);
-	spin_unlock(&log_list_lock);
+	up(&log_list_lock);
 	DMDEBUG("Creating %s (%d)",
 	       lc->uuid + (strlen(lc->uuid) - 8),
 	       lc->uuid_ref);
 
 	INIT_LIST_HEAD(&lc->region_users);
+	INIT_LIST_HEAD(&lc->clear_waiting);
+	INIT_LIST_HEAD(&lc->mark_waiting);
+	INIT_LIST_HEAD(&lc->mark_logged);
+	spin_lock_init(&lc->state_lock);
 
 	lc->server_id = 0xDEAD;
 
@@ -761,31 +705,44 @@
 	       lc->uuid + (strlen(lc->uuid) - 8),
 	       lc->uuid_ref);
 
-	if (!list_empty(&clear_region_list))
-		DMINFO("Leaving while clear region requests remain.");
-
-	spin_lock(&log_list_lock);
+	down(&log_list_lock);
 	list_del_init(&lc->log_list);
-	spin_unlock(&log_list_lock);
+	up(&log_list_lock);
 
 	if ((lc->server_id == my_id) && !atomic_read(&lc->suspended))
 		consult_server(lc, 0, LRT_MASTER_LEAVING, NULL);
 
 	sock_release(lc->client_sock);
 
-	spin_lock(&region_state_lock);
+	spin_lock(&lc->state_lock);
 
-	list_for_each_entry_safe(rs, tmp_rs, &clear_region_list, rs_list) {
-		if (lc == rs->rs_lc)
+	if (!list_empty(&lc->clear_waiting)) {
+		DMINFO("Clear requests remain at cluster log deactivation");
+		list_for_each_entry_safe(rs, tmp_rs, &lc->clear_waiting, rs_list) {
 			list_del_init(&rs->rs_list);
+			DMINFO(" - Ignoring clear request: %Lu", rs->rs_region);
+			mempool_free(rs, region_state_pool);
+		}
 	}
 
-	list_for_each_entry_safe(rs, tmp_rs, &marked_region_list, rs_list) {
-		if (lc == rs->rs_lc)
-			list_del_init(&rs->rs_list);
+	if (!list_empty(&lc->mark_waiting)) {
+		DMERR("Pending mark requests remain at cluster_dtr");
+		BUG();
+	}
+
+	if (!list_empty(&lc->mark_logged)) {
+		DMERR("Mark requests remain at cluster log deactivation");
+		/*
+		 * Should I BUG() this?
+		 * No.  In the worst case, they will get cleaned up later
+		 */
+	}
+	list_for_each_entry_safe(rs, tmp_rs, &lc->mark_logged, rs_list) {
+		list_del_init(&rs->rs_list);
+		mempool_free(rs, region_state_pool);
 	}
 
-	spin_unlock(&region_state_lock);
+	spin_unlock(&lc->state_lock);
 
 	if (lc->log_dev)
 		disk_dtr(log);
@@ -803,19 +760,27 @@
 
 static int cluster_postsuspend(struct dirty_log *log)
 {
+	struct region_state *rs, *tmp_rs;
 	struct log_c *lc = (struct log_c *) log->context;
 
-	while (1) {
-		spin_lock(&region_state_lock);
-		if (list_empty(&clear_region_list)) {
-			spin_unlock(&region_state_lock);
-			break;
-		}
-		spin_unlock(&region_state_lock);
+	spin_lock(&lc->state_lock);
+	if (!list_empty(&lc->mark_waiting)) {
+		DMERR("Mark requests remain at postsuspend!");
+		BUG();
+	}
 
-		/* Just an unnessesary call to clear out regions */
-		consult_server(lc, 0, LRT_IN_SYNC, NULL);
+	if (!list_empty(&lc->clear_waiting)) {
+		DMERR("Clear requests remain at postsuspend!");
+
+		list_for_each_entry_safe(rs, tmp_rs, &lc->clear_waiting, rs_list) {
+			list_del_init(&rs->rs_list);
+			DMERR(" - Ignoring clear request: %Lu", rs->rs_region);
+			mempool_free(rs, region_state_pool);
+		}
 	}
+
+	spin_unlock(&lc->state_lock);
+
 	atomic_set(&lc->suspended, 1);
 	if(lc->server_id == my_id) {
 		while (1) {
@@ -903,103 +868,162 @@
 
 static int cluster_flush(struct dirty_log *log)
 {
+	int r = 0;
+	int clear_count = 0;
+	int mark_count = 0;
 	struct log_c *lc = (struct log_c *) log->context;
+	struct region_state *rs, *tmp_rs;
+	LIST_HEAD(mark);
+	LIST_HEAD(clear);
+
+	/*
+	 * It should never be a problem to temporarily have
+	 * the mark requests in limbo.  The only functions
+	 * that call cluster_flush are rh_update_states and
+	 * do_writes, and they are in the same thread as
+	 * those changing the region states
+	 */
+	spin_lock(&lc->state_lock);
+	list_splice_init(&lc->clear_waiting, &clear);
+	list_splice_init(&lc->mark_waiting, &mark);
+	spin_unlock(&lc->state_lock);
+
+	list_for_each_entry_safe(rs, tmp_rs, &clear, rs_list) {
+		/* don't really care if LRT_CLEAR_REGION fails */
+		consult_server(lc, rs->rs_region, LRT_CLEAR_REGION, NULL);
+		list_del_init(&rs->rs_list);
+		mempool_free(rs, region_state_pool);
+		clear_count++;
+	}
+
+	list_for_each_entry_safe(rs, tmp_rs, &mark, rs_list) {
+		while (1) {
+			r = consult_server(lc, rs->rs_region,
+					   LRT_MARK_REGION, NULL);
+			if (!r)
+				break;
+
+			if (r == -EBUSY) {
+				DMDEBUG("Delaying mark to region %Lu, due to recovery",
+					rs->rs_region);
+				set_current_state(TASK_INTERRUPTIBLE);
+				schedule_timeout(HZ/2);
+				continue;
+			}
 
-	/* FIXME:  flush all clear_region requests to server */
-	return (lc->log_dev_failed) ? -EIO : 0;
+			if (r == -EIO)
+				goto fail;
+
+			DMWARN("unable to get server (%u) to mark region (%Lu)",
+			       lc->server_id, rs->rs_region);
+			DMWARN("Reason :: %d", r);
+		}
+		mark_count++;
+	}
+
+	/* No flush work? */
+	if (!clear_count && !mark_count)
+		return 0;
+
+	spin_lock(&lc->state_lock);
+	list_splice_init(&mark, &lc->mark_logged);
+	spin_unlock(&lc->state_lock);
+
+	while ((r = consult_server(lc, 0, LRT_FLUSH, NULL))) {
+		if (r == -EBUSY) {
+			DMDEBUG("Delaying flush due to recovery");
+			set_current_state(TASK_INTERRUPTIBLE);
+			schedule_timeout(HZ/2);
+			continue;
+		}
+
+		if (r == -EIO)
+			break;
+	}
+
+fail:
+	if (r) {
+		DMERR("Log flush failure: %d%s", r,
+		      (r == -EIO) ? " -EIO" : "");
+		dm_table_event(lc->ti->table);
+		lc->log_dev_failed = 1;
+	}
+
+	return r;
 }
 
 static void cluster_mark_region(struct dirty_log *log, region_t region)
 {
-	int error = 0;
 	struct region_state *rs, *tmp_rs, *rs_new;
 	struct log_c *lc = (struct log_c *) log->context;
 
-	rs_new = mempool_alloc(region_state_pool, GFP_KERNEL);
+	spin_lock(&lc->state_lock);
 
-	memset(rs_new, 0, sizeof(struct region_state));
 
-	spin_lock(&region_state_lock);
-	list_for_each_entry_safe(rs, tmp_rs, &clear_region_list, rs_list){
-		if(lc == rs->rs_lc && region == rs->rs_region){
-			/*
-			DMDEBUG("Mark pre-empting clear (%Lu/%s)",
-				region, lc->uuid + (strlen(lc->uuid) - 8));
-			*/
+	/*
+	 * It is possible for the following in the mirror code:
+	 *  0) Mark is already logged for a region
+	 *  1) rh_dec, sets region state to RH_CLEAN (asynchronous)
+	 *  2) rh_update_states (DOESN'T FLUSH!!!, bug #235040)
+	 *  3) do_writes, trys to mark region
+	 *
+	 * The following shouldn't have to be handled b/c of the flush
+	 *  0) Region finishes recovery
+	 *  1) rh_update_states clears region (DOES FLUSH)
+	 *  2) do_writes, trys to mark region
+	 *
+	 * This can lead to this next case being valid.
+	 */
+	list_for_each_entry_safe(rs, tmp_rs, &lc->clear_waiting, rs_list) {
+		if (region == rs->rs_region) {
+			if (!rs->rs_mark_logged) {
+				DMERR("Moving region(%Lu/%s) from clear_waiting -> mark_waiting",
+				      region, lc->uuid + (strlen(lc->uuid) - 8));
+			}
 			list_del_init(&rs->rs_list);
-			list_add(&rs->rs_list, &marked_region_list);
-			clear_region_count--;
-			spin_unlock(&region_state_lock);
-			if (rs_new)
-				mempool_free(rs_new, region_state_pool);
-
-			return;
+			list_add(&rs->rs_list,
+				 (rs->rs_mark_logged) ?
+				 &lc->mark_logged : &lc->mark_waiting);
+			goto out;
 		}
 	}
+
 	/*
-	 * In the mirroring code, it is possible for a write
-	 * to complete and call rh_dec - putting the region on
-	 * the clear_region list.  However, before the actual
-	 * clear request is issued to the log (rh_update_states)
-	 * another mark happens.  So, we check for and remove
-	 * duplicates.
+	 * It is possible for the following in the mirror code:
+	 *  0) Mark is already logged for a region
+	 *  1) rh_update_states
+	 *  2) rh_dec, sets region state to RH_CLEAN (asynchronous)
+	 *  3) do_writes, trys to mark region
+	 *
+	 * This can lead to this next case being valid.
 	 */
-	list_for_each_entry(rs, &marked_region_list, rs_list){
-		if(lc == rs->rs_lc && region == rs->rs_region){
-#ifdef DEBUG
-			DMINFO("Double mark on region ("
-			       SECTOR_FORMAT ")", region);
-#endif
-			spin_unlock(&region_state_lock);
-			if (rs_new)
-				mempool_free(rs_new, region_state_pool);
-
-			return;
+	list_for_each_entry_safe(rs, tmp_rs, &lc->mark_logged, rs_list){
+		if (region == rs->rs_region) {
+			goto out;
 		}
 	}
 
-	if(!rs_new){
-		DMERR("Unable to allocate region_state for mark.");
-		BUG();
+	list_for_each_entry_safe(rs, tmp_rs, &lc->mark_waiting, rs_list){
+		if (region == rs->rs_region) {
+			DMERR("Mark already waiting (%Lu/%s)",
+			      region, lc->uuid + (strlen(lc->uuid) - 8));
+			BUG();
+		}
 	}
+	spin_unlock(&lc->state_lock);
 
-	rs_new->rs_lc = lc;
+	rs_new = mempool_alloc(region_state_pool, GFP_NOFS);
+	BUG_ON(!rs_new);
+	memset(rs_new, 0, sizeof(struct region_state));
+
+	spin_lock(&lc->state_lock);
+	rs_new->rs_mark_logged = 1;
 	rs_new->rs_region = region;
 	INIT_LIST_HEAD(&rs_new->rs_list);
-	list_add(&rs_new->rs_list, &marked_region_list);
-
-	spin_unlock(&region_state_lock);
-
-	if (!lc->log_dev_failed) {
-		while((error = consult_server(lc, region, LRT_MARK_REGION, NULL))){
-			if (error == -EBUSY) {
-				/* Remote recovering delay and try again */
-				DMDEBUG("Delaying mark to region %Lu, due to recovery",
-					region);
-				set_current_state(TASK_INTERRUPTIBLE);
-				schedule_timeout(HZ/2);
-				continue;
-			}
-
-			if (error == -EIO) {
-				lc->log_dev_failed = 1;
-				break;
-			}
-			DMWARN("unable to get server (%u) to mark region (%Lu)",
-			       lc->server_id, region);
-			DMWARN("Reason :: %d", error);
-		}
+	list_add(&rs_new->rs_list, &lc->mark_waiting);
+out:
+	spin_unlock(&lc->state_lock);
 
-		if (lc->log_dev_failed) {
-			dm_table_event(lc->ti->table);
-			/*
-			  DMERR("Write failed on mirror log device, %s",
-			  lc->log_dev->name);
-			  if (!atomic_read(&lc->suspended))
-			  wait_for_completion(&lc->failure_completion);
-			*/
-		}
-	}
 	return;
 }
 
@@ -1008,53 +1032,48 @@
 	struct log_c *lc = (struct log_c *) log->context;
 	struct region_state *rs, *tmp_rs, *rs_new;
 
-	rs_new = mempool_alloc(region_state_pool, GFP_ATOMIC);
+	spin_lock(&lc->state_lock);
 
-	memset(rs_new, 0, sizeof(struct region_state));
+	/* Should find match in this list, or no lists at all */
+	list_for_each_entry_safe(rs, tmp_rs, &lc->mark_logged, rs_list){
+		if(region == rs->rs_region){
+			list_del_init(&rs->rs_list);
+			list_add(&rs->rs_list, &lc->clear_waiting);
+			goto out;
+		}
+	}
 
-	spin_lock(&region_state_lock);
 
-	list_for_each_entry_safe(rs, tmp_rs, &clear_region_list, rs_list){
-		if(lc == rs->rs_lc && region == rs->rs_region){
-			DMINFO("%d) Double clear on region ("
-			      SECTOR_FORMAT ")", __LINE__, region);
-			spin_unlock(&region_state_lock);
-			if (rs_new)
-				mempool_free(rs_new, region_state_pool);
-			return;
+	list_for_each_entry_safe(rs, tmp_rs, &lc->mark_waiting, rs_list){
+		if(region == rs->rs_region){
+			DMERR("Clear pre-empting mark (%Lu/%s)",
+			       region, lc->uuid + (strlen(lc->uuid) - 8));
+			BUG();
 		}
 	}
 
-	list_for_each_entry_safe(rs, tmp_rs, &marked_region_list, rs_list){
-		if(lc == rs->rs_lc && region == rs->rs_region){
-			list_del_init(&rs->rs_list);
-			list_add(&rs->rs_list, &clear_region_list);
-			clear_region_count++;
-			spin_unlock(&region_state_lock);
-			if (rs_new)
-				mempool_free(rs_new, region_state_pool);
-			return;
+	list_for_each_entry_safe(rs, tmp_rs, &lc->clear_waiting, rs_list){
+		if(region == rs->rs_region){
+			DMERR("%d) Double clear on region ("
+			      SECTOR_FORMAT ")", __LINE__, region);
+			BUG();
 		}
 	}
-
-	/* We can get here because we my be doing resync_work, and therefore, **
+	/* We can get here because we may be doing resync_work, and therefore,**
 	** clearing without ever marking..................................... */
 
-	if(!rs_new){
-		DMERR("Unable to allocate region_state for clear.");
-		BUG();
-	}
+	/* Don't need to spin_unlock, because allocation is non-blocking */
+	rs_new = mempool_alloc(region_state_pool, GFP_ATOMIC);
+	BUG_ON(!rs_new);
+	memset(rs_new, 0, sizeof(struct region_state));
 
-	rs_new->rs_lc = lc;
 	rs_new->rs_region = region;
 	INIT_LIST_HEAD(&rs_new->rs_list);
-	list_add(&rs_new->rs_list, &clear_region_list);
-	clear_region_count++;
-	if(!(clear_region_count & 0x7F)){
-		DMINFO("clear_region_count :: %d", clear_region_count);
-	}
+	list_add(&rs_new->rs_list, &lc->clear_waiting);
+
+out:
+	spin_unlock(&lc->state_lock);
 
-	spin_unlock(&region_state_lock);
 	return;
 }
 
@@ -1122,27 +1141,6 @@
 
 	switch(status){
 	case STATUSTYPE_INFO:
-/*
-		spin_lock(&region_state_lock);
-		i = clear_region_count;
-		list_for_each_entry(rs, &marked_region_list, rs_list){
-			j++;
-		}
-		spin_unlock(&region_state_lock);
-
-		DMINFO("CLIENT OUTPUT::");
-		DMINFO("  My ID            : %u", my_id);
-		DMINFO("  Server ID        : %u", lc->server_id);
-
-		DMINFO("  In-sync          : %s", (atomic_read(&lc->in_sync)>0)?
-		       "YES" : "NO");
-		DMINFO("  Regions marked   : %d", j);
-		DMINFO("  Regions clearing : %d", i);
-
-		if(lc->server_id == my_id){
-			print_server_status(lc);
-		}
-*/
 		if(lc->sync != DEFAULTSYNC)
 			arg_count++;
 
@@ -1195,11 +1193,11 @@
 
 	atomic_set(&suspend_client, 1);
 
-	spin_lock(&log_list_lock);
+	down(&log_list_lock);
 	list_for_each_entry(lc, &log_list_head, log_list) {
 		atomic_set(&lc->in_sync, 0);
 	}
-	spin_unlock(&log_list_lock);
+	up(&log_list_lock);
 
 	/*
 	if (likely(!shutting_down))
@@ -1221,7 +1219,12 @@
 	global_nodeids = nodeids;
 	global_count = count;
 
-	kcl_get_node_by_nodeid(0, &node);
+	for (i = 0; kcl_get_node_by_nodeid(0, &node); i++) {
+		if (i > 10)
+			BUG();
+		else
+			DMERR("Bad call to kcl_get_node_by_nodeid");
+	}
 	my_id = node.node_id;
 
 	/* Wait for any outstanding starts to complete */
@@ -1233,7 +1236,7 @@
 	switch(type){
 	case SERVICE_NODE_LEAVE:
 	case SERVICE_NODE_FAILED:
-		spin_lock(&log_list_lock);
+		down(&log_list_lock);
 		list_for_each_entry(lc, &log_list_head, log_list){
 			for(i=0, server = 0xDEAD; i < count; i++){
 				if(lc->server_id == nodeids[i]){
@@ -1243,7 +1246,7 @@
 			/* ATTENTION -- need locking around this ? */
 			lc->server_id = server;
 		}
-		spin_unlock(&log_list_lock);
+		up(&log_list_lock);
 
 		break;
 	case SERVICE_NODE_JOIN:
@@ -1279,10 +1282,8 @@
 
 	down(&cmirror_register_lock);
 
-	if (mirror_set_count++) {
-		up(&cmirror_register_lock);
+	if (mirror_set_count++)
 		goto out;
-	}
 
 	r = kcl_register_service("clustered_log", 13, SERVICE_LEVEL_GDLM, &clog_ops,
 				 1, NULL, &local_id);
@@ -1383,12 +1384,7 @@
         DMINFO("dm-cmirror %s (built %s %s) installed",
                CMIRROR_RELEASE_NAME, __DATE__, __TIME__);
 
-	INIT_LIST_HEAD(&clear_region_list);
-	INIT_LIST_HEAD(&marked_region_list);
-
-	spin_lock_init(&region_state_lock);
-	spin_lock_init(&log_list_lock);
-	region_state_pool = mempool_create(20, region_state_alloc,
+	region_state_pool = mempool_create(500, region_state_alloc,
 					   region_state_free, NULL);
 	if(!region_state_pool){
 		DMWARN("couldn't create region state pool");
@@ -1424,6 +1420,8 @@
 	}
 	dm_unregister_dirty_log_type(&_clustered_core_type);
 	dm_unregister_dirty_log_type(&_clustered_disk_type);
+        DMINFO("dm-cmirror %s (built %s %s) removed",
+               CMIRROR_RELEASE_NAME, __DATE__, __TIME__);
 }
 
 module_init(cluster_dirty_log_init);
--- cluster/cmirror-kernel/src/Attic/dm-cmirror-common.h	2007/02/21 17:14:44	1.1.2.12
+++ cluster/cmirror-kernel/src/Attic/dm-cmirror-common.h	2007/04/03 18:23:01	1.1.2.12.2.1
@@ -97,7 +97,7 @@
 	unsigned bitset_uint32_count;
 	uint32_t *clean_bits;
 	uint32_t *sync_bits;
-	uint32_t *recovering_bits;	/* FIXME: this seems excessive */
+	uint64_t recovering_region;
 
 	int sync_pass;          /* number of passes attempting to resync */
 	int sync_search;
@@ -134,7 +134,12 @@
 	atomic_t in_sync;  /* like sync_count, except all or nothing */
 
 	struct list_head log_list;
-	struct list_head region_users;
+	struct list_head region_users;  /* Used by Server */
+
+	spinlock_t state_lock;
+	struct list_head clear_waiting;
+	struct list_head mark_waiting;
+	struct list_head mark_logged;
 
 	uint32_t server_id;
 	struct socket *client_sock;
--- cluster/cmirror-kernel/src/Attic/dm-cmirror-server.c	2007/03/22 22:34:44	1.1.2.26.2.1
+++ cluster/cmirror-kernel/src/Attic/dm-cmirror-server.c	2007/04/03 18:23:01	1.1.2.26.2.2
@@ -47,7 +47,7 @@
 static atomic_t _do_requests;
 
 static int debug_disk_write = 0;
-extern spinlock_t log_list_lock;
+extern struct semaphore log_list_lock;
 extern struct list_head log_list_head;
 
 static void *region_user_alloc(int gfp_mask, void *pool_data){
@@ -225,6 +225,11 @@
 
 static int _core_get_resync_work(struct log_c *lc, region_t *region)
 {
+	if (lc->recovering_region != (uint64_t)-1) {
+		DMDEBUG("Someone is already recovering (%Lu)", lc->recovering_region);
+		return 0;
+	}
+
 	if (lc->sync_search >= lc->region_count) {
 		/*
 		 * FIXME: pvmove is not supported yet, but when it is,
@@ -237,18 +242,16 @@
 			return 0;
 		}
 	}
-	do {
-		*region = ext2_find_next_zero_bit((unsigned long *) lc->sync_bits,
-						  lc->region_count,
-						  lc->sync_search);
-		lc->sync_search = *region + 1;
-
-		if (*region >= lc->region_count)
-			return 0;
+	*region = ext2_find_next_zero_bit((unsigned long *) lc->sync_bits,
+					  lc->region_count,
+					  lc->sync_search);
+	lc->sync_search = *region + 1;
 
-	} while (log_test_bit(lc->recovering_bits, *region));
+	if (*region >= lc->region_count)
+		return 0;
 
-	log_set_bit(lc, lc->recovering_bits, *region);
+	lc->recovering_region = *region;
+	DMDEBUG("Assigning recovery work: %Lu", *region);
 	return 1;
 }
 
@@ -371,7 +374,7 @@
 			bad_count++;
 			log_clear_bit(lc, lc->sync_bits, ru->ru_region);
 			if (ru->ru_rw == RU_RECOVER) {
-				log_clear_bit(lc, lc->recovering_bits, ru->ru_region);
+				lc->recovering_region = (uint64_t)-1;
 			}
 			list_del(&ru->ru_list);
 			mempool_free(ru, region_user_pool);
@@ -506,10 +509,9 @@
 
 static int server_mark_region(struct log_c *lc, struct log_request *lr, uint32_t who)
 {
-	int r = 0;
 	struct region_user *ru, *new;
 
-	new = mempool_alloc(region_user_pool, GFP_KERNEL);
+	new = mempool_alloc(region_user_pool, GFP_NOFS);
 	if(!new){
 		return -ENOMEM;
 	}
@@ -519,21 +521,13 @@
     
 	if (!(ru = find_ru_by_region(lc, lr->u.lr_region))) {
 		log_clear_bit(lc, lc->clean_bits, lr->u.lr_region);
-		r = write_bits(lc);
-
 		list_add(&new->ru_list, &lc->region_users);
-		if (!r) {
-			lc->touched = 0;
-			lc->log_dev_failed = 0;
-		} else {
-			lc->log_dev_failed = 1;
-		}
 	} else if (ru->ru_rw == RU_RECOVER) {
-		DMINFO("Attempt to mark a region " SECTOR_FORMAT 
+		DMDEBUG("Attempt to mark a region " SECTOR_FORMAT
 		      "/%s which is being recovered.",
 		       lr->u.lr_region, lc->uuid + (strlen(lc->uuid) - 8));
-		DMINFO("Current recoverer: %u", ru->ru_nodeid);
-		DMINFO("Mark requester   : %u", who);
+		DMDEBUG("Current recoverer: %u", ru->ru_nodeid);
+		DMDEBUG("Mark requester   : %u", who);
 
 		mempool_free(new, region_user_pool);
 		return -EBUSY;
@@ -547,7 +541,7 @@
 		mempool_free(new, region_user_pool);
 	}
 
-	return (lc->log_dev_failed) ? -EIO : 0;
+	return 0;
 }
 
 
@@ -567,28 +561,34 @@
 
 	if(!find_ru_by_region(lc, lr->u.lr_region)){
 		log_set_bit(lc, lc->clean_bits, lr->u.lr_region);
-		write_bits(lc);
-		/*
-		if (write_bits(lc))
-			DMERR("Write bits failed on mirror log device, %s",
-			      lc->log_dev->name);
-		*/
 	}
 	return 0;
 }
 
 
+static int server_flush(struct log_c *lc)
+{
+	int r = 0;
+
+	r = write_bits(lc);
+	if (!r) {
+		lc->touched = 0;
+		lc->log_dev_failed = 0;
+	} else {
+		lc->log_dev_failed = 1;
+	}
+
+	return (lc->log_dev_failed) ? -EIO : 0;
+}
+
+
 static int server_get_resync_work(struct log_c *lc, struct log_request *lr, uint32_t who)
 {
 	struct region_user *new;
 
-/* We now have the ability to use remote_recovering
-	if (my_id != who)
-		return 0;
-*/
-
-	new = mempool_alloc(region_user_pool, GFP_KERNEL);
+	new = mempool_alloc(region_user_pool, GFP_NOFS);
 	if(!new){
+		lr->u.lr_int_rtn = 0;
 		return -ENOMEM;
 	}
 	
@@ -610,9 +610,15 @@
 		return -EINVAL;
 	}
 
-	log_clear_bit(lc, lc->recovering_bits, lr->u.lr_region);
-
 	if (success) {
+		if (lr->u.lr_region != lc->recovering_region) {
+			DMERR("Told to clear recovery on wrong region %Lu/%Lu",
+			      lr->u.lr_region, lc->recovering_region);
+			return -EINVAL;
+		}
+
+		lc->recovering_region = (uint64_t)-1;
+
 		/* We could receive multiple identical request due to network failure */
 		if(!log_test_bit(lc->sync_bits, lr->u.lr_region)) {
 			log_set_bit(lc, lc->sync_bits, lr->u.lr_region);
@@ -650,7 +656,7 @@
 static struct log_c *get_log_context(char *uuid, int uuid_ref){
 	struct log_c *lc, *r = NULL;
 
-	spin_lock(&log_list_lock);
+	down(&log_list_lock);
 	list_for_each_entry(lc, &log_list_head, log_list){
 		if (!strncmp(lc->uuid, uuid, MAX_NAME_LEN) &&
 		    (uuid_ref == lc->uuid_ref)) {
@@ -660,7 +666,7 @@
 				r = lc;
 		}
 	}
-	spin_unlock(&log_list_lock);
+	up(&log_list_lock);
 
 	return r;
 }
@@ -838,6 +844,7 @@
  * Returns: 0 on success, -1 on error
  */
 static int process_log_request(struct socket *sock){
+	static struct log_request *lr = NULL;
 	int error;
 	uint32_t nodeid;
 	struct msghdr msg;
@@ -845,9 +852,13 @@
 	struct sockaddr_in saddr_in;
 	mm_segment_t fs;
 	struct log_c *lc;
-	struct log_request lr; /* ATTENTION -- could be too much on the stack */
 
-	memset(&lr, 0, sizeof(struct log_request));
+	if (unlikely(!lr))
+		lr = kmalloc(sizeof(*lr), GFP_KERNEL);
+	if (!lr)
+		return -1;
+
+	memset(lr, 0, sizeof(struct log_request));
 	memset(&saddr_in, 0, sizeof(saddr_in));
 		
 	msg.msg_control = NULL;
@@ -858,7 +869,7 @@
 	msg.msg_name = &saddr_in;
 	msg.msg_namelen = sizeof(saddr_in);
 	iov.iov_len = sizeof(struct log_request);
-	iov.iov_base = &lr;
+	iov.iov_base = lr;
 		
 	fs = get_fs();
 	set_fs(get_ds());
@@ -871,14 +882,14 @@
 		if(error < sizeof(struct log_request)){
 			DMERR("Cluster mirror log server received incomplete message.");
 		}
-		lc = get_log_context(lr.lr_uuid, lr.lr_uuid_ref);
+		lc = get_log_context(lr->lr_uuid, lr->lr_uuid_ref);
 
-		if(lr.lr_type == LRT_ELECTION ||
-		   lr.lr_type == LRT_SELECTION ||
-		   lr.lr_type == LRT_MASTER_ASSIGN ||
-		   lr.lr_type == LRT_MASTER_LEAVING){
+		if(lr->lr_type == LRT_ELECTION ||
+		   lr->lr_type == LRT_SELECTION ||
+		   lr->lr_type == LRT_MASTER_ASSIGN ||
+		   lr->lr_type == LRT_MASTER_LEAVING){
 			uint32_t old = (lc)?lc->server_id: 0xDEAD;
-			if(process_election(&lr, lc, &saddr_in)){
+			if(process_election(lr, lc, &saddr_in)){
 				DMERR("Election processing failed.");
 				return -1;
 			}
@@ -896,12 +907,12 @@
 		}
 
 		if(!lc){
-			lr.u.lr_int_rtn = -ENXIO;
+			lr->u.lr_int_rtn = -ENXIO;
 			goto reply;
 		}
 
 		if (lc->server_id != my_id) {
-			lr.u.lr_int_rtn = -ENXIO;
+			lr->u.lr_int_rtn = -ENXIO;
 			goto reply;
 		}
 
@@ -911,23 +922,23 @@
 			DMDEBUG("Getting request while server (%u) is suspended:", my_id);
 			DMDEBUG(" - Requester :: %u", nodeid);
 			DMDEBUG(" - log uuid  :: %s", lc->uuid + (strlen(lc->uuid) - 8));
-			DMDEBUG(" - req type  :: %s", RQ_STRING(lr.lr_type));
+			DMDEBUG(" - req type  :: %s", RQ_STRING(lr->lr_type));
 			*/
 			if (my_id != nodeid) {
-				lr.u.lr_int_rtn = -ENXIO;
+				lr->u.lr_int_rtn = -ENXIO;
 				goto reply;
 			}
 		}			
 
-		switch(lr.lr_type){
+		switch(lr->lr_type){
 		case LRT_IS_CLEAN:
-			error = server_is_clean(lc, &lr);
+			error = server_is_clean(lc, lr);
 			break;
 		case LRT_IS_REMOTE_RECOVERING:
-			error = server_is_remote_recovering(lc, &lr);
+			error = server_is_remote_recovering(lc, lr);
 			break;
 		case LRT_IN_SYNC:
-			error = server_in_sync(lc, &lr);
+			error = server_in_sync(lc, lr);
 			break;
 		case LRT_MARK_REGION:
 			if(!(nodeid = 
@@ -935,8 +946,8 @@
 				error = -ENXIO;
 				break;
 			}
-			error = server_mark_region(lc, &lr, nodeid);
-			lr.u.lr_int_rtn = 0;
+			error = server_mark_region(lc, lr, nodeid);
+			lr->u.lr_int_rtn = 0;
 			break;
 		case LRT_CLEAR_REGION:
 			if(!(nodeid = 
@@ -944,7 +955,10 @@
 				error = -ENXIO;
 				break;
 			}
-			error = server_clear_region(lc, &lr, nodeid);
+			error = server_clear_region(lc, lr, nodeid);
+			break;
+		case LRT_FLUSH:
+			error = server_flush(lc);
 			break;
 		case LRT_GET_RESYNC_WORK:
 			if(!(nodeid = 
@@ -952,14 +966,14 @@
 				error = -ENXIO;
 				break;
 			}
-			error = server_get_resync_work(lc, &lr, nodeid);
+			error = server_get_resync_work(lc, lr, nodeid);
 			break;
 		case LRT_COMPLETE_RESYNC_WORK:
-			error = server_complete_resync_work(lc, &lr, lr.u.lr_int_rtn);
-			lr.u.lr_int_rtn = 0;
+			error = server_complete_resync_work(lc, lr, lr->u.lr_int_rtn);
+			lr->u.lr_int_rtn = 0;
 			break;
 		case LRT_GET_SYNC_COUNT:
-			error = server_get_sync_count(lc, &lr);
+			error = server_get_sync_count(lc, lr);
 			break;
 		default:
 			DMWARN("unknown request type received");
@@ -971,15 +985,15 @@
 		if(error){
 /*
 			DMWARN("Error (%d) while processing request (%s)",
-			       error, RQ_STRING(lr.lr_type));
+			       error, RQ_STRING(lr->lr_type));
 */
-			lr.u.lr_int_rtn = error;
+			lr->u.lr_int_rtn = error;
 		}
 	reply:
     
 		/* Why do we need to reset this? */
 		iov.iov_len = sizeof(struct log_request);
-		iov.iov_base = &lr;
+		iov.iov_base = lr;
 		msg.msg_name = &saddr_in;
 		msg.msg_namelen = sizeof(saddr_in);
 
@@ -991,7 +1005,7 @@
 		set_fs(fs);
 		if(error < 0){
 			DMWARN("unable to sendmsg to client (type = %s, error = %d)",
-			       RQ_STRING(lr.lr_type), error);
+			       RQ_STRING(lr->lr_type), error);
 			return error;
 		}
 	} else if(error == -EAGAIN || error == -ETIMEDOUT){
@@ -1052,7 +1066,7 @@
 			if (atomic_read(&restart_event_type) == SERVICE_NODE_FAILED)
 				DMINFO("A cluster mirror log member has failed.");
 			
-			spin_lock(&log_list_lock);
+			down(&log_list_lock);
 			list_for_each_entry(lc, &log_list_head, log_list){
 				if(lc->server_id == my_id){
 					if (atomic_read(&lc->suspended)) {
@@ -1062,7 +1076,7 @@
 					}
 				}
 			}
-			spin_unlock(&log_list_lock);
+			up(&log_list_lock);
 
 			break;
 		default:
@@ -1150,7 +1164,7 @@
 int start_server(void /* log_devices ? */){
 	int error;
 
-	region_user_pool = mempool_create(100, region_user_alloc,
+	region_user_pool = mempool_create(1000, region_user_alloc,
 					  region_user_free, NULL);
 	if(!region_user_pool){
 		DMWARN("unable to allocate region user pool for server");
--- cluster/cmirror-kernel/src/Attic/dm-cmirror-xfr.h	2007/03/22 22:34:44	1.1.2.2.2.1
+++ cluster/cmirror-kernel/src/Attic/dm-cmirror-xfr.h	2007/04/03 18:23:01	1.1.2.2.2.2
@@ -14,14 +14,15 @@
 #define LRT_IN_SYNC             	3
 #define LRT_MARK_REGION         	4
 #define LRT_CLEAR_REGION        	5
-#define LRT_GET_RESYNC_WORK     	6
-#define LRT_COMPLETE_RESYNC_WORK        7
-#define LRT_GET_SYNC_COUNT      	8
-
-#define LRT_ELECTION			9
-#define LRT_SELECTION			10
-#define LRT_MASTER_ASSIGN		11
-#define LRT_MASTER_LEAVING		12
+#define LRT_FLUSH                       6
+#define LRT_GET_RESYNC_WORK     	7
+#define LRT_COMPLETE_RESYNC_WORK        8
+#define LRT_GET_SYNC_COUNT      	9
+
+#define LRT_ELECTION			10
+#define LRT_SELECTION			11
+#define LRT_MASTER_ASSIGN		12
+#define LRT_MASTER_LEAVING		13
 
 #define CLUSTER_LOG_PORT 51005
 
@@ -29,6 +30,7 @@
 	((x) == LRT_IS_CLEAN) ? "LRT_IS_CLEAN": \
 	((x) == LRT_IN_SYNC) ? "LRT_IN_SYNC": \
 	((x) == LRT_MARK_REGION) ? "LRT_MARK_REGION": \
+	((x) == LRT_FLUSH) ? "LRT_FLUSH": \
 	((x) == LRT_GET_RESYNC_WORK) ? "LRT_GET_RESYNC_WORK": \
 	((x) == LRT_GET_SYNC_COUNT) ? "LRT_GET_SYNC_COUNT": \
 	((x) == LRT_CLEAR_REGION) ? "LRT_CLEAR_REGION": \



^ permalink raw reply	[flat|nested] 40+ messages in thread

* [Cluster-devel] cluster/cmirror-kernel/src dm-cmirror-client.c ...
@ 2007-04-03 18:21 jbrassow
  0 siblings, 0 replies; 40+ messages in thread
From: jbrassow @ 2007-04-03 18:21 UTC (permalink / raw)
  To: cluster-devel.redhat.com

CVSROOT:	/cvs/cluster
Module name:	cluster
Branch: 	RHEL4
Changes by:	jbrassow at sourceware.org	2007-04-03 19:21:10

Modified files:
	cmirror-kernel/src: dm-cmirror-client.c dm-cmirror-common.h 
	                    dm-cmirror-server.c dm-cmirror-xfr.h 

Log message:
	Bug 234539: multiple streams of I/O can cause system to lock up
	
	This bug provoked an audit of the communications exchange, locking,
	and memory allocations/stack usage.
	
	Communication fixes include:
	1) Added sequence numbers to ensure that replies from the server
	correctly correspond to client requests.  It was found that if
	a client timed out waiting for a server to respond, it would send
	the request again.  However, the server may have simply been too
	busy to respond in a timely fashion.  It ends up responding to
	both the original request and the resent request - causing the
	client and server to become out-of-sync WRT log requests.
	
	Locking fixes include:
	1) A semaphore was being "up"ed twice in some cases, rendering
	the lock impotent.
	
	2) A spin lock controlling region status lists was being held
	across blocking operations - sometimes causing deadlocks.  The
	spin lock was changed to a per-log lock, and some logging
	operations were restructured to better suit the way locking
	needed to be done.  A side-effect of this fix is a 20%
	improvement in write operations.
	
	3) The log list protection lock needed to change from a spin lock
	to a semaphore to allow blocking operations.
	
	Memory allocation fixes include:
	1) Wrong flags to kmalloc could cause deadlock.  Use NOFS instead
	of KERNEL.
	
	2) Mempools needed more reserves for low memory conditions.
	
	3) Server now allocates a communication structure instead of having
	it on the stack.  This reduces the likelihood of stack corruption.

Patches:
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cmirror-kernel/src/dm-cmirror-client.c.diff?cvsroot=cluster&only_with_tag=RHEL4&r1=1.1.2.42&r2=1.1.2.43
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cmirror-kernel/src/dm-cmirror-common.h.diff?cvsroot=cluster&only_with_tag=RHEL4&r1=1.1.2.12&r2=1.1.2.13
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cmirror-kernel/src/dm-cmirror-server.c.diff?cvsroot=cluster&only_with_tag=RHEL4&r1=1.1.2.27&r2=1.1.2.28
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cmirror-kernel/src/dm-cmirror-xfr.h.diff?cvsroot=cluster&only_with_tag=RHEL4&r1=1.1.2.3&r2=1.1.2.4

--- cluster/cmirror-kernel/src/Attic/dm-cmirror-client.c	2007/03/22 22:21:59	1.1.2.42
+++ cluster/cmirror-kernel/src/Attic/dm-cmirror-client.c	2007/04/03 18:21:10	1.1.2.43
@@ -28,20 +28,16 @@
 #include "dm-cmirror-server.h"
 #include "dm-cmirror-cman.h"
 
-spinlock_t log_list_lock;
+DECLARE_MUTEX(log_list_lock);
 LIST_HEAD(log_list_head);
 
 struct region_state {
-	struct log_c *rs_lc;
+	int rs_mark_logged;
 	region_t rs_region;
 	struct list_head rs_list;
 };
 
 static mempool_t *region_state_pool = NULL;
-static spinlock_t region_state_lock;
-static int clear_region_count=0;
-static struct list_head clear_region_list;
-static struct list_head marked_region_list;
 
 static int shutting_down=0;
 static atomic_t suspend_client;
@@ -145,15 +141,7 @@
 	memset(lc->sync_bits, (sync == NOSYNC) ? -1 : 0, bitset_size);
 	lc->sync_count = (sync == NOSYNC) ? region_count : 0;
 
-	lc->recovering_bits = vmalloc(bitset_size);
-	if (!lc->recovering_bits) {
-		DMWARN("couldn't allocate sync bitset");
-		vfree(lc->sync_bits);
-		vfree(lc->clean_bits);
-		kfree(lc);
-		return -ENOMEM;
-	}
-	memset(lc->recovering_bits, 0, bitset_size);
+	lc->recovering_region = (uint64_t)-1;
 	lc->sync_search = 0;
 	log->context = lc;
 	return 0;
@@ -164,7 +152,6 @@
 	struct log_c *lc = (struct log_c *) log->context;
 	vfree(lc->clean_bits);
 	vfree(lc->sync_bits);
-	vfree(lc->recovering_bits);
 	kfree(lc);
 }
 
@@ -321,8 +308,9 @@
 
 	request_count++;
 
-	lr = kmalloc(sizeof(struct log_request), GFP_KERNEL);
+	lr = kmalloc(sizeof(struct log_request), GFP_NOFS);
 	if(!lr){
+		BUG();
 		error = -ENOMEM;
 		*retry = 1;
 		goto fail;
@@ -404,15 +392,15 @@
 	}
     
 	if (seq != lr->lr_seq) {
-		DMERR("Message sequence number mismatch: %d/%d",
+		DMDEBUG("Message sequence number mismatch: %d/%d",
 		      seq, lr->lr_seq);
 		if (seq > lr->lr_seq) {
-			DMERR(" Skipping.  Listening again for response to %s",
+			DMDEBUG(" Skipping.  Listening again for response to %s",
 			      RQ_STRING(type));
 			memset(lr, 0, sizeof(struct log_request));
 			goto rerecv;
 		}
-		DMERR(" Must try to resend request, %s", RQ_STRING(type));
+		DMERR(" Seq# mismatch: Must try to resend request, %s", RQ_STRING(type));
 		error = -EBADE;
 		*retry = 1;
 		seq++;
@@ -509,91 +497,43 @@
 			new_server = 1;
 		}
 
-		spin_lock(&region_state_lock);
+		spin_lock(&lc->state_lock);
 		if(new_server && 
-		   (!list_empty(&clear_region_list) ||
-		    !list_empty(&marked_region_list))){
+		   !list_empty(&lc->mark_logged)){
 			int i=0;
-			struct region_state *tmp_rs;
+			LIST_HEAD(mark);
 
 			DMINFO("Clean-up required due to new server");
-			DMINFO(" - Wiping clear region list");
-			list_for_each_entry_safe(rs, tmp_rs,
-						 &clear_region_list, rs_list){
-				/* Remove only those associated with referenced log */
-				if (rs->rs_lc != lc)
-					continue;
-				i++;
-				list_del_init(&rs->rs_list);
-				mempool_free(rs, region_state_pool);
-			}
-			clear_region_count -= i;
-			DMINFO(" - %d clear region requests wiped", i);
-			i=0;
 			DMINFO(" - Resending all mark region requests");
-			list_for_each_entry(rs, &marked_region_list, rs_list){
-				/* Resend only those associated with referenced log */
-				if (rs->rs_lc != lc)
-					continue;
+			list_splice_init(&lc->mark_logged, &mark);
+
+			spin_unlock(&lc->state_lock);
+
+			list_for_each_entry(rs, &mark, rs_list){
 				do {
 					retry = 0;
-					i++;
-					rtn = _consult_server(rs->rs_lc, rs->rs_region,
+					rtn = _consult_server(lc, rs->rs_region,
 							      LRT_MARK_REGION, NULL, &retry);
 					if (lc->server_id == 0xDEAD) {
-						spin_unlock(&region_state_lock);
 						goto election;
 					}
 				} while(retry);
+				i++;
 			}
+
+			spin_lock(&lc->state_lock);
+			list_splice_init(&mark, &lc->mark_logged);
+
 			DMINFO(" - %d mark region requests resent", i);
 			DMINFO("Clean-up complete");
-			if(type == LRT_MARK_REGION){
-				/* we just handled all marks */
-				DMWARN("Mark request ignored.\n");
-				spin_unlock(&region_state_lock);
-				goto out;
-			} else {
-				DMINFO("Continuing request type, %d (%s)", type,
-				       RQ_STRING(type));
-			}
+			DMINFO("Continuing request type, %d (%s)", type,
+			       RQ_STRING(type));
 			new_server = 0;
 		}
-
-		rs = NULL;
-
-		if(!list_empty(&clear_region_list)){
-			rs = list_entry(clear_region_list.next,
-					struct region_state, rs_list);
-			list_del_init(&rs->rs_list);
-			clear_region_count--;
-		}
-
-		spin_unlock(&region_state_lock);
-		
-		/* ATTENTION -- it may be possible to remove a clear region **
-		** request from the list.  Then, have a mark region happen  **
-		** while we are here.  If the clear region request fails, it**
-		** would be re-added - perhaps prematurely clearing the bit */
+		spin_unlock(&lc->state_lock);
 		
-		if(rs && !rs->rs_lc->log_dev_failed){
-			_consult_server(rs->rs_lc, rs->rs_region,
-					LRT_CLEAR_REGION, NULL, &retry);
-
-			if(retry){
-				spin_lock(&region_state_lock);
-				list_add(&rs->rs_list, &clear_region_list);
-				clear_region_count++;
-				spin_unlock(&region_state_lock);
-
-			} else {
-				mempool_free(rs, region_state_pool);
-			}
-		}
 		retry = 0;
-		
 		rtn = _consult_server(lc, region, type, result, &retry);
-		schedule();
 	} while(retry);
 out:
 	up(&consult_server_lock);
@@ -640,7 +580,7 @@
 	atomic_set(&lc->in_sync, -1);
 	lc->uuid_ref = 1;
 
-	spin_lock(&log_list_lock);
+	down(&log_list_lock);
 	list_for_each_entry(tmp_lc, &log_list_head, log_list){
 		if(!strncmp(tmp_lc->uuid, lc->uuid, MAX_NAME_LEN)){
 			lc->uuid_ref = (lc->uuid_ref > tmp_lc->uuid_ref) ?
@@ -649,12 +589,16 @@
 	}
 
 	list_add(&lc->log_list, &log_list_head);
-	spin_unlock(&log_list_lock);
+	up(&log_list_lock);
 	DMDEBUG("Creating %s (%d)",
 	       lc->uuid + (strlen(lc->uuid) - 8),
 	       lc->uuid_ref);
 
 	INIT_LIST_HEAD(&lc->region_users);
+	INIT_LIST_HEAD(&lc->clear_waiting);
+	INIT_LIST_HEAD(&lc->mark_waiting);
+	INIT_LIST_HEAD(&lc->mark_logged);
+	spin_lock_init(&lc->state_lock);
 
 	lc->server_id = 0xDEAD;
 
@@ -761,31 +705,44 @@
 	       lc->uuid + (strlen(lc->uuid) - 8),
 	       lc->uuid_ref);
 
-	if (!list_empty(&clear_region_list))
-		DMINFO("Leaving while clear region requests remain.");
-
-	spin_lock(&log_list_lock);
+	down(&log_list_lock);
 	list_del_init(&lc->log_list);
-	spin_unlock(&log_list_lock);
+	up(&log_list_lock);
 
 	if ((lc->server_id == my_id) && !atomic_read(&lc->suspended))
 		consult_server(lc, 0, LRT_MASTER_LEAVING, NULL);
 
 	sock_release(lc->client_sock);
 
-	spin_lock(&region_state_lock);
+	spin_lock(&lc->state_lock);
 
-	list_for_each_entry_safe(rs, tmp_rs, &clear_region_list, rs_list) {
-		if (lc == rs->rs_lc)
+	if (!list_empty(&lc->clear_waiting)) {
+		DMINFO("Clear requests remain at cluster log deactivation");
+		list_for_each_entry_safe(rs, tmp_rs, &lc->clear_waiting, rs_list) {
 			list_del_init(&rs->rs_list);
+			DMINFO(" - Ignoring clear request: %Lu", rs->rs_region);
+			mempool_free(rs, region_state_pool);
+		}
 	}
 
-	list_for_each_entry_safe(rs, tmp_rs, &marked_region_list, rs_list) {
-		if (lc == rs->rs_lc)
-			list_del_init(&rs->rs_list);
+	if (!list_empty(&lc->mark_waiting)) {
+		DMERR("Pending mark requests remain at cluster_dtr");
+		BUG();
+	}
+
+	if (!list_empty(&lc->mark_logged)) {
+		DMERR("Mark requests remain at cluster log deactivation");
+		/*
+		 * Should I BUG() this?
+		 * No.  In the worst case, they will get cleaned up later
+		 */
+	}
+	list_for_each_entry_safe(rs, tmp_rs, &lc->mark_logged, rs_list) {
+		list_del_init(&rs->rs_list);
+		mempool_free(rs, region_state_pool);
 	}
 
-	spin_unlock(&region_state_lock);
+	spin_unlock(&lc->state_lock);
 
 	if (lc->log_dev)
 		disk_dtr(log);
@@ -803,19 +760,27 @@
 
 static int cluster_postsuspend(struct dirty_log *log)
 {
+	struct region_state *rs, *tmp_rs;
 	struct log_c *lc = (struct log_c *) log->context;
 
-	while (1) {
-		spin_lock(&region_state_lock);
-		if (list_empty(&clear_region_list)) {
-			spin_unlock(&region_state_lock);
-			break;
-		}
-		spin_unlock(&region_state_lock);
+	spin_lock(&lc->state_lock);
+	if (!list_empty(&lc->mark_waiting)) {
+		DMERR("Mark requests remain at postsuspend!");
+		BUG();
+	}
 
-		/* Just an unnessesary call to clear out regions */
-		consult_server(lc, 0, LRT_IN_SYNC, NULL);
+	if (!list_empty(&lc->clear_waiting)) {
+		DMERR("Clear requests remain at postsuspend!");
+
+		list_for_each_entry_safe(rs, tmp_rs, &lc->clear_waiting, rs_list) {
+			list_del_init(&rs->rs_list);
+			DMERR(" - Ignoring clear request: %Lu", rs->rs_region);
+			mempool_free(rs, region_state_pool);
+		}
 	}
+
+	spin_unlock(&lc->state_lock);
+
 	atomic_set(&lc->suspended, 1);
 	if(lc->server_id == my_id) {
 		while (1) {
@@ -903,103 +868,162 @@
 
 static int cluster_flush(struct dirty_log *log)
 {
+	int r = 0;
+	int clear_count = 0;
+	int mark_count = 0;
 	struct log_c *lc = (struct log_c *) log->context;
+	struct region_state *rs, *tmp_rs;
+	LIST_HEAD(mark);
+	LIST_HEAD(clear);
+
+	/*
+	 * It should never be a problem to temporarily have
+	 * the mark requests in limbo.  The only functions
+	 * that call cluster_flush are rh_update_states and
+	 * do_writes, and they are in the same thread as
+	 * those changing the region states
+	 */
+	spin_lock(&lc->state_lock);
+	list_splice_init(&lc->clear_waiting, &clear);
+	list_splice_init(&lc->mark_waiting, &mark);
+	spin_unlock(&lc->state_lock);
+
+	list_for_each_entry_safe(rs, tmp_rs, &clear, rs_list) {
+		/* don't really care if LRT_CLEAR_REGION fails */
+		consult_server(lc, rs->rs_region, LRT_CLEAR_REGION, NULL);
+		list_del_init(&rs->rs_list);
+		mempool_free(rs, region_state_pool);
+		clear_count++;
+	}
+
+	list_for_each_entry_safe(rs, tmp_rs, &mark, rs_list) {
+		while (1) {
+			r = consult_server(lc, rs->rs_region,
+					   LRT_MARK_REGION, NULL);
+			if (!r)
+				break;
+
+			if (r == -EBUSY) {
+				DMDEBUG("Delaying mark to region %Lu, due to recovery",
+					rs->rs_region);
+				set_current_state(TASK_INTERRUPTIBLE);
+				schedule_timeout(HZ/2);
+				continue;
+			}
 
-	/* FIXME:  flush all clear_region requests to server */
-	return (lc->log_dev_failed) ? -EIO : 0;
+			if (r == -EIO)
+				goto fail;
+
+			DMWARN("unable to get server (%u) to mark region (%Lu)",
+			       lc->server_id, rs->rs_region);
+			DMWARN("Reason :: %d", r);
+		}
+		mark_count++;
+	}
+
+	/* No flush work? */
+	if (!clear_count && !mark_count)
+		return 0;
+
+	spin_lock(&lc->state_lock);
+	list_splice_init(&mark, &lc->mark_logged);
+	spin_unlock(&lc->state_lock);
+
+	while ((r = consult_server(lc, 0, LRT_FLUSH, NULL))) {
+		if (r == -EBUSY) {
+			DMDEBUG("Delaying flush due to recovery");
+			set_current_state(TASK_INTERRUPTIBLE);
+			schedule_timeout(HZ/2);
+			continue;
+		}
+
+		if (r == -EIO)
+			break;
+	}
+
+fail:
+	if (r) {
+		DMERR("Log flush failure: %d%s", r,
+		      (r == -EIO) ? " -EIO" : "");
+		dm_table_event(lc->ti->table);
+		lc->log_dev_failed = 1;
+	}
+
+	return r;
 }
 
 static void cluster_mark_region(struct dirty_log *log, region_t region)
 {
-	int error = 0;
 	struct region_state *rs, *tmp_rs, *rs_new;
 	struct log_c *lc = (struct log_c *) log->context;
 
-	rs_new = mempool_alloc(region_state_pool, GFP_KERNEL);
+	spin_lock(&lc->state_lock);
 
-	memset(rs_new, 0, sizeof(struct region_state));
 
-	spin_lock(&region_state_lock);
-	list_for_each_entry_safe(rs, tmp_rs, &clear_region_list, rs_list){
-		if(lc == rs->rs_lc && region == rs->rs_region){
-			/*
-			DMDEBUG("Mark pre-empting clear (%Lu/%s)",
-				region, lc->uuid + (strlen(lc->uuid) - 8));
-			*/
+	/*
+	 * It is possible for the following in the mirror code:
+	 *  0) Mark is already logged for a region
+	 *  1) rh_dec, sets region state to RH_CLEAN (asynchronous)
+	 *  2) rh_update_states (DOESN'T FLUSH!!!, bug #235040)
+	 *  3) do_writes, trys to mark region
+	 *
+	 * The following shouldn't have to be handled b/c of the flush
+	 *  0) Region finishes recovery
+	 *  1) rh_update_states clears region (DOES FLUSH)
+	 *  2) do_writes, trys to mark region
+	 *
+	 * This can lead to this next case being valid.
+	 */
+	list_for_each_entry_safe(rs, tmp_rs, &lc->clear_waiting, rs_list) {
+		if (region == rs->rs_region) {
+			if (!rs->rs_mark_logged) {
+				DMERR("Moving region(%Lu/%s) from clear_waiting -> mark_waiting",
+				      region, lc->uuid + (strlen(lc->uuid) - 8));
+			}
 			list_del_init(&rs->rs_list);
-			list_add(&rs->rs_list, &marked_region_list);
-			clear_region_count--;
-			spin_unlock(&region_state_lock);
-			if (rs_new)
-				mempool_free(rs_new, region_state_pool);
-
-			return;
+			list_add(&rs->rs_list,
+				 (rs->rs_mark_logged) ?
+				 &lc->mark_logged : &lc->mark_waiting);
+			goto out;
 		}
 	}
+
 	/*
-	 * In the mirroring code, it is possible for a write
-	 * to complete and call rh_dec - putting the region on
-	 * the clear_region list.  However, before the actual
-	 * clear request is issued to the log (rh_update_states)
-	 * another mark happens.  So, we check for and remove
-	 * duplicates.
+	 * It is possible for the following in the mirror code:
+	 *  0) Mark is already logged for a region
+	 *  1) rh_update_states
+	 *  2) rh_dec, sets region state to RH_CLEAN (asynchronous)
+	 *  3) do_writes, trys to mark region
+	 *
+	 * This can lead to this next case being valid.
 	 */
-	list_for_each_entry(rs, &marked_region_list, rs_list){
-		if(lc == rs->rs_lc && region == rs->rs_region){
-#ifdef DEBUG
-			DMINFO("Double mark on region ("
-			       SECTOR_FORMAT ")", region);
-#endif
-			spin_unlock(&region_state_lock);
-			if (rs_new)
-				mempool_free(rs_new, region_state_pool);
-
-			return;
+	list_for_each_entry_safe(rs, tmp_rs, &lc->mark_logged, rs_list){
+		if (region == rs->rs_region) {
+			goto out;
 		}
 	}
 
-	if(!rs_new){
-		DMERR("Unable to allocate region_state for mark.");
-		BUG();
+	list_for_each_entry_safe(rs, tmp_rs, &lc->mark_waiting, rs_list){
+		if (region == rs->rs_region) {
+			DMERR("Mark already waiting (%Lu/%s)",
+			      region, lc->uuid + (strlen(lc->uuid) - 8));
+			BUG();
+		}
 	}
+	spin_unlock(&lc->state_lock);
 
-	rs_new->rs_lc = lc;
+	rs_new = mempool_alloc(region_state_pool, GFP_NOFS);
+	BUG_ON(!rs_new);
+	memset(rs_new, 0, sizeof(struct region_state));
+
+	spin_lock(&lc->state_lock);
+	rs_new->rs_mark_logged = 1;
 	rs_new->rs_region = region;
 	INIT_LIST_HEAD(&rs_new->rs_list);
-	list_add(&rs_new->rs_list, &marked_region_list);
-
-	spin_unlock(&region_state_lock);
-
-	if (!lc->log_dev_failed) {
-		while((error = consult_server(lc, region, LRT_MARK_REGION, NULL))){
-			if (error == -EBUSY) {
-				/* Remote recovering delay and try again */
-				DMDEBUG("Delaying mark to region %Lu, due to recovery",
-					region);
-				set_current_state(TASK_INTERRUPTIBLE);
-				schedule_timeout(HZ/2);
-				continue;
-			}
-
-			if (error == -EIO) {
-				lc->log_dev_failed = 1;
-				break;
-			}
-			DMWARN("unable to get server (%u) to mark region (%Lu)",
-			       lc->server_id, region);
-			DMWARN("Reason :: %d", error);
-		}
+	list_add(&rs_new->rs_list, &lc->mark_waiting);
+out:
+	spin_unlock(&lc->state_lock);
 
-		if (lc->log_dev_failed) {
-			dm_table_event(lc->ti->table);
-			/*
-			  DMERR("Write failed on mirror log device, %s",
-			  lc->log_dev->name);
-			  if (!atomic_read(&lc->suspended))
-			  wait_for_completion(&lc->failure_completion);
-			*/
-		}
-	}
 	return;
 }
 
@@ -1008,53 +1032,48 @@
 	struct log_c *lc = (struct log_c *) log->context;
 	struct region_state *rs, *tmp_rs, *rs_new;
 
-	rs_new = mempool_alloc(region_state_pool, GFP_ATOMIC);
+	spin_lock(&lc->state_lock);
 
-	memset(rs_new, 0, sizeof(struct region_state));
+	/* Should find match in this list, or no lists at all */
+	list_for_each_entry_safe(rs, tmp_rs, &lc->mark_logged, rs_list){
+		if(region == rs->rs_region){
+			list_del_init(&rs->rs_list);
+			list_add(&rs->rs_list, &lc->clear_waiting);
+			goto out;
+		}
+	}
 
-	spin_lock(&region_state_lock);
 
-	list_for_each_entry_safe(rs, tmp_rs, &clear_region_list, rs_list){
-		if(lc == rs->rs_lc && region == rs->rs_region){
-			DMINFO("%d) Double clear on region ("
-			      SECTOR_FORMAT ")", __LINE__, region);
-			spin_unlock(&region_state_lock);
-			if (rs_new)
-				mempool_free(rs_new, region_state_pool);
-			return;
+	list_for_each_entry_safe(rs, tmp_rs, &lc->mark_waiting, rs_list){
+		if(region == rs->rs_region){
+			DMERR("Clear pre-empting mark (%Lu/%s)",
+			       region, lc->uuid + (strlen(lc->uuid) - 8));
+			BUG();
 		}
 	}
 
-	list_for_each_entry_safe(rs, tmp_rs, &marked_region_list, rs_list){
-		if(lc == rs->rs_lc && region == rs->rs_region){
-			list_del_init(&rs->rs_list);
-			list_add(&rs->rs_list, &clear_region_list);
-			clear_region_count++;
-			spin_unlock(&region_state_lock);
-			if (rs_new)
-				mempool_free(rs_new, region_state_pool);
-			return;
+	list_for_each_entry_safe(rs, tmp_rs, &lc->clear_waiting, rs_list){
+		if(region == rs->rs_region){
+			DMERR("%d) Double clear on region ("
+			      SECTOR_FORMAT ")", __LINE__, region);
+			BUG();
 		}
 	}
-
-	/* We can get here because we my be doing resync_work, and therefore, **
+	/* We can get here because we may be doing resync_work, and therefore,**
 	** clearing without ever marking..................................... */
 
-	if(!rs_new){
-		DMERR("Unable to allocate region_state for clear.");
-		BUG();
-	}
+	/* Don't need to spin_unlock, because allocation is non-blocking */
+	rs_new = mempool_alloc(region_state_pool, GFP_ATOMIC);
+	BUG_ON(!rs_new);
+	memset(rs_new, 0, sizeof(struct region_state));
 
-	rs_new->rs_lc = lc;
 	rs_new->rs_region = region;
 	INIT_LIST_HEAD(&rs_new->rs_list);
-	list_add(&rs_new->rs_list, &clear_region_list);
-	clear_region_count++;
-	if(!(clear_region_count & 0x7F)){
-		DMINFO("clear_region_count :: %d", clear_region_count);
-	}
+	list_add(&rs_new->rs_list, &lc->clear_waiting);
+
+out:
+	spin_unlock(&lc->state_lock);
 
-	spin_unlock(&region_state_lock);
 	return;
 }
 
@@ -1122,27 +1141,6 @@
 
 	switch(status){
 	case STATUSTYPE_INFO:
-/*
-		spin_lock(&region_state_lock);
-		i = clear_region_count;
-		list_for_each_entry(rs, &marked_region_list, rs_list){
-			j++;
-		}
-		spin_unlock(&region_state_lock);
-
-		DMINFO("CLIENT OUTPUT::");
-		DMINFO("  My ID            : %u", my_id);
-		DMINFO("  Server ID        : %u", lc->server_id);
-
-		DMINFO("  In-sync          : %s", (atomic_read(&lc->in_sync)>0)?
-		       "YES" : "NO");
-		DMINFO("  Regions marked   : %d", j);
-		DMINFO("  Regions clearing : %d", i);
-
-		if(lc->server_id == my_id){
-			print_server_status(lc);
-		}
-*/
 		if(lc->sync != DEFAULTSYNC)
 			arg_count++;
 
@@ -1195,11 +1193,11 @@
 
 	atomic_set(&suspend_client, 1);
 
-	spin_lock(&log_list_lock);
+	down(&log_list_lock);
 	list_for_each_entry(lc, &log_list_head, log_list) {
 		atomic_set(&lc->in_sync, 0);
 	}
-	spin_unlock(&log_list_lock);
+	up(&log_list_lock);
 
 	/*
 	if (likely(!shutting_down))
@@ -1221,7 +1219,12 @@
 	global_nodeids = nodeids;
 	global_count = count;
 
-	kcl_get_node_by_nodeid(0, &node);
+	for (i = 0; kcl_get_node_by_nodeid(0, &node); i++) {
+		if (i > 10)
+			BUG();
+		else
+			DMERR("Bad call to kcl_get_node_by_nodeid");
+	}
 	my_id = node.node_id;
 
 	/* Wait for any outstanding starts to complete */
@@ -1233,7 +1236,7 @@
 	switch(type){
 	case SERVICE_NODE_LEAVE:
 	case SERVICE_NODE_FAILED:
-		spin_lock(&log_list_lock);
+		down(&log_list_lock);
 		list_for_each_entry(lc, &log_list_head, log_list){
 			for(i=0, server = 0xDEAD; i < count; i++){
 				if(lc->server_id == nodeids[i]){
@@ -1243,7 +1246,7 @@
 			/* ATTENTION -- need locking around this ? */
 			lc->server_id = server;
 		}
-		spin_unlock(&log_list_lock);
+		up(&log_list_lock);
 
 		break;
 	case SERVICE_NODE_JOIN:
@@ -1279,10 +1282,8 @@
 
 	down(&cmirror_register_lock);
 
-	if (mirror_set_count++) {
-		up(&cmirror_register_lock);
+	if (mirror_set_count++)
 		goto out;
-	}
 
 	r = kcl_register_service("clustered_log", 13, SERVICE_LEVEL_GDLM, &clog_ops,
 				 1, NULL, &local_id);
@@ -1383,12 +1384,7 @@
         DMINFO("dm-cmirror %s (built %s %s) installed",
                CMIRROR_RELEASE_NAME, __DATE__, __TIME__);
 
-	INIT_LIST_HEAD(&clear_region_list);
-	INIT_LIST_HEAD(&marked_region_list);
-
-	spin_lock_init(&region_state_lock);
-	spin_lock_init(&log_list_lock);
-	region_state_pool = mempool_create(20, region_state_alloc,
+	region_state_pool = mempool_create(500, region_state_alloc,
 					   region_state_free, NULL);
 	if(!region_state_pool){
 		DMWARN("couldn't create region state pool");
@@ -1424,6 +1420,8 @@
 	}
 	dm_unregister_dirty_log_type(&_clustered_core_type);
 	dm_unregister_dirty_log_type(&_clustered_disk_type);
+        DMINFO("dm-cmirror %s (built %s %s) removed",
+               CMIRROR_RELEASE_NAME, __DATE__, __TIME__);
 }
 
 module_init(cluster_dirty_log_init);
--- cluster/cmirror-kernel/src/Attic/dm-cmirror-common.h	2007/02/21 17:14:44	1.1.2.12
+++ cluster/cmirror-kernel/src/Attic/dm-cmirror-common.h	2007/04/03 18:21:10	1.1.2.13
@@ -97,7 +97,7 @@
 	unsigned bitset_uint32_count;
 	uint32_t *clean_bits;
 	uint32_t *sync_bits;
-	uint32_t *recovering_bits;	/* FIXME: this seems excessive */
+	uint64_t recovering_region;
 
 	int sync_pass;          /* number of passes attempting to resync */
 	int sync_search;
@@ -134,7 +134,12 @@
 	atomic_t in_sync;  /* like sync_count, except all or nothing */
 
 	struct list_head log_list;
-	struct list_head region_users;
+	struct list_head region_users;  /* Used by Server */
+
+	spinlock_t state_lock;
+	struct list_head clear_waiting;
+	struct list_head mark_waiting;
+	struct list_head mark_logged;
 
 	uint32_t server_id;
 	struct socket *client_sock;
--- cluster/cmirror-kernel/src/Attic/dm-cmirror-server.c	2007/03/22 22:21:59	1.1.2.27
+++ cluster/cmirror-kernel/src/Attic/dm-cmirror-server.c	2007/04/03 18:21:10	1.1.2.28
@@ -47,7 +47,7 @@
 static atomic_t _do_requests;
 
 static int debug_disk_write = 0;
-extern spinlock_t log_list_lock;
+extern struct semaphore log_list_lock;
 extern struct list_head log_list_head;
 
 static void *region_user_alloc(int gfp_mask, void *pool_data){
@@ -225,6 +225,11 @@
 
 static int _core_get_resync_work(struct log_c *lc, region_t *region)
 {
+	if (lc->recovering_region != (uint64_t)-1) {
+		DMDEBUG("Someone is already recovering (%Lu)", lc->recovering_region);
+		return 0;
+	}
+
 	if (lc->sync_search >= lc->region_count) {
 		/*
 		 * FIXME: pvmove is not supported yet, but when it is,
@@ -237,18 +242,16 @@
 			return 0;
 		}
 	}
-	do {
-		*region = ext2_find_next_zero_bit((unsigned long *) lc->sync_bits,
-						  lc->region_count,
-						  lc->sync_search);
-		lc->sync_search = *region + 1;
-
-		if (*region >= lc->region_count)
-			return 0;
+	*region = ext2_find_next_zero_bit((unsigned long *) lc->sync_bits,
+					  lc->region_count,
+					  lc->sync_search);
+	lc->sync_search = *region + 1;
 
-	} while (log_test_bit(lc->recovering_bits, *region));
+	if (*region >= lc->region_count)
+		return 0;
 
-	log_set_bit(lc, lc->recovering_bits, *region);
+	lc->recovering_region = *region;
+	DMDEBUG("Assigning recovery work: %Lu", *region);
 	return 1;
 }
 
@@ -371,7 +374,7 @@
 			bad_count++;
 			log_clear_bit(lc, lc->sync_bits, ru->ru_region);
 			if (ru->ru_rw == RU_RECOVER) {
-				log_clear_bit(lc, lc->recovering_bits, ru->ru_region);
+				lc->recovering_region = (uint64_t)-1;
 			}
 			list_del(&ru->ru_list);
 			mempool_free(ru, region_user_pool);
@@ -506,10 +509,9 @@
 
 static int server_mark_region(struct log_c *lc, struct log_request *lr, uint32_t who)
 {
-	int r = 0;
 	struct region_user *ru, *new;
 
-	new = mempool_alloc(region_user_pool, GFP_KERNEL);
+	new = mempool_alloc(region_user_pool, GFP_NOFS);
 	if(!new){
 		return -ENOMEM;
 	}
@@ -519,21 +521,13 @@
     
 	if (!(ru = find_ru_by_region(lc, lr->u.lr_region))) {
 		log_clear_bit(lc, lc->clean_bits, lr->u.lr_region);
-		r = write_bits(lc);
-
 		list_add(&new->ru_list, &lc->region_users);
-		if (!r) {
-			lc->touched = 0;
-			lc->log_dev_failed = 0;
-		} else {
-			lc->log_dev_failed = 1;
-		}
 	} else if (ru->ru_rw == RU_RECOVER) {
-		DMINFO("Attempt to mark a region " SECTOR_FORMAT 
+		DMDEBUG("Attempt to mark a region " SECTOR_FORMAT
 		      "/%s which is being recovered.",
 		       lr->u.lr_region, lc->uuid + (strlen(lc->uuid) - 8));
-		DMINFO("Current recoverer: %u", ru->ru_nodeid);
-		DMINFO("Mark requester   : %u", who);
+		DMDEBUG("Current recoverer: %u", ru->ru_nodeid);
+		DMDEBUG("Mark requester   : %u", who);
 
 		mempool_free(new, region_user_pool);
 		return -EBUSY;
@@ -547,7 +541,7 @@
 		mempool_free(new, region_user_pool);
 	}
 
-	return (lc->log_dev_failed) ? -EIO : 0;
+	return 0;
 }
 
 
@@ -567,28 +561,34 @@
 
 	if(!find_ru_by_region(lc, lr->u.lr_region)){
 		log_set_bit(lc, lc->clean_bits, lr->u.lr_region);
-		write_bits(lc);
-		/*
-		if (write_bits(lc))
-			DMERR("Write bits failed on mirror log device, %s",
-			      lc->log_dev->name);
-		*/
 	}
 	return 0;
 }
 
 
+static int server_flush(struct log_c *lc)
+{
+	int r = 0;
+
+	r = write_bits(lc);
+	if (!r) {
+		lc->touched = 0;
+		lc->log_dev_failed = 0;
+	} else {
+		lc->log_dev_failed = 1;
+	}
+
+	return (lc->log_dev_failed) ? -EIO : 0;
+}
+
+
 static int server_get_resync_work(struct log_c *lc, struct log_request *lr, uint32_t who)
 {
 	struct region_user *new;
 
-/* We now have the ability to use remote_recovering
-	if (my_id != who)
-		return 0;
-*/
-
-	new = mempool_alloc(region_user_pool, GFP_KERNEL);
+	new = mempool_alloc(region_user_pool, GFP_NOFS);
 	if(!new){
+		lr->u.lr_int_rtn = 0;
 		return -ENOMEM;
 	}
 	
@@ -610,9 +610,15 @@
 		return -EINVAL;
 	}
 
-	log_clear_bit(lc, lc->recovering_bits, lr->u.lr_region);
-
 	if (success) {
+		if (lr->u.lr_region != lc->recovering_region) {
+			DMERR("Told to clear recovery on wrong region %Lu/%Lu",
+			      lr->u.lr_region, lc->recovering_region);
+			return -EINVAL;
+		}
+
+		lc->recovering_region = (uint64_t)-1;
+
 		/* We could receive multiple identical request due to network failure */
 		if(!log_test_bit(lc->sync_bits, lr->u.lr_region)) {
 			log_set_bit(lc, lc->sync_bits, lr->u.lr_region);
@@ -650,7 +656,7 @@
 static struct log_c *get_log_context(char *uuid, int uuid_ref){
 	struct log_c *lc, *r = NULL;
 
-	spin_lock(&log_list_lock);
+	down(&log_list_lock);
 	list_for_each_entry(lc, &log_list_head, log_list){
 		if (!strncmp(lc->uuid, uuid, MAX_NAME_LEN) &&
 		    (uuid_ref == lc->uuid_ref)) {
@@ -660,7 +666,7 @@
 				r = lc;
 		}
 	}
-	spin_unlock(&log_list_lock);
+	up(&log_list_lock);
 
 	return r;
 }
@@ -838,6 +844,7 @@
  * Returns: 0 on success, -1 on error
  */
 static int process_log_request(struct socket *sock){
+	static struct log_request *lr = NULL;
 	int error;
 	uint32_t nodeid;
 	struct msghdr msg;
@@ -845,9 +852,13 @@
 	struct sockaddr_in saddr_in;
 	mm_segment_t fs;
 	struct log_c *lc;
-	struct log_request lr; /* ATTENTION -- could be too much on the stack */
 
-	memset(&lr, 0, sizeof(struct log_request));
+	if (unlikely(!lr))
+		lr = kmalloc(sizeof(*lr), GFP_KERNEL);
+	if (!lr)
+		return -1;
+
+	memset(lr, 0, sizeof(struct log_request));
 	memset(&saddr_in, 0, sizeof(saddr_in));
 		
 	msg.msg_control = NULL;
@@ -858,7 +869,7 @@
 	msg.msg_name = &saddr_in;
 	msg.msg_namelen = sizeof(saddr_in);
 	iov.iov_len = sizeof(struct log_request);
-	iov.iov_base = &lr;
+	iov.iov_base = lr;
 		
 	fs = get_fs();
 	set_fs(get_ds());
@@ -871,14 +882,14 @@
 		if(error < sizeof(struct log_request)){
 			DMERR("Cluster mirror log server received incomplete message.");
 		}
-		lc = get_log_context(lr.lr_uuid, lr.lr_uuid_ref);
+		lc = get_log_context(lr->lr_uuid, lr->lr_uuid_ref);
 
-		if(lr.lr_type == LRT_ELECTION ||
-		   lr.lr_type == LRT_SELECTION ||
-		   lr.lr_type == LRT_MASTER_ASSIGN ||
-		   lr.lr_type == LRT_MASTER_LEAVING){
+		if(lr->lr_type == LRT_ELECTION ||
+		   lr->lr_type == LRT_SELECTION ||
+		   lr->lr_type == LRT_MASTER_ASSIGN ||
+		   lr->lr_type == LRT_MASTER_LEAVING){
 			uint32_t old = (lc)?lc->server_id: 0xDEAD;
-			if(process_election(&lr, lc, &saddr_in)){
+			if(process_election(lr, lc, &saddr_in)){
 				DMERR("Election processing failed.");
 				return -1;
 			}
@@ -896,12 +907,12 @@
 		}
 
 		if(!lc){
-			lr.u.lr_int_rtn = -ENXIO;
+			lr->u.lr_int_rtn = -ENXIO;
 			goto reply;
 		}
 
 		if (lc->server_id != my_id) {
-			lr.u.lr_int_rtn = -ENXIO;
+			lr->u.lr_int_rtn = -ENXIO;
 			goto reply;
 		}
 
@@ -911,23 +922,23 @@
 			DMDEBUG("Getting request while server (%u) is suspended:", my_id);
 			DMDEBUG(" - Requester :: %u", nodeid);
 			DMDEBUG(" - log uuid  :: %s", lc->uuid + (strlen(lc->uuid) - 8));
-			DMDEBUG(" - req type  :: %s", RQ_STRING(lr.lr_type));
+			DMDEBUG(" - req type  :: %s", RQ_STRING(lr->lr_type));
 			*/
 			if (my_id != nodeid) {
-				lr.u.lr_int_rtn = -ENXIO;
+				lr->u.lr_int_rtn = -ENXIO;
 				goto reply;
 			}
 		}			
 
-		switch(lr.lr_type){
+		switch(lr->lr_type){
 		case LRT_IS_CLEAN:
-			error = server_is_clean(lc, &lr);
+			error = server_is_clean(lc, lr);
 			break;
 		case LRT_IS_REMOTE_RECOVERING:
-			error = server_is_remote_recovering(lc, &lr);
+			error = server_is_remote_recovering(lc, lr);
 			break;
 		case LRT_IN_SYNC:
-			error = server_in_sync(lc, &lr);
+			error = server_in_sync(lc, lr);
 			break;
 		case LRT_MARK_REGION:
 			if(!(nodeid = 
@@ -935,8 +946,8 @@
 				error = -ENXIO;
 				break;
 			}
-			error = server_mark_region(lc, &lr, nodeid);
-			lr.u.lr_int_rtn = 0;
+			error = server_mark_region(lc, lr, nodeid);
+			lr->u.lr_int_rtn = 0;
 			break;
 		case LRT_CLEAR_REGION:
 			if(!(nodeid = 
@@ -944,7 +955,10 @@
 				error = -ENXIO;
 				break;
 			}
-			error = server_clear_region(lc, &lr, nodeid);
+			error = server_clear_region(lc, lr, nodeid);
+			break;
+		case LRT_FLUSH:
+			error = server_flush(lc);
 			break;
 		case LRT_GET_RESYNC_WORK:
 			if(!(nodeid = 
@@ -952,14 +966,14 @@
 				error = -ENXIO;
 				break;
 			}
-			error = server_get_resync_work(lc, &lr, nodeid);
+			error = server_get_resync_work(lc, lr, nodeid);
 			break;
 		case LRT_COMPLETE_RESYNC_WORK:
-			error = server_complete_resync_work(lc, &lr, lr.u.lr_int_rtn);
-			lr.u.lr_int_rtn = 0;
+			error = server_complete_resync_work(lc, lr, lr->u.lr_int_rtn);
+			lr->u.lr_int_rtn = 0;
 			break;
 		case LRT_GET_SYNC_COUNT:
-			error = server_get_sync_count(lc, &lr);
+			error = server_get_sync_count(lc, lr);
 			break;
 		default:
 			DMWARN("unknown request type received");
@@ -971,15 +985,15 @@
 		if(error){
 /*
 			DMWARN("Error (%d) while processing request (%s)",
-			       error, RQ_STRING(lr.lr_type));
+			       error, RQ_STRING(lr->lr_type));
 */
-			lr.u.lr_int_rtn = error;
+			lr->u.lr_int_rtn = error;
 		}
 	reply:
     
 		/* Why do we need to reset this? */
 		iov.iov_len = sizeof(struct log_request);
-		iov.iov_base = &lr;
+		iov.iov_base = lr;
 		msg.msg_name = &saddr_in;
 		msg.msg_namelen = sizeof(saddr_in);
 
@@ -991,7 +1005,7 @@
 		set_fs(fs);
 		if(error < 0){
 			DMWARN("unable to sendmsg to client (type = %s, error = %d)",
-			       RQ_STRING(lr.lr_type), error);
+			       RQ_STRING(lr->lr_type), error);
 			return error;
 		}
 	} else if(error == -EAGAIN || error == -ETIMEDOUT){
@@ -1052,7 +1066,7 @@
 			if (atomic_read(&restart_event_type) == SERVICE_NODE_FAILED)
 				DMINFO("A cluster mirror log member has failed.");
 			
-			spin_lock(&log_list_lock);
+			down(&log_list_lock);
 			list_for_each_entry(lc, &log_list_head, log_list){
 				if(lc->server_id == my_id){
 					if (atomic_read(&lc->suspended)) {
@@ -1062,7 +1076,7 @@
 					}
 				}
 			}
-			spin_unlock(&log_list_lock);
+			up(&log_list_lock);
 
 			break;
 		default:
@@ -1150,7 +1164,7 @@
 int start_server(void /* log_devices ? */){
 	int error;
 
-	region_user_pool = mempool_create(100, region_user_alloc,
+	region_user_pool = mempool_create(1000, region_user_alloc,
 					  region_user_free, NULL);
 	if(!region_user_pool){
 		DMWARN("unable to allocate region user pool for server");
--- cluster/cmirror-kernel/src/Attic/dm-cmirror-xfr.h	2007/03/22 22:21:59	1.1.2.3
+++ cluster/cmirror-kernel/src/Attic/dm-cmirror-xfr.h	2007/04/03 18:21:10	1.1.2.4
@@ -14,14 +14,15 @@
 #define LRT_IN_SYNC             	3
 #define LRT_MARK_REGION         	4
 #define LRT_CLEAR_REGION        	5
-#define LRT_GET_RESYNC_WORK     	6
-#define LRT_COMPLETE_RESYNC_WORK        7
-#define LRT_GET_SYNC_COUNT      	8
-
-#define LRT_ELECTION			9
-#define LRT_SELECTION			10
-#define LRT_MASTER_ASSIGN		11
-#define LRT_MASTER_LEAVING		12
+#define LRT_FLUSH                       6
+#define LRT_GET_RESYNC_WORK     	7
+#define LRT_COMPLETE_RESYNC_WORK        8
+#define LRT_GET_SYNC_COUNT      	9
+
+#define LRT_ELECTION			10
+#define LRT_SELECTION			11
+#define LRT_MASTER_ASSIGN		12
+#define LRT_MASTER_LEAVING		13
 
 #define CLUSTER_LOG_PORT 51005
 
@@ -29,6 +30,7 @@
 	((x) == LRT_IS_CLEAN) ? "LRT_IS_CLEAN": \
 	((x) == LRT_IN_SYNC) ? "LRT_IN_SYNC": \
 	((x) == LRT_MARK_REGION) ? "LRT_MARK_REGION": \
+	((x) == LRT_FLUSH) ? "LRT_FLUSH": \
 	((x) == LRT_GET_RESYNC_WORK) ? "LRT_GET_RESYNC_WORK": \
 	((x) == LRT_GET_SYNC_COUNT) ? "LRT_GET_SYNC_COUNT": \
 	((x) == LRT_CLEAR_REGION) ? "LRT_CLEAR_REGION": \



^ permalink raw reply	[flat|nested] 40+ messages in thread

* [Cluster-devel] cluster/cmirror-kernel/src dm-cmirror-client.c ...
@ 2007-03-22 22:34 jbrassow
  0 siblings, 0 replies; 40+ messages in thread
From: jbrassow @ 2007-03-22 22:34 UTC (permalink / raw)
  To: cluster-devel.redhat.com

CVSROOT:	/cvs/cluster
Module name:	cluster
Branch: 	RHEL45
Changes by:	jbrassow at sourceware.org	2007-03-22 22:34:44

Modified files:
	cmirror-kernel/src: dm-cmirror-client.c dm-cmirror-server.c 
	                    dm-cmirror-xfr.h 

Log message:
	Bug 233034: cmirror server failure/migration during GFS I/O causes metad...
	
	Add sequence number to messages to ensure
	that cmirror clients get the response they expect.

Patches:
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cmirror-kernel/src/dm-cmirror-client.c.diff?cvsroot=cluster&only_with_tag=RHEL45&r1=1.1.2.41&r2=1.1.2.41.2.1
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cmirror-kernel/src/dm-cmirror-server.c.diff?cvsroot=cluster&only_with_tag=RHEL45&r1=1.1.2.26&r2=1.1.2.26.2.1
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cmirror-kernel/src/dm-cmirror-xfr.h.diff?cvsroot=cluster&only_with_tag=RHEL45&r1=1.1.2.2&r2=1.1.2.2.2.1

--- cluster/cmirror-kernel/src/Attic/dm-cmirror-client.c	2007/03/14 04:28:32	1.1.2.41
+++ cluster/cmirror-kernel/src/Attic/dm-cmirror-client.c	2007/03/22 22:34:44	1.1.2.41.2.1
@@ -53,12 +53,6 @@
 /* These vars are just for stats, and will be removed */
 static uint32_t request_count=0;
 static uint32_t request_retry_count=0;
-static int clear_req=0;
-static int mark_req=0;
-static int insync_req=0;
-static int clear_req2ser=0;
-static int mark_req2ser=0;
-static int insync_req2ser=0;
 
 static void *region_state_alloc(int gfp_mask, void *pool_data){
 	return kmalloc(sizeof(struct region_state), gfp_mask);
@@ -316,6 +310,7 @@
 
 static int _consult_server(struct log_c *lc, region_t region,
 			  int type, region_t *result, int *retry){
+	static int seq = 0;
 	int len;
 	int error=0;
 	struct sockaddr_in saddr_in;
@@ -336,6 +331,7 @@
 	memset(lr, 0, sizeof(struct log_request));
 	
 	lr->lr_type = type;
+	lr->lr_seq = seq;
 	if(type == LRT_MASTER_LEAVING){
 		lr->u.lr_starter = my_id;
 	} else {
@@ -369,18 +365,6 @@
 	iov.iov_len = sizeof(struct log_request);
 	iov.iov_base = lr;
 
-	if(lr->lr_type == LRT_MARK_REGION){
-		mark_req2ser++;
-	}
-
-	if(lr->lr_type == LRT_CLEAR_REGION){
-		clear_req2ser++;
-	}
-	
-	if(lr->lr_type == LRT_IN_SYNC){
-		insync_req2ser++;
-	}
-	
 	fs = get_fs();
 	set_fs(get_ds());
   
@@ -394,6 +378,7 @@
 		goto fail;
 	}
 
+rerecv:
 	iov.iov_len = sizeof(struct log_request);
 	iov.iov_base = lr;
 
@@ -414,9 +399,44 @@
 		DMWARN("Error while listening for server response: %d", len);
 		error = len;
 		*retry = 1;
+		seq++;
 		goto fail;
 	}
     
+	if (seq != lr->lr_seq) {
+		DMERR("Message sequence number mismatch: %d/%d",
+		      seq, lr->lr_seq);
+		if (seq > lr->lr_seq) {
+			DMERR(" Skipping.  Listening again for response to %s",
+			      RQ_STRING(type));
+			memset(lr, 0, sizeof(struct log_request));
+			goto rerecv;
+		}
+		DMERR(" Must try to resend request, %s", RQ_STRING(type));
+		error = -EBADE;
+		*retry = 1;
+		seq++;
+		goto fail;
+	}
+	seq++;
+
+	if (type != lr->lr_type) {
+		DMERR("Got incorrect message type back: %s/%s",
+		      RQ_STRING(type), RQ_STRING(lr->lr_type));
+		error = -EBADE;
+		*retry = 1;
+		goto fail;
+	}
+
+	if (memcmp(lc->uuid, lr->lr_uuid, MAX_NAME_LEN)) {
+		DMERR("Got reply from server for wrong log:");
+		DMERR(" Expected UUID: %s", lc->uuid);
+		DMERR(" Recieved UUID: %s", lr->lr_uuid);
+		error = -EBADE;
+		*retry = 1;
+		goto fail;
+	}
+
 	if(lr->u.lr_int_rtn == -EAGAIN){
 		DMWARN("Server (%u), request type %d, -EAGAIN."
 		       "  Mirror suspended?",
@@ -453,17 +473,7 @@
 			DMDEBUG(" - log uuid:: %s (%s)",
 			       lc->uuid + (strlen(lc->uuid) - 8),
 			       atomic_read(&lc->suspended) ? "suspended" : "active");
-			DMDEBUG(" - request :: %s",
-			       (type == LRT_IS_CLEAN)? "LRT_IS_CLEAN":
-			       (type == LRT_IN_SYNC)? "LRT_IN_SYNC":
-			       (type == LRT_MARK_REGION)? "LRT_MARK_REGION":
-			       (type == LRT_GET_RESYNC_WORK)? "LRT_GET_RESYNC_WORK":
-			       (type == LRT_GET_SYNC_COUNT)? "LRT_GET_SYNC_COUNT":
-			       (type == LRT_CLEAR_REGION)? "LRT_CLEAR_REGION":
-			       (type == LRT_COMPLETE_RESYNC_WORK)? "LRT_COMPLETE_RESYNC_WORK":
-			       (type == LRT_MASTER_LEAVING)? "LRT_MASTER_LEAVING":
-			       (type == LRT_ELECTION)? "LRT_ELECTION":
-			       (type == LRT_SELECTION)? "LRT_SELECTION": "UNKNOWN");
+			DMDEBUG(" - request :: %s", RQ_STRING(type));
 			DMDEBUG(" - error   :: %d", error);
 			DMINFO("Too many retries, attempting to re-establish server connection.");
 			lc->server_id = 0xDEAD;
@@ -519,7 +529,7 @@
 			}
 			clear_region_count -= i;
 			DMINFO(" - %d clear region requests wiped", i);
-
+			i=0;
 			DMINFO(" - Resending all mark region requests");
 			list_for_each_entry(rs, &marked_region_list, rs_list){
 				/* Resend only those associated with referenced log */
@@ -527,7 +537,7 @@
 					continue;
 				do {
 					retry = 0;
-					DMINFO("   - " SECTOR_FORMAT, rs->rs_region);
+					i++;
 					rtn = _consult_server(rs->rs_lc, rs->rs_region,
 							      LRT_MARK_REGION, NULL, &retry);
 					if (lc->server_id == 0xDEAD) {
@@ -536,6 +546,7 @@
 					}
 				} while(retry);
 			}
+			DMINFO(" - %d mark region requests resent", i);
 			DMINFO("Clean-up complete");
 			if(type == LRT_MARK_REGION){
 				/* we just handled all marks */
@@ -544,17 +555,7 @@
 				goto out;
 			} else {
 				DMINFO("Continuing request type, %d (%s)", type,
-				      (type == LRT_IS_CLEAN)? "LRT_IS_CLEAN":
-				      (type == LRT_IN_SYNC)? "LRT_IN_SYNC":
-				      (type == LRT_MARK_REGION)? "LRT_MARK_REGION":
-				      (type == LRT_GET_RESYNC_WORK)? "LRT_GET_RESYNC_WORK":
-				      (type == LRT_GET_SYNC_COUNT)? "LRT_GET_SYNC_COUNT":
-				      (type == LRT_CLEAR_REGION)? "LRT_CLEAR_REGION":
-				      (type == LRT_COMPLETE_RESYNC_WORK)? "LRT_COMPLETE_RESYNC_WORK":
-				      (type == LRT_MASTER_LEAVING)? "LRT_MASTER_LEAVING":
-				      (type == LRT_ELECTION)? "LRT_ELECTION":
-				      (type == LRT_SELECTION)? "LRT_SELECTION": "UNKNOWN"
-					);
+				       RQ_STRING(type));
 			}
 			new_server = 0;
 		}
@@ -886,7 +887,6 @@
 	struct log_c *lc = (struct log_c *) log->context;
   
 	/* check known_regions, return if found */
-	insync_req++;
 /* take out optimization
 	if(atomic_read(&lc->in_sync) == 1){
 		return 1;
@@ -915,8 +915,6 @@
 	struct region_state *rs, *tmp_rs, *rs_new;
 	struct log_c *lc = (struct log_c *) log->context;
 
-	mark_req++;
-
 	rs_new = mempool_alloc(region_state_pool, GFP_KERNEL);
 
 	memset(rs_new, 0, sizeof(struct region_state));
@@ -924,8 +922,10 @@
 	spin_lock(&region_state_lock);
 	list_for_each_entry_safe(rs, tmp_rs, &clear_region_list, rs_list){
 		if(lc == rs->rs_lc && region == rs->rs_region){
+			/*
 			DMDEBUG("Mark pre-empting clear (%Lu/%s)",
 				region, lc->uuid + (strlen(lc->uuid) - 8));
+			*/
 			list_del_init(&rs->rs_list);
 			list_add(&rs->rs_list, &marked_region_list);
 			clear_region_count--;
@@ -1007,7 +1007,6 @@
 {
 	struct log_c *lc = (struct log_c *) log->context;
 	struct region_state *rs, *tmp_rs, *rs_new;
-	clear_req++;
 
 	rs_new = mempool_alloc(region_state_pool, GFP_ATOMIC);
 
@@ -1140,21 +1139,6 @@
 		DMINFO("  Regions marked   : %d", j);
 		DMINFO("  Regions clearing : %d", i);
 
-		DMINFO("  Mark requests    : %d", mark_req);
-		if(mark_req)
-			DMINFO("  Mark req to serv : %d (%d%%)", mark_req2ser,
-			       (mark_req2ser*100)/mark_req);
-
-		DMINFO("  Clear requests   : %d", clear_req);
-		if(clear_req)
-			DMINFO("  Clear req to serv: %d (%d%%)", clear_req2ser,
-			       (clear_req2ser*100)/clear_req);
-
-		DMINFO("  Sync  requests   : %d", insync_req);
-		if(insync_req)
-			DMINFO("  Sync req to serv : %d (%d%%)", insync_req2ser,
-			       (insync_req2ser*100)/insync_req);
-
 		if(lc->server_id == my_id){
 			print_server_status(lc);
 		}
@@ -1216,9 +1200,11 @@
 		atomic_set(&lc->in_sync, 0);
 	}
 	spin_unlock(&log_list_lock);
-	
+
+	/*
 	if (likely(!shutting_down))
 		suspend_server();
+	*/
 
 	return 0;
 }
@@ -1267,7 +1253,9 @@
 		BUG();
 		break;
 	}
+	/*
 	resume_server();
+	*/
 	return 0;
 }
 
--- cluster/cmirror-kernel/src/Attic/dm-cmirror-server.c	2007/03/14 04:28:32	1.1.2.26
+++ cluster/cmirror-kernel/src/Attic/dm-cmirror-server.c	2007/03/22 22:34:44	1.1.2.26.2.1
@@ -911,17 +911,7 @@
 			DMDEBUG("Getting request while server (%u) is suspended:", my_id);
 			DMDEBUG(" - Requester :: %u", nodeid);
 			DMDEBUG(" - log uuid  :: %s", lc->uuid + (strlen(lc->uuid) - 8));
-			DMDEBUG(" - req type  :: %s",
-				(lr.lr_type == LRT_IS_CLEAN)? "LRT_IS_C	LEAN":
-				(lr.lr_type == LRT_IN_SYNC)? "LRT_IN_SYNC":
-				(lr.lr_type == LRT_MARK_REGION)? "LRT_MARK_REGION":
-				(lr.lr_type == LRT_GET_RESYNC_WORK)? "LRT_GET_RESYNC_WORK":
-				(lr.lr_type == LRT_GET_SYNC_COUNT)? "LRT_GET_SYNC_COUNT":
-				(lr.lr_type == LRT_CLEAR_REGION)? "LRT_CLEAR_REGION":
-				(lr.lr_type == LRT_COMPLETE_RESYNC_WORK)? "LRT_COMPLETE_RESYNC_WORK":
-				(lr.lr_type == LRT_MASTER_LEAVING)? "LRT_MASTER_LEAVING":
-				(lr.lr_type == LRT_ELECTION)? "LRT_ELECTION":
-				(lr.lr_type == LRT_SELECTION)? "LRT_SELECTION": "UNKNOWN");
+			DMDEBUG(" - req type  :: %s", RQ_STRING(lr.lr_type));
 			*/
 			if (my_id != nodeid) {
 				lr.u.lr_int_rtn = -ENXIO;
@@ -981,17 +971,7 @@
 		if(error){
 /*
 			DMWARN("Error (%d) while processing request (%s)",
-			       error,
-			       (lr.lr_type == LRT_IS_CLEAN)? "LRT_IS_CLEAN":
-			       (lr.lr_type == LRT_IN_SYNC)? "LRT_IN_SYNC":
-			       (lr.lr_type == LRT_MARK_REGION)? "LRT_MARK_REGION":
-			       (lr.lr_type == LRT_GET_RESYNC_WORK)? "LRT_GET_RESYNC_WORK":
-			       (lr.lr_type == LRT_GET_SYNC_COUNT)? "LRT_GET_SYNC_COUNT":
-			       (lr.lr_type == LRT_CLEAR_REGION)? "LRT_CLEAR_REGION":
-			       (lr.lr_type == LRT_COMPLETE_RESYNC_WORK)? "LRT_COMPLETE_RESYNC_WORK":
-			       (lr.lr_type == LRT_MASTER_LEAVING)? "LRT_MASTER_LEAVING":
-			       (lr.lr_type == LRT_ELECTION)? "LRT_ELECTION":
-			       (lr.lr_type == LRT_SELECTION)? "LRT_SELECTION": "UNKNOWN");
+			       error, RQ_STRING(lr.lr_type));
 */
 			lr.u.lr_int_rtn = error;
 		}
@@ -1011,17 +991,7 @@
 		set_fs(fs);
 		if(error < 0){
 			DMWARN("unable to sendmsg to client (type = %s, error = %d)",
-			       (lr.lr_type == LRT_IS_CLEAN)? "LRT_IS_CLEAN":
-			       (lr.lr_type == LRT_IN_SYNC)? "LRT_IN_SYNC":
-			       (lr.lr_type == LRT_MARK_REGION)? "LRT_MARK_REGION":
-			       (lr.lr_type == LRT_GET_RESYNC_WORK)? "LRT_GET_RESYNC_WORK":
-			       (lr.lr_type == LRT_GET_SYNC_COUNT)? "LRT_GET_SYNC_COUNT":
-			       (lr.lr_type == LRT_CLEAR_REGION)? "LRT_CLEAR_REGION":
-			       (lr.lr_type == LRT_COMPLETE_RESYNC_WORK)? "LRT_COMPLETE_RESYNC_WORK":
-			       (lr.lr_type == LRT_MASTER_LEAVING)? "LRT_MASTER_LEAVING":
-			       (lr.lr_type == LRT_ELECTION)? "LRT_ELECTION":
-			       (lr.lr_type == LRT_SELECTION)? "LRT_SELECTION": "UNKNOWN",
-			       error);
+			       RQ_STRING(lr.lr_type), error);
 			return error;
 		}
 	} else if(error == -EAGAIN || error == -ETIMEDOUT){
--- cluster/cmirror-kernel/src/Attic/dm-cmirror-xfr.h	2007/02/14 17:44:07	1.1.2.2
+++ cluster/cmirror-kernel/src/Attic/dm-cmirror-xfr.h	2007/03/22 22:34:44	1.1.2.2.2.1
@@ -25,8 +25,21 @@
 
 #define CLUSTER_LOG_PORT 51005
 
+#define RQ_STRING(x) \
+	((x) == LRT_IS_CLEAN) ? "LRT_IS_CLEAN": \
+	((x) == LRT_IN_SYNC) ? "LRT_IN_SYNC": \
+	((x) == LRT_MARK_REGION) ? "LRT_MARK_REGION": \
+	((x) == LRT_GET_RESYNC_WORK) ? "LRT_GET_RESYNC_WORK": \
+	((x) == LRT_GET_SYNC_COUNT) ? "LRT_GET_SYNC_COUNT": \
+	((x) == LRT_CLEAR_REGION) ? "LRT_CLEAR_REGION": \
+	((x) == LRT_COMPLETE_RESYNC_WORK) ? "LRT_COMPLETE_RESYNC_WORK": \
+	((x) == LRT_MASTER_LEAVING) ? "LRT_MASTER_LEAVING": \
+	((x) == LRT_ELECTION) ? "LRT_ELECTION": \
+	((x) == LRT_SELECTION) ? "LRT_SELECTION": "UNKNOWN"
+
 struct log_request {
 	int lr_type;
+	int lr_seq;
 	union {
 		struct {
 			uint32_t lr_starter;



^ permalink raw reply	[flat|nested] 40+ messages in thread

* [Cluster-devel] cluster/cmirror-kernel/src dm-cmirror-client.c ...
@ 2007-03-22 22:22 jbrassow
  0 siblings, 0 replies; 40+ messages in thread
From: jbrassow @ 2007-03-22 22:22 UTC (permalink / raw)
  To: cluster-devel.redhat.com

CVSROOT:	/cvs/cluster
Module name:	cluster
Branch: 	RHEL4
Changes by:	jbrassow at sourceware.org	2007-03-22 22:21:59

Modified files:
	cmirror-kernel/src: dm-cmirror-client.c dm-cmirror-server.c 
	                    dm-cmirror-xfr.h 

Log message:
	Bug 233034: cmirror server failure/migration during GFS I/O causes metad...
	(Likely fixes other bugs as well.)
	
	When a cmirror client timed out waiting for a response from the server,
	it would send the request again.  Sometimes, the server simply took to
	long to get back to the client... it would then develop an off-by-one
	error - responding to the first _and_ the second request.  The client
	could then be asking to mark a region, and recieve a response for
	a previous request.  This has the potential to cause many problems.
	
	Sequence numbers have been added to fix the problem.

Patches:
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cmirror-kernel/src/dm-cmirror-client.c.diff?cvsroot=cluster&only_with_tag=RHEL4&r1=1.1.2.41&r2=1.1.2.42
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cmirror-kernel/src/dm-cmirror-server.c.diff?cvsroot=cluster&only_with_tag=RHEL4&r1=1.1.2.26&r2=1.1.2.27
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cmirror-kernel/src/dm-cmirror-xfr.h.diff?cvsroot=cluster&only_with_tag=RHEL4&r1=1.1.2.2&r2=1.1.2.3

--- cluster/cmirror-kernel/src/Attic/dm-cmirror-client.c	2007/03/14 04:28:32	1.1.2.41
+++ cluster/cmirror-kernel/src/Attic/dm-cmirror-client.c	2007/03/22 22:21:59	1.1.2.42
@@ -53,12 +53,6 @@
 /* These vars are just for stats, and will be removed */
 static uint32_t request_count=0;
 static uint32_t request_retry_count=0;
-static int clear_req=0;
-static int mark_req=0;
-static int insync_req=0;
-static int clear_req2ser=0;
-static int mark_req2ser=0;
-static int insync_req2ser=0;
 
 static void *region_state_alloc(int gfp_mask, void *pool_data){
 	return kmalloc(sizeof(struct region_state), gfp_mask);
@@ -316,6 +310,7 @@
 
 static int _consult_server(struct log_c *lc, region_t region,
 			  int type, region_t *result, int *retry){
+	static int seq = 0;
 	int len;
 	int error=0;
 	struct sockaddr_in saddr_in;
@@ -336,6 +331,7 @@
 	memset(lr, 0, sizeof(struct log_request));
 	
 	lr->lr_type = type;
+	lr->lr_seq = seq;
 	if(type == LRT_MASTER_LEAVING){
 		lr->u.lr_starter = my_id;
 	} else {
@@ -369,18 +365,6 @@
 	iov.iov_len = sizeof(struct log_request);
 	iov.iov_base = lr;
 
-	if(lr->lr_type == LRT_MARK_REGION){
-		mark_req2ser++;
-	}
-
-	if(lr->lr_type == LRT_CLEAR_REGION){
-		clear_req2ser++;
-	}
-	
-	if(lr->lr_type == LRT_IN_SYNC){
-		insync_req2ser++;
-	}
-	
 	fs = get_fs();
 	set_fs(get_ds());
   
@@ -394,6 +378,7 @@
 		goto fail;
 	}
 
+rerecv:
 	iov.iov_len = sizeof(struct log_request);
 	iov.iov_base = lr;
 
@@ -414,9 +399,44 @@
 		DMWARN("Error while listening for server response: %d", len);
 		error = len;
 		*retry = 1;
+		seq++;
 		goto fail;
 	}
     
+	if (seq != lr->lr_seq) {
+		DMERR("Message sequence number mismatch: %d/%d",
+		      seq, lr->lr_seq);
+		if (seq > lr->lr_seq) {
+			DMERR(" Skipping.  Listening again for response to %s",
+			      RQ_STRING(type));
+			memset(lr, 0, sizeof(struct log_request));
+			goto rerecv;
+		}
+		DMERR(" Must try to resend request, %s", RQ_STRING(type));
+		error = -EBADE;
+		*retry = 1;
+		seq++;
+		goto fail;
+	}
+	seq++;
+
+	if (type != lr->lr_type) {
+		DMERR("Got incorrect message type back: %s/%s",
+		      RQ_STRING(type), RQ_STRING(lr->lr_type));
+		error = -EBADE;
+		*retry = 1;
+		goto fail;
+	}
+
+	if (memcmp(lc->uuid, lr->lr_uuid, MAX_NAME_LEN)) {
+		DMERR("Got reply from server for wrong log:");
+		DMERR(" Expected UUID: %s", lc->uuid);
+		DMERR(" Recieved UUID: %s", lr->lr_uuid);
+		error = -EBADE;
+		*retry = 1;
+		goto fail;
+	}
+
 	if(lr->u.lr_int_rtn == -EAGAIN){
 		DMWARN("Server (%u), request type %d, -EAGAIN."
 		       "  Mirror suspended?",
@@ -453,17 +473,7 @@
 			DMDEBUG(" - log uuid:: %s (%s)",
 			       lc->uuid + (strlen(lc->uuid) - 8),
 			       atomic_read(&lc->suspended) ? "suspended" : "active");
-			DMDEBUG(" - request :: %s",
-			       (type == LRT_IS_CLEAN)? "LRT_IS_CLEAN":
-			       (type == LRT_IN_SYNC)? "LRT_IN_SYNC":
-			       (type == LRT_MARK_REGION)? "LRT_MARK_REGION":
-			       (type == LRT_GET_RESYNC_WORK)? "LRT_GET_RESYNC_WORK":
-			       (type == LRT_GET_SYNC_COUNT)? "LRT_GET_SYNC_COUNT":
-			       (type == LRT_CLEAR_REGION)? "LRT_CLEAR_REGION":
-			       (type == LRT_COMPLETE_RESYNC_WORK)? "LRT_COMPLETE_RESYNC_WORK":
-			       (type == LRT_MASTER_LEAVING)? "LRT_MASTER_LEAVING":
-			       (type == LRT_ELECTION)? "LRT_ELECTION":
-			       (type == LRT_SELECTION)? "LRT_SELECTION": "UNKNOWN");
+			DMDEBUG(" - request :: %s", RQ_STRING(type));
 			DMDEBUG(" - error   :: %d", error);
 			DMINFO("Too many retries, attempting to re-establish server connection.");
 			lc->server_id = 0xDEAD;
@@ -519,7 +529,7 @@
 			}
 			clear_region_count -= i;
 			DMINFO(" - %d clear region requests wiped", i);
-
+			i=0;
 			DMINFO(" - Resending all mark region requests");
 			list_for_each_entry(rs, &marked_region_list, rs_list){
 				/* Resend only those associated with referenced log */
@@ -527,7 +537,7 @@
 					continue;
 				do {
 					retry = 0;
-					DMINFO("   - " SECTOR_FORMAT, rs->rs_region);
+					i++;
 					rtn = _consult_server(rs->rs_lc, rs->rs_region,
 							      LRT_MARK_REGION, NULL, &retry);
 					if (lc->server_id == 0xDEAD) {
@@ -536,6 +546,7 @@
 					}
 				} while(retry);
 			}
+			DMINFO(" - %d mark region requests resent", i);
 			DMINFO("Clean-up complete");
 			if(type == LRT_MARK_REGION){
 				/* we just handled all marks */
@@ -544,17 +555,7 @@
 				goto out;
 			} else {
 				DMINFO("Continuing request type, %d (%s)", type,
-				      (type == LRT_IS_CLEAN)? "LRT_IS_CLEAN":
-				      (type == LRT_IN_SYNC)? "LRT_IN_SYNC":
-				      (type == LRT_MARK_REGION)? "LRT_MARK_REGION":
-				      (type == LRT_GET_RESYNC_WORK)? "LRT_GET_RESYNC_WORK":
-				      (type == LRT_GET_SYNC_COUNT)? "LRT_GET_SYNC_COUNT":
-				      (type == LRT_CLEAR_REGION)? "LRT_CLEAR_REGION":
-				      (type == LRT_COMPLETE_RESYNC_WORK)? "LRT_COMPLETE_RESYNC_WORK":
-				      (type == LRT_MASTER_LEAVING)? "LRT_MASTER_LEAVING":
-				      (type == LRT_ELECTION)? "LRT_ELECTION":
-				      (type == LRT_SELECTION)? "LRT_SELECTION": "UNKNOWN"
-					);
+				       RQ_STRING(type));
 			}
 			new_server = 0;
 		}
@@ -886,7 +887,6 @@
 	struct log_c *lc = (struct log_c *) log->context;
   
 	/* check known_regions, return if found */
-	insync_req++;
 /* take out optimization
 	if(atomic_read(&lc->in_sync) == 1){
 		return 1;
@@ -915,8 +915,6 @@
 	struct region_state *rs, *tmp_rs, *rs_new;
 	struct log_c *lc = (struct log_c *) log->context;
 
-	mark_req++;
-
 	rs_new = mempool_alloc(region_state_pool, GFP_KERNEL);
 
 	memset(rs_new, 0, sizeof(struct region_state));
@@ -924,8 +922,10 @@
 	spin_lock(&region_state_lock);
 	list_for_each_entry_safe(rs, tmp_rs, &clear_region_list, rs_list){
 		if(lc == rs->rs_lc && region == rs->rs_region){
+			/*
 			DMDEBUG("Mark pre-empting clear (%Lu/%s)",
 				region, lc->uuid + (strlen(lc->uuid) - 8));
+			*/
 			list_del_init(&rs->rs_list);
 			list_add(&rs->rs_list, &marked_region_list);
 			clear_region_count--;
@@ -1007,7 +1007,6 @@
 {
 	struct log_c *lc = (struct log_c *) log->context;
 	struct region_state *rs, *tmp_rs, *rs_new;
-	clear_req++;
 
 	rs_new = mempool_alloc(region_state_pool, GFP_ATOMIC);
 
@@ -1140,21 +1139,6 @@
 		DMINFO("  Regions marked   : %d", j);
 		DMINFO("  Regions clearing : %d", i);
 
-		DMINFO("  Mark requests    : %d", mark_req);
-		if(mark_req)
-			DMINFO("  Mark req to serv : %d (%d%%)", mark_req2ser,
-			       (mark_req2ser*100)/mark_req);
-
-		DMINFO("  Clear requests   : %d", clear_req);
-		if(clear_req)
-			DMINFO("  Clear req to serv: %d (%d%%)", clear_req2ser,
-			       (clear_req2ser*100)/clear_req);
-
-		DMINFO("  Sync  requests   : %d", insync_req);
-		if(insync_req)
-			DMINFO("  Sync req to serv : %d (%d%%)", insync_req2ser,
-			       (insync_req2ser*100)/insync_req);
-
 		if(lc->server_id == my_id){
 			print_server_status(lc);
 		}
@@ -1216,9 +1200,11 @@
 		atomic_set(&lc->in_sync, 0);
 	}
 	spin_unlock(&log_list_lock);
-	
+
+	/*
 	if (likely(!shutting_down))
 		suspend_server();
+	*/
 
 	return 0;
 }
@@ -1267,7 +1253,9 @@
 		BUG();
 		break;
 	}
+	/*
 	resume_server();
+	*/
 	return 0;
 }
 
--- cluster/cmirror-kernel/src/Attic/dm-cmirror-server.c	2007/03/14 04:28:32	1.1.2.26
+++ cluster/cmirror-kernel/src/Attic/dm-cmirror-server.c	2007/03/22 22:21:59	1.1.2.27
@@ -911,17 +911,7 @@
 			DMDEBUG("Getting request while server (%u) is suspended:", my_id);
 			DMDEBUG(" - Requester :: %u", nodeid);
 			DMDEBUG(" - log uuid  :: %s", lc->uuid + (strlen(lc->uuid) - 8));
-			DMDEBUG(" - req type  :: %s",
-				(lr.lr_type == LRT_IS_CLEAN)? "LRT_IS_C	LEAN":
-				(lr.lr_type == LRT_IN_SYNC)? "LRT_IN_SYNC":
-				(lr.lr_type == LRT_MARK_REGION)? "LRT_MARK_REGION":
-				(lr.lr_type == LRT_GET_RESYNC_WORK)? "LRT_GET_RESYNC_WORK":
-				(lr.lr_type == LRT_GET_SYNC_COUNT)? "LRT_GET_SYNC_COUNT":
-				(lr.lr_type == LRT_CLEAR_REGION)? "LRT_CLEAR_REGION":
-				(lr.lr_type == LRT_COMPLETE_RESYNC_WORK)? "LRT_COMPLETE_RESYNC_WORK":
-				(lr.lr_type == LRT_MASTER_LEAVING)? "LRT_MASTER_LEAVING":
-				(lr.lr_type == LRT_ELECTION)? "LRT_ELECTION":
-				(lr.lr_type == LRT_SELECTION)? "LRT_SELECTION": "UNKNOWN");
+			DMDEBUG(" - req type  :: %s", RQ_STRING(lr.lr_type));
 			*/
 			if (my_id != nodeid) {
 				lr.u.lr_int_rtn = -ENXIO;
@@ -981,17 +971,7 @@
 		if(error){
 /*
 			DMWARN("Error (%d) while processing request (%s)",
-			       error,
-			       (lr.lr_type == LRT_IS_CLEAN)? "LRT_IS_CLEAN":
-			       (lr.lr_type == LRT_IN_SYNC)? "LRT_IN_SYNC":
-			       (lr.lr_type == LRT_MARK_REGION)? "LRT_MARK_REGION":
-			       (lr.lr_type == LRT_GET_RESYNC_WORK)? "LRT_GET_RESYNC_WORK":
-			       (lr.lr_type == LRT_GET_SYNC_COUNT)? "LRT_GET_SYNC_COUNT":
-			       (lr.lr_type == LRT_CLEAR_REGION)? "LRT_CLEAR_REGION":
-			       (lr.lr_type == LRT_COMPLETE_RESYNC_WORK)? "LRT_COMPLETE_RESYNC_WORK":
-			       (lr.lr_type == LRT_MASTER_LEAVING)? "LRT_MASTER_LEAVING":
-			       (lr.lr_type == LRT_ELECTION)? "LRT_ELECTION":
-			       (lr.lr_type == LRT_SELECTION)? "LRT_SELECTION": "UNKNOWN");
+			       error, RQ_STRING(lr.lr_type));
 */
 			lr.u.lr_int_rtn = error;
 		}
@@ -1011,17 +991,7 @@
 		set_fs(fs);
 		if(error < 0){
 			DMWARN("unable to sendmsg to client (type = %s, error = %d)",
-			       (lr.lr_type == LRT_IS_CLEAN)? "LRT_IS_CLEAN":
-			       (lr.lr_type == LRT_IN_SYNC)? "LRT_IN_SYNC":
-			       (lr.lr_type == LRT_MARK_REGION)? "LRT_MARK_REGION":
-			       (lr.lr_type == LRT_GET_RESYNC_WORK)? "LRT_GET_RESYNC_WORK":
-			       (lr.lr_type == LRT_GET_SYNC_COUNT)? "LRT_GET_SYNC_COUNT":
-			       (lr.lr_type == LRT_CLEAR_REGION)? "LRT_CLEAR_REGION":
-			       (lr.lr_type == LRT_COMPLETE_RESYNC_WORK)? "LRT_COMPLETE_RESYNC_WORK":
-			       (lr.lr_type == LRT_MASTER_LEAVING)? "LRT_MASTER_LEAVING":
-			       (lr.lr_type == LRT_ELECTION)? "LRT_ELECTION":
-			       (lr.lr_type == LRT_SELECTION)? "LRT_SELECTION": "UNKNOWN",
-			       error);
+			       RQ_STRING(lr.lr_type), error);
 			return error;
 		}
 	} else if(error == -EAGAIN || error == -ETIMEDOUT){
--- cluster/cmirror-kernel/src/Attic/dm-cmirror-xfr.h	2007/02/14 17:44:07	1.1.2.2
+++ cluster/cmirror-kernel/src/Attic/dm-cmirror-xfr.h	2007/03/22 22:21:59	1.1.2.3
@@ -25,8 +25,21 @@
 
 #define CLUSTER_LOG_PORT 51005
 
+#define RQ_STRING(x) \
+	((x) == LRT_IS_CLEAN) ? "LRT_IS_CLEAN": \
+	((x) == LRT_IN_SYNC) ? "LRT_IN_SYNC": \
+	((x) == LRT_MARK_REGION) ? "LRT_MARK_REGION": \
+	((x) == LRT_GET_RESYNC_WORK) ? "LRT_GET_RESYNC_WORK": \
+	((x) == LRT_GET_SYNC_COUNT) ? "LRT_GET_SYNC_COUNT": \
+	((x) == LRT_CLEAR_REGION) ? "LRT_CLEAR_REGION": \
+	((x) == LRT_COMPLETE_RESYNC_WORK) ? "LRT_COMPLETE_RESYNC_WORK": \
+	((x) == LRT_MASTER_LEAVING) ? "LRT_MASTER_LEAVING": \
+	((x) == LRT_ELECTION) ? "LRT_ELECTION": \
+	((x) == LRT_SELECTION) ? "LRT_SELECTION": "UNKNOWN"
+
 struct log_request {
 	int lr_type;
+	int lr_seq;
 	union {
 		struct {
 			uint32_t lr_starter;



^ permalink raw reply	[flat|nested] 40+ messages in thread

* [Cluster-devel] cluster/cmirror-kernel/src dm-cmirror-client.c ...
@ 2007-03-14  4:28 jbrassow
  0 siblings, 0 replies; 40+ messages in thread
From: jbrassow @ 2007-03-14  4:28 UTC (permalink / raw)
  To: cluster-devel.redhat.com

CVSROOT:	/cvs/cluster
Module name:	cluster
Branch: 	RHEL4
Changes by:	jbrassow at sourceware.org	2007-03-14 04:28:32

Modified files:
	cmirror-kernel/src: dm-cmirror-client.c dm-cmirror-server.c 

Log message:
	Bug 231230: leg failure on cmirrors causes devices to be stuck in SUSPE...
	
	The problem here appears to be timeouts related to clvmd.
	During failures under heavy load, clvmd commands (suspend/resume/
	activate/deactivate) can take a long time.  Clvmd assumes too quickly
	that they have failed.  This results in the fault handling being left
	half done.  Further calls to vgreduce (by hand or by dmeventd) will
	not help because the _on-disk_ version of the meta-data is consistent -
	that is, the faulty device has been removed.
	
	The most significant change in this patch is the removal of the
	'is_remote_recovering' function.  This function was designed to check
	if a remote node was recovering a region so that writes to the region
	could be delayed.  However, even with this function, it was possible
	for a remote node to begin recovery on a region _after_ the function
	was called, but before the write (mark request) took place.  Because
	of this, checking is done during the mark request stage - rendering
	the call to 'is_remote_recovering' meaningless.  Given the useless
	nature of this function, it has been pulled.  The benefits of its
	removal are increased performance and much faster (more than an
	order of magnitude) response during the mirror suspend process.
	
	The faster suspend process leads to less clvmd timeouts and
	reduced probability that bug 231230 will be triggered.
	
	However, when a mirror device is reconfigured, the mirror sub-devices
	are removed.  This is done by activating them cluster-wide before
	their removal.  With high enough load during recovery, these operations
	can still take a long time - even though they are linear devices.
	This too has the potential for causing clvmd to timeout and trigger
	bug 231230.  There is no cluster logging fix for this issue.  The
	delay on the linear devices must be determined.  A temporary
	work-around would be to increase the timeout of clvmd (e.g. clvmd -t #).

Patches:
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cmirror-kernel/src/dm-cmirror-client.c.diff?cvsroot=cluster&only_with_tag=RHEL4&r1=1.1.2.40&r2=1.1.2.41
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cmirror-kernel/src/dm-cmirror-server.c.diff?cvsroot=cluster&only_with_tag=RHEL4&r1=1.1.2.25&r2=1.1.2.26

--- cluster/cmirror-kernel/src/Attic/dm-cmirror-client.c	2007/03/02 22:31:14	1.1.2.40
+++ cluster/cmirror-kernel/src/Attic/dm-cmirror-client.c	2007/03/14 04:28:32	1.1.2.41
@@ -647,12 +647,12 @@
 		}
 	}
 
+	list_add(&lc->log_list, &log_list_head);
+	spin_unlock(&log_list_lock);
 	DMDEBUG("Creating %s (%d)",
 	       lc->uuid + (strlen(lc->uuid) - 8),
 	       lc->uuid_ref);
 
-	list_add(&lc->log_list, &log_list_head);
-	spin_unlock(&log_list_lock);
 	INIT_LIST_HEAD(&lc->region_users);
 
 	lc->server_id = 0xDEAD;
@@ -767,6 +767,11 @@
 	list_del_init(&lc->log_list);
 	spin_unlock(&log_list_lock);
 
+	if ((lc->server_id == my_id) && !atomic_read(&lc->suspended))
+		consult_server(lc, 0, LRT_MASTER_LEAVING, NULL);
+
+	sock_release(lc->client_sock);
+
 	spin_lock(&region_state_lock);
 
 	list_for_each_entry_safe(rs, tmp_rs, &clear_region_list, rs_list) {
@@ -781,11 +786,6 @@
 
 	spin_unlock(&region_state_lock);
 
-	if ((lc->server_id == my_id) && !atomic_read(&lc->suspended))
-		consult_server(lc, 0, LRT_MASTER_LEAVING, NULL);
-
-	sock_release(lc->client_sock);
-
 	if (lc->log_dev)
 		disk_dtr(log);
 	else
@@ -844,7 +844,6 @@
 	lc->sync_search = 0;
 	resume_server_requests();
 	atomic_set(&lc->suspended, 0);
-	consult_server(lc, 0, LRT_IN_SYNC, NULL);
 
 	return 0;
 }
@@ -1354,7 +1353,7 @@
 	.resume = cluster_resume,
 	.get_region_size = cluster_get_region_size,
 	.is_clean = cluster_is_clean,
-	.is_remote_recovering = cluster_is_remote_recovering,
+/*	.is_remote_recovering = cluster_is_remote_recovering,*/
 	.in_sync = cluster_in_sync,
 	.flush = cluster_flush,
 	.mark_region = cluster_mark_region,
@@ -1376,7 +1375,7 @@
 	.resume = cluster_resume,
 	.get_region_size = cluster_get_region_size,
 	.is_clean = cluster_is_clean,
-	.is_remote_recovering = cluster_is_remote_recovering,
+/*	.is_remote_recovering = cluster_is_remote_recovering,*/
 	.in_sync = cluster_in_sync,
 	.flush = cluster_flush,
 	.mark_region = cluster_mark_region,
--- cluster/cmirror-kernel/src/Attic/dm-cmirror-server.c	2007/02/26 17:38:06	1.1.2.25
+++ cluster/cmirror-kernel/src/Attic/dm-cmirror-server.c	2007/03/14 04:28:32	1.1.2.26
@@ -619,7 +619,7 @@
 			lc->sync_count++;
 		}
 	} else if (log_test_bit(lc->sync_bits, lr->u.lr_region)) {
-		lc->sync_count--;
+		/* gone again: lc->sync_count--;*/
 		log_clear_bit(lc, lc->sync_bits, lr->u.lr_region);
 	}
 



^ permalink raw reply	[flat|nested] 40+ messages in thread

* [Cluster-devel] cluster/cmirror-kernel/src dm-cmirror-client.c ...
@ 2007-02-26 17:38 jbrassow
  0 siblings, 0 replies; 40+ messages in thread
From: jbrassow @ 2007-02-26 17:38 UTC (permalink / raw)
  To: cluster-devel.redhat.com

CVSROOT:	/cvs/cluster
Module name:	cluster
Branch: 	RHEL4
Changes by:	jbrassow at sourceware.org	2007-02-26 17:38:06

Modified files:
	cmirror-kernel/src: dm-cmirror-client.c dm-cmirror-server.c 

Log message:
	add locking around the log list.  There was a small window of opportunity
	for the log server to look up a log in the list while another entry was
	being deleted (bad for the server).
	
	Bug 229715 Processed: cmirror panic in dm_cmirror:cluster_log_serverd

Patches:
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cmirror-kernel/src/dm-cmirror-client.c.diff?cvsroot=cluster&only_with_tag=RHEL4&r1=1.1.2.38&r2=1.1.2.39
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cmirror-kernel/src/dm-cmirror-server.c.diff?cvsroot=cluster&only_with_tag=RHEL4&r1=1.1.2.24&r2=1.1.2.25

--- cluster/cmirror-kernel/src/Attic/dm-cmirror-client.c	2007/02/20 19:35:10	1.1.2.38
+++ cluster/cmirror-kernel/src/Attic/dm-cmirror-client.c	2007/02/26 17:38:06	1.1.2.39
@@ -28,6 +28,7 @@
 #include "dm-cmirror-server.h"
 #include "dm-cmirror-cman.h"
 
+spinlock_t log_list_lock;
 LIST_HEAD(log_list_head);
 
 struct region_state {
@@ -635,6 +636,7 @@
 	atomic_set(&lc->in_sync, -1);
 	lc->uuid_ref = 1;
 
+	spin_lock(&log_list_lock);
 	list_for_each_entry(tmp_lc, &log_list_head, log_list){
 		if(!strncmp(tmp_lc->uuid, lc->uuid, MAX_NAME_LEN)){
 			lc->uuid_ref = (lc->uuid_ref > tmp_lc->uuid_ref) ?
@@ -647,6 +649,7 @@
 	       lc->uuid_ref);
 
 	list_add(&lc->log_list, &log_list_head);
+	spin_unlock(&log_list_lock);
 	INIT_LIST_HEAD(&lc->region_users);
 
 	lc->server_id = 0xDEAD;
@@ -757,7 +760,9 @@
 	if (!list_empty(&clear_region_list))
 		DMINFO("Leaving while clear region requests remain.");
 
+	spin_lock(&log_list_lock);
 	list_del_init(&lc->log_list);
+	spin_unlock(&log_list_lock);
 
 	if ((lc->server_id == my_id) && !atomic_read(&lc->suspended))
 		consult_server(lc, 0, LRT_MASTER_LEAVING, NULL);
@@ -1204,9 +1209,11 @@
 
 	atomic_set(&suspend_client, 1);
 
+	spin_lock(&log_list_lock);
 	list_for_each_entry(lc, &log_list_head, log_list) {
 		atomic_set(&lc->in_sync, 0);
 	}
+	spin_unlock(&log_list_lock);
 	
 	if (likely(!shutting_down))
 		suspend_server();
@@ -1238,6 +1245,7 @@
 	switch(type){
 	case SERVICE_NODE_LEAVE:
 	case SERVICE_NODE_FAILED:
+		spin_lock(&log_list_lock);
 		list_for_each_entry(lc, &log_list_head, log_list){
 			for(i=0, server = 0xDEAD; i < count; i++){
 				if(lc->server_id == nodeids[i]){
@@ -1247,6 +1255,8 @@
 			/* ATTENTION -- need locking around this ? */
 			lc->server_id = server;
 		}
+		spin_unlock(&log_list_lock);
+
 		break;
 	case SERVICE_NODE_JOIN:
 		break;
@@ -1387,6 +1397,7 @@
 	INIT_LIST_HEAD(&marked_region_list);
 
 	spin_lock_init(&region_state_lock);
+	spin_lock_init(&log_list_lock);
 	region_state_pool = mempool_create(20, region_state_alloc,
 					   region_state_free, NULL);
 	if(!region_state_pool){
--- cluster/cmirror-kernel/src/Attic/dm-cmirror-server.c	2007/02/21 17:14:44	1.1.2.24
+++ cluster/cmirror-kernel/src/Attic/dm-cmirror-server.c	2007/02/26 17:38:06	1.1.2.25
@@ -47,6 +47,7 @@
 static atomic_t _do_requests;
 
 static int debug_disk_write = 0;
+extern spinlock_t log_list_lock;
 extern struct list_head log_list_head;
 
 static void *region_user_alloc(int gfp_mask, void *pool_data){
@@ -649,6 +650,7 @@
 static struct log_c *get_log_context(char *uuid, int uuid_ref){
 	struct log_c *lc, *r = NULL;
 
+	spin_lock(&log_list_lock);
 	list_for_each_entry(lc, &log_list_head, log_list){
 		if (!strncmp(lc->uuid, uuid, MAX_NAME_LEN) &&
 		    (uuid_ref == lc->uuid_ref)) {
@@ -658,6 +660,7 @@
 				r = lc;
 		}
 	}
+	spin_unlock(&log_list_lock);
 
 	return r;
 }
@@ -1079,6 +1082,7 @@
 			if (atomic_read(&restart_event_type) == SERVICE_NODE_FAILED)
 				DMINFO("A cluster mirror log member has failed.");
 			
+			spin_lock(&log_list_lock);
 			list_for_each_entry(lc, &log_list_head, log_list){
 				if(lc->server_id == my_id){
 					if (atomic_read(&lc->suspended)) {
@@ -1088,6 +1092,8 @@
 					}
 				}
 			}
+			spin_unlock(&log_list_lock);
+
 			break;
 		default:
 			/* Someone has joined, or there is no event */



^ permalink raw reply	[flat|nested] 40+ messages in thread

* [Cluster-devel] cluster/cmirror-kernel/src dm-cmirror-client.c ...
@ 2007-02-19 16:29 jbrassow
  0 siblings, 0 replies; 40+ messages in thread
From: jbrassow @ 2007-02-19 16:29 UTC (permalink / raw)
  To: cluster-devel.redhat.com

CVSROOT:	/cvs/cluster
Module name:	cluster
Branch: 	RHEL4
Changes by:	jbrassow at sourceware.org	2007-02-19 16:29:43

Modified files:
	cmirror-kernel/src: dm-cmirror-client.c dm-cmirror-server.c 

Log message:
	Bug 228070: DLM assertion when running GFS I/O during cmirror leg failure
	
	When a log server drops out of the cluster, it ignores any requests -
	forcing the clients to retry.  Unfortunately, the clients never ran
	another election - causing operations to stall.  The server now replies
	that it cannot handle the requests, which causes proper initiation of
	elections.

Patches:
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cmirror-kernel/src/dm-cmirror-client.c.diff?cvsroot=cluster&only_with_tag=RHEL4&r1=1.1.2.36&r2=1.1.2.37
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cmirror-kernel/src/dm-cmirror-server.c.diff?cvsroot=cluster&only_with_tag=RHEL4&r1=1.1.2.21&r2=1.1.2.22

--- cluster/cmirror-kernel/src/Attic/dm-cmirror-client.c	2007/02/14 17:44:07	1.1.2.36
+++ cluster/cmirror-kernel/src/Attic/dm-cmirror-client.c	2007/02/19 16:29:42	1.1.2.37
@@ -1058,6 +1058,10 @@
 
 	rtn = consult_server(lc, 0, LRT_GET_RESYNC_WORK, region);
 
+	if (*region > lc->region_count) {
+		DMWARN("Error while getting resync work: bad region");
+		rtn = 0;
+	}
 	return rtn;
 }
 
--- cluster/cmirror-kernel/src/Attic/dm-cmirror-server.c	2007/02/14 17:44:07	1.1.2.21
+++ cluster/cmirror-kernel/src/Attic/dm-cmirror-server.c	2007/02/19 16:29:42	1.1.2.22
@@ -938,7 +938,7 @@
 		case LRT_MARK_REGION:
 			if(!(nodeid = 
 			     ipaddr_to_nodeid((struct sockaddr *)msg.msg_name))){
-				return -EINVAL;
+				error = -ENXIO;
 				break;
 			}
 			error = server_mark_region(lc, &lr, nodeid);
@@ -947,7 +947,7 @@
 		case LRT_CLEAR_REGION:
 			if(!(nodeid = 
 			     ipaddr_to_nodeid((struct sockaddr *)msg.msg_name))){
-				return -EINVAL;
+				error = -ENXIO;
 				break;
 			}
 			error = server_clear_region(lc, &lr, nodeid);
@@ -955,7 +955,7 @@
 		case LRT_GET_RESYNC_WORK:
 			if(!(nodeid = 
 			     ipaddr_to_nodeid((struct sockaddr *)msg.msg_name))){
-				return -EINVAL;
+				error = -ENXIO;
 				break;
 			}
 			error = server_get_resync_work(lc, &lr, nodeid);



^ permalink raw reply	[flat|nested] 40+ messages in thread

* [Cluster-devel] cluster/cmirror-kernel/src dm-cmirror-client.c ...
@ 2007-02-14 17:44 jbrassow
  0 siblings, 0 replies; 40+ messages in thread
From: jbrassow @ 2007-02-14 17:44 UTC (permalink / raw)
  To: cluster-devel.redhat.com

CVSROOT:	/cvs/cluster
Module name:	cluster
Branch: 	RHEL4
Changes by:	jbrassow at sourceware.org	2007-02-14 17:44:08

Modified files:
	cmirror-kernel/src: dm-cmirror-client.c dm-cmirror-common.h 
	                    dm-cmirror-server.c dm-cmirror-xfr.h 

Log message:
	Changes to fix the following bugs:
	Bug 228104: greater than 2 legged cluster mirrors do not down ...
	Bug 228056: lvconvert should give warning if we don't support ...
	
	When converting from 3-way to 2-way mirror, the UUID for the mirror
	stays the same.  This creates conflicting entries in the cluster
	logging code.  I've added an additional identifier to allow for
	unique identification in these cases.

Patches:
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cmirror-kernel/src/dm-cmirror-client.c.diff?cvsroot=cluster&only_with_tag=RHEL4&r1=1.1.2.35&r2=1.1.2.36
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cmirror-kernel/src/dm-cmirror-common.h.diff?cvsroot=cluster&only_with_tag=RHEL4&r1=1.1.2.10&r2=1.1.2.11
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cmirror-kernel/src/dm-cmirror-server.c.diff?cvsroot=cluster&only_with_tag=RHEL4&r1=1.1.2.20&r2=1.1.2.21
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cmirror-kernel/src/dm-cmirror-xfr.h.diff?cvsroot=cluster&only_with_tag=RHEL4&r1=1.1.2.1&r2=1.1.2.2

--- cluster/cmirror-kernel/src/Attic/dm-cmirror-client.c	2007/02/02 17:22:55	1.1.2.35
+++ cluster/cmirror-kernel/src/Attic/dm-cmirror-client.c	2007/02/14 17:44:07	1.1.2.36
@@ -257,6 +257,7 @@
 	lr.u.lr_starter = my_id;
 	lr.u.lr_coordinator = initial_server;
 	memcpy(lr.lr_uuid, lc->uuid, MAX_NAME_LEN);
+	lr.lr_uuid_ref = lc->uuid_ref;
 
 	memset(&saddr_in, 0, sizeof(struct sockaddr_cl));
 
@@ -342,6 +343,7 @@
 		lr->u.lr_int_rtn = (*result) ? 1 : 0;
 
 	memcpy(lr->lr_uuid, lc->uuid, MAX_NAME_LEN);
+	lr->lr_uuid_ref = lc->uuid_ref;
 
 	memset(&saddr_in, 0, sizeof(struct sockaddr_in));
 
@@ -630,16 +632,19 @@
 	}
 
 	atomic_set(&lc->in_sync, -1);
+	lc->uuid_ref = 1;
 
 	list_for_each_entry(tmp_lc, &log_list_head, log_list){
 		if(!strncmp(tmp_lc->uuid, lc->uuid, MAX_NAME_LEN)){
-			DMERR("Log already exists with uuid, %s",
-			      lc->uuid + (strlen(lc->uuid) - 8));
-			error = -EINVAL;
-			goto fail;
+			lc->uuid_ref = (lc->uuid_ref > tmp_lc->uuid_ref) ?
+				lc->uuid_ref : tmp_lc->uuid_ref + 1;
 		}
 	}
 
+	DMDEBUG("Creating %s (%d)",
+	       lc->uuid + (strlen(lc->uuid) - 8),
+	       lc->uuid_ref);
+
 	list_add(&lc->log_list, &log_list_head);
 	INIT_LIST_HEAD(&lc->region_users);
 
@@ -744,10 +749,15 @@
 	struct log_c *lc = (struct log_c *) log->context;
 	struct region_state *rs, *tmp_rs;
 
+	DMDEBUG("Removing %s (%d)",
+	       lc->uuid + (strlen(lc->uuid) - 8),
+	       lc->uuid_ref);
+
 	if (!list_empty(&clear_region_list))
 		DMINFO("Leaving while clear region requests remain.");
 
 	list_del_init(&lc->log_list);
+
 	if ((lc->server_id == my_id) && !atomic_read(&lc->suspended))
 		consult_server(lc, 0, LRT_MASTER_LEAVING, NULL);
 
@@ -767,6 +777,7 @@
 
 	spin_unlock(&region_state_lock);
 
+
 	if (lc->log_dev)
 		disk_dtr(log);
 	else
--- cluster/cmirror-kernel/src/Attic/dm-cmirror-common.h	2006/07/27 23:11:55	1.1.2.10
+++ cluster/cmirror-kernel/src/Attic/dm-cmirror-common.h	2007/02/14 17:44:07	1.1.2.11
@@ -129,6 +129,7 @@
 	 * Cluster log fields
 	 */
 	char uuid[MAX_NAME_LEN];
+	int uuid_ref;
 	atomic_t in_sync;  /* like sync_count, except all or nothing */
 
 	struct list_head log_list;
--- cluster/cmirror-kernel/src/Attic/dm-cmirror-server.c	2007/02/02 17:22:55	1.1.2.20
+++ cluster/cmirror-kernel/src/Attic/dm-cmirror-server.c	2007/02/14 17:44:07	1.1.2.21
@@ -645,11 +645,12 @@
 	}
 }
 
-static struct log_c *get_log_context(char *uuid){
+static struct log_c *get_log_context(char *uuid, int uuid_ref){
 	struct log_c *lc, *r = NULL;
 
 	list_for_each_entry(lc, &log_list_head, log_list){
-		if(!strncmp(lc->uuid, uuid, MAX_NAME_LEN)){
+		if (!strncmp(lc->uuid, uuid, MAX_NAME_LEN) &&
+		    (uuid_ref == lc->uuid_ref)) {
 			if (r)
 				report_duplicate_log(lc);
 			else
@@ -866,7 +867,7 @@
 		if(error < sizeof(struct log_request)){
 			DMERR("Cluster mirror log server received incomplete message.");
 		}
-		lc = get_log_context(lr.lr_uuid);
+		lc = get_log_context(lr.lr_uuid, lr.lr_uuid_ref);
 
 		if(lr.lr_type == LRT_ELECTION ||
 		   lr.lr_type == LRT_SELECTION ||
--- cluster/cmirror-kernel/src/Attic/dm-cmirror-xfr.h	2005/07/27 16:09:31	1.1.2.1
+++ cluster/cmirror-kernel/src/Attic/dm-cmirror-xfr.h	2007/02/14 17:44:07	1.1.2.2
@@ -41,6 +41,7 @@
 		};
 	} u;
 	char lr_uuid[MAX_NAME_LEN];
+	int lr_uuid_ref;
 };
 
 int my_recvmsg(struct socket *sock, struct msghdr *msg,



^ permalink raw reply	[flat|nested] 40+ messages in thread

* [Cluster-devel] cluster/cmirror-kernel/src dm-cmirror-client.c ...
@ 2007-02-02 17:22 jbrassow
  0 siblings, 0 replies; 40+ messages in thread
From: jbrassow @ 2007-02-02 17:22 UTC (permalink / raw)
  To: cluster-devel.redhat.com

CVSROOT:	/cvs/cluster
Module name:	cluster
Branch: 	RHEL4
Changes by:	jbrassow at sourceware.org	2007-02-02 17:22:55

Modified files:
	cmirror-kernel/src: dm-cmirror-client.c dm-cmirror-server.c 

Log message:
	- Fix for bug #225337
	
	Reset 'sync_search' if (lc->sync_search >= lc->region_count) &&
	(lc->sync_count < lc->region_count).  It indicates that a failure
	during recovery has taken place, and we are likely able to handle
	it.
	
	Also, do not issue clear/mark region requests if it is already
	known that the log device has failed.

Patches:
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cmirror-kernel/src/dm-cmirror-client.c.diff?cvsroot=cluster&only_with_tag=RHEL4&r1=1.1.2.34&r2=1.1.2.35
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cmirror-kernel/src/dm-cmirror-server.c.diff?cvsroot=cluster&only_with_tag=RHEL4&r1=1.1.2.19&r2=1.1.2.20

--- cluster/cmirror-kernel/src/Attic/dm-cmirror-client.c	2007/01/08 19:28:26	1.1.2.34
+++ cluster/cmirror-kernel/src/Attic/dm-cmirror-client.c	2007/02/02 17:22:55	1.1.2.35
@@ -568,7 +568,7 @@
 		** while we are here.  If the clear region request fails, it**
 		** would be re-added - perhaps prematurely clearing the bit */
 		
-		if(rs){
+		if(rs && !rs->rs_lc->log_dev_failed){
 			_consult_server(rs->rs_lc, rs->rs_region,
 					LRT_CLEAR_REGION, NULL, &retry);
 
@@ -951,33 +951,35 @@
 
 	spin_unlock(&region_state_lock);
 
-	while((error = consult_server(lc, region, LRT_MARK_REGION, NULL))){
-		if (error == -EBUSY) {
-			/* Remote recovering delay and try again */
-			DMDEBUG("Delaying mark to region %Lu, due to recovery",
-				region);
-			set_current_state(TASK_INTERRUPTIBLE);
-			schedule_timeout(HZ/2);
-			continue;
-		}
+	if (!lc->log_dev_failed) {
+		while((error = consult_server(lc, region, LRT_MARK_REGION, NULL))){
+			if (error == -EBUSY) {
+				/* Remote recovering delay and try again */
+				DMDEBUG("Delaying mark to region %Lu, due to recovery",
+					region);
+				set_current_state(TASK_INTERRUPTIBLE);
+				schedule_timeout(HZ/2);
+				continue;
+			}
 
-		if (error == -EIO) {
-			lc->log_dev_failed = 1;
-			break;
+			if (error == -EIO) {
+				lc->log_dev_failed = 1;
+				break;
+			}
+			DMWARN("unable to get server (%u) to mark region (%Lu)",
+			       lc->server_id, region);
+			DMWARN("Reason :: %d", error);
 		}
-		DMWARN("unable to get server (%u) to mark region (%Lu)",
-		       lc->server_id, region);
-		DMWARN("Reason :: %d", error);
-	}
 
-	if (lc->log_dev_failed) {
-		dm_table_event(lc->ti->table);
-		/*
-		DMERR("Write failed on mirror log device, %s",
-		      lc->log_dev->name);
-		if (!atomic_read(&lc->suspended))
-			wait_for_completion(&lc->failure_completion);
-		*/
+		if (lc->log_dev_failed) {
+			dm_table_event(lc->ti->table);
+			/*
+			  DMERR("Write failed on mirror log device, %s",
+			  lc->log_dev->name);
+			  if (!atomic_read(&lc->suspended))
+			  wait_for_completion(&lc->failure_completion);
+			*/
+		}
 	}
 	return;
 }
--- cluster/cmirror-kernel/src/Attic/dm-cmirror-server.c	2007/01/08 19:28:26	1.1.2.19
+++ cluster/cmirror-kernel/src/Attic/dm-cmirror-server.c	2007/02/02 17:22:55	1.1.2.20
@@ -224,8 +224,16 @@
 
 static int _core_get_resync_work(struct log_c *lc, region_t *region)
 {
-	if (lc->sync_search >= lc->region_count){
-		return 0;
+	if (lc->sync_search >= lc->region_count) {
+		/*
+		 * FIXME: pvmove is not supported yet, but when it is,
+		 * an audit of sync_count changes will need to be made
+		 */
+		if (lc->sync_count < lc->region_count) {
+			lc->sync_search = 0;
+		} else {
+			return 0;
+		}
 	}
 	do {
 		*region = ext2_find_next_zero_bit((unsigned long *) lc->sync_bits,
@@ -557,9 +565,12 @@
 
 	if(!find_ru_by_region(lc, lr->u.lr_region)){
 		log_set_bit(lc, lc->clean_bits, lr->u.lr_region);
+		write_bits(lc);
+		/*
 		if (write_bits(lc))
 			DMERR("Write bits failed on mirror log device, %s",
 			      lc->log_dev->name);
+		*/
 	}
 	return 0;
 }



^ permalink raw reply	[flat|nested] 40+ messages in thread

* [Cluster-devel] cluster/cmirror-kernel/src dm-cmirror-client.c ...
@ 2007-01-08 19:28 jbrassow
  0 siblings, 0 replies; 40+ messages in thread
From: jbrassow @ 2007-01-08 19:28 UTC (permalink / raw)
  To: cluster-devel.redhat.com

CVSROOT:	/cvs/cluster
Module name:	cluster
Branch: 	RHEL4
Changes by:	jbrassow at sourceware.org	2007-01-08 19:28:26

Modified files:
	cmirror-kernel/src: dm-cmirror-client.c dm-cmirror-server.c 

Log message:
	Now that the kernel is correctly handling sync state change, we can
	remove the workaround in cmirror (keeping sync_count vs. decrementing
	it).
	
	Also moved some print statements to reduce console output.

Patches:
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cmirror-kernel/src/dm-cmirror-client.c.diff?cvsroot=cluster&only_with_tag=RHEL4&r1=1.1.2.33&r2=1.1.2.34
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cmirror-kernel/src/dm-cmirror-server.c.diff?cvsroot=cluster&only_with_tag=RHEL4&r1=1.1.2.18&r2=1.1.2.19

--- cluster/cmirror-kernel/src/Attic/dm-cmirror-client.c	2006/12/07 18:58:32	1.1.2.33
+++ cluster/cmirror-kernel/src/Attic/dm-cmirror-client.c	2007/01/08 19:28:26	1.1.2.34
@@ -961,21 +961,20 @@
 			continue;
 		}
 
-		DMWARN("unable to get server (%u) to mark region (%Lu)",
-		       lc->server_id, region);
-		DMWARN("Reason :: %d", error);
-
 		if (error == -EIO) {
 			lc->log_dev_failed = 1;
 			break;
 		}
+		DMWARN("unable to get server (%u) to mark region (%Lu)",
+		       lc->server_id, region);
+		DMWARN("Reason :: %d", error);
 	}
 
 	if (lc->log_dev_failed) {
-		DMERR("Write failed on mirror log device, %s",
-		      lc->log_dev->name);
 		dm_table_event(lc->ti->table);
 		/*
+		DMERR("Write failed on mirror log device, %s",
+		      lc->log_dev->name);
 		if (!atomic_read(&lc->suspended))
 			wait_for_completion(&lc->failure_completion);
 		*/
--- cluster/cmirror-kernel/src/Attic/dm-cmirror-server.c	2006/12/07 18:58:32	1.1.2.18
+++ cluster/cmirror-kernel/src/Attic/dm-cmirror-server.c	2007/01/08 19:28:26	1.1.2.19
@@ -516,8 +516,6 @@
 			lc->touched = 0;
 			lc->log_dev_failed = 0;
 		} else {
-			DMERR("Mark region failed (%d) on mirror log device, %s",
-			      r, lc->log_dev->name);
 			lc->log_dev_failed = 1;
 		}
 	} else if (ru->ru_rw == RU_RECOVER) {
@@ -608,7 +606,7 @@
 			lc->sync_count++;
 		}
 	} else if (log_test_bit(lc->sync_bits, lr->u.lr_region)) {
-		/* gone for now: lc->sync_count--; */
+		lc->sync_count--;
 		log_clear_bit(lc, lc->sync_bits, lr->u.lr_region);
 	}
 



^ permalink raw reply	[flat|nested] 40+ messages in thread

* [Cluster-devel] cluster/cmirror-kernel/src dm-cmirror-client.c ...
@ 2006-12-07 18:58 jbrassow
  0 siblings, 0 replies; 40+ messages in thread
From: jbrassow @ 2006-12-07 18:58 UTC (permalink / raw)
  To: cluster-devel.redhat.com

CVSROOT:	/cvs/cluster
Module name:	cluster
Branch: 	RHEL4
Changes by:	jbrassow at sourceware.org	2006-12-07 18:58:32

Modified files:
	cmirror-kernel/src: dm-cmirror-client.c dm-cmirror-server.c 

Log message:
	Get rid of a number of unnecessary messages, which spray to the console
	during errors and cause the mirror reconfiguration to take a long time.
	(This has even been seen to cause machines to be fenced if the load is too great.)

Patches:
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cmirror-kernel/src/dm-cmirror-client.c.diff?cvsroot=cluster&only_with_tag=RHEL4&r1=1.1.2.32&r2=1.1.2.33
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cmirror-kernel/src/dm-cmirror-server.c.diff?cvsroot=cluster&only_with_tag=RHEL4&r1=1.1.2.17&r2=1.1.2.18

--- cluster/cmirror-kernel/src/Attic/dm-cmirror-client.c	2006/12/05 17:49:08	1.1.2.32
+++ cluster/cmirror-kernel/src/Attic/dm-cmirror-client.c	2006/12/07 18:58:32	1.1.2.33
@@ -905,9 +905,8 @@
 	spin_lock(&region_state_lock);
 	list_for_each_entry_safe(rs, tmp_rs, &clear_region_list, rs_list){
 		if(lc == rs->rs_lc && region == rs->rs_region){
-#ifdef DEBUG
-			DMINFO("Mark pre-empting clear of region %Lu", region);
-#endif
+			DMDEBUG("Mark pre-empting clear (%Lu/%s)",
+				region, lc->uuid + (strlen(lc->uuid) - 8));
 			list_del_init(&rs->rs_list);
 			list_add(&rs->rs_list, &marked_region_list);
 			clear_region_count--;
@@ -1023,7 +1022,7 @@
 	** clearing without ever marking..................................... */
 
 	if(!rs_new){
-		DMERR("Unable to allocate region_state for mark.");
+		DMERR("Unable to allocate region_state for clear.");
 		BUG();
 	}
 
@@ -1058,9 +1057,6 @@
 	while(consult_server(lc, region, LRT_COMPLETE_RESYNC_WORK, &success_tmp)){
 		DMWARN("unable to notify server of completed resync work");
 	}
-	if (!success)
-		DMERR("Attempting to revert sync status of region #%llu", region);
-
 	return;
 }
 
@@ -1069,11 +1065,7 @@
 	int i;
 	region_t rtn;
 	struct log_c *lc = (struct log_c *) log->context;
-/* take out optimization
-	if(atomic_read(&lc->in_sync) == 1){
-		return lc->region_count;
-	}
-*/
+
 	/* Try to get sync count up to five times */
 	for (i = 0; i < 5 && consult_server(lc, 0, LRT_GET_SYNC_COUNT, &rtn); i++);
 	if(i >= 5){
--- cluster/cmirror-kernel/src/Attic/dm-cmirror-server.c	2006/09/05 17:50:11	1.1.2.17
+++ cluster/cmirror-kernel/src/Attic/dm-cmirror-server.c	2006/12/07 18:58:32	1.1.2.18
@@ -522,8 +522,8 @@
 		}
 	} else if (ru->ru_rw == RU_RECOVER) {
 		DMINFO("Attempt to mark a region " SECTOR_FORMAT 
-		      ", which is being recovered.",
-		      lr->u.lr_region);
+		      "/%s which is being recovered.",
+		       lr->u.lr_region, lc->uuid + (strlen(lc->uuid) - 8));
 		DMINFO("Current recoverer: %u", ru->ru_nodeid);
 		DMINFO("Mark requester   : %u", who);
 
@@ -534,8 +534,8 @@
 	} else {
 		DMWARN("Attempt to mark a already marked region (%u,"
 		       SECTOR_FORMAT
-		       ")",
-		       who, lr->u.lr_region);
+		       "/%s)",
+		       who, lr->u.lr_region, lc->uuid + (strlen(lc->uuid) - 8));
 		mempool_free(new, region_user_pool);
 	}
 
@@ -595,8 +595,6 @@
 
 
 static int server_complete_resync_work(struct log_c *lc, struct log_request *lr, int success){
-	uint32_t info;
-
 	if (lr->u.lr_region > lc->region_count) {
 		return -EINVAL;
 	}
@@ -610,30 +608,33 @@
 			lc->sync_count++;
 		}
 	} else if (log_test_bit(lc->sync_bits, lr->u.lr_region)) {
-		DMERR("complete_resync_work region going out-of-sync: disk failure");
 		/* gone for now: lc->sync_count--; */
 		log_clear_bit(lc, lc->sync_bits, lr->u.lr_region);
 	}
 
-	info = (uint32_t)(lc->region_count - lc->sync_count);
-
-	if((info < 10001 && !(info%1000)) ||
-	   (info < 1000 && !(info%100)) ||
-	   (info < 200 && !(info%25)) ||
-	   (info < 6)){
-		DMDEBUG(SECTOR_FORMAT " out-of-sync regions remaining for %s.",
-		       lc->region_count - lc->sync_count,
-		       lc->uuid + (strlen(lc->uuid) - 8));
-	}
 	return 0;
 }
 
 
 static int server_get_sync_count(struct log_c *lc, struct log_request *lr){
+	if (lc->sync_count > lc->region_count) {
+		DMERR("sync_count (" SECTOR_FORMAT ") > region_count (" SECTOR_FORMAT ") in %s!",
+		      lc->sync_count, lc->region_count, lc->uuid + (strlen(lc->uuid) - 8));
+		disk_resume(lc);
+	}
+
 	lr->u.lr_region_rtn = lc->sync_count;
 	return 0;
 }
 
+static void report_duplicate_log(struct log_c *lc)
+{
+	DMERR("HEY!!! There are two matches for %s",
+	      lc->uuid + (strlen(lc->uuid) - 8));
+	list_for_each_entry(lc, &log_list_head, log_list) {
+		DMERR("  %s", lc->uuid + (strlen(lc->uuid) - 8));
+	}
+}
 
 static struct log_c *get_log_context(char *uuid){
 	struct log_c *lc, *r = NULL;
@@ -641,8 +642,7 @@
 	list_for_each_entry(lc, &log_list_head, log_list){
 		if(!strncmp(lc->uuid, uuid, MAX_NAME_LEN)){
 			if (r)
-				DMERR("HEY!!! There are two matches for %s",
-				      lc->uuid + (strlen(lc->uuid) - 8));
+				report_duplicate_log(lc);
 			else
 				r = lc;
 		}
@@ -932,6 +932,7 @@
 				break;
 			}
 			error = server_mark_region(lc, &lr, nodeid);
+			lr.u.lr_int_rtn = 0;
 			break;
 		case LRT_CLEAR_REGION:
 			if(!(nodeid = 



^ permalink raw reply	[flat|nested] 40+ messages in thread

* [Cluster-devel] cluster/cmirror-kernel/src dm-cmirror-client.c ...
@ 2006-09-05 17:50 jbrassow
  0 siblings, 0 replies; 40+ messages in thread
From: jbrassow @ 2006-09-05 17:50 UTC (permalink / raw)
  To: cluster-devel.redhat.com

CVSROOT:	/cvs/cluster
Module name:	cluster
Branch: 	RHEL4
Changes by:	jbrassow at sourceware.org	2006-09-05 17:50:11

Modified files:
	cmirror-kernel/src: dm-cmirror-client.c dm-cmirror-server.c 

Log message:
	- fix the bugs I've seen so far - mostly related to the recently added
	ability to migrate the log server on suspension - that cause hangs
	during combinations of create/delete/convert of mirrors

Patches:
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cmirror-kernel/src/dm-cmirror-client.c.diff?cvsroot=cluster&only_with_tag=RHEL4&r1=1.1.2.23&r2=1.1.2.24
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cmirror-kernel/src/dm-cmirror-server.c.diff?cvsroot=cluster&only_with_tag=RHEL4&r1=1.1.2.16&r2=1.1.2.17

--- cluster/cmirror-kernel/src/Attic/dm-cmirror-client.c	2006/07/27 23:11:55	1.1.2.23
+++ cluster/cmirror-kernel/src/Attic/dm-cmirror-client.c	2006/09/05 17:50:11	1.1.2.24
@@ -396,8 +396,8 @@
 	set_fs(get_ds());
 
 	if(type == LRT_MASTER_LEAVING){
-		len = sock_recvmsg(lc->client_sock, &msg, sizeof(struct log_request),
-				   /* WAIT for it */0);
+		len = my_recvmsg(lc->client_sock, &msg, sizeof(struct log_request),
+				 0, 10);
 	} else {
 		len = my_recvmsg(lc->client_sock, &msg, sizeof(struct log_request),
 				 0, 5);
@@ -419,7 +419,7 @@
 		goto fail;
 	}
 
-	if(lr->u.lr_int_rtn == -ENXIO){
+	if (lr->u.lr_int_rtn == -ENXIO) {
 		lc->server_id = 0xDEAD;
 		*retry = 1;
 		goto fail;
@@ -591,7 +591,7 @@
 		       unsigned int argc, char **argv, int disk)
 {
 	int error = 0;
-	struct log_c *lc;
+	struct log_c *lc, *tmp_lc;
 	struct sockaddr_in saddr_in;
 
 	if (!disk) {
@@ -621,6 +621,15 @@
 
 	atomic_set(&lc->in_sync, -1);
 
+	list_for_each_entry(tmp_lc, &log_list_head, log_list){
+		if(!strncmp(tmp_lc->uuid, lc->uuid, MAX_NAME_LEN)){
+			DMERR("Log already exists with uuid, %s",
+			      lc->uuid + (strlen(lc->uuid) - 8));
+			error = -EINVAL;
+			goto fail;
+		}
+	}
+
 	list_add(&lc->log_list, &log_list_head);
 	INIT_LIST_HEAD(&lc->region_users);
 
@@ -730,6 +739,7 @@
 	list_del_init(&lc->log_list);
 	if ((lc->server_id == my_id) && !atomic_read(&lc->suspended))
 		consult_server(lc, 0, LRT_MASTER_LEAVING, NULL);
+
 	sock_release(lc->client_sock);
 
 	if (lc->log_dev)
@@ -748,6 +758,7 @@
 
 static int cluster_postsuspend(struct dirty_log *log)
 {
+	int r;
 	struct log_c *lc = (struct log_c *) log->context;
 
 	while (1) {
@@ -765,12 +776,16 @@
 	if(lc->server_id == my_id) {
 		while (1) {
 			consult_server(lc, 0, LRT_MASTER_LEAVING, NULL);
+
 			down(&consult_server_lock);
 			run_election(lc, 0xDEAD);
 			up(&consult_server_lock);
-			if (lc->server_id == my_id) {
+
+			if ((my_id && (lc->server_id == my_id))) {
+				atomic_set(&lc->suspended, 0);
 				set_current_state(TASK_INTERRUPTIBLE);
-				schedule_timeout(HZ/4);
+				schedule_timeout(HZ*2);
+				atomic_set(&lc->suspended, 1);
 			} else {
 				break;
 			}
@@ -1005,7 +1020,7 @@
 	if (!success) {
 		DMERR("Attempting to revert sync status of region #%llu", region);
 		set_current_state(TASK_INTERRUPTIBLE);
-		schedule_timeout(HZ/50);
+		schedule_timeout(HZ/5);
 	}
 
 	return;
--- cluster/cmirror-kernel/src/Attic/dm-cmirror-server.c	2006/07/27 23:11:55	1.1.2.16
+++ cluster/cmirror-kernel/src/Attic/dm-cmirror-server.c	2006/09/05 17:50:11	1.1.2.17
@@ -107,7 +107,9 @@
 	if (!log->log_dev)
 		return 0;
 
-	BUG_ON(atomic_read(&log->suspended));
+	if (atomic_read(&log->suspended))
+		return -EDEADLK;
+
 	r = dm_io_sync_vm(1, &log->header_location, READ,
 			  log->disk_header, &ebits);
 	if (unlikely(r))
@@ -138,7 +140,9 @@
 	if (!log->log_dev)
 		return 0;
 
-	BUG_ON(atomic_read(&log->suspended));
+	if (atomic_read(&log->suspended))
+		return -EDEADLK;
+
 	header_to_disk(&log->header, log->disk_header);
 	return dm_io_sync_vm(1, &log->header_location, WRITE,
 			     log->disk_header, &ebits);
@@ -182,7 +186,9 @@
 	if (!log->log_dev)
 		return 0;
 
-	BUG_ON(atomic_read(&log->suspended));
+	if (atomic_read(&log->suspended))
+		return -EDEADLK;
+
 	r = dm_io_sync_vm(1, &log->bits_location, READ,
 			  log->clean_bits, &ebits);
 
@@ -199,7 +205,9 @@
 	if (!log->log_dev)
 		return 0;
 
-	BUG_ON(atomic_read(&log->suspended));
+	if (atomic_read(&log->suspended))
+		return -EDEADLK;
+
 	return dm_io_sync_vm(1, &log->bits_location, WRITE,
 			     log->clean_bits, &ebits);
 }
@@ -252,9 +260,9 @@
 			continue;
 		} else if(str[i] == 0xFF){
 			if(range_count==1){
-				DMINFO("  %d", region - 1);
+				DMDEBUG("  %d", region - 1);
 			} else if(range_count){
-				DMINFO("  %d - %d", region-range_count, region-1);
+				DMDEBUG("  %d - %d", region-range_count, region-1);
 			}
 			range_count = 0;
 			region+=(bit_count < 8)? bit_count: 8;      
@@ -272,9 +280,9 @@
 				count++;
 			} else {
 				if(range_count==1){
-					DMINFO("  %d", region - 1);
+					DMDEBUG("  %d", region - 1);
 				} else if(range_count){
-					DMINFO("  %d - %d", region-range_count, region-1);
+					DMDEBUG("  %d - %d", region-range_count, region-1);
 				}
 				range_count = 0;
 				region++;
@@ -283,9 +291,9 @@
 	}
 
 	if(range_count==1){
-		DMINFO("  %d", region - 1);
+		DMDEBUG("  %d", region - 1);
 	} else if(range_count){
-		DMINFO("  %d - %d", region-range_count, region);
+		DMDEBUG("  %d - %d", region-range_count, region);
 	}
 	return count;
 }
@@ -312,7 +320,7 @@
 	i = 1;
 	if (!lc->log_dev_failed &&
 	    ((r = read_header(lc)) || (i = 0) || (r = read_bits(lc)))) {
-		if (r == -EINVAL)
+		if (r == -EINVAL || r == -EDEADLK)
 			return r;
 
 		DMWARN("Read %s failed on mirror log device, %s",
@@ -416,9 +424,11 @@
 
 	i = 1;
 	if ((r = write_bits(lc)) || (i = 0) || (r = write_header(lc))) {
-		DMWARN("Write %s failed on mirror log device, %s.",
-		       i ? "bits" : "header", lc->log_dev->name);
-		lc->log_dev_failed = 1;
+		if (r != -EDEADLK) {
+			DMWARN("Write %s failed on mirror log device, %s.",
+			       i ? "bits" : "header", lc->log_dev->name);
+			lc->log_dev_failed = 1;
+		}
 	} else 
 		lc->log_dev_failed = 0;
 
@@ -469,6 +479,11 @@
 
 static int server_in_sync(struct log_c *lc, struct log_request *lr)
 {
+	if (lr->u.lr_region > lc->region_count) {
+		lr->u.lr_int_rtn = 0;
+		return -EINVAL;
+	}
+
 	if(likely(log_test_bit(lc->sync_bits, lr->u.lr_region)))
 		/* in-sync */
 		lr->u.lr_int_rtn = 1;
@@ -581,6 +596,11 @@
 
 static int server_complete_resync_work(struct log_c *lc, struct log_request *lr, int success){
 	uint32_t info;
+
+	if (lr->u.lr_region > lc->region_count) {
+		return -EINVAL;
+	}
+
 	log_clear_bit(lc, lc->recovering_bits, lr->u.lr_region);
 
 	if (success) {
@@ -678,15 +698,16 @@
 
 	/*
 	 * Check if we have access to the log.  We may not
-	 * get have loaded this device.
+	 * yet have loaded this device.
 	 */
-	if(!lc){
+	if (!lc) {
 		lr->u.lr_node_count++;
 		return 0;
 	}
 
 	if(lr->lr_type == LRT_MASTER_LEAVING){
-		lc->server_id = 0xDEAD;
+		if (lr->u.lr_starter == lc->server_id)
+			lc->server_id = 0xDEAD;
 		lr->u.lr_node_count++;
 		return 0;
 	}
@@ -696,7 +717,7 @@
 	 * We shortcut the election here and respond directly
 	 * to the inquirer
 	 */
-	if(lc->server_id == my_id){
+	if((lc->server_id == my_id) && !atomic_read(&lc->suspended)){
 		lr->u.lr_coordinator = my_id;
 		if(!(saddr->sin_addr.s_addr = nodeid_to_ipaddr(lr->u.lr_starter))){
 			return -1;
@@ -850,10 +871,12 @@
 			if(lc && (old != lc->server_id) && (my_id == lc->server_id)){
 				DMDEBUG("I'm the cluster mirror log server for %s",
 				       lc->uuid + (strlen(lc->uuid) - 8));
-				if (!atomic_read(&lc->suspended))
-					disk_resume(lc);
-				else
+				if (atomic_read(&lc->suspended)) {
 					DMDEBUG("Not reading disk log because I'm suspended.");
+					
+				} else if (disk_resume(lc) == -EDEADLK) {
+					DMDEBUG("Unable to read disk log - deadlock potential.");
+				}
 			}
 			goto reply;
 		}
@@ -944,7 +967,7 @@
 /*
 			DMWARN("Error (%d) while processing request (%s)",
 			       error,
-			       (lr.lr_type == LRT_IS_CLEAN)? "LRT_IS_C	LEAN":
+			       (lr.lr_type == LRT_IS_CLEAN)? "LRT_IS_CLEAN":
 			       (lr.lr_type == LRT_IN_SYNC)? "LRT_IN_SYNC":
 			       (lr.lr_type == LRT_MARK_REGION)? "LRT_MARK_REGION":
 			       (lr.lr_type == LRT_GET_RESYNC_WORK)? "LRT_GET_RESYNC_WORK":
@@ -972,7 +995,18 @@
 			
 		set_fs(fs);
 		if(error < 0){
-			DMWARN("unable to sendmsg to client (error = %d)", error);
+			DMWARN("unable to sendmsg to client (type = %s, error = %d)",
+			       (lr.lr_type == LRT_IS_CLEAN)? "LRT_IS_CLEAN":
+			       (lr.lr_type == LRT_IN_SYNC)? "LRT_IN_SYNC":
+			       (lr.lr_type == LRT_MARK_REGION)? "LRT_MARK_REGION":
+			       (lr.lr_type == LRT_GET_RESYNC_WORK)? "LRT_GET_RESYNC_WORK":
+			       (lr.lr_type == LRT_GET_SYNC_COUNT)? "LRT_GET_SYNC_COUNT":
+			       (lr.lr_type == LRT_CLEAR_REGION)? "LRT_CLEAR_REGION":
+			       (lr.lr_type == LRT_COMPLETE_RESYNC_WORK)? "LRT_COMPLETE_RESYNC_WORK":
+			       (lr.lr_type == LRT_MASTER_LEAVING)? "LRT_MASTER_LEAVING":
+			       (lr.lr_type == LRT_ELECTION)? "LRT_ELECTION":
+			       (lr.lr_type == LRT_SELECTION)? "LRT_SELECTION": "UNKNOWN",
+			       error);
 			return error;
 		}
 	} else if(error == -EAGAIN || error == -ETIMEDOUT){
@@ -1036,10 +1070,11 @@
 			
 			list_for_each_entry(lc, &log_list_head, log_list){
 				if(lc->server_id == my_id){
-					if (!atomic_read(&lc->suspended))
-						disk_resume(lc);
-					else
+					if (atomic_read(&lc->suspended)) {
 						DMDEBUG("Not reading disk log because I'm suspended.");
+					} else if (disk_resume(lc) == -EDEADLK) {
+						DMDEBUG("Unable to read disk log - deadlock potential.");
+					}
 				}
 			}
 			break;



^ permalink raw reply	[flat|nested] 40+ messages in thread

* [Cluster-devel] cluster/cmirror-kernel/src dm-cmirror-client.c ...
@ 2006-09-05 17:48 jbrassow
  0 siblings, 0 replies; 40+ messages in thread
From: jbrassow @ 2006-09-05 17:48 UTC (permalink / raw)
  To: cluster-devel.redhat.com

CVSROOT:	/cvs/cluster
Module name:	cluster
Branch: 	RHEL4U4
Changes by:	jbrassow at sourceware.org	2006-09-05 17:48:02

Modified files:
	cmirror-kernel/src: dm-cmirror-client.c dm-cmirror-server.c 

Log message:
	- fix the bugs I've seen so far - mostly related to the recently added
	ability to migrate the log server on suspension - which cause hangs
	during combinations of create/delete/convert operations on mirrors

Patches:
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cmirror-kernel/src/dm-cmirror-client.c.diff?cvsroot=cluster&only_with_tag=RHEL4U4&r1=1.1.2.19.2.4&r2=1.1.2.19.2.5
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cmirror-kernel/src/dm-cmirror-server.c.diff?cvsroot=cluster&only_with_tag=RHEL4U4&r1=1.1.2.9.2.7&r2=1.1.2.9.2.8

--- cluster/cmirror-kernel/src/Attic/dm-cmirror-client.c	2006/07/27 23:10:58	1.1.2.19.2.4
+++ cluster/cmirror-kernel/src/Attic/dm-cmirror-client.c	2006/09/05 17:48:02	1.1.2.19.2.5
@@ -396,8 +396,8 @@
 	set_fs(get_ds());
 
 	if(type == LRT_MASTER_LEAVING){
-		len = sock_recvmsg(lc->client_sock, &msg, sizeof(struct log_request),
-				   /* WAIT for it */0);
+		len = my_recvmsg(lc->client_sock, &msg, sizeof(struct log_request),
+				 0, 10);
 	} else {
 		len = my_recvmsg(lc->client_sock, &msg, sizeof(struct log_request),
 				 0, 5);
@@ -419,7 +419,7 @@
 		goto fail;
 	}
 
-	if(lr->u.lr_int_rtn == -ENXIO){
+	if (lr->u.lr_int_rtn == -ENXIO) {
 		lc->server_id = 0xDEAD;
 		*retry = 1;
 		goto fail;
@@ -591,7 +591,7 @@
 		       unsigned int argc, char **argv, int disk)
 {
 	int error = 0;
-	struct log_c *lc;
+	struct log_c *lc, *tmp_lc;
 	struct sockaddr_in saddr_in;
 
 	if (!disk) {
@@ -621,6 +621,15 @@
 
 	atomic_set(&lc->in_sync, -1);
 
+	list_for_each_entry(tmp_lc, &log_list_head, log_list){
+		if(!strncmp(tmp_lc->uuid, lc->uuid, MAX_NAME_LEN)){
+			DMERR("Log already exists with uuid, %s",
+			      lc->uuid + (strlen(lc->uuid) - 8));
+			error = -EINVAL;
+			goto fail;
+		}
+	}
+
 	list_add(&lc->log_list, &log_list_head);
 	INIT_LIST_HEAD(&lc->region_users);
 
@@ -730,6 +739,7 @@
 	list_del_init(&lc->log_list);
 	if ((lc->server_id == my_id) && !atomic_read(&lc->suspended))
 		consult_server(lc, 0, LRT_MASTER_LEAVING, NULL);
+
 	sock_release(lc->client_sock);
 
 	if (lc->log_dev)
@@ -748,6 +758,7 @@
 
 static int cluster_postsuspend(struct dirty_log *log)
 {
+	int r;
 	struct log_c *lc = (struct log_c *) log->context;
 
 	while (1) {
@@ -765,12 +776,16 @@
 	if(lc->server_id == my_id) {
 		while (1) {
 			consult_server(lc, 0, LRT_MASTER_LEAVING, NULL);
+
 			down(&consult_server_lock);
 			run_election(lc, 0xDEAD);
 			up(&consult_server_lock);
-			if (lc->server_id == my_id) {
+
+			if ((my_id && (lc->server_id == my_id))) {
+				atomic_set(&lc->suspended, 0);
 				set_current_state(TASK_INTERRUPTIBLE);
-				schedule_timeout(HZ/4);
+				schedule_timeout(HZ*2);
+				atomic_set(&lc->suspended, 1);
 			} else {
 				break;
 			}
@@ -1005,7 +1020,7 @@
 	if (!success) {
 		DMERR("Attempting to revert sync status of region #%llu", region);
 		set_current_state(TASK_INTERRUPTIBLE);
-		schedule_timeout(HZ/50);
+		schedule_timeout(HZ/5);
 	}
 
 	return;
--- cluster/cmirror-kernel/src/Attic/dm-cmirror-server.c	2006/07/27 23:10:58	1.1.2.9.2.7
+++ cluster/cmirror-kernel/src/Attic/dm-cmirror-server.c	2006/09/05 17:48:02	1.1.2.9.2.8
@@ -107,7 +107,9 @@
 	if (!log->log_dev)
 		return 0;
 
-	BUG_ON(atomic_read(&log->suspended));
+	if (atomic_read(&log->suspended))
+		return -EDEADLK;
+
 	r = dm_io_sync_vm(1, &log->header_location, READ,
 			  log->disk_header, &ebits);
 	if (unlikely(r))
@@ -138,7 +140,9 @@
 	if (!log->log_dev)
 		return 0;
 
-	BUG_ON(atomic_read(&log->suspended));
+	if (atomic_read(&log->suspended))
+		return -EDEADLK;
+
 	header_to_disk(&log->header, log->disk_header);
 	return dm_io_sync_vm(1, &log->header_location, WRITE,
 			     log->disk_header, &ebits);
@@ -182,7 +186,9 @@
 	if (!log->log_dev)
 		return 0;
 
-	BUG_ON(atomic_read(&log->suspended));
+	if (atomic_read(&log->suspended))
+		return -EDEADLK;
+
 	r = dm_io_sync_vm(1, &log->bits_location, READ,
 			  log->clean_bits, &ebits);
 
@@ -199,7 +205,9 @@
 	if (!log->log_dev)
 		return 0;
 
-	BUG_ON(atomic_read(&log->suspended));
+	if (atomic_read(&log->suspended))
+		return -EDEADLK;
+
 	return dm_io_sync_vm(1, &log->bits_location, WRITE,
 			     log->clean_bits, &ebits);
 }
@@ -252,9 +260,9 @@
 			continue;
 		} else if(str[i] == 0xFF){
 			if(range_count==1){
-				DMINFO("  %d", region - 1);
+				DMDEBUG("  %d", region - 1);
 			} else if(range_count){
-				DMINFO("  %d - %d", region-range_count, region-1);
+				DMDEBUG("  %d - %d", region-range_count, region-1);
 			}
 			range_count = 0;
 			region+=(bit_count < 8)? bit_count: 8;      
@@ -272,9 +280,9 @@
 				count++;
 			} else {
 				if(range_count==1){
-					DMINFO("  %d", region - 1);
+					DMDEBUG("  %d", region - 1);
 				} else if(range_count){
-					DMINFO("  %d - %d", region-range_count, region-1);
+					DMDEBUG("  %d - %d", region-range_count, region-1);
 				}
 				range_count = 0;
 				region++;
@@ -283,9 +291,9 @@
 	}
 
 	if(range_count==1){
-		DMINFO("  %d", region - 1);
+		DMDEBUG("  %d", region - 1);
 	} else if(range_count){
-		DMINFO("  %d - %d", region-range_count, region);
+		DMDEBUG("  %d - %d", region-range_count, region);
 	}
 	return count;
 }
@@ -312,7 +320,7 @@
 	i = 1;
 	if (!lc->log_dev_failed &&
 	    ((r = read_header(lc)) || (i = 0) || (r = read_bits(lc)))) {
-		if (r == -EINVAL)
+		if (r == -EINVAL || r == -EDEADLK)
 			return r;
 
 		DMWARN("Read %s failed on mirror log device, %s",
@@ -416,9 +424,11 @@
 
 	i = 1;
 	if ((r = write_bits(lc)) || (i = 0) || (r = write_header(lc))) {
-		DMWARN("Write %s failed on mirror log device, %s.",
-		       i ? "bits" : "header", lc->log_dev->name);
-		lc->log_dev_failed = 1;
+		if (r != -EDEADLK) {
+			DMWARN("Write %s failed on mirror log device, %s.",
+			       i ? "bits" : "header", lc->log_dev->name);
+			lc->log_dev_failed = 1;
+		}
 	} else 
 		lc->log_dev_failed = 0;
 
@@ -469,6 +479,11 @@
 
 static int server_in_sync(struct log_c *lc, struct log_request *lr)
 {
+	if (lr->u.lr_region > lc->region_count) {
+		lr->u.lr_int_rtn = 0;
+		return -EINVAL;
+	}
+
 	if(likely(log_test_bit(lc->sync_bits, lr->u.lr_region)))
 		/* in-sync */
 		lr->u.lr_int_rtn = 1;
@@ -581,6 +596,11 @@
 
 static int server_complete_resync_work(struct log_c *lc, struct log_request *lr, int success){
 	uint32_t info;
+
+	if (lr->u.lr_region > lc->region_count) {
+		return -EINVAL;
+	}
+
 	log_clear_bit(lc, lc->recovering_bits, lr->u.lr_region);
 
 	if (success) {
@@ -678,15 +698,16 @@
 
 	/*
 	 * Check if we have access to the log.  We may not
-	 * get have loaded this device.
+	 * yet have loaded this device.
 	 */
-	if(!lc){
+	if (!lc) {
 		lr->u.lr_node_count++;
 		return 0;
 	}
 
 	if(lr->lr_type == LRT_MASTER_LEAVING){
-		lc->server_id = 0xDEAD;
+		if (lr->u.lr_starter == lc->server_id)
+			lc->server_id = 0xDEAD;
 		lr->u.lr_node_count++;
 		return 0;
 	}
@@ -696,7 +717,7 @@
 	 * We shortcut the election here and respond directly
 	 * to the inquirer
 	 */
-	if(lc->server_id == my_id){
+	if((lc->server_id == my_id) && !atomic_read(&lc->suspended)){
 		lr->u.lr_coordinator = my_id;
 		if(!(saddr->sin_addr.s_addr = nodeid_to_ipaddr(lr->u.lr_starter))){
 			return -1;
@@ -850,10 +871,12 @@
 			if(lc && (old != lc->server_id) && (my_id == lc->server_id)){
 				DMDEBUG("I'm the cluster mirror log server for %s",
 				       lc->uuid + (strlen(lc->uuid) - 8));
-				if (!atomic_read(&lc->suspended))
-					disk_resume(lc);
-				else
+				if (atomic_read(&lc->suspended)) {
 					DMDEBUG("Not reading disk log because I'm suspended.");
+					
+				} else if (disk_resume(lc) == -EDEADLK) {
+					DMDEBUG("Unable to read disk log - deadlock potential.");
+				}
 			}
 			goto reply;
 		}
@@ -944,7 +967,7 @@
 /*
 			DMWARN("Error (%d) while processing request (%s)",
 			       error,
-			       (lr.lr_type == LRT_IS_CLEAN)? "LRT_IS_C	LEAN":
+			       (lr.lr_type == LRT_IS_CLEAN)? "LRT_IS_CLEAN":
 			       (lr.lr_type == LRT_IN_SYNC)? "LRT_IN_SYNC":
 			       (lr.lr_type == LRT_MARK_REGION)? "LRT_MARK_REGION":
 			       (lr.lr_type == LRT_GET_RESYNC_WORK)? "LRT_GET_RESYNC_WORK":
@@ -972,7 +995,18 @@
 			
 		set_fs(fs);
 		if(error < 0){
-			DMWARN("unable to sendmsg to client (error = %d)", error);
+			DMWARN("unable to sendmsg to client (type = %s, error = %d)",
+			       (lr.lr_type == LRT_IS_CLEAN)? "LRT_IS_CLEAN":
+			       (lr.lr_type == LRT_IN_SYNC)? "LRT_IN_SYNC":
+			       (lr.lr_type == LRT_MARK_REGION)? "LRT_MARK_REGION":
+			       (lr.lr_type == LRT_GET_RESYNC_WORK)? "LRT_GET_RESYNC_WORK":
+			       (lr.lr_type == LRT_GET_SYNC_COUNT)? "LRT_GET_SYNC_COUNT":
+			       (lr.lr_type == LRT_CLEAR_REGION)? "LRT_CLEAR_REGION":
+			       (lr.lr_type == LRT_COMPLETE_RESYNC_WORK)? "LRT_COMPLETE_RESYNC_WORK":
+			       (lr.lr_type == LRT_MASTER_LEAVING)? "LRT_MASTER_LEAVING":
+			       (lr.lr_type == LRT_ELECTION)? "LRT_ELECTION":
+			       (lr.lr_type == LRT_SELECTION)? "LRT_SELECTION": "UNKNOWN",
+			       error);
 			return error;
 		}
 	} else if(error == -EAGAIN || error == -ETIMEDOUT){
@@ -1036,10 +1070,11 @@
 			
 			list_for_each_entry(lc, &log_list_head, log_list){
 				if(lc->server_id == my_id){
-					if (!atomic_read(&lc->suspended))
-						disk_resume(lc);
-					else
+					if (atomic_read(&lc->suspended)) {
 						DMDEBUG("Not reading disk log because I'm suspended.");
+					} else if (disk_resume(lc) == -EDEADLK) {
+						DMDEBUG("Unable to read disk log - deadlock potential.");
+					}
 				}
 			}
 			break;



^ permalink raw reply	[flat|nested] 40+ messages in thread

* [Cluster-devel] cluster/cmirror-kernel/src dm-cmirror-client.c ...
@ 2006-07-27 23:11 jbrassow
  0 siblings, 0 replies; 40+ messages in thread
From: jbrassow @ 2006-07-27 23:11 UTC (permalink / raw)
  To: cluster-devel.redhat.com

CVSROOT:	/cvs/cluster
Module name:	cluster
Branch: 	RHEL4
Changes by:	jbrassow at sourceware.org	2006-07-27 23:11:55

Modified files:
	cmirror-kernel/src: dm-cmirror-client.c dm-cmirror-common.h 
	                    dm-cmirror-server.c 

Log message:
	- further tightening for 199826
	
	We now refuse to read/write the disk log if we are suspended.  We also
	add BUG_ON(<suspended>) to operations that do I/O to the log device.
	
	The reason for the BUG_ON() is that it is better to drop the machine
	than to have it hang the cluster while it attempts to read/write from
	a suspended device.  That being said, it should now be impossible to
	get to those functions which would perform I/O operations during
	suspension.
	
	I have still seen cases where the mirror will stall.  However, I think
	this is due to LVM (clvmd), because it happens when a mirror is created
	while the log device is suspended - which must not happen.  I've only
	seen this when doing simultaneous create/convert/remove from all nodes
	in the cluster.

Patches:
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cmirror-kernel/src/dm-cmirror-client.c.diff?cvsroot=cluster&only_with_tag=RHEL4&r1=1.1.2.22&r2=1.1.2.23
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cmirror-kernel/src/dm-cmirror-common.h.diff?cvsroot=cluster&only_with_tag=RHEL4&r1=1.1.2.9&r2=1.1.2.10
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cmirror-kernel/src/dm-cmirror-server.c.diff?cvsroot=cluster&only_with_tag=RHEL4&r1=1.1.2.15&r2=1.1.2.16

--- cluster/cmirror-kernel/src/Attic/dm-cmirror-client.c	2006/07/22 22:19:34	1.1.2.22
+++ cluster/cmirror-kernel/src/Attic/dm-cmirror-client.c	2006/07/27 23:11:55	1.1.2.23
@@ -302,7 +302,7 @@
 		lc->server_id = lr.u.lr_coordinator;
 	} else {
 		/* ATTENTION -- what do we do with this ? */
-		DMWARN("Failed to receive election results from server");
+		DMWARN("Failed to receive election results from server: %d", len);
 		error = len;
 	}
 
@@ -363,21 +363,7 @@
 
 	iov.iov_len = sizeof(struct log_request);
 	iov.iov_base = lr;
-/*
-	DMERR("To  :: 0x%x, %s", 
-	       saddr_in.sin_addr.s_addr,
-	       (lr->lr_type == LRT_IS_CLEAN)? "LRT_IS_CLEAN":
-	       (lr->lr_type == LRT_IN_SYNC)? "LRT_IN_SYNC":
-	       (lr->lr_type == LRT_MARK_REGION)? "LRT_MARK_REGION":
-	       (lr->lr_type == LRT_GET_RESYNC_WORK)? "LRT_GET_RESYNC_WORK":
-	       (lr->lr_type == LRT_GET_SYNC_COUNT)? "LRT_GET_SYNC_COUNT":
-	       (lr->lr_type == LRT_CLEAR_REGION)? "LRT_CLEAR_REGION":
-	       (lr->lr_type == LRT_COMPLETE_RESYNC_WORK)? "LRT_COMPLETE_RESYNC_WORK":
-	       (lr->lr_type == LRT_MASTER_LEAVING)? "LRT_MASTER_LEAVING":
-	       (lr->lr_type == LRT_ELECTION)? "LRT_ELECTION":
-	       (lr->lr_type == LRT_SELECTION)? "LRT_SELECTION": "UNKNOWN"
-		);
-*/
+
 	if(lr->lr_type == LRT_MARK_REGION){
 		mark_req2ser++;
 	}
@@ -453,25 +439,28 @@
 			       request_retry_count,
 			       request_count,
 			       dm_div_up(request_retry_count*100, request_count));
+			DMDEBUG("Last request:");
+			DMDEBUG(" - my_id   :: %u", my_id);
+			DMDEBUG(" - server  :: %u", lc->server_id);
+			DMDEBUG(" - log uuid:: %s (%s)",
+			       lc->uuid + (strlen(lc->uuid) - 8),
+			       atomic_read(&lc->suspended) ? "suspended" : "active");
+			DMDEBUG(" - request :: %s",
+			       (type == LRT_IS_CLEAN)? "LRT_IS_CLEAN":
+			       (type == LRT_IN_SYNC)? "LRT_IN_SYNC":
+			       (type == LRT_MARK_REGION)? "LRT_MARK_REGION":
+			       (type == LRT_GET_RESYNC_WORK)? "LRT_GET_RESYNC_WORK":
+			       (type == LRT_GET_SYNC_COUNT)? "LRT_GET_SYNC_COUNT":
+			       (type == LRT_CLEAR_REGION)? "LRT_CLEAR_REGION":
+			       (type == LRT_COMPLETE_RESYNC_WORK)? "LRT_COMPLETE_RESYNC_WORK":
+			       (type == LRT_MASTER_LEAVING)? "LRT_MASTER_LEAVING":
+			       (type == LRT_ELECTION)? "LRT_ELECTION":
+			       (type == LRT_SELECTION)? "LRT_SELECTION": "UNKNOWN");
+			DMDEBUG(" - error   :: %d", error);
 		}
 	}
 
 	if(lr) kfree(lr);
-#if 0
-	DMINFO("My (%u) request (%s) to server (%u) failed :: %d",
-	       my_id,
-	       (type == LRT_IS_CLEAN)? "LRT_IS_CLEAN":
-	       (type == LRT_IN_SYNC)? "LRT_IN_SYNC":
-	       (type == LRT_MARK_REGION)? "LRT_MARK_REGION":
-	       (type == LRT_GET_RESYNC_WORK)? "LRT_GET_RESYNC_WORK":
-	       (type == LRT_GET_SYNC_COUNT)? "LRT_GET_SYNC_COUNT":
-	       (type == LRT_CLEAR_REGION)? "LRT_CLEAR_REGION":
-	       (type == LRT_COMPLETE_RESYNC_WORK)? "LRT_COMPLETE_RESYNC_WORK":
-	       (type == LRT_MASTER_LEAVING)? "LRT_MASTER_LEAVING":
-	       (type == LRT_ELECTION)? "LRT_ELECTION":
-	       (type == LRT_SELECTION)? "LRT_SELECTION": "UNKNOWN",
-	       lc->server_id, error);
-#endif
 	return error;
 }
 
@@ -739,7 +728,7 @@
 		DMINFO("Leaving while clear region requests remain.");
 
 	list_del_init(&lc->log_list);
-	if(lc->server_id == my_id)
+	if ((lc->server_id == my_id) && !atomic_read(&lc->suspended))
 		consult_server(lc, 0, LRT_MASTER_LEAVING, NULL);
 	sock_release(lc->client_sock);
 
--- cluster/cmirror-kernel/src/Attic/dm-cmirror-common.h	2006/07/22 22:19:34	1.1.2.9
+++ cluster/cmirror-kernel/src/Attic/dm-cmirror-common.h	2006/07/27 23:11:55	1.1.2.10
@@ -22,6 +22,7 @@
 #define DMWARN(f, x...) printk(KERN_WARNING DM_NAME ": " f "\n" , ## x)
 #define DMERR(f, x...) printk(KERN_ERR DM_NAME ": " f "\n" , ## x)
 #define DMINFO(f, x...) printk(KERN_INFO DM_NAME ": " f "\n" , ## x)
+#define DMDEBUG(f, x...) printk(KERN_DEBUG DM_NAME ": " f "\n" , ## x)
 #define DMEMIT(x...) sz += ((sz >= maxlen) ? \
 	  0 : scnprintf(result + sz, maxlen - sz, x))
 
--- cluster/cmirror-kernel/src/Attic/dm-cmirror-server.c	2006/07/22 22:50:38	1.1.2.15
+++ cluster/cmirror-kernel/src/Attic/dm-cmirror-server.c	2006/07/27 23:11:55	1.1.2.16
@@ -107,6 +107,7 @@
 	if (!log->log_dev)
 		return 0;
 
+	BUG_ON(atomic_read(&log->suspended));
 	r = dm_io_sync_vm(1, &log->header_location, READ,
 			  log->disk_header, &ebits);
 	if (unlikely(r))
@@ -137,6 +138,7 @@
 	if (!log->log_dev)
 		return 0;
 
+	BUG_ON(atomic_read(&log->suspended));
 	header_to_disk(&log->header, log->disk_header);
 	return dm_io_sync_vm(1, &log->header_location, WRITE,
 			     log->disk_header, &ebits);
@@ -180,6 +182,7 @@
 	if (!log->log_dev)
 		return 0;
 
+	BUG_ON(atomic_read(&log->suspended));
 	r = dm_io_sync_vm(1, &log->bits_location, READ,
 			  log->clean_bits, &ebits);
 
@@ -196,6 +199,7 @@
 	if (!log->log_dev)
 		return 0;
 
+	BUG_ON(atomic_read(&log->suspended));
 	return dm_io_sync_vm(1, &log->bits_location, WRITE,
 			     log->clean_bits, &ebits);
 }
@@ -295,7 +299,8 @@
 	struct region_user *tmp_ru, *ru;
 	unsigned char live_nodes[16]; /* Attention -- max of 128 nodes... */
 
-	DMINFO("Disk Resume::");
+	DMDEBUG("Disk Resume::  %s (%s)", lc->uuid + (strlen(lc->uuid) - 8),
+		atomic_read(&lc->suspended) ? "suspended" : "active");
 
 	debug_disk_write = 1;
 	memset(live_nodes, 0, sizeof(live_nodes));
@@ -355,20 +360,20 @@
 		}
 	}
 
-	DMINFO("  Live nodes        :: %d", global_count);
-	DMINFO("  In-Use Regions    :: %d", good_count+bad_count);
-	DMINFO("  Good IUR's        :: %d", good_count);
-	DMINFO("  Bad IUR's         :: %d", bad_count);
+	DMDEBUG("  Live nodes        :: %d", global_count);
+	DMDEBUG("  In-Use Regions    :: %d", good_count+bad_count);
+	DMDEBUG("  Good IUR's        :: %d", good_count);
+	DMDEBUG("  Bad IUR's         :: %d", bad_count);
 
 	lc->sync_count = count_bits32(lc->sync_bits, lc->bitset_uint32_count);
 	lc->sync_search = 0;
 
-	DMINFO("  Sync count        :: %Lu", lc->sync_count);
-	DMINFO("  Disk Region count :: %Lu", lc->header.nr_regions);
-	DMINFO("  Region count      :: %Lu", lc->region_count);
+	DMDEBUG("  Sync count        :: %Lu", lc->sync_count);
+	DMDEBUG("  Disk Region count :: %Lu", lc->header.nr_regions);
+	DMDEBUG("  Region count      :: %Lu", lc->region_count);
 
 	if(lc->header.nr_regions != lc->region_count){
-		DMINFO("  NOTE:  Mapping has changed.");
+		DMDEBUG("  NOTE:  Mapping has changed.");
 	}
 /* Take this out for now.
 	if(list_empty(&lc->region_users) && (lc->sync_count != lc->header.nr_regions)){
@@ -398,13 +403,13 @@
 	}			
 
 */
-	DMINFO("Marked regions::");
+	DMDEBUG("Marked regions::");
 	i = print_zero_bits((unsigned char *)lc->clean_bits, 0, lc->region_count);
-	DMINFO("  Total = %d", i);
+	DMDEBUG("  Total = %d", i);
 
-	DMINFO("Out-of-sync regions::");
+	DMDEBUG("Out-of-sync regions::");
 	i = print_zero_bits((unsigned char *)lc->sync_bits, 0, lc->region_count);
-	DMINFO("  Total = %d", i);
+	DMDEBUG("  Total = %d", i);
 
 	/* set the correct number of regions in the header */
 	lc->header.nr_regions = lc->region_count;
@@ -529,7 +534,7 @@
 
 	ru = find_ru(lc, who, lr->u.lr_region);
 	if(!ru){
-		DMINFO("Request to remove unrecorded region user (%u/%Lu)",
+		DMDEBUG("Request to remove unrecorded region user (%u/%Lu)",
 		       who, lr->u.lr_region);
 		return -EINVAL;
 	} else {
@@ -596,7 +601,7 @@
 	   (info < 1000 && !(info%100)) ||
 	   (info < 200 && !(info%25)) ||
 	   (info < 6)){
-		DMINFO(SECTOR_FORMAT " out-of-sync regions remaining for %s.",
+		DMDEBUG(SECTOR_FORMAT " out-of-sync regions remaining for %s.",
 		       lc->region_count - lc->sync_count,
 		       lc->uuid + (strlen(lc->uuid) - 8));
 	}
@@ -843,9 +848,12 @@
 				return -1;
 			}
 			if(lc && (old != lc->server_id) && (my_id == lc->server_id)){
-				DMINFO("I'm the cluster mirror log server for %s",
+				DMDEBUG("I'm the cluster mirror log server for %s",
 				       lc->uuid + (strlen(lc->uuid) - 8));
-				disk_resume(lc);
+				if (!atomic_read(&lc->suspended))
+					disk_resume(lc);
+				else
+					DMDEBUG("Not reading disk log because I'm suspended.");
 			}
 			goto reply;
 		}
@@ -860,6 +868,30 @@
 			goto reply;
 		}
 
+		if (atomic_read(&lc->suspended)) {
+			nodeid = ipaddr_to_nodeid((struct sockaddr *)msg.msg_name);
+			/*
+			DMDEBUG("Getting request while server (%u) is suspended:", my_id);
+			DMDEBUG(" - Requester :: %u", nodeid);
+			DMDEBUG(" - log uuid  :: %s", lc->uuid + (strlen(lc->uuid) - 8));
+			DMDEBUG(" - req type  :: %s",
+				(lr.lr_type == LRT_IS_CLEAN)? "LRT_IS_C	LEAN":
+				(lr.lr_type == LRT_IN_SYNC)? "LRT_IN_SYNC":
+				(lr.lr_type == LRT_MARK_REGION)? "LRT_MARK_REGION":
+				(lr.lr_type == LRT_GET_RESYNC_WORK)? "LRT_GET_RESYNC_WORK":
+				(lr.lr_type == LRT_GET_SYNC_COUNT)? "LRT_GET_SYNC_COUNT":
+				(lr.lr_type == LRT_CLEAR_REGION)? "LRT_CLEAR_REGION":
+				(lr.lr_type == LRT_COMPLETE_RESYNC_WORK)? "LRT_COMPLETE_RESYNC_WORK":
+				(lr.lr_type == LRT_MASTER_LEAVING)? "LRT_MASTER_LEAVING":
+				(lr.lr_type == LRT_ELECTION)? "LRT_ELECTION":
+				(lr.lr_type == LRT_SELECTION)? "LRT_SELECTION": "UNKNOWN");
+			*/
+			if (my_id != nodeid) {
+				lr.u.lr_int_rtn = -ENXIO;
+				goto reply;
+			}
+		}			
+
 		switch(lr.lr_type){
 		case LRT_IS_CLEAN:
 			error = server_is_clean(lc, &lr);
@@ -1004,7 +1036,10 @@
 			
 			list_for_each_entry(lc, &log_list_head, log_list){
 				if(lc->server_id == my_id){
-					disk_resume(lc);
+					if (!atomic_read(&lc->suspended))
+						disk_resume(lc);
+					else
+						DMDEBUG("Not reading disk log because I'm suspended.");
 				}
 			}
 			break;



^ permalink raw reply	[flat|nested] 40+ messages in thread

* [Cluster-devel] cluster/cmirror-kernel/src dm-cmirror-client.c ...
@ 2006-07-27 23:11 jbrassow
  0 siblings, 0 replies; 40+ messages in thread
From: jbrassow @ 2006-07-27 23:11 UTC (permalink / raw)
  To: cluster-devel.redhat.com

CVSROOT:	/cvs/cluster
Module name:	cluster
Branch: 	RHEL4U4
Changes by:	jbrassow at sourceware.org	2006-07-27 23:10:58

Modified files:
	cmirror-kernel/src: dm-cmirror-client.c dm-cmirror-common.h 
	                    dm-cmirror-server.c 

Log message:
	- further tightening for 199826
	
	We now refuse to read/write the disk log if we are suspended.  We also
	add BUG_ON(<suspended>) to operations that do I/O to the log device.
	
	The reason for the BUG_ON() is that it is better to drop the machine
	than to have it hang the cluster while it attempts to read/write from
	a suspended device.  That being said, it should now be impossible to
	get to those functions which would perform I/O operations during
	suspension.
	
	I have still seen cases where the mirror will stall.  However, I think
	this is due to LVM (clvmd), because it happens when a mirror is created
	while the log device is suspended - which must not happen.  I've only
	seen this when doing simultaneous create/convert/remove from all nodes
	in the cluster.

Patches:
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cmirror-kernel/src/dm-cmirror-client.c.diff?cvsroot=cluster&only_with_tag=RHEL4U4&r1=1.1.2.19.2.3&r2=1.1.2.19.2.4
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cmirror-kernel/src/dm-cmirror-common.h.diff?cvsroot=cluster&only_with_tag=RHEL4U4&r1=1.1.2.7.2.2&r2=1.1.2.7.2.3
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cmirror-kernel/src/dm-cmirror-server.c.diff?cvsroot=cluster&only_with_tag=RHEL4U4&r1=1.1.2.9.2.6&r2=1.1.2.9.2.7

--- cluster/cmirror-kernel/src/Attic/dm-cmirror-client.c	2006/07/22 22:12:32	1.1.2.19.2.3
+++ cluster/cmirror-kernel/src/Attic/dm-cmirror-client.c	2006/07/27 23:10:58	1.1.2.19.2.4
@@ -302,7 +302,7 @@
 		lc->server_id = lr.u.lr_coordinator;
 	} else {
 		/* ATTENTION -- what do we do with this ? */
-		DMWARN("Failed to receive election results from server");
+		DMWARN("Failed to receive election results from server: %d", len);
 		error = len;
 	}
 
@@ -363,21 +363,7 @@
 
 	iov.iov_len = sizeof(struct log_request);
 	iov.iov_base = lr;
-/*
-	DMERR("To  :: 0x%x, %s", 
-	       saddr_in.sin_addr.s_addr,
-	       (lr->lr_type == LRT_IS_CLEAN)? "LRT_IS_CLEAN":
-	       (lr->lr_type == LRT_IN_SYNC)? "LRT_IN_SYNC":
-	       (lr->lr_type == LRT_MARK_REGION)? "LRT_MARK_REGION":
-	       (lr->lr_type == LRT_GET_RESYNC_WORK)? "LRT_GET_RESYNC_WORK":
-	       (lr->lr_type == LRT_GET_SYNC_COUNT)? "LRT_GET_SYNC_COUNT":
-	       (lr->lr_type == LRT_CLEAR_REGION)? "LRT_CLEAR_REGION":
-	       (lr->lr_type == LRT_COMPLETE_RESYNC_WORK)? "LRT_COMPLETE_RESYNC_WORK":
-	       (lr->lr_type == LRT_MASTER_LEAVING)? "LRT_MASTER_LEAVING":
-	       (lr->lr_type == LRT_ELECTION)? "LRT_ELECTION":
-	       (lr->lr_type == LRT_SELECTION)? "LRT_SELECTION": "UNKNOWN"
-		);
-*/
+
 	if(lr->lr_type == LRT_MARK_REGION){
 		mark_req2ser++;
 	}
@@ -453,25 +439,28 @@
 			       request_retry_count,
 			       request_count,
 			       dm_div_up(request_retry_count*100, request_count));
+			DMDEBUG("Last request:");
+			DMDEBUG(" - my_id   :: %u", my_id);
+			DMDEBUG(" - server  :: %u", lc->server_id);
+			DMDEBUG(" - log uuid:: %s (%s)",
+			       lc->uuid + (strlen(lc->uuid) - 8),
+			       atomic_read(&lc->suspended) ? "suspended" : "active");
+			DMDEBUG(" - request :: %s",
+			       (type == LRT_IS_CLEAN)? "LRT_IS_CLEAN":
+			       (type == LRT_IN_SYNC)? "LRT_IN_SYNC":
+			       (type == LRT_MARK_REGION)? "LRT_MARK_REGION":
+			       (type == LRT_GET_RESYNC_WORK)? "LRT_GET_RESYNC_WORK":
+			       (type == LRT_GET_SYNC_COUNT)? "LRT_GET_SYNC_COUNT":
+			       (type == LRT_CLEAR_REGION)? "LRT_CLEAR_REGION":
+			       (type == LRT_COMPLETE_RESYNC_WORK)? "LRT_COMPLETE_RESYNC_WORK":
+			       (type == LRT_MASTER_LEAVING)? "LRT_MASTER_LEAVING":
+			       (type == LRT_ELECTION)? "LRT_ELECTION":
+			       (type == LRT_SELECTION)? "LRT_SELECTION": "UNKNOWN");
+			DMDEBUG(" - error   :: %d", error);
 		}
 	}
 
 	if(lr) kfree(lr);
-#if 0
-	DMINFO("My (%u) request (%s) to server (%u) failed :: %d",
-	       my_id,
-	       (type == LRT_IS_CLEAN)? "LRT_IS_CLEAN":
-	       (type == LRT_IN_SYNC)? "LRT_IN_SYNC":
-	       (type == LRT_MARK_REGION)? "LRT_MARK_REGION":
-	       (type == LRT_GET_RESYNC_WORK)? "LRT_GET_RESYNC_WORK":
-	       (type == LRT_GET_SYNC_COUNT)? "LRT_GET_SYNC_COUNT":
-	       (type == LRT_CLEAR_REGION)? "LRT_CLEAR_REGION":
-	       (type == LRT_COMPLETE_RESYNC_WORK)? "LRT_COMPLETE_RESYNC_WORK":
-	       (type == LRT_MASTER_LEAVING)? "LRT_MASTER_LEAVING":
-	       (type == LRT_ELECTION)? "LRT_ELECTION":
-	       (type == LRT_SELECTION)? "LRT_SELECTION": "UNKNOWN",
-	       lc->server_id, error);
-#endif
 	return error;
 }
 
@@ -739,7 +728,7 @@
 		DMINFO("Leaving while clear region requests remain.");
 
 	list_del_init(&lc->log_list);
-	if(lc->server_id == my_id)
+	if ((lc->server_id == my_id) && !atomic_read(&lc->suspended))
 		consult_server(lc, 0, LRT_MASTER_LEAVING, NULL);
 	sock_release(lc->client_sock);
 
--- cluster/cmirror-kernel/src/Attic/dm-cmirror-common.h	2006/07/22 22:12:32	1.1.2.7.2.2
+++ cluster/cmirror-kernel/src/Attic/dm-cmirror-common.h	2006/07/27 23:10:58	1.1.2.7.2.3
@@ -22,6 +22,7 @@
 #define DMWARN(f, x...) printk(KERN_WARNING DM_NAME ": " f "\n" , ## x)
 #define DMERR(f, x...) printk(KERN_ERR DM_NAME ": " f "\n" , ## x)
 #define DMINFO(f, x...) printk(KERN_INFO DM_NAME ": " f "\n" , ## x)
+#define DMDEBUG(f, x...) printk(KERN_DEBUG DM_NAME ": " f "\n" , ## x)
 #define DMEMIT(x...) sz += ((sz >= maxlen) ? \
 	  0 : scnprintf(result + sz, maxlen - sz, x))
 
--- cluster/cmirror-kernel/src/Attic/dm-cmirror-server.c	2006/07/22 22:49:49	1.1.2.9.2.6
+++ cluster/cmirror-kernel/src/Attic/dm-cmirror-server.c	2006/07/27 23:10:58	1.1.2.9.2.7
@@ -107,6 +107,7 @@
 	if (!log->log_dev)
 		return 0;
 
+	BUG_ON(atomic_read(&log->suspended));
 	r = dm_io_sync_vm(1, &log->header_location, READ,
 			  log->disk_header, &ebits);
 	if (unlikely(r))
@@ -137,6 +138,7 @@
 	if (!log->log_dev)
 		return 0;
 
+	BUG_ON(atomic_read(&log->suspended));
 	header_to_disk(&log->header, log->disk_header);
 	return dm_io_sync_vm(1, &log->header_location, WRITE,
 			     log->disk_header, &ebits);
@@ -180,6 +182,7 @@
 	if (!log->log_dev)
 		return 0;
 
+	BUG_ON(atomic_read(&log->suspended));
 	r = dm_io_sync_vm(1, &log->bits_location, READ,
 			  log->clean_bits, &ebits);
 
@@ -196,6 +199,7 @@
 	if (!log->log_dev)
 		return 0;
 
+	BUG_ON(atomic_read(&log->suspended));
 	return dm_io_sync_vm(1, &log->bits_location, WRITE,
 			     log->clean_bits, &ebits);
 }
@@ -295,7 +299,8 @@
 	struct region_user *tmp_ru, *ru;
 	unsigned char live_nodes[16]; /* Attention -- max of 128 nodes... */
 
-	DMINFO("Disk Resume::");
+	DMDEBUG("Disk Resume::  %s (%s)", lc->uuid + (strlen(lc->uuid) - 8),
+		atomic_read(&lc->suspended) ? "suspended" : "active");
 
 	debug_disk_write = 1;
 	memset(live_nodes, 0, sizeof(live_nodes));
@@ -355,20 +360,20 @@
 		}
 	}
 
-	DMINFO("  Live nodes        :: %d", global_count);
-	DMINFO("  In-Use Regions    :: %d", good_count+bad_count);
-	DMINFO("  Good IUR's        :: %d", good_count);
-	DMINFO("  Bad IUR's         :: %d", bad_count);
+	DMDEBUG("  Live nodes        :: %d", global_count);
+	DMDEBUG("  In-Use Regions    :: %d", good_count+bad_count);
+	DMDEBUG("  Good IUR's        :: %d", good_count);
+	DMDEBUG("  Bad IUR's         :: %d", bad_count);
 
 	lc->sync_count = count_bits32(lc->sync_bits, lc->bitset_uint32_count);
 	lc->sync_search = 0;
 
-	DMINFO("  Sync count        :: %Lu", lc->sync_count);
-	DMINFO("  Disk Region count :: %Lu", lc->header.nr_regions);
-	DMINFO("  Region count      :: %Lu", lc->region_count);
+	DMDEBUG("  Sync count        :: %Lu", lc->sync_count);
+	DMDEBUG("  Disk Region count :: %Lu", lc->header.nr_regions);
+	DMDEBUG("  Region count      :: %Lu", lc->region_count);
 
 	if(lc->header.nr_regions != lc->region_count){
-		DMINFO("  NOTE:  Mapping has changed.");
+		DMDEBUG("  NOTE:  Mapping has changed.");
 	}
 /* Take this out for now.
 	if(list_empty(&lc->region_users) && (lc->sync_count != lc->header.nr_regions)){
@@ -398,13 +403,13 @@
 	}			
 
 */
-	DMINFO("Marked regions::");
+	DMDEBUG("Marked regions::");
 	i = print_zero_bits((unsigned char *)lc->clean_bits, 0, lc->region_count);
-	DMINFO("  Total = %d", i);
+	DMDEBUG("  Total = %d", i);
 
-	DMINFO("Out-of-sync regions::");
+	DMDEBUG("Out-of-sync regions::");
 	i = print_zero_bits((unsigned char *)lc->sync_bits, 0, lc->region_count);
-	DMINFO("  Total = %d", i);
+	DMDEBUG("  Total = %d", i);
 
 	/* set the correct number of regions in the header */
 	lc->header.nr_regions = lc->region_count;
@@ -529,7 +534,7 @@
 
 	ru = find_ru(lc, who, lr->u.lr_region);
 	if(!ru){
-		DMINFO("Request to remove unrecorded region user (%u/%Lu)",
+		DMDEBUG("Request to remove unrecorded region user (%u/%Lu)",
 		       who, lr->u.lr_region);
 		return -EINVAL;
 	} else {
@@ -596,7 +601,7 @@
 	   (info < 1000 && !(info%100)) ||
 	   (info < 200 && !(info%25)) ||
 	   (info < 6)){
-		DMINFO(SECTOR_FORMAT " out-of-sync regions remaining for %s.",
+		DMDEBUG(SECTOR_FORMAT " out-of-sync regions remaining for %s.",
 		       lc->region_count - lc->sync_count,
 		       lc->uuid + (strlen(lc->uuid) - 8));
 	}
@@ -843,9 +848,12 @@
 				return -1;
 			}
 			if(lc && (old != lc->server_id) && (my_id == lc->server_id)){
-				DMINFO("I'm the cluster mirror log server for %s",
+				DMDEBUG("I'm the cluster mirror log server for %s",
 				       lc->uuid + (strlen(lc->uuid) - 8));
-				disk_resume(lc);
+				if (!atomic_read(&lc->suspended))
+					disk_resume(lc);
+				else
+					DMDEBUG("Not reading disk log because I'm suspended.");
 			}
 			goto reply;
 		}
@@ -860,6 +868,30 @@
 			goto reply;
 		}
 
+		if (atomic_read(&lc->suspended)) {
+			nodeid = ipaddr_to_nodeid((struct sockaddr *)msg.msg_name);
+			/*
+			DMDEBUG("Getting request while server (%u) is suspended:", my_id);
+			DMDEBUG(" - Requester :: %u", nodeid);
+			DMDEBUG(" - log uuid  :: %s", lc->uuid + (strlen(lc->uuid) - 8));
+			DMDEBUG(" - req type  :: %s",
+				(lr.lr_type == LRT_IS_CLEAN)? "LRT_IS_C	LEAN":
+				(lr.lr_type == LRT_IN_SYNC)? "LRT_IN_SYNC":
+				(lr.lr_type == LRT_MARK_REGION)? "LRT_MARK_REGION":
+				(lr.lr_type == LRT_GET_RESYNC_WORK)? "LRT_GET_RESYNC_WORK":
+				(lr.lr_type == LRT_GET_SYNC_COUNT)? "LRT_GET_SYNC_COUNT":
+				(lr.lr_type == LRT_CLEAR_REGION)? "LRT_CLEAR_REGION":
+				(lr.lr_type == LRT_COMPLETE_RESYNC_WORK)? "LRT_COMPLETE_RESYNC_WORK":
+				(lr.lr_type == LRT_MASTER_LEAVING)? "LRT_MASTER_LEAVING":
+				(lr.lr_type == LRT_ELECTION)? "LRT_ELECTION":
+				(lr.lr_type == LRT_SELECTION)? "LRT_SELECTION": "UNKNOWN");
+			*/
+			if (my_id != nodeid) {
+				lr.u.lr_int_rtn = -ENXIO;
+				goto reply;
+			}
+		}			
+
 		switch(lr.lr_type){
 		case LRT_IS_CLEAN:
 			error = server_is_clean(lc, &lr);
@@ -1004,7 +1036,10 @@
 			
 			list_for_each_entry(lc, &log_list_head, log_list){
 				if(lc->server_id == my_id){
-					disk_resume(lc);
+					if (!atomic_read(&lc->suspended))
+						disk_resume(lc);
+					else
+						DMDEBUG("Not reading disk log because I'm suspended.");
 				}
 			}
 			break;



^ permalink raw reply	[flat|nested] 40+ messages in thread

* [Cluster-devel] cluster/cmirror-kernel/src dm-cmirror-client.c ...
@ 2006-07-22 22:19 jbrassow
  0 siblings, 0 replies; 40+ messages in thread
From: jbrassow @ 2006-07-22 22:19 UTC (permalink / raw)
  To: cluster-devel.redhat.com

CVSROOT:	/cvs/cluster
Module name:	cluster
Branch: 	RHEL4
Changes by:	jbrassow at sourceware.org	2006-07-22 22:19:34

Modified files:
	cmirror-kernel/src: dm-cmirror-client.c dm-cmirror-common.h 
	                    dm-cmirror-server.c 

Log message:
	Fix for bug:
	199862 - Suspending cluster mirrors can cause indefinite hang
	And is part of a fix for:
	199185 - 'lvconvert' fails to remove device-mapper devices ...
	198555 - mirror log not getting cleared causes new mirror ...
	And is likely to fix:
	199334 - cmirror removal attempt hangs and caused locking ...
	And will certainly help for:
	199498
	198821
	194137
	194125
	199635
	
	All of the above bugs will need to be reexamined when the packages
	are rebuilt.
	
	This fix allows the log server to migrate to other nodes during
	suspension.  This prevents the situation where the log server may
	has its devices suspended when it receives a request.  Trying to
	fulfill a log request while devices are suspended will lead to an
	indefinite hang, because I/O will not complete until the devices
	are unsuspended.

Patches:
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cmirror-kernel/src/dm-cmirror-client.c.diff?cvsroot=cluster&only_with_tag=RHEL4&r1=1.1.2.21&r2=1.1.2.22
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cmirror-kernel/src/dm-cmirror-common.h.diff?cvsroot=cluster&only_with_tag=RHEL4&r1=1.1.2.8&r2=1.1.2.9
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cmirror-kernel/src/dm-cmirror-server.c.diff?cvsroot=cluster&only_with_tag=RHEL4&r1=1.1.2.13&r2=1.1.2.14

--- cluster/cmirror-kernel/src/Attic/dm-cmirror-client.c	2006/07/07 17:08:56	1.1.2.21
+++ cluster/cmirror-kernel/src/Attic/dm-cmirror-client.c	2006/07/22 22:19:34	1.1.2.22
@@ -243,7 +243,7 @@
 
 
 
-static int run_election(struct log_c *lc){
+static int run_election(struct log_c *lc, uint32_t initial_server){
 	int error=0, len;
 	struct sockaddr_in saddr_in;
 	struct msghdr msg;
@@ -255,7 +255,7 @@
 
 	lr.lr_type = LRT_ELECTION;
 	lr.u.lr_starter = my_id;
-	lr.u.lr_coordinator = my_id;
+	lr.u.lr_coordinator = initial_server;
 	memcpy(lr.lr_uuid, lc->uuid, MAX_NAME_LEN);
 
 	memset(&saddr_in, 0, sizeof(struct sockaddr_cl));
@@ -420,7 +420,6 @@
 
 	if(len <= 0){
 		/* ATTENTION -- what do we do with this ? */
-//		DMWARN("Failed to recvmsg from clustered log server");
 		error = len;
 		*retry = 1;
 		goto fail;
@@ -435,16 +434,11 @@
 	}
 
 	if(lr->u.lr_int_rtn == -ENXIO){
-		DMWARN("server tells us it no longer controls the log");
 		lc->server_id = 0xDEAD;
 		*retry = 1;
 		goto fail;
 	}
 
-	if(lr->u.lr_int_rtn < 0){
-		DMWARN("an error occured on the server while processing our request");
-	}
-
 	if(result)
 		*result = lr->u.lr_region_rtn;
 
@@ -463,8 +457,9 @@
 	}
 
 	if(lr) kfree(lr);
-#ifdef DEBUG
-	DMWARN("Request (%s) to server failed :: %d",
+#if 0
+	DMINFO("My (%u) request (%s) to server (%u) failed :: %d",
+	       my_id,
 	       (type == LRT_IS_CLEAN)? "LRT_IS_CLEAN":
 	       (type == LRT_IN_SYNC)? "LRT_IN_SYNC":
 	       (type == LRT_MARK_REGION)? "LRT_MARK_REGION":
@@ -475,7 +470,7 @@
 	       (type == LRT_MASTER_LEAVING)? "LRT_MASTER_LEAVING":
 	       (type == LRT_ELECTION)? "LRT_ELECTION":
 	       (type == LRT_SELECTION)? "LRT_SELECTION": "UNKNOWN",
-	       error);
+	       lc->server_id, error);
 #endif
 	return error;
 }
@@ -495,9 +490,13 @@
 	do{
 		retry = 0;
 		suspend_on(&suspend_client_queue, atomic_read(&suspend_client));
+		if ((type == LRT_MASTER_LEAVING) && (lc->server_id == 0xDEAD)) {
+			/* Nothing to do */
+			goto out;
+		}
 	election:
 		while(lc->server_id == 0xDEAD){
-			run_election(lc);
+			run_election(lc, my_id);
 			new_server = 1;
 		}
 
@@ -539,7 +538,7 @@
 				spin_unlock(&region_state_lock);
 				goto out;
 			} else {
-				DMWARN("Continuing request:: %s", 
+				DMINFO("Continuing request:: %s",
 				      (type == LRT_IS_CLEAN)? "LRT_IS_C	LEAN":
 				      (type == LRT_IN_SYNC)? "LRT_IN_SYNC":
 				      (type == LRT_MARK_REGION)? "LRT_MARK_REGION":
@@ -737,14 +736,14 @@
 	struct log_c *lc = (struct log_c *) log->context;
 
 	if (!list_empty(&clear_region_list))
-		DMERR("LEAVING WHILE REGION REQUESTS REMAIN.");
+		DMINFO("Leaving while clear region requests remain.");
 
 	list_del_init(&lc->log_list);
 	if(lc->server_id == my_id)
 		consult_server(lc, 0, LRT_MASTER_LEAVING, NULL);
 	sock_release(lc->client_sock);
 
-	if (lc->log_dev) 
+	if (lc->log_dev)
 		disk_dtr(log);
 	else
 		core_dtr(log);
@@ -755,15 +754,13 @@
 
 static int cluster_presuspend(struct dirty_log *log)
 {
-	struct log_c *lc = (struct log_c *) log->context;
+	return 0;
+}
 
-	/*
-	atomic_set(&lc->suspended, 1);
+static int cluster_postsuspend(struct dirty_log *log)
+{
+	struct log_c *lc = (struct log_c *) log->context;
 
-	if (lc->log_dev && lc->log_dev_failed)
-		complete(&lc->failure_completion);
-	else
-	*/
 	while (1) {
 		spin_lock(&region_state_lock);
 		if (list_empty(&clear_region_list)) {
@@ -775,20 +772,31 @@
 		/* Just an unnessesary call to clear out regions */
 		consult_server(lc, 0, LRT_IN_SYNC, NULL);
 	}
+	atomic_set(&lc->suspended, 1);
+	if(lc->server_id == my_id) {
+		while (1) {
+			consult_server(lc, 0, LRT_MASTER_LEAVING, NULL);
+			down(&consult_server_lock);
+			run_election(lc, 0xDEAD);
+			up(&consult_server_lock);
+			if (lc->server_id == my_id) {
+				set_current_state(TASK_INTERRUPTIBLE);
+				schedule_timeout(HZ/4);
+			} else {
+				break;
+			}
+		}
+	}
 
 	return 0;
 }
 
-static int cluster_postsuspend(struct dirty_log *log){
-	return 0;
-}
-
 static int cluster_resume(struct dirty_log *log){
 	struct log_c *lc = (struct log_c *) log->context;
 
 	lc->sync_search = 0;
 	resume_server_requests();
-	/* atomic_set(&lc->suspended, 0); */
+	atomic_set(&lc->suspended, 0);
 
 	return 0;
 }
@@ -1310,7 +1318,7 @@
 	.get_failure_response = cluster_get_failure_response,
 };
 
-#define CMIRROR_RELEASE_NAME "0.1.0"
+#define CMIRROR_RELEASE_NAME "0.2.0"
 static int __init cluster_dirty_log_init(void)
 {
 	int r = 0;
--- cluster/cmirror-kernel/src/Attic/dm-cmirror-common.h	2006/06/29 19:48:01	1.1.2.8
+++ cluster/cmirror-kernel/src/Attic/dm-cmirror-common.h	2006/07/22 22:19:34	1.1.2.9
@@ -13,10 +13,12 @@
 	sector_t sector;
 	sector_t count;
 };
+int dm_io_get(unsigned int num_pages);
+void dm_io_put(unsigned int num_pages);
 int dm_io_sync_vm(unsigned int num_regions, struct io_region *where, int rw,
                   void *data, unsigned long *error_bits);
 /* from dm.h */
-#define DM_NAME "device-mapper"
+#define DM_NAME "dm-cmirror"
 #define DMWARN(f, x...) printk(KERN_WARNING DM_NAME ": " f "\n" , ## x)
 #define DMERR(f, x...) printk(KERN_ERR DM_NAME ": " f "\n" , ## x)
 #define DMINFO(f, x...) printk(KERN_INFO DM_NAME ": " f "\n" , ## x)
@@ -110,8 +112,8 @@
 	 * Disk log fields
 	 */
 	int log_dev_failed;
-	/*
 	atomic_t suspended;
+	/*
 	struct completion failure_completion;
 	*/
 	struct dm_dev *log_dev;
--- cluster/cmirror-kernel/src/Attic/dm-cmirror-server.c	2006/07/19 14:39:12	1.1.2.13
+++ cluster/cmirror-kernel/src/Attic/dm-cmirror-server.c	2006/07/22 22:19:34	1.1.2.14
@@ -197,7 +197,7 @@
 		return 0;
 
 	return dm_io_sync_vm(1, &log->bits_location, WRITE,
-			      log->clean_bits, &ebits);
+			     log->clean_bits, &ebits);
 }
 
 static int count_bits32(uint32_t *addr, unsigned size)
@@ -656,23 +656,41 @@
 		return -1;
 	}
 
-	
-	if((lr->lr_type == LRT_MASTER_LEAVING) && 
-	   (lr->u.lr_starter == my_id) &&
-	   lr->u.lr_node_count){
-		lr->u.lr_coordinator = 0xDEAD;
-		if(!(saddr->sin_addr.s_addr = nodeid_to_ipaddr(lr->u.lr_starter))){
-			return -1;
+	if(lr->lr_type == LRT_MASTER_LEAVING) {
+		/*
+		 * if we started this and (lr->u.lr_node_count != 0),
+		 * then we have told everyone that we are leaving
+		 */
+		if ((lr->u.lr_starter == my_id) && lr->u.lr_node_count){
+			lr->u.lr_coordinator = 0xDEAD;
+			if(!(saddr->sin_addr.s_addr = nodeid_to_ipaddr(lr->u.lr_starter))){
+				return -1;
+			}
+			saddr->sin_port = lr->u.lr_starter_port;
+			return 0;
 		}
-		saddr->sin_port = lr->u.lr_starter_port;
-		return 0;
 	}
-	
+
+	/*
+	 * Check if we have access to the log.  We may not
+	 * get have loaded this device.
+	 */
 	if(!lc){
 		lr->u.lr_node_count++;
 		return 0;
 	}
-	
+
+	if(lr->lr_type == LRT_MASTER_LEAVING){
+		lc->server_id = 0xDEAD;
+		lr->u.lr_node_count++;
+		return 0;
+	}
+
+	/*
+	 * New node joins and needs to know I am the server
+	 * We shortcut the election here and respond directly
+	 * to the inquirer
+	 */
 	if(lc->server_id == my_id){
 		lr->u.lr_coordinator = my_id;
 		if(!(saddr->sin_addr.s_addr = nodeid_to_ipaddr(lr->u.lr_starter))){
@@ -681,16 +699,16 @@
 		saddr->sin_port = lr->u.lr_starter_port;
 		return 0;
 	}
-	
-	
-	if(lr->lr_type == LRT_MASTER_LEAVING){
-		lc->server_id = 0xDEAD;
-		lr->u.lr_node_count++;
-		return 0;
-	}
-	
+
 	if(lr->lr_type == LRT_ELECTION){
 		if((lr->u.lr_starter == my_id) && (lr->u.lr_node_count)){
+			/*
+			 * We started this election, and we've been
+			 * around the loop.  If the node count hasn't
+			 * changed since we started, we can proceed to
+			 * selection.  Otherwise, go again setting
+			 * ourself as the leader to start.
+			 */
 			if(node_count == lr->u.lr_node_count){
 				lr->lr_type = LRT_SELECTION;
 			} else {
@@ -700,9 +718,23 @@
 			return 0;
 		}
 
+		/*
+		 * We are in the election phase, so
+		 * if we have the lowest ID so far,
+		 * we elect ourselves for server.
+		 *
+		 * However, if the mirror is being suspended
+		 * (lc->suspended), then we leave the current
+		 * coordinator in place.
+		 *
+		 * The client must not set lc->suspended until
+		 * it has completed sending all requests.  That
+		 * way, everyone is done sending requests when
+		 * the last server is stuck holding the ball.
+		 */
 		lr->u.lr_node_count++;
 		
-		if(my_id < lr->u.lr_coordinator){
+		if((my_id < lr->u.lr_coordinator) && !atomic_read(&lc->suspended)){
 			lr->u.lr_coordinator = my_id;
 		}
 		return 0;
@@ -712,6 +744,13 @@
 			return 0;
 		}
 		
+		/*
+		 * Need to restart election if someone
+		 * has joined since we started.
+		 *
+		 * Here, we are the started, so set
+		 * node_count = 1
+		 */
 		if(lr->u.lr_node_count == node_count){
 			lr->lr_type = LRT_MASTER_ASSIGN;
 		} else {
@@ -721,12 +760,24 @@
 		}
 		lr->u.lr_node_count = 1;
 	} else if(lr->lr_type == LRT_MASTER_ASSIGN){
+		/*
+		 * If we are the server, assign it
+		 */
 		if(lr->u.lr_coordinator == my_id){
 			lc->server_id = my_id;
 		}
+
+		/*
+		 * Continue around the loop
+		 */
 		if(lr->u.lr_starter != my_id){
 			return 0;
 		}
+
+		/*
+		 * If I was the one who asked for the election,
+		 * the send the results back to the client
+		 */
 		if(!(saddr->sin_addr.s_addr = nodeid_to_ipaddr(lr->u.lr_starter))){
 			return -1;
 		}
@@ -800,18 +851,15 @@
 		}
 
 		if(!lc){
-			DMWARN("Log context can not be found for request");
 			lr.u.lr_int_rtn = -ENXIO;
 			goto reply;
 		}
 
-/*
-  if(lc->server_id != my_id){
-  DMWARN("I am not the server for this request");
-  lr.u.lr_int_rtn = -ENXIO;
-  goto reply;
-  }
-*/
+		if (lc->server_id != my_id) {
+			lr.u.lr_int_rtn = -ENXIO;
+			goto reply;
+		}
+
 		switch(lr.lr_type){
 		case LRT_IS_CLEAN:
 			error = server_is_clean(lc, &lr);
@@ -860,12 +908,23 @@
 		}
 
 		/* ATTENTION -- if error? */
+/*
 		if(error){
-			DMWARN("Error (%d) while processing request (type = %d)",
-			       error, lr.lr_type);
+			DMWARN("Error (%d) while processing request (%s)",
+			       error,
+			       (lr.lr_type == LRT_IS_CLEAN)? "LRT_IS_C	LEAN":
+			       (lr.lr_type == LRT_IN_SYNC)? "LRT_IN_SYNC":
+			       (lr.lr_type == LRT_MARK_REGION)? "LRT_MARK_REGION":
+			       (lr.lr_type == LRT_GET_RESYNC_WORK)? "LRT_GET_RESYNC_WORK":
+			       (lr.lr_type == LRT_GET_SYNC_COUNT)? "LRT_GET_SYNC_COUNT":
+			       (lr.lr_type == LRT_CLEAR_REGION)? "LRT_CLEAR_REGION":
+			       (lr.lr_type == LRT_COMPLETE_RESYNC_WORK)? "LRT_COMPLETE_RESYNC_WORK":
+			       (lr.lr_type == LRT_MASTER_LEAVING)? "LRT_MASTER_LEAVING":
+			       (lr.lr_type == LRT_ELECTION)? "LRT_ELECTION":
+			       (lr.lr_type == LRT_SELECTION)? "LRT_SELECTION": "UNKNOWN");
 			lr.u.lr_int_rtn = error;
 		}
-
+*/
 	reply:
     
 		/* Why do we need to reset this? */
@@ -940,9 +999,8 @@
 			** leaving node, it won't hurt anything - and**
 			** if there is, they will be recovered.      */
 		case SERVICE_NODE_FAILED:
-			DMINFO("A cluster mirror log member has %s",
-			       (restart_event_type == SERVICE_NODE_FAILED) ?
-			       "failed." : "left.");
+			if (restart_event_type == SERVICE_NODE_FAILED)
+				DMINFO("A cluster mirror log member has failed.");
 			
 			list_for_each_entry(lc, &log_list_head, log_list){
 				if(lc->server_id == my_id){
@@ -968,8 +1026,6 @@
 		schedule();
 	}
 
-	DMINFO("Cluster mirror log server is shutting down.");
-
 	sock_release(sock);
 	complete(&server_completion);
 	return 0;



^ permalink raw reply	[flat|nested] 40+ messages in thread

* [Cluster-devel] cluster/cmirror-kernel/src dm-cmirror-client.c ...
@ 2006-07-22 22:19 jbrassow
  0 siblings, 0 replies; 40+ messages in thread
From: jbrassow @ 2006-07-22 22:19 UTC (permalink / raw)
  To: cluster-devel.redhat.com

CVSROOT:	/cvs/cluster
Module name:	cluster
Branch: 	STABLE
Changes by:	jbrassow at sourceware.org	2006-07-22 22:19:04

Modified files:
	cmirror-kernel/src: dm-cmirror-client.c dm-cmirror-common.h 
	                    dm-cmirror-server.c 

Log message:
	Fix for bug:
	199862 - Suspending cluster mirrors can cause indefinite hang
	And is part of a fix for:
	199185 - 'lvconvert' fails to remove device-mapper devices ...
	198555 - mirror log not getting cleared causes new mirror ...
	And is likely to fix:
	199334 - cmirror removal attempt hangs and caused locking ...
	And will certainly help for:
	199498
	198821
	194137
	194125
	199635
	
	All of the above bugs will need to be reexamined when the packages
	are rebuilt.
	
	This fix allows the log server to migrate to other nodes during
	suspension.  This prevents the situation where the log server may
	has its devices suspended when it receives a request.  Trying to
	fulfill a log request while devices are suspended will lead to an
	indefinite hang, because I/O will not complete until the devices
	are unsuspended.

Patches:
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cmirror-kernel/src/dm-cmirror-client.c.diff?cvsroot=cluster&only_with_tag=STABLE&r1=1.1.4.4&r2=1.1.4.5
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cmirror-kernel/src/dm-cmirror-common.h.diff?cvsroot=cluster&only_with_tag=STABLE&r1=1.1.4.3&r2=1.1.4.4
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cmirror-kernel/src/dm-cmirror-server.c.diff?cvsroot=cluster&only_with_tag=STABLE&r1=1.1.4.4&r2=1.1.4.5

--- cluster/cmirror-kernel/src/Attic/dm-cmirror-client.c	2006/07/07 17:09:54	1.1.4.4
+++ cluster/cmirror-kernel/src/Attic/dm-cmirror-client.c	2006/07/22 22:19:04	1.1.4.5
@@ -243,7 +243,7 @@
 
 
 
-static int run_election(struct log_c *lc){
+static int run_election(struct log_c *lc, uint32_t initial_server){
 	int error=0, len;
 	struct sockaddr_in saddr_in;
 	struct msghdr msg;
@@ -255,7 +255,7 @@
 
 	lr.lr_type = LRT_ELECTION;
 	lr.u.lr_starter = my_id;
-	lr.u.lr_coordinator = my_id;
+	lr.u.lr_coordinator = initial_server;
 	memcpy(lr.lr_uuid, lc->uuid, MAX_NAME_LEN);
 
 	memset(&saddr_in, 0, sizeof(struct sockaddr_cl));
@@ -420,7 +420,6 @@
 
 	if(len <= 0){
 		/* ATTENTION -- what do we do with this ? */
-//		DMWARN("Failed to recvmsg from clustered log server");
 		error = len;
 		*retry = 1;
 		goto fail;
@@ -435,16 +434,11 @@
 	}
 
 	if(lr->u.lr_int_rtn == -ENXIO){
-		DMWARN("server tells us it no longer controls the log");
 		lc->server_id = 0xDEAD;
 		*retry = 1;
 		goto fail;
 	}
 
-	if(lr->u.lr_int_rtn < 0){
-		DMWARN("an error occured on the server while processing our request");
-	}
-
 	if(result)
 		*result = lr->u.lr_region_rtn;
 
@@ -463,8 +457,9 @@
 	}
 
 	if(lr) kfree(lr);
-#ifdef DEBUG
-	DMWARN("Request (%s) to server failed :: %d",
+#if 0
+	DMINFO("My (%u) request (%s) to server (%u) failed :: %d",
+	       my_id,
 	       (type == LRT_IS_CLEAN)? "LRT_IS_CLEAN":
 	       (type == LRT_IN_SYNC)? "LRT_IN_SYNC":
 	       (type == LRT_MARK_REGION)? "LRT_MARK_REGION":
@@ -475,7 +470,7 @@
 	       (type == LRT_MASTER_LEAVING)? "LRT_MASTER_LEAVING":
 	       (type == LRT_ELECTION)? "LRT_ELECTION":
 	       (type == LRT_SELECTION)? "LRT_SELECTION": "UNKNOWN",
-	       error);
+	       lc->server_id, error);
 #endif
 	return error;
 }
@@ -495,9 +490,13 @@
 	do{
 		retry = 0;
 		suspend_on(&suspend_client_queue, atomic_read(&suspend_client));
+		if ((type == LRT_MASTER_LEAVING) && (lc->server_id == 0xDEAD)) {
+			/* Nothing to do */
+			goto out;
+		}
 	election:
 		while(lc->server_id == 0xDEAD){
-			run_election(lc);
+			run_election(lc, my_id);
 			new_server = 1;
 		}
 
@@ -539,7 +538,7 @@
 				spin_unlock(&region_state_lock);
 				goto out;
 			} else {
-				DMWARN("Continuing request:: %s", 
+				DMINFO("Continuing request:: %s",
 				      (type == LRT_IS_CLEAN)? "LRT_IS_C	LEAN":
 				      (type == LRT_IN_SYNC)? "LRT_IN_SYNC":
 				      (type == LRT_MARK_REGION)? "LRT_MARK_REGION":
@@ -737,14 +736,14 @@
 	struct log_c *lc = (struct log_c *) log->context;
 
 	if (!list_empty(&clear_region_list))
-		DMERR("LEAVING WHILE REGION REQUESTS REMAIN.");
+		DMINFO("Leaving while clear region requests remain.");
 
 	list_del_init(&lc->log_list);
 	if(lc->server_id == my_id)
 		consult_server(lc, 0, LRT_MASTER_LEAVING, NULL);
 	sock_release(lc->client_sock);
 
-	if (lc->log_dev) 
+	if (lc->log_dev)
 		disk_dtr(log);
 	else
 		core_dtr(log);
@@ -755,15 +754,13 @@
 
 static int cluster_presuspend(struct dirty_log *log)
 {
-	struct log_c *lc = (struct log_c *) log->context;
+	return 0;
+}
 
-	/*
-	atomic_set(&lc->suspended, 1);
+static int cluster_postsuspend(struct dirty_log *log)
+{
+	struct log_c *lc = (struct log_c *) log->context;
 
-	if (lc->log_dev && lc->log_dev_failed)
-		complete(&lc->failure_completion);
-	else
-	*/
 	while (1) {
 		spin_lock(&region_state_lock);
 		if (list_empty(&clear_region_list)) {
@@ -775,20 +772,31 @@
 		/* Just an unnessesary call to clear out regions */
 		consult_server(lc, 0, LRT_IN_SYNC, NULL);
 	}
+	atomic_set(&lc->suspended, 1);
+	if(lc->server_id == my_id) {
+		while (1) {
+			consult_server(lc, 0, LRT_MASTER_LEAVING, NULL);
+			down(&consult_server_lock);
+			run_election(lc, 0xDEAD);
+			up(&consult_server_lock);
+			if (lc->server_id == my_id) {
+				set_current_state(TASK_INTERRUPTIBLE);
+				schedule_timeout(HZ/4);
+			} else {
+				break;
+			}
+		}
+	}
 
 	return 0;
 }
 
-static int cluster_postsuspend(struct dirty_log *log){
-	return 0;
-}
-
 static int cluster_resume(struct dirty_log *log){
 	struct log_c *lc = (struct log_c *) log->context;
 
 	lc->sync_search = 0;
 	resume_server_requests();
-	/* atomic_set(&lc->suspended, 0); */
+	atomic_set(&lc->suspended, 0);
 
 	return 0;
 }
@@ -1330,7 +1338,7 @@
 	.get_default_mirror = cluster_get_default_mirror,
 };
 
-#define CMIRROR_RELEASE_NAME "0.1.0"
+#define CMIRROR_RELEASE_NAME "0.2.0"
 static int __init cluster_dirty_log_init(void)
 {
 	int r = 0;
--- cluster/cmirror-kernel/src/Attic/dm-cmirror-common.h	2006/06/29 19:49:32	1.1.4.3
+++ cluster/cmirror-kernel/src/Attic/dm-cmirror-common.h	2006/07/22 22:19:04	1.1.4.4
@@ -13,10 +13,12 @@
 	sector_t sector;
 	sector_t count;
 };
+int dm_io_get(unsigned int num_pages);
+void dm_io_put(unsigned int num_pages);
 int dm_io_sync_vm(unsigned int num_regions, struct io_region *where, int rw,
                   void *data, unsigned long *error_bits);
 /* from dm.h */
-#define DM_NAME "device-mapper"
+#define DM_NAME "dm-cmirror"
 #define DMWARN(f, x...) printk(KERN_WARNING DM_NAME ": " f "\n" , ## x)
 #define DMERR(f, x...) printk(KERN_ERR DM_NAME ": " f "\n" , ## x)
 #define DMINFO(f, x...) printk(KERN_INFO DM_NAME ": " f "\n" , ## x)
@@ -111,8 +113,8 @@
 	 * Disk log fields
 	 */
 	int log_dev_failed;
-	/*
 	atomic_t suspended;
+	/*
 	struct completion failure_completion;
 	*/
 	struct dm_dev *log_dev;
--- cluster/cmirror-kernel/src/Attic/dm-cmirror-server.c	2006/07/19 14:40:15	1.1.4.4
+++ cluster/cmirror-kernel/src/Attic/dm-cmirror-server.c	2006/07/22 22:19:04	1.1.4.5
@@ -199,7 +199,7 @@
 		return 0;
 
 	return dm_io_sync_vm(1, &log->bits_location, WRITE,
-			      log->clean_bits, &ebits);
+			     log->clean_bits, &ebits);
 }
 
 static int count_bits32(uint32_t *addr, unsigned size)
@@ -676,23 +676,41 @@
 		return -1;
 	}
 
-	
-	if((lr->lr_type == LRT_MASTER_LEAVING) && 
-	   (lr->u.lr_starter == my_id) &&
-	   lr->u.lr_node_count){
-		lr->u.lr_coordinator = 0xDEAD;
-		if(!(saddr->sin_addr.s_addr = nodeid_to_ipaddr(lr->u.lr_starter))){
-			return -1;
+	if(lr->lr_type == LRT_MASTER_LEAVING) {
+		/*
+		 * if we started this and (lr->u.lr_node_count != 0),
+		 * then we have told everyone that we are leaving
+		 */
+		if ((lr->u.lr_starter == my_id) && lr->u.lr_node_count){
+			lr->u.lr_coordinator = 0xDEAD;
+			if(!(saddr->sin_addr.s_addr = nodeid_to_ipaddr(lr->u.lr_starter))){
+				return -1;
+			}
+			saddr->sin_port = lr->u.lr_starter_port;
+			return 0;
 		}
-		saddr->sin_port = lr->u.lr_starter_port;
-		return 0;
 	}
-	
+
+	/*
+	 * Check if we have access to the log.  We may not
+	 * get have loaded this device.
+	 */
 	if(!lc){
 		lr->u.lr_node_count++;
 		return 0;
 	}
-	
+
+	if(lr->lr_type == LRT_MASTER_LEAVING){
+		lc->server_id = 0xDEAD;
+		lr->u.lr_node_count++;
+		return 0;
+	}
+
+	/*
+	 * New node joins and needs to know I am the server
+	 * We shortcut the election here and respond directly
+	 * to the inquirer
+	 */
 	if(lc->server_id == my_id){
 		lr->u.lr_coordinator = my_id;
 		if(!(saddr->sin_addr.s_addr = nodeid_to_ipaddr(lr->u.lr_starter))){
@@ -701,16 +719,16 @@
 		saddr->sin_port = lr->u.lr_starter_port;
 		return 0;
 	}
-	
-	
-	if(lr->lr_type == LRT_MASTER_LEAVING){
-		lc->server_id = 0xDEAD;
-		lr->u.lr_node_count++;
-		return 0;
-	}
-	
+
 	if(lr->lr_type == LRT_ELECTION){
 		if((lr->u.lr_starter == my_id) && (lr->u.lr_node_count)){
+			/*
+			 * We started this election, and we've been
+			 * around the loop.  If the node count hasn't
+			 * changed since we started, we can proceed to
+			 * selection.  Otherwise, go again setting
+			 * ourself as the leader to start.
+			 */
 			if(node_count == lr->u.lr_node_count){
 				lr->lr_type = LRT_SELECTION;
 			} else {
@@ -720,9 +738,23 @@
 			return 0;
 		}
 
+		/*
+		 * We are in the election phase, so
+		 * if we have the lowest ID so far,
+		 * we elect ourselves for server.
+		 *
+		 * However, if the mirror is being suspended
+		 * (lc->suspended), then we leave the current
+		 * coordinator in place.
+		 *
+		 * The client must not set lc->suspended until
+		 * it has completed sending all requests.  That
+		 * way, everyone is done sending requests when
+		 * the last server is stuck holding the ball.
+		 */
 		lr->u.lr_node_count++;
 		
-		if(my_id < lr->u.lr_coordinator){
+		if((my_id < lr->u.lr_coordinator) && !atomic_read(&lc->suspended)){
 			lr->u.lr_coordinator = my_id;
 		}
 		return 0;
@@ -732,6 +764,13 @@
 			return 0;
 		}
 		
+		/*
+		 * Need to restart election if someone
+		 * has joined since we started.
+		 *
+		 * Here, we are the started, so set
+		 * node_count = 1
+		 */
 		if(lr->u.lr_node_count == node_count){
 			lr->lr_type = LRT_MASTER_ASSIGN;
 		} else {
@@ -741,12 +780,24 @@
 		}
 		lr->u.lr_node_count = 1;
 	} else if(lr->lr_type == LRT_MASTER_ASSIGN){
+		/*
+		 * If we are the server, assign it
+		 */
 		if(lr->u.lr_coordinator == my_id){
 			lc->server_id = my_id;
 		}
+
+		/*
+		 * Continue around the loop
+		 */
 		if(lr->u.lr_starter != my_id){
 			return 0;
 		}
+
+		/*
+		 * If I was the one who asked for the election,
+		 * the send the results back to the client
+		 */
 		if(!(saddr->sin_addr.s_addr = nodeid_to_ipaddr(lr->u.lr_starter))){
 			return -1;
 		}
@@ -820,18 +871,15 @@
 		}
 
 		if(!lc){
-			DMWARN("Log context can not be found for request");
 			lr.u.lr_int_rtn = -ENXIO;
 			goto reply;
 		}
 
-/*
-  if(lc->server_id != my_id){
-  DMWARN("I am not the server for this request");
-  lr.u.lr_int_rtn = -ENXIO;
-  goto reply;
-  }
-*/
+		if (lc->server_id != my_id) {
+			lr.u.lr_int_rtn = -ENXIO;
+			goto reply;
+		}
+
 		switch(lr.lr_type){
 		case LRT_IS_CLEAN:
 			error = server_is_clean(lc, &lr);
@@ -886,12 +934,23 @@
 		}
 
 		/* ATTENTION -- if error? */
+/*
 		if(error){
-			DMWARN("Error (%d) while processing request (type = %d)",
-			       error, lr.lr_type);
+			DMWARN("Error (%d) while processing request (%s)",
+			       error,
+			       (lr.lr_type == LRT_IS_CLEAN)? "LRT_IS_C	LEAN":
+			       (lr.lr_type == LRT_IN_SYNC)? "LRT_IN_SYNC":
+			       (lr.lr_type == LRT_MARK_REGION)? "LRT_MARK_REGION":
+			       (lr.lr_type == LRT_GET_RESYNC_WORK)? "LRT_GET_RESYNC_WORK":
+			       (lr.lr_type == LRT_GET_SYNC_COUNT)? "LRT_GET_SYNC_COUNT":
+			       (lr.lr_type == LRT_CLEAR_REGION)? "LRT_CLEAR_REGION":
+			       (lr.lr_type == LRT_COMPLETE_RESYNC_WORK)? "LRT_COMPLETE_RESYNC_WORK":
+			       (lr.lr_type == LRT_MASTER_LEAVING)? "LRT_MASTER_LEAVING":
+			       (lr.lr_type == LRT_ELECTION)? "LRT_ELECTION":
+			       (lr.lr_type == LRT_SELECTION)? "LRT_SELECTION": "UNKNOWN");
 			lr.u.lr_int_rtn = error;
 		}
-
+*/
 	reply:
     
 		/* Why do we need to reset this? */
@@ -966,9 +1025,8 @@
 			** leaving node, it won't hurt anything - and**
 			** if there is, they will be recovered.      */
 		case SERVICE_NODE_FAILED:
-			DMINFO("A cluster mirror log member has %s",
-			       (restart_event_type == SERVICE_NODE_FAILED) ?
-			       "failed." : "left.");
+			if (restart_event_type == SERVICE_NODE_FAILED)
+				DMINFO("A cluster mirror log member has failed.");
 			
 			list_for_each_entry(lc, &log_list_head, log_list){
 				if(lc->server_id == my_id){
@@ -994,8 +1052,6 @@
 		schedule();
 	}
 
-	DMINFO("Cluster mirror log server is shutting down.");
-
 	sock_release(sock);
 	complete(&server_completion);
 	return 0;



^ permalink raw reply	[flat|nested] 40+ messages in thread

* [Cluster-devel] cluster/cmirror-kernel/src dm-cmirror-client.c ...
@ 2006-07-22 22:12 jbrassow
  0 siblings, 0 replies; 40+ messages in thread
From: jbrassow @ 2006-07-22 22:12 UTC (permalink / raw)
  To: cluster-devel.redhat.com

CVSROOT:	/cvs/cluster
Module name:	cluster
Branch: 	RHEL4U4
Changes by:	jbrassow at sourceware.org	2006-07-22 22:12:33

Modified files:
	cmirror-kernel/src: dm-cmirror-client.c dm-cmirror-common.h 
	                    dm-cmirror-server.c 

Log message:
	Fix for bug:
	199862 - Suspending cluster mirrors can cause indefinite hang
	And is part of a fix for:
	199185 - 'lvconvert' fails to remove device-mapper devices ...
	198555 - mirror log not getting cleared causes new mirror ...
	And is likely to fix:
	199334 - cmirror removal attempt hangs and caused locking ...
	And will certainly help for:
	199498
	198821
	194137
	194125
	199635
	
	All of the above bugs will need to be reexamined when the packages
	are rebuilt.
	
	This fix allows the log server to migrate to other nodes during
	suspension.  This prevents the situation where the log server may
	has its devices suspended when it receives a request.  Trying to
	fulfill a log request while devices are suspended will lead to an
	indefinite hang, because I/O will not complete until the devices
	are unsuspended.

Patches:
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cmirror-kernel/src/dm-cmirror-client.c.diff?cvsroot=cluster&only_with_tag=RHEL4U4&r1=1.1.2.19.2.2&r2=1.1.2.19.2.3
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cmirror-kernel/src/dm-cmirror-common.h.diff?cvsroot=cluster&only_with_tag=RHEL4U4&r1=1.1.2.7.2.1&r2=1.1.2.7.2.2
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cmirror-kernel/src/dm-cmirror-server.c.diff?cvsroot=cluster&only_with_tag=RHEL4U4&r1=1.1.2.9.2.4&r2=1.1.2.9.2.5

--- cluster/cmirror-kernel/src/Attic/dm-cmirror-client.c	2006/07/07 17:12:22	1.1.2.19.2.2
+++ cluster/cmirror-kernel/src/Attic/dm-cmirror-client.c	2006/07/22 22:12:32	1.1.2.19.2.3
@@ -243,7 +243,7 @@
 
 
 
-static int run_election(struct log_c *lc){
+static int run_election(struct log_c *lc, uint32_t initial_server){
 	int error=0, len;
 	struct sockaddr_in saddr_in;
 	struct msghdr msg;
@@ -255,7 +255,7 @@
 
 	lr.lr_type = LRT_ELECTION;
 	lr.u.lr_starter = my_id;
-	lr.u.lr_coordinator = my_id;
+	lr.u.lr_coordinator = initial_server;
 	memcpy(lr.lr_uuid, lc->uuid, MAX_NAME_LEN);
 
 	memset(&saddr_in, 0, sizeof(struct sockaddr_cl));
@@ -420,7 +420,6 @@
 
 	if(len <= 0){
 		/* ATTENTION -- what do we do with this ? */
-//		DMWARN("Failed to recvmsg from clustered log server");
 		error = len;
 		*retry = 1;
 		goto fail;
@@ -435,16 +434,11 @@
 	}
 
 	if(lr->u.lr_int_rtn == -ENXIO){
-		DMWARN("server tells us it no longer controls the log");
 		lc->server_id = 0xDEAD;
 		*retry = 1;
 		goto fail;
 	}
 
-	if(lr->u.lr_int_rtn < 0){
-		DMWARN("an error occured on the server while processing our request");
-	}
-
 	if(result)
 		*result = lr->u.lr_region_rtn;
 
@@ -463,8 +457,9 @@
 	}
 
 	if(lr) kfree(lr);
-#ifdef DEBUG
-	DMWARN("Request (%s) to server failed :: %d",
+#if 0
+	DMINFO("My (%u) request (%s) to server (%u) failed :: %d",
+	       my_id,
 	       (type == LRT_IS_CLEAN)? "LRT_IS_CLEAN":
 	       (type == LRT_IN_SYNC)? "LRT_IN_SYNC":
 	       (type == LRT_MARK_REGION)? "LRT_MARK_REGION":
@@ -475,7 +470,7 @@
 	       (type == LRT_MASTER_LEAVING)? "LRT_MASTER_LEAVING":
 	       (type == LRT_ELECTION)? "LRT_ELECTION":
 	       (type == LRT_SELECTION)? "LRT_SELECTION": "UNKNOWN",
-	       error);
+	       lc->server_id, error);
 #endif
 	return error;
 }
@@ -495,9 +490,13 @@
 	do{
 		retry = 0;
 		suspend_on(&suspend_client_queue, atomic_read(&suspend_client));
+		if ((type == LRT_MASTER_LEAVING) && (lc->server_id == 0xDEAD)) {
+			/* Nothing to do */
+			goto out;
+		}
 	election:
 		while(lc->server_id == 0xDEAD){
-			run_election(lc);
+			run_election(lc, my_id);
 			new_server = 1;
 		}
 
@@ -539,7 +538,7 @@
 				spin_unlock(&region_state_lock);
 				goto out;
 			} else {
-				DMWARN("Continuing request:: %s", 
+				DMINFO("Continuing request:: %s",
 				      (type == LRT_IS_CLEAN)? "LRT_IS_C	LEAN":
 				      (type == LRT_IN_SYNC)? "LRT_IN_SYNC":
 				      (type == LRT_MARK_REGION)? "LRT_MARK_REGION":
@@ -737,14 +736,14 @@
 	struct log_c *lc = (struct log_c *) log->context;
 
 	if (!list_empty(&clear_region_list))
-		DMERR("LEAVING WHILE REGION REQUESTS REMAIN.");
+		DMINFO("Leaving while clear region requests remain.");
 
 	list_del_init(&lc->log_list);
 	if(lc->server_id == my_id)
 		consult_server(lc, 0, LRT_MASTER_LEAVING, NULL);
 	sock_release(lc->client_sock);
 
-	if (lc->log_dev) 
+	if (lc->log_dev)
 		disk_dtr(log);
 	else
 		core_dtr(log);
@@ -755,15 +754,13 @@
 
 static int cluster_presuspend(struct dirty_log *log)
 {
-	struct log_c *lc = (struct log_c *) log->context;
+	return 0;
+}
 
-	/*
-	atomic_set(&lc->suspended, 1);
+static int cluster_postsuspend(struct dirty_log *log)
+{
+	struct log_c *lc = (struct log_c *) log->context;
 
-	if (lc->log_dev && lc->log_dev_failed)
-		complete(&lc->failure_completion);
-	else
-	*/
 	while (1) {
 		spin_lock(&region_state_lock);
 		if (list_empty(&clear_region_list)) {
@@ -775,20 +772,31 @@
 		/* Just an unnessesary call to clear out regions */
 		consult_server(lc, 0, LRT_IN_SYNC, NULL);
 	}
+	atomic_set(&lc->suspended, 1);
+	if(lc->server_id == my_id) {
+		while (1) {
+			consult_server(lc, 0, LRT_MASTER_LEAVING, NULL);
+			down(&consult_server_lock);
+			run_election(lc, 0xDEAD);
+			up(&consult_server_lock);
+			if (lc->server_id == my_id) {
+				set_current_state(TASK_INTERRUPTIBLE);
+				schedule_timeout(HZ/4);
+			} else {
+				break;
+			}
+		}
+	}
 
 	return 0;
 }
 
-static int cluster_postsuspend(struct dirty_log *log){
-	return 0;
-}
-
 static int cluster_resume(struct dirty_log *log){
 	struct log_c *lc = (struct log_c *) log->context;
 
 	lc->sync_search = 0;
 	resume_server_requests();
-	/* atomic_set(&lc->suspended, 0); */
+	atomic_set(&lc->suspended, 0);
 
 	return 0;
 }
@@ -1310,7 +1318,7 @@
 	.get_failure_response = cluster_get_failure_response,
 };
 
-#define CMIRROR_RELEASE_NAME "0.1.0"
+#define CMIRROR_RELEASE_NAME "0.2.0"
 static int __init cluster_dirty_log_init(void)
 {
 	int r = 0;
--- cluster/cmirror-kernel/src/Attic/dm-cmirror-common.h	2006/06/29 19:46:37	1.1.2.7.2.1
+++ cluster/cmirror-kernel/src/Attic/dm-cmirror-common.h	2006/07/22 22:12:32	1.1.2.7.2.2
@@ -13,10 +13,12 @@
 	sector_t sector;
 	sector_t count;
 };
+int dm_io_get(unsigned int num_pages);
+void dm_io_put(unsigned int num_pages);
 int dm_io_sync_vm(unsigned int num_regions, struct io_region *where, int rw,
                   void *data, unsigned long *error_bits);
 /* from dm.h */
-#define DM_NAME "device-mapper"
+#define DM_NAME "dm-cmirror"
 #define DMWARN(f, x...) printk(KERN_WARNING DM_NAME ": " f "\n" , ## x)
 #define DMERR(f, x...) printk(KERN_ERR DM_NAME ": " f "\n" , ## x)
 #define DMINFO(f, x...) printk(KERN_INFO DM_NAME ": " f "\n" , ## x)
@@ -110,8 +112,8 @@
 	 * Disk log fields
 	 */
 	int log_dev_failed;
-	/*
 	atomic_t suspended;
+	/*
 	struct completion failure_completion;
 	*/
 	struct dm_dev *log_dev;
--- cluster/cmirror-kernel/src/Attic/dm-cmirror-server.c	2006/07/19 14:38:20	1.1.2.9.2.4
+++ cluster/cmirror-kernel/src/Attic/dm-cmirror-server.c	2006/07/22 22:12:32	1.1.2.9.2.5
@@ -197,7 +197,7 @@
 		return 0;
 
 	return dm_io_sync_vm(1, &log->bits_location, WRITE,
-			      log->clean_bits, &ebits);
+			     log->clean_bits, &ebits);
 }
 
 static int count_bits32(uint32_t *addr, unsigned size)
@@ -656,23 +656,41 @@
 		return -1;
 	}
 
-	
-	if((lr->lr_type == LRT_MASTER_LEAVING) && 
-	   (lr->u.lr_starter == my_id) &&
-	   lr->u.lr_node_count){
-		lr->u.lr_coordinator = 0xDEAD;
-		if(!(saddr->sin_addr.s_addr = nodeid_to_ipaddr(lr->u.lr_starter))){
-			return -1;
+	if(lr->lr_type == LRT_MASTER_LEAVING) {
+		/*
+		 * if we started this and (lr->u.lr_node_count != 0),
+		 * then we have told everyone that we are leaving
+		 */
+		if ((lr->u.lr_starter == my_id) && lr->u.lr_node_count){
+			lr->u.lr_coordinator = 0xDEAD;
+			if(!(saddr->sin_addr.s_addr = nodeid_to_ipaddr(lr->u.lr_starter))){
+				return -1;
+			}
+			saddr->sin_port = lr->u.lr_starter_port;
+			return 0;
 		}
-		saddr->sin_port = lr->u.lr_starter_port;
-		return 0;
 	}
-	
+
+	/*
+	 * Check if we have access to the log.  We may not
+	 * get have loaded this device.
+	 */
 	if(!lc){
 		lr->u.lr_node_count++;
 		return 0;
 	}
-	
+
+	if(lr->lr_type == LRT_MASTER_LEAVING){
+		lc->server_id = 0xDEAD;
+		lr->u.lr_node_count++;
+		return 0;
+	}
+
+	/*
+	 * New node joins and needs to know I am the server
+	 * We shortcut the election here and respond directly
+	 * to the inquirer
+	 */
 	if(lc->server_id == my_id){
 		lr->u.lr_coordinator = my_id;
 		if(!(saddr->sin_addr.s_addr = nodeid_to_ipaddr(lr->u.lr_starter))){
@@ -681,16 +699,16 @@
 		saddr->sin_port = lr->u.lr_starter_port;
 		return 0;
 	}
-	
-	
-	if(lr->lr_type == LRT_MASTER_LEAVING){
-		lc->server_id = 0xDEAD;
-		lr->u.lr_node_count++;
-		return 0;
-	}
-	
+
 	if(lr->lr_type == LRT_ELECTION){
 		if((lr->u.lr_starter == my_id) && (lr->u.lr_node_count)){
+			/*
+			 * We started this election, and we've been
+			 * around the loop.  If the node count hasn't
+			 * changed since we started, we can proceed to
+			 * selection.  Otherwise, go again setting
+			 * ourself as the leader to start.
+			 */
 			if(node_count == lr->u.lr_node_count){
 				lr->lr_type = LRT_SELECTION;
 			} else {
@@ -700,9 +718,23 @@
 			return 0;
 		}
 
+		/*
+		 * We are in the election phase, so
+		 * if we have the lowest ID so far,
+		 * we elect ourselves for server.
+		 *
+		 * However, if the mirror is being suspended
+		 * (lc->suspended), then we leave the current
+		 * coordinator in place.
+		 *
+		 * The client must not set lc->suspended until
+		 * it has completed sending all requests.  That
+		 * way, everyone is done sending requests when
+		 * the last server is stuck holding the ball.
+		 */
 		lr->u.lr_node_count++;
 		
-		if(my_id < lr->u.lr_coordinator){
+		if((my_id < lr->u.lr_coordinator) && !atomic_read(&lc->suspended)){
 			lr->u.lr_coordinator = my_id;
 		}
 		return 0;
@@ -712,6 +744,13 @@
 			return 0;
 		}
 		
+		/*
+		 * Need to restart election if someone
+		 * has joined since we started.
+		 *
+		 * Here, we are the started, so set
+		 * node_count = 1
+		 */
 		if(lr->u.lr_node_count == node_count){
 			lr->lr_type = LRT_MASTER_ASSIGN;
 		} else {
@@ -721,12 +760,24 @@
 		}
 		lr->u.lr_node_count = 1;
 	} else if(lr->lr_type == LRT_MASTER_ASSIGN){
+		/*
+		 * If we are the server, assign it
+		 */
 		if(lr->u.lr_coordinator == my_id){
 			lc->server_id = my_id;
 		}
+
+		/*
+		 * Continue around the loop
+		 */
 		if(lr->u.lr_starter != my_id){
 			return 0;
 		}
+
+		/*
+		 * If I was the one who asked for the election,
+		 * the send the results back to the client
+		 */
 		if(!(saddr->sin_addr.s_addr = nodeid_to_ipaddr(lr->u.lr_starter))){
 			return -1;
 		}
@@ -800,18 +851,15 @@
 		}
 
 		if(!lc){
-			DMWARN("Log context can not be found for request");
 			lr.u.lr_int_rtn = -ENXIO;
 			goto reply;
 		}
 
-/*
-  if(lc->server_id != my_id){
-  DMWARN("I am not the server for this request");
-  lr.u.lr_int_rtn = -ENXIO;
-  goto reply;
-  }
-*/
+		if (lc->server_id != my_id) {
+			lr.u.lr_int_rtn = -ENXIO;
+			goto reply;
+		}
+
 		switch(lr.lr_type){
 		case LRT_IS_CLEAN:
 			error = server_is_clean(lc, &lr);
@@ -860,12 +908,23 @@
 		}
 
 		/* ATTENTION -- if error? */
+/*
 		if(error){
-			DMWARN("Error (%d) while processing request (type = %d)",
-			       error, lr.lr_type);
+			DMWARN("Error (%d) while processing request (%s)",
+			       error,
+			       (lr.lr_type == LRT_IS_CLEAN)? "LRT_IS_C	LEAN":
+			       (lr.lr_type == LRT_IN_SYNC)? "LRT_IN_SYNC":
+			       (lr.lr_type == LRT_MARK_REGION)? "LRT_MARK_REGION":
+			       (lr.lr_type == LRT_GET_RESYNC_WORK)? "LRT_GET_RESYNC_WORK":
+			       (lr.lr_type == LRT_GET_SYNC_COUNT)? "LRT_GET_SYNC_COUNT":
+			       (lr.lr_type == LRT_CLEAR_REGION)? "LRT_CLEAR_REGION":
+			       (lr.lr_type == LRT_COMPLETE_RESYNC_WORK)? "LRT_COMPLETE_RESYNC_WORK":
+			       (lr.lr_type == LRT_MASTER_LEAVING)? "LRT_MASTER_LEAVING":
+			       (lr.lr_type == LRT_ELECTION)? "LRT_ELECTION":
+			       (lr.lr_type == LRT_SELECTION)? "LRT_SELECTION": "UNKNOWN");
 			lr.u.lr_int_rtn = error;
 		}
-
+*/
 	reply:
     
 		/* Why do we need to reset this? */
@@ -940,9 +999,8 @@
 			** leaving node, it won't hurt anything - and**
 			** if there is, they will be recovered.      */
 		case SERVICE_NODE_FAILED:
-			DMINFO("A cluster mirror log member has %s",
-			       (restart_event_type == SERVICE_NODE_FAILED) ?
-			       "failed." : "left.");
+			if (restart_event_type == SERVICE_NODE_FAILED)
+				DMINFO("A cluster mirror log member has failed.");
 			
 			list_for_each_entry(lc, &log_list_head, log_list){
 				if(lc->server_id == my_id){
@@ -968,8 +1026,6 @@
 		schedule();
 	}
 
-	DMINFO("Cluster mirror log server is shutting down.");
-
 	sock_release(sock);
 	complete(&server_completion);
 	return 0;



^ permalink raw reply	[flat|nested] 40+ messages in thread

* [Cluster-devel] cluster/cmirror-kernel/src dm-cmirror-client.c ...
@ 2006-06-29 19:49 jbrassow
  0 siblings, 0 replies; 40+ messages in thread
From: jbrassow @ 2006-06-29 19:49 UTC (permalink / raw)
  To: cluster-devel.redhat.com

CVSROOT:	/cvs/cluster
Module name:	cluster
Branch: 	STABLE
Changes by:	jbrassow at sourceware.org	2006-06-29 19:49:32

Modified files:
	cmirror-kernel/src: dm-cmirror-client.c dm-cmirror-common.h 
	                    dm-cmirror-server.c 

Log message:
	- fix for bug 197263
	
	The cluster mirror 'flush' logging function was still behaving the
	way it used to before the changes to the kernel that allowed it to
	receive the status of a log flush from 'flush'.  This could result
	in an indefinite suspension of the mirror on which a log device had
	failed.

Patches:
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cmirror-kernel/src/dm-cmirror-client.c.diff?cvsroot=cluster&only_with_tag=STABLE&r1=1.1.4.2&r2=1.1.4.3
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cmirror-kernel/src/dm-cmirror-common.h.diff?cvsroot=cluster&only_with_tag=STABLE&r1=1.1.4.2&r2=1.1.4.3
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cmirror-kernel/src/dm-cmirror-server.c.diff?cvsroot=cluster&only_with_tag=STABLE&r1=1.1.4.2&r2=1.1.4.3

--- cluster/cmirror-kernel/src/Attic/dm-cmirror-client.c	2006/06/27 20:19:53	1.1.4.2
+++ cluster/cmirror-kernel/src/Attic/dm-cmirror-client.c	2006/06/29 19:49:32	1.1.4.3
@@ -202,7 +202,7 @@
 	lc = (struct log_c *) log->context;
 	lc->log_dev = dev;
 	lc->log_dev_failed = 0;
-	init_completion(&lc->failure_completion);
+	/* init_completion(&lc->failure_completion); */
 
 	/* setup the disk header fields */
 	lc->header_location.bdev = lc->log_dev->bdev;
@@ -757,22 +757,23 @@
 {
 	struct log_c *lc = (struct log_c *) log->context;
 
+	/*
 	atomic_set(&lc->suspended, 1);
-	
+
 	if (lc->log_dev && lc->log_dev_failed)
 		complete(&lc->failure_completion);
-	else {
-		while (1) {
-			spin_lock(&region_state_lock);
-			if (list_empty(&clear_region_list)) {
-				spin_unlock(&region_state_lock);
-				break;
-			}
+	else
+	*/
+	while (1) {
+		spin_lock(&region_state_lock);
+		if (list_empty(&clear_region_list)) {
 			spin_unlock(&region_state_lock);
-
-			/* Just an unnessesary call to clear out regions */
-			consult_server(lc, 0, LRT_IN_SYNC, NULL);
+			break;
 		}
+		spin_unlock(&region_state_lock);
+
+		/* Just an unnessesary call to clear out regions */
+		consult_server(lc, 0, LRT_IN_SYNC, NULL);
 	}
 
 	return 0;
@@ -787,7 +788,7 @@
 
 	lc->sync_search = 0;
 	resume_server_requests();
-	atomic_set(&lc->suspended, 0);
+	/* atomic_set(&lc->suspended, 0); */
 
 	return 0;
 }
@@ -833,8 +834,10 @@
 
 static int cluster_flush(struct dirty_log *log)
 {
+	struct log_c *lc = (struct log_c *) log->context;
+
 	/* FIXME:  flush all clear_region requests to server */
-	return 0;
+	return (lc->log_dev_failed) ? -EIO : 0;
 }
 
 static void cluster_mark_region(struct dirty_log *log, region_t region)
@@ -895,14 +898,20 @@
 		DMWARN("unable to get server (%u) to mark region (%Lu)",
 		       lc->server_id, region);
 		DMWARN("Reason :: %d", error);
+		if (error == -EIO) {
+			lc->log_dev_failed = 1;
+			break;
+		}
 	}
 
 	if (lc->log_dev_failed) {
 		DMERR("Write failed on mirror log device, %s",
 		      lc->log_dev->name);
 		dm_table_event(lc->ti->table);
+		/*
 		if (!atomic_read(&lc->suspended))
 			wait_for_completion(&lc->failure_completion);
+		*/
 	}
 	return;
 }
--- cluster/cmirror-kernel/src/Attic/dm-cmirror-common.h	2006/06/27 20:19:53	1.1.4.2
+++ cluster/cmirror-kernel/src/Attic/dm-cmirror-common.h	2006/06/29 19:49:32	1.1.4.3
@@ -111,8 +111,10 @@
 	 * Disk log fields
 	 */
 	int log_dev_failed;
+	/*
 	atomic_t suspended;
 	struct completion failure_completion;
+	*/
 	struct dm_dev *log_dev;
 	struct log_header header;
 
--- cluster/cmirror-kernel/src/Attic/dm-cmirror-server.c	2006/06/27 20:19:53	1.1.4.2
+++ cluster/cmirror-kernel/src/Attic/dm-cmirror-server.c	2006/06/29 19:49:32	1.1.4.3
@@ -508,7 +508,7 @@
 		mempool_free(new, region_user_pool);
 	}
 
-	return 0;
+	return (lc->log_dev_failed) ? -EIO : 0;
 }
 
 



^ permalink raw reply	[flat|nested] 40+ messages in thread

* [Cluster-devel] cluster/cmirror-kernel/src dm-cmirror-client.c ...
@ 2006-06-29 19:48 jbrassow
  0 siblings, 0 replies; 40+ messages in thread
From: jbrassow @ 2006-06-29 19:48 UTC (permalink / raw)
  To: cluster-devel.redhat.com

CVSROOT:	/cvs/cluster
Module name:	cluster
Branch: 	RHEL4
Changes by:	jbrassow at sourceware.org	2006-06-29 19:48:01

Modified files:
	cmirror-kernel/src: dm-cmirror-client.c dm-cmirror-common.h 
	                    dm-cmirror-server.c 

Log message:
	- fix for bug 197263
	
	The cluster mirror 'flush' logging function was still behaving the
	way it used to before the changes to the kernel that allowed it to
	receive the status of a log flush from 'flush'.  This could result
	in an indefinite suspension of the mirror on which a log device had
	failed.

Patches:
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cmirror-kernel/src/dm-cmirror-client.c.diff?cvsroot=cluster&only_with_tag=RHEL4&r1=1.1.2.19&r2=1.1.2.20
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cmirror-kernel/src/dm-cmirror-common.h.diff?cvsroot=cluster&only_with_tag=RHEL4&r1=1.1.2.7&r2=1.1.2.8
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cmirror-kernel/src/dm-cmirror-server.c.diff?cvsroot=cluster&only_with_tag=RHEL4&r1=1.1.2.11&r2=1.1.2.12

--- cluster/cmirror-kernel/src/Attic/dm-cmirror-client.c	2006/06/15 19:48:00	1.1.2.19
+++ cluster/cmirror-kernel/src/Attic/dm-cmirror-client.c	2006/06/29 19:48:01	1.1.2.20
@@ -202,7 +202,7 @@
 	lc = (struct log_c *) log->context;
 	lc->log_dev = dev;
 	lc->log_dev_failed = 0;
-	init_completion(&lc->failure_completion);
+	/* init_completion(&lc->failure_completion); */
 
 	/* setup the disk header fields */
 	lc->header_location.bdev = lc->log_dev->bdev;
@@ -757,22 +757,23 @@
 {
 	struct log_c *lc = (struct log_c *) log->context;
 
+	/*
 	atomic_set(&lc->suspended, 1);
-	
+
 	if (lc->log_dev && lc->log_dev_failed)
 		complete(&lc->failure_completion);
-	else {
-		while (1) {
-			spin_lock(&region_state_lock);
-			if (list_empty(&clear_region_list)) {
-				spin_unlock(&region_state_lock);
-				break;
-			}
+	else
+	*/
+	while (1) {
+		spin_lock(&region_state_lock);
+		if (list_empty(&clear_region_list)) {
 			spin_unlock(&region_state_lock);
-
-			/* Just an unnessesary call to clear out regions */
-			consult_server(lc, 0, LRT_IN_SYNC, NULL);
+			break;
 		}
+		spin_unlock(&region_state_lock);
+
+		/* Just an unnessesary call to clear out regions */
+		consult_server(lc, 0, LRT_IN_SYNC, NULL);
 	}
 
 	return 0;
@@ -787,7 +788,7 @@
 
 	lc->sync_search = 0;
 	resume_server_requests();
-	atomic_set(&lc->suspended, 0);
+	/* atomic_set(&lc->suspended, 0); */
 
 	return 0;
 }
@@ -847,8 +848,10 @@
 
 static int cluster_flush(struct dirty_log *log)
 {
+	struct log_c *lc = (struct log_c *) log->context;
+
 	/* FIXME:  flush all clear_region requests to server */
-	return 0;
+	return (lc->log_dev_failed) ? -EIO : 0;
 }
 
 static void cluster_mark_region(struct dirty_log *log, region_t region)
@@ -909,14 +912,20 @@
 		DMWARN("unable to get server (%u) to mark region (%Lu)",
 		       lc->server_id, region);
 		DMWARN("Reason :: %d", error);
+		if (error == -EIO) {
+			lc->log_dev_failed = 1;
+			break;
+		}
 	}
 
 	if (lc->log_dev_failed) {
 		DMERR("Write failed on mirror log device, %s",
 		      lc->log_dev->name);
 		dm_table_event(lc->ti->table);
+		/*
 		if (!atomic_read(&lc->suspended))
 			wait_for_completion(&lc->failure_completion);
+		*/
 	}
 	return;
 }
--- cluster/cmirror-kernel/src/Attic/dm-cmirror-common.h	2006/02/16 18:34:05	1.1.2.7
+++ cluster/cmirror-kernel/src/Attic/dm-cmirror-common.h	2006/06/29 19:48:01	1.1.2.8
@@ -110,8 +110,10 @@
 	 * Disk log fields
 	 */
 	int log_dev_failed;
+	/*
 	atomic_t suspended;
 	struct completion failure_completion;
+	*/
 	struct dm_dev *log_dev;
 	struct log_header header;
 
--- cluster/cmirror-kernel/src/Attic/dm-cmirror-server.c	2006/06/27 20:24:59	1.1.2.11
+++ cluster/cmirror-kernel/src/Attic/dm-cmirror-server.c	2006/06/29 19:48:01	1.1.2.12
@@ -519,7 +519,7 @@
 		mempool_free(new, region_user_pool);
 	}
 
-	return 0;
+	return (lc->log_dev_failed) ? -EIO : 0;
 }
 
 



^ permalink raw reply	[flat|nested] 40+ messages in thread

* [Cluster-devel] cluster/cmirror-kernel/src dm-cmirror-client.c ...
@ 2006-06-29 19:46 jbrassow
  0 siblings, 0 replies; 40+ messages in thread
From: jbrassow @ 2006-06-29 19:46 UTC (permalink / raw)
  To: cluster-devel.redhat.com

CVSROOT:	/cvs/cluster
Module name:	cluster
Branch: 	RHEL4U4
Changes by:	jbrassow at sourceware.org	2006-06-29 19:46:37

Modified files:
	cmirror-kernel/src: dm-cmirror-client.c dm-cmirror-common.h 
	                    dm-cmirror-server.c 

Log message:
	- fix for bug 197263
	
	The cluster mirror 'flush' logging function was still behaving the
	way it used to before the changes to the kernel that allowed it to
	receive the status of a log flush from 'flush'.  This could result
	in an indefinite suspension of the mirror on which a log device had
	failed.

Patches:
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cmirror-kernel/src/dm-cmirror-client.c.diff?cvsroot=cluster&only_with_tag=RHEL4U4&r1=1.1.2.19&r2=1.1.2.19.2.1
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cmirror-kernel/src/dm-cmirror-common.h.diff?cvsroot=cluster&only_with_tag=RHEL4U4&r1=1.1.2.7&r2=1.1.2.7.2.1
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cmirror-kernel/src/dm-cmirror-server.c.diff?cvsroot=cluster&only_with_tag=RHEL4U4&r1=1.1.2.9.2.2&r2=1.1.2.9.2.3

--- cluster/cmirror-kernel/src/Attic/dm-cmirror-client.c	2006/06/15 19:48:00	1.1.2.19
+++ cluster/cmirror-kernel/src/Attic/dm-cmirror-client.c	2006/06/29 19:46:37	1.1.2.19.2.1
@@ -202,7 +202,7 @@
 	lc = (struct log_c *) log->context;
 	lc->log_dev = dev;
 	lc->log_dev_failed = 0;
-	init_completion(&lc->failure_completion);
+	/* init_completion(&lc->failure_completion); */
 
 	/* setup the disk header fields */
 	lc->header_location.bdev = lc->log_dev->bdev;
@@ -757,22 +757,23 @@
 {
 	struct log_c *lc = (struct log_c *) log->context;
 
+	/*
 	atomic_set(&lc->suspended, 1);
-	
+
 	if (lc->log_dev && lc->log_dev_failed)
 		complete(&lc->failure_completion);
-	else {
-		while (1) {
-			spin_lock(&region_state_lock);
-			if (list_empty(&clear_region_list)) {
-				spin_unlock(&region_state_lock);
-				break;
-			}
+	else
+	*/
+	while (1) {
+		spin_lock(&region_state_lock);
+		if (list_empty(&clear_region_list)) {
 			spin_unlock(&region_state_lock);
-
-			/* Just an unnessesary call to clear out regions */
-			consult_server(lc, 0, LRT_IN_SYNC, NULL);
+			break;
 		}
+		spin_unlock(&region_state_lock);
+
+		/* Just an unnessesary call to clear out regions */
+		consult_server(lc, 0, LRT_IN_SYNC, NULL);
 	}
 
 	return 0;
@@ -787,7 +788,7 @@
 
 	lc->sync_search = 0;
 	resume_server_requests();
-	atomic_set(&lc->suspended, 0);
+	/* atomic_set(&lc->suspended, 0); */
 
 	return 0;
 }
@@ -847,8 +848,10 @@
 
 static int cluster_flush(struct dirty_log *log)
 {
+	struct log_c *lc = (struct log_c *) log->context;
+
 	/* FIXME:  flush all clear_region requests to server */
-	return 0;
+	return (lc->log_dev_failed) ? -EIO : 0;
 }
 
 static void cluster_mark_region(struct dirty_log *log, region_t region)
@@ -909,14 +912,20 @@
 		DMWARN("unable to get server (%u) to mark region (%Lu)",
 		       lc->server_id, region);
 		DMWARN("Reason :: %d", error);
+		if (error == -EIO) {
+			lc->log_dev_failed = 1;
+			break;
+		}
 	}
 
 	if (lc->log_dev_failed) {
 		DMERR("Write failed on mirror log device, %s",
 		      lc->log_dev->name);
 		dm_table_event(lc->ti->table);
+		/*
 		if (!atomic_read(&lc->suspended))
 			wait_for_completion(&lc->failure_completion);
+		*/
 	}
 	return;
 }
--- cluster/cmirror-kernel/src/Attic/dm-cmirror-common.h	2006/02/16 18:34:05	1.1.2.7
+++ cluster/cmirror-kernel/src/Attic/dm-cmirror-common.h	2006/06/29 19:46:37	1.1.2.7.2.1
@@ -110,8 +110,10 @@
 	 * Disk log fields
 	 */
 	int log_dev_failed;
+	/*
 	atomic_t suspended;
 	struct completion failure_completion;
+	*/
 	struct dm_dev *log_dev;
 	struct log_header header;
 
--- cluster/cmirror-kernel/src/Attic/dm-cmirror-server.c	2006/06/27 20:26:02	1.1.2.9.2.2
+++ cluster/cmirror-kernel/src/Attic/dm-cmirror-server.c	2006/06/29 19:46:37	1.1.2.9.2.3
@@ -519,7 +519,7 @@
 		mempool_free(new, region_user_pool);
 	}
 
-	return 0;
+	return (lc->log_dev_failed) ? -EIO : 0;
 }
 
 



^ permalink raw reply	[flat|nested] 40+ messages in thread

* [Cluster-devel] cluster/cmirror-kernel/src dm-cmirror-client.c ...
@ 2006-06-27 20:19 jbrassow
  0 siblings, 0 replies; 40+ messages in thread
From: jbrassow @ 2006-06-27 20:19 UTC (permalink / raw)
  To: cluster-devel.redhat.com

CVSROOT:	/cvs/cluster
Module name:	cluster
Branch: 	STABLE
Changes by:	jbrassow at sourceware.org	2006-06-27 20:19:53

Modified files:
	cmirror-kernel/src: dm-cmirror-client.c dm-cmirror-common.h 
	                    dm-cmirror-server.c dm-cmirror-xfr.h 
	                    dm-log.h 

Log message:
	- bring logging functions inline with upstream API

Patches:
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cmirror-kernel/src/dm-cmirror-client.c.diff?cvsroot=cluster&only_with_tag=STABLE&r1=1.1.4.1&r2=1.1.4.2
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cmirror-kernel/src/dm-cmirror-common.h.diff?cvsroot=cluster&only_with_tag=STABLE&r1=1.1.4.1&r2=1.1.4.2
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cmirror-kernel/src/dm-cmirror-server.c.diff?cvsroot=cluster&only_with_tag=STABLE&r1=1.1.4.1&r2=1.1.4.2
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cmirror-kernel/src/dm-cmirror-xfr.h.diff?cvsroot=cluster&only_with_tag=STABLE&r1=1.1.4.1&r2=1.1.4.2
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cmirror-kernel/src/dm-log.h.diff?cvsroot=cluster&only_with_tag=STABLE&r1=1.1.4.1&r2=1.1.4.2

--- cluster/cmirror-kernel/src/Attic/dm-cmirror-client.c	2006/06/27 18:36:09	1.1.4.1
+++ cluster/cmirror-kernel/src/Attic/dm-cmirror-client.c	2006/06/27 20:19:53	1.1.4.2
@@ -58,7 +58,7 @@
 static int mark_req2ser=0;
 static int insync_req2ser=0;
 
-static void *region_state_alloc(int gfp_mask, void *pool_data){
+static void *region_state_alloc(gfp_t gfp_mask, void *pool_data){
 	return kmalloc(sizeof(struct region_state), gfp_mask);
 }
 
@@ -810,20 +810,6 @@
 	return rtn;
 }
 
-static int cluster_is_remote_recovering(struct dirty_log *log, region_t region)
-{
-	int rtn;
-	struct log_c *lc = (struct log_c *) log->context;
-
-/* take out optimization
-	if(atomic_read(&lc->in_sync) == 1){
-		return 0;
-	}
-*/
-	rtn = consult_server(lc, region, LRT_IS_REMOTE_RECOVERING, NULL);
-	return rtn;
-}
-
 static int cluster_in_sync(struct dirty_log *log, region_t region, int block)
 {
 	int rtn;
@@ -1128,6 +1114,36 @@
 	return lc->failure_response;
 }
 
+static int cluster_is_remote_recovering(struct dirty_log *log, region_t region)
+{
+	int rtn;
+	struct log_c *lc = (struct log_c *) log->context;
+
+/* take out optimization
+	if(atomic_read(&lc->in_sync) == 1){
+		return 0;
+	}
+*/
+	rtn = consult_server(lc, region, LRT_IS_REMOTE_RECOVERING, NULL);
+	return rtn;
+}
+
+static int cluster_set_default_mirror(struct dirty_log *log, int mirror)
+{
+	int rtn;
+	struct log_c *lc = (struct log_c *) log->context;
+	rtn = consult_server(lc, (region_t)mirror, LRT_SET_DEFAULT_MIRROR, NULL);
+	return rtn;
+}
+
+static int cluster_get_default_mirror(struct dirty_log *log)
+{
+	int rtn;
+	struct log_c *lc = (struct log_c *) log->context;
+	rtn = consult_server(lc, 0, LRT_GET_DEFAULT_MIRROR, NULL);
+	return rtn;
+}
+
 static int clog_stop(void *data){
 	struct log_c *lc;
 
@@ -1255,16 +1271,18 @@
 	.resume = cluster_resume,
 	.get_region_size = cluster_get_region_size,
 	.is_clean = cluster_is_clean,
-	.is_remote_recovering = cluster_is_remote_recovering,
 	.in_sync = cluster_in_sync,
 	.flush = cluster_flush,
 	.mark_region = cluster_mark_region,
 	.clear_region = cluster_clear_region,
 	.get_resync_work = cluster_get_resync_work,
-	.complete_resync_work = cluster_complete_resync_work,
+	.set_region_sync = cluster_complete_resync_work,
 	.get_sync_count = cluster_get_sync_count,
 	.status = cluster_status,
 	.get_failure_response = cluster_get_failure_response,
+	.is_remote_recovering = cluster_is_remote_recovering,
+	.set_default_mirror = cluster_set_default_mirror,
+	.get_default_mirror = cluster_get_default_mirror,
 };
 
 static struct dirty_log_type _clustered_disk_type = {
@@ -1277,16 +1295,18 @@
 	.resume = cluster_resume,
 	.get_region_size = cluster_get_region_size,
 	.is_clean = cluster_is_clean,
-	.is_remote_recovering = cluster_is_remote_recovering,
 	.in_sync = cluster_in_sync,
 	.flush = cluster_flush,
 	.mark_region = cluster_mark_region,
 	.clear_region = cluster_clear_region,
 	.get_resync_work = cluster_get_resync_work,
-	.complete_resync_work = cluster_complete_resync_work,
+	.set_region_sync = cluster_complete_resync_work,
 	.get_sync_count = cluster_get_sync_count,
 	.status = cluster_status,
 	.get_failure_response = cluster_get_failure_response,
+	.is_remote_recovering = cluster_is_remote_recovering,
+	.set_default_mirror = cluster_set_default_mirror,
+	.get_default_mirror = cluster_get_default_mirror,
 };
 
 #define CMIRROR_RELEASE_NAME "0.1.0"
--- cluster/cmirror-kernel/src/Attic/dm-cmirror-common.h	2006/06/27 18:36:09	1.1.4.1
+++ cluster/cmirror-kernel/src/Attic/dm-cmirror-common.h	2006/06/27 20:19:53	1.1.4.2
@@ -82,6 +82,7 @@
 	uint32_t version;
 	sector_t nr_regions;
 	char uuid[MAX_NAME_LEN];
+	int default_mirror;
 };
 
 struct log_c {
--- cluster/cmirror-kernel/src/Attic/dm-cmirror-server.c	2006/06/27 18:36:09	1.1.4.1
+++ cluster/cmirror-kernel/src/Attic/dm-cmirror-server.c	2006/06/27 20:19:53	1.1.4.2
@@ -49,7 +49,7 @@
 static int debug_disk_write = 0;
 extern struct list_head log_list_head;
 
-static void *region_user_alloc(int gfp_mask, void *pool_data){
+static void *region_user_alloc(gfp_t gfp_mask, void *pool_data){
 	return kmalloc(sizeof(struct region_user), gfp_mask);
 }
 
@@ -89,6 +89,7 @@
 	disk->version = cpu_to_le32(core->version);
 	disk->nr_regions = cpu_to_le64(core->nr_regions);
 	memcpy(disk->uuid, core->uuid, MAX_NAME_LEN);
+	disk->default_mirror = cpu_to_le32(core->default_mirror);
 }
 
 static void header_from_disk(struct log_header *core, struct log_header *disk)
@@ -97,6 +98,7 @@
 	core->version = le32_to_cpu(disk->version);
 	core->nr_regions = le64_to_cpu(disk->nr_regions);
 	memcpy(core->uuid, disk->uuid, MAX_NAME_LEN);
+	core->default_mirror = cpu_to_le32(disk->default_mirror);
 }
 
 int read_header(struct log_c *log)
@@ -449,19 +451,6 @@
 	return 0;
 }
 
-static int server_is_remote_recovering(struct log_c *lc, struct log_request *lr)
-{
-	struct region_user *ru;
-
-	if ((ru = find_ru_by_region(lc, lr->u.lr_region)) && 
-	    (ru->ru_rw == RU_RECOVER))
-		lr->u.lr_int_rtn = 1;
-	else
-		lr->u.lr_int_rtn = 0;
-
-	return 0;
-}
-
 static int server_in_sync(struct log_c *lc, struct log_request *lr)
 {
 	if(likely(log_test_bit(lc->sync_bits, lr->u.lr_region)))
@@ -586,7 +575,7 @@
 		}
 	} else if (log_test_bit(lc->sync_bits, lr->u.lr_region)) {
 		DMERR("complete_resync_work region going out-of-sync: disk failure");
-		/* gone for now: lc->sync_count--; */
+		lc->sync_count--;
 		log_clear_bit(lc, lc->sync_bits, lr->u.lr_region);
 	}
 
@@ -610,6 +599,37 @@
 }
 
 
+static int server_is_remote_recovering(struct log_c *lc, struct log_request *lr)
+{
+	struct region_user *ru;
+
+	if ((ru = find_ru_by_region(lc, lr->u.lr_region)) && 
+	    (ru->ru_rw == RU_RECOVER))
+		lr->u.lr_int_rtn = 1;
+	else
+		lr->u.lr_int_rtn = 0;
+
+	return 0;
+}
+
+static int server_set_default_mirror(struct log_c *lc, struct log_request *lr)
+{
+	lc->disk_header->default_mirror = (int)lr->u.lr_region;
+	if (write_header(lc))
+		DMERR("Failed to commit default mirror to disk log");
+
+	lr->u.lr_int_rtn = 0;
+
+	return 0;
+}
+
+static int server_get_default_mirror(struct log_c *lc, struct log_request *lr)
+{
+	lr->u.lr_int_rtn = lc->disk_header->default_mirror;
+
+	return 0;
+}
+
 static struct log_c *get_log_context(char *uuid){
 	struct log_c *lc, *r = NULL;
 
@@ -816,9 +836,6 @@
 		case LRT_IS_CLEAN:
 			error = server_is_clean(lc, &lr);
 			break;
-		case LRT_IS_REMOTE_RECOVERING:
-			error = server_is_remote_recovering(lc, &lr);
-			break;
 		case LRT_IN_SYNC:
 			error = server_in_sync(lc, &lr);
 			break;
@@ -853,6 +870,15 @@
 		case LRT_GET_SYNC_COUNT:
 			error = server_get_sync_count(lc, &lr);
 			break;
+		case LRT_IS_REMOTE_RECOVERING:
+			error = server_is_remote_recovering(lc, &lr);
+			break;
+		case LRT_SET_DEFAULT_MIRROR:
+			error = server_set_default_mirror(lc, &lr);
+			break;
+		case LRT_GET_DEFAULT_MIRROR:
+			error = server_get_default_mirror(lc, &lr);
+			break;
 		default:
 			DMWARN("unknown request type received");
 			return 0;  /* do not send a reply */
--- cluster/cmirror-kernel/src/Attic/dm-cmirror-xfr.h	2006/06/27 18:36:09	1.1.4.1
+++ cluster/cmirror-kernel/src/Attic/dm-cmirror-xfr.h	2006/06/27 20:19:53	1.1.4.2
@@ -23,6 +23,9 @@
 #define LRT_MASTER_ASSIGN		11
 #define LRT_MASTER_LEAVING		12
 
+#define LRT_SET_DEFAULT_MIRROR          13
+#define LRT_GET_DEFAULT_MIRROR          14
+
 #define CLUSTER_LOG_PORT 51005
 
 struct log_request {
--- cluster/cmirror-kernel/src/Attic/dm-log.h	2006/06/27 18:36:09	1.1.4.1
+++ cluster/cmirror-kernel/src/Attic/dm-log.h	2006/06/27 20:19:53	1.1.4.2
@@ -56,16 +56,6 @@
 	int (*is_clean)(struct dirty_log *log, region_t region);
 
 	/*
-	 * Returns: 0, 1
-	 *
-	 * This is necessary for cluster mirroring. It provides
-	 * a way to detect recovery on another node, so we
-	 * aren't writing concurrently.  This function is likely
-	 * to block (when a cluster log is used).
-	 */
-	int (*is_remote_recovering)(struct dirty_log *log, region_t region);
-
-	/*
 	 *  Returns: 0, 1, -EWOULDBLOCK, < 0
 	 *
 	 * A predicate function to check the area given by
@@ -108,12 +98,12 @@
 	int (*get_resync_work)(struct dirty_log *log, region_t *region);
 
 	/*
-	 * This notifies the log that the resync of an area has
-	 * been completed.  The log should then mark this region
-	 * as CLEAN.
+	 * This notifies the log that the resync status of a region
+	 * has changed.  It also clears the region from the recovering
+	 * list (if present).
 	 */
-	void (*complete_resync_work)(struct dirty_log *log,
-				     region_t region, int success);
+	void (*set_region_sync)(struct dirty_log *log,
+				region_t region, int in_sync);
 
         /*
 	 * Returns the number of regions that are in sync.
@@ -131,6 +121,31 @@
 	 * of a device failure.
 	 */
 	int (*get_failure_response)(struct dirty_log *log);
+
+	/*
+	 * Returns: 0, 1
+	 *
+	 * This is necessary for cluster mirroring. It provides
+	 * a way to detect recovery on another node, so we
+	 * aren't writing concurrently.  This function is likely
+	 * to block (when a cluster log is used).
+	 */
+	int (*is_remote_recovering)(struct dirty_log *log, region_t region);
+
+	/*
+	 * Returns: 0 on success, <0 on failure
+	 *
+	 * This function is necessary for cluster mirroring.
+	 * If a node detects the primary device has failed,
+	 * the others must have a way of knowing what its
+	 * successor is.
+	 */
+	int (*set_default_mirror)(struct dirty_log *log, int mirror);
+
+	/*
+	 * Returns: >=0 on success, <0 on failure
+	 */
+	int (*get_default_mirror)(struct dirty_log *log);
 };
 
 int dm_register_dirty_log_type(struct dirty_log_type *type);



^ permalink raw reply	[flat|nested] 40+ messages in thread

* [Cluster-devel] cluster/cmirror-kernel/src dm-cmirror-client.c ...
@ 2006-06-15 19:48 jbrassow
  0 siblings, 0 replies; 40+ messages in thread
From: jbrassow @ 2006-06-15 19:48 UTC (permalink / raw)
  To: cluster-devel.redhat.com

CVSROOT:	/cvs/cluster
Module name:	cluster
Branch: 	RHEL4
Changes by:	jbrassow at sourceware.org	2006-06-15 19:48:00

Modified files:
	cmirror-kernel/src: dm-cmirror-client.c dm-cmirror-cman.c 

Log message:
	- pull some more unnecessary prints (and test cvs commit)

Patches:
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cmirror-kernel/src/dm-cmirror-client.c.diff?cvsroot=cluster&only_with_tag=RHEL4&r1=1.1.2.18&r2=1.1.2.19
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cmirror-kernel/src/dm-cmirror-cman.c.diff?cvsroot=cluster&only_with_tag=RHEL4&r1=1.1.2.4&r2=1.1.2.5

--- cluster/cmirror-kernel/src/Attic/dm-cmirror-client.c	2005/07/27 16:09:31	1.1
+++ cluster/cmirror-kernel/src/Attic/dm-cmirror-client.c	2006/06/15 19:34:41	1.1.2.18
@@ -0,0 +1,1359 @@
+/*
+ * Copyright (C) 2005 Red Hat, Inc.
+ *
+ * This file is released under the LGPL.
+ */
+
+#include <linux/init.h>
+#include <linux/slab.h>
+#include <linux/module.h>
+#include <linux/vmalloc.h>
+#include <linux/list.h>
+#include <linux/net.h>
+#include <linux/in.h>
+#include <linux/socket.h>
+#include <linux/signal.h>
+#include <linux/mempool.h>
+#include <linux/bio.h>
+#include <linux/blkdev.h>
+#include <linux/device-mapper.h>
+#include <linux/delay.h>
+#include <cluster/service.h>
+#include <cluster/cnxman.h>
+#include <cluster/cnxman-socket.h>
+
+#include "dm-log.h"
+#include "dm-cmirror-xfr.h"
+#include "dm-cmirror-common.h"
+#include "dm-cmirror-server.h"
+#include "dm-cmirror-cman.h"
+
+LIST_HEAD(log_list_head);
+
+struct region_state {
+	struct log_c *rs_lc;
+	region_t rs_region;
+	struct list_head rs_list;
+};
+
+static mempool_t *region_state_pool = NULL;
+static spinlock_t region_state_lock;
+static int clear_region_count=0;
+static struct list_head clear_region_list;
+static struct list_head marked_region_list;
+
+static int shutting_down=0;
+static atomic_t suspend_client;
+static wait_queue_head_t suspend_client_queue;
+
+static DECLARE_MUTEX(consult_server_lock);
+
+/* These vars are just for stats, and will be removed */
+static uint32_t request_count=0;
+static uint32_t request_retry_count=0;
+static int clear_req=0;
+static int mark_req=0;
+static int insync_req=0;
+static int clear_req2ser=0;
+static int mark_req2ser=0;
+static int insync_req2ser=0;
+
+static void *region_state_alloc(int gfp_mask, void *pool_data){
+	return kmalloc(sizeof(struct region_state), gfp_mask);
+}
+
+static void region_state_free(void *element, void *pool_data){
+	kfree(element);
+}
+
+#define BYTE_SHIFT 3
+/*
+ *   <region_size> <uuid> [[no]sync] [block_on_error]
+ */
+static int core_ctr(struct dirty_log *log, struct dm_target *ti,
+		    unsigned int argc, char **argv)
+{
+	enum sync sync = DEFAULTSYNC;
+	int failure_response = FR_NONBLOCK;
+
+	struct log_c *lc;
+	sector_t region_size;
+	unsigned int region_count;
+	size_t bitset_size;
+	int uuid = 0;
+	int i;
+
+	/* Already checked argument count */
+
+	for (i = 1; i < argc; i++) {
+		if (!strcmp(argv[i], "sync"))
+			sync = FORCESYNC;
+		else if (!strcmp(argv[i], "nosync"))
+			sync = NOSYNC;
+		else if (!strcmp(argv[i], "block_on_error"))
+			failure_response = FR_BLOCK;
+		else if (!uuid)
+			uuid = i;
+		else {
+			DMWARN("unrecognised argument to clustered mirror log: %s",
+			       argv[i]);
+			return -EINVAL;
+		}
+	}
+
+	if (sscanf(argv[0], SECTOR_FORMAT, &region_size) != 1) {
+		DMWARN("invalid region size string");
+		return -EINVAL;
+	}
+
+	region_count = dm_sector_div_up(ti->len, region_size);
+
+	lc = kmalloc(sizeof(*lc), GFP_KERNEL);
+	if (!lc) {
+		DMWARN("Couldn't allocate core log");
+		return -ENOMEM;
+	}
+	memset(lc, 0, sizeof(*lc));
+
+	lc->ti = ti;
+	lc->region_size = region_size;
+	lc->region_count = region_count;
+	lc->sync = sync;
+	lc->failure_response = failure_response;
+	strncpy(lc->uuid, argv[uuid], MAX_NAME_LEN);
+
+	/*
+	 * Work out how many words we need to hold the bitset.
+	 */
+	bitset_size = dm_round_up(region_count,
+				  sizeof(*lc->clean_bits) << BYTE_SHIFT);
+
+	bitset_size >>= BYTE_SHIFT;
+
+	lc->bitset_uint32_count = bitset_size / sizeof(*lc->clean_bits);
+	lc->clean_bits = vmalloc(bitset_size);
+	if (!lc->clean_bits) {
+		DMWARN("couldn't allocate clean bitset");
+		kfree(lc);
+		return -ENOMEM;
+	}
+	memset(lc->clean_bits, -1, bitset_size);
+
+	lc->sync_bits = vmalloc(bitset_size);
+	if (!lc->sync_bits) {
+		DMWARN("couldn't allocate sync bitset");
+		vfree(lc->clean_bits);
+		kfree(lc);
+		return -ENOMEM;
+	}
+	memset(lc->sync_bits, (sync == NOSYNC) ? -1 : 0, bitset_size);
+	lc->sync_count = (sync == NOSYNC) ? region_count : 0;
+
+	lc->recovering_bits = vmalloc(bitset_size);
+	if (!lc->recovering_bits) {
+		DMWARN("couldn't allocate sync bitset");
+		vfree(lc->sync_bits);
+		vfree(lc->clean_bits);
+		kfree(lc);
+		return -ENOMEM;
+	}
+	memset(lc->recovering_bits, 0, bitset_size);
+	lc->sync_search = 0;
+	log->context = lc;
+	return 0;
+}
+
+static void core_dtr(struct dirty_log *log)
+{
+	struct log_c *lc = (struct log_c *) log->context;
+	vfree(lc->clean_bits);
+	vfree(lc->sync_bits);
+	vfree(lc->recovering_bits);
+	kfree(lc);
+}
+
+/*----------------------------------------------------------------
+ * disk log constructor/destructor
+ *
+ *   <device> <region_size> <uuid> [[no]sync] [block_on_error]
+ *--------------------------------------------------------------*/
+static int disk_ctr(struct dirty_log *log, struct dm_target *ti,
+		    unsigned int argc, char **argv)
+{
+	int r;
+	size_t size;
+	struct log_c *lc;
+	struct dm_dev *dev;
+
+	/* already checked argument count */
+	r = dm_get_device(ti, argv[0], 0, 0 /* FIXME */,
+			  FMODE_READ | FMODE_WRITE, &dev);
+	if (r){
+		DMWARN("Unable to get device %s", argv[0]);
+		return r;
+	}
+
+	r = core_ctr(log, ti, argc - 1, argv + 1);
+	if (r) {
+		dm_put_device(ti, dev);
+		return r;
+	}
+
+	lc = (struct log_c *) log->context;
+	lc->log_dev = dev;
+	lc->log_dev_failed = 0;
+	init_completion(&lc->failure_completion);
+
+	/* setup the disk header fields */
+	lc->header_location.bdev = lc->log_dev->bdev;
+	lc->header_location.sector = 0;
+	lc->header_location.count = 1;
+
+	/*
+	 * We can't read less than this amount, even though we'll
+	 * not be using most of this space.
+	 */
+	lc->disk_header = vmalloc(1 << SECTOR_SHIFT);
+	if (!lc->disk_header)
+		goto bad;
+
+	/* setup the disk bitset fields */
+	lc->bits_location.bdev = lc->log_dev->bdev;
+	lc->bits_location.sector = LOG_OFFSET;
+
+	size = dm_round_up(lc->bitset_uint32_count * sizeof(uint32_t),
+			   1 << SECTOR_SHIFT);
+	lc->bits_location.count = size >> SECTOR_SHIFT;
+
+	return 0;
+
+ bad:
+	dm_put_device(ti, lc->log_dev);
+	core_dtr(log);
+	return -ENOMEM;
+}
+
+static void disk_dtr(struct dirty_log *log)
+{
+	struct log_c *lc = (struct log_c *) log->context;
+	dm_put_device(lc->ti, lc->log_dev);
+	vfree(lc->disk_header);
+	core_dtr(log);
+}
+
+
+
+static int run_election(struct log_c *lc){
+	int error=0, len;
+	struct sockaddr_in saddr_in;
+	struct msghdr msg;
+	struct iovec iov;
+	mm_segment_t fs;
+	struct log_request lr;  /* ATTENTION -- could be too much on the stack */
+  
+	memset(&lr, 0, sizeof(lr));
+
+	lr.lr_type = LRT_ELECTION;
+	lr.u.lr_starter = my_id;
+	lr.u.lr_coordinator = my_id;
+	memcpy(lr.lr_uuid, lc->uuid, MAX_NAME_LEN);
+
+	memset(&saddr_in, 0, sizeof(struct sockaddr_cl));
+
+	msg.msg_control = NULL;
+	msg.msg_controllen = 0;
+	msg.msg_iovlen = 1;
+	msg.msg_iov = &iov;
+	msg.msg_flags = 0;
+  
+	saddr_in.sin_family = AF_INET;
+	saddr_in.sin_port = CLUSTER_LOG_PORT;
+	if(!(saddr_in.sin_addr.s_addr = nodeid_to_ipaddr(my_id))){
+		DMERR("Unable to convert nodeid_to_ipaddr in run_election");
+	}
+	msg.msg_name = &saddr_in;
+	msg.msg_namelen = sizeof(saddr_in);
+
+	iov.iov_len = sizeof(struct log_request);
+	iov.iov_base = &lr;
+
+	fs = get_fs();
+	set_fs(get_ds());
+
+	len = sock_sendmsg(lc->client_sock, &msg, sizeof(struct log_request));
+
+	if(len < 0){
+		DMERR("unable to send election notice to server (error = %d)", len);
+		error = len;
+		set_fs(fs);
+		goto fail;
+	}
+
+  
+	/* why do we need to reset this? */
+	iov.iov_len = sizeof(struct log_request);
+	iov.iov_base = &lr;
+
+	len = my_recvmsg(lc->client_sock, &msg, sizeof(struct log_request),
+			 0, 20);
+	set_fs(fs);
+  
+	if(len > 0){
+		lc->server_id = lr.u.lr_coordinator;
+	} else {
+		/* ATTENTION -- what do we do with this ? */
+		DMWARN("Failed to receive election results from server");
+		error = len;
+	}
+
+ fail:
+	return error;
+}
+
+static int _consult_server(struct log_c *lc, region_t region,
+			  int type, region_t *result, int *retry){
+	int len;
+	int error=0;
+	struct sockaddr_in saddr_in;
+	struct msghdr msg;
+	struct iovec iov;
+	mm_segment_t fs;
+	struct log_request *lr;
+
+	request_count++;
+
+	lr = kmalloc(sizeof(struct log_request), GFP_KERNEL);
+	if(!lr){
+		error = -ENOMEM;
+		*retry = 1;
+		goto fail;
+	}
+
+	memset(lr, 0, sizeof(struct log_request));
+	
+	lr->lr_type = type;
+	if(type == LRT_MASTER_LEAVING){
+		lr->u.lr_starter = my_id;
+	} else {
+		lr->u.lr_region = region;
+	}
+
+	if (type == LRT_COMPLETE_RESYNC_WORK)
+		lr->u.lr_int_rtn = (*result) ? 1 : 0;
+
+	memcpy(lr->lr_uuid, lc->uuid, MAX_NAME_LEN);
+
+	memset(&saddr_in, 0, sizeof(struct sockaddr_in));
+
+	msg.msg_control = NULL;
+	msg.msg_controllen = 0;
+	msg.msg_iovlen = 1;
+	msg.msg_iov = &iov;
+	msg.msg_flags = 0;
+  
+	saddr_in.sin_family = AF_INET;
+	saddr_in.sin_port = CLUSTER_LOG_PORT;
+	if(!(saddr_in.sin_addr.s_addr = nodeid_to_ipaddr(lc->server_id))){
+		DMERR("Unable to convert nodeid_to_ipaddr(0x%x) in _consult_server",
+			lc->server_id);
+		error = -ENXIO;
+		*retry = 1;
+		goto fail;
+	}
+	msg.msg_name = &saddr_in;
+	msg.msg_namelen = sizeof(saddr_in);
+
+	iov.iov_len = sizeof(struct log_request);
+	iov.iov_base = lr;
+/*
+	DMERR("To  :: 0x%x, %s", 
+	       saddr_in.sin_addr.s_addr,
+	       (lr->lr_type == LRT_IS_CLEAN)? "LRT_IS_CLEAN":
+	       (lr->lr_type == LRT_IN_SYNC)? "LRT_IN_SYNC":
+	       (lr->lr_type == LRT_MARK_REGION)? "LRT_MARK_REGION":
+	       (lr->lr_type == LRT_GET_RESYNC_WORK)? "LRT_GET_RESYNC_WORK":
+	       (lr->lr_type == LRT_GET_SYNC_COUNT)? "LRT_GET_SYNC_COUNT":
+	       (lr->lr_type == LRT_CLEAR_REGION)? "LRT_CLEAR_REGION":
+	       (lr->lr_type == LRT_COMPLETE_RESYNC_WORK)? "LRT_COMPLETE_RESYNC_WORK":
+	       (lr->lr_type == LRT_MASTER_LEAVING)? "LRT_MASTER_LEAVING":
+	       (lr->lr_type == LRT_ELECTION)? "LRT_ELECTION":
+	       (lr->lr_type == LRT_SELECTION)? "LRT_SELECTION": "UNKNOWN"
+		);
+*/
+	if(lr->lr_type == LRT_MARK_REGION){
+		mark_req2ser++;
+	}
+
+	if(lr->lr_type == LRT_CLEAR_REGION){
+		clear_req2ser++;
+	}
+	
+	if(lr->lr_type == LRT_IN_SYNC){
+		insync_req2ser++;
+	}
+	
+	fs = get_fs();
+	set_fs(get_ds());
+  
+	len = sock_sendmsg(lc->client_sock, &msg, sizeof(struct log_request));
+
+	set_fs(fs);
+
+	if(len < sizeof(struct log_request)){
+		DMWARN("unable to send log request to server");
+		error = -EBADE;
+		goto fail;
+	}
+
+	iov.iov_len = sizeof(struct log_request);
+	iov.iov_base = lr;
+
+	fs = get_fs();
+	set_fs(get_ds());
+
+	if(type == LRT_MASTER_LEAVING){
+		len = sock_recvmsg(lc->client_sock, &msg, sizeof(struct log_request),
+				   /* WAIT for it */0);
+	} else {
+		len = my_recvmsg(lc->client_sock, &msg, sizeof(struct log_request),
+				 0, 5);
+	}
+	set_fs(fs);
+
+	if(len <= 0){
+		/* ATTENTION -- what do we do with this ? */
+//		DMWARN("Failed to recvmsg from clustered log server");
+		error = len;
+		*retry = 1;
+		goto fail;
+	}
+    
+	if(lr->u.lr_int_rtn == -EAGAIN){
+		DMWARN("Server (%u), request type %d, -EAGAIN."
+		       "  Mirror suspended?",
+		       lc->server_id, lr->lr_type);
+		*retry = 1;
+		goto fail;
+	}
+
+	if(lr->u.lr_int_rtn == -ENXIO){
+		DMWARN("server tells us it no longer controls the log");
+		lc->server_id = 0xDEAD;
+		*retry = 1;
+		goto fail;
+	}
+
+	if(lr->u.lr_int_rtn < 0){
+		DMWARN("an error occured on the server while processing our request");
+	}
+
+	if(result)
+		*result = lr->u.lr_region_rtn;
+
+	error = lr->u.lr_int_rtn;
+	kfree(lr);
+	return error;
+ fail:
+	if(*retry){
+		request_retry_count++;
+		if(!(request_retry_count & 0x1F)){
+			DMINFO("Clustered mirror retried requests :: %u of %u (%u%%)",
+			       request_retry_count,
+			       request_count,
+			       dm_div_up(request_retry_count*100, request_count));
+		}
+	}
+
+	if(lr) kfree(lr);
+#ifdef DEBUG
+	DMWARN("Request (%s) to server failed :: %d",
+	       (type == LRT_IS_CLEAN)? "LRT_IS_CLEAN":
+	       (type == LRT_IN_SYNC)? "LRT_IN_SYNC":
+	       (type == LRT_MARK_REGION)? "LRT_MARK_REGION":
+	       (type == LRT_GET_RESYNC_WORK)? "LRT_GET_RESYNC_WORK":
+	       (type == LRT_GET_SYNC_COUNT)? "LRT_GET_SYNC_COUNT":
+	       (type == LRT_CLEAR_REGION)? "LRT_CLEAR_REGION":
+	       (type == LRT_COMPLETE_RESYNC_WORK)? "LRT_COMPLETE_RESYNC_WORK":
+	       (type == LRT_MASTER_LEAVING)? "LRT_MASTER_LEAVING":
+	       (type == LRT_ELECTION)? "LRT_ELECTION":
+	       (type == LRT_SELECTION)? "LRT_SELECTION": "UNKNOWN",
+	       error);
+#endif
+	return error;
+}
+
+static int consult_server(struct log_c *lc, region_t region,
+			  int type, region_t *result){
+	int rtn=0;
+	int retry=0;
+	int new_server=0;
+	struct region_state *rs=NULL;
+
+	/* ATTENTION -- need to change this, the server could fail at any point **
+	** we do not want to send requests to the wrong place, or fail to run  **
+	** an election when needed */
+	down(&consult_server_lock);
+
+	do{
+		retry = 0;
+		suspend_on(&suspend_client_queue, atomic_read(&suspend_client));
+	election:
+		while(lc->server_id == 0xDEAD){
+			run_election(lc);
+			new_server = 1;
+		}
+
+		spin_lock(&region_state_lock);
+		if(new_server && 
+		   (!list_empty(&clear_region_list) ||
+		    !list_empty(&marked_region_list))){
+			int i=0;
+			struct region_state *tmp_rs;
+
+			DMINFO("Clean-up required due to new server");
+			DMINFO(" - Wiping clear region list");
+			list_for_each_entry_safe(rs, tmp_rs,
+						 &clear_region_list, rs_list){
+				i++;
+				list_del_init(&rs->rs_list);
+				mempool_free(rs, region_state_pool);
+			}
+			clear_region_count=0;
+			DMINFO(" - %d clear region requests wiped", i);
+
+			DMINFO(" - Resending all mark region requests");
+			list_for_each_entry(rs, &marked_region_list, rs_list){
+				do {
+					retry = 0;
+					DMINFO("   - " SECTOR_FORMAT, rs->rs_region);
+					rtn = _consult_server(rs->rs_lc, rs->rs_region,
+							      LRT_MARK_REGION, NULL, &retry);
+					if (lc->server_id == 0xDEAD) {
+						spin_unlock(&region_state_lock);
+						goto election;
+					}
+				} while(retry);
+			}
+			DMINFO("Clean-up complete");
+			if(type == LRT_MARK_REGION){
+				/* we just handled all marks */
+				DMWARN("Mark request ignored.\n");
+				spin_unlock(&region_state_lock);
+				goto out;
+			} else {
+				DMWARN("Continuing request:: %s", 
+				      (type == LRT_IS_CLEAN)? "LRT_IS_C	LEAN":
+				      (type == LRT_IN_SYNC)? "LRT_IN_SYNC":
+				      (type == LRT_MARK_REGION)? "LRT_MARK_REGION":
+				      (type == LRT_GET_RESYNC_WORK)? "LRT_GET_RESYNC_WORK":
+				      (type == LRT_GET_SYNC_COUNT)? "LRT_GET_SYNC_COUNT":
+				      (type == LRT_CLEAR_REGION)? "LRT_CLEAR_REGION":
+				      (type == LRT_COMPLETE_RESYNC_WORK)? "LRT_COMPLETE_RESYNC_WORK":
+				      (type == LRT_MASTER_LEAVING)? "LRT_MASTER_LEAVING":
+				      (type == LRT_ELECTION)? "LRT_ELECTION":
+				      (type == LRT_SELECTION)? "LRT_SELECTION": "UNKNOWN"
+					);
+			}
+		}
+
+		rs = NULL;
+
+		if(!list_empty(&clear_region_list)){
+			rs = list_entry(clear_region_list.next,
+					struct region_state, rs_list);
+			list_del_init(&rs->rs_list);
+			clear_region_count--;
+		}
+
+		spin_unlock(&region_state_lock);
+		
+		/* ATTENTION -- it may be possible to remove a clear region **
+		** request from the list.  Then, have a mark region happen  **
+		** while we are here.  If the clear region request fails, it**
+		** would be re-added - perhaps prematurely clearing the bit */
+		
+		if(rs){
+			_consult_server(rs->rs_lc, rs->rs_region,
+					LRT_CLEAR_REGION, NULL, &retry);
+
+			if(retry){
+				spin_lock(&region_state_lock);
+				list_add(&rs->rs_list, &clear_region_list);
+				clear_region_count++;
+				spin_unlock(&region_state_lock);
+
+			} else {
+				mempool_free(rs, region_state_pool);
+			}
+		}
+		retry = 0;
+		
+		rtn = _consult_server(lc, region, type, result, &retry);
+		schedule();
+	} while(retry);
+out:
+	up(&consult_server_lock);
+
+	return rtn;
+}
+
+
+static int cluster_connect(void);
+static int cluster_disconnect(void);
+
+static int cluster_ctr(struct dirty_log *log, struct dm_target *ti,
+		       unsigned int argc, char **argv, int disk)
+{
+	int error = 0;
+	struct log_c *lc;
+	struct sockaddr_in saddr_in;
+
+	if (!disk) {
+		if ((error = core_ctr(log, ti, argc, argv))) {
+			DMWARN("Clustered mirror:: core_ctr failed");
+			return error;
+		}
+	} else {
+		/* NOTE -- we take advantage of the fact that disk_ctr does **
+		** not actually read the disk.  I suppose, however, that if **
+		** it does in the future, we will simply reread it when a   **
+		** server is started here.                                  */
+
+		if((error = disk_ctr(log, ti, argc, argv))) {
+			DMWARN("Clustered mirror:: disk_ctr failed");
+			return error;
+		}
+	}
+
+	lc = log->context;
+
+	if (lc->failure_response != FR_BLOCK) {
+		DMERR("Clustered mirror requires \"block_on_error\" parameter");
+		error = -EINVAL;
+		goto fail;
+	}
+
+	atomic_set(&lc->in_sync, -1);
+
+	list_add(&lc->log_list, &log_list_head);
+	INIT_LIST_HEAD(&lc->region_users);
+
+	lc->server_id = 0xDEAD;
+
+	if ((error = cluster_connect())) {
+		DMWARN("Unable to connect to cluster infrastructure.");
+		goto fail;
+	}
+
+	error = sock_create(AF_INET, SOCK_DGRAM,
+			    0,
+			    &lc->client_sock);
+
+	if(error){
+		DMWARN("unable to create clustered log client socket");
+		goto fail;
+	}
+
+	saddr_in.sin_family = AF_INET;
+	saddr_in.sin_port = CLUSTER_LOG_PORT+1;
+	if(!(saddr_in.sin_addr.s_addr = nodeid_to_ipaddr(my_id))){
+		DMERR("Unable to convert nodeid_to_ipaddr in cluster_ctr");
+	}
+	error = lc->client_sock->ops->bind(lc->client_sock,
+					   (struct sockaddr *)&saddr_in,
+					   sizeof(struct sockaddr_in));
+	while(error == -EADDRINUSE){
+		saddr_in.sin_port++;
+		error = lc->client_sock->ops->bind(lc->client_sock,
+						   (struct sockaddr *)&saddr_in,
+						   sizeof(struct sockaddr_in));
+	}
+
+	if(error){
+		DMWARN("unable to bind clustered log client socket");
+		sock_release(lc->client_sock);
+		goto fail;
+	}
+
+	return 0;
+
+ fail:
+	if (lc->log_dev)
+		disk_dtr(log);
+	else
+		core_dtr(log);
+
+	return error;
+}
+
+/*------------------------------------------------------------------
+ * clustered_core log constructor
+ * (preceding args::  <start> <len> mirror clustered_core <log_args>
+ *
+ * Right now, 3 <= argc <= 4.  "block_on_error" is required.
+ *
+ * argv contains:
+ *   <region_size> <uuid> [[no]sync] [block_on_error]
+ *--------------------------------------------------------------*/
+static int cluster_core_ctr(struct dirty_log *log, struct dm_target *ti,
+		       unsigned int argc, char **argv) {
+	int i;
+	if ((argc < 3) || (argc > 4)) {
+		DMERR("Too %s arguments to clustered_core mirror log type.",
+		      (argc < 3) ? "few" : "many");
+		DMERR("  %d arguments supplied:", argc);
+		for (i = 0; i < argc; i++)
+			DMERR("    %s", argv[i]);
+		return -EINVAL;
+	}
+
+	return cluster_ctr(log, ti, argc, argv, 0);
+}
+
+/*------------------------------------------------------------------
+ * clustered_disk log constructor
+ * (preceding args::  <start> <len> mirror clustered_disk <log_args>
+ *
+ * Right now, 4 <= argc <= 5.  "block_on_error" is required.
+ *
+ * argv contains:
+ *   <disk> <region_size> <uuid> [[no]sync] [block_on_error]
+ *--------------------------------------------------------------*/
+static int cluster_disk_ctr(struct dirty_log *log, struct dm_target *ti,
+		       unsigned int argc, char **argv) {
+	int i;
+	if ((argc < 4) || (argc > 5)) {
+		DMERR("Too %s arguments to clustered_disk mirror log type.",
+		      (argc < 4) ? "few" : "many");
+		DMERR("  %d arguments supplied:", argc);
+		for (i = 0; i < argc; i++)
+			DMERR("    %s", argv[i]);
+		return -EINVAL;
+	}
+
+	return cluster_ctr(log, ti, argc, argv, 1);
+}
+
+static void cluster_dtr(struct dirty_log *log)
+{
+	struct log_c *lc = (struct log_c *) log->context;
+
+	if (!list_empty(&clear_region_list))
+		DMERR("LEAVING WHILE REGION REQUESTS REMAIN.");
+
+	list_del_init(&lc->log_list);
+	if(lc->server_id == my_id)
+		consult_server(lc, 0, LRT_MASTER_LEAVING, NULL);
+	sock_release(lc->client_sock);
+
+	if (lc->log_dev) 
+		disk_dtr(log);
+	else
+		core_dtr(log);
+
+	if (cluster_disconnect())
+		DMERR("Unable to disconnect from cluster infrastructure.\n");
+}
+
+static int cluster_presuspend(struct dirty_log *log)
+{
+	struct log_c *lc = (struct log_c *) log->context;
+
+	atomic_set(&lc->suspended, 1);
+	
+	if (lc->log_dev && lc->log_dev_failed)
+		complete(&lc->failure_completion);
+	else {
+		while (1) {
+			spin_lock(&region_state_lock);
+			if (list_empty(&clear_region_list)) {
+				spin_unlock(&region_state_lock);
+				break;
+			}
+			spin_unlock(&region_state_lock);
+
+			/* Just an unnecessary call to clear out regions */
+			consult_server(lc, 0, LRT_IN_SYNC, NULL);
+		}
+	}
+
+	return 0;
+}
+
+static int cluster_postsuspend(struct dirty_log *log){
+	return 0;
+}
+
+static int cluster_resume(struct dirty_log *log){
+	struct log_c *lc = (struct log_c *) log->context;
+
+	lc->sync_search = 0;
+	resume_server_requests();
+	atomic_set(&lc->suspended, 0);
+
+	return 0;
+}
+
+static uint32_t cluster_get_region_size(struct dirty_log *log)
+{
+	struct log_c *lc = (struct log_c *) log->context;
+
+	return lc->region_size;
+}
+
+
+static int cluster_is_clean(struct dirty_log *log, region_t region)
+{
+	int rtn;
+	struct log_c *lc = (struct log_c *) log->context;
+
+	rtn = consult_server(lc, region, LRT_IS_CLEAN, NULL);
+
+	return rtn;
+}
+
+static int cluster_is_remote_recovering(struct dirty_log *log, region_t region)
+{
+	int rtn;
+	struct log_c *lc = (struct log_c *) log->context;
+
+/* take out optimization
+	if(atomic_read(&lc->in_sync) == 1){
+		return 0;
+	}
+*/
+	rtn = consult_server(lc, region, LRT_IS_REMOTE_RECOVERING, NULL);
+	return rtn;
+}
+
+static int cluster_in_sync(struct dirty_log *log, region_t region, int block)
+{
+	int rtn;
+	struct log_c *lc = (struct log_c *) log->context;
+  
+	/* check known_regions, return if found */
+	insync_req++;
+/* take out optimization
+	if(atomic_read(&lc->in_sync) == 1){
+		return 1;
+	}
+*/
+	if(!block){
+		return -EWOULDBLOCK;
+	}
+
+	rtn = consult_server(lc, region, LRT_IN_SYNC, NULL);
+
+	return rtn;
+}
+
+static int cluster_flush(struct dirty_log *log)
+{
+	/* FIXME:  flush all clear_region requests to server */
+	return 0;
+}
+
+static void cluster_mark_region(struct dirty_log *log, region_t region)
+{
+	int error = 0;
+	struct region_state *rs, *tmp_rs, *rs_new;
+	struct log_c *lc = (struct log_c *) log->context;
+
+	mark_req++;
+
+	rs_new = mempool_alloc(region_state_pool, GFP_KERNEL);
+
+	spin_lock(&region_state_lock);
+	list_for_each_entry_safe(rs, tmp_rs, &clear_region_list, rs_list){
+		if(lc == rs->rs_lc && region == rs->rs_region){
+#ifdef DEBUG
+			DMINFO("Mark pre-empting clear of region %Lu", region);
+#endif
+			list_del_init(&rs->rs_list);
+			list_add(&rs->rs_list, &marked_region_list);
+			clear_region_count--;
+			spin_unlock(&region_state_lock);
+			if (rs_new)
+				mempool_free(rs_new, region_state_pool);
+
+			return;
+		}
+	}
+	/* ATTENTION -- this check should not be necessary.   **
+	** Why are regions being marked again before a clear? */
+	list_for_each_entry(rs, &marked_region_list, rs_list){
+		if(lc == rs->rs_lc && region == rs->rs_region){
+#ifdef DEBUG
+			DMINFO("Double mark on region ("
+			       SECTOR_FORMAT ")", region);
+#endif
+			spin_unlock(&region_state_lock);
+			if (rs_new)
+				mempool_free(rs_new, region_state_pool);
+
+			return;
+		}
+	}
+
+	if(!rs_new){
+		DMERR("Unable to allocate region_state for mark.");
+		BUG();
+	}
+
+	rs_new->rs_lc = lc;
+	rs_new->rs_region = region;
+	INIT_LIST_HEAD(&rs_new->rs_list);
+	list_add(&rs_new->rs_list, &marked_region_list);
+
+	spin_unlock(&region_state_lock);
+
+	while((error = consult_server(lc, region, LRT_MARK_REGION, NULL))){
+		DMWARN("unable to get server (%u) to mark region (%Lu)",
+		       lc->server_id, region);
+		DMWARN("Reason :: %d", error);
+	}
+
+	if (lc->log_dev_failed) {
+		DMERR("Write failed on mirror log device, %s",
+		      lc->log_dev->name);
+		dm_table_event(lc->ti->table);
+		if (!atomic_read(&lc->suspended))
+			wait_for_completion(&lc->failure_completion);
+	}
+	return;
+}
+
+static void cluster_clear_region(struct dirty_log *log, region_t region)
+{
+	struct log_c *lc = (struct log_c *) log->context;
+	struct region_state *rs, *tmp_rs, *rs_new;
+	clear_req++;
+
+	rs_new = mempool_alloc(region_state_pool, GFP_ATOMIC);
+
+	spin_lock(&region_state_lock);
+
+	list_for_each_entry_safe(rs, tmp_rs, &clear_region_list, rs_list){
+		if(lc == rs->rs_lc && region == rs->rs_region){
+			DMINFO("%d) Double clear on region ("
+			      SECTOR_FORMAT ")", __LINE__, region);
+			spin_unlock(&region_state_lock);
+			if (rs_new)
+				mempool_free(rs_new, region_state_pool);
+			return;
+		}
+	}
+
+	list_for_each_entry_safe(rs, tmp_rs, &marked_region_list, rs_list){
+		if(lc == rs->rs_lc && region == rs->rs_region){
+			list_del_init(&rs->rs_list);
+			list_add(&rs->rs_list, &clear_region_list);
+			clear_region_count++;
+			if(!(clear_region_count & 0x7F)){
+				DMINFO("clear_region_count :: %d", clear_region_count);
+			}
+			spin_unlock(&region_state_lock);
+			if (rs_new)
+				mempool_free(rs_new, region_state_pool);
+			return;
+		}
+	}
+
+	/* We can get here because we may be doing resync_work, and therefore, **
+	** clearing without ever marking..................................... */
+
+	if(!rs_new){
+		DMERR("Unable to allocate region_state for mark.");
+		BUG();
+	}
+
+	rs_new->rs_lc = lc;
+	rs_new->rs_region = region;
+	INIT_LIST_HEAD(&rs_new->rs_list);
+	list_add(&rs_new->rs_list, &clear_region_list);
+	clear_region_count++;
+	if(!(clear_region_count & 0x7F)){
+		DMINFO("clear_region_count :: %d", clear_region_count);
+	}
+
+	spin_unlock(&region_state_lock);
+	return;
+}
+
+static int cluster_get_resync_work(struct dirty_log *log, region_t *region)
+{
+	int rtn;
+	struct log_c *lc = (struct log_c *) log->context;
+
+	rtn = consult_server(lc, 0, LRT_GET_RESYNC_WORK, region);
+
+	return rtn;
+}
+
+static void cluster_complete_resync_work(struct dirty_log *log,
+					 region_t region, int success)
+{
+	region_t success_tmp = success;
+	struct log_c *lc = (struct log_c *) log->context;
+	while(consult_server(lc, region, LRT_COMPLETE_RESYNC_WORK, &success_tmp)){
+		DMWARN("unable to notify server of completed resync work");
+	}
+	if (!success) {
+		DMERR("Attempting to revert sync status of region #%llu", region);
+		set_current_state(TASK_INTERRUPTIBLE);
+		schedule_timeout(HZ/50);
+	}
+
+	return;
+}
+
+static region_t cluster_get_sync_count(struct dirty_log *log)
+{
+	region_t rtn;
+	struct log_c *lc = (struct log_c *) log->context;
+/* take out optimization
+	if(atomic_read(&lc->in_sync) == 1){
+		return lc->region_count;
+	}
+*/
+	if(consult_server(lc, 0, LRT_GET_SYNC_COUNT, &rtn)){
+		return 0;
+	}
+
+	if(rtn > lc->region_count){
+		DMERR("sync_count ("
+		      SECTOR_FORMAT
+		      ") > region_count ("
+		      SECTOR_FORMAT
+		      ") - (%s)!",
+		      rtn, lc->region_count, lc->uuid + (strlen(lc->uuid) - 8));
+	}
+
+	if(rtn >= lc->region_count){
+		atomic_set(&lc->in_sync, 1);
+	} else if(unlikely(atomic_read(&lc->in_sync) < 0)){
+		atomic_set(&lc->in_sync, 0);
+	}
+
+	return rtn;
+}
+
+/*
+ * cluster_status - report log state through the device-mapper status interface.
+ *
+ * @log:    dirty log instance being queried
+ * @status: STATUSTYPE_INFO for runtime state, STATUSTYPE_TABLE for the
+ *          constructor-style table line
+ * @result: output buffer; the DMEMIT macro appends here and advances 'sz'
+ * @maxlen: capacity of @result
+ *
+ * Returns the number of bytes written into @result.
+ */
+static int cluster_status(struct dirty_log *log, status_type_t status,
+			  char *result, unsigned int maxlen)
+{
+	int sz = 0;
+	int arg_count=3;
+	struct log_c *lc = (struct log_c *) log->context;
+
+	switch(status){
+	case STATUSTYPE_INFO:
+/*
+		spin_lock(&region_state_lock);
+		i = clear_region_count;
+		list_for_each_entry(rs, &marked_region_list, rs_list){
+			j++;
+		}
+		spin_unlock(&region_state_lock);
+
+		DMINFO("CLIENT OUTPUT::");
+		DMINFO("  My ID            : %u", my_id);
+		DMINFO("  Server ID        : %u", lc->server_id);
+
+		DMINFO("  In-sync          : %s", (atomic_read(&lc->in_sync)>0)?
+		       "YES" : "NO");
+		DMINFO("  Regions marked   : %d", j);
+		DMINFO("  Regions clearing : %d", i);
+
+		DMINFO("  Mark requests    : %d", mark_req);
+		if(mark_req)
+			DMINFO("  Mark req to serv : %d (%d%%)", mark_req2ser,
+			       (mark_req2ser*100)/mark_req);
+
+		DMINFO("  Clear requests   : %d", clear_req);
+		if(clear_req)
+			DMINFO("  Clear req to serv: %d (%d%%)", clear_req2ser,
+			       (clear_req2ser*100)/clear_req);
+
+		DMINFO("  Sync  requests   : %d", insync_req);
+		if(insync_req)
+			DMINFO("  Sync req to serv : %d (%d%%)", insync_req2ser,
+			       (insync_req2ser*100)/insync_req);
+
+		if(lc->server_id == my_id){
+			print_server_status(lc);
+		}
+*/
+		/* NOTE(review): this increment is dead code in the INFO branch;
+		** arg_count is never read below -- both DMEMITs hard-code the
+		** argument count ("3" / "1") instead. */
+		if(lc->sync != DEFAULTSYNC)
+			arg_count++;
+
+		/* 'D' = log device has failed, 'A' = alive */
+		if (lc->log_dev)
+			DMEMIT("3 %s %s %c",
+			       log->type->name,                  /* NAME */
+			       lc->log_dev->name,                /* THE LOG DEVICE */
+			       (lc->log_dev_failed)? 'D' : 'A'); /* LOG DEVICE LIVENESS */
+		else
+			DMEMIT("1 %s", log->type->name);
+
+                break;
+
+        case STATUSTYPE_TABLE:
+		if(lc->sync != DEFAULTSYNC)
+			arg_count++;
+
+		if (lc->log_dev) {
+			arg_count++;
+
+			DMEMIT("%s %u %s " SECTOR_FORMAT " %s ",
+			       log->type->name,                 /* NAME */
+			       arg_count,                       /* # OF ARGS */
+			       lc->log_dev->name,               /* THE LOG DEVICE */
+			       lc->region_size,                 /* REGION SIZE */
+			       lc->uuid);                       /* UUID */
+		} else {
+			DMEMIT("%s %u " SECTOR_FORMAT " %s ",
+			       log->type->name,                 /* NAME */
+			       arg_count,                       /* # OF ARGS */
+			       lc->region_size,                 /* REGION SIZE */
+			       lc->uuid);                       /* UUID */
+		}
+		if (lc->sync != DEFAULTSYNC)
+			DMEMIT("%ssync ", lc->sync == NOSYNC ? "no" : "");
+		DMEMIT("block_on_error ");
+        }
+
+	return sz;
+}
+
+/* Report how this log wants the mirror core to react to a device failure. */
+static int cluster_get_failure_response(struct dirty_log *log)
+{
+	const struct log_c *lc = (struct log_c *) log->context;
+
+	return lc->failure_response;
+}
+
+/*
+ * Cluster service 'stop' callback: quiesce client activity for the duration
+ * of a membership transition.
+ */
+static int clog_stop(void *data){
+	struct log_c *log_ctx;
+
+	/* Block new client requests until the transition finishes. */
+	atomic_set(&suspend_client, 1);
+
+	/* Every log must re-establish its sync state after the transition. */
+	list_for_each_entry(log_ctx, &log_list_head, log_list)
+		atomic_set(&log_ctx->in_sync, 0);
+
+	/* During module teardown the server is being stopped outright,
+	** so skip suspending it. */
+	if (likely(!shutting_down))
+		suspend_server();
+
+	return 0;
+}
+
+/*
+ * clog_start - cluster service 'start' callback for a membership transition.
+ *
+ * @nodeids:  new membership list; ownership transfers to us (the previous
+ *            list is kfree'd and this one is cached in global_nodeids)
+ * @count:    number of entries in @nodeids
+ * @event_id: transition id, saved for the later kcl_start_done()
+ * @type:     SERVICE_NODE_JOIN / SERVICE_NODE_LEAVE / SERVICE_NODE_FAILED
+ *
+ * NOTE(review): restart_event_id/type are written here in client context
+ * but consumed by the server; no synchronization is visible at this point
+ * -- confirm against the server-side code (this is the race the 2007-02-20
+ * commit message describes).
+ */
+static int clog_start(void *data, uint32_t *nodeids, int count, int event_id, int type){
+	int i;
+	uint32_t server;
+	struct log_c *lc;
+	struct kcl_cluster_node node;
+
+	if(global_nodeids){
+		kfree(global_nodeids);
+	}
+	global_nodeids = nodeids;
+	global_count = count;
+
+	/* presumably nodeid 0 means "look up the local node" -- TODO confirm */
+	kcl_get_node_by_nodeid(0, &node);
+	my_id = node.node_id;
+
+	restart_event_id = event_id;
+	restart_event_type = type;
+
+	switch(type){
+	case SERVICE_NODE_LEAVE:
+	case SERVICE_NODE_FAILED:
+		/* Invalidate the server for any log whose server left the
+		** cluster: 0xDEAD marks "no server, re-election needed". */
+		list_for_each_entry(lc, &log_list_head, log_list){
+			for(i=0, server = 0xDEAD; i < count; i++){
+				if(lc->server_id == nodeids[i]){
+					server = nodeids[i];
+				}
+			}
+			/* ATTENTION -- need locking around this ? */
+			lc->server_id = server;
+		}
+		break;
+	case SERVICE_NODE_JOIN:
+		break;
+	default:
+		DMERR("Invalid service event type received");
+		BUG();
+		break;
+	}
+	resume_server();
+	return 0;
+}
+
+/*
+ * clog_finish - cluster service 'finish' callback: the transition identified
+ * by @event_id has completed cluster-wide, so release any client requests
+ * that blocked while suspend_client was set.
+ */
+static void clog_finish(void *data, int event_id){
+	atomic_set(&suspend_client, 0);
+	wake_up_all(&suspend_client_queue);
+}
+
+/* Cluster service callbacks; cman invokes these around membership changes
+** (stop -> start -> finish). */
+static struct kcl_service_ops clog_ops = {
+	.stop = clog_stop,
+	.start = clog_start,
+	.finish = clog_finish,
+};
+
+/* Reference count of active mirror sets; only the first connect and the
+** last disconnect touch the cluster infrastructure. */
+static int mirror_set_count = 0; /* used to prevent multiple cluster [dis]connects */
+
+/*
+ * cluster_connect - reference-counted attachment to the cluster.
+ *
+ * The first caller registers the "clustered_log" service, starts the log
+ * server daemon and joins the service group; later callers only bump the
+ * reference count.
+ *
+ * Returns 0 on success, or the error from the failing step.
+ *
+ * Fix: the original left mirror_set_count incremented on failure, so a
+ * subsequent connect attempt would return 0 without ever registering or
+ * joining the service.  The count is now dropped again on every error path.
+ */
+static int cluster_connect(void)
+{
+	int r;
+
+	if (mirror_set_count++)
+		return 0;
+
+	/* 13 == strlen("clustered_log") */
+	r = kcl_register_service("clustered_log", 13, SERVICE_LEVEL_GDLM, &clog_ops,
+				 1, NULL, &local_id);
+	if (r) {
+		DMWARN("Couldn't register clustered_log service");
+		goto fail;
+	}
+
+	r = start_server();
+	if (r) {
+		DMWARN("Unable to start clustered log server daemon");
+		kcl_unregister_service(local_id);
+		goto fail;
+	}
+
+	r = kcl_join_service(local_id);
+	if (r) {
+		DMWARN("couldn't join service group");
+		stop_server();
+		kcl_unregister_service(local_id);
+		goto fail;
+	}
+
+	return 0;
+
+ fail:
+	/* Undo the reference taken above so a later connect retries setup. */
+	mirror_set_count--;
+	return r;
+}
+
+/*
+ * cluster_disconnect - drop one mirror-set reference; the last one tears
+ * down the server and leaves the service group.
+ *
+ * NOTE(review): assumes it is always paired with a successful
+ * cluster_connect(); an unmatched call would underflow mirror_set_count.
+ */
+static int cluster_disconnect(void)
+{
+	if (--mirror_set_count)
+		return 0;
+
+	/* By setting 'shutting_down', the server will not be suspended **
+	** when a stop is received */
+	shutting_down = 1;
+	kcl_leave_service(local_id);
+	stop_server();
+	kcl_unregister_service(local_id);
+
+	return 0;
+}
+
+/* In-memory (non-persistent) cluster-aware dirty log type. */
+static struct dirty_log_type _clustered_core_type = {
+	.name = "clustered_core",
+	.module = THIS_MODULE,
+	.ctr = cluster_core_ctr,
+	.dtr = cluster_dtr,
+	.presuspend = cluster_presuspend,
+	.postsuspend = cluster_postsuspend,
+	.resume = cluster_resume,
+	.get_region_size = cluster_get_region_size,
+	.is_clean = cluster_is_clean,
+	.is_remote_recovering = cluster_is_remote_recovering,
+	.in_sync = cluster_in_sync,
+	.flush = cluster_flush,
+	.mark_region = cluster_mark_region,
+	.clear_region = cluster_clear_region,
+	.get_resync_work = cluster_get_resync_work,
+	.complete_resync_work = cluster_complete_resync_work,
+	.get_sync_count = cluster_get_sync_count,
+	.status = cluster_status,
+	.get_failure_response = cluster_get_failure_response,
+};
+
+/* Disk-backed cluster-aware dirty log type; identical to the core type
+** except for its constructor. */
+static struct dirty_log_type _clustered_disk_type = {
+	.name = "clustered_disk",
+	.module = THIS_MODULE,
+	.ctr = cluster_disk_ctr,
+	.dtr = cluster_dtr,
+	.presuspend = cluster_presuspend,
+	.postsuspend = cluster_postsuspend,
+	.resume = cluster_resume,
+	.get_region_size = cluster_get_region_size,
+	.is_clean = cluster_is_clean,
+	.is_remote_recovering = cluster_is_remote_recovering,
+	.in_sync = cluster_in_sync,
+	.flush = cluster_flush,
+	.mark_region = cluster_mark_region,
+	.clear_region = cluster_clear_region,
+	.get_resync_work = cluster_get_resync_work,
+	.complete_resync_work = cluster_complete_resync_work,
+	.get_sync_count = cluster_get_sync_count,
+	.status = cluster_status,
+	.get_failure_response = cluster_get_failure_response,
+};
+
+#define CMIRROR_RELEASE_NAME "0.1.0"
+
+/*
+ * cluster_dirty_log_init - module entry point.
+ *
+ * Creates the region-state mempool and registers both cluster-aware dirty
+ * log types with device-mapper.
+ *
+ * Returns 0 on success or a negative errno; all resources acquired here
+ * are released on failure, so a failed insmod leaves nothing behind.
+ *
+ * Fix: the original leaked region_state_pool when either
+ * dm_register_dirty_log_type() call failed.
+ */
+static int __init cluster_dirty_log_init(void)
+{
+	int r;
+
+	DMINFO("dm-cmirror %s (built %s %s) installed",
+	       CMIRROR_RELEASE_NAME, __DATE__, __TIME__);
+
+	INIT_LIST_HEAD(&clear_region_list);
+	INIT_LIST_HEAD(&marked_region_list);
+
+	spin_lock_init(&region_state_lock);
+	region_state_pool = mempool_create(20, region_state_alloc,
+					   region_state_free, NULL);
+	if (!region_state_pool) {
+		DMWARN("couldn't create region state pool");
+		return -ENOMEM;
+	}
+
+	init_waitqueue_head(&suspend_client_queue);
+
+	r = dm_register_dirty_log_type(&_clustered_core_type);
+	if (r) {
+		DMWARN("couldn't register clustered_core dirty log type");
+		goto fail_core;
+	}
+
+	r = dm_register_dirty_log_type(&_clustered_disk_type);
+	if (r) {
+		DMWARN("couldn't register clustered_disk dirty log type");
+		goto fail_disk;
+	}
+
+	return 0;
+
+ fail_disk:
+	dm_unregister_dirty_log_type(&_clustered_core_type);
+ fail_core:
+	mempool_destroy(region_state_pool);
+	region_state_pool = NULL;
+	return r;
+}
+
+/*
+ * cluster_dirty_log_exit - module teardown.
+ *
+ * Refuses (BUG) to unload while any dirty log instance still exists, then
+ * unregisters both log types and releases the region-state mempool.
+ *
+ * Fix: the original never destroyed region_state_pool, leaking it on
+ * every module unload.
+ */
+static void __exit cluster_dirty_log_exit(void)
+{
+	if (!list_empty(&log_list_head)) {
+		DMERR("attempt to remove module, but dirty logs are still in place!");
+		DMERR("this is a fatal error");
+		BUG();
+	}
+	dm_unregister_dirty_log_type(&_clustered_core_type);
+	dm_unregister_dirty_log_type(&_clustered_disk_type);
+	mempool_destroy(region_state_pool);
+}
+
+/* Module entry points and metadata. */
+module_init(cluster_dirty_log_init);
+module_exit(cluster_dirty_log_exit);
+
+MODULE_DESCRIPTION(DM_NAME " cluster capable mirror logs (clustered mirroring)");
+MODULE_AUTHOR("Jonathan Brassow");
+MODULE_LICENSE("GPL");
+
+/*
+ * Overrides for Emacs so that we follow Linus's tabbing style.
+ * Emacs will notice this stuff at the end of the file and automatically
+ * adjust the settings for this buffer only.  This must remain at the end
+ * of the file.
+ * ---------------------------------------------------------------------------
+ * Local variables:
+ * c-file-style: "linux"
+ * End:
+ */
--- cluster/cmirror-kernel/src/Attic/dm-cmirror-cman.c	2005/07/27 16:09:31	1.1
+++ cluster/cmirror-kernel/src/Attic/dm-cmirror-cman.c	2006/06/15 19:34:41	1.1.2.4
@@ -0,0 +1,109 @@
+/*
+ * Copyright (C) 2005 Red Hat, Inc.
+ *
+ * This file is released under the LGPL.
+ */
+#include <linux/init.h>
+#include <linux/slab.h>
+#include <linux/module.h>
+#include <linux/vmalloc.h>
+#include <linux/list.h>
+#include <linux/net.h>
+#include <linux/in.h>
+#include <linux/socket.h>
+#include <linux/signal.h>
+#include <linux/bio.h>
+#include <linux/blkdev.h>
+#include <linux/device-mapper.h>
+#include <cluster/service.h>
+#include <cluster/cnxman.h>
+#include <cluster/cnxman-socket.h>
+
+#include "dm-log.h"
+#include "dm-cmirror-common.h"
+
+uint32_t local_id;		/* service id returned by kcl_register_service() */
+uint32_t my_id=0;		/* this node's cluster nodeid */
+int global_count=0;		/* number of entries in global_nodeids */
+uint32_t *global_nodeids=NULL;	/* membership list from the latest 'start' (owned here) */
+
+/* Bookkeeping for the in-progress cluster service transition. */
+int restart_event_type=0;
+int restart_event_id=0;
+
+/*
+ * nodeid_to_ipaddr - resolve a cluster nodeid to an IPv4 address.
+ *
+ * Returns the s_addr of the FIRST address on the node's address list
+ * (the loop below returns on its first iteration), or 0 on failure.
+ * NOTE(review): the returned value is sin_addr.s_addr, i.e. presumably
+ * network byte order -- callers must expect that.
+ */
+uint32_t nodeid_to_ipaddr(uint32_t nodeid){
+	struct cluster_node_addr *cna;
+	struct sockaddr_in *saddr;
+	struct list_head *list = kcl_get_node_addresses(nodeid);
+	uint32_t buff[8];
+	int i, memb_count;
+
+	if(!list){
+		/* Dump membership state to aid debugging, then back off
+		** briefly (HZ/4) -- presumably to ride out a membership
+		** transition before the caller retries. */
+		DMERR("No address list for nodeid %u", nodeid);
+		DMERR(" - Cluster is %squorate", kcl_is_quorate() ? "" : "not ");
+		memb_count = kcl_get_member_ids(buff, 8);
+		DMERR(" - There are %d members.", memb_count);
+		for (i = 0; i < memb_count; i++)
+			DMERR(" - %u", buff[i]);
+		set_current_state(TASK_INTERRUPTIBLE);
+		schedule_timeout(HZ/4);
+		return 0;
+	}
+		
+
+	list_for_each_entry(cna, list, list){
+		saddr = (struct sockaddr_in *)(&cna->addr);
+		return (uint32_t)(saddr->sin_addr.s_addr);
+	}
+	return 0;
+}
+
+/*
+ * ipaddr_to_nodeid - resolve a peer's IP address to its cluster nodeid.
+ *
+ * Builds a lookup key by copying one of the LOCAL node's addresses of the
+ * matching family (reusing its non-address fields as a template) and
+ * substituting the peer's IP, then asks cman for the owning node.
+ *
+ * Returns the nodeid, or 0 if no match was found.
+ *
+ * NOTE(review): 'tmp->addr' is used here without '&', while the sibling
+ * nodeid_to_ipaddr() uses '&cna->addr' -- both only work if addr is an
+ * array member; confirm against struct cluster_node_addr.
+ */
+uint32_t ipaddr_to_nodeid(struct sockaddr *addr){
+	struct list_head *addr_list;
+	struct kcl_cluster_node node;
+	struct cluster_node_addr *tmp;
+
+	if(!(addr_list = kcl_get_node_addresses(my_id))){
+		DMWARN("No address list available for %u\n", my_id);
+		goto fail;
+	}
+
+	if(addr->sa_family == AF_INET){
+		struct sockaddr_in a4;
+		struct sockaddr_in *tmp_addr;
+		list_for_each_entry(tmp, addr_list, list){
+			tmp_addr = (struct sockaddr_in *)tmp->addr;
+			if(tmp_addr->sin_family == AF_INET){
+				/* local address as template, peer's IP swapped in */
+				memcpy(&a4, tmp_addr, sizeof(a4));
+				memcpy(&a4.sin_addr,
+				       &((struct sockaddr_in *)addr)->sin_addr,
+				       sizeof(a4.sin_addr));
+				if(!kcl_get_node_by_addr((char *)&a4,
+							 sizeof(a4),
+							 &node)){
+					return node.node_id;
+				}
+			}
+		}
+	} else if(addr->sa_family == AF_INET6){
+		struct sockaddr_in6 a6;
+		struct sockaddr_in6 *tmp_addr;
+		list_for_each_entry(tmp, addr_list, list){
+			tmp_addr = (struct sockaddr_in6 *)tmp->addr;
+			if(tmp_addr->sin6_family == AF_INET6){
+				/* same template trick for IPv6 */
+				memcpy(&a6, tmp_addr, sizeof(a6));
+				memcpy(&a6.sin6_addr,
+				       &((struct sockaddr_in6 *)addr)->sin6_addr,
+				       sizeof(a6.sin6_addr));
+				if(!kcl_get_node_by_addr((char *)&a6,
+							 sizeof(a6),
+							 &node)){
+					return node.node_id;
+				}
+			}
+		}
+	}
+
+ fail:
+	DMWARN("Failed to convert IP address to nodeid.");
+	return 0;
+}



^ permalink raw reply	[flat|nested] 40+ messages in thread

* [Cluster-devel] cluster/cmirror-kernel/src dm-cmirror-client.c ...
@ 2006-06-15 19:34 jbrassow
  0 siblings, 0 replies; 40+ messages in thread
From: jbrassow @ 2006-06-15 19:34 UTC (permalink / raw)
  To: cluster-devel.redhat.com

CVSROOT:	/cvs/cluster
Module name:	cluster
Branch: 	RHEL4
Changes by:	jbrassow at sourceware.org	2006-06-15 19:34:41

Modified files:
	cmirror-kernel/src: dm-cmirror-client.c dm-cmirror-cman.c 

Log message:
	- last typo and a message clean-up

Patches:
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cmirror-kernel/src/dm-cmirror-client.c.diff?cvsroot=cluster&only_with_tag=RHEL4&r1=1.1.2.17&r2=1.1.2.18
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cmirror-kernel/src/dm-cmirror-cman.c.diff?cvsroot=cluster&only_with_tag=RHEL4&r1=1.1.2.3&r2=1.1.2.4



^ permalink raw reply	[flat|nested] 40+ messages in thread

* [Cluster-devel] cluster/cmirror-kernel/src dm-cmirror-client.c ...
@ 2006-06-13 16:26 jbrassow
  0 siblings, 0 replies; 40+ messages in thread
From: jbrassow @ 2006-06-13 16:26 UTC (permalink / raw)
  To: cluster-devel.redhat.com

CVSROOT:	/cvs/cluster
Module name:	cluster
Branch: 	RHEL4
Changes by:	jbrassow at sourceware.org	2006-06-13 16:26:15

Modified files:
	cmirror-kernel/src: dm-cmirror-client.c dm-cmirror-cman.c 
	                    dm-cmirror-server.c 

Log message:
	- downgrade some messages to INFO
	- serialize client requests to prevent getting wrong response
	- properly clear regions before shutting down a mirror
	- do not allocate memory while holding a spin lock
	- do not set sync bit multiple times if network failures occur

Patches:
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cmirror-kernel/src/dm-cmirror-client.c.diff?cvsroot=cluster&only_with_tag=RHEL4&r1=1.1.2.14&r2=1.1.2.15
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cmirror-kernel/src/dm-cmirror-cman.c.diff?cvsroot=cluster&only_with_tag=RHEL4&r1=1.1.2.2&r2=1.1.2.3
http://sourceware.org/cgi-bin/cvsweb.cgi/cluster/cmirror-kernel/src/dm-cmirror-server.c.diff?cvsroot=cluster&only_with_tag=RHEL4&r1=1.1.2.7&r2=1.1.2.8



^ permalink raw reply	[flat|nested] 40+ messages in thread

end of thread, other threads:[~2007-10-03 19:02 UTC | newest]

Thread overview: 40+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2007-02-20 19:35 [Cluster-devel] cluster/cmirror-kernel/src dm-cmirror-client.c jbrassow
  -- strict thread matches above, loose matches on Subject: below --
2007-10-03 19:02 jbrassow
2007-09-27 20:31 jbrassow
2007-09-26  3:15 jbrassow
2007-09-21 20:07 jbrassow
2007-09-13 15:24 jbrassow
2007-07-11 16:18 jbrassow
2007-04-26 16:55 jbrassow
2007-04-26 16:54 jbrassow
2007-04-24 20:10 jbrassow
2007-04-24 20:08 jbrassow
2007-04-10  7:13 jbrassow
2007-04-10  7:12 jbrassow
2007-04-05 21:33 jbrassow
2007-04-05 21:32 jbrassow
2007-04-03 18:23 jbrassow
2007-04-03 18:21 jbrassow
2007-03-22 22:34 jbrassow
2007-03-22 22:22 jbrassow
2007-03-14  4:28 jbrassow
2007-02-26 17:38 jbrassow
2007-02-19 16:29 jbrassow
2007-02-14 17:44 jbrassow
2007-02-02 17:22 jbrassow
2007-01-08 19:28 jbrassow
2006-12-07 18:58 jbrassow
2006-09-05 17:50 jbrassow
2006-09-05 17:48 jbrassow
2006-07-27 23:11 jbrassow
2006-07-27 23:11 jbrassow
2006-07-22 22:19 jbrassow
2006-07-22 22:19 jbrassow
2006-07-22 22:12 jbrassow
2006-06-29 19:49 jbrassow
2006-06-29 19:48 jbrassow
2006-06-29 19:46 jbrassow
2006-06-27 20:19 jbrassow
2006-06-15 19:48 jbrassow
2006-06-15 19:34 jbrassow
2006-06-13 16:26 jbrassow

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.