All of lore.kernel.org
 help / color / mirror / Atom feed
* [Cluster-devel] [PATCH] qdiskd: Make multipath issues go away
@ 2012-07-02 13:55 Fabio M. Di Nitto
  2012-07-13 20:16 ` Lon Hohberger
  0 siblings, 1 reply; 6+ messages in thread
From: Fabio M. Di Nitto @ 2012-07-02 13:55 UTC (permalink / raw)
  To: cluster-devel.redhat.com

From: Lon Hohberger <lhh@redhat.com>

Qdiskd hsitorically has required significant tuning to work around
delays which occur during multipath failover, overloaded I/O, and LUN
trespasses in both device-mapper-multipath and EMC PowerPath
environments.

This patch goes a very long way towards eliminating false evictions
when these conditions occur by making qdiskd whine to the other
cluster members when it detects hung system calls.  When a cluster
member whines, it indicates the source of the problem (which system
call is hung), and the act of receiving a whine from a host indicates
that qdiskd is operational, but that I/O is hung.  Hung I/O is different
from losing storage entirely (where you get I/O errors).

Possible problems:

- Receive queue getting very full, causing messages to become blocked on
a node where I/O is hung.  1) that would take a very long time, and 2)
node should get evicted at that point anyway.

Resolves: rhbz#782900

this version of the patch is a backport of:
e2937eb33f224f86904fead08499a6178868ca6a
34d2872fb7e60be1594158acaaeb8acd74f78d22

There is a minor change vs original patch based on how qdiskd
in RHEL5 handles cman connection. We add an extra call to cman_alive
in main qdisk_loop to make sure data are not stalled on the
cman port, and data_callback to qdiskd_whine executed.

Signed-off-by: Lon Hohberger <lhh@redhat.com>
Signed-off-by: Fabio M. Di Nitto <fdinitto@redhat.com>
---
 cman/daemon/cnxman-socket.h |    1 +
 cman/qdisk/Makefile         |    2 +-
 cman/qdisk/disk.h           |    6 ++++
 cman/qdisk/iostate.c        |   17 +++++++++++--
 cman/qdisk/iostate.h        |    4 ++-
 cman/qdisk/main.c           |   54 +++++++++++++++++++++++++++++++++++++++----
 6 files changed, 74 insertions(+), 10 deletions(-)

diff --git a/cman/daemon/cnxman-socket.h b/cman/daemon/cnxman-socket.h
index 351c97c..1d01b44 100644
--- a/cman/daemon/cnxman-socket.h
+++ b/cman/daemon/cnxman-socket.h
@@ -79,6 +79,7 @@
 #define CLUSTER_PORT_SERVICES    2
 #define CLUSTER_PORT_SYSMAN      10	/* Remote execution daemon */
 #define CLUSTER_PORT_CLVMD       11	/* Cluster LVM daemon */
+#define	CLUSTER_PORT_QDISKD	 178	/* Quorum disk daemon */
 
 /* Port numbers above this will be blocked when the cluster is inquorate or in
  * transition */
diff --git a/cman/qdisk/Makefile b/cman/qdisk/Makefile
index f58806b..9bfc486 100644
--- a/cman/qdisk/Makefile
+++ b/cman/qdisk/Makefile
@@ -32,7 +32,7 @@ qdiskd: disk.o crc32.o disk_util.o main.o score.o bitmap.o clulog.o \
 	gcc -o $@ $^ -lpthread -L../lib -L${ccslibdir} -lccs -lrt
 
 mkqdisk: disk.o crc32.o disk_util.o iostate.o \
-	 proc.o mkqdisk.o scandisk.o clulog.o gettid.o
+	 proc.o mkqdisk.o scandisk.o clulog.o gettid.o ../lib/libcman.a
 	gcc -o $@ $^ -lrt
 
 %.o: %.c
diff --git a/cman/qdisk/disk.h b/cman/qdisk/disk.h
index b784220..d491de1 100644
--- a/cman/qdisk/disk.h
+++ b/cman/qdisk/disk.h
@@ -290,6 +290,12 @@ typedef struct {
 	status_block_t ni_status;
 } node_info_t;
 
+typedef struct {
+	qd_ctx *ctx;
+	node_info_t *ni;
+	size_t ni_len;
+} qd_priv_t;
+
 int qd_write_status(qd_ctx *ctx, int nid, disk_node_state_t state,
 		    disk_msg_t *msg, memb_mask_t mask, memb_mask_t master);
 int qd_read_print_status(target_info_t *disk, int nid);
diff --git a/cman/qdisk/iostate.c b/cman/qdisk/iostate.c
index 65b4d50..eb74ad2 100644
--- a/cman/qdisk/iostate.c
+++ b/cman/qdisk/iostate.c
@@ -1,10 +1,14 @@
 #include <pthread.h>
+#include <libcman.h>
 #include <iostate.h>
 #include <unistd.h>
 #include <time.h>
 #include <sys/time.h>
 #include <clulog.h>
+#include <stdint.h>
+#include "platform.h"
 #include "iostate.h"
+#include "../daemon/cnxman-socket.h"
 
 static iostate_t main_state = 0;
 static int main_incarnation = 0;
@@ -26,7 +30,7 @@ static struct state_table io_state_table[] = {
 {	STATE_LSEEK,	"seek"	},
 {	-1,		NULL	} };
 
-static const char *
+const char *
 state_to_string(iostate_t state)
 {
 	static const char *ret = "unknown";
@@ -65,6 +69,8 @@ io_nanny_thread(void *arg)
 	iostate_t last_main_state = 0, current_main_state = 0;
 	int last_main_incarnation = 0, current_main_incarnation = 0;
 	int logged_incarnation = 0;
+	cman_handle_t ch = (cman_handle_t)arg;
+	int32_t whine_state;
 
 	/* Start with wherever we're at now */
 	pthread_mutex_lock(&state_mutex);
@@ -96,6 +102,11 @@ io_nanny_thread(void *arg)
 			continue;
 		}
 
+		/* Whine on CMAN api */
+		whine_state = (int32_t)current_main_state;
+		swab32(whine_state);
+		cman_send_data(ch, &whine_state, sizeof(int32_t), 0, CLUSTER_PORT_QDISKD, 0);
+
 		/* Don't log things twice */
 		if (logged_incarnation == current_main_incarnation)
 			continue;
@@ -114,7 +125,7 @@ io_nanny_thread(void *arg)
 
 
 int
-io_nanny_start(int timeout)
+io_nanny_start(cman_handle_t ch, int timeout)
 {
 	int ret;
 
@@ -124,7 +135,7 @@ io_nanny_start(int timeout)
 	qdisk_timeout = timeout;
 	thread_active = 1;
 
-	ret = pthread_create(&io_nanny_tid, NULL, io_nanny_thread, NULL);
+	ret = pthread_create(&io_nanny_tid, NULL, io_nanny_thread, ch);
 	pthread_mutex_unlock(&state_mutex);
 
 	return ret;
diff --git a/cman/qdisk/iostate.h b/cman/qdisk/iostate.h
index 7dd7bf6..a65b1d4 100644
--- a/cman/qdisk/iostate.h
+++ b/cman/qdisk/iostate.h
@@ -11,7 +11,9 @@ typedef enum {
 
 void io_state(iostate_t state);
 
-int io_nanny_start(int timeout);
+int io_nanny_start(cman_handle_t ch, int timeout);
 int io_nanny_stop(void);
 
+const char * state_to_string(iostate_t state);
+
 #endif
diff --git a/cman/qdisk/main.c b/cman/qdisk/main.c
index 0d7bb3d..90d00ab 100644
--- a/cman/qdisk/main.c
+++ b/cman/qdisk/main.c
@@ -48,6 +48,7 @@
      (defined(LIBCMAN_VERSION) && LIBCMAN_VERSION < 2))
 #include <cluster/cnxman-socket.h>
 #endif
+#include "../daemon/cnxman-socket.h"
 
 int daemon_init(char *);
 int check_process_running(char *, pid_t *);
@@ -892,6 +893,11 @@ quorum_loop(qd_ctx *ctx, node_info_t *ni, int max)
 	
 	_running = 1;
 	while (_running) {
+		/* perform a forceful cman dispatch */
+		if (cman_alive(ctx->qc_ch) < 0) {
+			clulog(LOG_ERR, "cman: %s\n", strerror(errno));
+		}
+
 		/* XXX this was getuptime() in clumanager */
 		get_time(&oldtime, (ctx->qc_flags&RF_UPTIME));
 
@@ -1514,6 +1520,31 @@ check_stop_cman(qd_ctx *ctx)
 	}
 }
 
+static void
+qdisk_whine(cman_handle_t h, void *privdata, char *buf, int len,
+	    uint8_t port, int nodeid)
+{
+	int32_t dstate;
+	qd_priv_t *qp = (qd_priv_t *)privdata;
+	node_info_t *ni = qp->ni;
+
+	if (len != sizeof(dstate)) {
+		return;
+	}
+
+	dstate = *((int32_t*)buf);
+
+	if (nodeid == (qp->ctx->qc_my_id))
+		return;
+
+	swab32(dstate);
+
+	if (dstate) {
+		clulog(LOG_CRIT, "qdiskd on node %d reports hung %s()\n",
+		       state_to_string(dstate));
+		ni[nodeid-1].ni_misses = 0;
+	}
+}
 
 int
 main(int argc, char **argv)
@@ -1528,6 +1559,7 @@ main(int argc, char **argv)
 	char device[128];
 	pid_t pid;
 	quorum_header_t qh;
+	qd_priv_t qp;
 
 	if (check_process_running(argv[0], &pid) && pid !=getpid()) {
 		printf("QDisk services already running\n");
@@ -1559,10 +1591,16 @@ main(int argc, char **argv)
 		}
 	}
 
+	/* For cman notifications we need two sockets - one for events,
+	   one for config change callbacks */
+	qp.ctx = &ctx;
+	qp.ni = &ni[0];
+	qp.ni_len = MAX_NODES_DISK;
+
 #if (defined(LIBCMAN_VERSION) && LIBCMAN_VERSION >= 2)
-	ch = cman_admin_init(NULL);
+	ch = cman_admin_init(&qp);
 #else
-	ch = cman_init(NULL);
+	ch = cman_init(&qp);
 #endif
 	if (!ch) {
 		if (!foreground && !forked) {
@@ -1577,13 +1615,19 @@ main(int argc, char **argv)
 		do {
 			sleep(5);
 #if (defined(LIBCMAN_VERSION) && LIBCMAN_VERSION >= 2)
-			ch = cman_admin_init(NULL);
+			ch = cman_admin_init(&qp);
 #else
-			ch = cman_init(NULL);
+			ch = cman_init(&qp);
 #endif
 		} while (!ch);
 	}
 
+	if (cman_start_recv_data(ch, qdisk_whine, CLUSTER_PORT_QDISKD) != 0) {
+		clulog_and_print(LOG_CRIT, "Could not register with CMAN: %s\n",
+				 strerror(errno));
+		goto out;
+	}
+
 	memset(&me, 0, sizeof(me));
 	while (cman_get_node(ch, CMAN_NODEID_US, &me) < 0) {
 		if (!foreground && !forked) {
@@ -1696,7 +1740,7 @@ main(int argc, char **argv)
 		}
 	}
 
-	io_nanny_start(ctx.qc_tko * ctx.qc_interval);
+	io_nanny_start(ch, ctx.qc_tko * ctx.qc_interval);
 
 	if (quorum_loop(&ctx, ni, MAX_NODES_DISK) == 0)
 		cman_unregister_quorum_device(ctx.qc_ch);
-- 
1.7.7.6



^ permalink raw reply related	[flat|nested] 6+ messages in thread

* [Cluster-devel] [PATCH] qdiskd: Make multipath issues go away
  2012-07-02 13:55 [Cluster-devel] [PATCH] qdiskd: Make multipath issues go away Fabio M. Di Nitto
@ 2012-07-13 20:16 ` Lon Hohberger
  0 siblings, 0 replies; 6+ messages in thread
From: Lon Hohberger @ 2012-07-13 20:16 UTC (permalink / raw)
  To: cluster-devel.redhat.com

On 07/02/2012 09:55 AM, Fabio M. Di Nitto wrote:
> From: Lon Hohberger <lhh@redhat.com>
> 
> Qdiskd hsitorically has required significant tuning to work around
> delays which occur during multipath failover, overloaded I/O, and LUN
> trespasses in both device-mapper-multipath and EMC PowerPath
> environments.
> 
> This patch goes a very long way towards eliminating false evictions
> when these conditions occur by making qdiskd whine to the other
> cluster members when it detects hung system calls.  When a cluster
> member whines, it indicates the source of the problem (which system
> call is hung), and the act of receiving a whine from a host indicates
> that qdiskd is operational, but that I/O is hung.  Hung I/O is different
> from losing storage entirely (where you get I/O errors).
> 
> Possible problems:
> 
> - Receive queue getting very full, causing messages to become blocked on
> a node where I/O is hung.  1) that would take a very long time, and 2)
> node should get evicted at that point anyway.
> 
> Resolves: rhbz#782900
> 
> this version of the patch is a backport of:
> e2937eb33f224f86904fead08499a6178868ca6a
> 34d2872fb7e60be1594158acaaeb8acd74f78d22
> 
> There is a minor change vs original patch based on how qdiskd
> in RHEL5 handles cman connection. We add an extra call to cman_alive
> in main qdisk_loop to make sure data are not stalled on the
> cman port, and data_callback to qdiskd_whine executed.
> 
> Signed-off-by: Lon Hohberger <lhh@redhat.com>
> Signed-off-by: Fabio M. Di Nitto <fdinitto@redhat.com>

Re-ack :)



^ permalink raw reply	[flat|nested] 6+ messages in thread

* [Cluster-devel] [PATCH] qdiskd: Make multipath issues go away
  2012-02-17  5:38 ` Fabio M. Di Nitto
@ 2012-02-17 21:40   ` Lon Hohberger
  0 siblings, 0 replies; 6+ messages in thread
From: Lon Hohberger @ 2012-02-17 21:40 UTC (permalink / raw)
  To: cluster-devel.redhat.com

On 02/17/2012 12:38 AM, Fabio M. Di Nitto wrote:
> The patch looks good and ACK, with one minor nitpick.
>
> that 178 port value in cman_recv/send_data is rather cryptic.
>
> I would prefer to see it defined as others in cman/cnxman-socket.h
> for documentation purposes (so we know it's QDISK).

I pushed a patch to add this to STABLE32, merged it with this one, and 
resent.

-- Lon



^ permalink raw reply	[flat|nested] 6+ messages in thread

* [Cluster-devel] [PATCH] qdiskd: Make multipath issues go away
  2012-02-15 23:53 Lon Hohberger
@ 2012-02-17  5:38 ` Fabio M. Di Nitto
  2012-02-17 21:40   ` Lon Hohberger
  0 siblings, 1 reply; 6+ messages in thread
From: Fabio M. Di Nitto @ 2012-02-17  5:38 UTC (permalink / raw)
  To: cluster-devel.redhat.com

The patch looks good and ACK, with one minor nitpick.

that 178 port value in cman_recv/send_data is rather cryptic.

I would prefer to see it defined as others in cman/cnxman-socket.h
for documentation purposes (so we know it's QDISK).

Fabio

On 02/16/2012 12:53 AM, Lon Hohberger wrote:
> Qdiskd hsitorically has required significant tuning to work around
> delays which occur during multipath failover, overloaded I/O, and LUN
> trespasses in both device-mapper-multipath and EMC PowerPath
> environments.
> 
> This patch goes a very long way towards eliminating false evictions
> when these conditions occur by making qdiskd whine to the other
> cluster members when it detects hung system calls.  When a cluster
> member whines, it indicates the source of the problem (which system
> call is hung), and the act of receiving a whine from a host indicates
> that qdiskd is operational, but that I/O is hung.  Hung I/O is different
> from losing storage entirely (where you get I/O errors).
> 
> Possible problems:
> 
> - Receive queue getting very full, causing messages to become blocked on
> a node where I/O is hung.  1) that would take a very long time, and 2)
> node should get evicted at that point anyway.
> 
> Resolves: rhbz#678372
> 
> Signed-off-by: Lon Hohberger <lhh@redhat.com>
> ---
>  cman/qdisk/Makefile  |    2 +-
>  cman/qdisk/disk.h    |    6 ++++++
>  cman/qdisk/iostate.c |   16 +++++++++++++---
>  cman/qdisk/iostate.h |    4 +++-
>  cman/qdisk/main.c    |   45 ++++++++++++++++++++++++++++++++++++++++++---
>  5 files changed, 65 insertions(+), 8 deletions(-)
> 
> diff --git a/cman/qdisk/Makefile b/cman/qdisk/Makefile
> index 68e20cd..e3bb5f7 100644
> --- a/cman/qdisk/Makefile
> +++ b/cman/qdisk/Makefile
> @@ -40,7 +40,7 @@ ${TARGET1}: ${SHAREDOBJS} ${OBJS1}
>  	$(CC) -o $@ $^ $(EXTRA_LDFLAGS) $(LDFLAGS)
>  
>  ${TARGET2}: ${SHAREDOBJS} ${OBJS2}
> -	$(CC) -o $@ $^ $(LDFLAGS)
> +	$(CC) -o $@ $^ $(EXTRA_LDFLAGS) $(LDFLAGS)
>  
>  depends:
>  	$(MAKE) -C ../lib all
> diff --git a/cman/qdisk/disk.h b/cman/qdisk/disk.h
> index 93d15fe..fd80fa6 100644
> --- a/cman/qdisk/disk.h
> +++ b/cman/qdisk/disk.h
> @@ -273,6 +273,12 @@ typedef struct {
>  	status_block_t ni_status;
>  } node_info_t;
>  
> +typedef struct {
> +	qd_ctx *ctx;
> +	node_info_t *ni;
> +	size_t ni_len;
> +} qd_priv_t;
> +
>  int qd_write_status(qd_ctx *ctx, int nid, disk_node_state_t state,
>  		    disk_msg_t *msg, memb_mask_t mask, memb_mask_t master);
>  int qd_init(qd_ctx *ctx, cman_handle_t ch_admin,
> diff --git a/cman/qdisk/iostate.c b/cman/qdisk/iostate.c
> index 0199da4..06c6831 100644
> --- a/cman/qdisk/iostate.c
> +++ b/cman/qdisk/iostate.c
> @@ -1,9 +1,12 @@
>  #include <pthread.h>
> +#include <libcman.h>
>  #include <iostate.h>
>  #include <unistd.h>
>  #include <time.h>
>  #include <sys/time.h>
>  #include <liblogthread.h>
> +#include <stdint.h>
> +#include "platform.h"
>  #include "iostate.h"
>  
>  static iostate_t main_state = 0;
> @@ -26,7 +29,7 @@ static struct state_table io_state_table[] = {
>  {	STATE_LSEEK,	"seek"	},
>  {	-1,		NULL	} };
>  
> -static const char *
> +const char *
>  state_to_string(iostate_t state)
>  {
>  	static const char *ret = "unknown";
> @@ -65,6 +68,8 @@ io_nanny_thread(void *arg)
>  	iostate_t last_main_state = 0, current_main_state = 0;
>  	int last_main_incarnation = 0, current_main_incarnation = 0;
>  	int logged_incarnation = 0;
> +	cman_handle_t ch = (cman_handle_t)arg;
> +	int32_t whine_state;
>  
>  	/* Start with wherever we're at now */
>  	pthread_mutex_lock(&state_mutex);
> @@ -96,6 +101,11 @@ io_nanny_thread(void *arg)
>  			continue;
>  		}
>  
> +		/* Whine on CMAN api */
> +		whine_state = (int32_t)current_main_state;
> +		swab32(whine_state);
> +		cman_send_data(ch, &whine_state, sizeof(int32_t), 0, 178, 0);
> +
>  		/* Don't log things twice */
>  		if (logged_incarnation == current_main_incarnation)
>  			continue;
> @@ -114,7 +124,7 @@ io_nanny_thread(void *arg)
>  
>  
>  int
> -io_nanny_start(int timeout)
> +io_nanny_start(cman_handle_t ch, int timeout)
>  {
>  	int ret;
>  
> @@ -124,7 +134,7 @@ io_nanny_start(int timeout)
>  	qdisk_timeout = timeout;
>  	thread_active = 1;
>  
> -	ret = pthread_create(&io_nanny_tid, NULL, io_nanny_thread, NULL);
> +	ret = pthread_create(&io_nanny_tid, NULL, io_nanny_thread, ch);
>  	pthread_mutex_unlock(&state_mutex);
>  
>  	return ret;
> diff --git a/cman/qdisk/iostate.h b/cman/qdisk/iostate.h
> index 7dd7bf6..a65b1d4 100644
> --- a/cman/qdisk/iostate.h
> +++ b/cman/qdisk/iostate.h
> @@ -11,7 +11,9 @@ typedef enum {
>  
>  void io_state(iostate_t state);
>  
> -int io_nanny_start(int timeout);
> +int io_nanny_start(cman_handle_t ch, int timeout);
>  int io_nanny_stop(void);
>  
> +const char * state_to_string(iostate_t state);
> +
>  #endif
> diff --git a/cman/qdisk/main.c b/cman/qdisk/main.c
> index a90e82c..d613d84 100644
> --- a/cman/qdisk/main.c
> +++ b/cman/qdisk/main.c
> @@ -915,7 +915,8 @@ cman_wait(cman_handle_t ch, struct timeval *_tv)
>  static void
>  process_cman_event(cman_handle_t handle, void *private, int reason, int arg)
>  {
> -	qd_ctx *ctx = (qd_ctx *)private;
> +	qd_priv_t *qp = (qd_priv_t *)private;
> +	qd_ctx *ctx = qp->ctx;
>  
>  	switch(reason) {
>  	case CMAN_REASON_PORTOPENED:
> @@ -1926,6 +1927,33 @@ check_stop_cman(qd_ctx *ctx)
>  do { static int _logged=0; if (!_logged) { _logged=1; logt_print(level, fmt, ##args); } } while(0)
>  
>  
> +static void
> +qdisk_whine(cman_handle_t h, void *privdata, char *buf, int len,
> +	    uint8_t port, int nodeid)
> +{
> +	int32_t dstate;
> +	qd_priv_t *qp = (qd_priv_t *)privdata;
> +	node_info_t *ni = qp->ni;
> +
> +	if (len != sizeof(dstate)) {
> +		return;
> +	}
> +
> +	dstate = *((int32_t*)buf);
> +
> +	if (nodeid == (qp->ctx->qc_my_id))
> +		return;
> +
> +	swab32(dstate);
> +
> +	if (dstate) {
> +		logt_print(LOG_NOTICE, "qdiskd on node %d reports hung %s()\n", nodeid,
> +			   state_to_string(dstate));
> +		ni[nodeid-1].ni_misses = 0;
> +	}
> +}
> +
> +
>  int
>  main(int argc, char **argv)
>  {
> @@ -1939,6 +1967,7 @@ main(int argc, char **argv)
>  	char device[128];
>  	pid_t pid;
>  	quorum_header_t qh;
> +	qd_priv_t qp;
>  
>  	if (check_process_running(argv[0], &pid) && pid !=getpid()) {
>  		printf("QDisk services already running\n");
> @@ -2001,13 +2030,23 @@ main(int argc, char **argv)
>  
>  	/* For cman notifications we need two sockets - one for events,
>  	   one for config change callbacks */
> -	ch_user = cman_init(&ctx);
> +	qp.ctx = &ctx;
> +	qp.ni = &ni[0];
> +	qp.ni_len = MAX_NODES_DISK;
> +
> +	ch_user = cman_init(&qp);
>          if (cman_start_notification(ch_user, process_cman_event) != 0) {
>  		logt_print(LOG_CRIT, "Could not register with CMAN: %s\n",
>  			   strerror(errno));
>  		goto out;
>  	}
>  
> +	if (cman_start_recv_data(ch_user, qdisk_whine, 178) != 0) {
> +		logt_print(LOG_CRIT, "Could not register with CMAN: %s\n",
> +			   strerror(errno));
> +		goto out;
> +	}
> +
>  	memset(&me, 0, sizeof(me));
>  	if (cman_get_node(ch_admin, CMAN_NODEID_US, &me) < 0) {
>  		logt_print(LOG_CRIT, "Could not determine local node ID: %s\n",
> @@ -2108,7 +2147,7 @@ main(int argc, char **argv)
>  		}
>  	}
>  
> -	io_nanny_start(ctx.qc_tko * ctx.qc_interval);
> +	io_nanny_start(ch_user, ctx.qc_tko * ctx.qc_interval);
>  
>  	if (quorum_loop(&ctx, ni, MAX_NODES_DISK) == 0) {
>  		/* Only clean up if we're exiting w/o error) */



^ permalink raw reply	[flat|nested] 6+ messages in thread

* [Cluster-devel] [PATCH] qdiskd: Make multipath issues go away
@ 2012-02-15 23:53 Lon Hohberger
  2012-02-17  5:38 ` Fabio M. Di Nitto
  0 siblings, 1 reply; 6+ messages in thread
From: Lon Hohberger @ 2012-02-15 23:53 UTC (permalink / raw)
  To: cluster-devel.redhat.com

Qdiskd hsitorically has required significant tuning to work around
delays which occur during multipath failover, overloaded I/O, and LUN
trespasses in both device-mapper-multipath and EMC PowerPath
environments.

This patch goes a very long way towards eliminating false evictions
when these conditions occur by making qdiskd whine to the other
cluster members when it detects hung system calls.  When a cluster
member whines, it indicates the source of the problem (which system
call is hung), and the act of receiving a whine from a host indicates
that qdiskd is operational, but that I/O is hung.  Hung I/O is different
from losing storage entirely (where you get I/O errors).

Possible problems:

- Receive queue getting very full, causing messages to become blocked on
a node where I/O is hung.  1) that would take a very long time, and 2)
node should get evicted at that point anyway.

Resolves: rhbz#678372

Signed-off-by: Lon Hohberger <lhh@redhat.com>
---
 cman/qdisk/Makefile  |    2 +-
 cman/qdisk/disk.h    |    6 ++++++
 cman/qdisk/iostate.c |   16 +++++++++++++---
 cman/qdisk/iostate.h |    4 +++-
 cman/qdisk/main.c    |   45 ++++++++++++++++++++++++++++++++++++++++++---
 5 files changed, 65 insertions(+), 8 deletions(-)

diff --git a/cman/qdisk/Makefile b/cman/qdisk/Makefile
index 68e20cd..e3bb5f7 100644
--- a/cman/qdisk/Makefile
+++ b/cman/qdisk/Makefile
@@ -40,7 +40,7 @@ ${TARGET1}: ${SHAREDOBJS} ${OBJS1}
 	$(CC) -o $@ $^ $(EXTRA_LDFLAGS) $(LDFLAGS)
 
 ${TARGET2}: ${SHAREDOBJS} ${OBJS2}
-	$(CC) -o $@ $^ $(LDFLAGS)
+	$(CC) -o $@ $^ $(EXTRA_LDFLAGS) $(LDFLAGS)
 
 depends:
 	$(MAKE) -C ../lib all
diff --git a/cman/qdisk/disk.h b/cman/qdisk/disk.h
index 93d15fe..fd80fa6 100644
--- a/cman/qdisk/disk.h
+++ b/cman/qdisk/disk.h
@@ -273,6 +273,12 @@ typedef struct {
 	status_block_t ni_status;
 } node_info_t;
 
+typedef struct {
+	qd_ctx *ctx;
+	node_info_t *ni;
+	size_t ni_len;
+} qd_priv_t;
+
 int qd_write_status(qd_ctx *ctx, int nid, disk_node_state_t state,
 		    disk_msg_t *msg, memb_mask_t mask, memb_mask_t master);
 int qd_init(qd_ctx *ctx, cman_handle_t ch_admin,
diff --git a/cman/qdisk/iostate.c b/cman/qdisk/iostate.c
index 0199da4..06c6831 100644
--- a/cman/qdisk/iostate.c
+++ b/cman/qdisk/iostate.c
@@ -1,9 +1,12 @@
 #include <pthread.h>
+#include <libcman.h>
 #include <iostate.h>
 #include <unistd.h>
 #include <time.h>
 #include <sys/time.h>
 #include <liblogthread.h>
+#include <stdint.h>
+#include "platform.h"
 #include "iostate.h"
 
 static iostate_t main_state = 0;
@@ -26,7 +29,7 @@ static struct state_table io_state_table[] = {
 {	STATE_LSEEK,	"seek"	},
 {	-1,		NULL	} };
 
-static const char *
+const char *
 state_to_string(iostate_t state)
 {
 	static const char *ret = "unknown";
@@ -65,6 +68,8 @@ io_nanny_thread(void *arg)
 	iostate_t last_main_state = 0, current_main_state = 0;
 	int last_main_incarnation = 0, current_main_incarnation = 0;
 	int logged_incarnation = 0;
+	cman_handle_t ch = (cman_handle_t)arg;
+	int32_t whine_state;
 
 	/* Start with wherever we're at now */
 	pthread_mutex_lock(&state_mutex);
@@ -96,6 +101,11 @@ io_nanny_thread(void *arg)
 			continue;
 		}
 
+		/* Whine on CMAN api */
+		whine_state = (int32_t)current_main_state;
+		swab32(whine_state);
+		cman_send_data(ch, &whine_state, sizeof(int32_t), 0, 178, 0);
+
 		/* Don't log things twice */
 		if (logged_incarnation == current_main_incarnation)
 			continue;
@@ -114,7 +124,7 @@ io_nanny_thread(void *arg)
 
 
 int
-io_nanny_start(int timeout)
+io_nanny_start(cman_handle_t ch, int timeout)
 {
 	int ret;
 
@@ -124,7 +134,7 @@ io_nanny_start(int timeout)
 	qdisk_timeout = timeout;
 	thread_active = 1;
 
-	ret = pthread_create(&io_nanny_tid, NULL, io_nanny_thread, NULL);
+	ret = pthread_create(&io_nanny_tid, NULL, io_nanny_thread, ch);
 	pthread_mutex_unlock(&state_mutex);
 
 	return ret;
diff --git a/cman/qdisk/iostate.h b/cman/qdisk/iostate.h
index 7dd7bf6..a65b1d4 100644
--- a/cman/qdisk/iostate.h
+++ b/cman/qdisk/iostate.h
@@ -11,7 +11,9 @@ typedef enum {
 
 void io_state(iostate_t state);
 
-int io_nanny_start(int timeout);
+int io_nanny_start(cman_handle_t ch, int timeout);
 int io_nanny_stop(void);
 
+const char * state_to_string(iostate_t state);
+
 #endif
diff --git a/cman/qdisk/main.c b/cman/qdisk/main.c
index a90e82c..d613d84 100644
--- a/cman/qdisk/main.c
+++ b/cman/qdisk/main.c
@@ -915,7 +915,8 @@ cman_wait(cman_handle_t ch, struct timeval *_tv)
 static void
 process_cman_event(cman_handle_t handle, void *private, int reason, int arg)
 {
-	qd_ctx *ctx = (qd_ctx *)private;
+	qd_priv_t *qp = (qd_priv_t *)private;
+	qd_ctx *ctx = qp->ctx;
 
 	switch(reason) {
 	case CMAN_REASON_PORTOPENED:
@@ -1926,6 +1927,33 @@ check_stop_cman(qd_ctx *ctx)
 do { static int _logged=0; if (!_logged) { _logged=1; logt_print(level, fmt, ##args); } } while(0)
 
 
+static void
+qdisk_whine(cman_handle_t h, void *privdata, char *buf, int len,
+	    uint8_t port, int nodeid)
+{
+	int32_t dstate;
+	qd_priv_t *qp = (qd_priv_t *)privdata;
+	node_info_t *ni = qp->ni;
+
+	if (len != sizeof(dstate)) {
+		return;
+	}
+
+	dstate = *((int32_t*)buf);
+
+	if (nodeid == (qp->ctx->qc_my_id))
+		return;
+
+	swab32(dstate);
+
+	if (dstate) {
+		logt_print(LOG_NOTICE, "qdiskd on node %d reports hung %s()\n", nodeid,
+			   state_to_string(dstate));
+		ni[nodeid-1].ni_misses = 0;
+	}
+}
+
+
 int
 main(int argc, char **argv)
 {
@@ -1939,6 +1967,7 @@ main(int argc, char **argv)
 	char device[128];
 	pid_t pid;
 	quorum_header_t qh;
+	qd_priv_t qp;
 
 	if (check_process_running(argv[0], &pid) && pid !=getpid()) {
 		printf("QDisk services already running\n");
@@ -2001,13 +2030,23 @@ main(int argc, char **argv)
 
 	/* For cman notifications we need two sockets - one for events,
 	   one for config change callbacks */
-	ch_user = cman_init(&ctx);
+	qp.ctx = &ctx;
+	qp.ni = &ni[0];
+	qp.ni_len = MAX_NODES_DISK;
+
+	ch_user = cman_init(&qp);
         if (cman_start_notification(ch_user, process_cman_event) != 0) {
 		logt_print(LOG_CRIT, "Could not register with CMAN: %s\n",
 			   strerror(errno));
 		goto out;
 	}
 
+	if (cman_start_recv_data(ch_user, qdisk_whine, 178) != 0) {
+		logt_print(LOG_CRIT, "Could not register with CMAN: %s\n",
+			   strerror(errno));
+		goto out;
+	}
+
 	memset(&me, 0, sizeof(me));
 	if (cman_get_node(ch_admin, CMAN_NODEID_US, &me) < 0) {
 		logt_print(LOG_CRIT, "Could not determine local node ID: %s\n",
@@ -2108,7 +2147,7 @@ main(int argc, char **argv)
 		}
 	}
 
-	io_nanny_start(ctx.qc_tko * ctx.qc_interval);
+	io_nanny_start(ch_user, ctx.qc_tko * ctx.qc_interval);
 
 	if (quorum_loop(&ctx, ni, MAX_NODES_DISK) == 0) {
 		/* Only clean up if we're exiting w/o error) */
-- 
1.7.7.6



^ permalink raw reply related	[flat|nested] 6+ messages in thread

* [Cluster-devel] [PATCH] qdiskd: Make multipath issues go away
@ 2011-05-09 13:28 Lon Hohberger
  0 siblings, 0 replies; 6+ messages in thread
From: Lon Hohberger @ 2011-05-09 13:28 UTC (permalink / raw)
  To: cluster-devel.redhat.com

Qdiskd hsitorically has required significant tuning to work around
delays which occur during multipath failover, overloaded I/O, and LUN
trespasses in both device-mapper-multipath and EMC PowerPath
environments.

This patch goes a very long way towards eliminating false evictions
when these conditions occur by making qdiskd whine to the other
cluster members when it detects hung system calls.  When a cluster
member whines, it indicates the source of the problem (which system
call is hung), and the act of receiving a whine from a host indicates
that qdiskd is operational, but that I/O is hung.  Hung I/O is different
from losing storage entirely (where you get I/O errors).

Possible problems:

- Receive queue getting very full, causing messages to become blocked on
a node where I/O is hung.  1) that would take a very long time, and 2)
node should get evicted at that point anyway.

Signed-off-by: Lon Hohberger <lhh@redhat.com>
---
 cman/qdisk/Makefile  |    2 +-
 cman/qdisk/disk.h    |    6 ++++++
 cman/qdisk/iostate.c |   16 +++++++++++++---
 cman/qdisk/iostate.h |    4 +++-
 cman/qdisk/main.c    |   45 ++++++++++++++++++++++++++++++++++++++++++---
 5 files changed, 65 insertions(+), 8 deletions(-)

diff --git a/cman/qdisk/Makefile b/cman/qdisk/Makefile
index 68e20cd..e3bb5f7 100644
--- a/cman/qdisk/Makefile
+++ b/cman/qdisk/Makefile
@@ -40,7 +40,7 @@ ${TARGET1}: ${SHAREDOBJS} ${OBJS1}
 	$(CC) -o $@ $^ $(EXTRA_LDFLAGS) $(LDFLAGS)
 
 ${TARGET2}: ${SHAREDOBJS} ${OBJS2}
-	$(CC) -o $@ $^ $(LDFLAGS)
+	$(CC) -o $@ $^ $(EXTRA_LDFLAGS) $(LDFLAGS)
 
 depends:
 	$(MAKE) -C ../lib all
diff --git a/cman/qdisk/disk.h b/cman/qdisk/disk.h
index 93d15fe..fd80fa6 100644
--- a/cman/qdisk/disk.h
+++ b/cman/qdisk/disk.h
@@ -273,6 +273,12 @@ typedef struct {
 	status_block_t ni_status;
 } node_info_t;
 
+typedef struct {
+	qd_ctx *ctx;
+	node_info_t *ni;
+	size_t ni_len;
+} qd_priv_t;
+
 int qd_write_status(qd_ctx *ctx, int nid, disk_node_state_t state,
 		    disk_msg_t *msg, memb_mask_t mask, memb_mask_t master);
 int qd_init(qd_ctx *ctx, cman_handle_t ch_admin,
diff --git a/cman/qdisk/iostate.c b/cman/qdisk/iostate.c
index 0199da4..06c6831 100644
--- a/cman/qdisk/iostate.c
+++ b/cman/qdisk/iostate.c
@@ -1,9 +1,12 @@
 #include <pthread.h>
+#include <libcman.h>
 #include <iostate.h>
 #include <unistd.h>
 #include <time.h>
 #include <sys/time.h>
 #include <liblogthread.h>
+#include <stdint.h>
+#include "platform.h"
 #include "iostate.h"
 
 static iostate_t main_state = 0;
@@ -26,7 +29,7 @@ static struct state_table io_state_table[] = {
 {	STATE_LSEEK,	"seek"	},
 {	-1,		NULL	} };
 
-static const char *
+const char *
 state_to_string(iostate_t state)
 {
 	static const char *ret = "unknown";
@@ -65,6 +68,8 @@ io_nanny_thread(void *arg)
 	iostate_t last_main_state = 0, current_main_state = 0;
 	int last_main_incarnation = 0, current_main_incarnation = 0;
 	int logged_incarnation = 0;
+	cman_handle_t ch = (cman_handle_t)arg;
+	int32_t whine_state;
 
 	/* Start with wherever we're at now */
 	pthread_mutex_lock(&state_mutex);
@@ -96,6 +101,11 @@ io_nanny_thread(void *arg)
 			continue;
 		}
 
+		/* Whine on CMAN api */
+		whine_state = (int32_t)current_main_state;
+		swab32(whine_state);
+		cman_send_data(ch, &whine_state, sizeof(int32_t), 0, 178, 0);
+
 		/* Don't log things twice */
 		if (logged_incarnation == current_main_incarnation)
 			continue;
@@ -114,7 +124,7 @@ io_nanny_thread(void *arg)
 
 
 int
-io_nanny_start(int timeout)
+io_nanny_start(cman_handle_t ch, int timeout)
 {
 	int ret;
 
@@ -124,7 +134,7 @@ io_nanny_start(int timeout)
 	qdisk_timeout = timeout;
 	thread_active = 1;
 
-	ret = pthread_create(&io_nanny_tid, NULL, io_nanny_thread, NULL);
+	ret = pthread_create(&io_nanny_tid, NULL, io_nanny_thread, ch);
 	pthread_mutex_unlock(&state_mutex);
 
 	return ret;
diff --git a/cman/qdisk/iostate.h b/cman/qdisk/iostate.h
index 7dd7bf6..a65b1d4 100644
--- a/cman/qdisk/iostate.h
+++ b/cman/qdisk/iostate.h
@@ -11,7 +11,9 @@ typedef enum {
 
 void io_state(iostate_t state);
 
-int io_nanny_start(int timeout);
+int io_nanny_start(cman_handle_t ch, int timeout);
 int io_nanny_stop(void);
 
+const char * state_to_string(iostate_t state);
+
 #endif
diff --git a/cman/qdisk/main.c b/cman/qdisk/main.c
index 9262af2..c1598fa 100644
--- a/cman/qdisk/main.c
+++ b/cman/qdisk/main.c
@@ -904,7 +904,8 @@ cman_wait(cman_handle_t ch, struct timeval *_tv)
 static void
 process_cman_event(cman_handle_t handle, void *private, int reason, int arg)
 {
-	qd_ctx *ctx = (qd_ctx *)private;
+	qd_priv_t *qp = (qd_priv_t *)private;
+	qd_ctx *ctx = qp->ctx;
 
 	switch(reason) {
 	case CMAN_REASON_PORTOPENED:
@@ -1915,6 +1916,33 @@ check_stop_cman(qd_ctx *ctx)
 do { static int _logged=0; if (!_logged) { _logged=1; logt_print(level, fmt, ##args); } } while(0)
 
 
+static void
+qdisk_whine(cman_handle_t h, void *privdata, char *buf, int len,
+	    uint8_t port, int nodeid)
+{
+	int32_t dstate;
+	qd_priv_t *qp = (qd_priv_t *)privdata;
+	node_info_t *ni = qp->ni;
+
+	if (len != sizeof(dstate)) {
+		return;
+	}
+
+	dstate = *((int32_t*)buf);
+
+	if (nodeid == (qp->ctx->qc_my_id))
+		return;
+
+	swab32(dstate);
+
+	if (dstate) {
+		logt_print(LOG_NOTICE, "qdiskd on node %d reports hung %s()\n", nodeid,
+			   state_to_string(dstate));
+		ni[nodeid-1].ni_misses = 0;
+	}
+}
+
+
 int
 main(int argc, char **argv)
 {
@@ -1928,6 +1956,7 @@ main(int argc, char **argv)
 	char device[128];
 	pid_t pid;
 	quorum_header_t qh;
+	qd_priv_t qp;
 
 	if (check_process_running(argv[0], &pid) && pid !=getpid()) {
 		printf("QDisk services already running\n");
@@ -1990,13 +2019,23 @@ main(int argc, char **argv)
 
 	/* For cman notifications we need two sockets - one for events,
 	   one for config change callbacks */
-	ch_user = cman_init(&ctx);
+	qp.ctx = &ctx;
+	qp.ni = &ni[0];
+	qp.ni_len = MAX_NODES_DISK;
+
+	ch_user = cman_init(&qp);
         if (cman_start_notification(ch_user, process_cman_event) != 0) {
 		logt_print(LOG_CRIT, "Could not register with CMAN: %s\n",
 			   strerror(errno));
 		goto out;
 	}
 
+	if (cman_start_recv_data(ch_user, qdisk_whine, 178) != 0) {
+		logt_print(LOG_CRIT, "Could not register with CMAN: %s\n",
+			   strerror(errno));
+		goto out;
+	}
+
 	memset(&me, 0, sizeof(me));
 	if (cman_get_node(ch_admin, CMAN_NODEID_US, &me) < 0) {
 		logt_print(LOG_CRIT, "Could not determine local node ID: %s\n",
@@ -2084,7 +2123,7 @@ main(int argc, char **argv)
 	/* This registers the quorum device */
 	register_device(&ctx);
 
-	io_nanny_start(ctx.qc_tko * ctx.qc_interval);
+	io_nanny_start(ch_user, ctx.qc_tko * ctx.qc_interval);
 
 	if (quorum_loop(&ctx, ni, MAX_NODES_DISK) == 0) {
 		/* Only clean up if we're exiting w/o error) */
-- 
1.7.3.4



^ permalink raw reply related	[flat|nested] 6+ messages in thread

end of thread, other threads:[~2012-07-13 20:16 UTC | newest]

Thread overview: 6+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2012-07-02 13:55 [Cluster-devel] [PATCH] qdiskd: Make multipath issues go away Fabio M. Di Nitto
2012-07-13 20:16 ` Lon Hohberger
  -- strict thread matches above, loose matches on Subject: below --
2012-02-15 23:53 Lon Hohberger
2012-02-17  5:38 ` Fabio M. Di Nitto
2012-02-17 21:40   ` Lon Hohberger
2011-05-09 13:28 Lon Hohberger

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.