All of lore.kernel.org
 help / color / mirror / Atom feed
* [Ocfs2-devel] o2net patches
@ 2010-01-29  4:51 Srinivas Eeda
  2010-01-29  4:51 ` [Ocfs2-devel] [PATCH 1/3] o2net: rollback reconnect on network timeout Srinivas Eeda
                   ` (2 more replies)
  0 siblings, 3 replies; 16+ messages in thread
From: Srinivas Eeda @ 2010-01-29  4:51 UTC (permalink / raw)
  To: ocfs2-devel

Following 3 patches fixes:
1. rollback's reconnect fix
2. delay enotconn for sends, receives till a node reconnects/dies after a
   lost connection.
3. Correct's keepalive protocol

Thanks,
--Srini

^ permalink raw reply	[flat|nested] 16+ messages in thread

* [Ocfs2-devel] [PATCH 1/3] o2net: rollback reconnect on network timeout.
  2010-01-29  4:51 [Ocfs2-devel] o2net patches Srinivas Eeda
@ 2010-01-29  4:51 ` Srinivas Eeda
  2010-01-29  4:51 ` [Ocfs2-devel] [PATCH 2/3] o2net: delay enotconn for sends receives till quorum decision Srinivas Eeda
  2010-01-29  4:51 ` [Ocfs2-devel] [PATCH 3/3] o2net: correct keepalive message protocol Srinivas Eeda
  2 siblings, 0 replies; 16+ messages in thread
From: Srinivas Eeda @ 2010-01-29  4:51 UTC (permalink / raw)
  To: ocfs2-devel

This patch rollbacks earlier fix that tries to re-establish network connection
when network timeout happens. Reconnect was re-cycling sockets which results
in lost messages resulting in hangs.

Signed-off-by: Srinivas Eeda <srinivas.eeda@oracle.com>
---
 fs/ocfs2/cluster/tcp.c          |   50 +++++++++++----------------------------
 fs/ocfs2/cluster/tcp_internal.h |    2 -
 2 files changed, 14 insertions(+), 38 deletions(-)

diff --git a/fs/ocfs2/cluster/tcp.c b/fs/ocfs2/cluster/tcp.c
index 334f231..d81acff 100644
--- a/fs/ocfs2/cluster/tcp.c
+++ b/fs/ocfs2/cluster/tcp.c
@@ -463,6 +463,8 @@ static void o2net_set_nn_state(struct o2net_node *nn,
 	mlog_bug_on_msg(err && valid, "err %d valid %u\n", err, valid);
 	mlog_bug_on_msg(valid && !sc, "valid %u sc %p\n", valid, sc);
 
+	/* we won't reconnect after our valid conn goes away for
+	 * this hb iteration.. here so it shows up in the logs */
 	if (was_valid && !valid && err == 0)
 		err = -ENOTCONN;
 
@@ -491,6 +493,11 @@ static void o2net_set_nn_state(struct o2net_node *nn,
 	}
 
 	if (!was_valid && valid) {
+		/* this is a bit of a hack.  we only try reconnecting
+		 * when heartbeating starts until we get a connection.
+		 * if that connection then dies we don't try reconnecting.
+		 * the only way to start connecting again is to down
+		 * heartbeat and bring it back up. */
 		o2quo_conn_up(o2net_num_from_nn(nn));
 		cancel_delayed_work(&nn->nn_connect_expired);
 		printk(KERN_INFO "o2net: %s " SC_NODEF_FMT "\n",
@@ -514,18 +521,6 @@ static void o2net_set_nn_state(struct o2net_node *nn,
 			delay = 0;
 		mlog(ML_CONN, "queueing conn attempt in %lu jiffies\n", delay);
 		queue_delayed_work(o2net_wq, &nn->nn_connect_work, delay);
-
-		/*
-		 * Delay the expired work after idle timeout.
-		 *
-		 * We might have lots of failed connection attempts that run
-		 * through here but we only cancel the connect_expired work when
-		 * a connection attempt succeeds.  So only the first enqueue of
-		 * the connect_expired work will do anything.  The rest will see
-		 * that it's already queued and do nothing.
-		 */
-		delay += msecs_to_jiffies(o2net_idle_timeout());
-		queue_delayed_work(o2net_wq, &nn->nn_connect_expired, delay);
 	}
 
 	/* keep track of the nn's sc ref for the caller */
@@ -1273,7 +1268,6 @@ static int o2net_check_handshake(struct o2net_sock_container *sc)
 	 * shut down already */
 	if (nn->nn_sc == sc) {
 		o2net_sc_reset_idle_timer(sc);
-		atomic_set(&nn->nn_timeout, 0);
 		o2net_set_nn_state(nn, sc, 1, 0);
 	}
 	spin_unlock(&nn->nn_lock);
@@ -1471,7 +1465,6 @@ static void o2net_sc_send_keep_req(struct work_struct *work)
 static void o2net_idle_timer(unsigned long data)
 {
 	struct o2net_sock_container *sc = (struct o2net_sock_container *)data;
-	struct o2net_node *nn = o2net_nn_from_num(sc->sc_node->nd_num);
 	struct timeval now;
 
 	do_gettimeofday(&now);
@@ -1494,12 +1487,6 @@ static void o2net_idle_timer(unsigned long data)
 	     sc->sc_tv_func_start.tv_sec, (long) sc->sc_tv_func_start.tv_usec,
 	     sc->sc_tv_func_stop.tv_sec, (long) sc->sc_tv_func_stop.tv_usec);
 
-	/*
-	 * Initialize the nn_timeout so that the next connection attempt
-	 * will continue in o2net_start_connect.
-	 */
-	atomic_set(&nn->nn_timeout, 1);
-
 	o2net_sc_queue_work(sc, &sc->sc_shutdown_work);
 }
 
@@ -1534,7 +1521,6 @@ static void o2net_start_connect(struct work_struct *work)
 	struct socket *sock = NULL;
 	struct sockaddr_in myaddr = {0, }, remoteaddr = {0, };
 	int ret = 0, stop;
-	unsigned int timeout;
 
 	/* if we're greater we initiate tx, otherwise we accept */
 	if (o2nm_this_node() <= o2net_num_from_nn(nn))
@@ -1554,17 +1540,8 @@ static void o2net_start_connect(struct work_struct *work)
 	}
 
 	spin_lock(&nn->nn_lock);
-	/*
-	 * see if we already have one pending or have given up.
-	 * For nn_timeout, it is set when we close the connection
-	 * because of the idle time out. So it means that we have
-	 * at least connected to that node successfully once,
-	 * now try to connect to it again.
-	 */
-	timeout = atomic_read(&nn->nn_timeout);
-	stop = (nn->nn_sc ||
-		(nn->nn_persistent_error &&
-		(nn->nn_persistent_error != -ENOTCONN || timeout == 0)));
+	/* see if we already have one pending or have given up */
+	stop = (nn->nn_sc || nn->nn_persistent_error);
 	spin_unlock(&nn->nn_lock);
 	if (stop)
 		goto out;
@@ -1676,7 +1653,6 @@ void o2net_disconnect_node(struct o2nm_node *node)
 
 	/* don't reconnect until it's heartbeating again */
 	spin_lock(&nn->nn_lock);
-	atomic_set(&nn->nn_timeout, 0);
 	o2net_set_nn_state(nn, NULL, 0, -ENOTCONN);
 	spin_unlock(&nn->nn_lock);
 
@@ -1711,12 +1687,16 @@ static void o2net_hb_node_up_cb(struct o2nm_node *node, int node_num,
 		(msecs_to_jiffies(o2net_reconnect_delay()) + 1);
 
 	if (node_num != o2nm_this_node()) {
+		/* heartbeat doesn't work unless a local node number is
+		 * configured and doing so brings up the o2net_wq, so we can
+		 * use it.. */
+		queue_delayed_work(o2net_wq, &nn->nn_connect_expired,
+				   msecs_to_jiffies(o2net_idle_timeout()));
 		/* believe it or not, accept and node hearbeating testing
 		 * can succeed for this node before we got here.. so
 		 * only use set_nn_state to clear the persistent error
 		 * if that hasn't already happened */
 		spin_lock(&nn->nn_lock);
-		atomic_set(&nn->nn_timeout, 0);
 		if (nn->nn_persistent_error)
 			o2net_set_nn_state(nn, NULL, 0, 0);
 		spin_unlock(&nn->nn_lock);
@@ -1839,7 +1819,6 @@ static int o2net_accept_one(struct socket *sock)
 	new_sock = NULL;
 
 	spin_lock(&nn->nn_lock);
-	atomic_set(&nn->nn_timeout, 0);
 	o2net_set_nn_state(nn, sc, 0, 0);
 	spin_unlock(&nn->nn_lock);
 
@@ -2037,7 +2016,6 @@ int o2net_init(void)
 	for (i = 0; i < ARRAY_SIZE(o2net_nodes); i++) {
 		struct o2net_node *nn = o2net_nn_from_num(i);
 
-		atomic_set(&nn->nn_timeout, 0);
 		spin_lock_init(&nn->nn_lock);
 		INIT_DELAYED_WORK(&nn->nn_connect_work, o2net_start_connect);
 		INIT_DELAYED_WORK(&nn->nn_connect_expired,
diff --git a/fs/ocfs2/cluster/tcp_internal.h b/fs/ocfs2/cluster/tcp_internal.h
index 8d58cfe..df36e1b 100644
--- a/fs/ocfs2/cluster/tcp_internal.h
+++ b/fs/ocfs2/cluster/tcp_internal.h
@@ -95,8 +95,6 @@ struct o2net_node {
 	unsigned			nn_sc_valid:1;
 	/* if this is set tx just returns it */
 	int				nn_persistent_error;
-	/* It is only set to 1 after the idle time out. */
-	atomic_t			nn_timeout;
 
 	/* threads waiting for an sc to arrive wait on the wq for generation
 	 * to increase.  it is increased when a connecting socket succeeds
-- 
1.5.6.5

^ permalink raw reply related	[flat|nested] 16+ messages in thread

* [Ocfs2-devel] [PATCH 2/3] o2net: delay enotconn for sends receives till quorum decision
  2010-01-29  4:51 [Ocfs2-devel] o2net patches Srinivas Eeda
  2010-01-29  4:51 ` [Ocfs2-devel] [PATCH 1/3] o2net: rollback reconnect on network timeout Srinivas Eeda
@ 2010-01-29  4:51 ` Srinivas Eeda
  2010-01-29  4:51 ` [Ocfs2-devel] [PATCH 3/3] o2net: correct keepalive message protocol Srinivas Eeda
  2 siblings, 0 replies; 16+ messages in thread
From: Srinivas Eeda @ 2010-01-29  4:51 UTC (permalink / raw)
  To: ocfs2-devel

When a ocfs2 network heartbeat times out between two nodes, o2net layer breaks
the socket connection, and returns -ENOTCONN to processes that are trying
send/receive messages to/from other node. It also queues a quorum decision to
be made after the disk timeout to resolve split brain.

The fix queues the quorum decision after network heartbeat timeout but avoids
socket disconnects. The fix delays socket disconnects till O2HB_NODE_DOWN_CB
event which is triggered on the surviving node after the node evictions happen.
Surviving node signals -ENOTCONN to processes waiting to send/receives messages
to/from evicted node. If network connection comes back before the eviction,
quorum decision is cancelled and messaging resumes.

Signed-off-by: Srinivas Eeda <srinivas.eeda@oracle.com>
---
 fs/ocfs2/cluster/tcp.c          |   69 +++++++++++++++++++++++++++++++--------
 fs/ocfs2/cluster/tcp_internal.h |    3 ++
 2 files changed, 58 insertions(+), 14 deletions(-)

diff --git a/fs/ocfs2/cluster/tcp.c b/fs/ocfs2/cluster/tcp.c
index d81acff..0bbd47b 100644
--- a/fs/ocfs2/cluster/tcp.c
+++ b/fs/ocfs2/cluster/tcp.c
@@ -141,6 +141,7 @@ static void o2net_sc_send_keep_req(struct work_struct *work);
 static void o2net_idle_timer(unsigned long data);
 static void o2net_sc_postpone_idle(struct o2net_sock_container *sc);
 static void o2net_sc_reset_idle_timer(struct o2net_sock_container *sc);
+static void o2net_queue_quorum(struct o2net_node *nn);
 
 #ifdef CONFIG_DEBUG_FS
 static void o2net_init_nst(struct o2net_send_tracking *nst, u32 msgtype,
@@ -447,7 +448,6 @@ static void o2net_set_nn_state(struct o2net_node *nn,
 			       unsigned valid, int err)
 {
 	int was_valid = nn->nn_sc_valid;
-	int was_err = nn->nn_persistent_error;
 	struct o2net_sock_container *old_sc = nn->nn_sc;
 
 	assert_spin_locked(&nn->nn_lock);
@@ -480,12 +480,6 @@ static void o2net_set_nn_state(struct o2net_node *nn,
 	if (nn->nn_persistent_error || nn->nn_sc_valid)
 		wake_up(&nn->nn_sc_wq);
 
-	if (!was_err && nn->nn_persistent_error) {
-		o2quo_conn_err(o2net_num_from_nn(nn));
-		queue_delayed_work(o2net_wq, &nn->nn_still_up,
-				   msecs_to_jiffies(O2NET_QUORUM_DELAY_MS));
-	}
-
 	if (was_valid && !valid) {
 		printk(KERN_INFO "o2net: no longer connected to "
 		       SC_NODEF_FMT "\n", SC_NODEF_ARGS(old_sc));
@@ -498,7 +492,6 @@ static void o2net_set_nn_state(struct o2net_node *nn,
 		 * if that connection then dies we don't try reconnecting.
 		 * the only way to start connecting again is to down
 		 * heartbeat and bring it back up. */
-		o2quo_conn_up(o2net_num_from_nn(nn));
 		cancel_delayed_work(&nn->nn_connect_expired);
 		printk(KERN_INFO "o2net: %s " SC_NODEF_FMT "\n",
 		       o2nm_this_node() > sc->sc_node->nd_num ?
@@ -557,6 +550,7 @@ static void o2net_state_change(struct sock *sk)
 {
 	void (*state_change)(struct sock *sk);
 	struct o2net_sock_container *sc;
+	struct o2net_node *nn;
 
 	read_lock(&sk->sk_callback_lock);
 	sc = sk->sk_user_data;
@@ -578,7 +572,11 @@ static void o2net_state_change(struct sock *sk)
 			o2net_sc_queue_work(sc, &sc->sc_connect_work);
 			break;
 		default:
-			o2net_sc_queue_work(sc, &sc->sc_shutdown_work);
+			if (sc->sc_handshake_ok) {
+				nn = o2net_nn_from_num(sc->sc_node->nd_num);
+				queue_work(o2net_wq, &nn->nn_connection_err);
+			} else
+				o2net_sc_queue_work(sc, &sc->sc_shutdown_work);
 			break;
 	}
 out:
@@ -682,6 +680,26 @@ static void o2net_shutdown_sc(struct work_struct *work)
 	sc_put(sc);
 }
 
+static void o2net_queue_quorum(struct o2net_node *nn)
+{
+	if (!atomic_read(&nn->nn_quorum_queued)) {
+		o2quo_conn_err(o2net_num_from_nn(nn));
+		queue_delayed_work(o2net_wq, &nn->nn_still_up,
+				   msecs_to_jiffies(O2NET_QUORUM_DELAY_MS));
+		atomic_set(&nn->nn_quorum_queued, 1);
+	}
+}
+
+static void o2net_connection_err(struct work_struct *work)
+{
+	struct o2net_node *nn =
+		container_of(work, struct o2net_node, nn_connection_err);
+
+	spin_lock(&nn->nn_lock);
+	o2net_queue_quorum(nn);
+	spin_unlock(&nn->nn_lock);
+}
+
 /* ------------------------------------------------------------ */
 
 static int o2net_handler_cmp(struct o2net_msg_handler *nmh, u32 msg_type,
@@ -1465,6 +1483,7 @@ static void o2net_sc_send_keep_req(struct work_struct *work)
 static void o2net_idle_timer(unsigned long data)
 {
 	struct o2net_sock_container *sc = (struct o2net_sock_container *)data;
+	struct o2net_node *nn = o2net_nn_from_num(sc->sc_node->nd_num);
 	struct timeval now;
 
 	do_gettimeofday(&now);
@@ -1487,7 +1506,7 @@ static void o2net_idle_timer(unsigned long data)
 	     sc->sc_tv_func_start.tv_sec, (long) sc->sc_tv_func_start.tv_usec,
 	     sc->sc_tv_func_stop.tv_sec, (long) sc->sc_tv_func_stop.tv_usec);
 
-	o2net_sc_queue_work(sc, &sc->sc_shutdown_work);
+	queue_work(o2net_wq, &nn->nn_connection_err);
 }
 
 static void o2net_sc_reset_idle_timer(struct o2net_sock_container *sc)
@@ -1502,9 +1521,24 @@ static void o2net_sc_reset_idle_timer(struct o2net_sock_container *sc)
 
 static void o2net_sc_postpone_idle(struct o2net_sock_container *sc)
 {
-	/* Only push out an existing timer */
-	if (timer_pending(&sc->sc_idle_timeout))
-		o2net_sc_reset_idle_timer(sc);
+	struct o2net_node *nn = o2net_nn_from_num(sc->sc_node->nd_num);
+
+	/* avoid spin_lock if not needed */
+	if (atomic_read(&nn->nn_quorum_queued)) {
+		spin_lock(&nn->nn_lock);
+		if (atomic_read(&nn->nn_quorum_queued)) {
+			o2quo_conn_up(sc->sc_node->nd_num);
+			cancel_delayed_work(&nn->nn_still_up);
+			atomic_set(&nn->nn_quorum_queued, 0);
+			printk(KERN_INFO "o2net: reconnected to "
+			       SC_NODEF_FMT "\n", SC_NODEF_ARGS(sc));
+		}
+		spin_unlock(&nn->nn_lock);
+	}
+
+	if (!timer_pending(&sc->sc_idle_timeout))
+		return;
+	o2net_sc_reset_idle_timer(sc);
 }
 
 /* this work func is kicked whenever a path sets the nn state which doesn't
@@ -1540,7 +1574,7 @@ static void o2net_start_connect(struct work_struct *work)
 	}
 
 	spin_lock(&nn->nn_lock);
-	/* see if we already have one pending or have given up */
+	/* don't queue on broken or pending connection. */
 	stop = (nn->nn_sc || nn->nn_persistent_error);
 	spin_unlock(&nn->nn_lock);
 	if (stop)
@@ -1653,6 +1687,9 @@ void o2net_disconnect_node(struct o2nm_node *node)
 
 	/* don't reconnect until it's heartbeating again */
 	spin_lock(&nn->nn_lock);
+	if (nn->nn_sc)
+		o2net_sc_queue_work(nn->nn_sc, &nn->nn_sc->sc_shutdown_work);
+
 	o2net_set_nn_state(nn, NULL, 0, -ENOTCONN);
 	spin_unlock(&nn->nn_lock);
 
@@ -1661,7 +1698,9 @@ void o2net_disconnect_node(struct o2nm_node *node)
 		cancel_delayed_work(&nn->nn_connect_work);
 		cancel_delayed_work(&nn->nn_still_up);
 		flush_workqueue(o2net_wq);
+		atomic_set(&nn->nn_quorum_queued, 0);
 	}
+	o2quo_conn_err(o2net_num_from_nn(nn));
 }
 
 static void o2net_hb_node_down_cb(struct o2nm_node *node, int node_num,
@@ -2021,8 +2060,10 @@ int o2net_init(void)
 		INIT_DELAYED_WORK(&nn->nn_connect_expired,
 				  o2net_connect_expired);
 		INIT_DELAYED_WORK(&nn->nn_still_up, o2net_still_up);
+		INIT_WORK(&nn->nn_connection_err, o2net_connection_err);
 		/* until we see hb from a node we'll return einval */
 		nn->nn_persistent_error = -ENOTCONN;
+		atomic_set(&nn->nn_quorum_queued, 0);
 		init_waitqueue_head(&nn->nn_sc_wq);
 		idr_init(&nn->nn_status_idr);
 		INIT_LIST_HEAD(&nn->nn_status_list);
diff --git a/fs/ocfs2/cluster/tcp_internal.h b/fs/ocfs2/cluster/tcp_internal.h
index df36e1b..f013c35 100644
--- a/fs/ocfs2/cluster/tcp_internal.h
+++ b/fs/ocfs2/cluster/tcp_internal.h
@@ -95,6 +95,8 @@ struct o2net_node {
 	unsigned			nn_sc_valid:1;
 	/* if this is set tx just returns it */
 	int				nn_persistent_error;
+	/* It is toggled between quorum fired and cancelled */
+	atomic_t			nn_quorum_queued;
 
 	/* threads waiting for an sc to arrive wait on the wq for generation
 	 * to increase.  it is increased when a connecting socket succeeds
@@ -123,6 +125,7 @@ struct o2net_node {
 	 * that it is still heartbeating and that we should do some
 	 * quorum work */
 	struct delayed_work		nn_still_up;
+	struct work_struct		nn_connection_err;
 };
 
 struct o2net_sock_container {
-- 
1.5.6.5

^ permalink raw reply related	[flat|nested] 16+ messages in thread

* [Ocfs2-devel] [PATCH 3/3] o2net: correct keepalive message protocol
  2010-01-29  4:51 [Ocfs2-devel] o2net patches Srinivas Eeda
  2010-01-29  4:51 ` [Ocfs2-devel] [PATCH 1/3] o2net: rollback reconnect on network timeout Srinivas Eeda
  2010-01-29  4:51 ` [Ocfs2-devel] [PATCH 2/3] o2net: delay enotconn for sends receives till quorum decision Srinivas Eeda
@ 2010-01-29  4:51 ` Srinivas Eeda
  2010-02-17  5:56   ` Joel Becker
  2 siblings, 1 reply; 16+ messages in thread
From: Srinivas Eeda @ 2010-01-29  4:51 UTC (permalink / raw)
  To: ocfs2-devel

Currently keepalive packet is sent to another node if a message is not heard
from the other node for O2NET_KEEPALIVE_DELAY_MS seconds. The message is not
resent again till the other node sends a message.

The functionality described above works as we rely on TCP protocol which
guarantees message delivery. However the intention of this feature was to send
a keepalive message every timeout seconds. This patch corrects the
functionality. Now since each node sends a message for every keepalive time
interval, there is no need for a response for the keepalive message.

Signed-off-by: Srinivas Eeda <srinivas.eeda@oracle.com>
---
 fs/ocfs2/cluster/tcp.c |   16 ++++++++++------
 1 files changed, 10 insertions(+), 6 deletions(-)

diff --git a/fs/ocfs2/cluster/tcp.c b/fs/ocfs2/cluster/tcp.c
index 0bbd47b..8a999ec 100644
--- a/fs/ocfs2/cluster/tcp.c
+++ b/fs/ocfs2/cluster/tcp.c
@@ -1160,8 +1160,9 @@ static int o2net_process_message(struct o2net_sock_container *sc,
 					   be32_to_cpu(hdr->status));
 			goto out;
 		case O2NET_MSG_KEEP_REQ_MAGIC:
-			o2net_sendpage(sc, o2net_keep_resp,
-				       sizeof(*o2net_keep_resp));
+			/* Each node now sends keepalive message every
+			 * keepalive time interval. Hence no need for response
+			 */
 			goto out;
 		case O2NET_MSG_KEEP_RESP_MAGIC:
 			goto out;
@@ -1285,6 +1286,8 @@ static int o2net_check_handshake(struct o2net_sock_container *sc)
 	/* set valid and queue the idle timers only if it hasn't been
 	 * shut down already */
 	if (nn->nn_sc == sc) {
+		o2net_sc_queue_delayed_work(sc, &sc->sc_keepalive_work,
+				msecs_to_jiffies(o2net_keepalive_delay()));
 		o2net_sc_reset_idle_timer(sc);
 		o2net_set_nn_state(nn, sc, 1, 0);
 	}
@@ -1473,7 +1476,11 @@ static void o2net_sc_send_keep_req(struct work_struct *work)
 		container_of(work, struct o2net_sock_container,
 			     sc_keepalive_work.work);
 
-	o2net_sendpage(sc, o2net_keep_req, sizeof(*o2net_keep_req));
+	if (timer_pending(&sc->sc_idle_timeout)) {
+		o2net_sendpage(sc, o2net_keep_req, sizeof(*o2net_keep_req));
+		o2net_sc_queue_delayed_work(sc, &sc->sc_keepalive_work,
+				msecs_to_jiffies(o2net_keepalive_delay()));
+	}
 	sc_put(sc);
 }
 
@@ -1511,9 +1518,6 @@ static void o2net_idle_timer(unsigned long data)
 
 static void o2net_sc_reset_idle_timer(struct o2net_sock_container *sc)
 {
-	o2net_sc_cancel_delayed_work(sc, &sc->sc_keepalive_work);
-	o2net_sc_queue_delayed_work(sc, &sc->sc_keepalive_work,
-		      msecs_to_jiffies(o2net_keepalive_delay()));
 	do_gettimeofday(&sc->sc_tv_timer);
 	mod_timer(&sc->sc_idle_timeout,
 	       jiffies + msecs_to_jiffies(o2net_idle_timeout()));
-- 
1.5.6.5

^ permalink raw reply related	[flat|nested] 16+ messages in thread

* [Ocfs2-devel] [PATCH 3/3] o2net: correct keepalive message protocol
  2010-01-29  4:51 ` [Ocfs2-devel] [PATCH 3/3] o2net: correct keepalive message protocol Srinivas Eeda
@ 2010-02-17  5:56   ` Joel Becker
  2010-02-17 18:24     ` Srinivas Eeda
  0 siblings, 1 reply; 16+ messages in thread
From: Joel Becker @ 2010-02-17  5:56 UTC (permalink / raw)
  To: ocfs2-devel

On Thu, Jan 28, 2010 at 08:51:11PM -0800, Srinivas Eeda wrote:
>  		case O2NET_MSG_KEEP_REQ_MAGIC:
> -			o2net_sendpage(sc, o2net_keep_resp,
> -				       sizeof(*o2net_keep_resp));
> +			/* Each node now sends keepalive message every
> +			 * keepalive time interval. Hence no need for response
> +			 */
>  			goto out;

	You still have to send the response.  Think about a mixed
environment where some nodes have this fix and some do not.  The older
software is still waiting on the response.
	The newer version can just ignore any responses it gets from
other nodes.  But it has to send responses out just in case the other
node is older.
	The only other alternative is to bump the o2net protocol
version, and that means the cluster has to be shut down to upgrade.  Not
a good choice.

Joel

-- 

Life's Little Instruction Book #464

	"Don't miss the magic of the moment by focusing on what's
	 to come."

Joel Becker
Principal Software Developer
Oracle
E-mail: joel.becker at oracle.com
Phone: (650) 506-8127

^ permalink raw reply	[flat|nested] 16+ messages in thread

* [Ocfs2-devel] [PATCH 3/3] o2net: correct keepalive message protocol
  2010-02-17  5:56   ` Joel Becker
@ 2010-02-17 18:24     ` Srinivas Eeda
  2010-02-17 22:40       ` Sunil Mushran
  2010-02-18  1:49       ` Joel Becker
  0 siblings, 2 replies; 16+ messages in thread
From: Srinivas Eeda @ 2010-02-17 18:24 UTC (permalink / raw)
  To: ocfs2-devel

Each node that has this patch would send a O2NET_MSG_KEEP_REQ_MAGIC 
every 2 seconds(default). So, nodes without this patch would always 
receive a heartbeat message every 2 seconds.

Nodes without this patch will send(respond) with 
O2NET_MSG_KEEP_RESP_MAGIC for every keep alive packet they received. So 
nodes with this patch will always receive a response message.

So, in a mixed setup, both nodes will always hear the heartbeat from 
each other :).

thanks,
--Srini



Joel Becker wrote:
> On Thu, Jan 28, 2010 at 08:51:11PM -0800, Srinivas Eeda wrote:
>   
>>  		case O2NET_MSG_KEEP_REQ_MAGIC:
>> -			o2net_sendpage(sc, o2net_keep_resp,
>> -				       sizeof(*o2net_keep_resp));
>> +			/* Each node now sends keepalive message every
>> +			 * keepalive time interval. Hence no need for response
>> +			 */
>>  			goto out;
>>     
>
> 	You still have to send the response.  Think about a mixed
> environment where some nodes have this fix and some do not.  The older
> software is still waiting on the response.
> 	The newer version can just ignore any responses it gets from
> other nodes.  But it has to send responses out just in case the other
> node is older.
> 	The only other alternative is to bump the o2net protocol
> version, and that means the cluster has to be shut down to upgrade.  Not
> a good choice.
>
> Joel
>
>   

^ permalink raw reply	[flat|nested] 16+ messages in thread

* [Ocfs2-devel] [PATCH 3/3] o2net: correct keepalive message protocol
  2010-02-17 18:24     ` Srinivas Eeda
@ 2010-02-17 22:40       ` Sunil Mushran
  2010-02-17 22:52         ` Srinivas Eeda
  2010-02-18  1:49       ` Joel Becker
  1 sibling, 1 reply; 16+ messages in thread
From: Sunil Mushran @ 2010-02-17 22:40 UTC (permalink / raw)
  To: ocfs2-devel

What's the harm in leaving it in?

Srinivas Eeda wrote:
> Each node that has this patch would send a O2NET_MSG_KEEP_REQ_MAGIC 
> every 2 seconds(default). So, nodes without this patch would always 
> receive a heartbeat message every 2 seconds.
>
> Nodes without this patch will send(respond) with 
> O2NET_MSG_KEEP_RESP_MAGIC for every keep alive packet they received. So 
> nodes with this patch will always receive a response message.
>
> So, in a mixed setup, both nodes will always hear the heartbeat from 
> each other :).
>
> thanks,
> --Srini
>
>
>
> Joel Becker wrote:
>   
>> On Thu, Jan 28, 2010 at 08:51:11PM -0800, Srinivas Eeda wrote:
>>   
>>     
>>>  		case O2NET_MSG_KEEP_REQ_MAGIC:
>>> -			o2net_sendpage(sc, o2net_keep_resp,
>>> -				       sizeof(*o2net_keep_resp));
>>> +			/* Each node now sends keepalive message every
>>> +			 * keepalive time interval. Hence no need for response
>>> +			 */
>>>  			goto out;
>>>     
>>>       
>> 	You still have to send the response.  Think about a mixed
>> environment where some nodes have this fix and some do not.  The older
>> software is still waiting on the response.
>> 	The newer version can just ignore any responses it gets from
>> other nodes.  But it has to send responses out just in case the other
>> node is older.
>> 	The only other alternative is to bump the o2net protocol
>> version, and that means the cluster has to be shut down to upgrade.  Not
>> a good choice.
>>
>> Joel
>>
>>   
>>     
>
>
> _______________________________________________
> Ocfs2-devel mailing list
> Ocfs2-devel at oss.oracle.com
> http://oss.oracle.com/mailman/listinfo/ocfs2-devel
>   

^ permalink raw reply	[flat|nested] 16+ messages in thread

* [Ocfs2-devel] [PATCH 3/3] o2net: correct keepalive message protocol
  2010-02-17 22:52         ` Srinivas Eeda
@ 2010-02-17 22:47           ` Sunil Mushran
  2010-02-17 23:45             ` Srinivas Eeda
  0 siblings, 1 reply; 16+ messages in thread
From: Sunil Mushran @ 2010-02-17 22:47 UTC (permalink / raw)
  To: ocfs2-devel

How will it double? The node will send a keepalive only if it has
not heard from the other node for 2 secs.

Srinivas Eeda wrote:
> No harm, just doubles heartbeat messages which is not required at all.
>
> Sunil Mushran wrote:
>> What's the harm in leaving it in?
>>
>> Srinivas Eeda wrote:
>>> Each node that has this patch would send a O2NET_MSG_KEEP_REQ_MAGIC 
>>> every 2 seconds(default). So, nodes without this patch would always 
>>> receive a heartbeat message every 2 seconds.
>>>
>>> Nodes without this patch will send(respond) with 
>>> O2NET_MSG_KEEP_RESP_MAGIC for every keep alive packet they received. 
>>> So nodes with this patch will always receive a response message.
>>>
>>> So, in a mixed setup, both nodes will always hear the heartbeat from 
>>> each other :).
>>>
>>> thanks,
>>> --Srini
>>>
>>>
>>>
>>> Joel Becker wrote:
>>>  
>>>> On Thu, Jan 28, 2010 at 08:51:11PM -0800, Srinivas Eeda wrote:
>>>>     
>>>>>          case O2NET_MSG_KEEP_REQ_MAGIC:
>>>>> -            o2net_sendpage(sc, o2net_keep_resp,
>>>>> -                       sizeof(*o2net_keep_resp));
>>>>> +            /* Each node now sends keepalive message every
>>>>> +             * keepalive time interval. Hence no need for response
>>>>> +             */
>>>>>              goto out;
>>>>>           
>>>>     You still have to send the response.  Think about a mixed
>>>> environment where some nodes have this fix and some do not.  The older
>>>> software is still waiting on the response.
>>>>     The newer version can just ignore any responses it gets from
>>>> other nodes.  But it has to send responses out just in case the other
>>>> node is older.
>>>>     The only other alternative is to bump the o2net protocol
>>>> version, and that means the cluster has to be shut down to 
>>>> upgrade.  Not
>>>> a good choice.
>>>>
>>>> Joel
>>>>
>>>>       
>>>
>>>
>>> _______________________________________________
>>> Ocfs2-devel mailing list
>>> Ocfs2-devel at oss.oracle.com
>>> http://oss.oracle.com/mailman/listinfo/ocfs2-devel
>>>   
>>
>

^ permalink raw reply	[flat|nested] 16+ messages in thread

* [Ocfs2-devel] [PATCH 3/3] o2net: correct keepalive message protocol
  2010-02-17 22:40       ` Sunil Mushran
@ 2010-02-17 22:52         ` Srinivas Eeda
  2010-02-17 22:47           ` Sunil Mushran
  0 siblings, 1 reply; 16+ messages in thread
From: Srinivas Eeda @ 2010-02-17 22:52 UTC (permalink / raw)
  To: ocfs2-devel

No harm, just doubles heartbeat messages which is not required at all.

Sunil Mushran wrote:
> What's the harm in leaving it in?
>
> Srinivas Eeda wrote:
>> Each node that has this patch would send a O2NET_MSG_KEEP_REQ_MAGIC 
>> every 2 seconds(default). So, nodes without this patch would always 
>> receive a heartbeat message every 2 seconds.
>>
>> Nodes without this patch will send(respond) with 
>> O2NET_MSG_KEEP_RESP_MAGIC for every keep alive packet they received. 
>> So nodes with this patch will always receive a response message.
>>
>> So, in a mixed setup, both nodes will always hear the heartbeat from 
>> each other :).
>>
>> thanks,
>> --Srini
>>
>>
>>
>> Joel Becker wrote:
>>  
>>> On Thu, Jan 28, 2010 at 08:51:11PM -0800, Srinivas Eeda wrote:
>>>      
>>>>          case O2NET_MSG_KEEP_REQ_MAGIC:
>>>> -            o2net_sendpage(sc, o2net_keep_resp,
>>>> -                       sizeof(*o2net_keep_resp));
>>>> +            /* Each node now sends keepalive message every
>>>> +             * keepalive time interval. Hence no need for response
>>>> +             */
>>>>              goto out;
>>>>           
>>>     You still have to send the response.  Think about a mixed
>>> environment where some nodes have this fix and some do not.  The older
>>> software is still waiting on the response.
>>>     The newer version can just ignore any responses it gets from
>>> other nodes.  But it has to send responses out just in case the other
>>> node is older.
>>>     The only other alternative is to bump the o2net protocol
>>> version, and that means the cluster has to be shut down to upgrade.  
>>> Not
>>> a good choice.
>>>
>>> Joel
>>>
>>>       
>>
>>
>> _______________________________________________
>> Ocfs2-devel mailing list
>> Ocfs2-devel at oss.oracle.com
>> http://oss.oracle.com/mailman/listinfo/ocfs2-devel
>>   
>

^ permalink raw reply	[flat|nested] 16+ messages in thread

* [Ocfs2-devel] [PATCH 3/3] o2net: correct keepalive message protocol
  2010-02-17 22:47           ` Sunil Mushran
@ 2010-02-17 23:45             ` Srinivas Eeda
  2010-02-17 23:50               ` Sunil Mushran
  0 siblings, 1 reply; 16+ messages in thread
From: Srinivas Eeda @ 2010-02-17 23:45 UTC (permalink / raw)
  To: ocfs2-devel

In old code a node cancels and re queues keep alive message when it 
hears from the other node. If it didn't hear in 2 seconds, queued 
message gets fired which sends a keep alive message. And a re queue 
happens only after it hears from the other node.

With the new change, a node sends keep alive every 2 seconds.

Sunil Mushran wrote:
> How will it double? The node will send a keepalive only if it has
> not heard from the other node for 2 secs.
>
> Srinivas Eeda wrote:
>> No harm, just doubles heartbeat messages which is not required at all.
>>
>> Sunil Mushran wrote:
>>> What's the harm in leaving it in?
>>>
>>> Srinivas Eeda wrote:
>>>> Each node that has this patch would send a O2NET_MSG_KEEP_REQ_MAGIC 
>>>> every 2 seconds(default). So, nodes without this patch would always 
>>>> receive a heartbeat message every 2 seconds.
>>>>
>>>> Nodes without this patch will send(respond) with 
>>>> O2NET_MSG_KEEP_RESP_MAGIC for every keep alive packet they 
>>>> received. So nodes with this patch will always receive a response 
>>>> message.
>>>>
>>>> So, in a mixed setup, both nodes will always hear the heartbeat 
>>>> from each other :).
>>>>
>>>> thanks,
>>>> --Srini
>>>>
>>>>
>>>>
>>>> Joel Becker wrote:
>>>>  
>>>>> On Thu, Jan 28, 2010 at 08:51:11PM -0800, Srinivas Eeda wrote:
>>>>>    
>>>>>>          case O2NET_MSG_KEEP_REQ_MAGIC:
>>>>>> -            o2net_sendpage(sc, o2net_keep_resp,
>>>>>> -                       sizeof(*o2net_keep_resp));
>>>>>> +            /* Each node now sends keepalive message every
>>>>>> +             * keepalive time interval. Hence no need for response
>>>>>> +             */
>>>>>>              goto out;
>>>>>>           
>>>>>     You still have to send the response.  Think about a mixed
>>>>> environment where some nodes have this fix and some do not.  The 
>>>>> older
>>>>> software is still waiting on the response.
>>>>>     The newer version can just ignore any responses it gets from
>>>>> other nodes.  But it has to send responses out just in case the other
>>>>> node is older.
>>>>>     The only other alternative is to bump the o2net protocol
>>>>> version, and that means the cluster has to be shut down to 
>>>>> upgrade.  Not
>>>>> a good choice.
>>>>>
>>>>> Joel
>>>>>
>>>>>       
>>>>
>>>>
>>>> _______________________________________________
>>>> Ocfs2-devel mailing list
>>>> Ocfs2-devel at oss.oracle.com
>>>> http://oss.oracle.com/mailman/listinfo/ocfs2-devel
>>>>   
>>>
>>
>

^ permalink raw reply	[flat|nested] 16+ messages in thread

* [Ocfs2-devel] [PATCH 3/3] o2net: correct keepalive message protocol
  2010-02-17 23:45             ` Srinivas Eeda
@ 2010-02-17 23:50               ` Sunil Mushran
  2010-02-18  0:59                 ` srinivas eeda
  0 siblings, 1 reply; 16+ messages in thread
From: Sunil Mushran @ 2010-02-17 23:50 UTC (permalink / raw)
  To: ocfs2-devel

My understanding was that we'll also requeue after sending a keepalive.
As in, not wait for the response to requeue. But we'll still be smart about
it in the sense that not send a hb even if the nodes are communicating
otherwise.

Srinivas Eeda wrote:
> In old code a node cancels and re queues keep alive message when it 
> hears from the other node. If it didn't hear in 2 seconds, queued 
> message gets fired which sends a keep alive message. And a re queue 
> happens only after it hears from the other node.
>
> With the new change, a node sends keep alive every 2 seconds.
>
> Sunil Mushran wrote:
>> How will it double? The node will send a keepalive only if it has
>> not heard from the other node for 2 secs.
>>
>> Srinivas Eeda wrote:
>>> No harm, just doubles heartbeat messages which is not required at all.
>>>
>>> Sunil Mushran wrote:
>>>> What's the harm in leaving it in?
>>>>
>>>> Srinivas Eeda wrote:
>>>>> Each node that has this patch would send a 
>>>>> O2NET_MSG_KEEP_REQ_MAGIC every 2 seconds(default). So, nodes 
>>>>> without this patch would always receive a heartbeat message every 
>>>>> 2 seconds.
>>>>>
>>>>> Nodes without this patch will send(respond) with 
>>>>> O2NET_MSG_KEEP_RESP_MAGIC for every keep alive packet they 
>>>>> received. So nodes with this patch will always receive a response 
>>>>> message.
>>>>>
>>>>> So, in a mixed setup, both nodes will always hear the heartbeat 
>>>>> from each other :).
>>>>>
>>>>> thanks,
>>>>> --Srini
>>>>>
>>>>>
>>>>>
>>>>> Joel Becker wrote:
>>>>>  
>>>>>> On Thu, Jan 28, 2010 at 08:51:11PM -0800, Srinivas Eeda wrote:
>>>>>>   
>>>>>>>          case O2NET_MSG_KEEP_REQ_MAGIC:
>>>>>>> -            o2net_sendpage(sc, o2net_keep_resp,
>>>>>>> -                       sizeof(*o2net_keep_resp));
>>>>>>> +            /* Each node now sends keepalive message every
>>>>>>> +             * keepalive time interval. Hence no need for response
>>>>>>> +             */
>>>>>>>              goto out;
>>>>>>>           
>>>>>>     You still have to send the response.  Think about a mixed
>>>>>> environment where some nodes have this fix and some do not.  The 
>>>>>> older
>>>>>> software is still waiting on the response.
>>>>>>     The newer version can just ignore any responses it gets from
>>>>>> other nodes.  But it has to send responses out just in case the 
>>>>>> other
>>>>>> node is older.
>>>>>>     The only other alternative is to bump the o2net protocol
>>>>>> version, and that means the cluster has to be shut down to 
>>>>>> upgrade.  Not
>>>>>> a good choice.
>>>>>>
>>>>>> Joel
>>>>>>
>>>>>>       
>>>>>
>>>>>
>>>>> _______________________________________________
>>>>> Ocfs2-devel mailing list
>>>>> Ocfs2-devel at oss.oracle.com
>>>>> http://oss.oracle.com/mailman/listinfo/ocfs2-devel
>>>>>   
>>>>
>>>
>>
>

^ permalink raw reply	[flat|nested] 16+ messages in thread

* [Ocfs2-devel] [PATCH 3/3] o2net: correct keepalive message protocol
  2010-02-17 23:50               ` Sunil Mushran
@ 2010-02-18  0:59                 ` srinivas eeda
  2010-02-18  1:08                   ` Sunil Mushran
  0 siblings, 1 reply; 16+ messages in thread
From: srinivas eeda @ 2010-02-18  0:59 UTC (permalink / raw)
  To: ocfs2-devel

Ok, I'll modify the patch. Are messages queued on o2net_wq and execution 
of o2net_process_message is always done in the context of o2net thread 
and are synchronized?

On 2/17/2010 3:50 PM, Sunil Mushran wrote:
> My understanding was that we'll also requeue after sending a keepalive.
> As in, not wait for the response to requeue. But we'll still be smart 
> about
> it in the sense that not send a hb even if the nodes are communicating
> otherwise.
>
> Srinivas Eeda wrote:
>> In old code a node cancels and re queues keep alive message when it 
>> hears from the other node. If it didn't hear in 2 seconds, queued 
>> message gets fired which sends a keep alive message. And a re queue 
>> happens only after it hears from the other node.
>>
>> With the new change, a node sends keep alive every 2 seconds.
>>
>> Sunil Mushran wrote:
>>> How will it double? The node will send a keepalive only if it has
>>> not heard from the other node for 2 secs.
>>>
>>> Srinivas Eeda wrote:
>>>> No harm, just doubles heartbeat messages which is not required at all.
>>>>
>>>> Sunil Mushran wrote:
>>>>> What's the harm in leaving it in?
>>>>>
>>>>> Srinivas Eeda wrote:
>>>>>> Each node that has this patch would send a 
>>>>>> O2NET_MSG_KEEP_REQ_MAGIC every 2 seconds(default). So, nodes 
>>>>>> without this patch would always receive a heartbeat message every 
>>>>>> 2 seconds.
>>>>>>
>>>>>> Nodes without this patch will send(respond) with 
>>>>>> O2NET_MSG_KEEP_RESP_MAGIC for every keep alive packet they 
>>>>>> received. So nodes with this patch will always receive a response 
>>>>>> message.
>>>>>>
>>>>>> So, in a mixed setup, both nodes will always hear the heartbeat 
>>>>>> from each other :).
>>>>>>
>>>>>> thanks,
>>>>>> --Srini
>>>>>>
>>>>>>
>>>>>>
>>>>>> Joel Becker wrote:
>>>>>>  
>>>>>>> On Thu, Jan 28, 2010 at 08:51:11PM -0800, Srinivas Eeda wrote:
>>>>>>>  
>>>>>>>>          case O2NET_MSG_KEEP_REQ_MAGIC:
>>>>>>>> -            o2net_sendpage(sc, o2net_keep_resp,
>>>>>>>> -                       sizeof(*o2net_keep_resp));
>>>>>>>> +            /* Each node now sends keepalive message every
>>>>>>>> +             * keepalive time interval. Hence no need for 
>>>>>>>> response
>>>>>>>> +             */
>>>>>>>>              goto out;
>>>>>>>>           
>>>>>>>     You still have to send the response.  Think about a mixed
>>>>>>> environment where some nodes have this fix and some do not.  The 
>>>>>>> older
>>>>>>> software is still waiting on the response.
>>>>>>>     The newer version can just ignore any responses it gets from
>>>>>>> other nodes.  But it has to send responses out just in case the 
>>>>>>> other
>>>>>>> node is older.
>>>>>>>     The only other alternative is to bump the o2net protocol
>>>>>>> version, and that means the cluster has to be shut down to 
>>>>>>> upgrade.  Not
>>>>>>> a good choice.
>>>>>>>
>>>>>>> Joel
>>>>>>>
>>>>>>>       
>>>>>>
>>>>>>
>>>>>> _______________________________________________
>>>>>> Ocfs2-devel mailing list
>>>>>> Ocfs2-devel at oss.oracle.com
>>>>>> http://oss.oracle.com/mailman/listinfo/ocfs2-devel
>>>>>>   
>>>>>
>>>>
>>>
>>
>

^ permalink raw reply	[flat|nested] 16+ messages in thread

* [Ocfs2-devel] [PATCH 3/3] o2net: correct keepalive message protocol
  2010-02-18  0:59                 ` srinivas eeda
@ 2010-02-18  1:08                   ` Sunil Mushran
  2010-02-18  1:46                     ` srinivas eeda
  0 siblings, 1 reply; 16+ messages in thread
From: Sunil Mushran @ 2010-02-18  1:08 UTC (permalink / raw)
  To: ocfs2-devel

srinivas eeda wrote:
> Ok, I'll modify the patch. Are messages queued on o2net_wq and 
> execution of o2net_process_message is always done in the context of 
> o2net thread and are synchronized?
>
o2net is the context. Synchronized?

^ permalink raw reply	[flat|nested] 16+ messages in thread

* [Ocfs2-devel] [PATCH 3/3] o2net: correct keepalive message protocol
  2010-02-18  1:08                   ` Sunil Mushran
@ 2010-02-18  1:46                     ` srinivas eeda
  0 siblings, 0 replies; 16+ messages in thread
From: srinivas eeda @ 2010-02-18  1:46 UTC (permalink / raw)
  To: ocfs2-devel

whenever a message is received o2net_process_message will call 
o2net_sc_postpone_idle to cancel/requeue keepalive message. What will 
happen if the 2 seconds have passed right before the cancel was issued. 
Will the work gets queued which will be executed later by o2net thread 
or will o2net thread gets interrupted and will execute this work? or can 
both codes be executed in parallel?

On 2/17/2010 5:08 PM, Sunil Mushran wrote:
> srinivas eeda wrote:
>> Ok, I'll modify the patch. Are messages queued on o2net_wq and 
>> execution of o2net_process_message is always done in the context of 
>> o2net thread and are synchronized?
>>
> o2net is the context. Synchronized?

^ permalink raw reply	[flat|nested] 16+ messages in thread

* [Ocfs2-devel] [PATCH 3/3] o2net: correct keepalive message protocol
  2010-02-17 18:24     ` Srinivas Eeda
  2010-02-17 22:40       ` Sunil Mushran
@ 2010-02-18  1:49       ` Joel Becker
  2010-02-18  1:55         ` srinivas eeda
  1 sibling, 1 reply; 16+ messages in thread
From: Joel Becker @ 2010-02-18  1:49 UTC (permalink / raw)
  To: ocfs2-devel

On Wed, Feb 17, 2010 at 10:24:30AM -0800, Srinivas Eeda wrote:
> Each node that has this patch would send a O2NET_MSG_KEEP_REQ_MAGIC
> every 2 seconds(default). So, nodes without this patch would always
> receive a heartbeat message every 2 seconds.

	So, old nodes do not care if they never receive RESP_MAGIC as
long as they got some other message?

Joel

-- 

 There are morethings in heaven and earth, Horatio,
 Than are dreamt of in your philosophy.

Joel Becker
Principal Software Developer
Oracle
E-mail: joel.becker at oracle.com
Phone: (650) 506-8127

^ permalink raw reply	[flat|nested] 16+ messages in thread

* [Ocfs2-devel] [PATCH 3/3] o2net: correct keepalive message protocol
  2010-02-18  1:49       ` Joel Becker
@ 2010-02-18  1:55         ` srinivas eeda
  0 siblings, 0 replies; 16+ messages in thread
From: srinivas eeda @ 2010-02-18  1:55 UTC (permalink / raw)
  To: ocfs2-devel

Yea, they don't expect/wait for a response for keep alive message.

On 2/17/2010 5:49 PM, Joel Becker wrote:
> On Wed, Feb 17, 2010 at 10:24:30AM -0800, Srinivas Eeda wrote:
>   
>> Each node that has this patch would send a O2NET_MSG_KEEP_REQ_MAGIC
>> every 2 seconds(default). So, nodes without this patch would always
>> receive a heartbeat message every 2 seconds.
>>     
>
> 	So, old nodes do not care if they never receive RESP_MAGIC as
> long as they got some other message?
>
> Joel
>
>   
-------------- next part --------------
An HTML attachment was scrubbed...
URL: http://oss.oracle.com/pipermail/ocfs2-devel/attachments/20100217/09dedd52/attachment.html 

^ permalink raw reply	[flat|nested] 16+ messages in thread

end of thread, other threads:[~2010-02-18  1:55 UTC | newest]

Thread overview: 16+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2010-01-29  4:51 [Ocfs2-devel] o2net patches Srinivas Eeda
2010-01-29  4:51 ` [Ocfs2-devel] [PATCH 1/3] o2net: rollback reconnect on network timeout Srinivas Eeda
2010-01-29  4:51 ` [Ocfs2-devel] [PATCH 2/3] o2net: delay enotconn for sends receives till quorum decision Srinivas Eeda
2010-01-29  4:51 ` [Ocfs2-devel] [PATCH 3/3] o2net: correct keepalive message protocol Srinivas Eeda
2010-02-17  5:56   ` Joel Becker
2010-02-17 18:24     ` Srinivas Eeda
2010-02-17 22:40       ` Sunil Mushran
2010-02-17 22:52         ` Srinivas Eeda
2010-02-17 22:47           ` Sunil Mushran
2010-02-17 23:45             ` Srinivas Eeda
2010-02-17 23:50               ` Sunil Mushran
2010-02-18  0:59                 ` srinivas eeda
2010-02-18  1:08                   ` Sunil Mushran
2010-02-18  1:46                     ` srinivas eeda
2010-02-18  1:49       ` Joel Becker
2010-02-18  1:55         ` srinivas eeda

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.