All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH] libceph: use keepalive2 to verify the mon session is alive
@ 2015-09-01 14:21 Yan, Zheng
  2015-09-01 15:52 ` Ilya Dryomov
  0 siblings, 1 reply; 7+ messages in thread
From: Yan, Zheng @ 2015-09-01 14:21 UTC (permalink / raw)
  To: ceph-devel; +Cc: idryomov, Yan, Zheng

Signed-off-by: Yan, Zheng <zyan@redhat.com>
---
 include/linux/ceph/libceph.h   |  2 ++
 include/linux/ceph/messenger.h |  4 +++
 include/linux/ceph/msgr.h      |  4 ++-
 net/ceph/ceph_common.c         | 18 ++++++++++++-
 net/ceph/messenger.c           | 60 ++++++++++++++++++++++++++++++++++++++----
 net/ceph/mon_client.c          | 38 ++++++++++++++++++++------
 6 files changed, 111 insertions(+), 15 deletions(-)

diff --git a/include/linux/ceph/libceph.h b/include/linux/ceph/libceph.h
index 9ebee53..9a0b471 100644
--- a/include/linux/ceph/libceph.h
+++ b/include/linux/ceph/libceph.h
@@ -46,6 +46,7 @@ struct ceph_options {
 	unsigned long mount_timeout;		/* jiffies */
 	unsigned long osd_idle_ttl;		/* jiffies */
 	unsigned long osd_keepalive_timeout;	/* jiffies */
+	unsigned long mon_keepalive_timeout;	/* jiffies */
 
 	/*
 	 * any type that can't be simply compared or doesn't need need
@@ -66,6 +67,7 @@ struct ceph_options {
 #define CEPH_MOUNT_TIMEOUT_DEFAULT	msecs_to_jiffies(60 * 1000)
 #define CEPH_OSD_KEEPALIVE_DEFAULT	msecs_to_jiffies(5 * 1000)
 #define CEPH_OSD_IDLE_TTL_DEFAULT	msecs_to_jiffies(60 * 1000)
+#define CEPH_MON_KEEPALIVE_DEFAULT	msecs_to_jiffies(30 * 1000)
 
 #define CEPH_MSG_MAX_FRONT_LEN	(16*1024*1024)
 #define CEPH_MSG_MAX_MIDDLE_LEN	(16*1024*1024)
diff --git a/include/linux/ceph/messenger.h b/include/linux/ceph/messenger.h
index 3775327..83063b6 100644
--- a/include/linux/ceph/messenger.h
+++ b/include/linux/ceph/messenger.h
@@ -248,6 +248,8 @@ struct ceph_connection {
 	int in_base_pos;     /* bytes read */
 	__le64 in_temp_ack;  /* for reading an ack */
 
+	struct timespec last_keepalive_ack;
+
 	struct delayed_work work;	    /* send|recv work */
 	unsigned long       delay;          /* current delay interval */
 };
@@ -285,6 +287,8 @@ extern void ceph_msg_revoke(struct ceph_msg *msg);
 extern void ceph_msg_revoke_incoming(struct ceph_msg *msg);
 
 extern void ceph_con_keepalive(struct ceph_connection *con);
+extern int ceph_con_keepalive_expired(struct ceph_connection *con,
+				      unsigned long interval);
 
 extern void ceph_msg_data_add_pages(struct ceph_msg *msg, struct page **pages,
 				size_t length, size_t alignment);
diff --git a/include/linux/ceph/msgr.h b/include/linux/ceph/msgr.h
index 1c18872..0fe2656 100644
--- a/include/linux/ceph/msgr.h
+++ b/include/linux/ceph/msgr.h
@@ -84,10 +84,12 @@ struct ceph_entity_inst {
 #define CEPH_MSGR_TAG_MSG           7  /* message */
 #define CEPH_MSGR_TAG_ACK           8  /* message ack */
 #define CEPH_MSGR_TAG_KEEPALIVE     9  /* just a keepalive byte! */
-#define CEPH_MSGR_TAG_BADPROTOVER  10  /* bad protocol version */
+#define CEPH_MSGR_TAG_BADPROTOVER   10 /* bad protocol version */
 #define CEPH_MSGR_TAG_BADAUTHORIZER 11 /* bad authorizer */
 #define CEPH_MSGR_TAG_FEATURES      12 /* insufficient features */
 #define CEPH_MSGR_TAG_SEQ           13 /* 64-bit int follows with seen seq number */
+#define CEPH_MSGR_TAG_KEEPALIVE2    14 /* keepalive2 byte + ceph_timespec */
+#define CEPH_MSGR_TAG_KEEPALIVE2_ACK 15 /* keepalive2 reply */
 
 
 /*
diff --git a/net/ceph/ceph_common.c b/net/ceph/ceph_common.c
index f30329f..5143f6e 100644
--- a/net/ceph/ceph_common.c
+++ b/net/ceph/ceph_common.c
@@ -226,6 +226,7 @@ static int parse_fsid(const char *str, struct ceph_fsid *fsid)
  * ceph options
  */
 enum {
+	Opt_monkeepalivetimeout,
 	Opt_osdtimeout,
 	Opt_osdkeepalivetimeout,
 	Opt_mount_timeout,
@@ -250,6 +251,7 @@ enum {
 };
 
 static match_table_t opt_tokens = {
+	{Opt_monkeepalivetimeout, "monkeepalive=%d"},
 	{Opt_osdtimeout, "osdtimeout=%d"},
 	{Opt_osdkeepalivetimeout, "osdkeepalive=%d"},
 	{Opt_mount_timeout, "mount_timeout=%d"},
@@ -354,9 +356,10 @@ ceph_parse_options(char *options, const char *dev_name,
 
 	/* start with defaults */
 	opt->flags = CEPH_OPT_DEFAULT;
-	opt->osd_keepalive_timeout = CEPH_OSD_KEEPALIVE_DEFAULT;
 	opt->mount_timeout = CEPH_MOUNT_TIMEOUT_DEFAULT;
 	opt->osd_idle_ttl = CEPH_OSD_IDLE_TTL_DEFAULT;
+	opt->osd_keepalive_timeout = CEPH_OSD_KEEPALIVE_DEFAULT;
+	opt->mon_keepalive_timeout = CEPH_MON_KEEPALIVE_DEFAULT;
 
 	/* get mon ip(s) */
 	/* ip1[:port1][,ip2[:port2]...] */
@@ -460,6 +463,16 @@ ceph_parse_options(char *options, const char *dev_name,
 			}
 			opt->osd_idle_ttl = msecs_to_jiffies(intval * 1000);
 			break;
+		case Opt_monkeepalivetimeout:
+			/* 0 isn't well defined right now, reject it */
+			if (intval < 1 || intval > INT_MAX / 1000) {
+				pr_err("monkeepalive out of range\n");
+				err = -EINVAL;
+				goto out;
+			}
+			opt->mon_keepalive_timeout =
+					msecs_to_jiffies(intval * 1000);
+			break;
 		case Opt_mount_timeout:
 			/* 0 is "wait forever" (i.e. infinite timeout) */
 			if (intval < 0 || intval > INT_MAX / 1000) {
@@ -542,6 +555,9 @@ int ceph_print_client_options(struct seq_file *m, struct ceph_client *client)
 	if (opt->osd_keepalive_timeout != CEPH_OSD_KEEPALIVE_DEFAULT)
 		seq_printf(m, "osdkeepalivetimeout=%d,",
 		    jiffies_to_msecs(opt->osd_keepalive_timeout) / 1000);
+	if (opt->mon_keepalive_timeout != CEPH_MON_KEEPALIVE_DEFAULT)
+		seq_printf(m, "monkeepalivetimeout=%d,",
+		    jiffies_to_msecs(opt->mon_keepalive_timeout) / 1000);
 
 	/* drop redundant comma */
 	if (m->count != pos)
diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c
index 101ab62..6dfdd87 100644
--- a/net/ceph/messenger.c
+++ b/net/ceph/messenger.c
@@ -163,6 +163,7 @@ static struct kmem_cache	*ceph_msg_data_cache;
 static char tag_msg = CEPH_MSGR_TAG_MSG;
 static char tag_ack = CEPH_MSGR_TAG_ACK;
 static char tag_keepalive = CEPH_MSGR_TAG_KEEPALIVE;
+static char tag_keepalive2 = CEPH_MSGR_TAG_KEEPALIVE2;
 
 #ifdef CONFIG_LOCKDEP
 static struct lock_class_key socket_class;
@@ -1351,7 +1352,16 @@ static void prepare_write_keepalive(struct ceph_connection *con)
 {
 	dout("prepare_write_keepalive %p\n", con);
 	con_out_kvec_reset(con);
-	con_out_kvec_add(con, sizeof (tag_keepalive), &tag_keepalive);
+
+	if (con->peer_features & CEPH_FEATURE_MSGR_KEEPALIVE2) {
+		struct timespec ts = CURRENT_TIME;
+		struct ceph_timespec ceph_ts;
+		ceph_encode_timespec(&ceph_ts, &ts);
+		con_out_kvec_add(con, sizeof(tag_keepalive2), &tag_keepalive2);
+		con_out_kvec_add(con, sizeof(ceph_ts), &ceph_ts);
+	} else {
+		con_out_kvec_add(con, sizeof(tag_keepalive), &tag_keepalive);
+	}
 	con_flag_set(con, CON_FLAG_WRITE_PENDING);
 }
 
@@ -1625,6 +1635,12 @@ static void prepare_read_tag(struct ceph_connection *con)
 	con->in_tag = CEPH_MSGR_TAG_READY;
 }
 
+static void prepare_read_keepalive_ack(struct ceph_connection *con)
+{
+	dout("prepare_read_keepalive_ack %p\n", con);
+	con->in_base_pos = 0;
+}
+
 /*
  * Prepare to read a message.
  */
@@ -2457,6 +2473,17 @@ static void process_message(struct ceph_connection *con)
 	mutex_lock(&con->mutex);
 }
 
+static int read_keepalive_ack(struct ceph_connection *con)
+{
+	struct ceph_timespec ceph_ts;
+	size_t size = sizeof(ceph_ts);
+	int ret = read_partial(con, size, size, &ceph_ts);
+	if (ret <= 0)
+		return ret;
+	ceph_decode_timespec(&con->last_keepalive_ack, &ceph_ts);
+	prepare_read_tag(con);
+	return 1;
+}
 
 /*
  * Write something to the socket.  Called in a worker thread when the
@@ -2526,6 +2553,10 @@ more_kvec:
 
 do_next:
 	if (con->state == CON_STATE_OPEN) {
+		if (con_flag_test_and_clear(con, CON_FLAG_KEEPALIVE_PENDING)) {
+			prepare_write_keepalive(con);
+			goto more;
+		}
 		/* is anything else pending? */
 		if (!list_empty(&con->out_queue)) {
 			prepare_write_message(con);
@@ -2535,10 +2566,6 @@ do_next:
 			prepare_write_ack(con);
 			goto more;
 		}
-		if (con_flag_test_and_clear(con, CON_FLAG_KEEPALIVE_PENDING)) {
-			prepare_write_keepalive(con);
-			goto more;
-		}
 	}
 
 	/* Nothing to do! */
@@ -2641,6 +2668,9 @@ more:
 		case CEPH_MSGR_TAG_ACK:
 			prepare_read_ack(con);
 			break;
+		case CEPH_MSGR_TAG_KEEPALIVE2_ACK:
+			prepare_read_keepalive_ack(con);
+			break;
 		case CEPH_MSGR_TAG_CLOSE:
 			con_close_socket(con);
 			con->state = CON_STATE_CLOSED;
@@ -2684,6 +2714,12 @@ more:
 		process_ack(con);
 		goto more;
 	}
+	if (con->in_tag == CEPH_MSGR_TAG_KEEPALIVE2_ACK) {
+		ret = read_keepalive_ack(con);
+		if (ret <= 0)
+			goto out;
+		goto more;
+	}
 
 out:
 	dout("try_read done on %p ret %d\n", con, ret);
@@ -3101,6 +3137,20 @@ void ceph_con_keepalive(struct ceph_connection *con)
 }
 EXPORT_SYMBOL(ceph_con_keepalive);
 
+int ceph_con_keepalive_expired(struct ceph_connection *con,
+			       unsigned long interval)
+{
+	if (con->peer_features & CEPH_FEATURE_MSGR_KEEPALIVE2) {
+		struct timespec now = CURRENT_TIME;
+		struct timespec ts;
+		jiffies_to_timespec(interval, &ts);
+		ts = timespec_add(con->last_keepalive_ack, ts);
+		return timespec_compare(&now, &ts) >= 0;
+	}
+	return false;
+}
+EXPORT_SYMBOL(ceph_con_keepalive_expired);
+
 static struct ceph_msg_data *ceph_msg_data_create(enum ceph_msg_data_type type)
 {
 	struct ceph_msg_data *data;
diff --git a/net/ceph/mon_client.c b/net/ceph/mon_client.c
index 9d6ff12..3bcd332 100644
--- a/net/ceph/mon_client.c
+++ b/net/ceph/mon_client.c
@@ -149,6 +149,10 @@ static int __open_session(struct ceph_mon_client *monc)
 			      CEPH_ENTITY_TYPE_MON, monc->cur_mon,
 			      &monc->monmap->mon_inst[monc->cur_mon].addr);
 
+		/* send an initial keepalive to ensure our timestamp is
+		 * valid by the time we are in an OPENED state */
+		ceph_con_keepalive(&monc->con);
+
 		/* initiatiate authentication handshake */
 		ret = ceph_auth_build_hello(monc->auth,
 					    monc->m_auth->front.iov_base,
@@ -170,14 +174,19 @@ static bool __sub_expired(struct ceph_mon_client *monc)
  */
 static void __schedule_delayed(struct ceph_mon_client *monc)
 {
-	unsigned int delay;
+	struct ceph_options *opt = monc->client->options;
+	unsigned long delay;
 
-	if (monc->cur_mon < 0 || __sub_expired(monc))
+	if (monc->cur_mon < 0 || __sub_expired(monc)) {
 		delay = 10 * HZ;
-	else
+	} else {
 		delay = 20 * HZ;
-	dout("__schedule_delayed after %u\n", delay);
-	schedule_delayed_work(&monc->delayed_work, delay);
+		if (opt->mon_keepalive_timeout > 0)
+			delay = min(delay, opt->mon_keepalive_timeout >> 2);
+	}
+	dout("__schedule_delayed after %lu\n", delay);
+	schedule_delayed_work(&monc->delayed_work,
+			      round_jiffies_relative(delay));
 }
 
 /*
@@ -743,11 +752,24 @@ static void delayed_work(struct work_struct *work)
 		__close_session(monc);
 		__open_session(monc);  /* continue hunting */
 	} else {
-		ceph_con_keepalive(&monc->con);
+		struct ceph_options *opt = monc->client->options;
+		int is_auth = ceph_auth_is_authenticated(monc->auth);
+		if (is_auth && opt->mon_keepalive_timeout > 0 &&
+		    ceph_con_keepalive_expired(&monc->con,
+					opt->mon_keepalive_timeout)) {
+			dout("monc keepalive timeout\n");
+			is_auth = 0;
+			__close_session(monc);
+			monc->hunting = true;
+			__open_session(monc);
+		}
 
-		__validate_auth(monc);
+		if (!monc->hunting) {
+			ceph_con_keepalive(&monc->con);
+			__validate_auth(monc);
+		}
 
-		if (ceph_auth_is_authenticated(monc->auth))
+		if (is_auth)
 			__send_subscribe(monc);
 	}
 	__schedule_delayed(monc);
-- 
1.9.3


^ permalink raw reply related	[flat|nested] 7+ messages in thread

* Re: [PATCH] libceph: use keepalive2 to verify the mon session is alive
  2015-09-01 14:21 [PATCH] libceph: use keepalive2 to verify the mon session is alive Yan, Zheng
@ 2015-09-01 15:52 ` Ilya Dryomov
  0 siblings, 0 replies; 7+ messages in thread
From: Ilya Dryomov @ 2015-09-01 15:52 UTC (permalink / raw)
  To: Yan, Zheng; +Cc: Ceph Development

On Tue, Sep 1, 2015 at 5:21 PM, Yan, Zheng <zyan@redhat.com> wrote:
> Signed-off-by: Yan, Zheng <zyan@redhat.com>
> ---
>  include/linux/ceph/libceph.h   |  2 ++
>  include/linux/ceph/messenger.h |  4 +++
>  include/linux/ceph/msgr.h      |  4 ++-
>  net/ceph/ceph_common.c         | 18 ++++++++++++-
>  net/ceph/messenger.c           | 60 ++++++++++++++++++++++++++++++++++++++----
>  net/ceph/mon_client.c          | 38 ++++++++++++++++++++------
>  6 files changed, 111 insertions(+), 15 deletions(-)

Hi Zheng,

Just want to make sure you saw my "Re: " message - I had a few
suggestions.  This one looks like the one I commented on, straight from
the testing branch.

Thanks,

                Ilya

^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [PATCH] libceph: use keepalive2 to verify the mon session is alive
  2015-09-02  9:25     ` Yan, Zheng
  2015-09-02 15:23       ` Ilya Dryomov
@ 2015-09-02 15:59       ` Ilya Dryomov
  1 sibling, 0 replies; 7+ messages in thread
From: Ilya Dryomov @ 2015-09-02 15:59 UTC (permalink / raw)
  To: Yan, Zheng; +Cc: Ceph Development

On Wed, Sep 2, 2015 at 12:25 PM, Yan, Zheng <zyan@redhat.com> wrote:
>
>> On Sep 2, 2015, at 17:12, Ilya Dryomov <idryomov@gmail.com> wrote:
>>
>> On Wed, Sep 2, 2015 at 5:22 AM, Yan, Zheng <zyan@redhat.com> wrote:
>>> timespec_to_jiffies() does not work this way. it convert time delta in form of timespec to time delta in form of jiffies.
>>
>> Ah sorry, con->last_keepalive_ack is a realtime timespec from
>> userspace.
>>
>>>
>>> I will updated the patch according  to the rest comments .
>>
>> I still want ceph_con_keepalive_expired() to handle interval == 0
>> internally, so that opt->monc_ping_timeout can be passed without any
>> checks in the caller.
>>
>> I also noticed you added delay = max_t(unsigned long, delay, HZ); to
>> __schedule_delayed().  Is it really necessary?
>>
>
> Updated. please review.

One other thing you might want to do is to / 3 instead of >> 2 in
delayed_work().  That way the tick interval in the non-hunting case is
going to be 10 seconds, which matches the default for ping interval
(mon_client_ping_interval) in userspace.

Thanks,

                Ilya

^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [PATCH] libceph: use keepalive2 to verify the mon session is alive
  2015-09-02  9:25     ` Yan, Zheng
@ 2015-09-02 15:23       ` Ilya Dryomov
  2015-09-02 15:59       ` Ilya Dryomov
  1 sibling, 0 replies; 7+ messages in thread
From: Ilya Dryomov @ 2015-09-02 15:23 UTC (permalink / raw)
  To: Yan, Zheng; +Cc: Ceph Development

On Wed, Sep 2, 2015 at 12:25 PM, Yan, Zheng <zyan@redhat.com> wrote:
>
>> On Sep 2, 2015, at 17:12, Ilya Dryomov <idryomov@gmail.com> wrote:
>>
>> On Wed, Sep 2, 2015 at 5:22 AM, Yan, Zheng <zyan@redhat.com> wrote:
>>> timespec_to_jiffies() does not work this way. it convert time delta in form of timespec to time delta in form of jiffies.
>>
>> Ah sorry, con->last_keepalive_ack is a realtime timespec from
>> userspace.
>>
>>>
>>> I will updated the patch according  to the rest comments .
>>
>> I still want ceph_con_keepalive_expired() to handle interval == 0
>> internally, so that opt->monc_ping_timeout can be passed without any
>> checks in the caller.
>>
>> I also noticed you added delay = max_t(unsigned long, delay, HZ); to
>> __schedule_delayed().  Is it really necessary?
>>
>
> Updated. please review.

I was concerned about user facing option names and timeouts, since
that's what I had to deal with recently - those parts look OK to me
now.

Thanks,

                Ilya

^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [PATCH] libceph: use keepalive2 to verify the mon session is alive
  2015-09-02  9:12   ` Ilya Dryomov
@ 2015-09-02  9:25     ` Yan, Zheng
  2015-09-02 15:23       ` Ilya Dryomov
  2015-09-02 15:59       ` Ilya Dryomov
  0 siblings, 2 replies; 7+ messages in thread
From: Yan, Zheng @ 2015-09-02  9:25 UTC (permalink / raw)
  To: Ilya Dryomov; +Cc: Ceph Development


> On Sep 2, 2015, at 17:12, Ilya Dryomov <idryomov@gmail.com> wrote:
> 
> On Wed, Sep 2, 2015 at 5:22 AM, Yan, Zheng <zyan@redhat.com> wrote:
>> timespec_to_jiffies() does not work this way. it convert time delta in form of timespec to time delta in form of jiffies.
> 
> Ah sorry, con->last_keepalive_ack is a realtime timespec from
> userspace.
> 
>> 
>> I will updated the patch according  to the rest comments .
> 
> I still want ceph_con_keepalive_expired() to handle interval == 0
> internally, so that opt->monc_ping_timeout can be passed without any
> checks in the caller.
> 
> I also noticed you added delay = max_t(unsigned long, delay, HZ); to
> __schedule_delayed().  Is it really necessary?
> 

Updated. please review.

Regards
Yan, Zheng

^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [PATCH] libceph: use keepalive2 to verify the mon session is alive
       [not found] ` <AFB5599E-A06C-480C-8A9C-C42C9FC9EFE1@redhat.com>
@ 2015-09-02  9:12   ` Ilya Dryomov
  2015-09-02  9:25     ` Yan, Zheng
  0 siblings, 1 reply; 7+ messages in thread
From: Ilya Dryomov @ 2015-09-02  9:12 UTC (permalink / raw)
  To: Yan, Zheng; +Cc: Ceph Development

On Wed, Sep 2, 2015 at 5:22 AM, Yan, Zheng <zyan@redhat.com> wrote:
> timespec_to_jiffies() does not work this way. it convert time delta in form of timespec to time delta in form of jiffies.

Ah sorry, con->last_keepalive_ack is a realtime timespec from
userspace.

>
> I will updated the patch according  to the rest comments .

I still want ceph_con_keepalive_expired() to handle interval == 0
internally, so that opt->monc_ping_timeout can be passed without any
checks in the caller.

I also noticed you added delay = max_t(unsigned long, delay, HZ); to
__schedule_delayed().  Is it really necessary?

Thanks,

                Ilya

^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [PATCH] libceph: use keepalive2 to verify the mon session is alive
@ 2015-09-01 14:16 Ilya Dryomov
       [not found] ` <AFB5599E-A06C-480C-8A9C-C42C9FC9EFE1@redhat.com>
  0 siblings, 1 reply; 7+ messages in thread
From: Ilya Dryomov @ 2015-09-01 14:16 UTC (permalink / raw)
  To: Yan, Zheng; +Cc: Ceph Development

Hi Zheng,

I saw your keepalive2 patch - a couple of suggestions below.

> From 33b68dde7f27927a7cb1a7691e3c5b6f847ffd14 Mon Sep 17 00:00:00 2001
> From: "Yan, Zheng" <zyan@redhat.com>
> Date: Tue, 1 Sep 2015 17:19:38 +0800
> Subject: [PATCH] libceph: use keepalive2 to verify the mon session is alive
>
> Signed-off-by: Yan, Zheng <zyan@redhat.com>
> ---
>  include/linux/ceph/libceph.h   |  2 ++
>  include/linux/ceph/messenger.h |  4 +++
>  include/linux/ceph/msgr.h      |  4 ++-
>  net/ceph/ceph_common.c         | 18 ++++++++++++-
>  net/ceph/messenger.c           | 60 ++++++++++++++++++++++++++++++++++++++----
>  net/ceph/mon_client.c          | 38 ++++++++++++++++++++------
>  6 files changed, 111 insertions(+), 15 deletions(-)
>
> diff --git a/include/linux/ceph/libceph.h b/include/linux/ceph/libceph.h
> index 9ebee53..9a0b471 100644
> --- a/include/linux/ceph/libceph.h
> +++ b/include/linux/ceph/libceph.h
> @@ -46,6 +46,7 @@ struct ceph_options {
>       unsigned long mount_timeout;            /* jiffies */
>       unsigned long osd_idle_ttl;             /* jiffies */
>       unsigned long osd_keepalive_timeout;    /* jiffies */
> +     unsigned long mon_keepalive_timeout;    /* jiffies */
>
>       /*
>        * any type that can't be simply compared or doesn't need need
> @@ -66,6 +67,7 @@ struct ceph_options {
>  #define CEPH_MOUNT_TIMEOUT_DEFAULT   msecs_to_jiffies(60 * 1000)
>  #define CEPH_OSD_KEEPALIVE_DEFAULT   msecs_to_jiffies(5 * 1000)
>  #define CEPH_OSD_IDLE_TTL_DEFAULT    msecs_to_jiffies(60 * 1000)
> +#define CEPH_MON_KEEPALIVE_DEFAULT   msecs_to_jiffies(30 * 1000)
>
>  #define CEPH_MSG_MAX_FRONT_LEN       (16*1024*1024)
>  #define CEPH_MSG_MAX_MIDDLE_LEN      (16*1024*1024)
> diff --git a/include/linux/ceph/messenger.h b/include/linux/ceph/messenger.h
> index 3775327..83063b6 100644
> --- a/include/linux/ceph/messenger.h
> +++ b/include/linux/ceph/messenger.h
> @@ -248,6 +248,8 @@ struct ceph_connection {
>       int in_base_pos;     /* bytes read */
>       __le64 in_temp_ack;  /* for reading an ack */
>
> +     struct timespec last_keepalive_ack;
> +
>       struct delayed_work work;           /* send|recv work */
>       unsigned long       delay;          /* current delay interval */
>  };
> @@ -285,6 +287,8 @@ extern void ceph_msg_revoke(struct ceph_msg *msg);
>  extern void ceph_msg_revoke_incoming(struct ceph_msg *msg);
>
>  extern void ceph_con_keepalive(struct ceph_connection *con);
> +extern int ceph_con_keepalive_expired(struct ceph_connection *con,
> +                                   unsigned long interval);
>
>  extern void ceph_msg_data_add_pages(struct ceph_msg *msg, struct page **pages,
>                               size_t length, size_t alignment);
> diff --git a/include/linux/ceph/msgr.h b/include/linux/ceph/msgr.h
> index 1c18872..0fe2656 100644
> --- a/include/linux/ceph/msgr.h
> +++ b/include/linux/ceph/msgr.h
> @@ -84,10 +84,12 @@ struct ceph_entity_inst {
>  #define CEPH_MSGR_TAG_MSG           7  /* message */
>  #define CEPH_MSGR_TAG_ACK           8  /* message ack */
>  #define CEPH_MSGR_TAG_KEEPALIVE     9  /* just a keepalive byte! */
> -#define CEPH_MSGR_TAG_BADPROTOVER  10  /* bad protocol version */
> +#define CEPH_MSGR_TAG_BADPROTOVER   10 /* bad protocol version */
>  #define CEPH_MSGR_TAG_BADAUTHORIZER 11 /* bad authorizer */
>  #define CEPH_MSGR_TAG_FEATURES      12 /* insufficient features */
>  #define CEPH_MSGR_TAG_SEQ           13 /* 64-bit int follows with seen seq number */
> +#define CEPH_MSGR_TAG_KEEPALIVE2    14 /* keepalive2 byte + ceph_timespec */
> +#define CEPH_MSGR_TAG_KEEPALIVE2_ACK 15 /* keepalive2 reply */
>
>
>  /*
> diff --git a/net/ceph/ceph_common.c b/net/ceph/ceph_common.c
> index f30329f..5143f6e 100644
> --- a/net/ceph/ceph_common.c
> +++ b/net/ceph/ceph_common.c
> @@ -226,6 +226,7 @@ static int parse_fsid(const char *str, struct ceph_fsid *fsid)
>   * ceph options
>   */
>  enum {
> +     Opt_monkeepalivetimeout,
>       Opt_osdtimeout,
>       Opt_osdkeepalivetimeout,
>       Opt_mount_timeout,
> @@ -250,6 +251,7 @@ enum {
>  };
>
>  static match_table_t opt_tokens = {
> +     {Opt_monkeepalivetimeout, "monkeepalive=%d"},
>       {Opt_osdtimeout, "osdtimeout=%d"},
>       {Opt_osdkeepalivetimeout, "osdkeepalive=%d"},
>       {Opt_mount_timeout, "mount_timeout=%d"},
> @@ -354,9 +356,10 @@ ceph_parse_options(char *options, const char *dev_name,
>
>       /* start with defaults */
>       opt->flags = CEPH_OPT_DEFAULT;
> -     opt->osd_keepalive_timeout = CEPH_OSD_KEEPALIVE_DEFAULT;
>       opt->mount_timeout = CEPH_MOUNT_TIMEOUT_DEFAULT;
>       opt->osd_idle_ttl = CEPH_OSD_IDLE_TTL_DEFAULT;
> +     opt->osd_keepalive_timeout = CEPH_OSD_KEEPALIVE_DEFAULT;
> +     opt->mon_keepalive_timeout = CEPH_MON_KEEPALIVE_DEFAULT;

I suggest naming the option "monc_ping_timeout" and the corresponding
variable opt->monc_ping_timeout to be closer to userspace, or not
exporting it as a map/mount option at all (leave it in ceph_options but
not expose it to users).  osdkeepalive which you modeled this option
upon is a misnomer and refers to both tick interval and timeout, so
I don't think it's a good example to use.

>
>       /* get mon ip(s) */
>       /* ip1[:port1][,ip2[:port2]...] */
> @@ -460,6 +463,16 @@ ceph_parse_options(char *options, const char *dev_name,
>                       }
>                       opt->osd_idle_ttl = msecs_to_jiffies(intval * 1000);
>                       break;
> +             case Opt_monkeepalivetimeout:
> +                     /* 0 isn't well defined right now, reject it */

0 is actually well defined and handled, so if you decide to expose it
as monc_ping_timeout, this could be /* 0 is "don't ping" */ and

> +                     if (intval < 1 || intval > INT_MAX / 1000) {

intval < 0.

> +                             pr_err("monkeepalive out of range\n");
> +                             err = -EINVAL;
> +                             goto out;
> +                     }
> +                     opt->mon_keepalive_timeout =
> +                                     msecs_to_jiffies(intval * 1000);
> +                     break;
>               case Opt_mount_timeout:
>                       /* 0 is "wait forever" (i.e. infinite timeout) */
>                       if (intval < 0 || intval > INT_MAX / 1000) {
> @@ -542,6 +555,9 @@ int ceph_print_client_options(struct seq_file *m, struct ceph_client *client)
>       if (opt->osd_keepalive_timeout != CEPH_OSD_KEEPALIVE_DEFAULT)
>               seq_printf(m, "osdkeepalivetimeout=%d,",
>                   jiffies_to_msecs(opt->osd_keepalive_timeout) / 1000);
> +     if (opt->mon_keepalive_timeout != CEPH_MON_KEEPALIVE_DEFAULT)
> +             seq_printf(m, "monkeepalivetimeout=%d,",
> +                 jiffies_to_msecs(opt->mon_keepalive_timeout) / 1000);
>
>       /* drop redundant comma */
>       if (m->count != pos)
> diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c
> index 101ab62..6dfdd87 100644
> --- a/net/ceph/messenger.c
> +++ b/net/ceph/messenger.c
> @@ -163,6 +163,7 @@ static struct kmem_cache  *ceph_msg_data_cache;
>  static char tag_msg = CEPH_MSGR_TAG_MSG;
>  static char tag_ack = CEPH_MSGR_TAG_ACK;
>  static char tag_keepalive = CEPH_MSGR_TAG_KEEPALIVE;
> +static char tag_keepalive2 = CEPH_MSGR_TAG_KEEPALIVE2;
>
>  #ifdef CONFIG_LOCKDEP
>  static struct lock_class_key socket_class;
> @@ -1351,7 +1352,16 @@ static void prepare_write_keepalive(struct ceph_connection *con)
>  {
>       dout("prepare_write_keepalive %p\n", con);
>       con_out_kvec_reset(con);
> -     con_out_kvec_add(con, sizeof (tag_keepalive), &tag_keepalive);
> +
> +     if (con->peer_features & CEPH_FEATURE_MSGR_KEEPALIVE2) {
> +             struct timespec ts = CURRENT_TIME;
> +             struct ceph_timespec ceph_ts;
> +             ceph_encode_timespec(&ceph_ts, &ts);
> +             con_out_kvec_add(con, sizeof(tag_keepalive2), &tag_keepalive2);
> +             con_out_kvec_add(con, sizeof(ceph_ts), &ceph_ts);
> +     } else {
> +             con_out_kvec_add(con, sizeof(tag_keepalive), &tag_keepalive);
> +     }
>       con_flag_set(con, CON_FLAG_WRITE_PENDING);
>  }
>
> @@ -1625,6 +1635,12 @@ static void prepare_read_tag(struct ceph_connection *con)
>       con->in_tag = CEPH_MSGR_TAG_READY;
>  }
>
> +static void prepare_read_keepalive_ack(struct ceph_connection *con)
> +{
> +     dout("prepare_read_keepalive_ack %p\n", con);
> +     con->in_base_pos = 0;
> +}
> +
>  /*
>   * Prepare to read a message.
>   */
> @@ -2457,6 +2473,17 @@ static void process_message(struct ceph_connection *con)
>       mutex_lock(&con->mutex);
>  }
>
> +static int read_keepalive_ack(struct ceph_connection *con)
> +{
> +     struct ceph_timespec ceph_ts;
> +     size_t size = sizeof(ceph_ts);
> +     int ret = read_partial(con, size, size, &ceph_ts);
> +     if (ret <= 0)
> +             return ret;
> +     ceph_decode_timespec(&con->last_keepalive_ack, &ceph_ts);
> +     prepare_read_tag(con);
> +     return 1;
> +}
>
>  /*
>   * Write something to the socket.  Called in a worker thread when the
> @@ -2526,6 +2553,10 @@ static int try_write(struct ceph_connection *con)
>
>  do_next:
>       if (con->state == CON_STATE_OPEN) {
> +             if (con_flag_test_and_clear(con, CON_FLAG_KEEPALIVE_PENDING)) {
> +                     prepare_write_keepalive(con);
> +                     goto more;
> +             }
>               /* is anything else pending? */
>               if (!list_empty(&con->out_queue)) {
>                       prepare_write_message(con);
> @@ -2535,10 +2566,6 @@ static int try_write(struct ceph_connection *con)
>                       prepare_write_ack(con);
>                       goto more;
>               }
> -             if (con_flag_test_and_clear(con, CON_FLAG_KEEPALIVE_PENDING)) {
> -                     prepare_write_keepalive(con);
> -                     goto more;
> -             }
>       }
>
>       /* Nothing to do! */
> @@ -2641,6 +2668,9 @@ static int try_read(struct ceph_connection *con)
>               case CEPH_MSGR_TAG_ACK:
>                       prepare_read_ack(con);
>                       break;
> +             case CEPH_MSGR_TAG_KEEPALIVE2_ACK:
> +                     prepare_read_keepalive_ack(con);
> +                     break;
>               case CEPH_MSGR_TAG_CLOSE:
>                       con_close_socket(con);
>                       con->state = CON_STATE_CLOSED;
> @@ -2684,6 +2714,12 @@ static int try_read(struct ceph_connection *con)
>               process_ack(con);
>               goto more;
>       }
> +     if (con->in_tag == CEPH_MSGR_TAG_KEEPALIVE2_ACK) {
> +             ret = read_keepalive_ack(con);
> +             if (ret <= 0)
> +                     goto out;
> +             goto more;
> +     }
>
>  out:
>       dout("try_read done on %p ret %d\n", con, ret);
> @@ -3101,6 +3137,20 @@ void ceph_con_keepalive(struct ceph_connection *con)
>  }
>  EXPORT_SYMBOL(ceph_con_keepalive);
>
> +int ceph_con_keepalive_expired(struct ceph_connection *con,

Use bool here.  Also, since you encapsulated this math instead of
open-coding it in delayed_work(), I think it's this function's job to
handle inverval == 0.

> +                            unsigned long interval)
> +{
> +     if (con->peer_features & CEPH_FEATURE_MSGR_KEEPALIVE2) {
> +             struct timespec now = CURRENT_TIME;
> +             struct timespec ts;
> +             jiffies_to_timespec(interval, &ts);
> +             ts = timespec_add(con->last_keepalive_ack, ts);
> +             return timespec_compare(&now, &ts) >= 0;
> +     }
> +     return false;
> +}

If you count in jiffies, you can do without those timespec variables
and easily handle interval == 0 at the same time:

bool ceph_con_keepalive_expired(struct ceph_connection *con,
                                unsigned long timeout)
{
        unsigned long last_ack = timespec_to_jiffies(con->last_keepalive_ack);

        return (con->peer_features & CEPH_FEATURE_MSGR_KEEPALIVE2) &&
               timeout && time_is_before_eq_jiffies(last_ack + timeout);
}

> +EXPORT_SYMBOL(ceph_con_keepalive_expired);

I don't think EXPORT_SYMBOL is needed here.

Thanks,

                Ilya

^ permalink raw reply	[flat|nested] 7+ messages in thread

end of thread, other threads:[~2015-09-02 15:59 UTC | newest]

Thread overview: 7+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2015-09-01 14:21 [PATCH] libceph: use keepalive2 to verify the mon session is alive Yan, Zheng
2015-09-01 15:52 ` Ilya Dryomov
  -- strict thread matches above, loose matches on Subject: below --
2015-09-01 14:16 Ilya Dryomov
     [not found] ` <AFB5599E-A06C-480C-8A9C-C42C9FC9EFE1@redhat.com>
2015-09-02  9:12   ` Ilya Dryomov
2015-09-02  9:25     ` Yan, Zheng
2015-09-02 15:23       ` Ilya Dryomov
2015-09-02 15:59       ` Ilya Dryomov

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.