All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH v3 0/4] ceph: periodically send perf metrics to ceph
@ 2020-06-22 13:24 xiubli
  2020-06-22 13:24 ` [PATCH v3 1/4] ceph: add check_session_state helper and make it global xiubli
                   ` (3 more replies)
  0 siblings, 4 replies; 11+ messages in thread
From: xiubli @ 2020-06-22 13:24 UTC (permalink / raw)
  To: jlayton, idryomov; +Cc: zyan, pdonnell, ceph-devel, Xiubo Li

From: Xiubo Li <xiubli@redhat.com>

This series is based on the previous metrics patches in kceph[1] and on
the MDS daemon changes that record and forward client-side metrics to the
manager[2][3].

This will send the caps/read/write/metadata metrics to any available
MDS only once per second by default, the same as the userland client,
or every metric_send_interval seconds, where metric_send_interval is a
module parameter. Valid values for metric_send_interval are 0~5
seconds; 0 means disabled.

It will also send the metric flags to the MDS; currently the cap, read
latency, write latency and metadata latency metrics are supported.

I have also pushed this series to GitHub [4].

[1] https://patchwork.kernel.org/project/ceph-devel/list/?series=238907 [Merged]
[2] https://github.com/ceph/ceph/pull/26004 [Merged]
[3] https://github.com/ceph/ceph/pull/35608 [Merged]
[4] https://github.com/lxbsz/ceph-client/commits/perf_metric2

Changed in V3:
- fold "check the METRIC_COLLECT feature before sending metrics" into previous one
- use `enable_send_metrics` on/off switch instead

Changed in V2:
- split the patches into ones as small as possible.
- check the METRIC_COLLECT feature before sending metrics
- switch to WARN_ON and bubble up errnos to the callers


Xiubo Li (4):
  ceph: add check_session_state helper and make it global
  ceph: periodically send perf metrics to ceph
  ceph: switch to WARN_ON and bubble up errnos to the callers
  ceph: send client provided metric flags in client metadata

 fs/ceph/mds_client.c         | 152 ++++++++++++++++++++++++++++++++++---------
 fs/ceph/mds_client.h         |   8 ++-
 fs/ceph/metric.c             | 142 ++++++++++++++++++++++++++++++++++++++++
 fs/ceph/metric.h             |  91 ++++++++++++++++++++++++++
 fs/ceph/super.c              |  42 ++++++++++++
 fs/ceph/super.h              |   2 +
 include/linux/ceph/ceph_fs.h |   1 +
 7 files changed, 407 insertions(+), 31 deletions(-)

-- 
1.8.3.1

^ permalink raw reply	[flat|nested] 11+ messages in thread

* [PATCH v3 1/4] ceph: add check_session_state helper and make it global
  2020-06-22 13:24 [PATCH v3 0/4] ceph: periodically send perf metrics to ceph xiubli
@ 2020-06-22 13:24 ` xiubli
  2020-06-22 13:24 ` [PATCH v3 2/4] ceph: periodically send perf metrics to ceph xiubli
                   ` (2 subsequent siblings)
  3 siblings, 0 replies; 11+ messages in thread
From: xiubli @ 2020-06-22 13:24 UTC (permalink / raw)
  To: jlayton, idryomov; +Cc: zyan, pdonnell, ceph-devel, Xiubo Li

From: Xiubo Li <xiubli@redhat.com>

This will be used by the following patches that send metrics.

URL: https://tracker.ceph.com/issues/43215
Signed-off-by: Xiubo Li <xiubli@redhat.com>
---
 fs/ceph/mds_client.c | 43 ++++++++++++++++++++++++++-----------------
 fs/ceph/mds_client.h |  4 ++++
 2 files changed, 30 insertions(+), 17 deletions(-)

diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index a504971..608fb5c 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -4263,6 +4263,30 @@ static void maybe_recover_session(struct ceph_mds_client *mdsc)
 	ceph_force_reconnect(fsc->sb);
 }
 
+bool check_session_state(struct ceph_mds_client *mdsc,
+			 struct ceph_mds_session *s)
+{
+	if (s->s_state == CEPH_MDS_SESSION_CLOSING) {
+		dout("resending session close request for mds%d\n",
+				s->s_mds);
+		request_close_session(mdsc, s);
+		return false;
+	}
+	if (s->s_ttl && time_after(jiffies, s->s_ttl)) {
+		if (s->s_state == CEPH_MDS_SESSION_OPEN) {
+			s->s_state = CEPH_MDS_SESSION_HUNG;
+			pr_info("mds%d hung\n", s->s_mds);
+		}
+	}
+	if (s->s_state == CEPH_MDS_SESSION_NEW ||
+	    s->s_state == CEPH_MDS_SESSION_RESTARTING ||
+	    s->s_state == CEPH_MDS_SESSION_REJECTED)
+		/* this mds is failed or recovering, just wait */
+		return false;
+
+	return true;
+}
+
 /*
  * delayed work -- periodically trim expired leases, renew caps with mds
  */
@@ -4294,23 +4318,8 @@ static void delayed_work(struct work_struct *work)
 		struct ceph_mds_session *s = __ceph_lookup_mds_session(mdsc, i);
 		if (!s)
 			continue;
-		if (s->s_state == CEPH_MDS_SESSION_CLOSING) {
-			dout("resending session close request for mds%d\n",
-			     s->s_mds);
-			request_close_session(mdsc, s);
-			ceph_put_mds_session(s);
-			continue;
-		}
-		if (s->s_ttl && time_after(jiffies, s->s_ttl)) {
-			if (s->s_state == CEPH_MDS_SESSION_OPEN) {
-				s->s_state = CEPH_MDS_SESSION_HUNG;
-				pr_info("mds%d hung\n", s->s_mds);
-			}
-		}
-		if (s->s_state == CEPH_MDS_SESSION_NEW ||
-		    s->s_state == CEPH_MDS_SESSION_RESTARTING ||
-		    s->s_state == CEPH_MDS_SESSION_REJECTED) {
-			/* this mds is failed or recovering, just wait */
+
+		if (!check_session_state(mdsc, s)) {
 			ceph_put_mds_session(s);
 			continue;
 		}
diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
index 5e0c407..bcb3892 100644
--- a/fs/ceph/mds_client.h
+++ b/fs/ceph/mds_client.h
@@ -18,6 +18,7 @@
 #include <linux/ceph/auth.h>
 
 #include "metric.h"
+#include "super.h"
 
 /* The first 8 bits are reserved for old ceph releases */
 enum ceph_feature_type {
@@ -476,6 +477,9 @@ struct ceph_mds_client {
 
 extern const char *ceph_mds_op_name(int op);
 
+extern bool check_session_state(struct ceph_mds_client *mdsc,
+				struct ceph_mds_session *s);
+
 extern struct ceph_mds_session *
 __ceph_lookup_mds_session(struct ceph_mds_client *, int mds);
 
-- 
1.8.3.1

^ permalink raw reply related	[flat|nested] 11+ messages in thread

* [PATCH v3 2/4] ceph: periodically send perf metrics to ceph
  2020-06-22 13:24 [PATCH v3 0/4] ceph: periodically send perf metrics to ceph xiubli
  2020-06-22 13:24 ` [PATCH v3 1/4] ceph: add check_session_state helper and make it global xiubli
@ 2020-06-22 13:24 ` xiubli
  2020-06-23 17:24   ` Jeff Layton
  2020-06-22 13:24 ` [PATCH v3 3/4] ceph: switch to WARN_ON and bubble up errnos to the callers xiubli
  2020-06-22 13:25 ` [PATCH v3 4/4] ceph: send client provided metric flags in client metadata xiubli
  3 siblings, 1 reply; 11+ messages in thread
From: xiubli @ 2020-06-22 13:24 UTC (permalink / raw)
  To: jlayton, idryomov; +Cc: zyan, pdonnell, ceph-devel, Xiubo Li

From: Xiubo Li <xiubli@redhat.com>

This will send the caps/read/write/metadata metrics to any available
MDS only once per second by default, the same as the userland client,
or every metric_send_interval seconds, where metric_send_interval is a
module parameter.

Skip MDS sessions that don't support metric collection; otherwise the
MDS will close the socket connection directly when it gets a message of
an unknown type.

URL: https://tracker.ceph.com/issues/43215
Signed-off-by: Xiubo Li <xiubli@redhat.com>
---
 fs/ceph/mds_client.c         |   3 +
 fs/ceph/mds_client.h         |   4 +-
 fs/ceph/metric.c             | 142 +++++++++++++++++++++++++++++++++++++++++++
 fs/ceph/metric.h             |  78 ++++++++++++++++++++++++
 fs/ceph/super.c              |  42 +++++++++++++
 fs/ceph/super.h              |   2 +
 include/linux/ceph/ceph_fs.h |   1 +
 7 files changed, 271 insertions(+), 1 deletion(-)

diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index 608fb5c..f996363 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -4625,6 +4625,7 @@ void ceph_mdsc_close_sessions(struct ceph_mds_client *mdsc)
 
 	cancel_work_sync(&mdsc->cap_reclaim_work);
 	cancel_delayed_work_sync(&mdsc->delayed_work); /* cancel timer */
+	cancel_delayed_work_sync(&mdsc->metric.delayed_work); /* cancel timer */
 
 	dout("stopped\n");
 }
@@ -4667,6 +4668,7 @@ static void ceph_mdsc_stop(struct ceph_mds_client *mdsc)
 {
 	dout("stop\n");
 	cancel_delayed_work_sync(&mdsc->delayed_work); /* cancel timer */
+	cancel_delayed_work_sync(&mdsc->metric.delayed_work); /* cancel timer */
 	if (mdsc->mdsmap)
 		ceph_mdsmap_destroy(mdsc->mdsmap);
 	kfree(mdsc->sessions);
@@ -4824,6 +4826,7 @@ void ceph_mdsc_handle_mdsmap(struct ceph_mds_client *mdsc, struct ceph_msg *msg)
 
 	mutex_unlock(&mdsc->mutex);
 	schedule_delayed(mdsc);
+	metric_schedule_delayed(&mdsc->metric);
 	return;
 
 bad_unlock:
diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
index bcb3892..3c65ac1 100644
--- a/fs/ceph/mds_client.h
+++ b/fs/ceph/mds_client.h
@@ -28,8 +28,9 @@ enum ceph_feature_type {
 	CEPHFS_FEATURE_LAZY_CAP_WANTED,
 	CEPHFS_FEATURE_MULTI_RECONNECT,
 	CEPHFS_FEATURE_DELEG_INO,
+	CEPHFS_FEATURE_METRIC_COLLECT,
 
-	CEPHFS_FEATURE_MAX = CEPHFS_FEATURE_DELEG_INO,
+	CEPHFS_FEATURE_MAX = CEPHFS_FEATURE_METRIC_COLLECT,
 };
 
 /*
@@ -43,6 +44,7 @@ enum ceph_feature_type {
 	CEPHFS_FEATURE_LAZY_CAP_WANTED,		\
 	CEPHFS_FEATURE_MULTI_RECONNECT,		\
 	CEPHFS_FEATURE_DELEG_INO,		\
+	CEPHFS_FEATURE_METRIC_COLLECT,		\
 						\
 	CEPHFS_FEATURE_MAX,			\
 }
diff --git a/fs/ceph/metric.c b/fs/ceph/metric.c
index 9217f35..4267b46 100644
--- a/fs/ceph/metric.c
+++ b/fs/ceph/metric.c
@@ -1,10 +1,150 @@
 /* SPDX-License-Identifier: GPL-2.0 */
+#include <linux/ceph/ceph_debug.h>
 
 #include <linux/types.h>
 #include <linux/percpu_counter.h>
 #include <linux/math64.h>
 
 #include "metric.h"
+#include "mds_client.h"
+
+static bool ceph_mdsc_send_metrics(struct ceph_mds_client *mdsc,
+				   struct ceph_mds_session *s,
+				   u64 nr_caps)
+{
+	struct ceph_metric_head *head;
+	struct ceph_metric_cap *cap;
+	struct ceph_metric_read_latency *read;
+	struct ceph_metric_write_latency *write;
+	struct ceph_metric_metadata_latency *meta;
+	struct ceph_client_metric *m = &mdsc->metric;
+	struct ceph_msg *msg;
+	struct timespec64 ts;
+	s64 sum, total;
+	s32 items = 0;
+	s32 len;
+
+	len = sizeof(*head) + sizeof(*cap) + sizeof(*read) + sizeof(*write)
+	      + sizeof(*meta);
+
+	msg = ceph_msg_new(CEPH_MSG_CLIENT_METRICS, len, GFP_NOFS, true);
+	if (!msg) {
+		pr_err("send metrics to mds%d, failed to allocate message\n",
+		       s->s_mds);
+		return false;
+	}
+
+	head = msg->front.iov_base;
+
+	/* encode the cap metric */
+	cap = (struct ceph_metric_cap *)(head + 1);
+	cap->type = cpu_to_le32(CLIENT_METRIC_TYPE_CAP_INFO);
+	cap->ver = 1;
+	cap->compat = 1;
+	cap->data_len = cpu_to_le32(sizeof(*cap) - 10);
+	cap->hit = cpu_to_le64(percpu_counter_sum(&mdsc->metric.i_caps_hit));
+	cap->mis = cpu_to_le64(percpu_counter_sum(&mdsc->metric.i_caps_mis));
+	cap->total = cpu_to_le64(nr_caps);
+	items++;
+
+	/* encode the read latency metric */
+	read = (struct ceph_metric_read_latency *)(cap + 1);
+	read->type = cpu_to_le32(CLIENT_METRIC_TYPE_READ_LATENCY);
+	read->ver = 1;
+	read->compat = 1;
+	read->data_len = cpu_to_le32(sizeof(*read) - 10);
+	total = m->total_reads;
+	sum = m->read_latency_sum;
+	jiffies_to_timespec64(sum, &ts);
+	read->sec = cpu_to_le32(ts.tv_sec);
+	read->nsec = cpu_to_le32(ts.tv_nsec);
+	items++;
+
+	/* encode the write latency metric */
+	write = (struct ceph_metric_write_latency *)(read + 1);
+	write->type = cpu_to_le32(CLIENT_METRIC_TYPE_WRITE_LATENCY);
+	write->ver = 1;
+	write->compat = 1;
+	write->data_len = cpu_to_le32(sizeof(*write) - 10);
+	total = m->total_writes;
+	sum = m->write_latency_sum;
+	jiffies_to_timespec64(sum, &ts);
+	write->sec = cpu_to_le32(ts.tv_sec);
+	write->nsec = cpu_to_le32(ts.tv_nsec);
+	items++;
+
+	/* encode the metadata latency metric */
+	meta = (struct ceph_metric_metadata_latency *)(write + 1);
+	meta->type = cpu_to_le32(CLIENT_METRIC_TYPE_METADATA_LATENCY);
+	meta->ver = 1;
+	meta->compat = 1;
+	meta->data_len = cpu_to_le32(sizeof(*meta) - 10);
+	total = m->total_metadatas;
+	sum = m->metadata_latency_sum;
+	jiffies_to_timespec64(sum, &ts);
+	meta->sec = cpu_to_le32(ts.tv_sec);
+	meta->nsec = cpu_to_le32(ts.tv_nsec);
+	items++;
+
+	put_unaligned_le32(items, &head->num);
+	msg->front.iov_len = cpu_to_le32(len);
+	msg->hdr.version = cpu_to_le16(1);
+	msg->hdr.compat_version = cpu_to_le16(1);
+	msg->hdr.front_len = cpu_to_le32(msg->front.iov_len);
+	dout("client%llu send metrics to mds%d\n",
+	     ceph_client_gid(mdsc->fsc->client), s->s_mds);
+	ceph_con_send(&s->s_con, msg);
+
+	return true;
+}
+
+static void metric_delayed_work(struct work_struct *work)
+{
+	struct ceph_client_metric *m =
+		container_of(work, struct ceph_client_metric, delayed_work.work);
+	struct ceph_mds_client *mdsc =
+		container_of(m, struct ceph_mds_client, metric);
+	struct ceph_mds_session *s;
+	u64 nr_caps = 0;
+	bool ret;
+	int i;
+
+	mutex_lock(&mdsc->mutex);
+	for (i = 0; i < mdsc->max_sessions; i++) {
+		s = __ceph_lookup_mds_session(mdsc, i);
+		if (!s)
+			continue;
+		nr_caps += s->s_nr_caps;
+		ceph_put_mds_session(s);
+	}
+
+	for (i = 0; i < mdsc->max_sessions; i++) {
+		s = __ceph_lookup_mds_session(mdsc, i);
+		if (!s)
+			continue;
+		if (!check_session_state(mdsc, s)) {
+			ceph_put_mds_session(s);
+			continue;
+		}
+
+		/*
+		 * Skip it if MDS doesn't support the metric collection,
+		 * or the MDS will close the session's socket connection
+		 * directly when it get this message.
+		 */
+		if (!test_bit(CEPHFS_FEATURE_METRIC_COLLECT, &s->s_features))
+			continue;
+
+		/* Only send the metric once in any available session */
+		ret = ceph_mdsc_send_metrics(mdsc, s, nr_caps);
+		ceph_put_mds_session(s);
+		if (ret)
+			break;
+	}
+	mutex_unlock(&mdsc->mutex);
+
+	metric_schedule_delayed(&mdsc->metric);
+}
 
 int ceph_metric_init(struct ceph_client_metric *m)
 {
@@ -51,6 +191,8 @@ int ceph_metric_init(struct ceph_client_metric *m)
 	m->total_metadatas = 0;
 	m->metadata_latency_sum = 0;
 
+	INIT_DELAYED_WORK(&m->delayed_work, metric_delayed_work);
+
 	return 0;
 
 err_i_caps_mis:
diff --git a/fs/ceph/metric.h b/fs/ceph/metric.h
index ccd8128..5a1f8b9 100644
--- a/fs/ceph/metric.h
+++ b/fs/ceph/metric.h
@@ -6,6 +6,71 @@
 #include <linux/percpu_counter.h>
 #include <linux/ktime.h>
 
+extern bool enable_send_metrics;
+
+enum ceph_metric_type {
+	CLIENT_METRIC_TYPE_CAP_INFO,
+	CLIENT_METRIC_TYPE_READ_LATENCY,
+	CLIENT_METRIC_TYPE_WRITE_LATENCY,
+	CLIENT_METRIC_TYPE_METADATA_LATENCY,
+	CLIENT_METRIC_TYPE_DENTRY_LEASE,
+
+	CLIENT_METRIC_TYPE_MAX = CLIENT_METRIC_TYPE_DENTRY_LEASE,
+};
+
+/* metric caps header */
+struct ceph_metric_cap {
+	__le32 type;     /* ceph metric type */
+
+	__u8  ver;
+	__u8  compat;
+
+	__le32 data_len; /* length of sizeof(hit + mis + total) */
+	__le64 hit;
+	__le64 mis;
+	__le64 total;
+} __packed;
+
+/* metric read latency header */
+struct ceph_metric_read_latency {
+	__le32 type;     /* ceph metric type */
+
+	__u8  ver;
+	__u8  compat;
+
+	__le32 data_len; /* length of sizeof(sec + nsec) */
+	__le32 sec;
+	__le32 nsec;
+} __packed;
+
+/* metric write latency header */
+struct ceph_metric_write_latency {
+	__le32 type;     /* ceph metric type */
+
+	__u8  ver;
+	__u8  compat;
+
+	__le32 data_len; /* length of sizeof(sec + nsec) */
+	__le32 sec;
+	__le32 nsec;
+} __packed;
+
+/* metric metadata latency header */
+struct ceph_metric_metadata_latency {
+	__le32 type;     /* ceph metric type */
+
+	__u8  ver;
+	__u8  compat;
+
+	__le32 data_len; /* length of sizeof(sec + nsec) */
+	__le32 sec;
+	__le32 nsec;
+} __packed;
+
+struct ceph_metric_head {
+	__le32 num;	/* the number of metrics that will be sent */
+} __packed;
+
 /* This is the global metrics */
 struct ceph_client_metric {
 	atomic64_t            total_dentries;
@@ -35,8 +100,21 @@ struct ceph_client_metric {
 	ktime_t metadata_latency_sq_sum;
 	ktime_t metadata_latency_min;
 	ktime_t metadata_latency_max;
+
+	struct delayed_work delayed_work;  /* delayed work */
 };
 
+static inline void metric_schedule_delayed(struct ceph_client_metric *m)
+{
+	/* per second as default */
+	unsigned int hz = round_jiffies_relative(HZ * enable_send_metrics);
+
+	if (!enable_send_metrics)
+		return;
+
+	schedule_delayed_work(&m->delayed_work, hz);
+}
+
 extern int ceph_metric_init(struct ceph_client_metric *m);
 extern void ceph_metric_destroy(struct ceph_client_metric *m);
 
diff --git a/fs/ceph/super.c b/fs/ceph/super.c
index c9784eb1..49f20ea 100644
--- a/fs/ceph/super.c
+++ b/fs/ceph/super.c
@@ -27,6 +27,9 @@
 #include <linux/ceph/auth.h>
 #include <linux/ceph/debugfs.h>
 
+static DEFINE_MUTEX(ceph_fsc_lock);
+static LIST_HEAD(ceph_fsc_list);
+
 /*
  * Ceph superblock operations
  *
@@ -691,6 +694,10 @@ static struct ceph_fs_client *create_fs_client(struct ceph_mount_options *fsopt,
 	if (!fsc->wb_pagevec_pool)
 		goto fail_cap_wq;
 
+	mutex_lock(&ceph_fsc_lock);
+	list_add_tail(&fsc->list, &ceph_fsc_list);
+	mutex_unlock(&ceph_fsc_lock);
+
 	return fsc;
 
 fail_cap_wq:
@@ -717,6 +724,10 @@ static void destroy_fs_client(struct ceph_fs_client *fsc)
 {
 	dout("destroy_fs_client %p\n", fsc);
 
+	mutex_lock(&ceph_fsc_lock);
+	list_del(&fsc->list);
+	mutex_unlock(&ceph_fsc_lock);
+
 	ceph_mdsc_destroy(fsc);
 	destroy_workqueue(fsc->inode_wq);
 	destroy_workqueue(fsc->cap_wq);
@@ -1282,6 +1293,37 @@ static void __exit exit_ceph(void)
 	destroy_caches();
 }
 
+static int param_set_metrics(const char *val, const struct kernel_param *kp)
+{
+	struct ceph_fs_client *fsc;
+	int ret;
+
+	ret = param_set_bool(val, kp);
+	if (ret) {
+		pr_err("Failed to parse sending metrics switch value '%s'\n",
+		       val);
+		return ret;
+	} else if (enable_send_metrics) {
+		// wake up all the mds clients
+		mutex_lock(&ceph_fsc_lock);
+		list_for_each_entry(fsc, &ceph_fsc_list, list) {
+			metric_schedule_delayed(&fsc->mdsc->metric);
+		}
+		mutex_unlock(&ceph_fsc_lock);
+	}
+
+	return 0;
+}
+
+static const struct kernel_param_ops param_ops_metrics = {
+	.set = param_set_metrics,
+	.get = param_get_bool,
+};
+
+bool enable_send_metrics = true;
+module_param_cb(enable_send_metrics, &param_ops_metrics, &enable_send_metrics, 0644);
+MODULE_PARM_DESC(enable_send_metrics, "Enable sending perf metrics to ceph cluster (default: on)");
+
 module_init(init_ceph);
 module_exit(exit_ceph);
 
diff --git a/fs/ceph/super.h b/fs/ceph/super.h
index 5a6cdd3..05edc9a 100644
--- a/fs/ceph/super.h
+++ b/fs/ceph/super.h
@@ -101,6 +101,8 @@ struct ceph_mount_options {
 struct ceph_fs_client {
 	struct super_block *sb;
 
+	struct list_head list;
+
 	struct ceph_mount_options *mount_options;
 	struct ceph_client *client;
 
diff --git a/include/linux/ceph/ceph_fs.h b/include/linux/ceph/ceph_fs.h
index ebf5ba6..455e9b9 100644
--- a/include/linux/ceph/ceph_fs.h
+++ b/include/linux/ceph/ceph_fs.h
@@ -130,6 +130,7 @@ struct ceph_dir_layout {
 #define CEPH_MSG_CLIENT_REQUEST         24
 #define CEPH_MSG_CLIENT_REQUEST_FORWARD 25
 #define CEPH_MSG_CLIENT_REPLY           26
+#define CEPH_MSG_CLIENT_METRICS         29
 #define CEPH_MSG_CLIENT_CAPS            0x310
 #define CEPH_MSG_CLIENT_LEASE           0x311
 #define CEPH_MSG_CLIENT_SNAP            0x312
-- 
1.8.3.1

^ permalink raw reply related	[flat|nested] 11+ messages in thread

* [PATCH v3 3/4] ceph: switch to WARN_ON and bubble up errnos to the callers
  2020-06-22 13:24 [PATCH v3 0/4] ceph: periodically send perf metrics to ceph xiubli
  2020-06-22 13:24 ` [PATCH v3 1/4] ceph: add check_session_state helper and make it global xiubli
  2020-06-22 13:24 ` [PATCH v3 2/4] ceph: periodically send perf metrics to ceph xiubli
@ 2020-06-22 13:24 ` xiubli
  2020-06-23 18:02   ` Jeff Layton
  2020-06-22 13:25 ` [PATCH v3 4/4] ceph: send client provided metric flags in client metadata xiubli
  3 siblings, 1 reply; 11+ messages in thread
From: xiubli @ 2020-06-22 13:24 UTC (permalink / raw)
  To: jlayton, idryomov; +Cc: zyan, pdonnell, ceph-devel, Xiubo Li

From: Xiubo Li <xiubli@redhat.com>

Signed-off-by: Xiubo Li <xiubli@redhat.com>
---
 fs/ceph/mds_client.c | 46 +++++++++++++++++++++++++++++++++++-----------
 1 file changed, 35 insertions(+), 11 deletions(-)

diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index f996363..f29cb11 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -1168,7 +1168,7 @@ static struct ceph_msg *create_session_msg(u32 op, u64 seq)
 
 static const unsigned char feature_bits[] = CEPHFS_FEATURES_CLIENT_SUPPORTED;
 #define FEATURE_BYTES(c) (DIV_ROUND_UP((size_t)feature_bits[c - 1] + 1, 64) * 8)
-static void encode_supported_features(void **p, void *end)
+static int encode_supported_features(void **p, void *end)
 {
 	static const size_t count = ARRAY_SIZE(feature_bits);
 
@@ -1176,16 +1176,22 @@ static void encode_supported_features(void **p, void *end)
 		size_t i;
 		size_t size = FEATURE_BYTES(count);
 
-		BUG_ON(*p + 4 + size > end);
+		if (WARN_ON(*p + 4 + size > end))
+			return -ERANGE;
+
 		ceph_encode_32(p, size);
 		memset(*p, 0, size);
 		for (i = 0; i < count; i++)
 			((unsigned char*)(*p))[i / 8] |= BIT(feature_bits[i] % 8);
 		*p += size;
 	} else {
-		BUG_ON(*p + 4 > end);
+		if (WARN_ON(*p + 4 > end))
+			return -ERANGE;
+
 		ceph_encode_32(p, 0);
 	}
+
+	return 0;
 }
 
 /*
@@ -1203,6 +1209,7 @@ static struct ceph_msg *create_session_open_msg(struct ceph_mds_client *mdsc, u6
 	struct ceph_mount_options *fsopt = mdsc->fsc->mount_options;
 	size_t size, count;
 	void *p, *end;
+	int ret;
 
 	const char* metadata[][2] = {
 		{"hostname", mdsc->nodename},
@@ -1232,7 +1239,7 @@ static struct ceph_msg *create_session_open_msg(struct ceph_mds_client *mdsc, u6
 			   GFP_NOFS, false);
 	if (!msg) {
 		pr_err("create_session_msg ENOMEM creating msg\n");
-		return NULL;
+		return ERR_PTR(-ENOMEM);
 	}
 	p = msg->front.iov_base;
 	end = p + msg->front.iov_len;
@@ -1269,7 +1276,13 @@ static struct ceph_msg *create_session_open_msg(struct ceph_mds_client *mdsc, u6
 		p += val_len;
 	}
 
-	encode_supported_features(&p, end);
+	ret = encode_supported_features(&p, end);
+	if (ret) {
+		pr_err("encode_supported_features failed!\n");
+		ceph_msg_put(msg);
+		return ERR_PTR(ret);
+	}
+
 	msg->front.iov_len = p - msg->front.iov_base;
 	msg->hdr.front_len = cpu_to_le32(msg->front.iov_len);
 
@@ -1297,8 +1310,8 @@ static int __open_session(struct ceph_mds_client *mdsc,
 
 	/* send connect message */
 	msg = create_session_open_msg(mdsc, session->s_seq);
-	if (!msg)
-		return -ENOMEM;
+	if (IS_ERR(msg))
+		return PTR_ERR(msg);
 	ceph_con_send(&session->s_con, msg);
 	return 0;
 }
@@ -1312,6 +1325,7 @@ static int __open_session(struct ceph_mds_client *mdsc,
 __open_export_target_session(struct ceph_mds_client *mdsc, int target)
 {
 	struct ceph_mds_session *session;
+	int ret;
 
 	session = __ceph_lookup_mds_session(mdsc, target);
 	if (!session) {
@@ -1320,8 +1334,11 @@ static int __open_session(struct ceph_mds_client *mdsc,
 			return session;
 	}
 	if (session->s_state == CEPH_MDS_SESSION_NEW ||
-	    session->s_state == CEPH_MDS_SESSION_CLOSING)
-		__open_session(mdsc, session);
+	    session->s_state == CEPH_MDS_SESSION_CLOSING) {
+		ret = __open_session(mdsc, session);
+		if (ret)
+			return ERR_PTR(ret);
+	}
 
 	return session;
 }
@@ -2520,7 +2537,12 @@ static struct ceph_msg *create_request_message(struct ceph_mds_client *mdsc,
 		ceph_encode_copy(&p, &ts, sizeof(ts));
 	}
 
-	BUG_ON(p > end);
+	if (WARN_ON(p > end)) {
+		ceph_msg_put(msg);
+		msg = ERR_PTR(-ERANGE);
+		goto out_free2;
+	}
+
 	msg->front.iov_len = p - msg->front.iov_base;
 	msg->hdr.front_len = cpu_to_le32(msg->front.iov_len);
 
@@ -2756,7 +2778,9 @@ static void __do_request(struct ceph_mds_client *mdsc,
 		}
 		if (session->s_state == CEPH_MDS_SESSION_NEW ||
 		    session->s_state == CEPH_MDS_SESSION_CLOSING) {
-			__open_session(mdsc, session);
+			err = __open_session(mdsc, session);
+			if (err)
+				goto out_session;
 			/* retry the same mds later */
 			if (random)
 				req->r_resend_mds = mds;
-- 
1.8.3.1

^ permalink raw reply related	[flat|nested] 11+ messages in thread

* [PATCH v3 4/4] ceph: send client provided metric flags in client metadata
  2020-06-22 13:24 [PATCH v3 0/4] ceph: periodically send perf metrics to ceph xiubli
                   ` (2 preceding siblings ...)
  2020-06-22 13:24 ` [PATCH v3 3/4] ceph: switch to WARN_ON and bubble up errnos to the callers xiubli
@ 2020-06-22 13:25 ` xiubli
  3 siblings, 0 replies; 11+ messages in thread
From: xiubli @ 2020-06-22 13:25 UTC (permalink / raw)
  To: jlayton, idryomov; +Cc: zyan, pdonnell, ceph-devel, Xiubo Li

From: Xiubo Li <xiubli@redhat.com>

This will send the metric flags to the MDS; currently the cap, read
latency, write latency and metadata latency metrics are supported.

URL: https://tracker.ceph.com/issues/43435
Signed-off-by: Xiubo Li <xiubli@redhat.com>
---
 fs/ceph/mds_client.c | 60 ++++++++++++++++++++++++++++++++++++++++++++++++++--
 fs/ceph/metric.h     | 13 ++++++++++++
 2 files changed, 71 insertions(+), 2 deletions(-)

diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index f29cb11..a55dda3 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -1194,6 +1194,48 @@ static int encode_supported_features(void **p, void *end)
 	return 0;
 }
 
+static const unsigned char metric_bits[] = CEPHFS_METRIC_SPEC_CLIENT_SUPPORTED;
+#define METRIC_BYTES(cnt) (DIV_ROUND_UP((size_t)metric_bits[cnt - 1] + 1, 64) * 8)
+static int encode_metric_spec(void **p, void *end)
+{
+	static const size_t count = ARRAY_SIZE(metric_bits);
+
+	/* header */
+	if (WARN_ON(*p + 2 > end))
+		return -ERANGE;
+
+	ceph_encode_8(p, 1); /* version */
+	ceph_encode_8(p, 1); /* compat */
+
+	if (count > 0) {
+		size_t i;
+		size_t size = METRIC_BYTES(count);
+
+		if (WARN_ON(*p + 4 + 4 + size > end))
+			return -ERANGE;
+
+		/* metric spec info length */
+		ceph_encode_32(p, 4 + size);
+
+		/* metric spec */
+		ceph_encode_32(p, size);
+		memset(*p, 0, size);
+		for (i = 0; i < count; i++)
+			((unsigned char *)(*p))[i / 8] |= BIT(metric_bits[i] % 8);
+		*p += size;
+	} else {
+		if (WARN_ON(*p + 4 + 4 > end))
+			return -ERANGE;
+
+		/* metric spec info length */
+		ceph_encode_32(p, 4);
+		/* metric spec */
+		ceph_encode_32(p, 0);
+	}
+
+	return 0;
+}
+
 /*
  * session message, specialization for CEPH_SESSION_REQUEST_OPEN
  * to include additional client metadata fields.
@@ -1234,6 +1276,13 @@ static struct ceph_msg *create_session_open_msg(struct ceph_mds_client *mdsc, u6
 		size = FEATURE_BYTES(count);
 	extra_bytes += 4 + size;
 
+	/* metric spec */
+	size = 0;
+	count = ARRAY_SIZE(metric_bits);
+	if (count > 0)
+		size = METRIC_BYTES(count);
+	extra_bytes += 2 + 4 + 4 + size;
+
 	/* Allocate the message */
 	msg = ceph_msg_new(CEPH_MSG_CLIENT_SESSION, sizeof(*h) + extra_bytes,
 			   GFP_NOFS, false);
@@ -1252,9 +1301,9 @@ static struct ceph_msg *create_session_open_msg(struct ceph_mds_client *mdsc, u6
 	 * Serialize client metadata into waiting buffer space, using
 	 * the format that userspace expects for map<string, string>
 	 *
-	 * ClientSession messages with metadata are v3
+	 * ClientSession messages with metadata are v4
 	 */
-	msg->hdr.version = cpu_to_le16(3);
+	msg->hdr.version = cpu_to_le16(4);
 	msg->hdr.compat_version = cpu_to_le16(1);
 
 	/* The write pointer, following the session_head structure */
@@ -1283,6 +1332,13 @@ static struct ceph_msg *create_session_open_msg(struct ceph_mds_client *mdsc, u6
 		return ERR_PTR(ret);
 	}
 
+	ret = encode_metric_spec(&p, end);
+	if (ret) {
+		pr_err("encode_metric_spec failed!\n");
+		ceph_msg_put(msg);
+		return ERR_PTR(ret);
+	}
+
 	msg->front.iov_len = p - msg->front.iov_base;
 	msg->hdr.front_len = cpu_to_le32(msg->front.iov_len);
 
diff --git a/fs/ceph/metric.h b/fs/ceph/metric.h
index 5a1f8b9..87172ca 100644
--- a/fs/ceph/metric.h
+++ b/fs/ceph/metric.h
@@ -18,6 +18,19 @@ enum ceph_metric_type {
 	CLIENT_METRIC_TYPE_MAX = CLIENT_METRIC_TYPE_DENTRY_LEASE,
 };
 
+/*
+ * This will always have the highest metric bit value
+ * as the last element of the array.
+ */
+#define CEPHFS_METRIC_SPEC_CLIENT_SUPPORTED {	\
+	CLIENT_METRIC_TYPE_CAP_INFO,		\
+	CLIENT_METRIC_TYPE_READ_LATENCY,	\
+	CLIENT_METRIC_TYPE_WRITE_LATENCY,	\
+	CLIENT_METRIC_TYPE_METADATA_LATENCY,	\
+						\
+	CLIENT_METRIC_TYPE_MAX,			\
+}
+
 /* metric caps header */
 struct ceph_metric_cap {
 	__le32 type;     /* ceph metric type */
-- 
1.8.3.1

^ permalink raw reply related	[flat|nested] 11+ messages in thread

* Re: [PATCH v3 2/4] ceph: periodically send perf metrics to ceph
  2020-06-22 13:24 ` [PATCH v3 2/4] ceph: periodically send perf metrics to ceph xiubli
@ 2020-06-23 17:24   ` Jeff Layton
  2020-06-24  8:32     ` Xiubo Li
  0 siblings, 1 reply; 11+ messages in thread
From: Jeff Layton @ 2020-06-23 17:24 UTC (permalink / raw)
  To: xiubli, idryomov; +Cc: zyan, pdonnell, ceph-devel

On Mon, 2020-06-22 at 09:24 -0400, xiubli@redhat.com wrote:
> From: Xiubo Li <xiubli@redhat.com>
> 
> This will send the caps/read/write/metadata metrics to any available
> MDS only once per second as default, which will be the same as the
> userland client, or every metric_send_interval seconds, which is a
> module parameter.
> 
> Skip the MDS sessions if they don't support the metric collection,
> or the MDSs will close the socket connections directly when they get
> a message of an unknown type.
> 
> URL: https://tracker.ceph.com/issues/43215
> Signed-off-by: Xiubo Li <xiubli@redhat.com>
> ---
>  fs/ceph/mds_client.c         |   3 +
>  fs/ceph/mds_client.h         |   4 +-
>  fs/ceph/metric.c             | 142 +++++++++++++++++++++++++++++++++++++++++++
>  fs/ceph/metric.h             |  78 ++++++++++++++++++++++++
>  fs/ceph/super.c              |  42 +++++++++++++
>  fs/ceph/super.h              |   2 +
>  include/linux/ceph/ceph_fs.h |   1 +
>  7 files changed, 271 insertions(+), 1 deletion(-)
> 
> diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
> index 608fb5c..f996363 100644
> --- a/fs/ceph/mds_client.c
> +++ b/fs/ceph/mds_client.c
> @@ -4625,6 +4625,7 @@ void ceph_mdsc_close_sessions(struct ceph_mds_client *mdsc)
>  
>  	cancel_work_sync(&mdsc->cap_reclaim_work);
>  	cancel_delayed_work_sync(&mdsc->delayed_work); /* cancel timer */
> +	cancel_delayed_work_sync(&mdsc->metric.delayed_work); /* cancel timer */
>  
>  	dout("stopped\n");
>  }
> @@ -4667,6 +4668,7 @@ static void ceph_mdsc_stop(struct ceph_mds_client *mdsc)
>  {
>  	dout("stop\n");
>  	cancel_delayed_work_sync(&mdsc->delayed_work); /* cancel timer */
> +	cancel_delayed_work_sync(&mdsc->metric.delayed_work); /* cancel timer */
>  	if (mdsc->mdsmap)
>  		ceph_mdsmap_destroy(mdsc->mdsmap);
>  	kfree(mdsc->sessions);
> @@ -4824,6 +4826,7 @@ void ceph_mdsc_handle_mdsmap(struct ceph_mds_client *mdsc, struct ceph_msg *msg)
>  
>  	mutex_unlock(&mdsc->mutex);
>  	schedule_delayed(mdsc);
> +	metric_schedule_delayed(&mdsc->metric);
>  	return;
>  
>  bad_unlock:
> diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
> index bcb3892..3c65ac1 100644
> --- a/fs/ceph/mds_client.h
> +++ b/fs/ceph/mds_client.h
> @@ -28,8 +28,9 @@ enum ceph_feature_type {
>  	CEPHFS_FEATURE_LAZY_CAP_WANTED,
>  	CEPHFS_FEATURE_MULTI_RECONNECT,
>  	CEPHFS_FEATURE_DELEG_INO,
> +	CEPHFS_FEATURE_METRIC_COLLECT,
>  
> -	CEPHFS_FEATURE_MAX = CEPHFS_FEATURE_DELEG_INO,
> +	CEPHFS_FEATURE_MAX = CEPHFS_FEATURE_METRIC_COLLECT,
>  };
>  
>  /*
> @@ -43,6 +44,7 @@ enum ceph_feature_type {
>  	CEPHFS_FEATURE_LAZY_CAP_WANTED,		\
>  	CEPHFS_FEATURE_MULTI_RECONNECT,		\
>  	CEPHFS_FEATURE_DELEG_INO,		\
> +	CEPHFS_FEATURE_METRIC_COLLECT,		\
>  						\
>  	CEPHFS_FEATURE_MAX,			\
>  }
> diff --git a/fs/ceph/metric.c b/fs/ceph/metric.c
> index 9217f35..4267b46 100644
> --- a/fs/ceph/metric.c
> +++ b/fs/ceph/metric.c
> @@ -1,10 +1,150 @@
>  /* SPDX-License-Identifier: GPL-2.0 */
> +#include <linux/ceph/ceph_debug.h>
>  
>  #include <linux/types.h>
>  #include <linux/percpu_counter.h>
>  #include <linux/math64.h>
>  
>  #include "metric.h"
> +#include "mds_client.h"
> +
> +static bool ceph_mdsc_send_metrics(struct ceph_mds_client *mdsc,
> +				   struct ceph_mds_session *s,
> +				   u64 nr_caps)
> +{
> +	struct ceph_metric_head *head;
> +	struct ceph_metric_cap *cap;
> +	struct ceph_metric_read_latency *read;
> +	struct ceph_metric_write_latency *write;
> +	struct ceph_metric_metadata_latency *meta;
> +	struct ceph_client_metric *m = &mdsc->metric;
> +	struct ceph_msg *msg;
> +	struct timespec64 ts;
> +	s64 sum, total;
> +	s32 items = 0;
> +	s32 len;
> +
> +	len = sizeof(*head) + sizeof(*cap) + sizeof(*read) + sizeof(*write)
> +	      + sizeof(*meta);
> +
> +	msg = ceph_msg_new(CEPH_MSG_CLIENT_METRICS, len, GFP_NOFS, true);
> +	if (!msg) {
> +		pr_err("send metrics to mds%d, failed to allocate message\n",
> +		       s->s_mds);
> +		return false;
> +	}
> +
> +	head = msg->front.iov_base;
> +
> +	/* encode the cap metric */
> +	cap = (struct ceph_metric_cap *)(head + 1);
> +	cap->type = cpu_to_le32(CLIENT_METRIC_TYPE_CAP_INFO);
> +	cap->ver = 1;
> +	cap->compat = 1;
> +	cap->data_len = cpu_to_le32(sizeof(*cap) - 10);
> +	cap->hit = cpu_to_le64(percpu_counter_sum(&mdsc->metric.i_caps_hit));
> +	cap->mis = cpu_to_le64(percpu_counter_sum(&mdsc->metric.i_caps_mis));
> +	cap->total = cpu_to_le64(nr_caps);
> +	items++;
> +
> +	/* encode the read latency metric */
> +	read = (struct ceph_metric_read_latency *)(cap + 1);
> +	read->type = cpu_to_le32(CLIENT_METRIC_TYPE_READ_LATENCY);
> +	read->ver = 1;
> +	read->compat = 1;
> +	read->data_len = cpu_to_le32(sizeof(*read) - 10);
> +	total = m->total_reads;
> +	sum = m->read_latency_sum;
> +	jiffies_to_timespec64(sum, &ts);
> +	read->sec = cpu_to_le32(ts.tv_sec);
> +	read->nsec = cpu_to_le32(ts.tv_nsec);
> +	items++;
> +
> +	/* encode the write latency metric */
> +	write = (struct ceph_metric_write_latency *)(read + 1);
> +	write->type = cpu_to_le32(CLIENT_METRIC_TYPE_WRITE_LATENCY);
> +	write->ver = 1;
> +	write->compat = 1;
> +	write->data_len = cpu_to_le32(sizeof(*write) - 10);
> +	total = m->total_writes;
> +	sum = m->write_latency_sum;
> +	jiffies_to_timespec64(sum, &ts);
> +	write->sec = cpu_to_le32(ts.tv_sec);
> +	write->nsec = cpu_to_le32(ts.tv_nsec);
> +	items++;
> +
> +	/* encode the metadata latency metric */
> +	meta = (struct ceph_metric_metadata_latency *)(write + 1);
> +	meta->type = cpu_to_le32(CLIENT_METRIC_TYPE_METADATA_LATENCY);
> +	meta->ver = 1;
> +	meta->compat = 1;
> +	meta->data_len = cpu_to_le32(sizeof(*meta) - 10);
> +	total = m->total_metadatas;
> +	sum = m->metadata_latency_sum;
> +	jiffies_to_timespec64(sum, &ts);
> +	meta->sec = cpu_to_le32(ts.tv_sec);
> +	meta->nsec = cpu_to_le32(ts.tv_nsec);
> +	items++;
> +
> +	put_unaligned_le32(items, &head->num);
> +	msg->front.iov_len = cpu_to_le32(len);
> +	msg->hdr.version = cpu_to_le16(1);
> +	msg->hdr.compat_version = cpu_to_le16(1);
> +	msg->hdr.front_len = cpu_to_le32(msg->front.iov_len);
> +	dout("client%llu send metrics to mds%d\n",
> +	     ceph_client_gid(mdsc->fsc->client), s->s_mds);
> +	ceph_con_send(&s->s_con, msg);
> +
> +	return true;
> +}
> +
> +static void metric_delayed_work(struct work_struct *work)
> +{
> +	struct ceph_client_metric *m =
> +		container_of(work, struct ceph_client_metric, delayed_work.work);
> +	struct ceph_mds_client *mdsc =
> +		container_of(m, struct ceph_mds_client, metric);
> +	struct ceph_mds_session *s;
> +	u64 nr_caps = 0;
> +	bool ret;
> +	int i;
> +
> +	mutex_lock(&mdsc->mutex);
> +	for (i = 0; i < mdsc->max_sessions; i++) {
> +		s = __ceph_lookup_mds_session(mdsc, i);
> +		if (!s)
> +			continue;
> +		nr_caps += s->s_nr_caps;
> +		ceph_put_mds_session(s);
> +	}
> +
> +	for (i = 0; i < mdsc->max_sessions; i++) {
> +		s = __ceph_lookup_mds_session(mdsc, i);
> +		if (!s)
> +			continue;
> +		if (!check_session_state(mdsc, s)) {
> +			ceph_put_mds_session(s);
> +			continue;
> +		}
> +
> +		/*
> +		 * Skip it if MDS doesn't support the metric collection,
> +		 * or the MDS will close the session's socket connection
> +		 * directly when it gets this message.
> +		 */
> +		if (!test_bit(CEPHFS_FEATURE_METRIC_COLLECT, &s->s_features))
> +			continue;
> +
> +		/* Only send the metric once in any available session */
> +		ret = ceph_mdsc_send_metrics(mdsc, s, nr_caps);
> +		ceph_put_mds_session(s);
> +		if (ret)
> +			break;
> +	}
> +	mutex_unlock(&mdsc->mutex);
> +
> +	metric_schedule_delayed(&mdsc->metric);
> +}
>  

You're going to be queueing this job up to run every second, even when
none of your MDSs support metrics.

I think it would be better to make this job conditional on having
at least one session with an MDS that supports receiving metrics. Maybe
have each MDS session hold a reference to the scheduled job, and cancel
the job when the refcount goes to 0...

A simpler approach here might be to just give each session its own
struct work, and only queue the work if the session supports metrics.
That way you could just cancel the work as part of each session's
teardown. I think that would also mean you wouldn't need the mdsc->mutex 
here either, which would be a bonus.

Thoughts?


>  int ceph_metric_init(struct ceph_client_metric *m)
>  {
> @@ -51,6 +191,8 @@ int ceph_metric_init(struct ceph_client_metric *m)
>  	m->total_metadatas = 0;
>  	m->metadata_latency_sum = 0;
>  
> +	INIT_DELAYED_WORK(&m->delayed_work, metric_delayed_work);
> +
>  	return 0;
>  
>  err_i_caps_mis:
> diff --git a/fs/ceph/metric.h b/fs/ceph/metric.h
> index ccd8128..5a1f8b9 100644
> --- a/fs/ceph/metric.h
> +++ b/fs/ceph/metric.h
> @@ -6,6 +6,71 @@
>  #include <linux/percpu_counter.h>
>  #include <linux/ktime.h>
>  
> +extern bool enable_send_metrics;
> +
> +enum ceph_metric_type {
> +	CLIENT_METRIC_TYPE_CAP_INFO,
> +	CLIENT_METRIC_TYPE_READ_LATENCY,
> +	CLIENT_METRIC_TYPE_WRITE_LATENCY,
> +	CLIENT_METRIC_TYPE_METADATA_LATENCY,
> +	CLIENT_METRIC_TYPE_DENTRY_LEASE,
> +
> +	CLIENT_METRIC_TYPE_MAX = CLIENT_METRIC_TYPE_DENTRY_LEASE,
> +};
> +
> +/* metric caps header */
> +struct ceph_metric_cap {
> +	__le32 type;     /* ceph metric type */
> +
> +	__u8  ver;
> +	__u8  compat;
> +
> +	__le32 data_len; /* length of sizeof(hit + mis + total) */
> +	__le64 hit;
> +	__le64 mis;
> +	__le64 total;
> +} __packed;
> +
> +/* metric read latency header */
> +struct ceph_metric_read_latency {
> +	__le32 type;     /* ceph metric type */
> +
> +	__u8  ver;
> +	__u8  compat;
> +
> +	__le32 data_len; /* length of sizeof(sec + nsec) */
> +	__le32 sec;
> +	__le32 nsec;
> +} __packed;
> +
> +/* metric write latency header */
> +struct ceph_metric_write_latency {
> +	__le32 type;     /* ceph metric type */
> +
> +	__u8  ver;
> +	__u8  compat;
> +
> +	__le32 data_len; /* length of sizeof(sec + nsec) */
> +	__le32 sec;
> +	__le32 nsec;
> +} __packed;
> +
> +/* metric metadata latency header */
> +struct ceph_metric_metadata_latency {
> +	__le32 type;     /* ceph metric type */
> +
> +	__u8  ver;
> +	__u8  compat;
> +
> +	__le32 data_len; /* length of sizeof(sec + nsec) */
> +	__le32 sec;
> +	__le32 nsec;
> +} __packed;
> +
> +struct ceph_metric_head {
> +	__le32 num;	/* the number of metrics that will be sent */
> +} __packed;
> +
>  /* This is the global metrics */
>  struct ceph_client_metric {
>  	atomic64_t            total_dentries;
> @@ -35,8 +100,21 @@ struct ceph_client_metric {
>  	ktime_t metadata_latency_sq_sum;
>  	ktime_t metadata_latency_min;
>  	ktime_t metadata_latency_max;
> +
> +	struct delayed_work delayed_work;  /* delayed work */
>  };
>  
> +static inline void metric_schedule_delayed(struct ceph_client_metric *m)
> +{
> +	/* per second as default */
> +	unsigned int hz = round_jiffies_relative(HZ * enable_send_metrics);
> +
> +	if (!enable_send_metrics)
> +		return;
> +
> +	schedule_delayed_work(&m->delayed_work, hz);
> +}
> +
>  extern int ceph_metric_init(struct ceph_client_metric *m);
>  extern void ceph_metric_destroy(struct ceph_client_metric *m);
>  
> diff --git a/fs/ceph/super.c b/fs/ceph/super.c
> index c9784eb1..49f20ea 100644
> --- a/fs/ceph/super.c
> +++ b/fs/ceph/super.c
> @@ -27,6 +27,9 @@
>  #include <linux/ceph/auth.h>
>  #include <linux/ceph/debugfs.h>
>  
> +static DEFINE_MUTEX(ceph_fsc_lock);
> +static LIST_HEAD(ceph_fsc_list);
> +
>  /*
>   * Ceph superblock operations
>   *
> @@ -691,6 +694,10 @@ static struct ceph_fs_client *create_fs_client(struct ceph_mount_options *fsopt,
>  	if (!fsc->wb_pagevec_pool)
>  		goto fail_cap_wq;
>  
> +	mutex_lock(&ceph_fsc_lock);
> +	list_add_tail(&fsc->list, &ceph_fsc_list);
> +	mutex_unlock(&ceph_fsc_lock);
> +
>  	return fsc;
>  
>  fail_cap_wq:
> @@ -717,6 +724,10 @@ static void destroy_fs_client(struct ceph_fs_client *fsc)
>  {
>  	dout("destroy_fs_client %p\n", fsc);
>  
> +	mutex_lock(&ceph_fsc_lock);
> +	list_del(&fsc->list);
> +	mutex_unlock(&ceph_fsc_lock);
> +
>  	ceph_mdsc_destroy(fsc);
>  	destroy_workqueue(fsc->inode_wq);
>  	destroy_workqueue(fsc->cap_wq);
> @@ -1282,6 +1293,37 @@ static void __exit exit_ceph(void)
>  	destroy_caches();
>  }
>  
> +static int param_set_metrics(const char *val, const struct kernel_param *kp)
> +{
> +	struct ceph_fs_client *fsc;
> +	int ret;
> +
> +	ret = param_set_bool(val, kp);
> +	if (ret) {
> +		pr_err("Failed to parse sending metrics switch value '%s'\n",
> +		       val);
> +		return ret;
> +	} else if (enable_send_metrics) {
> +		// wake up all the mds clients
> +		mutex_lock(&ceph_fsc_lock);
> +		list_for_each_entry(fsc, &ceph_fsc_list, list) {
> +			metric_schedule_delayed(&fsc->mdsc->metric);
> +		}
> +		mutex_unlock(&ceph_fsc_lock);
> +	}
> +
> +	return 0;
> +}
> +
> +static const struct kernel_param_ops param_ops_metrics = {
> +	.set = param_set_metrics,
> +	.get = param_get_bool,
> +};
> +
> +bool enable_send_metrics = true;
> +module_param_cb(enable_send_metrics, &param_ops_metrics, &enable_send_metrics, 0644);
> +MODULE_PARM_DESC(enable_send_metrics, "Enable sending perf metrics to ceph cluster (default: on)");
> +
>  module_init(init_ceph);
>  module_exit(exit_ceph);
>  
> diff --git a/fs/ceph/super.h b/fs/ceph/super.h
> index 5a6cdd3..05edc9a 100644
> --- a/fs/ceph/super.h
> +++ b/fs/ceph/super.h
> @@ -101,6 +101,8 @@ struct ceph_mount_options {
>  struct ceph_fs_client {
>  	struct super_block *sb;
>  
> +	struct list_head list;
> +
>  	struct ceph_mount_options *mount_options;
>  	struct ceph_client *client;
>  
> diff --git a/include/linux/ceph/ceph_fs.h b/include/linux/ceph/ceph_fs.h
> index ebf5ba6..455e9b9 100644
> --- a/include/linux/ceph/ceph_fs.h
> +++ b/include/linux/ceph/ceph_fs.h
> @@ -130,6 +130,7 @@ struct ceph_dir_layout {
>  #define CEPH_MSG_CLIENT_REQUEST         24
>  #define CEPH_MSG_CLIENT_REQUEST_FORWARD 25
>  #define CEPH_MSG_CLIENT_REPLY           26
> +#define CEPH_MSG_CLIENT_METRICS         29
>  #define CEPH_MSG_CLIENT_CAPS            0x310
>  #define CEPH_MSG_CLIENT_LEASE           0x311
>  #define CEPH_MSG_CLIENT_SNAP            0x312

-- 
Jeff Layton <jlayton@kernel.org>

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH v3 3/4] ceph: switch to WARN_ON and bubble up errnos to the callers
  2020-06-22 13:24 ` [PATCH v3 3/4] ceph: switch to WARN_ON and bubble up errnos to the callers xiubli
@ 2020-06-23 18:02   ` Jeff Layton
  2020-06-24  0:34     ` Xiubo Li
  0 siblings, 1 reply; 11+ messages in thread
From: Jeff Layton @ 2020-06-23 18:02 UTC (permalink / raw)
  To: xiubli, idryomov; +Cc: zyan, pdonnell, ceph-devel

On Mon, 2020-06-22 at 09:24 -0400, xiubli@redhat.com wrote:
> From: Xiubo Li <xiubli@redhat.com>
> 
> Signed-off-by: Xiubo Li <xiubli@redhat.com>
> ---
>  fs/ceph/mds_client.c | 46 +++++++++++++++++++++++++++++++++++-----------
>  1 file changed, 35 insertions(+), 11 deletions(-)
> 
> diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
> index f996363..f29cb11 100644
> --- a/fs/ceph/mds_client.c
> +++ b/fs/ceph/mds_client.c
> @@ -1168,7 +1168,7 @@ static struct ceph_msg *create_session_msg(u32 op, u64 seq)
>  
>  static const unsigned char feature_bits[] = CEPHFS_FEATURES_CLIENT_SUPPORTED;
>  #define FEATURE_BYTES(c) (DIV_ROUND_UP((size_t)feature_bits[c - 1] + 1, 64) * 8)
> -static void encode_supported_features(void **p, void *end)
> +static int encode_supported_features(void **p, void *end)
>  {
>  	static const size_t count = ARRAY_SIZE(feature_bits);
>  
> @@ -1176,16 +1176,22 @@ static void encode_supported_features(void **p, void *end)
>  		size_t i;
>  		size_t size = FEATURE_BYTES(count);
>  
> -		BUG_ON(*p + 4 + size > end);
> +		if (WARN_ON(*p + 4 + size > end))
> +			return -ERANGE;
> +

Nice cleanup.

Let's use WARN_ON_ONCE instead?

It's better not to spam the logs if this is happening all over the
place. Also, I'm not sure that ERANGE is the right error here, but I
can't think of anything better. At least it should be distinctive.

>  		ceph_encode_32(p, size);
>  		memset(*p, 0, size);
>  		for (i = 0; i < count; i++)
>  			((unsigned char*)(*p))[i / 8] |= BIT(feature_bits[i] % 8);
>  		*p += size;
>  	} else {
> -		BUG_ON(*p + 4 > end);
> +		if (WARN_ON(*p + 4 > end))
> +			return -ERANGE;
> +
>  		ceph_encode_32(p, 0);
>  	}
> +
> +	return 0;
>  }
>  
>  /*
> @@ -1203,6 +1209,7 @@ static struct ceph_msg *create_session_open_msg(struct ceph_mds_client *mdsc, u6
>  	struct ceph_mount_options *fsopt = mdsc->fsc->mount_options;
>  	size_t size, count;
>  	void *p, *end;
> +	int ret;
>  
>  	const char* metadata[][2] = {
>  		{"hostname", mdsc->nodename},
> @@ -1232,7 +1239,7 @@ static struct ceph_msg *create_session_open_msg(struct ceph_mds_client *mdsc, u6
>  			   GFP_NOFS, false);
>  	if (!msg) {
>  		pr_err("create_session_msg ENOMEM creating msg\n");
> -		return NULL;
> +		return ERR_PTR(-ENOMEM);
>  	}
>  	p = msg->front.iov_base;
>  	end = p + msg->front.iov_len;
> @@ -1269,7 +1276,13 @@ static struct ceph_msg *create_session_open_msg(struct ceph_mds_client *mdsc, u6
>  		p += val_len;
>  	}
>  
> -	encode_supported_features(&p, end);
> +	ret = encode_supported_features(&p, end);
> +	if (ret) {
> +		pr_err("encode_supported_features failed!\n");
> +		ceph_msg_put(msg);
> +		return ERR_PTR(ret);
> +	}
> +
>  	msg->front.iov_len = p - msg->front.iov_base;
>  	msg->hdr.front_len = cpu_to_le32(msg->front.iov_len);
>  
> @@ -1297,8 +1310,8 @@ static int __open_session(struct ceph_mds_client *mdsc,
>  
>  	/* send connect message */
>  	msg = create_session_open_msg(mdsc, session->s_seq);
> -	if (!msg)
> -		return -ENOMEM;
> +	if (IS_ERR(msg))
> +		return PTR_ERR(msg);
>  	ceph_con_send(&session->s_con, msg);
>  	return 0;
>  }
> @@ -1312,6 +1325,7 @@ static int __open_session(struct ceph_mds_client *mdsc,
>  __open_export_target_session(struct ceph_mds_client *mdsc, int target)
>  {
>  	struct ceph_mds_session *session;
> +	int ret;
>  
>  	session = __ceph_lookup_mds_session(mdsc, target);
>  	if (!session) {
> @@ -1320,8 +1334,11 @@ static int __open_session(struct ceph_mds_client *mdsc,
>  			return session;
>  	}
>  	if (session->s_state == CEPH_MDS_SESSION_NEW ||
> -	    session->s_state == CEPH_MDS_SESSION_CLOSING)
> -		__open_session(mdsc, session);
> +	    session->s_state == CEPH_MDS_SESSION_CLOSING) {
> +		ret = __open_session(mdsc, session);
> +		if (ret)
> +			return ERR_PTR(ret);
> +	}
>  
>  	return session;
>  }
> @@ -2520,7 +2537,12 @@ static struct ceph_msg *create_request_message(struct ceph_mds_client *mdsc,
>  		ceph_encode_copy(&p, &ts, sizeof(ts));
>  	}
>  
> -	BUG_ON(p > end);
> +	if (WARN_ON(p > end)) {
> +		ceph_msg_put(msg);
> +		msg = ERR_PTR(-ERANGE);
> +		goto out_free2;
> +	}
> +
>  	msg->front.iov_len = p - msg->front.iov_base;
>  	msg->hdr.front_len = cpu_to_le32(msg->front.iov_len);
>  
> @@ -2756,7 +2778,9 @@ static void __do_request(struct ceph_mds_client *mdsc,
>  		}
>  		if (session->s_state == CEPH_MDS_SESSION_NEW ||
>  		    session->s_state == CEPH_MDS_SESSION_CLOSING) {
> -			__open_session(mdsc, session);
> +			err = __open_session(mdsc, session);
> +			if (err)
> +				goto out_session;
>  			/* retry the same mds later */
>  			if (random)
>  				req->r_resend_mds = mds;

-- 
Jeff Layton <jlayton@kernel.org>

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH v3 3/4] ceph: switch to WARN_ON and bubble up errnos to the callers
  2020-06-23 18:02   ` Jeff Layton
@ 2020-06-24  0:34     ` Xiubo Li
  0 siblings, 0 replies; 11+ messages in thread
From: Xiubo Li @ 2020-06-24  0:34 UTC (permalink / raw)
  To: Jeff Layton, idryomov; +Cc: zyan, pdonnell, ceph-devel

On 2020/6/24 2:02, Jeff Layton wrote:
> On Mon, 2020-06-22 at 09:24 -0400, xiubli@redhat.com wrote:
>> From: Xiubo Li <xiubli@redhat.com>
>>
>> Signed-off-by: Xiubo Li <xiubli@redhat.com>
>> ---
>>   fs/ceph/mds_client.c | 46 +++++++++++++++++++++++++++++++++++-----------
>>   1 file changed, 35 insertions(+), 11 deletions(-)
>>
>> diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
>> index f996363..f29cb11 100644
>> --- a/fs/ceph/mds_client.c
>> +++ b/fs/ceph/mds_client.c
>> @@ -1168,7 +1168,7 @@ static struct ceph_msg *create_session_msg(u32 op, u64 seq)
>>   
>>   static const unsigned char feature_bits[] = CEPHFS_FEATURES_CLIENT_SUPPORTED;
>>   #define FEATURE_BYTES(c) (DIV_ROUND_UP((size_t)feature_bits[c - 1] + 1, 64) * 8)
>> -static void encode_supported_features(void **p, void *end)
>> +static int encode_supported_features(void **p, void *end)
>>   {
>>   	static const size_t count = ARRAY_SIZE(feature_bits);
>>   
>> @@ -1176,16 +1176,22 @@ static void encode_supported_features(void **p, void *end)
>>   		size_t i;
>>   		size_t size = FEATURE_BYTES(count);
>>   
>> -		BUG_ON(*p + 4 + size > end);
>> +		if (WARN_ON(*p + 4 + size > end))
>> +			return -ERANGE;
>> +
> Nice cleanup.
>
> Let's use WARN_ON_ONCE instead?
>
> It's better not to spam the logs if this is happening all over the
> place. Also, I'm not sure that ERANGE is the right error here, but I
> can't think of anything better. At least it should be distinctive.

Yeah, that makes sense and I will fix it.

Thanks.


>
>>   		ceph_encode_32(p, size);
>>   		memset(*p, 0, size);
>>   		for (i = 0; i < count; i++)
>>   			((unsigned char*)(*p))[i / 8] |= BIT(feature_bits[i] % 8);
>>   		*p += size;
>>   	} else {
>> -		BUG_ON(*p + 4 > end);
>> +		if (WARN_ON(*p + 4 > end))
>> +			return -ERANGE;
>> +
>>   		ceph_encode_32(p, 0);
>>   	}
>> +
>> +	return 0;
>>   }
>>   
>>   /*
>> @@ -1203,6 +1209,7 @@ static struct ceph_msg *create_session_open_msg(struct ceph_mds_client *mdsc, u6
>>   	struct ceph_mount_options *fsopt = mdsc->fsc->mount_options;
>>   	size_t size, count;
>>   	void *p, *end;
>> +	int ret;
>>   
>>   	const char* metadata[][2] = {
>>   		{"hostname", mdsc->nodename},
>> @@ -1232,7 +1239,7 @@ static struct ceph_msg *create_session_open_msg(struct ceph_mds_client *mdsc, u6
>>   			   GFP_NOFS, false);
>>   	if (!msg) {
>>   		pr_err("create_session_msg ENOMEM creating msg\n");
>> -		return NULL;
>> +		return ERR_PTR(-ENOMEM);
>>   	}
>>   	p = msg->front.iov_base;
>>   	end = p + msg->front.iov_len;
>> @@ -1269,7 +1276,13 @@ static struct ceph_msg *create_session_open_msg(struct ceph_mds_client *mdsc, u6
>>   		p += val_len;
>>   	}
>>   
>> -	encode_supported_features(&p, end);
>> +	ret = encode_supported_features(&p, end);
>> +	if (ret) {
>> +		pr_err("encode_supported_features failed!\n");
>> +		ceph_msg_put(msg);
>> +		return ERR_PTR(ret);
>> +	}
>> +
>>   	msg->front.iov_len = p - msg->front.iov_base;
>>   	msg->hdr.front_len = cpu_to_le32(msg->front.iov_len);
>>   
>> @@ -1297,8 +1310,8 @@ static int __open_session(struct ceph_mds_client *mdsc,
>>   
>>   	/* send connect message */
>>   	msg = create_session_open_msg(mdsc, session->s_seq);
>> -	if (!msg)
>> -		return -ENOMEM;
>> +	if (IS_ERR(msg))
>> +		return PTR_ERR(msg);
>>   	ceph_con_send(&session->s_con, msg);
>>   	return 0;
>>   }
>> @@ -1312,6 +1325,7 @@ static int __open_session(struct ceph_mds_client *mdsc,
>>   __open_export_target_session(struct ceph_mds_client *mdsc, int target)
>>   {
>>   	struct ceph_mds_session *session;
>> +	int ret;
>>   
>>   	session = __ceph_lookup_mds_session(mdsc, target);
>>   	if (!session) {
>> @@ -1320,8 +1334,11 @@ static int __open_session(struct ceph_mds_client *mdsc,
>>   			return session;
>>   	}
>>   	if (session->s_state == CEPH_MDS_SESSION_NEW ||
>> -	    session->s_state == CEPH_MDS_SESSION_CLOSING)
>> -		__open_session(mdsc, session);
>> +	    session->s_state == CEPH_MDS_SESSION_CLOSING) {
>> +		ret = __open_session(mdsc, session);
>> +		if (ret)
>> +			return ERR_PTR(ret);
>> +	}
>>   
>>   	return session;
>>   }
>> @@ -2520,7 +2537,12 @@ static struct ceph_msg *create_request_message(struct ceph_mds_client *mdsc,
>>   		ceph_encode_copy(&p, &ts, sizeof(ts));
>>   	}
>>   
>> -	BUG_ON(p > end);
>> +	if (WARN_ON(p > end)) {
>> +		ceph_msg_put(msg);
>> +		msg = ERR_PTR(-ERANGE);
>> +		goto out_free2;
>> +	}
>> +
>>   	msg->front.iov_len = p - msg->front.iov_base;
>>   	msg->hdr.front_len = cpu_to_le32(msg->front.iov_len);
>>   
>> @@ -2756,7 +2778,9 @@ static void __do_request(struct ceph_mds_client *mdsc,
>>   		}
>>   		if (session->s_state == CEPH_MDS_SESSION_NEW ||
>>   		    session->s_state == CEPH_MDS_SESSION_CLOSING) {
>> -			__open_session(mdsc, session);
>> +			err = __open_session(mdsc, session);
>> +			if (err)
>> +				goto out_session;
>>   			/* retry the same mds later */
>>   			if (random)
>>   				req->r_resend_mds = mds;

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH v3 2/4] ceph: periodically send perf metrics to ceph
  2020-06-23 17:24   ` Jeff Layton
@ 2020-06-24  8:32     ` Xiubo Li
  2020-06-24 10:11       ` Jeff Layton
  0 siblings, 1 reply; 11+ messages in thread
From: Xiubo Li @ 2020-06-24  8:32 UTC (permalink / raw)
  To: Jeff Layton, idryomov; +Cc: zyan, pdonnell, ceph-devel

On 2020/6/24 1:24, Jeff Layton wrote:
> On Mon, 2020-06-22 at 09:24 -0400, xiubli@redhat.com wrote:
>> From: Xiubo Li <xiubli@redhat.com>
>>
>> This will send the caps/read/write/metadata metrics to any available
>> MDS only once per second as default, which will be the same as the
>> userland client, or every metric_send_interval seconds, which is a
>> module parameter.
>>
>> Skip the MDS sessions if they don't support the metric collection,
>> or the MDSs will close the socket connections directly when it get
>> an unknown type message.
>>
>> URL: https://tracker.ceph.com/issues/43215
>> Signed-off-by: Xiubo Li <xiubli@redhat.com>
>> ---
>>   fs/ceph/mds_client.c         |   3 +
>>   fs/ceph/mds_client.h         |   4 +-
>>   fs/ceph/metric.c             | 142 +++++++++++++++++++++++++++++++++++++++++++
>>   fs/ceph/metric.h             |  78 ++++++++++++++++++++++++
>>   fs/ceph/super.c              |  42 +++++++++++++
>>   fs/ceph/super.h              |   2 +
>>   include/linux/ceph/ceph_fs.h |   1 +
>>   7 files changed, 271 insertions(+), 1 deletion(-)
>>
>> diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
>> index 608fb5c..f996363 100644
>> --- a/fs/ceph/mds_client.c
>> +++ b/fs/ceph/mds_client.c
>> @@ -4625,6 +4625,7 @@ void ceph_mdsc_close_sessions(struct ceph_mds_client *mdsc)
>>   
>>   	cancel_work_sync(&mdsc->cap_reclaim_work);
>>   	cancel_delayed_work_sync(&mdsc->delayed_work); /* cancel timer */
>> +	cancel_delayed_work_sync(&mdsc->metric.delayed_work); /* cancel timer */
>>   
>>   	dout("stopped\n");
>>   }
>> @@ -4667,6 +4668,7 @@ static void ceph_mdsc_stop(struct ceph_mds_client *mdsc)
>>   {
>>   	dout("stop\n");
>>   	cancel_delayed_work_sync(&mdsc->delayed_work); /* cancel timer */
>> +	cancel_delayed_work_sync(&mdsc->metric.delayed_work); /* cancel timer */
>>   	if (mdsc->mdsmap)
>>   		ceph_mdsmap_destroy(mdsc->mdsmap);
>>   	kfree(mdsc->sessions);
>> @@ -4824,6 +4826,7 @@ void ceph_mdsc_handle_mdsmap(struct ceph_mds_client *mdsc, struct ceph_msg *msg)
>>   
>>   	mutex_unlock(&mdsc->mutex);
>>   	schedule_delayed(mdsc);
>> +	metric_schedule_delayed(&mdsc->metric);
>>   	return;
>>   
>>   bad_unlock:
>> diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
>> index bcb3892..3c65ac1 100644
>> --- a/fs/ceph/mds_client.h
>> +++ b/fs/ceph/mds_client.h
>> @@ -28,8 +28,9 @@ enum ceph_feature_type {
>>   	CEPHFS_FEATURE_LAZY_CAP_WANTED,
>>   	CEPHFS_FEATURE_MULTI_RECONNECT,
>>   	CEPHFS_FEATURE_DELEG_INO,
>> +	CEPHFS_FEATURE_METRIC_COLLECT,
>>   
>> -	CEPHFS_FEATURE_MAX = CEPHFS_FEATURE_DELEG_INO,
>> +	CEPHFS_FEATURE_MAX = CEPHFS_FEATURE_METRIC_COLLECT,
>>   };
>>   
>>   /*
>> @@ -43,6 +44,7 @@ enum ceph_feature_type {
>>   	CEPHFS_FEATURE_LAZY_CAP_WANTED,		\
>>   	CEPHFS_FEATURE_MULTI_RECONNECT,		\
>>   	CEPHFS_FEATURE_DELEG_INO,		\
>> +	CEPHFS_FEATURE_METRIC_COLLECT,		\
>>   						\
>>   	CEPHFS_FEATURE_MAX,			\
>>   }
>> diff --git a/fs/ceph/metric.c b/fs/ceph/metric.c
>> index 9217f35..4267b46 100644
>> --- a/fs/ceph/metric.c
>> +++ b/fs/ceph/metric.c
>> @@ -1,10 +1,150 @@
>>   /* SPDX-License-Identifier: GPL-2.0 */
>> +#include <linux/ceph/ceph_debug.h>
>>   
>>   #include <linux/types.h>
>>   #include <linux/percpu_counter.h>
>>   #include <linux/math64.h>
>>   
>>   #include "metric.h"
>> +#include "mds_client.h"
>> +
>> +static bool ceph_mdsc_send_metrics(struct ceph_mds_client *mdsc,
>> +				   struct ceph_mds_session *s,
>> +				   u64 nr_caps)
>> +{
>> +	struct ceph_metric_head *head;
>> +	struct ceph_metric_cap *cap;
>> +	struct ceph_metric_read_latency *read;
>> +	struct ceph_metric_write_latency *write;
>> +	struct ceph_metric_metadata_latency *meta;
>> +	struct ceph_client_metric *m = &mdsc->metric;
>> +	struct ceph_msg *msg;
>> +	struct timespec64 ts;
>> +	s64 sum, total;
>> +	s32 items = 0;
>> +	s32 len;
>> +
>> +	len = sizeof(*head) + sizeof(*cap) + sizeof(*read) + sizeof(*write)
>> +	      + sizeof(*meta);
>> +
>> +	msg = ceph_msg_new(CEPH_MSG_CLIENT_METRICS, len, GFP_NOFS, true);
>> +	if (!msg) {
>> +		pr_err("send metrics to mds%d, failed to allocate message\n",
>> +		       s->s_mds);
>> +		return false;
>> +	}
>> +
>> +	head = msg->front.iov_base;
>> +
>> +	/* encode the cap metric */
>> +	cap = (struct ceph_metric_cap *)(head + 1);
>> +	cap->type = cpu_to_le32(CLIENT_METRIC_TYPE_CAP_INFO);
>> +	cap->ver = 1;
>> +	cap->compat = 1;
>> +	cap->data_len = cpu_to_le32(sizeof(*cap) - 10);
>> +	cap->hit = cpu_to_le64(percpu_counter_sum(&mdsc->metric.i_caps_hit));
>> +	cap->mis = cpu_to_le64(percpu_counter_sum(&mdsc->metric.i_caps_mis));
>> +	cap->total = cpu_to_le64(nr_caps);
>> +	items++;
>> +
>> +	/* encode the read latency metric */
>> +	read = (struct ceph_metric_read_latency *)(cap + 1);
>> +	read->type = cpu_to_le32(CLIENT_METRIC_TYPE_READ_LATENCY);
>> +	read->ver = 1;
>> +	read->compat = 1;
>> +	read->data_len = cpu_to_le32(sizeof(*read) - 10);
>> +	total = m->total_reads;
>> +	sum = m->read_latency_sum;
>> +	jiffies_to_timespec64(sum, &ts);
>> +	read->sec = cpu_to_le32(ts.tv_sec);
>> +	read->nsec = cpu_to_le32(ts.tv_nsec);
>> +	items++;
>> +
>> +	/* encode the write latency metric */
>> +	write = (struct ceph_metric_write_latency *)(read + 1);
>> +	write->type = cpu_to_le32(CLIENT_METRIC_TYPE_WRITE_LATENCY);
>> +	write->ver = 1;
>> +	write->compat = 1;
>> +	write->data_len = cpu_to_le32(sizeof(*write) - 10);
>> +	total = m->total_writes;
>> +	sum = m->write_latency_sum;
>> +	jiffies_to_timespec64(sum, &ts);
>> +	write->sec = cpu_to_le32(ts.tv_sec);
>> +	write->nsec = cpu_to_le32(ts.tv_nsec);
>> +	items++;
>> +
>> +	/* encode the metadata latency metric */
>> +	meta = (struct ceph_metric_metadata_latency *)(write + 1);
>> +	meta->type = cpu_to_le32(CLIENT_METRIC_TYPE_METADATA_LATENCY);
>> +	meta->ver = 1;
>> +	meta->compat = 1;
>> +	meta->data_len = cpu_to_le32(sizeof(*meta) - 10);
>> +	total = m->total_metadatas;
>> +	sum = m->metadata_latency_sum;
>> +	jiffies_to_timespec64(sum, &ts);
>> +	meta->sec = cpu_to_le32(ts.tv_sec);
>> +	meta->nsec = cpu_to_le32(ts.tv_nsec);
>> +	items++;
>> +
>> +	put_unaligned_le32(items, &head->num);
>> +	msg->front.iov_len = cpu_to_le32(len);
>> +	msg->hdr.version = cpu_to_le16(1);
>> +	msg->hdr.compat_version = cpu_to_le16(1);
>> +	msg->hdr.front_len = cpu_to_le32(msg->front.iov_len);
>> +	dout("client%llu send metrics to mds%d\n",
>> +	     ceph_client_gid(mdsc->fsc->client), s->s_mds);
>> +	ceph_con_send(&s->s_con, msg);
>> +
>> +	return true;
>> +}
>> +
>> +static void metric_delayed_work(struct work_struct *work)
>> +{
>> +	struct ceph_client_metric *m =
>> +		container_of(work, struct ceph_client_metric, delayed_work.work);
>> +	struct ceph_mds_client *mdsc =
>> +		container_of(m, struct ceph_mds_client, metric);
>> +	struct ceph_mds_session *s;
>> +	u64 nr_caps = 0;
>> +	bool ret;
>> +	int i;
>> +
>> +	mutex_lock(&mdsc->mutex);
>> +	for (i = 0; i < mdsc->max_sessions; i++) {
>> +		s = __ceph_lookup_mds_session(mdsc, i);
>> +		if (!s)
>> +			continue;
>> +		nr_caps += s->s_nr_caps;
>> +		ceph_put_mds_session(s);
>> +	}
>> +
>> +	for (i = 0; i < mdsc->max_sessions; i++) {
>> +		s = __ceph_lookup_mds_session(mdsc, i);
>> +		if (!s)
>> +			continue;
>> +		if (!check_session_state(mdsc, s)) {
>> +			ceph_put_mds_session(s);
>> +			continue;
>> +		}
>> +
>> +		/*
>> +		 * Skip it if MDS doesn't support the metric collection,
>> +		 * or the MDS will close the session's socket connection
>> +		 * directly when it get this message.
>> +		 */
>> +		if (!test_bit(CEPHFS_FEATURE_METRIC_COLLECT, &s->s_features))
>> +			continue;
>> +
>> +		/* Only send the metric once in any available session */
>> +		ret = ceph_mdsc_send_metrics(mdsc, s, nr_caps);
>> +		ceph_put_mds_session(s);
>> +		if (ret)
>> +			break;
>> +	}
>> +	mutex_unlock(&mdsc->mutex);
>> +
>> +	metric_schedule_delayed(&mdsc->metric);
>> +}
>>   
> You're going to be queueing this job up to run every second, even when
> none of your MDS's support metrics.
>
> I think it would be better that we make this job conditional on having
> at least one session with an MDS that supports receiving metrics. Maybe
> have each MDS session hold a reference to the scheduled job and when the
> refcount goes to 0, we cancel it...
>
> A simpler approach here might be to just give each session its own
> struct work, and only queue the work if the session supports metrics.
> That way you could just cancel the work as part of each session's
> teardown. I think that would also mean you wouldn't need the mdsc->mutex
> here either, which would be a bonus.

Yeah, we need to enhance the code here.

We only need to send the metrics to any one of the available MDSs, since
the MDS with rank 0 is responsible for collecting them. But we still need
to traverse all the mdsc->sessions to collect the total cap number, so
it is hard to get rid of the mdsc->mutex.

As you mentioned above, we could just add one ref counter to record the
total number of MDSs supporting the metric collection: when opening a
session takes the ref counter from 0 --> 1, wake up the work, and when
closing a session takes the ref counter from 1 --> 0, cancel it.

Thanks.


>
> Thoughts?
>
>
>>   int ceph_metric_init(struct ceph_client_metric *m)
>>   {
>> @@ -51,6 +191,8 @@ int ceph_metric_init(struct ceph_client_metric *m)
>>   	m->total_metadatas = 0;
>>   	m->metadata_latency_sum = 0;
>>   
>> +	INIT_DELAYED_WORK(&m->delayed_work, metric_delayed_work);
>> +
>>   	return 0;
>>   
>>   err_i_caps_mis:
>> diff --git a/fs/ceph/metric.h b/fs/ceph/metric.h
>> index ccd8128..5a1f8b9 100644
>> --- a/fs/ceph/metric.h
>> +++ b/fs/ceph/metric.h
>> @@ -6,6 +6,71 @@
>>   #include <linux/percpu_counter.h>
>>   #include <linux/ktime.h>
>>   
>> +extern bool enable_send_metrics;
>> +
>> +enum ceph_metric_type {
>> +	CLIENT_METRIC_TYPE_CAP_INFO,
>> +	CLIENT_METRIC_TYPE_READ_LATENCY,
>> +	CLIENT_METRIC_TYPE_WRITE_LATENCY,
>> +	CLIENT_METRIC_TYPE_METADATA_LATENCY,
>> +	CLIENT_METRIC_TYPE_DENTRY_LEASE,
>> +
>> +	CLIENT_METRIC_TYPE_MAX = CLIENT_METRIC_TYPE_DENTRY_LEASE,
>> +};
>> +
>> +/* metric caps header */
>> +struct ceph_metric_cap {
>> +	__le32 type;     /* ceph metric type */
>> +
>> +	__u8  ver;
>> +	__u8  compat;
>> +
>> +	__le32 data_len; /* length of sizeof(hit + mis + total) */
>> +	__le64 hit;
>> +	__le64 mis;
>> +	__le64 total;
>> +} __packed;
>> +
>> +/* metric read latency header */
>> +struct ceph_metric_read_latency {
>> +	__le32 type;     /* ceph metric type */
>> +
>> +	__u8  ver;
>> +	__u8  compat;
>> +
>> +	__le32 data_len; /* length of sizeof(sec + nsec) */
>> +	__le32 sec;
>> +	__le32 nsec;
>> +} __packed;
>> +
>> +/* metric write latency header */
>> +struct ceph_metric_write_latency {
>> +	__le32 type;     /* ceph metric type */
>> +
>> +	__u8  ver;
>> +	__u8  compat;
>> +
>> +	__le32 data_len; /* length of sizeof(sec + nsec) */
>> +	__le32 sec;
>> +	__le32 nsec;
>> +} __packed;
>> +
>> +/* metric metadata latency header */
>> +struct ceph_metric_metadata_latency {
>> +	__le32 type;     /* ceph metric type */
>> +
>> +	__u8  ver;
>> +	__u8  compat;
>> +
>> +	__le32 data_len; /* length of sizeof(sec + nsec) */
>> +	__le32 sec;
>> +	__le32 nsec;
>> +} __packed;
>> +
>> +struct ceph_metric_head {
>> +	__le32 num;	/* the number of metrics that will be sent */
>> +} __packed;
>> +
>>   /* This is the global metrics */
>>   struct ceph_client_metric {
>>   	atomic64_t            total_dentries;
>> @@ -35,8 +100,21 @@ struct ceph_client_metric {
>>   	ktime_t metadata_latency_sq_sum;
>>   	ktime_t metadata_latency_min;
>>   	ktime_t metadata_latency_max;
>> +
>> +	struct delayed_work delayed_work;  /* delayed work */
>>   };
>>   
>> +static inline void metric_schedule_delayed(struct ceph_client_metric *m)
>> +{
>> +	/* per second as default */
>> +	unsigned int hz = round_jiffies_relative(HZ * enable_send_metrics);
>> +
>> +	if (!enable_send_metrics)
>> +		return;
>> +
>> +	schedule_delayed_work(&m->delayed_work, hz);
>> +}
>> +
>>   extern int ceph_metric_init(struct ceph_client_metric *m);
>>   extern void ceph_metric_destroy(struct ceph_client_metric *m);
>>   
>> diff --git a/fs/ceph/super.c b/fs/ceph/super.c
>> index c9784eb1..49f20ea 100644
>> --- a/fs/ceph/super.c
>> +++ b/fs/ceph/super.c
>> @@ -27,6 +27,9 @@
>>   #include <linux/ceph/auth.h>
>>   #include <linux/ceph/debugfs.h>
>>   
>> +static DEFINE_MUTEX(ceph_fsc_lock);
>> +static LIST_HEAD(ceph_fsc_list);
>> +
>>   /*
>>    * Ceph superblock operations
>>    *
>> @@ -691,6 +694,10 @@ static struct ceph_fs_client *create_fs_client(struct ceph_mount_options *fsopt,
>>   	if (!fsc->wb_pagevec_pool)
>>   		goto fail_cap_wq;
>>   
>> +	mutex_lock(&ceph_fsc_lock);
>> +	list_add_tail(&fsc->list, &ceph_fsc_list);
>> +	mutex_unlock(&ceph_fsc_lock);
>> +
>>   	return fsc;
>>   
>>   fail_cap_wq:
>> @@ -717,6 +724,10 @@ static void destroy_fs_client(struct ceph_fs_client *fsc)
>>   {
>>   	dout("destroy_fs_client %p\n", fsc);
>>   
>> +	mutex_lock(&ceph_fsc_lock);
>> +	list_del(&fsc->list);
>> +	mutex_unlock(&ceph_fsc_lock);
>> +
>>   	ceph_mdsc_destroy(fsc);
>>   	destroy_workqueue(fsc->inode_wq);
>>   	destroy_workqueue(fsc->cap_wq);
>> @@ -1282,6 +1293,37 @@ static void __exit exit_ceph(void)
>>   	destroy_caches();
>>   }
>>   
>> +static int param_set_metrics(const char *val, const struct kernel_param *kp)
>> +{
>> +	struct ceph_fs_client *fsc;
>> +	int ret;
>> +
>> +	ret = param_set_bool(val, kp);
>> +	if (ret) {
>> +		pr_err("Failed to parse sending metrics switch value '%s'\n",
>> +		       val);
>> +		return ret;
>> +	} else if (enable_send_metrics) {
>> +		// wake up all the mds clients
>> +		mutex_lock(&ceph_fsc_lock);
>> +		list_for_each_entry(fsc, &ceph_fsc_list, list) {
>> +			metric_schedule_delayed(&fsc->mdsc->metric);
>> +		}
>> +		mutex_unlock(&ceph_fsc_lock);
>> +	}
>> +
>> +	return 0;
>> +}
>> +
>> +static const struct kernel_param_ops param_ops_metrics = {
>> +	.set = param_set_metrics,
>> +	.get = param_get_bool,
>> +};
>> +
>> +bool enable_send_metrics = true;
>> +module_param_cb(enable_send_metrics, &param_ops_metrics, &enable_send_metrics, 0644);
>> +MODULE_PARM_DESC(enable_send_metrics, "Enable sending perf metrics to ceph cluster (default: on)");
>> +
>>   module_init(init_ceph);
>>   module_exit(exit_ceph);
>>   
>> diff --git a/fs/ceph/super.h b/fs/ceph/super.h
>> index 5a6cdd3..05edc9a 100644
>> --- a/fs/ceph/super.h
>> +++ b/fs/ceph/super.h
>> @@ -101,6 +101,8 @@ struct ceph_mount_options {
>>   struct ceph_fs_client {
>>   	struct super_block *sb;
>>   
>> +	struct list_head list;
>> +
>>   	struct ceph_mount_options *mount_options;
>>   	struct ceph_client *client;
>>   
>> diff --git a/include/linux/ceph/ceph_fs.h b/include/linux/ceph/ceph_fs.h
>> index ebf5ba6..455e9b9 100644
>> --- a/include/linux/ceph/ceph_fs.h
>> +++ b/include/linux/ceph/ceph_fs.h
>> @@ -130,6 +130,7 @@ struct ceph_dir_layout {
>>   #define CEPH_MSG_CLIENT_REQUEST         24
>>   #define CEPH_MSG_CLIENT_REQUEST_FORWARD 25
>>   #define CEPH_MSG_CLIENT_REPLY           26
>> +#define CEPH_MSG_CLIENT_METRICS         29
>>   #define CEPH_MSG_CLIENT_CAPS            0x310
>>   #define CEPH_MSG_CLIENT_LEASE           0x311
>>   #define CEPH_MSG_CLIENT_SNAP            0x312

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH v3 2/4] ceph: periodically send perf metrics to ceph
  2020-06-24  8:32     ` Xiubo Li
@ 2020-06-24 10:11       ` Jeff Layton
  2020-06-24 12:31         ` Xiubo Li
  0 siblings, 1 reply; 11+ messages in thread
From: Jeff Layton @ 2020-06-24 10:11 UTC (permalink / raw)
  To: Xiubo Li, idryomov; +Cc: zyan, pdonnell, ceph-devel

On Wed, 2020-06-24 at 16:32 +0800, Xiubo Li wrote:
> On 2020/6/24 1:24, Jeff Layton wrote:
> > On Mon, 2020-06-22 at 09:24 -0400, xiubli@redhat.com wrote:
> > > From: Xiubo Li <xiubli@redhat.com>
> > > 
> > > This will send the caps/read/write/metadata metrics to any available
> > > MDS only once per second as default, which will be the same as the
> > > userland client, or every metric_send_interval seconds, which is a
> > > module parameter.
> > > 
> > > Skip the MDS sessions if they don't support the metric collection,
> > > or the MDSs will close the socket connections directly when it get
> > > an unknown type message.
> > > 
> > > URL: https://tracker.ceph.com/issues/43215
> > > Signed-off-by: Xiubo Li <xiubli@redhat.com>
> > > ---
> > >   fs/ceph/mds_client.c         |   3 +
> > >   fs/ceph/mds_client.h         |   4 +-
> > >   fs/ceph/metric.c             | 142 +++++++++++++++++++++++++++++++++++++++++++
> > >   fs/ceph/metric.h             |  78 ++++++++++++++++++++++++
> > >   fs/ceph/super.c              |  42 +++++++++++++
> > >   fs/ceph/super.h              |   2 +
> > >   include/linux/ceph/ceph_fs.h |   1 +
> > >   7 files changed, 271 insertions(+), 1 deletion(-)
> > > 
> > > diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
> > > index 608fb5c..f996363 100644
> > > --- a/fs/ceph/mds_client.c
> > > +++ b/fs/ceph/mds_client.c
> > > @@ -4625,6 +4625,7 @@ void ceph_mdsc_close_sessions(struct ceph_mds_client *mdsc)
> > >   
> > >   	cancel_work_sync(&mdsc->cap_reclaim_work);
> > >   	cancel_delayed_work_sync(&mdsc->delayed_work); /* cancel timer */
> > > +	cancel_delayed_work_sync(&mdsc->metric.delayed_work); /* cancel timer */
> > >   
> > >   	dout("stopped\n");
> > >   }
> > > @@ -4667,6 +4668,7 @@ static void ceph_mdsc_stop(struct ceph_mds_client *mdsc)
> > >   {
> > >   	dout("stop\n");
> > >   	cancel_delayed_work_sync(&mdsc->delayed_work); /* cancel timer */
> > > +	cancel_delayed_work_sync(&mdsc->metric.delayed_work); /* cancel timer */
> > >   	if (mdsc->mdsmap)
> > >   		ceph_mdsmap_destroy(mdsc->mdsmap);
> > >   	kfree(mdsc->sessions);
> > > @@ -4824,6 +4826,7 @@ void ceph_mdsc_handle_mdsmap(struct ceph_mds_client *mdsc, struct ceph_msg *msg)
> > >   
> > >   	mutex_unlock(&mdsc->mutex);
> > >   	schedule_delayed(mdsc);
> > > +	metric_schedule_delayed(&mdsc->metric);
> > >   	return;
> > >   
> > >   bad_unlock:
> > > diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
> > > index bcb3892..3c65ac1 100644
> > > --- a/fs/ceph/mds_client.h
> > > +++ b/fs/ceph/mds_client.h
> > > @@ -28,8 +28,9 @@ enum ceph_feature_type {
> > >   	CEPHFS_FEATURE_LAZY_CAP_WANTED,
> > >   	CEPHFS_FEATURE_MULTI_RECONNECT,
> > >   	CEPHFS_FEATURE_DELEG_INO,
> > > +	CEPHFS_FEATURE_METRIC_COLLECT,
> > >   
> > > -	CEPHFS_FEATURE_MAX = CEPHFS_FEATURE_DELEG_INO,
> > > +	CEPHFS_FEATURE_MAX = CEPHFS_FEATURE_METRIC_COLLECT,
> > >   };
> > >   
> > >   /*
> > > @@ -43,6 +44,7 @@ enum ceph_feature_type {
> > >   	CEPHFS_FEATURE_LAZY_CAP_WANTED,		\
> > >   	CEPHFS_FEATURE_MULTI_RECONNECT,		\
> > >   	CEPHFS_FEATURE_DELEG_INO,		\
> > > +	CEPHFS_FEATURE_METRIC_COLLECT,		\
> > >   						\
> > >   	CEPHFS_FEATURE_MAX,			\
> > >   }
> > > diff --git a/fs/ceph/metric.c b/fs/ceph/metric.c
> > > index 9217f35..4267b46 100644
> > > --- a/fs/ceph/metric.c
> > > +++ b/fs/ceph/metric.c
> > > @@ -1,10 +1,150 @@
> > >   /* SPDX-License-Identifier: GPL-2.0 */
> > > +#include <linux/ceph/ceph_debug.h>
> > >   
> > >   #include <linux/types.h>
> > >   #include <linux/percpu_counter.h>
> > >   #include <linux/math64.h>
> > >   
> > >   #include "metric.h"
> > > +#include "mds_client.h"
> > > +
> > > +static bool ceph_mdsc_send_metrics(struct ceph_mds_client *mdsc,
> > > +				   struct ceph_mds_session *s,
> > > +				   u64 nr_caps)
> > > +{
> > > +	struct ceph_metric_head *head;
> > > +	struct ceph_metric_cap *cap;
> > > +	struct ceph_metric_read_latency *read;
> > > +	struct ceph_metric_write_latency *write;
> > > +	struct ceph_metric_metadata_latency *meta;
> > > +	struct ceph_client_metric *m = &mdsc->metric;
> > > +	struct ceph_msg *msg;
> > > +	struct timespec64 ts;
> > > +	s64 sum, total;
> > > +	s32 items = 0;
> > > +	s32 len;
> > > +
> > > +	len = sizeof(*head) + sizeof(*cap) + sizeof(*read) + sizeof(*write)
> > > +	      + sizeof(*meta);
> > > +
> > > +	msg = ceph_msg_new(CEPH_MSG_CLIENT_METRICS, len, GFP_NOFS, true);
> > > +	if (!msg) {
> > > +		pr_err("send metrics to mds%d, failed to allocate message\n",
> > > +		       s->s_mds);
> > > +		return false;
> > > +	}
> > > +
> > > +	head = msg->front.iov_base;
> > > +
> > > +	/* encode the cap metric */
> > > +	cap = (struct ceph_metric_cap *)(head + 1);
> > > +	cap->type = cpu_to_le32(CLIENT_METRIC_TYPE_CAP_INFO);
> > > +	cap->ver = 1;
> > > +	cap->compat = 1;
> > > +	cap->data_len = cpu_to_le32(sizeof(*cap) - 10);
> > > +	cap->hit = cpu_to_le64(percpu_counter_sum(&mdsc->metric.i_caps_hit));
> > > +	cap->mis = cpu_to_le64(percpu_counter_sum(&mdsc->metric.i_caps_mis));
> > > +	cap->total = cpu_to_le64(nr_caps);
> > > +	items++;
> > > +
> > > +	/* encode the read latency metric */
> > > +	read = (struct ceph_metric_read_latency *)(cap + 1);
> > > +	read->type = cpu_to_le32(CLIENT_METRIC_TYPE_READ_LATENCY);
> > > +	read->ver = 1;
> > > +	read->compat = 1;
> > > +	read->data_len = cpu_to_le32(sizeof(*read) - 10);
> > > +	total = m->total_reads;
> > > +	sum = m->read_latency_sum;
> > > +	jiffies_to_timespec64(sum, &ts);
> > > +	read->sec = cpu_to_le32(ts.tv_sec);
> > > +	read->nsec = cpu_to_le32(ts.tv_nsec);
> > > +	items++;
> > > +
> > > +	/* encode the write latency metric */
> > > +	write = (struct ceph_metric_write_latency *)(read + 1);
> > > +	write->type = cpu_to_le32(CLIENT_METRIC_TYPE_WRITE_LATENCY);
> > > +	write->ver = 1;
> > > +	write->compat = 1;
> > > +	write->data_len = cpu_to_le32(sizeof(*write) - 10);
> > > +	total = m->total_writes;
> > > +	sum = m->write_latency_sum;
> > > +	jiffies_to_timespec64(sum, &ts);
> > > +	write->sec = cpu_to_le32(ts.tv_sec);
> > > +	write->nsec = cpu_to_le32(ts.tv_nsec);
> > > +	items++;
> > > +
> > > +	/* encode the metadata latency metric */
> > > +	meta = (struct ceph_metric_metadata_latency *)(write + 1);
> > > +	meta->type = cpu_to_le32(CLIENT_METRIC_TYPE_METADATA_LATENCY);
> > > +	meta->ver = 1;
> > > +	meta->compat = 1;
> > > +	meta->data_len = cpu_to_le32(sizeof(*meta) - 10);
> > > +	total = m->total_metadatas;
> > > +	sum = m->metadata_latency_sum;
> > > +	jiffies_to_timespec64(sum, &ts);
> > > +	meta->sec = cpu_to_le32(ts.tv_sec);
> > > +	meta->nsec = cpu_to_le32(ts.tv_nsec);
> > > +	items++;
> > > +
> > > +	put_unaligned_le32(items, &head->num);
> > > +	msg->front.iov_len = cpu_to_le32(len);
> > > +	msg->hdr.version = cpu_to_le16(1);
> > > +	msg->hdr.compat_version = cpu_to_le16(1);
> > > +	msg->hdr.front_len = cpu_to_le32(msg->front.iov_len);
> > > +	dout("client%llu send metrics to mds%d\n",
> > > +	     ceph_client_gid(mdsc->fsc->client), s->s_mds);
> > > +	ceph_con_send(&s->s_con, msg);
> > > +
> > > +	return true;
> > > +}
> > > +
> > > +static void metric_delayed_work(struct work_struct *work)
> > > +{
> > > +	struct ceph_client_metric *m =
> > > +		container_of(work, struct ceph_client_metric, delayed_work.work);
> > > +	struct ceph_mds_client *mdsc =
> > > +		container_of(m, struct ceph_mds_client, metric);
> > > +	struct ceph_mds_session *s;
> > > +	u64 nr_caps = 0;
> > > +	bool ret;
> > > +	int i;
> > > +
> > > +	mutex_lock(&mdsc->mutex);
> > > +	for (i = 0; i < mdsc->max_sessions; i++) {
> > > +		s = __ceph_lookup_mds_session(mdsc, i);
> > > +		if (!s)
> > > +			continue;
> > > +		nr_caps += s->s_nr_caps;
> > > +		ceph_put_mds_session(s);
> > > +	}
> > > +
> > > +	for (i = 0; i < mdsc->max_sessions; i++) {
> > > +		s = __ceph_lookup_mds_session(mdsc, i);
> > > +		if (!s)
> > > +			continue;
> > > +		if (!check_session_state(mdsc, s)) {
> > > +			ceph_put_mds_session(s);
> > > +			continue;
> > > +		}
> > > +
> > > +		/*
> > > +		 * Skip it if MDS doesn't support the metric collection,
> > > +		 * or the MDS will close the session's socket connection
> > > +		 * directly when it get this message.
> > > +		 */
> > > +		if (!test_bit(CEPHFS_FEATURE_METRIC_COLLECT, &s->s_features))
> > > +			continue;
> > > +
> > > +		/* Only send the metric once in any available session */
> > > +		ret = ceph_mdsc_send_metrics(mdsc, s, nr_caps);
> > > +		ceph_put_mds_session(s);
> > > +		if (ret)
> > > +			break;
> > > +	}
> > > +	mutex_unlock(&mdsc->mutex);
> > > +
> > > +	metric_schedule_delayed(&mdsc->metric);
> > > +}
> > >   
> > You're going to be queueing this job up to run every second, even when
> > none of your MDS's support metrics.
> > 
> > I think it would be better that we make this job conditional on having
> > at least one session with an MDS that supports receiving metrics. Maybe
> > have each MDS session hold a reference to the scheduled job and when the
> > refcount goes to 0, we cancel it...
> > 
> > A simpler approach here might be to just give each session its own
> > struct work, and only queue the work if the session supports metrics.
> > That way you could just cancel the work as part of each session's
> > teardown. I think that would also mean you wouldn't need the mdsc->mutex
> > here either, which would be a bonus.
> 
> Yeah, we need to enhance the code here.
> 
> Since we only need to send the metrics to any of the available MDSs and 
> the MDS with rank 0 is responsible to collect  them.  But we still need 
> to traverse all the mdsc->sessions to collect the total cap number and 
> it is hard to get rid of the mdsc->mutex.
>
> As you mentioned above we could just add one ref counter to record the 
> total number of MDSs supporting the metric collection, when opening a 
> session & ref counter 0 --> 1 then wake up the work and when closing the 
> session & ref counter 1 --> 0 then cancel it.
> 

Counting up total caps doesn't seem like a good reason to involve a
large, coarse-grained mutex here. Instead, let's keep a separate atomic
counter in the mdsc that gets incremented and decremented whenever
s_nr_caps is changed. Then you can just fetch that value from the
sessions stats sending job -- no mutex required.

I think that would be preferable to having to add refcounting to this
single workqueue job.

> > 
> > >   int ceph_metric_init(struct ceph_client_metric *m)
> > >   {
> > > @@ -51,6 +191,8 @@ int ceph_metric_init(struct ceph_client_metric *m)
> > >   	m->total_metadatas = 0;
> > >   	m->metadata_latency_sum = 0;
> > >   
> > > +	INIT_DELAYED_WORK(&m->delayed_work, metric_delayed_work);
> > > +
> > >   	return 0;
> > >   
> > >   err_i_caps_mis:
> > > diff --git a/fs/ceph/metric.h b/fs/ceph/metric.h
> > > index ccd8128..5a1f8b9 100644
> > > --- a/fs/ceph/metric.h
> > > +++ b/fs/ceph/metric.h
> > > @@ -6,6 +6,71 @@
> > >   #include <linux/percpu_counter.h>
> > >   #include <linux/ktime.h>
> > >   
> > > +extern bool enable_send_metrics;
> > > +
> > > +enum ceph_metric_type {
> > > +	CLIENT_METRIC_TYPE_CAP_INFO,
> > > +	CLIENT_METRIC_TYPE_READ_LATENCY,
> > > +	CLIENT_METRIC_TYPE_WRITE_LATENCY,
> > > +	CLIENT_METRIC_TYPE_METADATA_LATENCY,
> > > +	CLIENT_METRIC_TYPE_DENTRY_LEASE,
> > > +
> > > +	CLIENT_METRIC_TYPE_MAX = CLIENT_METRIC_TYPE_DENTRY_LEASE,
> > > +};
> > > +
> > > +/* metric caps header */
> > > +struct ceph_metric_cap {
> > > +	__le32 type;     /* ceph metric type */
> > > +
> > > +	__u8  ver;
> > > +	__u8  compat;
> > > +
> > > +	__le32 data_len; /* length of sizeof(hit + mis + total) */
> > > +	__le64 hit;
> > > +	__le64 mis;
> > > +	__le64 total;
> > > +} __packed;
> > > +
> > > +/* metric read latency header */
> > > +struct ceph_metric_read_latency {
> > > +	__le32 type;     /* ceph metric type */
> > > +
> > > +	__u8  ver;
> > > +	__u8  compat;
> > > +
> > > +	__le32 data_len; /* length of sizeof(sec + nsec) */
> > > +	__le32 sec;
> > > +	__le32 nsec;
> > > +} __packed;
> > > +
> > > +/* metric write latency header */
> > > +struct ceph_metric_write_latency {
> > > +	__le32 type;     /* ceph metric type */
> > > +
> > > +	__u8  ver;
> > > +	__u8  compat;
> > > +
> > > +	__le32 data_len; /* length of sizeof(sec + nsec) */
> > > +	__le32 sec;
> > > +	__le32 nsec;
> > > +} __packed;
> > > +
> > > +/* metric metadata latency header */
> > > +struct ceph_metric_metadata_latency {
> > > +	__le32 type;     /* ceph metric type */
> > > +
> > > +	__u8  ver;
> > > +	__u8  compat;
> > > +
> > > +	__le32 data_len; /* length of sizeof(sec + nsec) */
> > > +	__le32 sec;
> > > +	__le32 nsec;
> > > +} __packed;
> > > +
> > > +struct ceph_metric_head {
> > > +	__le32 num;	/* the number of metrics that will be sent */
> > > +} __packed;
> > > +
> > >   /* This is the global metrics */
> > >   struct ceph_client_metric {
> > >   	atomic64_t            total_dentries;
> > > @@ -35,8 +100,21 @@ struct ceph_client_metric {
> > >   	ktime_t metadata_latency_sq_sum;
> > >   	ktime_t metadata_latency_min;
> > >   	ktime_t metadata_latency_max;
> > > +
> > > +	struct delayed_work delayed_work;  /* delayed work */
> > >   };
> > >   
> > > +static inline void metric_schedule_delayed(struct ceph_client_metric *m)
> > > +{
> > > +	/* per second as default */
> > > +	unsigned int hz = round_jiffies_relative(HZ * enable_send_metrics);
> > > +
> > > +	if (!enable_send_metrics)
> > > +		return;
> > > +
> > > +	schedule_delayed_work(&m->delayed_work, hz);
> > > +}
> > > +
> > >   extern int ceph_metric_init(struct ceph_client_metric *m);
> > >   extern void ceph_metric_destroy(struct ceph_client_metric *m);
> > >   
> > > diff --git a/fs/ceph/super.c b/fs/ceph/super.c
> > > index c9784eb1..49f20ea 100644
> > > --- a/fs/ceph/super.c
> > > +++ b/fs/ceph/super.c
> > > @@ -27,6 +27,9 @@
> > >   #include <linux/ceph/auth.h>
> > >   #include <linux/ceph/debugfs.h>
> > >   
> > > +static DEFINE_MUTEX(ceph_fsc_lock);
> > > +static LIST_HEAD(ceph_fsc_list);
> > > +
> > >   /*
> > >    * Ceph superblock operations
> > >    *
> > > @@ -691,6 +694,10 @@ static struct ceph_fs_client *create_fs_client(struct ceph_mount_options *fsopt,
> > >   	if (!fsc->wb_pagevec_pool)
> > >   		goto fail_cap_wq;
> > >   
> > > +	mutex_lock(&ceph_fsc_lock);
> > > +	list_add_tail(&fsc->list, &ceph_fsc_list);
> > > +	mutex_unlock(&ceph_fsc_lock);
> > > +
> > >   	return fsc;
> > >   
> > >   fail_cap_wq:
> > > @@ -717,6 +724,10 @@ static void destroy_fs_client(struct ceph_fs_client *fsc)
> > >   {
> > >   	dout("destroy_fs_client %p\n", fsc);
> > >   
> > > +	mutex_lock(&ceph_fsc_lock);
> > > +	list_del(&fsc->list);
> > > +	mutex_unlock(&ceph_fsc_lock);
> > > +
> > >   	ceph_mdsc_destroy(fsc);
> > >   	destroy_workqueue(fsc->inode_wq);
> > >   	destroy_workqueue(fsc->cap_wq);
> > > @@ -1282,6 +1293,37 @@ static void __exit exit_ceph(void)
> > >   	destroy_caches();
> > >   }
> > >   
> > > +static int param_set_metrics(const char *val, const struct kernel_param *kp)
> > > +{
> > > +	struct ceph_fs_client *fsc;
> > > +	int ret;
> > > +
> > > +	ret = param_set_bool(val, kp);
> > > +	if (ret) {
> > > +		pr_err("Failed to parse sending metrics switch value '%s'\n",
> > > +		       val);
> > > +		return ret;
> > > +	} else if (enable_send_metrics) {
> > > +		// wake up all the mds clients
> > > +		mutex_lock(&ceph_fsc_lock);
> > > +		list_for_each_entry(fsc, &ceph_fsc_list, list) {
> > > +			metric_schedule_delayed(&fsc->mdsc->metric);
> > > +		}
> > > +		mutex_unlock(&ceph_fsc_lock);
> > > +	}
> > > +
> > > +	return 0;
> > > +}
> > > +
> > > +static const struct kernel_param_ops param_ops_metrics = {
> > > +	.set = param_set_metrics,
> > > +	.get = param_get_bool,
> > > +};
> > > +
> > > +bool enable_send_metrics = true;
> > > +module_param_cb(enable_send_metrics, &param_ops_metrics, &enable_send_metrics, 0644);
> > > +MODULE_PARM_DESC(enable_send_metrics, "Enable sending perf metrics to ceph cluster (default: on)");
> > > +
> > >   module_init(init_ceph);
> > >   module_exit(exit_ceph);
> > >   
> > > diff --git a/fs/ceph/super.h b/fs/ceph/super.h
> > > index 5a6cdd3..05edc9a 100644
> > > --- a/fs/ceph/super.h
> > > +++ b/fs/ceph/super.h
> > > @@ -101,6 +101,8 @@ struct ceph_mount_options {
> > >   struct ceph_fs_client {
> > >   	struct super_block *sb;
> > >   
> > > +	struct list_head list;
> > > +
> > >   	struct ceph_mount_options *mount_options;
> > >   	struct ceph_client *client;
> > >   
> > > diff --git a/include/linux/ceph/ceph_fs.h b/include/linux/ceph/ceph_fs.h
> > > index ebf5ba6..455e9b9 100644
> > > --- a/include/linux/ceph/ceph_fs.h
> > > +++ b/include/linux/ceph/ceph_fs.h
> > > @@ -130,6 +130,7 @@ struct ceph_dir_layout {
> > >   #define CEPH_MSG_CLIENT_REQUEST         24
> > >   #define CEPH_MSG_CLIENT_REQUEST_FORWARD 25
> > >   #define CEPH_MSG_CLIENT_REPLY           26
> > > +#define CEPH_MSG_CLIENT_METRICS         29
> > >   #define CEPH_MSG_CLIENT_CAPS            0x310
> > >   #define CEPH_MSG_CLIENT_LEASE           0x311
> > >   #define CEPH_MSG_CLIENT_SNAP            0x312
> 
> 

-- 
Jeff Layton <jlayton@kernel.org>

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH v3 2/4] ceph: periodically send perf metrics to ceph
  2020-06-24 10:11       ` Jeff Layton
@ 2020-06-24 12:31         ` Xiubo Li
  0 siblings, 0 replies; 11+ messages in thread
From: Xiubo Li @ 2020-06-24 12:31 UTC (permalink / raw)
  To: Jeff Layton, idryomov; +Cc: zyan, pdonnell, ceph-devel

On 2020/6/24 18:11, Jeff Layton wrote:
> On Wed, 2020-06-24 at 16:32 +0800, Xiubo Li wrote:
>> On 2020/6/24 1:24, Jeff Layton wrote:
>>> On Mon, 2020-06-22 at 09:24 -0400, xiubli@redhat.com wrote:
>>>> From: Xiubo Li <xiubli@redhat.com>
>>>>
>>>> This will send the caps/read/write/metadata metrics to any available
>>>> MDS once per second, which is the same as the userland client.
>>>> Sending can be switched on or off with the enable_send_metrics
>>>> module parameter.
>>>>
>>>> Skip the MDS sessions that don't support metric collection;
>>>> otherwise the MDS will close the socket connection directly when it
>>>> gets a message of an unknown type.
>>>>
>>>> URL: https://tracker.ceph.com/issues/43215
>>>> Signed-off-by: Xiubo Li <xiubli@redhat.com>
>>>> ---
>>>>    fs/ceph/mds_client.c         |   3 +
>>>>    fs/ceph/mds_client.h         |   4 +-
>>>>    fs/ceph/metric.c             | 142 +++++++++++++++++++++++++++++++++++++++++++
>>>>    fs/ceph/metric.h             |  78 ++++++++++++++++++++++++
>>>>    fs/ceph/super.c              |  42 +++++++++++++
>>>>    fs/ceph/super.h              |   2 +
>>>>    include/linux/ceph/ceph_fs.h |   1 +
>>>>    7 files changed, 271 insertions(+), 1 deletion(-)
>>>>
>>>> diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
>>>> index 608fb5c..f996363 100644
>>>> --- a/fs/ceph/mds_client.c
>>>> +++ b/fs/ceph/mds_client.c
>>>> @@ -4625,6 +4625,7 @@ void ceph_mdsc_close_sessions(struct ceph_mds_client *mdsc)
>>>>    
>>>>    	cancel_work_sync(&mdsc->cap_reclaim_work);
>>>>    	cancel_delayed_work_sync(&mdsc->delayed_work); /* cancel timer */
>>>> +	cancel_delayed_work_sync(&mdsc->metric.delayed_work); /* cancel timer */
>>>>    
>>>>    	dout("stopped\n");
>>>>    }
>>>> @@ -4667,6 +4668,7 @@ static void ceph_mdsc_stop(struct ceph_mds_client *mdsc)
>>>>    {
>>>>    	dout("stop\n");
>>>>    	cancel_delayed_work_sync(&mdsc->delayed_work); /* cancel timer */
>>>> +	cancel_delayed_work_sync(&mdsc->metric.delayed_work); /* cancel timer */
>>>>    	if (mdsc->mdsmap)
>>>>    		ceph_mdsmap_destroy(mdsc->mdsmap);
>>>>    	kfree(mdsc->sessions);
>>>> @@ -4824,6 +4826,7 @@ void ceph_mdsc_handle_mdsmap(struct ceph_mds_client *mdsc, struct ceph_msg *msg)
>>>>    
>>>>    	mutex_unlock(&mdsc->mutex);
>>>>    	schedule_delayed(mdsc);
>>>> +	metric_schedule_delayed(&mdsc->metric);
>>>>    	return;
>>>>    
>>>>    bad_unlock:
>>>> diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
>>>> index bcb3892..3c65ac1 100644
>>>> --- a/fs/ceph/mds_client.h
>>>> +++ b/fs/ceph/mds_client.h
>>>> @@ -28,8 +28,9 @@ enum ceph_feature_type {
>>>>    	CEPHFS_FEATURE_LAZY_CAP_WANTED,
>>>>    	CEPHFS_FEATURE_MULTI_RECONNECT,
>>>>    	CEPHFS_FEATURE_DELEG_INO,
>>>> +	CEPHFS_FEATURE_METRIC_COLLECT,
>>>>    
>>>> -	CEPHFS_FEATURE_MAX = CEPHFS_FEATURE_DELEG_INO,
>>>> +	CEPHFS_FEATURE_MAX = CEPHFS_FEATURE_METRIC_COLLECT,
>>>>    };
>>>>    
>>>>    /*
>>>> @@ -43,6 +44,7 @@ enum ceph_feature_type {
>>>>    	CEPHFS_FEATURE_LAZY_CAP_WANTED,		\
>>>>    	CEPHFS_FEATURE_MULTI_RECONNECT,		\
>>>>    	CEPHFS_FEATURE_DELEG_INO,		\
>>>> +	CEPHFS_FEATURE_METRIC_COLLECT,		\
>>>>    						\
>>>>    	CEPHFS_FEATURE_MAX,			\
>>>>    }
>>>> diff --git a/fs/ceph/metric.c b/fs/ceph/metric.c
>>>> index 9217f35..4267b46 100644
>>>> --- a/fs/ceph/metric.c
>>>> +++ b/fs/ceph/metric.c
>>>> @@ -1,10 +1,150 @@
>>>>    /* SPDX-License-Identifier: GPL-2.0 */
>>>> +#include <linux/ceph/ceph_debug.h>
>>>>    
>>>>    #include <linux/types.h>
>>>>    #include <linux/percpu_counter.h>
>>>>    #include <linux/math64.h>
>>>>    
>>>>    #include "metric.h"
>>>> +#include "mds_client.h"
>>>> +
>>>> +static bool ceph_mdsc_send_metrics(struct ceph_mds_client *mdsc,
>>>> +				   struct ceph_mds_session *s,
>>>> +				   u64 nr_caps)
>>>> +{
>>>> +	struct ceph_metric_head *head;
>>>> +	struct ceph_metric_cap *cap;
>>>> +	struct ceph_metric_read_latency *read;
>>>> +	struct ceph_metric_write_latency *write;
>>>> +	struct ceph_metric_metadata_latency *meta;
>>>> +	struct ceph_client_metric *m = &mdsc->metric;
>>>> +	struct ceph_msg *msg;
>>>> +	struct timespec64 ts;
>>>> +	s64 sum, total;
>>>> +	s32 items = 0;
>>>> +	s32 len;
>>>> +
>>>> +	len = sizeof(*head) + sizeof(*cap) + sizeof(*read) + sizeof(*write)
>>>> +	      + sizeof(*meta);
>>>> +
>>>> +	msg = ceph_msg_new(CEPH_MSG_CLIENT_METRICS, len, GFP_NOFS, true);
>>>> +	if (!msg) {
>>>> +		pr_err("send metrics to mds%d, failed to allocate message\n",
>>>> +		       s->s_mds);
>>>> +		return false;
>>>> +	}
>>>> +
>>>> +	head = msg->front.iov_base;
>>>> +
>>>> +	/* encode the cap metric */
>>>> +	cap = (struct ceph_metric_cap *)(head + 1);
>>>> +	cap->type = cpu_to_le32(CLIENT_METRIC_TYPE_CAP_INFO);
>>>> +	cap->ver = 1;
>>>> +	cap->compat = 1;
>>>> +	cap->data_len = cpu_to_le32(sizeof(*cap) - 10);
>>>> +	cap->hit = cpu_to_le64(percpu_counter_sum(&mdsc->metric.i_caps_hit));
>>>> +	cap->mis = cpu_to_le64(percpu_counter_sum(&mdsc->metric.i_caps_mis));
>>>> +	cap->total = cpu_to_le64(nr_caps);
>>>> +	items++;
>>>> +
>>>> +	/* encode the read latency metric */
>>>> +	read = (struct ceph_metric_read_latency *)(cap + 1);
>>>> +	read->type = cpu_to_le32(CLIENT_METRIC_TYPE_READ_LATENCY);
>>>> +	read->ver = 1;
>>>> +	read->compat = 1;
>>>> +	read->data_len = cpu_to_le32(sizeof(*read) - 10);
>>>> +	total = m->total_reads;
>>>> +	sum = m->read_latency_sum;
>>>> +	jiffies_to_timespec64(sum, &ts);
>>>> +	read->sec = cpu_to_le32(ts.tv_sec);
>>>> +	read->nsec = cpu_to_le32(ts.tv_nsec);
>>>> +	items++;
>>>> +
>>>> +	/* encode the write latency metric */
>>>> +	write = (struct ceph_metric_write_latency *)(read + 1);
>>>> +	write->type = cpu_to_le32(CLIENT_METRIC_TYPE_WRITE_LATENCY);
>>>> +	write->ver = 1;
>>>> +	write->compat = 1;
>>>> +	write->data_len = cpu_to_le32(sizeof(*write) - 10);
>>>> +	total = m->total_writes;
>>>> +	sum = m->write_latency_sum;
>>>> +	jiffies_to_timespec64(sum, &ts);
>>>> +	write->sec = cpu_to_le32(ts.tv_sec);
>>>> +	write->nsec = cpu_to_le32(ts.tv_nsec);
>>>> +	items++;
>>>> +
>>>> +	/* encode the metadata latency metric */
>>>> +	meta = (struct ceph_metric_metadata_latency *)(write + 1);
>>>> +	meta->type = cpu_to_le32(CLIENT_METRIC_TYPE_METADATA_LATENCY);
>>>> +	meta->ver = 1;
>>>> +	meta->compat = 1;
>>>> +	meta->data_len = cpu_to_le32(sizeof(*meta) - 10);
>>>> +	total = m->total_metadatas;
>>>> +	sum = m->metadata_latency_sum;
>>>> +	jiffies_to_timespec64(sum, &ts);
>>>> +	meta->sec = cpu_to_le32(ts.tv_sec);
>>>> +	meta->nsec = cpu_to_le32(ts.tv_nsec);
>>>> +	items++;
>>>> +
>>>> +	put_unaligned_le32(items, &head->num);
>>>> +	msg->front.iov_len = cpu_to_le32(len);
>>>> +	msg->hdr.version = cpu_to_le16(1);
>>>> +	msg->hdr.compat_version = cpu_to_le16(1);
>>>> +	msg->hdr.front_len = cpu_to_le32(msg->front.iov_len);
>>>> +	dout("client%llu send metrics to mds%d\n",
>>>> +	     ceph_client_gid(mdsc->fsc->client), s->s_mds);
>>>> +	ceph_con_send(&s->s_con, msg);
>>>> +
>>>> +	return true;
>>>> +}
>>>> +
>>>> +static void metric_delayed_work(struct work_struct *work)
>>>> +{
>>>> +	struct ceph_client_metric *m =
>>>> +		container_of(work, struct ceph_client_metric, delayed_work.work);
>>>> +	struct ceph_mds_client *mdsc =
>>>> +		container_of(m, struct ceph_mds_client, metric);
>>>> +	struct ceph_mds_session *s;
>>>> +	u64 nr_caps = 0;
>>>> +	bool ret;
>>>> +	int i;
>>>> +
>>>> +	mutex_lock(&mdsc->mutex);
>>>> +	for (i = 0; i < mdsc->max_sessions; i++) {
>>>> +		s = __ceph_lookup_mds_session(mdsc, i);
>>>> +		if (!s)
>>>> +			continue;
>>>> +		nr_caps += s->s_nr_caps;
>>>> +		ceph_put_mds_session(s);
>>>> +	}
>>>> +
>>>> +	for (i = 0; i < mdsc->max_sessions; i++) {
>>>> +		s = __ceph_lookup_mds_session(mdsc, i);
>>>> +		if (!s)
>>>> +			continue;
>>>> +		if (!check_session_state(mdsc, s)) {
>>>> +			ceph_put_mds_session(s);
>>>> +			continue;
>>>> +		}
>>>> +
>>>> +		/*
>>>> +		 * Skip it if MDS doesn't support the metric collection,
>>>> +		 * or the MDS will close the session's socket connection
>>>> +		 * directly when it get this message.
>>>> +		 */
>>>> +		if (!test_bit(CEPHFS_FEATURE_METRIC_COLLECT, &s->s_features))
>>>> +			continue;
>>>> +
>>>> +		/* Only send the metric once in any available session */
>>>> +		ret = ceph_mdsc_send_metrics(mdsc, s, nr_caps);
>>>> +		ceph_put_mds_session(s);
>>>> +		if (ret)
>>>> +			break;
>>>> +	}
>>>> +	mutex_unlock(&mdsc->mutex);
>>>> +
>>>> +	metric_schedule_delayed(&mdsc->metric);
>>>> +}
>>>>    
>>> You're going to be queueing this job up to run every second, even when
>>> none of your MDS's support metrics.
>>>
>>> I think it would be better that we make this job conditional on having
>>> at least one session with an MDS that supports receiving metrics. Maybe
>>> have each MDS session hold a reference to the scheduled job and when the
>>> refcount goes to 0, we cancel it...
>>>
>>> A simpler approach here might be to just give each session its own
>>> struct work, and only queue the work if the session supports metrics.
>>> That way you could just cancel the work as part of each session's
>>> teardown. I think that would also mean you wouldn't need the mdsc->mutex
>>> here either, which would be a bonus.
>> Yeah, we need to enhance the code here.
>>
>> We only need to send the metrics to any one of the available MDSs,
>> since the MDS with rank 0 is responsible for collecting them. But we
>> still need to traverse all the mdsc->sessions to collect the total cap
>> number, so it is hard to get rid of the mdsc->mutex.
>>
>> As you mentioned above, we could just add one ref counter to record the
>> total number of MDSs supporting metric collection: when opening a
>> session takes the counter from 0 to 1, wake up the work, and when
>> closing a session takes it from 1 to 0, cancel it.
>>
> Counting up total caps doesn't seem like a good reason to involve a
> large, coarse-grained mutex here. Instead, let's keep a separate atomic
> counter in the mdsc that gets incremented and decremented whenever
> s_nr_caps is changed. Then you can just fetch that value from the
> sessions stats sending job -- no mutex required.
>
> I think that would be preferable to having to add refcounting to this
> single workqueue job.

Okay, I will fix it.

Thanks.


>
>>>>    int ceph_metric_init(struct ceph_client_metric *m)
>>>>    {
>>>> @@ -51,6 +191,8 @@ int ceph_metric_init(struct ceph_client_metric *m)
>>>>    	m->total_metadatas = 0;
>>>>    	m->metadata_latency_sum = 0;
>>>>    
>>>> +	INIT_DELAYED_WORK(&m->delayed_work, metric_delayed_work);
>>>> +
>>>>    	return 0;
>>>>    
>>>>    err_i_caps_mis:
>>>> diff --git a/fs/ceph/metric.h b/fs/ceph/metric.h
>>>> index ccd8128..5a1f8b9 100644
>>>> --- a/fs/ceph/metric.h
>>>> +++ b/fs/ceph/metric.h
>>>> @@ -6,6 +6,71 @@
>>>>    #include <linux/percpu_counter.h>
>>>>    #include <linux/ktime.h>
>>>>    
>>>> +extern bool enable_send_metrics;
>>>> +
>>>> +enum ceph_metric_type {
>>>> +	CLIENT_METRIC_TYPE_CAP_INFO,
>>>> +	CLIENT_METRIC_TYPE_READ_LATENCY,
>>>> +	CLIENT_METRIC_TYPE_WRITE_LATENCY,
>>>> +	CLIENT_METRIC_TYPE_METADATA_LATENCY,
>>>> +	CLIENT_METRIC_TYPE_DENTRY_LEASE,
>>>> +
>>>> +	CLIENT_METRIC_TYPE_MAX = CLIENT_METRIC_TYPE_DENTRY_LEASE,
>>>> +};
>>>> +
>>>> +/* metric caps header */
>>>> +struct ceph_metric_cap {
>>>> +	__le32 type;     /* ceph metric type */
>>>> +
>>>> +	__u8  ver;
>>>> +	__u8  compat;
>>>> +
>>>> +	__le32 data_len; /* length of sizeof(hit + mis + total) */
>>>> +	__le64 hit;
>>>> +	__le64 mis;
>>>> +	__le64 total;
>>>> +} __packed;
>>>> +
>>>> +/* metric read latency header */
>>>> +struct ceph_metric_read_latency {
>>>> +	__le32 type;     /* ceph metric type */
>>>> +
>>>> +	__u8  ver;
>>>> +	__u8  compat;
>>>> +
>>>> +	__le32 data_len; /* length of sizeof(sec + nsec) */
>>>> +	__le32 sec;
>>>> +	__le32 nsec;
>>>> +} __packed;
>>>> +
>>>> +/* metric write latency header */
>>>> +struct ceph_metric_write_latency {
>>>> +	__le32 type;     /* ceph metric type */
>>>> +
>>>> +	__u8  ver;
>>>> +	__u8  compat;
>>>> +
>>>> +	__le32 data_len; /* length of sizeof(sec + nsec) */
>>>> +	__le32 sec;
>>>> +	__le32 nsec;
>>>> +} __packed;
>>>> +
>>>> +/* metric metadata latency header */
>>>> +struct ceph_metric_metadata_latency {
>>>> +	__le32 type;     /* ceph metric type */
>>>> +
>>>> +	__u8  ver;
>>>> +	__u8  compat;
>>>> +
>>>> +	__le32 data_len; /* length of sizeof(sec + nsec) */
>>>> +	__le32 sec;
>>>> +	__le32 nsec;
>>>> +} __packed;
>>>> +
>>>> +struct ceph_metric_head {
>>>> +	__le32 num;	/* the number of metrics that will be sent */
>>>> +} __packed;
>>>> +
>>>>    /* This is the global metrics */
>>>>    struct ceph_client_metric {
>>>>    	atomic64_t            total_dentries;
>>>> @@ -35,8 +100,21 @@ struct ceph_client_metric {
>>>>    	ktime_t metadata_latency_sq_sum;
>>>>    	ktime_t metadata_latency_min;
>>>>    	ktime_t metadata_latency_max;
>>>> +
>>>> +	struct delayed_work delayed_work;  /* delayed work */
>>>>    };
>>>>    
>>>> +static inline void metric_schedule_delayed(struct ceph_client_metric *m)
>>>> +{
>>>> +	/* per second as default */
>>>> +	unsigned int hz = round_jiffies_relative(HZ * enable_send_metrics);
>>>> +
>>>> +	if (!enable_send_metrics)
>>>> +		return;
>>>> +
>>>> +	schedule_delayed_work(&m->delayed_work, hz);
>>>> +}
>>>> +
>>>>    extern int ceph_metric_init(struct ceph_client_metric *m);
>>>>    extern void ceph_metric_destroy(struct ceph_client_metric *m);
>>>>    
>>>> diff --git a/fs/ceph/super.c b/fs/ceph/super.c
>>>> index c9784eb1..49f20ea 100644
>>>> --- a/fs/ceph/super.c
>>>> +++ b/fs/ceph/super.c
>>>> @@ -27,6 +27,9 @@
>>>>    #include <linux/ceph/auth.h>
>>>>    #include <linux/ceph/debugfs.h>
>>>>    
>>>> +static DEFINE_MUTEX(ceph_fsc_lock);
>>>> +static LIST_HEAD(ceph_fsc_list);
>>>> +
>>>>    /*
>>>>     * Ceph superblock operations
>>>>     *
>>>> @@ -691,6 +694,10 @@ static struct ceph_fs_client *create_fs_client(struct ceph_mount_options *fsopt,
>>>>    	if (!fsc->wb_pagevec_pool)
>>>>    		goto fail_cap_wq;
>>>>    
>>>> +	mutex_lock(&ceph_fsc_lock);
>>>> +	list_add_tail(&fsc->list, &ceph_fsc_list);
>>>> +	mutex_unlock(&ceph_fsc_lock);
>>>> +
>>>>    	return fsc;
>>>>    
>>>>    fail_cap_wq:
>>>> @@ -717,6 +724,10 @@ static void destroy_fs_client(struct ceph_fs_client *fsc)
>>>>    {
>>>>    	dout("destroy_fs_client %p\n", fsc);
>>>>    
>>>> +	mutex_lock(&ceph_fsc_lock);
>>>> +	list_del(&fsc->list);
>>>> +	mutex_unlock(&ceph_fsc_lock);
>>>> +
>>>>    	ceph_mdsc_destroy(fsc);
>>>>    	destroy_workqueue(fsc->inode_wq);
>>>>    	destroy_workqueue(fsc->cap_wq);
>>>> @@ -1282,6 +1293,37 @@ static void __exit exit_ceph(void)
>>>>    	destroy_caches();
>>>>    }
>>>>    
>>>> +static int param_set_metrics(const char *val, const struct kernel_param *kp)
>>>> +{
>>>> +	struct ceph_fs_client *fsc;
>>>> +	int ret;
>>>> +
>>>> +	ret = param_set_bool(val, kp);
>>>> +	if (ret) {
>>>> +		pr_err("Failed to parse sending metrics switch value '%s'\n",
>>>> +		       val);
>>>> +		return ret;
>>>> +	} else if (enable_send_metrics) {
>>>> +		// wake up all the mds clients
>>>> +		mutex_lock(&ceph_fsc_lock);
>>>> +		list_for_each_entry(fsc, &ceph_fsc_list, list) {
>>>> +			metric_schedule_delayed(&fsc->mdsc->metric);
>>>> +		}
>>>> +		mutex_unlock(&ceph_fsc_lock);
>>>> +	}
>>>> +
>>>> +	return 0;
>>>> +}
>>>> +
>>>> +static const struct kernel_param_ops param_ops_metrics = {
>>>> +	.set = param_set_metrics,
>>>> +	.get = param_get_bool,
>>>> +};
>>>> +
>>>> +bool enable_send_metrics = true;
>>>> +module_param_cb(enable_send_metrics, &param_ops_metrics, &enable_send_metrics, 0644);
>>>> +MODULE_PARM_DESC(enable_send_metrics, "Enable sending perf metrics to ceph cluster (default: on)");
>>>> +
>>>>    module_init(init_ceph);
>>>>    module_exit(exit_ceph);
>>>>    
>>>> diff --git a/fs/ceph/super.h b/fs/ceph/super.h
>>>> index 5a6cdd3..05edc9a 100644
>>>> --- a/fs/ceph/super.h
>>>> +++ b/fs/ceph/super.h
>>>> @@ -101,6 +101,8 @@ struct ceph_mount_options {
>>>>    struct ceph_fs_client {
>>>>    	struct super_block *sb;
>>>>    
>>>> +	struct list_head list;
>>>> +
>>>>    	struct ceph_mount_options *mount_options;
>>>>    	struct ceph_client *client;
>>>>    
>>>> diff --git a/include/linux/ceph/ceph_fs.h b/include/linux/ceph/ceph_fs.h
>>>> index ebf5ba6..455e9b9 100644
>>>> --- a/include/linux/ceph/ceph_fs.h
>>>> +++ b/include/linux/ceph/ceph_fs.h
>>>> @@ -130,6 +130,7 @@ struct ceph_dir_layout {
>>>>    #define CEPH_MSG_CLIENT_REQUEST         24
>>>>    #define CEPH_MSG_CLIENT_REQUEST_FORWARD 25
>>>>    #define CEPH_MSG_CLIENT_REPLY           26
>>>> +#define CEPH_MSG_CLIENT_METRICS         29
>>>>    #define CEPH_MSG_CLIENT_CAPS            0x310
>>>>    #define CEPH_MSG_CLIENT_LEASE           0x311
>>>>    #define CEPH_MSG_CLIENT_SNAP            0x312
>>

^ permalink raw reply	[flat|nested] 11+ messages in thread

end of thread, other threads:[~2020-06-24 12:32 UTC | newest]

Thread overview: 11+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2020-06-22 13:24 [PATCH v3 0/4] ceph: periodically send perf metrics to ceph xiubli
2020-06-22 13:24 ` [PATCH v3 1/4] ceph: add check_session_state helper and make it global xiubli
2020-06-22 13:24 ` [PATCH v3 2/4] ceph: periodically send perf metrics to ceph xiubli
2020-06-23 17:24   ` Jeff Layton
2020-06-24  8:32     ` Xiubo Li
2020-06-24 10:11       ` Jeff Layton
2020-06-24 12:31         ` Xiubo Li
2020-06-22 13:24 ` [PATCH v3 3/4] ceph: switch to WARN_ON and bubble up errnos to the callers xiubli
2020-06-23 18:02   ` Jeff Layton
2020-06-24  0:34     ` Xiubo Li
2020-06-22 13:25 ` [PATCH v3 4/4] ceph: send client provided metric flags in client metadata xiubli

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.