* [PATCH v2 0/5] libceph: support for replica reads
@ 2020-05-30 15:34 Ilya Dryomov
  2020-05-30 15:34 ` [PATCH v2 1/5] libceph: add non-asserting rbtree insertion helper Ilya Dryomov
                   ` (5 more replies)
  0 siblings, 6 replies; 11+ messages in thread
From: Ilya Dryomov @ 2020-05-30 15:34 UTC (permalink / raw)
  To: ceph-devel; +Cc: Jeff Layton

Hello,

This adds support for replica reads (balanced and localized reads)
to rbd and ceph.  crush_location syntax is slightly different, see
patch 3 for details.
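
For example, one could pass something like the following option string
(hypothetical values, shown only to illustrate how the new options
combine):

  crush_location=host:foo|rack:bar,read_from_replica=localize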

v1 -> v2:
- change crush_location syntax
- rename read_policy to read_from_replica, add read_from_replica=no
- crush_location and read_from_replica are now overridable

Thanks,

                Ilya


Ilya Dryomov (5):
  libceph: add non-asserting rbtree insertion helper
  libceph: decode CRUSH device/bucket types and names
  libceph: crush_location infrastructure
  libceph: support for balanced and localized reads
  libceph: read_from_replica option

 include/linux/ceph/libceph.h    |  13 +-
 include/linux/ceph/osd_client.h |   1 +
 include/linux/ceph/osdmap.h     |  19 +-
 include/linux/crush/crush.h     |   6 +
 net/ceph/ceph_common.c          |  75 +++++++
 net/ceph/crush/crush.c          |   3 +
 net/ceph/debugfs.c              |   6 +-
 net/ceph/osd_client.c           |  92 +++++++-
 net/ceph/osdmap.c               | 363 +++++++++++++++++++++++++++-----
 9 files changed, 517 insertions(+), 61 deletions(-)

-- 
2.19.2

* [PATCH v2 1/5] libceph: add non-asserting rbtree insertion helper
  2020-05-30 15:34 [PATCH v2 0/5] libceph: support for replica reads Ilya Dryomov
@ 2020-05-30 15:34 ` Ilya Dryomov
  2020-05-30 15:34 ` [PATCH v2 2/5] libceph: decode CRUSH device/bucket types and names Ilya Dryomov
                   ` (4 subsequent siblings)
  5 siblings, 0 replies; 11+ messages in thread
From: Ilya Dryomov @ 2020-05-30 15:34 UTC (permalink / raw)
  To: ceph-devel; +Cc: Jeff Layton

Needed for the next commit and useful for ceph_pg_pool_info tree as
well.  I'm leaving the asserting helper in for now, but we should look
at getting rid of it in the future.
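
To illustrate the difference (a minimal sketch, not part of the patch;
"foo" and struct foo_node are made up):

  #include <linux/rbtree.h>
  #include <linux/ceph/libceph.h>  /* DEFINE_RB_FUNCS() */

  struct foo_node {
          struct rb_node node;
          u64 id;
  };

  DEFINE_RB_FUNCS(foo, struct foo_node, id, node)

  /*
   * insert_foo() keeps the old behaviour and BUGs on a duplicate id,
   * while __insert_foo() returns false and lets the caller decide.
   */
  static int add_foo(struct rb_root *root, struct foo_node *new)
  {
          if (!__insert_foo(root, new))
                  return -EEXIST;
          return 0;
  }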

Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
---
 include/linux/ceph/libceph.h | 10 ++++--
 net/ceph/osdmap.c            | 60 +++++++-----------------------------
 2 files changed, 19 insertions(+), 51 deletions(-)

diff --git a/include/linux/ceph/libceph.h b/include/linux/ceph/libceph.h
index 525b7c3f1c81..4b5a47bcaba4 100644
--- a/include/linux/ceph/libceph.h
+++ b/include/linux/ceph/libceph.h
@@ -188,7 +188,7 @@ static inline int calc_pages_for(u64 off, u64 len)
 #define RB_CMP3WAY(a, b) ((a) < (b) ? -1 : (a) > (b))
 
 #define DEFINE_RB_INSDEL_FUNCS2(name, type, keyfld, cmpexp, keyexp, nodefld) \
-static void insert_##name(struct rb_root *root, type *t)		\
+static bool __insert_##name(struct rb_root *root, type *t)		\
 {									\
 	struct rb_node **n = &root->rb_node;				\
 	struct rb_node *parent = NULL;					\
@@ -206,11 +206,17 @@ static void insert_##name(struct rb_root *root, type *t)		\
 		else if (cmp > 0)					\
 			n = &(*n)->rb_right;				\
 		else							\
-			BUG();						\
+			return false;					\
 	}								\
 									\
 	rb_link_node(&t->nodefld, parent, n);				\
 	rb_insert_color(&t->nodefld, root);				\
+	return true;							\
+}									\
+static void __maybe_unused insert_##name(struct rb_root *root, type *t)	\
+{									\
+	if (!__insert_##name(root, t))					\
+		BUG();							\
 }									\
 static void erase_##name(struct rb_root *root, type *t)			\
 {									\
diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c
index 2a6e63a8edbe..5d00ce2b5339 100644
--- a/net/ceph/osdmap.c
+++ b/net/ceph/osdmap.c
@@ -636,48 +636,11 @@ DEFINE_RB_FUNCS2(pg_mapping, struct ceph_pg_mapping, pgid, ceph_pg_compare,
 /*
  * rbtree of pg pool info
  */
-static int __insert_pg_pool(struct rb_root *root, struct ceph_pg_pool_info *new)
-{
-	struct rb_node **p = &root->rb_node;
-	struct rb_node *parent = NULL;
-	struct ceph_pg_pool_info *pi = NULL;
-
-	while (*p) {
-		parent = *p;
-		pi = rb_entry(parent, struct ceph_pg_pool_info, node);
-		if (new->id < pi->id)
-			p = &(*p)->rb_left;
-		else if (new->id > pi->id)
-			p = &(*p)->rb_right;
-		else
-			return -EEXIST;
-	}
-
-	rb_link_node(&new->node, parent, p);
-	rb_insert_color(&new->node, root);
-	return 0;
-}
-
-static struct ceph_pg_pool_info *__lookup_pg_pool(struct rb_root *root, u64 id)
-{
-	struct ceph_pg_pool_info *pi;
-	struct rb_node *n = root->rb_node;
-
-	while (n) {
-		pi = rb_entry(n, struct ceph_pg_pool_info, node);
-		if (id < pi->id)
-			n = n->rb_left;
-		else if (id > pi->id)
-			n = n->rb_right;
-		else
-			return pi;
-	}
-	return NULL;
-}
+DEFINE_RB_FUNCS(pg_pool, struct ceph_pg_pool_info, id, node)
 
 struct ceph_pg_pool_info *ceph_pg_pool_by_id(struct ceph_osdmap *map, u64 id)
 {
-	return __lookup_pg_pool(&map->pg_pools, id);
+	return lookup_pg_pool(&map->pg_pools, id);
 }
 
 const char *ceph_pg_pool_name_by_id(struct ceph_osdmap *map, u64 id)
@@ -690,8 +653,7 @@ const char *ceph_pg_pool_name_by_id(struct ceph_osdmap *map, u64 id)
 	if (WARN_ON_ONCE(id > (u64) INT_MAX))
 		return NULL;
 
-	pi = __lookup_pg_pool(&map->pg_pools, (int) id);
-
+	pi = lookup_pg_pool(&map->pg_pools, id);
 	return pi ? pi->name : NULL;
 }
 EXPORT_SYMBOL(ceph_pg_pool_name_by_id);
@@ -714,14 +676,14 @@ u64 ceph_pg_pool_flags(struct ceph_osdmap *map, u64 id)
 {
 	struct ceph_pg_pool_info *pi;
 
-	pi = __lookup_pg_pool(&map->pg_pools, id);
+	pi = lookup_pg_pool(&map->pg_pools, id);
 	return pi ? pi->flags : 0;
 }
 EXPORT_SYMBOL(ceph_pg_pool_flags);
 
 static void __remove_pg_pool(struct rb_root *root, struct ceph_pg_pool_info *pi)
 {
-	rb_erase(&pi->node, root);
+	erase_pg_pool(root, pi);
 	kfree(pi->name);
 	kfree(pi);
 }
@@ -903,7 +865,7 @@ static int decode_pool_names(void **p, void *end, struct ceph_osdmap *map)
 		ceph_decode_32_safe(p, end, len, bad);
 		dout("  pool %llu len %d\n", pool, len);
 		ceph_decode_need(p, end, len, bad);
-		pi = __lookup_pg_pool(&map->pg_pools, pool);
+		pi = lookup_pg_pool(&map->pg_pools, pool);
 		if (pi) {
 			char *name = kstrndup(*p, len, GFP_NOFS);
 
@@ -1154,18 +1116,18 @@ static int __decode_pools(void **p, void *end, struct ceph_osdmap *map,
 
 		ceph_decode_64_safe(p, end, pool, e_inval);
 
-		pi = __lookup_pg_pool(&map->pg_pools, pool);
+		pi = lookup_pg_pool(&map->pg_pools, pool);
 		if (!incremental || !pi) {
 			pi = kzalloc(sizeof(*pi), GFP_NOFS);
 			if (!pi)
 				return -ENOMEM;
 
+			RB_CLEAR_NODE(&pi->node);
 			pi->id = pool;
 
-			ret = __insert_pg_pool(&map->pg_pools, pi);
-			if (ret) {
+			if (!__insert_pg_pool(&map->pg_pools, pi)) {
 				kfree(pi);
-				return ret;
+				return -EEXIST;
 			}
 		}
 
@@ -1829,7 +1791,7 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end,
 		struct ceph_pg_pool_info *pi;
 
 		ceph_decode_64_safe(p, end, pool, e_inval);
-		pi = __lookup_pg_pool(&map->pg_pools, pool);
+		pi = lookup_pg_pool(&map->pg_pools, pool);
 		if (pi)
 			__remove_pg_pool(&map->pg_pools, pi);
 	}
-- 
2.19.2

* [PATCH v2 2/5] libceph: decode CRUSH device/bucket types and names
  2020-05-30 15:34 [PATCH v2 0/5] libceph: support for replica reads Ilya Dryomov
  2020-05-30 15:34 ` [PATCH v2 1/5] libceph: add non-asserting rbtree insertion helper Ilya Dryomov
@ 2020-05-30 15:34 ` Ilya Dryomov
  2020-06-01 10:49   ` Jeff Layton
  2020-05-30 15:34 ` [PATCH v2 3/5] libceph: crush_location infrastructure Ilya Dryomov
                   ` (3 subsequent siblings)
  5 siblings, 1 reply; 11+ messages in thread
From: Ilya Dryomov @ 2020-05-30 15:34 UTC (permalink / raw)
  To: ceph-devel; +Cc: Jeff Layton

These would be matched with the provided client location to calculate
the locality value.
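
For reference, each of the two maps decoded here is laid out on the
wire as follows (a sketch inferred from decode_crush_names() below,
not an authoritative format description):

  u32 n
  n * { s32 id, u32 name_len, char name[name_len] }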

Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
---
 include/linux/crush/crush.h |  6 +++
 net/ceph/crush/crush.c      |  3 ++
 net/ceph/osdmap.c           | 85 ++++++++++++++++++++++++++++++++++++-
 3 files changed, 92 insertions(+), 2 deletions(-)

diff --git a/include/linux/crush/crush.h b/include/linux/crush/crush.h
index 38b0e4d50ed9..29b0de2e202b 100644
--- a/include/linux/crush/crush.h
+++ b/include/linux/crush/crush.h
@@ -301,6 +301,12 @@ struct crush_map {
 
 	__u32 *choose_tries;
 #else
+	/* device/bucket type id -> type name (CrushWrapper::type_map) */
+	struct rb_root type_names;
+
+	/* device/bucket id -> name (CrushWrapper::name_map) */
+	struct rb_root names;
+
 	/* CrushWrapper::choose_args */
 	struct rb_root choose_args;
 #endif
diff --git a/net/ceph/crush/crush.c b/net/ceph/crush/crush.c
index 3d70244bc1b6..2e6b29fa8518 100644
--- a/net/ceph/crush/crush.c
+++ b/net/ceph/crush/crush.c
@@ -2,6 +2,7 @@
 #ifdef __KERNEL__
 # include <linux/slab.h>
 # include <linux/crush/crush.h>
+void clear_crush_names(struct rb_root *root);
 void clear_choose_args(struct crush_map *c);
 #else
 # include "crush_compat.h"
@@ -130,6 +131,8 @@ void crush_destroy(struct crush_map *map)
 #ifndef __KERNEL__
 	kfree(map->choose_tries);
 #else
+	clear_crush_names(&map->type_names);
+	clear_crush_names(&map->names);
 	clear_choose_args(map);
 #endif
 	kfree(map);
diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c
index 5d00ce2b5339..e74130876d3a 100644
--- a/net/ceph/osdmap.c
+++ b/net/ceph/osdmap.c
@@ -138,6 +138,79 @@ static int crush_decode_straw2_bucket(void **p, void *end,
 	return -EINVAL;
 }
 
+struct crush_name_node {
+	struct rb_node cn_node;
+	int cn_id;
+	char cn_name[];
+};
+
+static struct crush_name_node *alloc_crush_name(size_t name_len)
+{
+	struct crush_name_node *cn;
+
+	cn = kmalloc(sizeof(*cn) + name_len + 1, GFP_NOIO);
+	if (!cn)
+		return NULL;
+
+	RB_CLEAR_NODE(&cn->cn_node);
+	return cn;
+}
+
+static void free_crush_name(struct crush_name_node *cn)
+{
+	WARN_ON(!RB_EMPTY_NODE(&cn->cn_node));
+
+	kfree(cn);
+}
+
+DEFINE_RB_FUNCS(crush_name, struct crush_name_node, cn_id, cn_node)
+
+static int decode_crush_names(void **p, void *end, struct rb_root *root)
+{
+	u32 n;
+
+	ceph_decode_32_safe(p, end, n, e_inval);
+	while (n--) {
+		struct crush_name_node *cn;
+		int id;
+		u32 name_len;
+
+		ceph_decode_32_safe(p, end, id, e_inval);
+		ceph_decode_32_safe(p, end, name_len, e_inval);
+		ceph_decode_need(p, end, name_len, e_inval);
+
+		cn = alloc_crush_name(name_len);
+		if (!cn)
+			return -ENOMEM;
+
+		cn->cn_id = id;
+		memcpy(cn->cn_name, *p, name_len);
+		cn->cn_name[name_len] = '\0';
+		*p += name_len;
+
+		if (!__insert_crush_name(root, cn)) {
+			free_crush_name(cn);
+			return -EEXIST;
+		}
+	}
+
+	return 0;
+
+e_inval:
+	return -EINVAL;
+}
+
+void clear_crush_names(struct rb_root *root)
+{
+	while (!RB_EMPTY_ROOT(root)) {
+		struct crush_name_node *cn =
+		    rb_entry(rb_first(root), struct crush_name_node, cn_node);
+
+		erase_crush_name(root, cn);
+		free_crush_name(cn);
+	}
+}
+
 static struct crush_choose_arg_map *alloc_choose_arg_map(void)
 {
 	struct crush_choose_arg_map *arg_map;
@@ -354,6 +427,8 @@ static struct crush_map *crush_decode(void *pbyval, void *end)
 	if (c == NULL)
 		return ERR_PTR(-ENOMEM);
 
+	c->type_names = RB_ROOT;
+	c->names = RB_ROOT;
 	c->choose_args = RB_ROOT;
 
         /* set tunables to default values */
@@ -510,8 +585,14 @@ static struct crush_map *crush_decode(void *pbyval, void *end)
 		}
 	}
 
-	ceph_decode_skip_map(p, end, 32, string, bad); /* type_map */
-	ceph_decode_skip_map(p, end, 32, string, bad); /* name_map */
+	err = decode_crush_names(p, end, &c->type_names);
+	if (err)
+		goto fail;
+
+	err = decode_crush_names(p, end, &c->names);
+	if (err)
+		goto fail;
+
 	ceph_decode_skip_map(p, end, 32, string, bad); /* rule_name_map */
 
         /* tunables */
-- 
2.19.2

* [PATCH v2 3/5] libceph: crush_location infrastructure
  2020-05-30 15:34 [PATCH v2 0/5] libceph: support for replica reads Ilya Dryomov
  2020-05-30 15:34 ` [PATCH v2 1/5] libceph: add non-asserting rbtree insertion helper Ilya Dryomov
  2020-05-30 15:34 ` [PATCH v2 2/5] libceph: decode CRUSH device/bucket types and names Ilya Dryomov
@ 2020-05-30 15:34 ` Ilya Dryomov
  2020-05-31 13:27   ` Jeff Layton
  2020-05-30 15:34 ` [PATCH v2 4/5] libceph: support for balanced and localized reads Ilya Dryomov
                   ` (2 subsequent siblings)
  5 siblings, 1 reply; 11+ messages in thread
From: Ilya Dryomov @ 2020-05-30 15:34 UTC (permalink / raw)
  To: ceph-devel; +Cc: Jeff Layton

Allow expressing client's location in terms of CRUSH hierarchy as
a set of (bucket type name, bucket name) pairs.  The userspace syntax
"crush_location = key1=value1 key2=value2" is incompatible with mount
options and needed adaptation.  Key-value pairs are separated by '|'
and we use ':' instead of '=' to separate keys from values.  So for:

  crush_location = host=foo rack=bar

one would write:

  crush_location=host:foo|rack:bar

As in userspace, "multipath" locations are supported, so indicating
locality for parallel hierarchies is possible:

  crush_location=rack:foo1|rack:foo2|datacenter:bar
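
A minimal caller sketch (illustrative only -- the real caller added by
this patch is the Opt_crush_location case in ceph_parse_param()):

  struct rb_root locs = RB_ROOT;
  char buf[] = "host:foo|rack:bar";  /* strsep() modifies the buffer */
  int err;

  err = ceph_parse_crush_location(buf, &locs);
  if (err)
          return err;  /* -EINVAL, -ENOMEM or -EEXIST */

  /* ... use the tree, e.g. ceph_get_crush_locality() in patch 4 ... */

  ceph_clear_crush_locs(&locs);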

Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
---
 include/linux/ceph/libceph.h |   1 +
 include/linux/ceph/osdmap.h  |  16 ++++-
 net/ceph/ceph_common.c       |  36 +++++++++++
 net/ceph/osdmap.c            | 116 +++++++++++++++++++++++++++++++++++
 4 files changed, 168 insertions(+), 1 deletion(-)

diff --git a/include/linux/ceph/libceph.h b/include/linux/ceph/libceph.h
index 4b5a47bcaba4..4733959f1ec7 100644
--- a/include/linux/ceph/libceph.h
+++ b/include/linux/ceph/libceph.h
@@ -64,6 +64,7 @@ struct ceph_options {
 	int num_mon;
 	char *name;
 	struct ceph_crypto_key *key;
+	struct rb_root crush_locs;
 };
 
 /*
diff --git a/include/linux/ceph/osdmap.h b/include/linux/ceph/osdmap.h
index 5e601975745f..8c9d18cc9f45 100644
--- a/include/linux/ceph/osdmap.h
+++ b/include/linux/ceph/osdmap.h
@@ -302,9 +302,23 @@ bool ceph_pg_to_primary_shard(struct ceph_osdmap *osdmap,
 int ceph_pg_to_acting_primary(struct ceph_osdmap *osdmap,
 			      const struct ceph_pg *raw_pgid);
 
+struct crush_loc {
+	char *cl_type_name;
+	char *cl_name;
+};
+
+struct crush_loc_node {
+	struct rb_node cl_node;
+	struct crush_loc cl_loc;  /* pointers into cl_data */
+	char cl_data[];
+};
+
+int ceph_parse_crush_location(char *crush_location, struct rb_root *locs);
+int ceph_compare_crush_locs(struct rb_root *locs1, struct rb_root *locs2);
+void ceph_clear_crush_locs(struct rb_root *locs);
+
 extern struct ceph_pg_pool_info *ceph_pg_pool_by_id(struct ceph_osdmap *map,
 						    u64 id);
-
 extern const char *ceph_pg_pool_name_by_id(struct ceph_osdmap *map, u64 id);
 extern int ceph_pg_poolid_by_name(struct ceph_osdmap *map, const char *name);
 u64 ceph_pg_pool_flags(struct ceph_osdmap *map, u64 id);
diff --git a/net/ceph/ceph_common.c b/net/ceph/ceph_common.c
index a0e97f6c1072..44770b60bc38 100644
--- a/net/ceph/ceph_common.c
+++ b/net/ceph/ceph_common.c
@@ -176,6 +176,10 @@ int ceph_compare_options(struct ceph_options *new_opt,
 		}
 	}
 
+	ret = ceph_compare_crush_locs(&opt1->crush_locs, &opt2->crush_locs);
+	if (ret)
+		return ret;
+
 	/* any matching mon ip implies a match */
 	for (i = 0; i < opt1->num_mon; i++) {
 		if (ceph_monmap_contains(client->monc.monmap,
@@ -260,6 +264,7 @@ enum {
 	Opt_secret,
 	Opt_key,
 	Opt_ip,
+	Opt_crush_location,
 	/* string args above */
 	Opt_share,
 	Opt_crc,
@@ -274,6 +279,7 @@ static const struct fs_parameter_spec ceph_parameters[] = {
 	fsparam_flag_no ("cephx_require_signatures",	Opt_cephx_require_signatures),
 	fsparam_flag_no ("cephx_sign_messages",		Opt_cephx_sign_messages),
 	fsparam_flag_no ("crc",				Opt_crc),
+	fsparam_string	("crush_location",		Opt_crush_location),
 	fsparam_string	("fsid",			Opt_fsid),
 	fsparam_string	("ip",				Opt_ip),
 	fsparam_string	("key",				Opt_key),
@@ -298,6 +304,7 @@ struct ceph_options *ceph_alloc_options(void)
 	if (!opt)
 		return NULL;
 
+	opt->crush_locs = RB_ROOT;
 	opt->mon_addr = kcalloc(CEPH_MAX_MON, sizeof(*opt->mon_addr),
 				GFP_KERNEL);
 	if (!opt->mon_addr) {
@@ -320,6 +327,7 @@ void ceph_destroy_options(struct ceph_options *opt)
 	if (!opt)
 		return;
 
+	ceph_clear_crush_locs(&opt->crush_locs);
 	kfree(opt->name);
 	if (opt->key) {
 		ceph_crypto_key_destroy(opt->key);
@@ -454,6 +462,16 @@ int ceph_parse_param(struct fs_parameter *param, struct ceph_options *opt,
 		if (!opt->key)
 			return -ENOMEM;
 		return get_secret(opt->key, param->string, &log);
+	case Opt_crush_location:
+		ceph_clear_crush_locs(&opt->crush_locs);
+		err = ceph_parse_crush_location(param->string,
+						&opt->crush_locs);
+		if (err) {
+			error_plog(&log, "Failed to parse CRUSH location: %d",
+				   err);
+			return err;
+		}
+		break;
 
 	case Opt_osdtimeout:
 		warn_plog(&log, "Ignoring osdtimeout");
@@ -536,6 +554,7 @@ int ceph_print_client_options(struct seq_file *m, struct ceph_client *client,
 {
 	struct ceph_options *opt = client->options;
 	size_t pos = m->count;
+	struct rb_node *n;
 
 	if (opt->name) {
 		seq_puts(m, "name=");
@@ -545,6 +564,23 @@ int ceph_print_client_options(struct seq_file *m, struct ceph_client *client,
 	if (opt->key)
 		seq_puts(m, "secret=<hidden>,");
 
+	if (!RB_EMPTY_ROOT(&opt->crush_locs)) {
+		seq_puts(m, "crush_location=");
+		for (n = rb_first(&opt->crush_locs); ; ) {
+			struct crush_loc_node *loc =
+			    rb_entry(n, struct crush_loc_node, cl_node);
+
+			seq_printf(m, "%s:%s", loc->cl_loc.cl_type_name,
+				   loc->cl_loc.cl_name);
+			n = rb_next(n);
+			if (!n)
+				break;
+
+			seq_putc(m, '|');
+		}
+		seq_putc(m, ',');
+	}
+
 	if (opt->flags & CEPH_OPT_FSID)
 		seq_printf(m, "fsid=%pU,", &opt->fsid);
 	if (opt->flags & CEPH_OPT_NOSHARE)
diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c
index e74130876d3a..4b81334e9e5b 100644
--- a/net/ceph/osdmap.c
+++ b/net/ceph/osdmap.c
@@ -2715,3 +2715,119 @@ int ceph_pg_to_acting_primary(struct ceph_osdmap *osdmap,
 	return acting.primary;
 }
 EXPORT_SYMBOL(ceph_pg_to_acting_primary);
+
+static struct crush_loc_node *alloc_crush_loc(size_t type_name_len,
+					      size_t name_len)
+{
+	struct crush_loc_node *loc;
+
+	loc = kmalloc(sizeof(*loc) + type_name_len + name_len + 2, GFP_NOIO);
+	if (!loc)
+		return NULL;
+
+	RB_CLEAR_NODE(&loc->cl_node);
+	return loc;
+}
+
+static void free_crush_loc(struct crush_loc_node *loc)
+{
+	WARN_ON(!RB_EMPTY_NODE(&loc->cl_node));
+
+	kfree(loc);
+}
+
+static int crush_loc_compare(const struct crush_loc *loc1,
+			     const struct crush_loc *loc2)
+{
+	return strcmp(loc1->cl_type_name, loc2->cl_type_name) ?:
+	       strcmp(loc1->cl_name, loc2->cl_name);
+}
+
+DEFINE_RB_FUNCS2(crush_loc, struct crush_loc_node, cl_loc, crush_loc_compare,
+		 RB_BYPTR, const struct crush_loc *, cl_node)
+
+/*
+ * Parses a set of <bucket type name>':'<bucket name> pairs separated
+ * by '|', e.g. "rack:foo1|rack:foo2|datacenter:bar".
+ *
+ * Note that @crush_location is modified by strsep().
+ */
+int ceph_parse_crush_location(char *crush_location, struct rb_root *locs)
+{
+	struct crush_loc_node *loc;
+	const char *type_name, *name, *colon;
+	size_t type_name_len, name_len;
+
+	dout("%s '%s'\n", __func__, crush_location);
+	while ((type_name = strsep(&crush_location, "|"))) {
+		colon = strchr(type_name, ':');
+		if (!colon)
+			return -EINVAL;
+
+		type_name_len = colon - type_name;
+		if (type_name_len == 0)
+			return -EINVAL;
+
+		name = colon + 1;
+		name_len = strlen(name);
+		if (name_len == 0)
+			return -EINVAL;
+
+		loc = alloc_crush_loc(type_name_len, name_len);
+		if (!loc)
+			return -ENOMEM;
+
+		loc->cl_loc.cl_type_name = loc->cl_data;
+		memcpy(loc->cl_loc.cl_type_name, type_name, type_name_len);
+		loc->cl_loc.cl_type_name[type_name_len] = '\0';
+
+		loc->cl_loc.cl_name = loc->cl_data + type_name_len + 1;
+		memcpy(loc->cl_loc.cl_name, name, name_len);
+		loc->cl_loc.cl_name[name_len] = '\0';
+
+		if (!__insert_crush_loc(locs, loc)) {
+			free_crush_loc(loc);
+			return -EEXIST;
+		}
+
+		dout("%s type_name '%s' name '%s'\n", __func__,
+		     loc->cl_loc.cl_type_name, loc->cl_loc.cl_name);
+	}
+
+	return 0;
+}
+
+int ceph_compare_crush_locs(struct rb_root *locs1, struct rb_root *locs2)
+{
+	struct rb_node *n1 = rb_first(locs1);
+	struct rb_node *n2 = rb_first(locs2);
+	int ret;
+
+	for ( ; n1 && n2; n1 = rb_next(n1), n2 = rb_next(n2)) {
+		struct crush_loc_node *loc1 =
+		    rb_entry(n1, struct crush_loc_node, cl_node);
+		struct crush_loc_node *loc2 =
+		    rb_entry(n2, struct crush_loc_node, cl_node);
+
+		ret = crush_loc_compare(&loc1->cl_loc, &loc2->cl_loc);
+		if (ret)
+			return ret;
+	}
+
+	if (!n1 && n2)
+		return -1;
+	if (n1 && !n2)
+		return 1;
+	return 0;
+}
+
+void ceph_clear_crush_locs(struct rb_root *locs)
+{
+	while (!RB_EMPTY_ROOT(locs)) {
+		struct crush_loc_node *loc =
+		    rb_entry(rb_first(locs), struct crush_loc_node, cl_node);
+
+		erase_crush_loc(locs, loc);
+		free_crush_loc(loc);
+	}
+}
-- 
2.19.2

* [PATCH v2 4/5] libceph: support for balanced and localized reads
  2020-05-30 15:34 [PATCH v2 0/5] libceph: support for replica reads Ilya Dryomov
                   ` (2 preceding siblings ...)
  2020-05-30 15:34 ` [PATCH v2 3/5] libceph: crush_location infrastructure Ilya Dryomov
@ 2020-05-30 15:34 ` Ilya Dryomov
  2020-05-30 15:34 ` [PATCH v2 5/5] libceph: read_from_replica option Ilya Dryomov
  2020-06-01 10:57 ` [PATCH v2 0/5] libceph: support for replica reads Jeff Layton
  5 siblings, 0 replies; 11+ messages in thread
From: Ilya Dryomov @ 2020-05-30 15:34 UTC (permalink / raw)
  To: ceph-devel; +Cc: Jeff Layton

OSD-side issues with reads from replica have been resolved in
Octopus.  Reading from replica should be safe wrt. unstable or
uncommitted state now, so add support for balanced and localized
reads.

There are two cases when a read from replica can't be served:

- OSD may silently drop the request, expecting the client to
  notice that the acting set has changed and resend via the usual
  means (handled with t->used_replica)

- OSD may return EAGAIN, expecting the client to resend to the
  primary, ignoring replica read flags (see handle_reply())
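
For example (illustrative only), with an acting set of [osd1 (primary),
osd4, osd7]:

- a balanced read is sent to any of the three OSDs with roughly equal
  probability

- a localized read is sent to the OSD with the lowest non-negative
  locality with respect to the client's crush_location, with the
  primary preferred on a tie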

Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
---
 include/linux/ceph/osd_client.h |   1 +
 include/linux/ceph/osdmap.h     |   3 +
 net/ceph/debugfs.c              |   6 +-
 net/ceph/osd_client.c           |  87 +++++++++++++++++++++++++--
 net/ceph/osdmap.c               | 102 ++++++++++++++++++++++++++++++++
 5 files changed, 193 insertions(+), 6 deletions(-)

diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h
index 734f7c6a9f56..671fb93e8c60 100644
--- a/include/linux/ceph/osd_client.h
+++ b/include/linux/ceph/osd_client.h
@@ -165,6 +165,7 @@ struct ceph_osd_request_target {
 	bool recovery_deletes;
 
 	unsigned int flags;                /* CEPH_OSD_FLAG_* */
+	bool used_replica;
 	bool paused;
 
 	u32 epoch;
diff --git a/include/linux/ceph/osdmap.h b/include/linux/ceph/osdmap.h
index 8c9d18cc9f45..3f4498fef6ad 100644
--- a/include/linux/ceph/osdmap.h
+++ b/include/linux/ceph/osdmap.h
@@ -317,6 +317,9 @@ int ceph_parse_crush_location(char *crush_location, struct rb_root *locs);
 int ceph_compare_crush_locs(struct rb_root *locs1, struct rb_root *locs2);
 void ceph_clear_crush_locs(struct rb_root *locs);
 
+int ceph_get_crush_locality(struct ceph_osdmap *osdmap, int id,
+			    struct rb_root *locs);
+
 extern struct ceph_pg_pool_info *ceph_pg_pool_by_id(struct ceph_osdmap *map,
 						    u64 id);
 extern const char *ceph_pg_pool_name_by_id(struct ceph_osdmap *map, u64 id);
diff --git a/net/ceph/debugfs.c b/net/ceph/debugfs.c
index 1344f232ecc5..409d505ff320 100644
--- a/net/ceph/debugfs.c
+++ b/net/ceph/debugfs.c
@@ -81,11 +81,13 @@ static int osdmap_show(struct seq_file *s, void *p)
 		u32 state = map->osd_state[i];
 		char sb[64];
 
-		seq_printf(s, "osd%d\t%s\t%3d%%\t(%s)\t%3d%%\n",
+		seq_printf(s, "osd%d\t%s\t%3d%%\t(%s)\t%3d%%\t%2d\n",
 			   i, ceph_pr_addr(addr),
 			   ((map->osd_weight[i]*100) >> 16),
 			   ceph_osdmap_state_str(sb, sizeof(sb), state),
-			   ((ceph_get_primary_affinity(map, i)*100) >> 16));
+			   ((ceph_get_primary_affinity(map, i)*100) >> 16),
+			   ceph_get_crush_locality(map, i,
+					   &client->options->crush_locs));
 	}
 	for (n = rb_first(&map->pg_temp); n; n = rb_next(n)) {
 		struct ceph_pg_mapping *pg =
diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c
index ece124a5138e..4ce6cdc744e4 100644
--- a/net/ceph/osd_client.c
+++ b/net/ceph/osd_client.c
@@ -1497,6 +1497,45 @@ static bool target_should_be_paused(struct ceph_osd_client *osdc,
 	       (osdc->osdmap->epoch < osdc->epoch_barrier);
 }
 
+static int pick_random_replica(const struct ceph_osds *acting)
+{
+	int i = prandom_u32() % acting->size;
+
+	dout("%s picked osd%d, primary osd%d\n", __func__,
+	     acting->osds[i], acting->primary);
+	return i;
+}
+
+/*
+ * Picks the closest replica based on client's location given by
+ * crush_location option.  Prefers the primary if the locality is
+ * the same.
+ */
+static int pick_closest_replica(struct ceph_osd_client *osdc,
+				const struct ceph_osds *acting)
+{
+	struct ceph_options *opt = osdc->client->options;
+	int best_i, best_locality;
+	int i = 0, locality;
+
+	do {
+		locality = ceph_get_crush_locality(osdc->osdmap,
+						   acting->osds[i],
+						   &opt->crush_locs);
+		if (i == 0 ||
+		    (locality >= 0 && best_locality < 0) ||
+		    (locality >= 0 && best_locality >= 0 &&
+		     locality < best_locality)) {
+			best_i = i;
+			best_locality = locality;
+		}
+	} while (++i < acting->size);
+
+	dout("%s picked osd%d with locality %d, primary osd%d\n", __func__,
+	     acting->osds[best_i], best_locality, acting->primary);
+	return best_i;
+}
+
 enum calc_target_result {
 	CALC_TARGET_NO_ACTION = 0,
 	CALC_TARGET_NEED_RESEND,
@@ -1510,6 +1549,8 @@ static enum calc_target_result calc_target(struct ceph_osd_client *osdc,
 	struct ceph_pg_pool_info *pi;
 	struct ceph_pg pgid, last_pgid;
 	struct ceph_osds up, acting;
+	bool is_read = t->flags & CEPH_OSD_FLAG_READ;
+	bool is_write = t->flags & CEPH_OSD_FLAG_WRITE;
 	bool force_resend = false;
 	bool unpaused = false;
 	bool legacy_change = false;
@@ -1540,9 +1581,9 @@ static enum calc_target_result calc_target(struct ceph_osd_client *osdc,
 	ceph_oid_copy(&t->target_oid, &t->base_oid);
 	ceph_oloc_copy(&t->target_oloc, &t->base_oloc);
 	if ((t->flags & CEPH_OSD_FLAG_IGNORE_OVERLAY) == 0) {
-		if (t->flags & CEPH_OSD_FLAG_READ && pi->read_tier >= 0)
+		if (is_read && pi->read_tier >= 0)
 			t->target_oloc.pool = pi->read_tier;
-		if (t->flags & CEPH_OSD_FLAG_WRITE && pi->write_tier >= 0)
+		if (is_write && pi->write_tier >= 0)
 			t->target_oloc.pool = pi->write_tier;
 
 		pi = ceph_pg_pool_by_id(osdc->osdmap, t->target_oloc.pool);
@@ -1581,7 +1622,8 @@ static enum calc_target_result calc_target(struct ceph_osd_client *osdc,
 		unpaused = true;
 	}
 	legacy_change = ceph_pg_compare(&t->pgid, &pgid) ||
-			ceph_osds_changed(&t->acting, &acting, any_change);
+			ceph_osds_changed(&t->acting, &acting,
+					  t->used_replica || any_change);
 	if (t->pg_num)
 		split = ceph_pg_is_split(&last_pgid, t->pg_num, pi->pg_num);
 
@@ -1597,7 +1639,24 @@ static enum calc_target_result calc_target(struct ceph_osd_client *osdc,
 		t->sort_bitwise = sort_bitwise;
 		t->recovery_deletes = recovery_deletes;
 
-		t->osd = acting.primary;
+		if ((t->flags & (CEPH_OSD_FLAG_BALANCE_READS |
+				 CEPH_OSD_FLAG_LOCALIZE_READS)) &&
+		    !is_write && pi->type == CEPH_POOL_TYPE_REP &&
+		    acting.size > 1) {
+			int pos;
+
+			WARN_ON(!is_read || acting.osds[0] != acting.primary);
+			if (t->flags & CEPH_OSD_FLAG_BALANCE_READS) {
+				pos = pick_random_replica(&acting);
+			} else {
+				pos = pick_closest_replica(osdc, &acting);
+			}
+			t->osd = acting.osds[pos];
+			t->used_replica = pos > 0;
+		} else {
+			t->osd = acting.primary;
+			t->used_replica = false;
+		}
 	}
 
 	if (unpaused || legacy_change || force_resend || split)
@@ -3660,6 +3719,26 @@ static void handle_reply(struct ceph_osd *osd, struct ceph_msg *msg)
 		goto out_unlock_osdc;
 	}
 
+	if (m.result == -EAGAIN) {
+		dout("req %p tid %llu EAGAIN\n", req, req->r_tid);
+		unlink_request(osd, req);
+		mutex_unlock(&osd->lock);
+
+		/*
+		 * The object is missing on the replica or not (yet)
+		 * readable.  Clear pgid to force a resend to the primary
+		 * via legacy_change.
+		 */
+		req->r_t.pgid.pool = 0;
+		req->r_t.pgid.seed = 0;
+		WARN_ON(!req->r_t.used_replica);
+		req->r_flags &= ~(CEPH_OSD_FLAG_BALANCE_READS |
+				  CEPH_OSD_FLAG_LOCALIZE_READS);
+		req->r_tid = 0;
+		__submit_request(req, false);
+		goto out_unlock_osdc;
+	}
+
 	if (m.num_ops != req->r_num_ops) {
 		pr_err("num_ops %d != %d for tid %llu\n", m.num_ops,
 		       req->r_num_ops, req->r_tid);
diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c
index 4b81334e9e5b..96c25f5e064a 100644
--- a/net/ceph/osdmap.c
+++ b/net/ceph/osdmap.c
@@ -2831,3 +2831,105 @@ void ceph_clear_crush_locs(struct rb_root *locs)
 		free_crush_loc(loc);
 	}
 }
+
+/*
+ * [a-zA-Z0-9-_.]+
+ */
+static bool is_valid_crush_name(const char *name)
+{
+	do {
+		if (!('a' <= *name && *name <= 'z') &&
+		    !('A' <= *name && *name <= 'Z') &&
+		    !('0' <= *name && *name <= '9') &&
+		    *name != '-' && *name != '_' && *name != '.')
+			return false;
+	} while (*++name != '\0');
+
+	return true;
+}
+
+/*
+ * Gets the parent of an item.  Returns its id (<0 because the
+ * parent is always a bucket), type id (>0 for the same reason,
+ * via @parent_type_id) and location (via @parent_loc).  If no
+ * parent, returns 0.
+ *
+ * Does a linear search, as there are no parent pointers of any
+ * kind.  Note that the result is ambiguous for items that occur
+ * multiple times in the map.
+ */
+static int get_immediate_parent(struct crush_map *c, int id,
+				u16 *parent_type_id,
+				struct crush_loc *parent_loc)
+{
+	struct crush_bucket *b;
+	struct crush_name_node *type_cn, *cn;
+	int i, j;
+
+	for (i = 0; i < c->max_buckets; i++) {
+		b = c->buckets[i];
+		if (!b)
+			continue;
+
+		/* ignore per-class shadow hierarchy */
+		cn = lookup_crush_name(&c->names, b->id);
+		if (!cn || !is_valid_crush_name(cn->cn_name))
+			continue;
+
+		for (j = 0; j < b->size; j++) {
+			if (b->items[j] != id)
+				continue;
+
+			*parent_type_id = b->type;
+			type_cn = lookup_crush_name(&c->type_names, b->type);
+			parent_loc->cl_type_name = type_cn->cn_name;
+			parent_loc->cl_name = cn->cn_name;
+			return b->id;
+		}
+	}
+
+	return 0;  /* no parent */
+}
+
+/*
+ * Calculates the locality/distance from an item to a client
+ * location expressed in terms of CRUSH hierarchy as a set of
+ * (bucket type name, bucket name) pairs.  Specifically, looks
+ * for the lowest-valued bucket type for which the location of
+ * @id matches one of the locations in @locs, so for standard
+ * bucket types (host = 1, rack = 3, datacenter = 8, zone = 9)
+ * a matching host is closer than a matching rack and a matching
+ * data center is closer than a matching zone.
+ *
+ * Specifying multiple locations (a "multipath" location) such
+ * as "rack=foo1 rack=foo2 datacenter=bar" is allowed -- @locs
+ * is a multimap.  The locality will be:
+ *
+ * - 3 for OSDs in racks foo1 and foo2
+ * - 8 for OSDs in data center bar
+ * - -1 for all other OSDs
+ *
+ * The lowest possible bucket type is 1, so the best locality
+ * for an OSD is 1 (i.e. a matching host).  Locality 0 would be
+ * the OSD itself.
+ */
+int ceph_get_crush_locality(struct ceph_osdmap *osdmap, int id,
+			    struct rb_root *locs)
+{
+	struct crush_loc loc;
+	u16 type_id;
+
+	/*
+	 * Instead of repeated get_immediate_parent() calls,
+	 * the location of @id could be obtained with a single
+	 * depth-first traversal.
+	 */
+	for (;;) {
+		id = get_immediate_parent(osdmap->crush, id, &type_id, &loc);
+		if (id >= 0)
+			return -1;  /* not local */
+
+		if (lookup_crush_loc(locs, &loc))
+			return type_id;
+	}
+}
-- 
2.19.2

* [PATCH v2 5/5] libceph: read_from_replica option
  2020-05-30 15:34 [PATCH v2 0/5] libceph: support for replica reads Ilya Dryomov
                   ` (3 preceding siblings ...)
  2020-05-30 15:34 ` [PATCH v2 4/5] libceph: support for balanced and localized reads Ilya Dryomov
@ 2020-05-30 15:34 ` Ilya Dryomov
  2020-06-01 10:57 ` [PATCH v2 0/5] libceph: support for replica reads Jeff Layton
  5 siblings, 0 replies; 11+ messages in thread
From: Ilya Dryomov @ 2020-05-30 15:34 UTC (permalink / raw)
  To: ceph-devel; +Cc: Jeff Layton

Expose replica reads through read_from_replica=balance and
read_from_replica=localize.  The default is to read from primary
(read_from_replica=no).
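
For example (illustrative option strings only):

  read_from_replica=balance

spreads reads across the acting set, while an explicit

  read_from_replica=no

requests the default behaviour of always reading from the primary
(useful for overriding a previously specified value, per the cover
letter).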

Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
---
 include/linux/ceph/libceph.h |  2 ++
 net/ceph/ceph_common.c       | 39 ++++++++++++++++++++++++++++++++++++
 net/ceph/osd_client.c        |  5 ++++-
 3 files changed, 45 insertions(+), 1 deletion(-)

diff --git a/include/linux/ceph/libceph.h b/include/linux/ceph/libceph.h
index 4733959f1ec7..0a9f807ceda6 100644
--- a/include/linux/ceph/libceph.h
+++ b/include/linux/ceph/libceph.h
@@ -52,6 +52,8 @@ struct ceph_options {
 	unsigned long osd_idle_ttl;		/* jiffies */
 	unsigned long osd_keepalive_timeout;	/* jiffies */
 	unsigned long osd_request_timeout;	/* jiffies */
+	unsigned int osd_req_flags;  /* CEPH_OSD_FLAG_*, applied to
+					each OSD request */
 
 	/*
 	 * any type that can't be simply compared or doesn't need
diff --git a/net/ceph/ceph_common.c b/net/ceph/ceph_common.c
index 44770b60bc38..9bab3e9a039b 100644
--- a/net/ceph/ceph_common.c
+++ b/net/ceph/ceph_common.c
@@ -265,6 +265,7 @@ enum {
 	Opt_key,
 	Opt_ip,
 	Opt_crush_location,
+	Opt_read_from_replica,
 	/* string args above */
 	Opt_share,
 	Opt_crc,
@@ -274,6 +275,19 @@ enum {
 	Opt_abort_on_full,
 };
 
+enum {
+	Opt_read_from_replica_no,
+	Opt_read_from_replica_balance,
+	Opt_read_from_replica_localize,
+};
+
+static const struct constant_table ceph_param_read_from_replica[] = {
+	{"no",		Opt_read_from_replica_no},
+	{"balance",	Opt_read_from_replica_balance},
+	{"localize",	Opt_read_from_replica_localize},
+	{}
+};
+
 static const struct fs_parameter_spec ceph_parameters[] = {
 	fsparam_flag	("abort_on_full",		Opt_abort_on_full),
 	fsparam_flag_no ("cephx_require_signatures",	Opt_cephx_require_signatures),
@@ -290,6 +304,8 @@ static const struct fs_parameter_spec ceph_parameters[] = {
 	fsparam_u32	("osdkeepalive",		Opt_osdkeepalivetimeout),
 	__fsparam	(fs_param_is_s32, "osdtimeout", Opt_osdtimeout,
 			 fs_param_deprecated, NULL),
+	fsparam_enum	("read_from_replica",		Opt_read_from_replica,
+			 ceph_param_read_from_replica),
 	fsparam_string	("secret",			Opt_secret),
 	fsparam_flag_no ("share",			Opt_share),
 	fsparam_flag_no ("tcp_nodelay",			Opt_tcp_nodelay),
@@ -472,6 +488,24 @@ int ceph_parse_param(struct fs_parameter *param, struct ceph_options *opt,
 			return err;
 		}
 		break;
+	case Opt_read_from_replica:
+		switch (result.uint_32) {
+		case Opt_read_from_replica_no:
+			opt->osd_req_flags &= ~(CEPH_OSD_FLAG_BALANCE_READS |
+						CEPH_OSD_FLAG_LOCALIZE_READS);
+			break;
+		case Opt_read_from_replica_balance:
+			opt->osd_req_flags |= CEPH_OSD_FLAG_BALANCE_READS;
+			opt->osd_req_flags &= ~CEPH_OSD_FLAG_LOCALIZE_READS;
+			break;
+		case Opt_read_from_replica_localize:
+			opt->osd_req_flags |= CEPH_OSD_FLAG_LOCALIZE_READS;
+			opt->osd_req_flags &= ~CEPH_OSD_FLAG_BALANCE_READS;
+			break;
+		default:
+			BUG();
+		}
+		break;
 
 	case Opt_osdtimeout:
 		warn_plog(&log, "Ignoring osdtimeout");
@@ -580,6 +614,11 @@ int ceph_print_client_options(struct seq_file *m, struct ceph_client *client,
 		}
 		seq_putc(m, ',');
 	}
+	if (opt->osd_req_flags & CEPH_OSD_FLAG_BALANCE_READS) {
+		seq_puts(m, "read_from_replica=balance,");
+	} else if (opt->osd_req_flags & CEPH_OSD_FLAG_LOCALIZE_READS) {
+		seq_puts(m, "read_from_replica=localize,");
+	}
 
 	if (opt->flags & CEPH_OPT_FSID)
 		seq_printf(m, "fsid=%pU,", &opt->fsid);
diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c
index 4ce6cdc744e4..22733e844be1 100644
--- a/net/ceph/osd_client.c
+++ b/net/ceph/osd_client.c
@@ -2425,11 +2425,14 @@ static void __submit_request(struct ceph_osd_request *req, bool wrlocked)
 
 static void account_request(struct ceph_osd_request *req)
 {
+	struct ceph_osd_client *osdc = req->r_osdc;
+
 	WARN_ON(req->r_flags & (CEPH_OSD_FLAG_ACK | CEPH_OSD_FLAG_ONDISK));
 	WARN_ON(!(req->r_flags & (CEPH_OSD_FLAG_READ | CEPH_OSD_FLAG_WRITE)));
 
 	req->r_flags |= CEPH_OSD_FLAG_ONDISK;
-	atomic_inc(&req->r_osdc->num_requests);
+	req->r_flags |= osdc->client->options->osd_req_flags;
+	atomic_inc(&osdc->num_requests);
 
 	req->r_start_stamp = jiffies;
 	req->r_start_latency = ktime_get();
-- 
2.19.2

* Re: [PATCH v2 3/5] libceph: crush_location infrastructure
  2020-05-30 15:34 ` [PATCH v2 3/5] libceph: crush_location infrastructure Ilya Dryomov
@ 2020-05-31 13:27   ` Jeff Layton
  2020-05-31 21:07     ` Ilya Dryomov
  0 siblings, 1 reply; 11+ messages in thread
From: Jeff Layton @ 2020-05-31 13:27 UTC (permalink / raw)
  To: Ilya Dryomov, ceph-devel

On Sat, 2020-05-30 at 17:34 +0200, Ilya Dryomov wrote:
> Allow expressing client's location in terms of CRUSH hierarchy as
> a set of (bucket type name, bucket name) pairs.  The userspace syntax
> "crush_location = key1=value1 key2=value2" is incompatible with mount
> options and needed adaptation.  Key-value pairs are separated by '|'
> and we use ':' instead of '=' to separate keys from values.  So for:
> 
>   crush_location = host=foo rack=bar
> 
> one would write:
> 
>   crush_location=host:foo|rack:bar
> 
> As in userspace, "multipath" locations are supported, so indicating
> locality for parallel hierarchies is possible:
> 
>   crush_location=rack:foo1|rack:foo2|datacenter:bar
> 

This looks much nicer.

The only caveat I have is that using '|' means that you'll need to quote
or escape the option string on the command line or in shell scripts
(lest the shell consider it a pipeline directive).
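
(For example, something like -o 'crush_location=host:foo|rack:bar',
where the quotes are there purely for the shell's benefit.)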

It's hard to find ascii special characters that don't have _some_
special meaning however. '/' would fill the bill, but I understand what
you mean about that having connotations of a pathname.
 

> Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
> ---
>  include/linux/ceph/libceph.h |   1 +
>  include/linux/ceph/osdmap.h  |  16 ++++-
>  net/ceph/ceph_common.c       |  36 +++++++++++
>  net/ceph/osdmap.c            | 116 +++++++++++++++++++++++++++++++++++
>  4 files changed, 168 insertions(+), 1 deletion(-)
> 
> diff --git a/include/linux/ceph/libceph.h b/include/linux/ceph/libceph.h
> index 4b5a47bcaba4..4733959f1ec7 100644
> --- a/include/linux/ceph/libceph.h
> +++ b/include/linux/ceph/libceph.h
> @@ -64,6 +64,7 @@ struct ceph_options {
>  	int num_mon;
>  	char *name;
>  	struct ceph_crypto_key *key;
> +	struct rb_root crush_locs;
>  };
>  
>  /*
> diff --git a/include/linux/ceph/osdmap.h b/include/linux/ceph/osdmap.h
> index 5e601975745f..8c9d18cc9f45 100644
> --- a/include/linux/ceph/osdmap.h
> +++ b/include/linux/ceph/osdmap.h
> @@ -302,9 +302,23 @@ bool ceph_pg_to_primary_shard(struct ceph_osdmap *osdmap,
>  int ceph_pg_to_acting_primary(struct ceph_osdmap *osdmap,
>  			      const struct ceph_pg *raw_pgid);
>  
> +struct crush_loc {
> +	char *cl_type_name;
> +	char *cl_name;
> +};
> +
> +struct crush_loc_node {
> +	struct rb_node cl_node;
> +	struct crush_loc cl_loc;  /* pointers into cl_data */
> +	char cl_data[];
> +};
> +
> +int ceph_parse_crush_location(char *crush_location, struct rb_root *locs);
> +int ceph_compare_crush_locs(struct rb_root *locs1, struct rb_root *locs2);
> +void ceph_clear_crush_locs(struct rb_root *locs);
> +
>  extern struct ceph_pg_pool_info *ceph_pg_pool_by_id(struct ceph_osdmap *map,
>  						    u64 id);
> -
>  extern const char *ceph_pg_pool_name_by_id(struct ceph_osdmap *map, u64 id);
>  extern int ceph_pg_poolid_by_name(struct ceph_osdmap *map, const char *name);
>  u64 ceph_pg_pool_flags(struct ceph_osdmap *map, u64 id);
> diff --git a/net/ceph/ceph_common.c b/net/ceph/ceph_common.c
> index a0e97f6c1072..44770b60bc38 100644
> --- a/net/ceph/ceph_common.c
> +++ b/net/ceph/ceph_common.c
> @@ -176,6 +176,10 @@ int ceph_compare_options(struct ceph_options *new_opt,
>  		}
>  	}
>  
> +	ret = ceph_compare_crush_locs(&opt1->crush_locs, &opt2->crush_locs);
> +	if (ret)
> +		return ret;
> +
>  	/* any matching mon ip implies a match */
>  	for (i = 0; i < opt1->num_mon; i++) {
>  		if (ceph_monmap_contains(client->monc.monmap,
> @@ -260,6 +264,7 @@ enum {
>  	Opt_secret,
>  	Opt_key,
>  	Opt_ip,
> +	Opt_crush_location,
>  	/* string args above */
>  	Opt_share,
>  	Opt_crc,
> @@ -274,6 +279,7 @@ static const struct fs_parameter_spec ceph_parameters[] = {
>  	fsparam_flag_no ("cephx_require_signatures",	Opt_cephx_require_signatures),
>  	fsparam_flag_no ("cephx_sign_messages",		Opt_cephx_sign_messages),
>  	fsparam_flag_no ("crc",				Opt_crc),
> +	fsparam_string	("crush_location",		Opt_crush_location),
>  	fsparam_string	("fsid",			Opt_fsid),
>  	fsparam_string	("ip",				Opt_ip),
>  	fsparam_string	("key",				Opt_key),
> @@ -298,6 +304,7 @@ struct ceph_options *ceph_alloc_options(void)
>  	if (!opt)
>  		return NULL;
>  
> +	opt->crush_locs = RB_ROOT;
>  	opt->mon_addr = kcalloc(CEPH_MAX_MON, sizeof(*opt->mon_addr),
>  				GFP_KERNEL);
>  	if (!opt->mon_addr) {
> @@ -320,6 +327,7 @@ void ceph_destroy_options(struct ceph_options *opt)
>  	if (!opt)
>  		return;
>  
> +	ceph_clear_crush_locs(&opt->crush_locs);
>  	kfree(opt->name);
>  	if (opt->key) {
>  		ceph_crypto_key_destroy(opt->key);
> @@ -454,6 +462,16 @@ int ceph_parse_param(struct fs_parameter *param, struct ceph_options *opt,
>  		if (!opt->key)
>  			return -ENOMEM;
>  		return get_secret(opt->key, param->string, &log);
> +	case Opt_crush_location:
> +		ceph_clear_crush_locs(&opt->crush_locs);
> +		err = ceph_parse_crush_location(param->string,
> +						&opt->crush_locs);
> +		if (err) {
> +			error_plog(&log, "Failed to parse CRUSH location: %d",
> +				   err);
> +			return err;
> +		}
> +		break;
>  
>  	case Opt_osdtimeout:
>  		warn_plog(&log, "Ignoring osdtimeout");
> @@ -536,6 +554,7 @@ int ceph_print_client_options(struct seq_file *m, struct ceph_client *client,
>  {
>  	struct ceph_options *opt = client->options;
>  	size_t pos = m->count;
> +	struct rb_node *n;
>  
>  	if (opt->name) {
>  		seq_puts(m, "name=");
> @@ -545,6 +564,23 @@ int ceph_print_client_options(struct seq_file *m, struct ceph_client *client,
>  	if (opt->key)
>  		seq_puts(m, "secret=<hidden>,");
>  
> +	if (!RB_EMPTY_ROOT(&opt->crush_locs)) {
> +		seq_puts(m, "crush_location=");
> +		for (n = rb_first(&opt->crush_locs); ; ) {
> +			struct crush_loc_node *loc =
> +			    rb_entry(n, struct crush_loc_node, cl_node);
> +
> +			seq_printf(m, "%s:%s", loc->cl_loc.cl_type_name,
> +				   loc->cl_loc.cl_name);
> +			n = rb_next(n);
> +			if (!n)
> +				break;
> +
> +			seq_putc(m, '|');
> +		}
> +		seq_putc(m, ',');
> +	}
> +
>  	if (opt->flags & CEPH_OPT_FSID)
>  		seq_printf(m, "fsid=%pU,", &opt->fsid);
>  	if (opt->flags & CEPH_OPT_NOSHARE)
> diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c
> index e74130876d3a..4b81334e9e5b 100644
> --- a/net/ceph/osdmap.c
> +++ b/net/ceph/osdmap.c
> @@ -2715,3 +2715,119 @@ int ceph_pg_to_acting_primary(struct ceph_osdmap *osdmap,
>  	return acting.primary;
>  }
>  EXPORT_SYMBOL(ceph_pg_to_acting_primary);
> +
> +static struct crush_loc_node *alloc_crush_loc(size_t type_name_len,
> +					      size_t name_len)
> +{
> +	struct crush_loc_node *loc;
> +
> +	loc = kmalloc(sizeof(*loc) + type_name_len + name_len + 2, GFP_NOIO);
> +	if (!loc)
> +		return NULL;
> +
> +	RB_CLEAR_NODE(&loc->cl_node);
> +	return loc;
> +}
> +
> +static void free_crush_loc(struct crush_loc_node *loc)
> +{
> +	WARN_ON(!RB_EMPTY_NODE(&loc->cl_node));
> +
> +	kfree(loc);
> +}
> +
> +static int crush_loc_compare(const struct crush_loc *loc1,
> +			     const struct crush_loc *loc2)
> +{
> +	return strcmp(loc1->cl_type_name, loc2->cl_type_name) ?:
> +	       strcmp(loc1->cl_name, loc2->cl_name);
> +}
> +
> +DEFINE_RB_FUNCS2(crush_loc, struct crush_loc_node, cl_loc, crush_loc_compare,
> +		 RB_BYPTR, const struct crush_loc *, cl_node)
> +
> +/*
> + * Parses a set of <bucket type name>':'<bucket name> pairs separated
> + * by '|', e.g. "rack:foo1|rack:foo2|datacenter:bar".
> + *
> + * Note that @crush_location is modified by strsep().
> + */
> +int ceph_parse_crush_location(char *crush_location, struct rb_root *locs)
> +{
> +	struct crush_loc_node *loc;
> +	const char *type_name, *name, *colon;
> +	size_t type_name_len, name_len;
> +
> +	dout("%s '%s'\n", __func__, crush_location);
> +	while ((type_name = strsep(&crush_location, "|"))) {
> +		colon = strchr(type_name, ':');
> +		if (!colon)
> +			return -EINVAL;
> +
> +		type_name_len = colon - type_name;
> +		if (type_name_len == 0)
> +			return -EINVAL;
> +
> +		name = colon + 1;
> +		name_len = strlen(name);
> +		if (name_len == 0)
> +			return -EINVAL;
> +
> +		loc = alloc_crush_loc(type_name_len, name_len);
> +		if (!loc)
> +			return -ENOMEM;
> +
> +		loc->cl_loc.cl_type_name = loc->cl_data;
> +		memcpy(loc->cl_loc.cl_type_name, type_name, type_name_len);
> +		loc->cl_loc.cl_type_name[type_name_len] = '\0';
> +
> +		loc->cl_loc.cl_name = loc->cl_data + type_name_len + 1;
> +		memcpy(loc->cl_loc.cl_name, name, name_len);
> +		loc->cl_loc.cl_name[name_len] = '\0';
> +
> +		if (!__insert_crush_loc(locs, loc)) {
> +			free_crush_loc(loc);
> +			return -EEXIST;
> +		}
> +
> +		dout("%s type_name '%s' name '%s'\n", __func__,
> +		     loc->cl_loc.cl_type_name, loc->cl_loc.cl_name);
> +	}
> +
> +	return 0;
> +}
> +
> +int ceph_compare_crush_locs(struct rb_root *locs1, struct rb_root *locs2)
> +{
> +	struct rb_node *n1 = rb_first(locs1);
> +	struct rb_node *n2 = rb_first(locs2);
> +	int ret;
> +
> +	for ( ; n1 && n2; n1 = rb_next(n1), n2 = rb_next(n2)) {
> +		struct crush_loc_node *loc1 =
> +		    rb_entry(n1, struct crush_loc_node, cl_node);
> +		struct crush_loc_node *loc2 =
> +		    rb_entry(n2, struct crush_loc_node, cl_node);
> +
> +		ret = crush_loc_compare(&loc1->cl_loc, &loc2->cl_loc);
> +		if (ret)
> +			return ret;
> +	}
> +
> +	if (!n1 && n2)
> +		return -1;
> +	if (n1 && !n2)
> +		return 1;
> +	return 0;
> +}
> +
> +void ceph_clear_crush_locs(struct rb_root *locs)
> +{
> +	while (!RB_EMPTY_ROOT(locs)) {
> +		struct crush_loc_node *loc =
> +		    rb_entry(rb_first(locs), struct crush_loc_node, cl_node);
> +
> +		erase_crush_loc(locs, loc);
> +		free_crush_loc(loc);
> +	}
> +}

-- 
Jeff Layton <jlayton@kernel.org>

* Re: [PATCH v2 3/5] libceph: crush_location infrastructure
  2020-05-31 13:27   ` Jeff Layton
@ 2020-05-31 21:07     ` Ilya Dryomov
  0 siblings, 0 replies; 11+ messages in thread
From: Ilya Dryomov @ 2020-05-31 21:07 UTC (permalink / raw)
  To: Jeff Layton
  Cc: Ceph Development, Josh Durgin, Sage Weil, Jason Dillaman,
	Patrick Donnelly

On Sun, May 31, 2020 at 3:27 PM Jeff Layton <jlayton@kernel.org> wrote:
>
> On Sat, 2020-05-30 at 17:34 +0200, Ilya Dryomov wrote:
> > Allow expressing client's location in terms of CRUSH hierarchy as
> > a set of (bucket type name, bucket name) pairs.  The userspace syntax
> > "crush_location = key1=value1 key2=value2" is incompatible with mount
> > options and needed adaptation.  Key-value pairs are separated by '|'
> > and we use ':' instead of '=' to separate keys from values.  So for:
> >
> >   crush_location = host=foo rack=bar
> >
> > one would write:
> >
> >   crush_location=host:foo|rack:bar
> >
> > As in userspace, "multipath" locations are supported, so indicating
> > locality for parallel hierarchies is possible:
> >
> >   crush_location=rack:foo1|rack:foo2|datacenter:bar
> >
>
> This looks much nicer.
>
> The only caveat I have is that using '|' means that you'll need to quote
> or escape the option string on the command line or in shell scripts
> (lest the shell consider it a pipeline directive).
>
> It's hard to find ascii special characters that don't have _some_
> special meaning however. '/' would fill the bill, but I understand what
> you mean about that having connotations of a pathname.

Yeah, the problem with '/' is that crush_location of the form
"host:foo/rack:bar/row:baz" is not a path to the root of the tree,
but rather just a set of nodes: foo, bar and baz can belong to
completely different subtrees and would be matched independently of
each other.  The only reason that is not ambiguous is that CRUSH
enforces globally unique bucket names.

Having to escape is unfortunate, but I thought it was a (slightly)
lesser evil.  If folks think that '/' isn't that confusing or have
other ideas, now is the time to speak up!

Thanks,

                Ilya

* Re: [PATCH v2 2/5] libceph: decode CRUSH device/bucket types and names
  2020-05-30 15:34 ` [PATCH v2 2/5] libceph: decode CRUSH device/bucket types and names Ilya Dryomov
@ 2020-06-01 10:49   ` Jeff Layton
  2020-06-01 11:14     ` Ilya Dryomov
  0 siblings, 1 reply; 11+ messages in thread
From: Jeff Layton @ 2020-06-01 10:49 UTC (permalink / raw)
  To: Ilya Dryomov, ceph-devel

On Sat, 2020-05-30 at 17:34 +0200, Ilya Dryomov wrote:
> These would be matched with the provided client location to calculate
> the locality value.
> 
> Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
> ---
>  include/linux/crush/crush.h |  6 +++
>  net/ceph/crush/crush.c      |  3 ++
>  net/ceph/osdmap.c           | 85 ++++++++++++++++++++++++++++++++++++-
>  3 files changed, 92 insertions(+), 2 deletions(-)
> 
> diff --git a/include/linux/crush/crush.h b/include/linux/crush/crush.h
> index 38b0e4d50ed9..29b0de2e202b 100644
> --- a/include/linux/crush/crush.h
> +++ b/include/linux/crush/crush.h
> @@ -301,6 +301,12 @@ struct crush_map {
>  
>  	__u32 *choose_tries;
>  #else
> +	/* device/bucket type id -> type name (CrushWrapper::type_map) */
> +	struct rb_root type_names;
> +
> +	/* device/bucket id -> name (CrushWrapper::name_map) */
> +	struct rb_root names;
> +

I'm not as familiar with the OSD client-side code, but I don't see any
mention of locking here. What protects these new rbtrees? From reading
over the code, I'd assume the osdc->lock, but it might be nice to have a
comment here that makes that clear.

>  	/* CrushWrapper::choose_args */
>  	struct rb_root choose_args;
>  #endif
> diff --git a/net/ceph/crush/crush.c b/net/ceph/crush/crush.c
> index 3d70244bc1b6..2e6b29fa8518 100644
> --- a/net/ceph/crush/crush.c
> +++ b/net/ceph/crush/crush.c
> @@ -2,6 +2,7 @@
>  #ifdef __KERNEL__
>  # include <linux/slab.h>
>  # include <linux/crush/crush.h>
> +void clear_crush_names(struct rb_root *root);
>  void clear_choose_args(struct crush_map *c);
>  #else
>  # include "crush_compat.h"
> @@ -130,6 +131,8 @@ void crush_destroy(struct crush_map *map)
>  #ifndef __KERNEL__
>  	kfree(map->choose_tries);
>  #else
> +	clear_crush_names(&map->type_names);
> +	clear_crush_names(&map->names);
>  	clear_choose_args(map);
>  #endif
>  	kfree(map);
> diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c
> index 5d00ce2b5339..e74130876d3a 100644
> --- a/net/ceph/osdmap.c
> +++ b/net/ceph/osdmap.c
> @@ -138,6 +138,79 @@ static int crush_decode_straw2_bucket(void **p, void *end,
>  	return -EINVAL;
>  }
>  
> +struct crush_name_node {
> +	struct rb_node cn_node;
> +	int cn_id;
> +	char cn_name[];
> +};
> +
> +static struct crush_name_node *alloc_crush_name(size_t name_len)
> +{
> +	struct crush_name_node *cn;
> +
> +	cn = kmalloc(sizeof(*cn) + name_len + 1, GFP_NOIO);
> +	if (!cn)
> +		return NULL;
> +
> +	RB_CLEAR_NODE(&cn->cn_node);
> +	return cn;
> +}
> +
> +static void free_crush_name(struct crush_name_node *cn)
> +{
> +	WARN_ON(!RB_EMPTY_NODE(&cn->cn_node));
> +
> +	kfree(cn);
> +}
> +
> +DEFINE_RB_FUNCS(crush_name, struct crush_name_node, cn_id, cn_node)
> +
> +static int decode_crush_names(void **p, void *end, struct rb_root *root)
> +{
> +	u32 n;
> +
> +	ceph_decode_32_safe(p, end, n, e_inval);
> +	while (n--) {
> +		struct crush_name_node *cn;
> +		int id;
> +		u32 name_len;
> +
> +		ceph_decode_32_safe(p, end, id, e_inval);
> +		ceph_decode_32_safe(p, end, name_len, e_inval);
> +		ceph_decode_need(p, end, name_len, e_inval);
> +
> +		cn = alloc_crush_name(name_len);
> +		if (!cn)
> +			return -ENOMEM;
> +
> +		cn->cn_id = id;
> +		memcpy(cn->cn_name, *p, name_len);
> +		cn->cn_name[name_len] = '\0';
> +		*p += name_len;
> +
> +		if (!__insert_crush_name(root, cn)) {
> +			free_crush_name(cn);
> +			return -EEXIST;
> +		}
> +	}
> +
> +	return 0;
> +
> +e_inval:
> +	return -EINVAL;
> +}
> +
> +void clear_crush_names(struct rb_root *root)
> +{
> +	while (!RB_EMPTY_ROOT(root)) {
> +		struct crush_name_node *cn =
> +		    rb_entry(rb_first(root), struct crush_name_node, cn_node);
> +
> +		erase_crush_name(root, cn);
> +		free_crush_name(cn);
> +	}
> +}
> +
>  static struct crush_choose_arg_map *alloc_choose_arg_map(void)
>  {
>  	struct crush_choose_arg_map *arg_map;
> @@ -354,6 +427,8 @@ static struct crush_map *crush_decode(void *pbyval, void *end)
>  	if (c == NULL)
>  		return ERR_PTR(-ENOMEM);
>  
> +	c->type_names = RB_ROOT;
> +	c->names = RB_ROOT;
>  	c->choose_args = RB_ROOT;
>  
>          /* set tunables to default values */
> @@ -510,8 +585,14 @@ static struct crush_map *crush_decode(void *pbyval, void *end)
>  		}
>  	}
>  
> -	ceph_decode_skip_map(p, end, 32, string, bad); /* type_map */
> -	ceph_decode_skip_map(p, end, 32, string, bad); /* name_map */
> +	err = decode_crush_names(p, end, &c->type_names);
> +	if (err)
> +		goto fail;
> +
> +	err = decode_crush_names(p, end, &c->names);
> +	if (err)
> +		goto fail;
> +
>  	ceph_decode_skip_map(p, end, 32, string, bad); /* rule_name_map */
>  
>          /* tunables */

-- 
Jeff Layton <jlayton@kernel.org>

* Re: [PATCH v2 0/5] libceph: support for replica reads
  2020-05-30 15:34 [PATCH v2 0/5] libceph: support for replica reads Ilya Dryomov
                   ` (4 preceding siblings ...)
  2020-05-30 15:34 ` [PATCH v2 5/5] libceph: read_from_replica option Ilya Dryomov
@ 2020-06-01 10:57 ` Jeff Layton
  5 siblings, 0 replies; 11+ messages in thread
From: Jeff Layton @ 2020-06-01 10:57 UTC (permalink / raw)
  To: Ilya Dryomov, ceph-devel

On Sat, 2020-05-30 at 17:34 +0200, Ilya Dryomov wrote:
> Hello,
> 
> This adds support for replica reads (balanced and localized reads)
> to rbd and ceph.  crush_location syntax is slightly different, see
> patch 3 for details.
> 
> v1 -> v2:
> - change crush_location syntax
> - rename read_policy to read_from_replica, add read_from_replica=no
> - crush_location and read_from_replica are now overridable
> 
> Thanks,
> 
>                 Ilya
> 
> 
> Ilya Dryomov (5):
>   libceph: add non-asserting rbtree insertion helper
>   libceph: decode CRUSH device/bucket types and names
>   libceph: crush_location infrastructure
>   libceph: support for balanced and localized reads
>   libceph: read_from_replica option
> 
>  include/linux/ceph/libceph.h    |  13 +-
>  include/linux/ceph/osd_client.h |   1 +
>  include/linux/ceph/osdmap.h     |  19 +-
>  include/linux/crush/crush.h     |   6 +
>  net/ceph/ceph_common.c          |  75 +++++++
>  net/ceph/crush/crush.c          |   3 +
>  net/ceph/debugfs.c              |   6 +-
>  net/ceph/osd_client.c           |  92 +++++++-
>  net/ceph/osdmap.c               | 363 +++++++++++++++++++++++++++-----
>  9 files changed, 517 insertions(+), 61 deletions(-)
> 

Nice work, Ilya. This all looks good to me now. The new mount option
syntax is much more readable.

It might be nice to sprinkle in some comments about the locking around
the new rbtrees (and maybe the existing DEFINE_RB_FUNCS trees), but
that's minor stuff.

You can add:

Reviewed-by: Jeff Layton <jlayton@kernel.org>

* Re: [PATCH v2 2/5] libceph: decode CRUSH device/bucket types and names
  2020-06-01 10:49   ` Jeff Layton
@ 2020-06-01 11:14     ` Ilya Dryomov
  0 siblings, 0 replies; 11+ messages in thread
From: Ilya Dryomov @ 2020-06-01 11:14 UTC (permalink / raw)
  To: Jeff Layton; +Cc: Ceph Development

On Mon, Jun 1, 2020 at 12:49 PM Jeff Layton <jlayton@kernel.org> wrote:
>
> On Sat, 2020-05-30 at 17:34 +0200, Ilya Dryomov wrote:
> > These would be matched with the provided client location to calculate
> > the locality value.
> >
> > Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
> > ---
> >  include/linux/crush/crush.h |  6 +++
> >  net/ceph/crush/crush.c      |  3 ++
> >  net/ceph/osdmap.c           | 85 ++++++++++++++++++++++++++++++++++++-
> >  3 files changed, 92 insertions(+), 2 deletions(-)
> >
> > diff --git a/include/linux/crush/crush.h b/include/linux/crush/crush.h
> > index 38b0e4d50ed9..29b0de2e202b 100644
> > --- a/include/linux/crush/crush.h
> > +++ b/include/linux/crush/crush.h
> > @@ -301,6 +301,12 @@ struct crush_map {
> >
> >       __u32 *choose_tries;
> >  #else
> > +     /* device/bucket type id -> type name (CrushWrapper::type_map) */
> > +     struct rb_root type_names;
> > +
> > +     /* device/bucket id -> name (CrushWrapper::name_map) */
> > +     struct rb_root names;
> > +
>
> I'm not as familiar with the OSD client-side code, but I don't see any
> mention of locking here. What protects these new rbtrees? From reading
> over the code, I'd assume the osdc->lock, but it might be nice to have a
> comment here that makes that clear.

Yes, everything osdmap- and CRUSH-related (except for the actual CRUSH
invocation) is protected by osdc->lock.  But these trees don't even
need that -- they
are static and could be easily replaced with sorted arrays to save two
pointers per name.

Thanks,

                Ilya
