All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCHv2 0/3] rbd: header read/refresh improvements
@ 2015-04-24 13:22 Douglas Fuller
  2015-04-24 13:22 ` [PATCHv2 1/3] ceph: support multiple class method calls in one ceph_msg Douglas Fuller
                   ` (3 more replies)
  0 siblings, 4 replies; 14+ messages in thread
From: Douglas Fuller @ 2015-04-24 13:22 UTC (permalink / raw)
  To: ceph-devel

Support multiple class op calls in one ceph_msg and consolidate rbd header
read and refresh processes to use this feature to reduce the number of
ceph_msgs sent for that process. Refresh features on header refresh and
begin returning EIO if features have changed since mapping.

v2: Edit history and address comments from Mike Christie.

Douglas Fuller (3):
  ceph: support multiple class method calls in one ceph_msg
  rbd: combine object method calls in header refresh using fewer
    ceph_msgs
  rbd: re-read features during header refresh and detect changes.

 drivers/block/rbd.c             | 512 +++++++++++++++++++++++++++++-----------
 include/linux/ceph/osd_client.h |   3 +-
 net/ceph/messenger.c            |   4 +
 net/ceph/osd_client.c           |  90 ++++++-
 4 files changed, 462 insertions(+), 147 deletions(-)

-- 
1.9.3


^ permalink raw reply	[flat|nested] 14+ messages in thread

* [PATCHv2 1/3] ceph: support multiple class method calls in one ceph_msg
  2015-04-24 13:22 [PATCHv2 0/3] rbd: header read/refresh improvements Douglas Fuller
@ 2015-04-24 13:22 ` Douglas Fuller
  2015-04-24 13:22 ` [PATCHv2 2/3] rbd: combine object method calls in header refresh using fewer ceph_msgs Douglas Fuller
                   ` (2 subsequent siblings)
  3 siblings, 0 replies; 14+ messages in thread
From: Douglas Fuller @ 2015-04-24 13:22 UTC (permalink / raw)
  To: ceph-devel

Messages are received from the wire directly into destination buffers.
A short read could result in corruption of the following destination
buffers. Allocate a single message buffer for all class method calls
and split them at the osd_client level. This only applies to ceph_msgs
containing multiple op calls and may break support for ceph_msgs
containing a mix of class method calls that return data and other ops.

Signed-off-by: Douglas Fuller <dfuller@redhat.com>
---
 include/linux/ceph/osd_client.h |  1 +
 net/ceph/messenger.c            |  4 ++
 net/ceph/osd_client.c           | 90 ++++++++++++++++++++++++++++++++++++++++-
 3 files changed, 93 insertions(+), 2 deletions(-)

diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h
index 61b19c4..65fcf80 100644
--- a/include/linux/ceph/osd_client.h
+++ b/include/linux/ceph/osd_client.h
@@ -99,6 +99,7 @@ struct ceph_osd_req_op {
 			struct ceph_osd_data request_info;
 			struct ceph_osd_data request_data;
 			struct ceph_osd_data response_data;
+			struct ceph_osd_data chain_data;
 			__u8 class_len;
 			__u8 method_len;
 			__u8 argc;
diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c
index 967080a..ec04be4 100644
--- a/net/ceph/messenger.c
+++ b/net/ceph/messenger.c
@@ -907,6 +907,10 @@ static void ceph_msg_data_pages_cursor_init(struct ceph_msg_data_cursor *cursor,
 	BUG_ON(!data->pages);
 	BUG_ON(!data->length);
 
+	/*
+	 * bug here if a short read occurs and length < data->length; see
+	 * http://tracker.ceph.com/issues/11424
+	 */
 	cursor->resid = min(length, data->length);
 	page_count = calc_pages_for(data->alignment, (u64)data->length);
 	cursor->page_offset = data->alignment & ~PAGE_MASK;
diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c
index 41a4abc..0092b6b 100644
--- a/net/ceph/osd_client.c
+++ b/net/ceph/osd_client.c
@@ -301,6 +301,71 @@ static void osd_req_op_data_release(struct ceph_osd_request *osd_req,
 	}
 }
 
+static int __build_op_cls_chain(struct ceph_osd_request *osd_req)
+{
+	u64 chain_length = 0;
+	u32 chain_pagecount = 0;
+	struct ceph_osd_req_op *op = NULL;
+	struct ceph_osd_data *osd_data;
+	struct ceph_osd_data *chain_data;
+	struct page **pages;
+	int i;
+
+	chain_data = osd_req_op_data(osd_req, 0, cls, chain_data);
+
+	for (i = 0; i < osd_req->r_num_ops; i++) {
+		op = &osd_req->r_ops[i];
+		if (op->op != CEPH_OSD_OP_CALL)
+			break;
+
+		osd_data = osd_req_op_data(osd_req, i, cls, chain_data);
+		osd_data->length = 0;
+
+		osd_data = osd_req_op_data(osd_req, i, cls, response_data);
+		chain_length += osd_data->length;
+	}
+
+	chain_data->length = chain_length;
+	chain_pagecount = (u32)calc_pages_for(0, chain_data->length);
+	pages = ceph_alloc_page_vector(chain_pagecount, GFP_KERNEL);
+	if (IS_ERR(pages))
+		return PTR_ERR(pages);
+	ceph_osd_data_pages_init(chain_data, pages, chain_length, 0, false, false);
+
+	return 0;
+}
+
+static int __split_cls_op_chain(struct ceph_osd_request *osd_req)
+{
+	int i;
+	void * data;
+	void * p;
+	struct ceph_osd_data *osd_data;
+
+	osd_data = osd_req_op_data(osd_req, 0, cls, chain_data);
+
+	if (osd_data->length == 0)
+		return 0;
+
+	data = kzalloc(osd_data->length, GFP_KERNEL);
+	if (!data)
+		return -ENOMEM;
+
+	ceph_copy_from_page_vector(osd_data->pages, data, 0, osd_data->length);
+	ceph_osd_data_release(osd_data);
+
+	p = data;
+	for (i = 0; i < osd_req->r_num_ops; i++) {
+		osd_data = osd_req_op_data(osd_req, i, cls, response_data);
+		ceph_copy_to_page_vector(osd_data->pages, p,
+			0, osd_req->r_reply_op_len[i]);
+		p += osd_req->r_reply_op_len[i];
+	}
+
+	kfree(data);
+	return 0;
+}
+
 /*
  * requests
  */
@@ -694,8 +759,20 @@ static u64 osd_req_encode_op(struct ceph_osd_request *req,
 			src->payload_len += data_length;
 			request_data_len += data_length;
 		}
-		osd_data = &src->cls.response_data;
-		ceph_osdc_msg_data_add(req->r_reply, osd_data);
+		if (which == 0)
+		{
+			int err;
+
+			err = __build_op_cls_chain(req);
+			if (err == -ENOMEM)
+			{
+				pr_err("error allocating memory for op chain\n");
+				return 0;
+			}
+			osd_data = &src->cls.chain_data;
+			if (osd_data->length)
+				ceph_osdc_msg_data_add(req->r_reply, osd_data);
+		}
 		break;
 	case CEPH_OSD_OP_STARTSYNC:
 		break;
@@ -1825,6 +1902,15 @@ static void handle_reply(struct ceph_osd_client *osdc, struct ceph_msg *msg,
 	for (i = 0; i < numops; i++)
 		req->r_reply_op_result[i] = ceph_decode_32(&p);
 
+	if (req->r_ops[0].op == CEPH_OSD_OP_CALL &&
+			req->r_ops[0].cls.chain_data.length)
+	{
+		int err;
+		err = __split_cls_op_chain(req);
+		if (err == -ENOMEM)
+			goto bad_put;
+	}
+
 	if (le16_to_cpu(msg->hdr.version) >= 6) {
 		p += 8 + 4; /* skip replay_version */
 		p += 8; /* skip user_version */
-- 
1.9.3


^ permalink raw reply related	[flat|nested] 14+ messages in thread

* [PATCHv2 2/3] rbd: combine object method calls in header refresh using fewer ceph_msgs
  2015-04-24 13:22 [PATCHv2 0/3] rbd: header read/refresh improvements Douglas Fuller
  2015-04-24 13:22 ` [PATCHv2 1/3] ceph: support multiple class method calls in one ceph_msg Douglas Fuller
@ 2015-04-24 13:22 ` Douglas Fuller
  2015-04-24 13:22 ` [PATCHv2 3/3] rbd: re-read features during header refresh and detect changes Douglas Fuller
  2015-04-26  6:29 ` [PATCHv2 0/3] rbd: header read/refresh improvements Alex Elder
  3 siblings, 0 replies; 14+ messages in thread
From: Douglas Fuller @ 2015-04-24 13:22 UTC (permalink / raw)
  To: ceph-devel

Signed-off-by: Douglas Fuller <douglas.fuller@gmail.com>
---
 drivers/block/rbd.c             | 331 ++++++++++++++++++++++++++++++++++++----
 include/linux/ceph/osd_client.h |   2 +-
 2 files changed, 306 insertions(+), 27 deletions(-)

diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index 8125233..9184b43 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -4667,20 +4667,23 @@ static int rbd_dev_v2_header_info(struct rbd_device *rbd_dev)
 	bool first_time = rbd_dev->header.object_prefix == NULL;
 	int ret;
 
-	ret = rbd_dev_v2_image_size(rbd_dev);
-	if (ret)
-		return ret;
+	if (!first_time)
+	{
+		ret = rbd_dev_v2_image_size(rbd_dev);
+		if (ret)
+			return ret;
 
-	if (first_time) {
+		ret = rbd_dev_v2_snap_context(rbd_dev);
+		dout("rbd_dev_v2_snap_context returned %d\n", ret);
+		return ret;
+	}
+	else
+	{
 		ret = rbd_dev_v2_header_onetime(rbd_dev);
 		if (ret)
 			return ret;
 	}
-
-	ret = rbd_dev_v2_snap_context(rbd_dev);
-	dout("rbd_dev_v2_snap_context returned %d\n", ret);
-
-	return ret;
+	return ret; /* XXX change logic? */
 }
 
 static int rbd_dev_header_info(struct rbd_device *rbd_dev)
@@ -5091,36 +5094,312 @@ static void rbd_dev_unprobe(struct rbd_device *rbd_dev)
 	memset(header, 0, sizeof (*header));
 }
 
+static struct ceph_osd_request * rbd_obj_header_req_alloc(
+		struct rbd_device *rbd_dev,
+		int num_ops)
+{
+	struct ceph_osd_request *osd_req;
+	struct page ***replies;
+	
+	replies = kmalloc(CEPH_OSD_MAX_OP * sizeof(*replies), GFP_KERNEL);
+	if (!replies)
+		return NULL;
+
+	osd_req = ceph_osdc_alloc_request(&rbd_dev->rbd_client->client->osdc,
+		NULL, num_ops, false, GFP_KERNEL);
+	if (!osd_req) {
+		kfree(replies);
+		return NULL;
+	}
+
+	osd_req->r_flags = CEPH_OSD_FLAG_READ;
+	osd_req->r_base_oloc.pool = ceph_file_layout_pg_pool(rbd_dev->layout);
+	osd_req->r_priv = replies;
+	ceph_oid_set_name(&osd_req->r_base_oid, rbd_dev->header_name);
+
+	return osd_req;
+}
+
+static int rbd_obj_header_method_add(struct ceph_osd_request *osd_req,
+		int which,
+		const char *class_name,
+		const char *method_name,
+		const void *outbound,
+		size_t outbound_size,
+		size_t inbound_size)
+{
+	u32 page_count;
+	struct page ***replies = osd_req->r_priv;
+
+	BUG_ON(which >= osd_req->r_num_ops);
+
+	osd_req_op_cls_init(osd_req, which, CEPH_OSD_OP_CALL,
+		class_name, method_name);
+
+	if (outbound_size) {
+		struct ceph_pagelist *pagelist;
+		pagelist = kmalloc(sizeof(*pagelist), GFP_NOFS);
+		/* XXX is this ever freed? */
+		if (!pagelist)
+			return -ENOMEM;
+
+		ceph_pagelist_init(pagelist);
+		ceph_pagelist_append(pagelist, outbound, outbound_size);
+		osd_req_op_cls_request_data_pagelist(osd_req, which, pagelist);
+	}
+
+	page_count = (u32)calc_pages_for(0, inbound_size);
+	replies[which] = ceph_alloc_page_vector(page_count, GFP_KERNEL);
+	if (IS_ERR(replies[which]))
+		return PTR_ERR(replies[which]);
+	osd_req_op_cls_response_data_pages(osd_req, which, replies[which],
+		inbound_size, 0, false, false);
+
+	return 0;
+}
+
+static int rbd_obj_header_req_exec(struct rbd_device * rbd_dev,
+		struct ceph_osd_request *osd_req)
+{
+	int ret;
+
+	ceph_osdc_build_request(osd_req, 0, NULL, cpu_to_le64(CEPH_NOSNAP), NULL);
+	ret = ceph_osdc_start_request(&rbd_dev->rbd_client->client->osdc,
+		osd_req, false);
+	if (ret) {
+		ceph_osdc_put_request(osd_req);
+		return ret;
+	}
+
+	ret = ceph_osdc_wait_request(&rbd_dev->rbd_client->client->osdc, osd_req);
+	if (ret < 0) {
+		ceph_osdc_cancel_request(osd_req);
+		return ret;
+	}
+
+	return 0;
+}
+
+static int rbd_obj_header_method_retrieve(struct ceph_osd_request *osd_req,
+		int which, void *buf, size_t length)
+{
+	struct page ***replies = (struct page ***)osd_req->r_priv;
+	size_t reply_length = osd_req->r_reply_op_len[which];
+
+	BUG_ON(reply_length > length);
+
+	ceph_copy_from_page_vector(replies[which], buf, 0, reply_length);
+	ceph_release_page_vector(replies[which], calc_pages_for(0, reply_length));
+
+	return reply_length;
+}
+
+static void rbd_obj_header_req_destroy(struct ceph_osd_request *osd_req)
+{
+	if (osd_req) {
+		kfree(osd_req->r_priv);
+		ceph_osdc_put_request(osd_req);
+	}
+}
+
+static int __extract_prefix(struct rbd_device *rbd_dev,
+		struct ceph_osd_request *osd_req,
+		int which)
+{
+	void *prefix_buf;
+	void * p;
+	size_t prefix_buflen;
+	int ret;
+
+	prefix_buf = kzalloc(RBD_OBJ_PREFIX_LEN_MAX, GFP_KERNEL);
+	if (!prefix_buf)
+		return -ENOMEM;
+
+	prefix_buflen = rbd_obj_header_method_retrieve(osd_req, which,
+		prefix_buf, RBD_OBJ_PREFIX_LEN_MAX);
+	p = prefix_buf;
+	rbd_dev->header.object_prefix = ceph_extract_encoded_string(&p,
+		p + prefix_buflen, NULL, GFP_KERNEL);
+
+	if (IS_ERR(rbd_dev->header.object_prefix)) {
+		ret = PTR_ERR(rbd_dev->header.object_prefix);
+		rbd_dev->header.object_prefix = NULL;
+		goto out_err;
+	}
+
+	ret = 0;
+
+out_err:
+	kfree(prefix_buf);
+	return ret;
+}
+
+static int __extract_features(struct rbd_device *rbd_dev,
+		struct ceph_osd_request *osd_req,
+		int which)
+{
+	int ret;
+	struct {
+		__le64 features;
+		__le64 incompat;
+	} __attribute__((packed)) features_buf = { 0 };
+	u64 incompat;
+
+	rbd_obj_header_method_retrieve(osd_req, which,
+		&features_buf, sizeof(features_buf));
+	incompat = le64_to_cpu(features_buf.incompat);
+	if (incompat & ~RBD_FEATURES_SUPPORTED) {
+		ret = -ENXIO;
+		goto out_err;
+	}
+	rbd_dev->header.features = le64_to_cpu(features_buf.features);
+
+	ret = 0;
+
+out_err:
+	return ret;
+}
+
+static int __extract_snapcontext(struct rbd_device *rbd_dev,
+		struct ceph_osd_request *osd_req,
+		int which)
+{
+	void *snapc_buf;
+	size_t snapc_max;
+	size_t snapc_buflen;
+
+	void *p;
+	void *q;
+
+	struct ceph_snap_context * snapc;
+
+	u32 seq;
+	u32 snap_count;
+
+	int i;
+	int ret;
+
+	snapc_max = sizeof(__le64) + sizeof(__le32) +
+		RBD_MAX_SNAP_COUNT * sizeof(__le64);
+
+	snapc_buf = kzalloc(snapc_max, GFP_KERNEL);
+	if (!snapc_buf) {
+		ret = -ENOMEM;
+		goto out_err;
+	}
+
+	snapc_buflen = rbd_obj_header_method_retrieve(osd_req, which,
+		snapc_buf, snapc_max);
+
+	p = snapc_buf;
+	q = snapc_buf + snapc_buflen;
+	ret = -ERANGE;
+	ceph_decode_64_safe(&p, q, seq, out_err);
+	ceph_decode_32_safe(&p, q, snap_count, out_err);
+
+	if (snap_count > (SIZE_MAX - sizeof(struct ceph_snap_context)) /
+		sizeof(u64)) {
+		ret = -EINVAL;
+		goto out_err;
+	}
+	if (!ceph_has_room(&p, q, snap_count * sizeof(__le64)))
+		goto out_err;
+
+	snapc = ceph_create_snap_context(snap_count, GFP_KERNEL);
+	if (!snapc) {
+		ret = -ENOMEM;
+	}
+
+	snapc->seq = seq;
+	for (i=0; i < snap_count; i++)
+		snapc->snaps[i] = ceph_decode_64(&p);
+	ceph_put_snap_context(rbd_dev->header.snapc);
+	rbd_dev->header.snapc = snapc;
+
+	ret = 0;
+out_err:
+	kfree(snapc_buf);
+	return ret;
+}
+
+static int __extract_size(struct rbd_device *rbd_dev,
+		struct ceph_osd_request *osd_req,
+		int which)
+{
+	struct {
+		u8 order;
+		__le64 size;
+	} __attribute((packed)) size_buf = { 0 };
+
+	rbd_obj_header_method_retrieve(osd_req, which, &size_buf, sizeof(size_buf));
+
+	rbd_dev->header.obj_order = size_buf.order;
+	rbd_dev->header.image_size = le64_to_cpu(size_buf.size);
+
+	return 0;
+}
+
 static int rbd_dev_v2_header_onetime(struct rbd_device *rbd_dev)
 {
 	int ret;
 
-	ret = rbd_dev_v2_object_prefix(rbd_dev);
+	size_t snapc_max;
+	__le64 snapid = cpu_to_le64(CEPH_NOSNAP);
+	struct ceph_osd_request *osd_req;
+
+	snapc_max = sizeof(__le64) + sizeof(__le32) +
+		RBD_MAX_SNAP_COUNT * sizeof(__le64);
+
+	osd_req = rbd_obj_header_req_alloc(rbd_dev, 4);
+
+	if (!osd_req) {
+		ret = -ENOMEM;
+		goto out_err;
+	}
+
+	ret = rbd_obj_header_method_add(osd_req, 0,
+		"rbd", "get_object_prefix",
+		NULL, 0, RBD_OBJ_PREFIX_LEN_MAX);
 	if (ret)
 		goto out_err;
 
-	/*
-	 * Get the and check features for the image.  Currently the
-	 * features are assumed to never change.
-	 */
-	ret = rbd_dev_v2_features(rbd_dev);
+	ret = rbd_obj_header_method_add(osd_req, 1,
+		"rbd", "get_features",
+		&snapid, sizeof(snapid),
+		sizeof(struct{__le64 features;__le64 incompat;} __attribute__((packed))));
 	if (ret)
 		goto out_err;
 
-	/* If the image supports fancy striping, get its parameters */
+	ret = rbd_obj_header_method_add(osd_req, 2,
+		"rbd", "get_snapcontext",
+		NULL, 0, snapc_max);
+	if (ret)
+		goto out_err;
 
-	if (rbd_dev->header.features & RBD_FEATURE_STRIPINGV2) {
-		ret = rbd_dev_v2_striping_info(rbd_dev);
-		if (ret < 0)
-			goto out_err;
-	}
-	/* No support for crypto and compression type format 2 images */
+	ret = rbd_obj_header_method_add(osd_req, 3,
+		"rbd", "get_size",
+		&snapid, sizeof(snapid),
+		sizeof(struct{u8 order; __le64 size;}__attribute((packed))));
+	if (ret)
+		goto out_err;
+
+	ret = rbd_obj_header_req_exec(rbd_dev, osd_req);
+	if (ret)
+		goto out_err;
+
+	if ((ret = __extract_prefix(rbd_dev, osd_req, 0)))
+		goto out_err;
+	if ((ret = __extract_features(rbd_dev, osd_req, 1)))
+		goto out_err;
+	if ((ret = __extract_snapcontext(rbd_dev, osd_req, 2)))
+		goto out_err;
+	if ((ret = __extract_size(rbd_dev, osd_req, 3)))
+		goto out_err;
+
+	ret = 0;
 
-	return 0;
 out_err:
-	rbd_dev->header.features = 0;
-	kfree(rbd_dev->header.object_prefix);
-	rbd_dev->header.object_prefix = NULL;
+	rbd_obj_header_req_destroy(osd_req);
 
 	return ret;
 }
diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h
index 65fcf80..71c946d 100644
--- a/include/linux/ceph/osd_client.h
+++ b/include/linux/ceph/osd_client.h
@@ -43,7 +43,7 @@ struct ceph_osd {
 };
 
 
-#define CEPH_OSD_MAX_OP	3
+#define CEPH_OSD_MAX_OP	4
 
 enum ceph_osd_data_type {
 	CEPH_OSD_DATA_TYPE_NONE = 0,
-- 
1.9.3


^ permalink raw reply related	[flat|nested] 14+ messages in thread

* [PATCHv2 3/3] rbd: re-read features during header refresh and detect changes.
  2015-04-24 13:22 [PATCHv2 0/3] rbd: header read/refresh improvements Douglas Fuller
  2015-04-24 13:22 ` [PATCHv2 1/3] ceph: support multiple class method calls in one ceph_msg Douglas Fuller
  2015-04-24 13:22 ` [PATCHv2 2/3] rbd: combine object method calls in header refresh using fewer ceph_msgs Douglas Fuller
@ 2015-04-24 13:22 ` Douglas Fuller
  2015-04-26  6:29 ` [PATCHv2 0/3] rbd: header read/refresh improvements Alex Elder
  3 siblings, 0 replies; 14+ messages in thread
From: Douglas Fuller @ 2015-04-24 13:22 UTC (permalink / raw)
  To: ceph-devel

From: Douglas Fuller <douglas.fuller@gmail.com>

Add a new header item to track when features have changed since mapping and
return EIO on I/O operations. Clean up and consolidate code path for
header read and update.  Remove unused code.

Signed-off-by: Douglas Fuller <douglas.fuller@gmail.com>
---
 drivers/block/rbd.c | 207 +++++++++++++++++++---------------------------------
 1 file changed, 76 insertions(+), 131 deletions(-)

diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index 9184b43..d67dcb0 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -320,9 +320,16 @@ struct rbd_img_request {
 #define for_each_obj_request_safe(ireq, oreq, n) \
 	list_for_each_entry_safe_reverse(oreq, n, &(ireq)->obj_requests, links)
 
+enum rbd_mapping_state {
+	MAP_STATE_MAPPED,
+	MAP_STATE_DETACHED,
+};
+		
+
 struct rbd_mapping {
 	u64                     size;
 	u64                     features;
+	enum rbd_mapping_state	state;
 	bool			read_only;
 };
 
@@ -527,7 +534,7 @@ static void rbd_img_parent_read(struct rbd_obj_request *obj_request);
 static void rbd_dev_remove_parent(struct rbd_device *rbd_dev);
 
 static int rbd_dev_refresh(struct rbd_device *rbd_dev);
-static int rbd_dev_v2_header_onetime(struct rbd_device *rbd_dev);
+static int rbd_dev_v2_header_data(struct rbd_device *rbd_dev);
 static int rbd_dev_header_info(struct rbd_device *rbd_dev);
 static int rbd_dev_v2_parent_info(struct rbd_device *rbd_dev);
 static const char *rbd_dev_v2_snap_name(struct rbd_device *rbd_dev,
@@ -3336,6 +3343,14 @@ static void rbd_queue_workfn(struct work_struct *work)
 	else
 		op_type = OBJ_OP_READ;
 
+	/* If a configuration change was detected, issue EIO. */
+
+	if (rbd_dev->mapping.state == MAP_STATE_DETACHED)
+	{
+		result = -EIO;
+		goto err_rq;
+	}
+
 	/* Ignore/skip any zero-length requests */
 
 	if (!length) {
@@ -3670,12 +3685,23 @@ static void rbd_dev_update_size(struct rbd_device *rbd_dev)
 static int rbd_dev_refresh(struct rbd_device *rbd_dev)
 {
 	u64 mapping_size;
+	u64 features;
 	int ret;
 
 	down_write(&rbd_dev->header_rwsem);
 	mapping_size = rbd_dev->mapping.size;
+	features = rbd_dev->mapping.features;
 
 	ret = rbd_dev_header_info(rbd_dev);
+
+	if (ret == -ENXIO || (ret && features != rbd_dev->mapping.features))
+	{
+		rbd_dev->mapping.read_only = 1;
+		rbd_dev->mapping.state = MAP_STATE_DETACHED;
+		rbd_warn(rbd_dev,
+			"detected feature change in mapped device, setting read-only");
+	}
+
 	if (ret)
 		goto out;
 
@@ -3698,6 +3724,9 @@ static int rbd_dev_refresh(struct rbd_device *rbd_dev)
 
 out:
 	up_write(&rbd_dev->header_rwsem);
+
+	if (rbd_dev->mapping.read_only)
+		set_disk_ro(rbd_dev->disk, 1);
 	if (!ret && mapping_size != rbd_dev->mapping.size)
 		rbd_dev_update_size(rbd_dev);
 
@@ -4111,47 +4140,6 @@ static int _rbd_dev_v2_snap_size(struct rbd_device *rbd_dev, u64 snap_id,
 	return 0;
 }
 
-static int rbd_dev_v2_image_size(struct rbd_device *rbd_dev)
-{
-	return _rbd_dev_v2_snap_size(rbd_dev, CEPH_NOSNAP,
-					&rbd_dev->header.obj_order,
-					&rbd_dev->header.image_size);
-}
-
-static int rbd_dev_v2_object_prefix(struct rbd_device *rbd_dev)
-{
-	void *reply_buf;
-	int ret;
-	void *p;
-
-	reply_buf = kzalloc(RBD_OBJ_PREFIX_LEN_MAX, GFP_KERNEL);
-	if (!reply_buf)
-		return -ENOMEM;
-
-	ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
-				"rbd", "get_object_prefix", NULL, 0,
-				reply_buf, RBD_OBJ_PREFIX_LEN_MAX);
-	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
-	if (ret < 0)
-		goto out;
-
-	p = reply_buf;
-	rbd_dev->header.object_prefix = ceph_extract_encoded_string(&p,
-						p + ret, NULL, GFP_NOIO);
-	ret = 0;
-
-	if (IS_ERR(rbd_dev->header.object_prefix)) {
-		ret = PTR_ERR(rbd_dev->header.object_prefix);
-		rbd_dev->header.object_prefix = NULL;
-	} else {
-		dout("  object_prefix = %s\n", rbd_dev->header.object_prefix);
-	}
-out:
-	kfree(reply_buf);
-
-	return ret;
-}
-
 static int _rbd_dev_v2_snap_features(struct rbd_device *rbd_dev, u64 snap_id,
 		u64 *snap_features)
 {
@@ -4187,12 +4175,6 @@ static int _rbd_dev_v2_snap_features(struct rbd_device *rbd_dev, u64 snap_id,
 	return 0;
 }
 
-static int rbd_dev_v2_features(struct rbd_device *rbd_dev)
-{
-	return _rbd_dev_v2_snap_features(rbd_dev, CEPH_NOSNAP,
-						&rbd_dev->header.features);
-}
-
 static int rbd_dev_v2_parent_info(struct rbd_device *rbd_dev)
 {
 	struct rbd_spec *parent_spec;
@@ -4549,78 +4531,6 @@ out_err:
 	return ret;
 }
 
-static int rbd_dev_v2_snap_context(struct rbd_device *rbd_dev)
-{
-	size_t size;
-	int ret;
-	void *reply_buf;
-	void *p;
-	void *end;
-	u64 seq;
-	u32 snap_count;
-	struct ceph_snap_context *snapc;
-	u32 i;
-
-	/*
-	 * We'll need room for the seq value (maximum snapshot id),
-	 * snapshot count, and array of that many snapshot ids.
-	 * For now we have a fixed upper limit on the number we're
-	 * prepared to receive.
-	 */
-	size = sizeof (__le64) + sizeof (__le32) +
-			RBD_MAX_SNAP_COUNT * sizeof (__le64);
-	reply_buf = kzalloc(size, GFP_KERNEL);
-	if (!reply_buf)
-		return -ENOMEM;
-
-	ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
-				"rbd", "get_snapcontext", NULL, 0,
-				reply_buf, size);
-	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
-	if (ret < 0)
-		goto out;
-
-	p = reply_buf;
-	end = reply_buf + ret;
-	ret = -ERANGE;
-	ceph_decode_64_safe(&p, end, seq, out);
-	ceph_decode_32_safe(&p, end, snap_count, out);
-
-	/*
-	 * Make sure the reported number of snapshot ids wouldn't go
-	 * beyond the end of our buffer.  But before checking that,
-	 * make sure the computed size of the snapshot context we
-	 * allocate is representable in a size_t.
-	 */
-	if (snap_count > (SIZE_MAX - sizeof (struct ceph_snap_context))
-				 / sizeof (u64)) {
-		ret = -EINVAL;
-		goto out;
-	}
-	if (!ceph_has_room(&p, end, snap_count * sizeof (__le64)))
-		goto out;
-	ret = 0;
-
-	snapc = ceph_create_snap_context(snap_count, GFP_KERNEL);
-	if (!snapc) {
-		ret = -ENOMEM;
-		goto out;
-	}
-	snapc->seq = seq;
-	for (i = 0; i < snap_count; i++)
-		snapc->snaps[i] = ceph_decode_64(&p);
-
-	ceph_put_snap_context(rbd_dev->header.snapc);
-	rbd_dev->header.snapc = snapc;
-
-	dout("  snap context seq = %llu, snap_count = %u\n",
-		(unsigned long long)seq, (unsigned int)snap_count);
-out:
-	kfree(reply_buf);
-
-	return ret;
-}
-
 static const char *rbd_dev_v2_snap_name(struct rbd_device *rbd_dev,
 					u64 snap_id)
 {
@@ -4662,6 +4572,16 @@ out:
 	return snap_name;
 }
 
+static int rbd_dev_v2_mutable_metadata(struct rbd_device *rbd_dev)
+{
+	return rbd_dev_v2_header_data(rbd_dev);
+}
+
+static int rbd_dev_v2_immutable_metadata(struct rbd_device *rbd_dev)
+{
+	return rbd_dev_v2_header_data(rbd_dev);
+}
+
 static int rbd_dev_v2_header_info(struct rbd_device *rbd_dev)
 {
 	bool first_time = rbd_dev->header.object_prefix == NULL;
@@ -4669,27 +4589,39 @@ static int rbd_dev_v2_header_info(struct rbd_device *rbd_dev)
 
 	if (!first_time)
 	{
-		ret = rbd_dev_v2_image_size(rbd_dev);
+		ret = rbd_dev_v2_mutable_metadata(rbd_dev);
 		if (ret)
-			return ret;
-
-		ret = rbd_dev_v2_snap_context(rbd_dev);
-		dout("rbd_dev_v2_snap_context returned %d\n", ret);
-		return ret;
+			goto out_err;
 	}
 	else
 	{
-		ret = rbd_dev_v2_header_onetime(rbd_dev);
+		ret = rbd_dev_v2_immutable_metadata(rbd_dev);
 		if (ret)
-			return ret;
+			goto out_err;
 	}
-	return ret; /* XXX change logic? */
+
+	if (rbd_dev->header.features & RBD_FEATURE_STRIPINGV2)
+	{
+		ret = rbd_dev_v2_striping_info(rbd_dev);
+		if (ret)
+			goto out_err;
+	}
+
+	ret = 0;
+out_err:
+	rbd_dev->header.features = 0;
+	kfree(rbd_dev->header.object_prefix);
+	rbd_dev->header.object_prefix = NULL;
+
+	return ret;
 }
 
 static int rbd_dev_header_info(struct rbd_device *rbd_dev)
 {
 	rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
 
+	/* XXX need to fix 11418 here too? */
+
 	if (rbd_dev->image_format == 1)
 		return rbd_dev_v1_header_info(rbd_dev);
 
@@ -5139,7 +5071,6 @@ static int rbd_obj_header_method_add(struct ceph_osd_request *osd_req,
 	if (outbound_size) {
 		struct ceph_pagelist *pagelist;
 		pagelist = kmalloc(sizeof(*pagelist), GFP_NOFS);
-		/* XXX is this ever freed? */
 		if (!pagelist)
 			return -ENOMEM;
 
@@ -5279,6 +5210,12 @@ static int __extract_snapcontext(struct rbd_device *rbd_dev,
 	int i;
 	int ret;
 
+	/*
+	 * We'll need room for the seq value (maximum snapshot id),
+	 * snapshot count, and array of that many snapshot ids.
+	 * For now we have a fixed upper limit on the number we're
+	 * prepared to receive.
+	 */
 	snapc_max = sizeof(__le64) + sizeof(__le32) +
 		RBD_MAX_SNAP_COUNT * sizeof(__le64);
 
@@ -5297,6 +5234,12 @@ static int __extract_snapcontext(struct rbd_device *rbd_dev,
 	ceph_decode_64_safe(&p, q, seq, out_err);
 	ceph_decode_32_safe(&p, q, snap_count, out_err);
 
+	/*
+	 * Make sure the reported number of snapshot ids wouldn't go
+	 * beyond the end of our buffer.  But before checking that,
+	 * make sure the computed size of the snapshot context we
+	 * allocate is representable in a size_t.
+	 */
 	if (snap_count > (SIZE_MAX - sizeof(struct ceph_snap_context)) /
 		sizeof(u64)) {
 		ret = -EINVAL;
@@ -5339,7 +5282,7 @@ static int __extract_size(struct rbd_device *rbd_dev,
 	return 0;
 }
 
-static int rbd_dev_v2_header_onetime(struct rbd_device *rbd_dev)
+static int rbd_dev_v2_header_data(struct rbd_device *rbd_dev)
 {
 	int ret;
 
@@ -5712,6 +5655,8 @@ static ssize_t do_rbd_add(struct bus_type *bus,
 		read_only = true;
 	rbd_dev->mapping.read_only = read_only;
 
+	rbd_dev->mapping.state = MAP_STATE_MAPPED;
+
 	rc = rbd_dev_device_setup(rbd_dev);
 	if (rc) {
 		/*
-- 
1.9.3


^ permalink raw reply related	[flat|nested] 14+ messages in thread

* Re: [PATCHv2 0/3] rbd: header read/refresh improvements
  2015-04-24 13:22 [PATCHv2 0/3] rbd: header read/refresh improvements Douglas Fuller
                   ` (2 preceding siblings ...)
  2015-04-24 13:22 ` [PATCHv2 3/3] rbd: re-read features during header refresh and detect changes Douglas Fuller
@ 2015-04-26  6:29 ` Alex Elder
  2015-04-26  8:44   ` Ilya Dryomov
  3 siblings, 1 reply; 14+ messages in thread
From: Alex Elder @ 2015-04-26  6:29 UTC (permalink / raw)
  To: Douglas Fuller, ceph-devel

On 04/24/2015 08:22 AM, Douglas Fuller wrote:
> Support multiple class op calls in one ceph_msg and consolidate rbd header
> read and refresh processes to use this feature to reduce the number of
> ceph_msgs sent for that process. Refresh features on header refresh and
> begin returning EIO if features have changed since mapping.

I think I understand the motivation for what you're doing here.
You ultimately want to pack more ops into a message, which
would be really great.  Ideally you could pack some arbitrary
number (much larger than 3), but it seems to me there'll be
some restructuring required to get there.

I am having a hard time understanding your intended
implementation though.  I've looked mainly at your first
patch, and scanned the other two.  Let's see if I have
it right.

You are adding another hunk of data to a class request,
and using it as an intermediate buffer for receiving
data.

Currently a class request has three "hunks":
- request_info, a pagelist data item, which contains the
   class name and method name, and perhaps someday, an
   argument list (?).
- request_data, which can contain any number of data
   items of any type, and constitutes outgoing data for
   a write request.
- response_data, also any number of any type of data
   items, which constitutes space into which incoming
   data is placed.

You are adding a fourth:
- chain_data, which is a page vector big enough to hold
   all incoming response data.

When building up an OSD request, we call osd_req_encode_op()
for every op in the request.  For a class op, that records
the length of the class and method names in the op structure,
and then appends those names to the outbound request_data.
Following that, the "real" request data (if any) provided
with the class op is appended to request_data.

Next (before your patches) whatever buffer space was provided
(as a page array, pagelist, or bio) to receive response data
is appended to response_data.  Prior to this, the class
op will have been set up with adequate buffers to directly
receive the expected data.

What your first patch seems to do is circumvent this last
step, and instead of using the prearranged buffers, you
allocate another page vector, big enough to hold the
combined size of all response_data data items, and store
that in the new chain_data field of an OSD request class op.
(Note that this could be in *any* of the ops in an OSD
request, though I suspect it's always the first.)

When a response arrives, if the *first* op in the request
array is CEPH_OSD_OP_CALL, you "split" the data, which will
have been deposited in the "chain_data" page vector.  That
process involves allocating another buffer sufficient to
hold the entirety of the received data.  You copy the
received data into that buffer and release the chain_data
page vector.  And then you copy the data from this new
buffer back into the original response array, and then
free the buffer.

This sounds pretty expensive.  For every class operation
you are copying the received data two extra times.

It's possible I'm not understanding what you're doing here
and that's why I'm writing this now.  Before I provide
any more specific commentary on your patches, I want to
be sure I understand what you're trying to do.

If my understanding *is* correct, I would say you're
going about this the wrong way, and I may have some
suggestions for a better approach.

Will you please correct me where I'm wrong above, and
maybe give a little better high-level explanation of
what you're trying to do?  I see in patch 1 you mention
a problem with short reads, and there may be a simpler
fix than that (to begin with).  But beyond that, if
you're trying to incorporate more ops in a message,
there may be better ways to go about that as well.

Thanks.

					-Alex

> v2: Edit history and address comments from Mike Christie.
>
> Douglas Fuller (3):
>    ceph: support multiple class method calls in one ceph_msg
>    rbd: combine object method calls in header refresh using fewer
>      ceph_msgs
>    rbd: re-read features during header refresh and detect changes.
>
>   drivers/block/rbd.c             | 512 +++++++++++++++++++++++++++++-----------
>   include/linux/ceph/osd_client.h |   3 +-
>   net/ceph/messenger.c            |   4 +
>   net/ceph/osd_client.c           |  90 ++++++-
>   4 files changed, 462 insertions(+), 147 deletions(-)
>


^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCHv2 0/3] rbd: header read/refresh improvements
  2015-04-26  6:29 ` [PATCHv2 0/3] rbd: header read/refresh improvements Alex Elder
@ 2015-04-26  8:44   ` Ilya Dryomov
  2015-04-26 14:28     ` Douglas Fuller
  0 siblings, 1 reply; 14+ messages in thread
From: Ilya Dryomov @ 2015-04-26  8:44 UTC (permalink / raw)
  To: Alex Elder; +Cc: Douglas Fuller, Ceph Development

On Sun, Apr 26, 2015 at 9:29 AM, Alex Elder <elder@ieee.org> wrote:
> On 04/24/2015 08:22 AM, Douglas Fuller wrote:
>>
>> Support multiple class op calls in one ceph_msg and consolidate rbd header
>> read and refresh processes to use this feature to reduce the number of
>> ceph_msgs sent for that process. Refresh features on header refresh and
>> begin returning EIO if features have changed since mapping.
>
>
> I think I understand the motivation for what you're doing here.
> You ultimately want to pack more ops into a message, which
> would be really great.  Ideally you could pack some arbitrary
> number (much larger than 3), but it seems to me there'll be
> some restructuring required to get there.
>
> I am having a hard time understanding your intended
> implementation though.  I've looked mainly at your first
> patch, and scanned the other two.  Let's see if I have
> it right.
>
> You are adding another hunk of data to a class request,
> and using it as an intermediate buffer for receiving
> data.
>
> Currently a class request has three "hunks":
> - request_info, a pagelist data item, which contains the
>   class name and method name, and perhaps someday, an
>   argument list (?).
> - request_data, which can contain any number of data
>   items of any type, and constitutes outgoing data for
>   a write request.
> - response_data, also any number of any type of data
>   items, which constitutes space into which incoming
>   data is placed.
>
> You are adding a fourth:
> - chain_data, which is a page vector big enough to hold
>   all incoming response data.
>
> When building up an OSD request, we call osd_req_encode_op()
> for every op in the request.  For a class op, that records
> the length of the class and method names in the op structure,
> and then appends those names to the outbound request_data.
> Following that, the "real" request data (if any) provided
> with the class op is appended to request_data.
>
> Next (before your patches) whatever buffer space was provided
> (as a page array, pagelist, or bio) to receive response data
> is appended to response_data.  Prior to this, the class
> op will have been set up with adequate buffers to directly
> receive the expected data.
>
> What your first patch seems to do is circumvent this last
> step, and instead of using the prearranged buffers, you
> allocate another page vector, big enough to hold the
> combined size of all response_data data items, and store
> that in the new chain_data field of an OSD request class op.
> (Note that this could be in *any* of the ops in an OSD
> request, though I suspect it's always the first.)
>
> When a response arrives, if the *first* op in the request
> array is CEPH_OSD_OP_CALL, you "split" the data, which will
> have been deposited in the "chain_data" page vector.  That
> process involves allocating another buffer sufficient to
> hold the entirety of the received data.  You copy the
> received data into that buffer and release the chain_data
> page vector.  And then you copy the data from this new
> buffer back into the original response array, and then
> free the buffer.
>
> This sounds pretty expensive.  For every class operation
> you are copying the received data two extra times.
>
> It's possible I'm not understanding what you're doing here
> and that's why I'm writing this now.  Before I provide
> any more specific commentary on your patches, I want to
> be sure I understand what you're trying to do.
>
> If my understanding *is* correct, I would say you're
> going about this the wrong way, and I may have some
> suggestions for a better approach.
>
> Will you please correct me where I'm wrong above, and
> maybe give a little better high-level explanation of
> what you're trying to do?  I see in patch 1 you mention
> a problem with short reads, and there may be a simpler
> fix than that (to begin with).  But beyond that, if
> you're trying to incorporate more ops in a message,
> there may be better ways to go about that as well.

Yeah, the only objective of this was to pack more call ops into an
osd_request in order to reduce the number of network RTs during rbd map
and refresh.  The fundamental problem the first patch is trying to work
around is the first ceph_msg_data item consuming all reply buffers
instead of just its own.  We have to preallocate response buffers and
if the preallocated response buffer for the first op is large enough
(it always is) it will consume the entire ceph_msg, along with replies
to other ops.

My understanding is that the first patch is supposed to be a specific
workaround for just that - I think it's noted somewhere that it will
break on reads combined with call ops or similar.

I too have my efficiency concerns and I raised them in one of the
standups but the argument was that this is only going to be used for
header init/refresh, not copyup, so the overhead is negligible.  I'm
still not sold on the copying everything twice though and was going to
try to see if there is a simple way to special case this even more and
make the diffstat smaller.

Thanks,

                Ilya

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCHv2 0/3] rbd: header read/refresh improvements
  2015-04-26  8:44   ` Ilya Dryomov
@ 2015-04-26 14:28     ` Douglas Fuller
  2015-04-26 14:39       ` Alex Elder
  0 siblings, 1 reply; 14+ messages in thread
From: Douglas Fuller @ 2015-04-26 14:28 UTC (permalink / raw)
  To: Ilya Dryomov; +Cc: Alex Elder, Ceph Development

Alex,

I think you are correct in both your understanding and your impression of the approach.

> On Apr 26, 2015, at 4:44 AM, Ilya Dryomov <idryomov@gmail.com> wrote:
> 
> On Sun, Apr 26, 2015 at 9:29 AM, Alex Elder <elder@ieee.org> wrote:
>> On 04/24/2015 08:22 AM, Douglas Fuller wrote:
>>> 
>>> Support multiple class op calls in one ceph_msg and consolidate rbd header
>>> read and refresh processes to use this feature to reduce the number of
>>> ceph_msgs sent for that process. Refresh features on header refresh and
>>> begin returning EIO if features have changed since mapping.
>> 
>> This sounds pretty expensive.  For every class operation
>> you are copying the received data two extra times.

I’d really like a solution to this, but I don’t think one is available given the constraints.

>> Will you please correct me where I'm wrong above, and
>> maybe give a little better high-level explanation of
>> what you're trying to do?  I see in patch 1 you mention
>> a problem with short reads, and there may be a simpler
>> fix than that (to begin with).  But beyond that, if
>> you're trying to incorporate more ops in a message,
>> there may be better ways to go about that as well.
> 
> Yeah, the only objective of this was to pack more call ops into an
> osd_request in order to reduce the number of network RTs during rbd map
> and refresh.  The fundamental problem the first patch is trying to work
> around is the first ceph_msg_data item consuming all reply buffers
> instead of just its own.  We have to preallocate response buffers and
> if the preallocated response buffer for the first op is large enough
> (it always is) it will consume the entire ceph_msg, along with replies
> to other ops.
> 
> My understanding is that the first patch is supposed to be a specific
> workaround for just that - I think it's noted somewhere that it will
> break on reads combined with call ops or similar.

That’s correct. This patch only works around that short response issue in this specific corner case. It does not fix cases where call ops returning data could be arbitrarily combined with reads or writes (and apparently they never have been, because that would not work). It should not introduce additional cases of breakage, though.

I was told that the preferred way to proceed for now was to avoid changing the osd_client interface and to handle this case in osd_client instead of the messaging layer. Given those constraints, it was agreed in the standups and on #ceph-devel that it should be done this way.

We can’t know the actual response sizes until they are decoded in osd_client. Therefore, we can’t copy them directly to the user buffers off the wire. That costs us one extra copy. The other copy is required because pagevec.c does not provide an interface for copying arbitrary data between two page vectors.

> I too have my efficiency concerns and I raised them in one of the
> standups but the argument was that this is only going to be used for
> header init/refresh, not copyup, so the overhead is negligible.  I'm
> still not sold on the copying everything twice though and was going to
> try to see if there is a simple way to special case this even more and
> make the diffstat smaller.

You and I agreed in that particular standup discussion; I don’t like it, either. I would prefer a general-case solution that avoids introducing so much extra overhead, especially for such a small amount of data (really just a few bytes).

If there’s a way to handle this with a better method, I’m all ears. I had thought of taking advantage of the fact that the sum total of all this data will never exceed a single page, but that seems to me to require working around even more functionality for what is, essentially, a corner case.

-Doug


--
To unsubscribe from this list: send the line "unsubscribe ceph-devel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCHv2 0/3] rbd: header read/refresh improvements
  2015-04-26 14:28     ` Douglas Fuller
@ 2015-04-26 14:39       ` Alex Elder
  2015-04-26 15:07         ` Douglas Fuller
  0 siblings, 1 reply; 14+ messages in thread
From: Alex Elder @ 2015-04-26 14:39 UTC (permalink / raw)
  To: Douglas Fuller, Ilya Dryomov; +Cc: Ceph Development

On 04/26/2015 09:28 AM, Douglas Fuller wrote:
> Alex,
>
> I think you are correct in both your understanding and your
> impression of the approach.

OK, good to hear.

>> On Apr 26, 2015, at 4:44 AM, Ilya Dryomov <idryomov@gmail.com>
>> wrote:
>>
>> On Sun, Apr 26, 2015 at 9:29 AM, Alex Elder <elder@ieee.org>
>> wrote:
>>> On 04/24/2015 08:22 AM, Douglas Fuller wrote:
>>>>
>>>> Support multiple class op calls in one ceph_msg and consolidate
>>>> rbd header read and refresh processes to use this feature to
>>>> reduce the number of ceph_msgs sent for that process. Refresh
>>>> features on header refresh and begin returning EIO if features
>>>> have changed since mapping.
>>>
>>> This sounds pretty expensive.  For every class operation you are
>>> copying the received data two extra times.
>
> I’d really like a solution to this, but I don’t think one is
> available given the constraints.

Now that I know I understand it, I will at least provide
some review comments on the patches.  I'll also think about
it a little.  My instinct says there should be no need to
make the copies, but the devil will be in the details.

>>> Will you please correct me where I'm wrong above, and maybe give
>>> a little better high-level explanation of what you're trying to
>>> do?  I see in patch 1 you mention a problem with short reads, and
>>> there may be a simpler fix than that (to begin with).  But beyond
>>> that, if you're trying to incorporate more ops in a message,
>>> there may be better ways to go about that as well.
>>
>> Yeah, the only objective of this was to pack more call ops into an
>> osd_request in order to reduce the number of network RTs during rbd
>> map and refresh.  The fundamental problem the first patch is trying
>> to work around is the first ceph_msg_data item consuming all reply
>> buffers instead of just its own.  We have to preallocate response
>> buffers and if the preallocated response buffer for the first op is
>> large enough (it always is) it will consume the entire ceph_msg,
>> along with replies to other ops.
>>
>> My understanding is that the first patch is supposed to be a
>> specific workaround for just that - I think it's noted somewhere
>> that it will break on reads combined with call ops or similar.
>
> That’s correct. This patch only works around that short response
> issue in this specific corner case. It does not fix cases where call
> ops returning data could be arbitrarily combined with reads or writes
> (and apparently they never have been, because that would not work).
> It should not introduce additional cases of breakage, though.

I knew when we added support for copyup that we were kind of
kludging things to allow *just one* more op in a message.  Then
the discard support added another, but in both of these cases
it was not done in a nice, general way.  The "right" way is
to generalize it, but if that was easy it would have been
done by now.

> I was told that the preferred way to proceed for now was to avoid
> changing the osd_client interface and to handle this case in
> osd_client instead of the messaging layer. Given those constraints,
> it was agreed in the standups and on #ceph-devel that it should be
> done this way.

Sorry, I haven't been keeping up with all the activity on the
mailing list.  I pay closest attention to patches to the kernel
code.

> We can’t know the actual response sizes until they are decoded in
> osd_client. Therefore, we can’t copy them directly to the user
> buffers off the wire. That costs us one extra copy. The other copy is
> required because pagevec.c does not provide an interface for copying
> arbitrary data between two page vectors.

That could probably be changed, but that's a separate issue
(and somewhat out of your hands as far as getting it done when
you need it).

>> I too have my efficiency concerns and I raised them in one of the
>> standups but the argument was that this is only going to be used
>> for header init/refresh, not copyup, so the overhead is negligible.
>> I'm still not sold on the copying everything twice though and was
>> going to try to see if there is a simple way to special case this
>> even more and make the diffstat smaller.
>
> You and I agreed in that particular standup discussion; I don’t like
> it, either. I would prefer a general-case solution that avoids
> introducing so much extra overhead, especially for such a small
> amount of data (really just a few bytes).
>
> If there’s a way to handle this with a better method, I’m all ears. I
> had thought of taking advantage of the fact that the sum total of all
> this data will never exceed a single page, but that seems to me to
> require working around even more functionality for what is,
> essentially, a corner case.

OK, well I'll go look at that code path again to see if I can
come up with any bright ideas.

Thanks for the explanation.

					-Alex
> -Doug
>
>

--
To unsubscribe from this list: send the line "unsubscribe ceph-devel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCHv2 0/3] rbd: header read/refresh improvements
  2015-04-26 14:39       ` Alex Elder
@ 2015-04-26 15:07         ` Douglas Fuller
  2015-04-27  4:15           ` Sage Weil
  0 siblings, 1 reply; 14+ messages in thread
From: Douglas Fuller @ 2015-04-26 15:07 UTC (permalink / raw)
  To: Alex Elder; +Cc: Ilya Dryomov, Ceph Development


>> I’d really like a solution to this, but I don’t think one is
>> available given the constraints.
> 
> Now that I know I understand it, I will at least provide
> some review comments on the patches.  I'll also think about
> it a little.  My instinct says there should be no need to
> make the copies, but the devil will be in the details.

Yeah, it seems like it should be simpler. To fix it, I think we would have to either commit a layering violation (decode the call ops’ response sizes in the message receive flow) or change the interface to osd_client (don’t provide preallocated pagevecs for call response data even though that’s what’s done for other types of ops). I’m the new guy, though, so it’s certainly possible I’m missing something.

>> I was told that the preferred way to proceed for now was to avoid
>> changing the osd_client interface and to handle this case in
>> osd_client instead of the messaging layer. Given those constraints,
>> it was agreed in the standups and on #ceph-devel that it should be
>> done this way.
> 
> Sorry, I haven't been keeping up with all the activity on the
> mailing list.  I pay closest attention to patches to the kernel
> code.

Well, this was worked out mainly in the standups and on IRC. There wasn’t a mailing list discussion on it. Having said that, the attention to the patch is certainly appreciated.

> OK, well I'll go look at that code path again to see if I
> can
> come up with any bright ideas.

Thanks very much.

-Doug--
To unsubscribe from this list: send the line "unsubscribe ceph-devel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCHv2 0/3] rbd: header read/refresh improvements
  2015-04-26 15:07         ` Douglas Fuller
@ 2015-04-27  4:15           ` Sage Weil
  2015-04-27  4:35             ` Douglas Fuller
  0 siblings, 1 reply; 14+ messages in thread
From: Sage Weil @ 2015-04-27  4:15 UTC (permalink / raw)
  To: Douglas Fuller; +Cc: Alex Elder, Ilya Dryomov, Ceph Development

On Sun, 26 Apr 2015, Douglas Fuller wrote:
> >> I’d really like a solution to this, but I don’t think one is
> >> available given the constraints.
> > 
> > Now that I know I understand it, I will at least provide
> > some review comments on the patches.  I'll also think about
> > it a little.  My instinct says there should be no need to
> > make the copies, but the devil will be in the details.
> 
> Yeah, it seems like it should be simpler. To fix it, I think we would have to either commit a layering violation (decode the call ops’ response sizes in the message receive flow) or change the interface to osd_client (don’t provide preallocated pagevecs for call response data even though that’s what’s done for other types of ops). I’m the new guy, though, so it’s certainly possible I’m missing something.
> 
> >> I was told that the preferred way to proceed for now was to avoid
> >> changing the osd_client interface and to handle this case in
> >> osd_client instead of the messaging layer. Given those constraints,
> >> it was agreed in the standups and on #ceph-devel that it should be
> >> done this way.
> > 
> > Sorry, I haven't been keeping up with all the activity on the
> > mailing list.  I pay closest attention to patches to the kernel
> > code.
> 
> Well, this was worked out mainly in the standups and on IRC. There wasn’t a mailing list discussion on it. Having said that, the attention to the patch is certainly appreciated.

Two basic approaches come to mind:

1) add an up-call between the front and data portions.  Right now we have 
a get_reply between header and front that is used to get the buffers for 
front (and data).  We could add a second that comes after front to prepare 
the data buffers.  This would basically change the sequence from

 - read header
 - get/allocate ceph_msg, along with front, middle, and data buffers
 - read front, middle, data
 - dispatch message
    - decode
    - process

to

 - read header
 - get/allocate ceph_msg, along with front, maybe middle, and data buffers
 - read front, middle
 - call get_data hook
    - decode front, prepare/allocate data buffers
 - read data
 - dispatch message
    - process

I think that's a viable approach, but it means a fair bit of refactoring 
to separate the decoding from the processing (at least for 
osd_reply)--right now those two steps are combined in all of the 
handle_* functions.

2) Introduce a protocol change so that the section boundaries are 
described in a generic way.  Even more work!

Given the only users we have in mind have very small data payloads, 
copying seems simplest to me, but it's up to you guys.  Maybe there is 
something much simpler I'm missing?  Somehow we have to do the front 
parsing before reading data... and probably we want to avoid parsing it 
twice...

sage

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCHv2 0/3] rbd: header read/refresh improvements
  2015-04-27  4:15           ` Sage Weil
@ 2015-04-27  4:35             ` Douglas Fuller
  2015-04-27  4:40               ` Alex Elder
  0 siblings, 1 reply; 14+ messages in thread
From: Douglas Fuller @ 2015-04-27  4:35 UTC (permalink / raw)
  To: Sage Weil; +Cc: Alex Elder, Ilya Dryomov, Ceph Development





> On Apr 27, 2015, at 12:15 AM, Sage Weil <sage@newdream.net> wrote:
> 
> Two basic approaches come to mind:
> 
> 1) add an up-call between the front and data portions.  Right now we have 
> a get_reply between header and front that is used to get the buffers for 
> front (and data).  We could add a second that comes after front to prepare 
> the data buffers.  This would basically change the sequence from
> 
> - read header
> - get/allocate ceph_msg, along with front, middle, and data buffers
> - read front, middle, data
> - dispatch message
>    - decode
>    - process
> 
> to
> 
> - read header
> - get/allocate ceph_msg, along with front, maybe middle, and data buffers
> - read front, middle
> - call get_data hook
>    - decode front, prepare/allocate data buffers
> - read data
> - dispatch message
>    - process
> 
> I think that's a viable approach, but it means a fair bit of refactoring 
> to separate the decoding from the processing (at least for 
> osd_reply)--right now those two steps are combined in all of the 
> handle_* functions.

In the kernel client, the data buffers are allocated before the message is even sent. The messaging code reads the reply data directly into those preallocated buffers. If we add a call up to decode the message size and allocate the data buffers when we process the reply, we have to change the osd_client interface.

> 2) Introduce a protocol change so that the section boundaries are 
> described in a generic way.  Even more work!

We could also pad the relevant op replies so that each has a fixed length. It would be a protocol change, but a smaller one, I assume.

> Given the only users we have in mind have very small data payloads, 
> copying seems simplest to me, but it's up to you guys.  Maybe there is 
> something much simpler I'm missing?  Somehow we have to do the front 
> parsing before reading data... and probably we want to avoid parsing it 
> twice...


This solution just feels hacky and inefficient, so I think there is a desire to feel like we at least tried to come up with something simpler and more efficient before proceeding.

-Doug

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCHv2 0/3] rbd: header read/refresh improvements
  2015-04-27  4:35             ` Douglas Fuller
@ 2015-04-27  4:40               ` Alex Elder
  2015-05-07 15:28                 ` Alex Elder
  0 siblings, 1 reply; 14+ messages in thread
From: Alex Elder @ 2015-04-27  4:40 UTC (permalink / raw)
  To: Douglas Fuller, Sage Weil; +Cc: Ilya Dryomov, Ceph Development

On 04/26/2015 11:35 PM, Douglas Fuller wrote:
> This solution just feels hacky and inefficient, so I think there is a
> desire to feel like we at least tried to come up with something simpler
> and more efficient before proceeding.

Right.  And somehow more fitting with the existing code, if
that's possible.  There are a few things in this proposal
that are just special case code, and it would be nice to
avoid that.

I'm sorry I've been too busy today to get back to this.  I spent
a lot of time on Saturday getting ramped back up on this code
so I could provide a decent review...

I'm going to at least look at the code (error on a read op)
before going to bed, so I at least know what the problem is.
I'll try to have something to say (even if it's "I've got
nothing") in the next day or two.

					-Alex

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCHv2 0/3] rbd: header read/refresh improvements
  2015-04-27  4:40               ` Alex Elder
@ 2015-05-07 15:28                 ` Alex Elder
  2015-05-07 16:48                   ` Douglas Fuller
  0 siblings, 1 reply; 14+ messages in thread
From: Alex Elder @ 2015-05-07 15:28 UTC (permalink / raw)
  To: Douglas Fuller, Sage Weil; +Cc: Ilya Dryomov, Ceph Development

On 04/26/2015 11:40 PM, Alex Elder wrote:
> On 04/26/2015 11:35 PM, Douglas Fuller wrote:
>> This solution just feels hacky and inefficient, so I think there is a
>> desire to feel like we at least tried to come up with something simpler
>> and more efficient before proceeding.
> 
> Right.  And somehow more fitting with the existing code, if
> that's possible.  There are a few things in this proposal
> that are just special case code, and it would be nice to
> avoid that.
> 
> I'm sorry I've been too busy today to get back to this.  I spent
> a lot of time on Saturday getting ramped back up on this code
> so I could provide a decent review...
> 
> I'm going to at least look at the code (error on a read op)
> before going to bed, so I at least know what the problem is.
> I'll try to have something to say (even if it's "I've got
> nothing") in the next day or two.
> 
>                     -Alex

It's been a week and a half since I sent this.  I don't
expect to be able to contribute anything for at least a
few more days.

If adding this functionality is something you need quickly,
I don't want you to be waiting for me.

If you can wait, I will gladly review it and offer my
suggestions for a better way (if I can think of one...).

But if not, you should just proceed with what you have
if it solves the problem.

I just don't want to keep you waiting.

					-Alex

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCHv2 0/3] rbd: header read/refresh improvements
  2015-05-07 15:28                 ` Alex Elder
@ 2015-05-07 16:48                   ` Douglas Fuller
  0 siblings, 0 replies; 14+ messages in thread
From: Douglas Fuller @ 2015-05-07 16:48 UTC (permalink / raw)
  To: Alex Elder; +Cc: Sage Weil, Ilya Dryomov, Ceph Development

Alex,

Thanks for circling back.

We should prioritize correctness and accuracy over promptness here; this isn’t a critical feature and I’d like my first submission to receive a thorough review.

Thanks,
—Doug

> On May 7, 2015, at 11:28 AM, Alex Elder <elder@ieee.org> wrote:
> 
> On 04/26/2015 11:40 PM, Alex Elder wrote:
>> On 04/26/2015 11:35 PM, Douglas Fuller wrote:
>>> This solution just feels hacky and inefficient, so I think there is a
>>> desire to feel like we at least tried to come up with something simpler
>>> and more efficient before proceeding.
>> 
>> Right.  And somehow more fitting with the existing code, if
>> that's possible.  There are a few things in this proposal
>> that are just special case code, and it would be nice to
>> avoid that.
>> 
>> I'm sorry I've been too busy today to get back to this.  I spent
>> a lot of time on Saturday getting ramped back up on this code
>> so I could provide a decent review...
>> 
>> I'm going to at least look at the code (error on a read op)
>> before going to bed, so I at least know what the problem is.
>> I'll try to have something to say (even if it's "I've got
>> nothing") in the next day or two.
>> 
>>                    -Alex
> 
> It's been a week and a half since I sent this.  I don't
> expect to be able to contribute anything for at least a
> few more days.
> 
> If adding this functionality is something you need quickly,
> I don't want you to be waiting for me.
> 
> If you can wait, I will gladly review it and offer my
> suggestions for a better way (if I can think of one...).
> 
> But if not, you should just proceed with what you have
> if it solves the problem.
> 
> I just don't want to keep you waiting.
> 
> 					-Alex

--
To unsubscribe from this list: send the line "unsubscribe ceph-devel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 14+ messages in thread

end of thread, other threads:[~2015-05-07 16:48 UTC | newest]

Thread overview: 14+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2015-04-24 13:22 [PATCHv2 0/3] rbd: header read/refresh improvements Douglas Fuller
2015-04-24 13:22 ` [PATCHv2 1/3] ceph: support multiple class method calls in one ceph_msg Douglas Fuller
2015-04-24 13:22 ` [PATCHv2 2/3] rbd: combine object method calls in header refresh using fewer ceph_msgs Douglas Fuller
2015-04-24 13:22 ` [PATCHv2 3/3] rbd: re-read features during header refresh and detect changes Douglas Fuller
2015-04-26  6:29 ` [PATCHv2 0/3] rbd: header read/refresh improvements Alex Elder
2015-04-26  8:44   ` Ilya Dryomov
2015-04-26 14:28     ` Douglas Fuller
2015-04-26 14:39       ` Alex Elder
2015-04-26 15:07         ` Douglas Fuller
2015-04-27  4:15           ` Sage Weil
2015-04-27  4:35             ` Douglas Fuller
2015-04-27  4:40               ` Alex Elder
2015-05-07 15:28                 ` Alex Elder
2015-05-07 16:48                   ` Douglas Fuller

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.