All of lore.kernel.org
 help / color / mirror / Atom feed
* Re: [PATCH] rbd: use writefull op for object size writes
       [not found] ` <56153BF3.5050403@ieee.org>
@ 2015-10-07 16:13   ` Ilya Dryomov
  0 siblings, 0 replies; 3+ messages in thread
From: Ilya Dryomov @ 2015-10-07 16:13 UTC (permalink / raw)
  To: Alex Elder; +Cc: Ceph Development

On Wed, Oct 7, 2015 at 5:36 PM, Alex Elder <elder@ieee.org> wrote:
> On 10/07/2015 12:02 PM, Ilya Dryomov wrote:
>> This covers only the simplest case - an object size sized write, but
>> it's still useful in tiering setups when EC is used for the base tier
>> as writefull op can be proxied, saving an object promotion.
>>
>> Even though updating ceph_osdc_new_request() to allow writefull should
>> just be a matter of fixing an assert, I didn't do it because its only
>> user is cephfs.  All other sites were updated.
>>
>> Reflects ceph.git commit 7bfb7f9025a8ee0d2305f49bf0336d2424da5b5b.
>
> I haven't looked at this at all.  But can you give me a
> short explanation of what "writefull" is?
>
> Full object write?

Well, in a way.  It replaces previous data: you can think of it as an
atomic truncate to 0 + write from offset 0.  So it always writes
(replaces) entire objects.

Thanks,

                Ilya

^ permalink raw reply	[flat|nested] 3+ messages in thread

* [PATCH] rbd: use writefull op for object size writes
@ 2015-10-07 17:02 Ilya Dryomov
       [not found] ` <56153BF3.5050403@ieee.org>
  2015-10-16 12:24 ` Alex Elder
  0 siblings, 2 replies; 3+ messages in thread
From: Ilya Dryomov @ 2015-10-07 17:02 UTC (permalink / raw)
  To: ceph-devel

This covers only the simplest case - an object-sized write - but
it's still useful in tiering setups when EC is used for the base tier,
as the writefull op can be proxied, saving an object promotion.

Even though updating ceph_osdc_new_request() to allow writefull should
just be a matter of fixing an assert, I didn't do it because its only
user is cephfs.  All other sites were updated.

Reflects ceph.git commit 7bfb7f9025a8ee0d2305f49bf0336d2424da5b5b.

Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
---
 drivers/block/rbd.c   |  9 +++++++--
 net/ceph/osd_client.c | 13 +++++++++----
 2 files changed, 16 insertions(+), 6 deletions(-)

diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index 04e69b4df664..cd00e4653e49 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -1863,9 +1863,11 @@ static void rbd_osd_req_callback(struct ceph_osd_request *osd_req,
 		rbd_osd_read_callback(obj_request);
 		break;
 	case CEPH_OSD_OP_SETALLOCHINT:
-		rbd_assert(osd_req->r_ops[1].op == CEPH_OSD_OP_WRITE);
+		rbd_assert(osd_req->r_ops[1].op == CEPH_OSD_OP_WRITE ||
+			   osd_req->r_ops[1].op == CEPH_OSD_OP_WRITEFULL);
 		/* fall through */
 	case CEPH_OSD_OP_WRITE:
+	case CEPH_OSD_OP_WRITEFULL:
 		rbd_osd_write_callback(obj_request);
 		break;
 	case CEPH_OSD_OP_STAT:
@@ -2401,7 +2403,10 @@ static void rbd_img_obj_request_fill(struct rbd_obj_request *obj_request,
 				opcode = CEPH_OSD_OP_ZERO;
 		}
 	} else if (op_type == OBJ_OP_WRITE) {
-		opcode = CEPH_OSD_OP_WRITE;
+		if (!offset && length == object_size)
+			opcode = CEPH_OSD_OP_WRITEFULL;
+		else
+			opcode = CEPH_OSD_OP_WRITE;
 		osd_req_op_alloc_hint_init(osd_request, num_ops,
 					object_size, object_size);
 		num_ops++;
diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c
index 80b94e37c94a..f79ccac6699f 100644
--- a/net/ceph/osd_client.c
+++ b/net/ceph/osd_client.c
@@ -285,6 +285,7 @@ static void osd_req_op_data_release(struct ceph_osd_request *osd_req,
 	switch (op->op) {
 	case CEPH_OSD_OP_READ:
 	case CEPH_OSD_OP_WRITE:
+	case CEPH_OSD_OP_WRITEFULL:
 		ceph_osd_data_release(&op->extent.osd_data);
 		break;
 	case CEPH_OSD_OP_CALL:
@@ -485,13 +486,14 @@ void osd_req_op_extent_init(struct ceph_osd_request *osd_req,
 	size_t payload_len = 0;
 
 	BUG_ON(opcode != CEPH_OSD_OP_READ && opcode != CEPH_OSD_OP_WRITE &&
-	       opcode != CEPH_OSD_OP_ZERO && opcode != CEPH_OSD_OP_TRUNCATE);
+	       opcode != CEPH_OSD_OP_WRITEFULL && opcode != CEPH_OSD_OP_ZERO &&
+	       opcode != CEPH_OSD_OP_TRUNCATE);
 
 	op->extent.offset = offset;
 	op->extent.length = length;
 	op->extent.truncate_size = truncate_size;
 	op->extent.truncate_seq = truncate_seq;
-	if (opcode == CEPH_OSD_OP_WRITE)
+	if (opcode == CEPH_OSD_OP_WRITE || opcode == CEPH_OSD_OP_WRITEFULL)
 		payload_len += length;
 
 	op->payload_len = payload_len;
@@ -670,9 +672,11 @@ static u64 osd_req_encode_op(struct ceph_osd_request *req,
 		break;
 	case CEPH_OSD_OP_READ:
 	case CEPH_OSD_OP_WRITE:
+	case CEPH_OSD_OP_WRITEFULL:
 	case CEPH_OSD_OP_ZERO:
 	case CEPH_OSD_OP_TRUNCATE:
-		if (src->op == CEPH_OSD_OP_WRITE)
+		if (src->op == CEPH_OSD_OP_WRITE ||
+		    src->op == CEPH_OSD_OP_WRITEFULL)
 			request_data_len = src->extent.length;
 		dst->extent.offset = cpu_to_le64(src->extent.offset);
 		dst->extent.length = cpu_to_le64(src->extent.length);
@@ -681,7 +685,8 @@ static u64 osd_req_encode_op(struct ceph_osd_request *req,
 		dst->extent.truncate_seq =
 			cpu_to_le32(src->extent.truncate_seq);
 		osd_data = &src->extent.osd_data;
-		if (src->op == CEPH_OSD_OP_WRITE)
+		if (src->op == CEPH_OSD_OP_WRITE ||
+		    src->op == CEPH_OSD_OP_WRITEFULL)
 			ceph_osdc_msg_data_add(req->r_request, osd_data);
 		else
 			ceph_osdc_msg_data_add(req->r_reply, osd_data);
-- 
2.4.3


^ permalink raw reply related	[flat|nested] 3+ messages in thread

* Re: [PATCH] rbd: use writefull op for object size writes
  2015-10-07 17:02 [PATCH] rbd: use writefull op for object size writes Ilya Dryomov
       [not found] ` <56153BF3.5050403@ieee.org>
@ 2015-10-16 12:24 ` Alex Elder
  1 sibling, 0 replies; 3+ messages in thread
From: Alex Elder @ 2015-10-16 12:24 UTC (permalink / raw)
  To: Ilya Dryomov, ceph-devel

On 10/07/2015 12:02 PM, Ilya Dryomov wrote:
> This covers only the simplest case - an object size sized write, but
> it's still useful in tiering setups when EC is used for the base tier
> as writefull op can be proxied, saving an object promotion.
> 
> Even though updating ceph_osdc_new_request() to allow writefull should
> just be a matter of fixing an assert, I didn't do it because its only
> user is cephfs.  All other sites were updated.
> 
> Reflects ceph.git commit 7bfb7f9025a8ee0d2305f49bf0336d2424da5b5b.
> 
> Signed-off-by: Ilya Dryomov <idryomov@gmail.com>

Looks good to me.

Reviewed-by: Alex Elder <elder@linaro.org>

> ---
>  drivers/block/rbd.c   |  9 +++++++--
>  net/ceph/osd_client.c | 13 +++++++++----
>  2 files changed, 16 insertions(+), 6 deletions(-)
> 
> diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
> index 04e69b4df664..cd00e4653e49 100644
> --- a/drivers/block/rbd.c
> +++ b/drivers/block/rbd.c
> @@ -1863,9 +1863,11 @@ static void rbd_osd_req_callback(struct ceph_osd_request *osd_req,
>  		rbd_osd_read_callback(obj_request);
>  		break;
>  	case CEPH_OSD_OP_SETALLOCHINT:
> -		rbd_assert(osd_req->r_ops[1].op == CEPH_OSD_OP_WRITE);
> +		rbd_assert(osd_req->r_ops[1].op == CEPH_OSD_OP_WRITE ||
> +			   osd_req->r_ops[1].op == CEPH_OSD_OP_WRITEFULL);
>  		/* fall through */
>  	case CEPH_OSD_OP_WRITE:
> +	case CEPH_OSD_OP_WRITEFULL:
>  		rbd_osd_write_callback(obj_request);
>  		break;
>  	case CEPH_OSD_OP_STAT:
> @@ -2401,7 +2403,10 @@ static void rbd_img_obj_request_fill(struct rbd_obj_request *obj_request,
>  				opcode = CEPH_OSD_OP_ZERO;
>  		}
>  	} else if (op_type == OBJ_OP_WRITE) {
> -		opcode = CEPH_OSD_OP_WRITE;
> +		if (!offset && length == object_size)
> +			opcode = CEPH_OSD_OP_WRITEFULL;
> +		else
> +			opcode = CEPH_OSD_OP_WRITE;
>  		osd_req_op_alloc_hint_init(osd_request, num_ops,
>  					object_size, object_size);
>  		num_ops++;
> diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c
> index 80b94e37c94a..f79ccac6699f 100644
> --- a/net/ceph/osd_client.c
> +++ b/net/ceph/osd_client.c
> @@ -285,6 +285,7 @@ static void osd_req_op_data_release(struct ceph_osd_request *osd_req,
>  	switch (op->op) {
>  	case CEPH_OSD_OP_READ:
>  	case CEPH_OSD_OP_WRITE:
> +	case CEPH_OSD_OP_WRITEFULL:
>  		ceph_osd_data_release(&op->extent.osd_data);
>  		break;
>  	case CEPH_OSD_OP_CALL:
> @@ -485,13 +486,14 @@ void osd_req_op_extent_init(struct ceph_osd_request *osd_req,
>  	size_t payload_len = 0;
>  
>  	BUG_ON(opcode != CEPH_OSD_OP_READ && opcode != CEPH_OSD_OP_WRITE &&
> -	       opcode != CEPH_OSD_OP_ZERO && opcode != CEPH_OSD_OP_TRUNCATE);
> +	       opcode != CEPH_OSD_OP_WRITEFULL && opcode != CEPH_OSD_OP_ZERO &&
> +	       opcode != CEPH_OSD_OP_TRUNCATE);
>  
>  	op->extent.offset = offset;
>  	op->extent.length = length;
>  	op->extent.truncate_size = truncate_size;
>  	op->extent.truncate_seq = truncate_seq;
> -	if (opcode == CEPH_OSD_OP_WRITE)
> +	if (opcode == CEPH_OSD_OP_WRITE || opcode == CEPH_OSD_OP_WRITEFULL)
>  		payload_len += length;
>  
>  	op->payload_len = payload_len;
> @@ -670,9 +672,11 @@ static u64 osd_req_encode_op(struct ceph_osd_request *req,
>  		break;
>  	case CEPH_OSD_OP_READ:
>  	case CEPH_OSD_OP_WRITE:
> +	case CEPH_OSD_OP_WRITEFULL:
>  	case CEPH_OSD_OP_ZERO:
>  	case CEPH_OSD_OP_TRUNCATE:
> -		if (src->op == CEPH_OSD_OP_WRITE)
> +		if (src->op == CEPH_OSD_OP_WRITE ||
> +		    src->op == CEPH_OSD_OP_WRITEFULL)
>  			request_data_len = src->extent.length;
>  		dst->extent.offset = cpu_to_le64(src->extent.offset);
>  		dst->extent.length = cpu_to_le64(src->extent.length);
> @@ -681,7 +685,8 @@ static u64 osd_req_encode_op(struct ceph_osd_request *req,
>  		dst->extent.truncate_seq =
>  			cpu_to_le32(src->extent.truncate_seq);
>  		osd_data = &src->extent.osd_data;
> -		if (src->op == CEPH_OSD_OP_WRITE)
> +		if (src->op == CEPH_OSD_OP_WRITE ||
> +		    src->op == CEPH_OSD_OP_WRITEFULL)
>  			ceph_osdc_msg_data_add(req->r_request, osd_data);
>  		else
>  			ceph_osdc_msg_data_add(req->r_reply, osd_data);
> 


^ permalink raw reply	[flat|nested] 3+ messages in thread

end of thread, other threads:[~2015-10-16 12:24 UTC | newest]

Thread overview: 3+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2015-10-07 17:02 [PATCH] rbd: use writefull op for object size writes Ilya Dryomov
     [not found] ` <56153BF3.5050403@ieee.org>
2015-10-07 16:13   ` Ilya Dryomov
2015-10-16 12:24 ` Alex Elder

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.