All of lore.kernel.org
 help / color / mirror / Atom feed
From: Sage Weil <sage@inktank.com>
To: Ilya Dryomov <ilya.dryomov@inktank.com>
Cc: ceph-devel@vger.kernel.org
Subject: Re: [PATCH 2/6] libceph: add support for CEPH_OSD_OP_SETALLOCHINT osd op
Date: Sun, 23 Feb 2014 08:03:01 -0800 (PST)	[thread overview]
Message-ID: <alpine.DEB.2.00.1402230759290.17508@cobra.newdream.net> (raw)
In-Reply-To: <1393008946-7931-3-git-send-email-ilya.dryomov@inktank.com>

On Fri, 21 Feb 2014, Ilya Dryomov wrote:
> This is primarily for rbd's benefit and is supposed to combat
> fragmentation:
> 
> "... knowing that rbd images have a 4m size, librbd can pass a hint
> that will let the osd do the xfs allocation size ioctl on new files so
> that they are allocated in 1m or 4m chunks.  We've seen cases where
> users with rbd workloads have very high levels of fragmentation in xfs
> and this would mitigate that and probably have a pretty nice
> performance benefit."
> 
> SETALLOCHINT is considered advisory, so our backwards compatibility
> mechanism here is to set the FAILOK flag for all SETALLOCHINT ops.
> 
> Signed-off-by: Ilya Dryomov <ilya.dryomov@inktank.com>
> ---
>  include/linux/ceph/osd_client.h |    9 +++++++++
>  include/linux/ceph/rados.h      |    8 ++++++++
>  net/ceph/osd_client.c           |   30 ++++++++++++++++++++++++++++++
>  3 files changed, 47 insertions(+)
> 
> diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h
> index e94f5da251d6..6bfcb0eca8ab 100644
> --- a/include/linux/ceph/osd_client.h
> +++ b/include/linux/ceph/osd_client.h
> @@ -103,6 +103,11 @@ struct ceph_osd_req_op {
>  			u32 timeout;
>  			__u8 flag;
>  		} watch;
> +		struct {
> +			u64 expected_size;
> +			u64 expected_write_size;
> +			__u8 expected_size_probability;
> +		} hint;

s/hint/alloc_hint/ ?

>  	};
>  };
>  
> @@ -294,6 +299,10 @@ extern void osd_req_op_cls_init(struct ceph_osd_request *osd_req,
>  extern void osd_req_op_watch_init(struct ceph_osd_request *osd_req,
>  					unsigned int which, u16 opcode,
>  					u64 cookie, u64 version, int flag);
> +extern void osd_req_op_hint_init(struct ceph_osd_request *osd_req,
> +				 unsigned int which, u16 opcode,
> +				 u64 expected_size, u64 expected_write_size,
> +				 u8 expected_size_probability);
>  
>  extern struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc,
>  					       struct ceph_snap_context *snapc,
> diff --git a/include/linux/ceph/rados.h b/include/linux/ceph/rados.h
> index 8f9bf4570215..b8e2dd11f186 100644
> --- a/include/linux/ceph/rados.h
> +++ b/include/linux/ceph/rados.h
> @@ -227,6 +227,9 @@ enum {
>  	CEPH_OSD_OP_OMAPRMKEYS    = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 24,
>  	CEPH_OSD_OP_OMAP_CMP      = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_DATA | 25,
>  
> +	/* hints */
> +	CEPH_OSD_OP_SETALLOCHINT = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 35,
> +
>  	/** multi **/
>  	CEPH_OSD_OP_CLONERANGE = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_MULTI | 1,
>  	CEPH_OSD_OP_ASSERT_SRC_VERSION = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_MULTI | 2,
> @@ -416,6 +419,11 @@ struct ceph_osd_op {
>  			__le64 offset, length;
>  			__le64 src_offset;
>  		} __attribute__ ((packed)) clonerange;
> +		struct {
> +			__le64 expected_size;
> +			__le64 expected_write_size;
> +			__u8 expected_size_probability;
> +		} __attribute__ ((packed)) hint;

s/hint/alloc_hint/, I think.  Just made the same comment on the user space 
side.

>  	};
>  	__le32 payload_len;
>  } __attribute__ ((packed));
> diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c
> index 5d7fd0b8c1c8..4090f6e8db3a 100644
> --- a/net/ceph/osd_client.c
> +++ b/net/ceph/osd_client.c
> @@ -436,6 +436,7 @@ static bool osd_req_opcode_valid(u16 opcode)
>  	case CEPH_OSD_OP_OMAPCLEAR:
>  	case CEPH_OSD_OP_OMAPRMKEYS:
>  	case CEPH_OSD_OP_OMAP_CMP:
> +	case CEPH_OSD_OP_SETALLOCHINT:
>  	case CEPH_OSD_OP_CLONERANGE:
>  	case CEPH_OSD_OP_ASSERT_SRC_VERSION:
>  	case CEPH_OSD_OP_SRC_CMPXATTR:
> @@ -591,6 +592,28 @@ void osd_req_op_watch_init(struct ceph_osd_request *osd_req,
>  }
>  EXPORT_SYMBOL(osd_req_op_watch_init);
>  
> +void osd_req_op_hint_init(struct ceph_osd_request *osd_req,
> +			  unsigned int which, u16 opcode,
> +			  u64 expected_size, u64 expected_write_size,
> +			  u8 expected_size_probability)
> +{
> +	struct ceph_osd_req_op *op = _osd_req_op_init(osd_req, which, opcode);
> +
> +	BUG_ON(opcode != CEPH_OSD_OP_SETALLOCHINT);

I would just drop the opcode argument altogether.  And 
s/hint/alloc_hint/ in the function name...  I wouldn't expect that any 
other type of hint would have these same arguments.

> +
> +	op->hint.expected_size = expected_size;
> +	op->hint.expected_write_size = expected_write_size;
> +	op->hint.expected_size_probability = expected_size_probability;
> +
> +	/*
> +	 * CEPH_OSD_OP_SETALLOCHINT op is advisory and therefore deemed
> +	 * not worth a feature bit.  Set FAILOK per-op flag to make
> +	 * sure older osds don't trip over an unsupported opcode.
> +	 */
> +	op->flags |= CEPH_OSD_OP_FLAG_FAILOK;
> +}
> +EXPORT_SYMBOL(osd_req_op_hint_init);
> +
>  static void ceph_osdc_msg_data_add(struct ceph_msg *msg,
>  				struct ceph_osd_data *osd_data)
>  {
> @@ -681,6 +704,13 @@ static u64 osd_req_encode_op(struct ceph_osd_request *req,
>  		dst->watch.ver = cpu_to_le64(src->watch.ver);
>  		dst->watch.flag = src->watch.flag;
>  		break;
> +	case CEPH_OSD_OP_SETALLOCHINT:
> +		dst->hint.expected_size = cpu_to_le64(src->hint.expected_size);
> +		dst->hint.expected_write_size =
> +		    cpu_to_le64(src->hint.expected_write_size);
> +		dst->hint.expected_size_probability =
> +		    src->hint.expected_size_probability;
> +		break;
>  	default:
>  		pr_err("unsupported osd opcode %s\n",
>  			ceph_osd_op_name(src->op));
> -- 
> 1.7.10.4
> 
> --
> To unsubscribe from this list: send the line "unsubscribe ceph-devel" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> 
> 

  reply	other threads:[~2014-02-23 16:03 UTC|newest]

Thread overview: 26+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2014-02-21 18:55 [PATCH 0/6] libceph: CEPH_OSD_OP_SETALLOCHINT osd op Ilya Dryomov
2014-02-21 18:55 ` [PATCH 1/6] libceph: encode CEPH_OSD_OP_FLAG_* op flags Ilya Dryomov
2014-02-24 14:58   ` Alex Elder
2014-02-21 18:55 ` [PATCH 2/6] libceph: add support for CEPH_OSD_OP_SETALLOCHINT osd op Ilya Dryomov
2014-02-23 16:03   ` Sage Weil [this message]
2014-02-24 14:59   ` Alex Elder
2014-02-25 12:52     ` Ilya Dryomov
2014-02-25 13:05       ` Alex Elder
2014-02-25 13:38         ` Ilya Dryomov
2014-02-25 17:12           ` Sage Weil
2014-02-25 17:12       ` Sage Weil
2014-02-21 18:55 ` [PATCH 3/6] libceph: bump CEPH_OSD_MAX_OP to 3 Ilya Dryomov
2014-02-24 14:59   ` Alex Elder
2014-02-21 18:55 ` [PATCH 4/6] rbd: do not hard-code CEPH_OSD_MAX_OP in rbd_osd_req_callback() Ilya Dryomov
2014-02-24 14:59   ` Alex Elder
2014-02-25 12:53     ` Ilya Dryomov
2014-02-21 18:55 ` [PATCH 5/6] rbd: num_ops parameter for rbd_osd_req_create() Ilya Dryomov
2014-02-24 14:59   ` Alex Elder
2014-02-21 18:55 ` [PATCH 6/6] rbd: prefix rbd writes with CEPH_OSD_OP_SETALLOCHINT osd op Ilya Dryomov
2014-02-24 14:59   ` Alex Elder
2014-02-25 12:58     ` Ilya Dryomov
2014-02-25 13:19       ` Alex Elder
2014-02-23 16:14 ` [PATCH 0/6] libceph: " Sage Weil
2014-02-23 16:15   ` Alex Elder
2014-02-24 14:58 ` Alex Elder
2014-02-25 12:50   ` Ilya Dryomov

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=alpine.DEB.2.00.1402230759290.17508@cobra.newdream.net \
    --to=sage@inktank.com \
    --cc=ceph-devel@vger.kernel.org \
    --cc=ilya.dryomov@inktank.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.