All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH v4 0/1] ceph: add support for getvxattr op
@ 2022-01-17  8:51 Milind Changire
  2022-01-17  8:51 ` [PATCH v4 1/1] ceph: add " Milind Changire
  0 siblings, 1 reply; 13+ messages in thread
From: Milind Changire @ 2022-01-17  8:51 UTC (permalink / raw)
  To: Jeff Layton, Ilya Dryomov, ceph-devel; +Cc: Milind Changire

Adds support for getting ceph virtual xattrs using the new getvxattr op.

fixed in v4:
* fixed dout format specifier for size_t types to %zu in place of %lu

Milind Changire (1):
  ceph: add getvxattr op

 fs/ceph/inode.c              | 51 ++++++++++++++++++++++++++++++++++++
 fs/ceph/mds_client.c         | 27 ++++++++++++++++++-
 fs/ceph/mds_client.h         | 12 ++++++++-
 fs/ceph/strings.c            |  1 +
 fs/ceph/super.h              |  1 +
 fs/ceph/xattr.c              | 34 ++++++++++++++++++++++++
 include/linux/ceph/ceph_fs.h |  1 +
 7 files changed, 125 insertions(+), 2 deletions(-)


base-commit: fd84bfdddd169c219c3a637889a8b87f70a072c2
-- 
2.31.1


^ permalink raw reply	[flat|nested] 13+ messages in thread

* [PATCH v4 1/1] ceph: add getvxattr op
  2022-01-17  8:51 [PATCH v4 0/1] ceph: add support for getvxattr op Milind Changire
@ 2022-01-17  8:51 ` Milind Changire
  2022-01-17 10:53   ` Jeff Layton
  0 siblings, 1 reply; 13+ messages in thread
From: Milind Changire @ 2022-01-17  8:51 UTC (permalink / raw)
  To: Jeff Layton, Ilya Dryomov, ceph-devel; +Cc: Milind Changire

Problem:
Directory vxattrs like ceph.dir.pin* and ceph.dir.layout* may not be
propagated to the client as frequently to keep them updated. This
creates vxattr availability problems.

Solution:
Adds new getvxattr op to fetch ceph.dir.pin*, ceph.dir.layout* and
ceph.file.layout* vxattrs.
If the entire layout for a dir or a file is being set, then it is
expected that the layout be set in standard JSON format. Individual
field value retrieval is not wrapped in JSON. The JSON format also
applies while setting the vxattr if the entire layout is being set in
one go.
As a temporary measure, setting a vxattr can also be done in the old
format. The old format will be deprecated in the future.

URL: https://tracker.ceph.com/issues/51062
Signed-off-by: Milind Changire <mchangir@redhat.com>
---
 fs/ceph/inode.c              | 51 ++++++++++++++++++++++++++++++++++++
 fs/ceph/mds_client.c         | 27 ++++++++++++++++++-
 fs/ceph/mds_client.h         | 12 ++++++++-
 fs/ceph/strings.c            |  1 +
 fs/ceph/super.h              |  1 +
 fs/ceph/xattr.c              | 34 ++++++++++++++++++++++++
 include/linux/ceph/ceph_fs.h |  1 +
 7 files changed, 125 insertions(+), 2 deletions(-)

diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
index e3322fcb2e8d..efdce049b7f0 100644
--- a/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@ -2291,6 +2291,57 @@ int __ceph_do_getattr(struct inode *inode, struct page *locked_page,
 	return err;
 }
 
+int ceph_do_getvxattr(struct inode *inode, const char *name, void *value,
+		      size_t size)
+{
+	struct ceph_fs_client *fsc = ceph_sb_to_client(inode->i_sb);
+	struct ceph_mds_client *mdsc = fsc->mdsc;
+	struct ceph_mds_request *req;
+	int mode = USE_AUTH_MDS;
+	int err;
+	char *xattr_value;
+	size_t xattr_value_len;
+
+	req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_GETVXATTR, mode);
+	if (IS_ERR(req)) {
+		err = -ENOMEM;
+		goto out;
+	}
+
+	req->r_path2 = kstrdup(name, GFP_NOFS);
+	if (!req->r_path2) {
+		err = -ENOMEM;
+		goto put;
+	}
+
+	ihold(inode);
+	req->r_inode = inode;
+	err = ceph_mdsc_do_request(mdsc, NULL, req);
+	if (err < 0)
+		goto put;
+
+	xattr_value = req->r_reply_info.xattr_info.xattr_value;
+	xattr_value_len = req->r_reply_info.xattr_info.xattr_value_len;
+
+	dout("do_getvxattr xattr_value_len:%zu, size:%zu\n", xattr_value_len, size);
+
+	err = (int)xattr_value_len;
+	if (size == 0)
+		goto put;
+
+	if (xattr_value_len > size) {
+		err = -ERANGE;
+		goto put;
+	}
+
+	memcpy(value, xattr_value, xattr_value_len);
+put:
+	ceph_mdsc_put_request(req);
+out:
+	dout("do_getvxattr result=%d\n", err);
+	return err;
+}
+
 
 /*
  * Check inode permissions.  We verify we have a valid value for
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index c30eefc0ac19..a5eafc71d976 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -555,6 +555,29 @@ static int parse_reply_info_create(void **p, void *end,
 	return -EIO;
 }
 
+static int parse_reply_info_getvxattr(void **p, void *end,
+				      struct ceph_mds_reply_info_parsed *info,
+				      u64 features)
+{
+	u8 struct_v, struct_compat;
+	u32 struct_len;
+	u32 value_len;
+
+	ceph_decode_8_safe(p, end, struct_v, bad);
+	ceph_decode_8_safe(p, end, struct_compat, bad);
+	ceph_decode_32_safe(p, end, struct_len, bad);
+	ceph_decode_32_safe(p, end, value_len, bad);
+
+	if (value_len == end - *p) {
+	  info->xattr_info.xattr_value = *p;
+	  info->xattr_info.xattr_value_len = end - *p;
+	  *p = end;
+	  return info->xattr_info.xattr_value_len;
+	}
+bad:
+	return -EIO;
+}
+
 /*
  * parse extra results
  */
@@ -570,6 +593,8 @@ static int parse_reply_info_extra(void **p, void *end,
 		return parse_reply_info_readdir(p, end, info, features);
 	else if (op == CEPH_MDS_OP_CREATE)
 		return parse_reply_info_create(p, end, info, features, s);
+	else if (op == CEPH_MDS_OP_GETVXATTR)
+		return parse_reply_info_getvxattr(p, end, info, features);
 	else
 		return -EIO;
 }
@@ -615,7 +640,7 @@ static int parse_reply_info(struct ceph_mds_session *s, struct ceph_msg *msg,
 
 	if (p != end)
 		goto bad;
-	return 0;
+	return err;
 
 bad:
 	err = -EIO;
diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
index 97c7f7bfa55f..f2a8e5af3c2e 100644
--- a/fs/ceph/mds_client.h
+++ b/fs/ceph/mds_client.h
@@ -29,8 +29,10 @@ enum ceph_feature_type {
 	CEPHFS_FEATURE_MULTI_RECONNECT,
 	CEPHFS_FEATURE_DELEG_INO,
 	CEPHFS_FEATURE_METRIC_COLLECT,
+	CEPHFS_FEATURE_ALTERNATE_NAME,
+	CEPHFS_FEATURE_GETVXATTR,
 
-	CEPHFS_FEATURE_MAX = CEPHFS_FEATURE_METRIC_COLLECT,
+	CEPHFS_FEATURE_MAX = CEPHFS_FEATURE_GETVXATTR,
 };
 
 /*
@@ -45,6 +47,8 @@ enum ceph_feature_type {
 	CEPHFS_FEATURE_MULTI_RECONNECT,		\
 	CEPHFS_FEATURE_DELEG_INO,		\
 	CEPHFS_FEATURE_METRIC_COLLECT,		\
+	CEPHFS_FEATURE_ALTERNATE_NAME,		\
+	CEPHFS_FEATURE_GETVXATTR,		\
 						\
 	CEPHFS_FEATURE_MAX,			\
 }
@@ -100,6 +104,11 @@ struct ceph_mds_reply_dir_entry {
 	loff_t			      offset;
 };
 
+struct ceph_mds_reply_xattr {
+	char *xattr_value;
+	size_t xattr_value_len;
+};
+
 /*
  * parsed info about an mds reply, including information about
  * either: 1) the target inode and/or its parent directory and dentry,
@@ -115,6 +124,7 @@ struct ceph_mds_reply_info_parsed {
 	char                          *dname;
 	u32                           dname_len;
 	struct ceph_mds_reply_lease   *dlease;
+	struct ceph_mds_reply_xattr   xattr_info;
 
 	/* extra */
 	union {
diff --git a/fs/ceph/strings.c b/fs/ceph/strings.c
index 573bb9556fb5..e36e8948e728 100644
--- a/fs/ceph/strings.c
+++ b/fs/ceph/strings.c
@@ -60,6 +60,7 @@ const char *ceph_mds_op_name(int op)
 	case CEPH_MDS_OP_LOOKUPINO:  return "lookupino";
 	case CEPH_MDS_OP_LOOKUPNAME:  return "lookupname";
 	case CEPH_MDS_OP_GETATTR:  return "getattr";
+	case CEPH_MDS_OP_GETVXATTR:  return "getvxattr";
 	case CEPH_MDS_OP_SETXATTR: return "setxattr";
 	case CEPH_MDS_OP_SETATTR: return "setattr";
 	case CEPH_MDS_OP_RMXATTR: return "rmxattr";
diff --git a/fs/ceph/super.h b/fs/ceph/super.h
index ac331aa07cfa..a627fa69668e 100644
--- a/fs/ceph/super.h
+++ b/fs/ceph/super.h
@@ -1043,6 +1043,7 @@ static inline bool ceph_inode_is_shutdown(struct inode *inode)
 
 /* xattr.c */
 int __ceph_setxattr(struct inode *, const char *, const void *, size_t, int);
+int ceph_do_getvxattr(struct inode *inode, const char *name, void *value, size_t size);
 ssize_t __ceph_getxattr(struct inode *, const char *, void *, size_t);
 extern ssize_t ceph_listxattr(struct dentry *, char *, size_t);
 extern struct ceph_buffer *__ceph_build_xattrs_blob(struct ceph_inode_info *ci);
diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c
index fcf7dfdecf96..dc32876a541a 100644
--- a/fs/ceph/xattr.c
+++ b/fs/ceph/xattr.c
@@ -918,6 +918,30 @@ static inline int __get_request_mask(struct inode *in) {
 	return mask;
 }
 
+/* check if the entire cluster supports the given feature */
+static inline bool ceph_cluster_has_feature(struct inode *inode, int feature_bit)
+{
+	int64_t i;
+	struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
+	struct ceph_mds_session **sessions = fsc->mdsc->sessions;
+	int64_t num_sessions = atomic_read(&fsc->mdsc->num_sessions);
+
+	if (fsc->mdsc->stopping)
+		return false;
+
+	if (!sessions)
+		return false;
+
+	for (i = 0; i < num_sessions; i++) {
+		struct ceph_mds_session *session = sessions[i];
+		if (!session)
+			return false;
+		if (!test_bit(feature_bit, &session->s_features))
+			return false;
+	}
+	return true;
+}
+
 ssize_t __ceph_getxattr(struct inode *inode, const char *name, void *value,
 		      size_t size)
 {
@@ -927,6 +951,16 @@ ssize_t __ceph_getxattr(struct inode *inode, const char *name, void *value,
 	int req_mask;
 	ssize_t err;
 
+	if (!strncmp(name, XATTR_CEPH_PREFIX, XATTR_CEPH_PREFIX_LEN) &&
+	    ceph_cluster_has_feature(inode, CEPHFS_FEATURE_GETVXATTR)) {
+		err = ceph_do_getvxattr(inode, name, value, size);
+		/* if cluster doesn't support xattr, we try to service it
+		 * locally
+		 */
+		if (err >= 0)
+			return err;
+	}
+
 	/* let's see if a virtual xattr was requested */
 	vxattr = ceph_match_vxattr(inode, name);
 	if (vxattr) {
diff --git a/include/linux/ceph/ceph_fs.h b/include/linux/ceph/ceph_fs.h
index 7ad6c3d0db7d..66db21ac5f0c 100644
--- a/include/linux/ceph/ceph_fs.h
+++ b/include/linux/ceph/ceph_fs.h
@@ -328,6 +328,7 @@ enum {
 	CEPH_MDS_OP_LOOKUPPARENT = 0x00103,
 	CEPH_MDS_OP_LOOKUPINO  = 0x00104,
 	CEPH_MDS_OP_LOOKUPNAME = 0x00105,
+	CEPH_MDS_OP_GETVXATTR  = 0x00106,
 
 	CEPH_MDS_OP_SETXATTR   = 0x01105,
 	CEPH_MDS_OP_RMXATTR    = 0x01106,
-- 
2.31.1


^ permalink raw reply related	[flat|nested] 13+ messages in thread

* Re: [PATCH v4 1/1] ceph: add getvxattr op
  2022-01-17  8:51 ` [PATCH v4 1/1] ceph: add " Milind Changire
@ 2022-01-17 10:53   ` Jeff Layton
  2022-01-17 11:07     ` Milind Changire
  0 siblings, 1 reply; 13+ messages in thread
From: Jeff Layton @ 2022-01-17 10:53 UTC (permalink / raw)
  To: Milind Changire, Ilya Dryomov, ceph-devel; +Cc: Milind Changire

On Mon, 2022-01-17 at 08:51 +0000, Milind Changire wrote:
> Problem:
> Directory vxattrs like ceph.dir.pin* and ceph.dir.layout* may not be
> propagated to the client as frequently to keep them updated. This
> creates vxattr availability problems.
> 
> Solution:
> Adds new getvxattr op to fetch ceph.dir.pin*, ceph.dir.layout* and
> ceph.file.layout* vxattrs.
> If the entire layout for a dir or a file is being set, then it is
> expected that the layout be set in standard JSON format. Individual
> field value retrieval is not wrapped in JSON. The JSON format also
> applies while setting the vxattr if the entire layout is being set in
> one go.
> As a temporary measure, setting a vxattr can also be done in the old
> format. The old format will be deprecated in the future.
> 
> URL: https://tracker.ceph.com/issues/51062
> Signed-off-by: Milind Changire <mchangir@redhat.com>
> ---
>  fs/ceph/inode.c              | 51 ++++++++++++++++++++++++++++++++++++
>  fs/ceph/mds_client.c         | 27 ++++++++++++++++++-
>  fs/ceph/mds_client.h         | 12 ++++++++-
>  fs/ceph/strings.c            |  1 +
>  fs/ceph/super.h              |  1 +
>  fs/ceph/xattr.c              | 34 ++++++++++++++++++++++++
>  include/linux/ceph/ceph_fs.h |  1 +
>  7 files changed, 125 insertions(+), 2 deletions(-)
> 
> diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
> index e3322fcb2e8d..efdce049b7f0 100644
> --- a/fs/ceph/inode.c
> +++ b/fs/ceph/inode.c
> @@ -2291,6 +2291,57 @@ int __ceph_do_getattr(struct inode *inode, struct page *locked_page,
>  	return err;
>  }
>  
> +int ceph_do_getvxattr(struct inode *inode, const char *name, void *value,
> +		      size_t size)
> +{
> +	struct ceph_fs_client *fsc = ceph_sb_to_client(inode->i_sb);
> +	struct ceph_mds_client *mdsc = fsc->mdsc;
> +	struct ceph_mds_request *req;
> +	int mode = USE_AUTH_MDS;
> +	int err;
> +	char *xattr_value;
> +	size_t xattr_value_len;
> +
> +	req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_GETVXATTR, mode);
> +	if (IS_ERR(req)) {
> +		err = -ENOMEM;
> +		goto out;
> +	}
> +
> +	req->r_path2 = kstrdup(name, GFP_NOFS);
> +	if (!req->r_path2) {
> +		err = -ENOMEM;
> +		goto put;
> +	}
> +
> +	ihold(inode);
> +	req->r_inode = inode;
> +	err = ceph_mdsc_do_request(mdsc, NULL, req);
> +	if (err < 0)
> +		goto put;
> +
> +	xattr_value = req->r_reply_info.xattr_info.xattr_value;
> +	xattr_value_len = req->r_reply_info.xattr_info.xattr_value_len;
> +
> +	dout("do_getvxattr xattr_value_len:%zu, size:%zu\n", xattr_value_len, size);
> +
> +	err = (int)xattr_value_len;
> +	if (size == 0)
> +		goto put;
> +
> +	if (xattr_value_len > size) {
> +		err = -ERANGE;
> +		goto put;
> +	}
> +
> +	memcpy(value, xattr_value, xattr_value_len);
> +put:
> +	ceph_mdsc_put_request(req);
> +out:
> +	dout("do_getvxattr result=%d\n", err);
> +	return err;
> +}
> +
>  
>  /*
>   * Check inode permissions.  We verify we have a valid value for
> diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
> index c30eefc0ac19..a5eafc71d976 100644
> --- a/fs/ceph/mds_client.c
> +++ b/fs/ceph/mds_client.c
> @@ -555,6 +555,29 @@ static int parse_reply_info_create(void **p, void *end,
>  	return -EIO;
>  }
>  
> +static int parse_reply_info_getvxattr(void **p, void *end,
> +				      struct ceph_mds_reply_info_parsed *info,
> +				      u64 features)
> +{
> +	u8 struct_v, struct_compat;
> +	u32 struct_len;
> +	u32 value_len;
> +
> +	ceph_decode_8_safe(p, end, struct_v, bad);
> +	ceph_decode_8_safe(p, end, struct_compat, bad);
> +	ceph_decode_32_safe(p, end, struct_len, bad);
> +	ceph_decode_32_safe(p, end, value_len, bad);
> +
> +	if (value_len == end - *p) {
> +	  info->xattr_info.xattr_value = *p;
> +	  info->xattr_info.xattr_value_len = end - *p;
> +	  *p = end;
> +	  return info->xattr_info.xattr_value_len;
> +	}
> +bad:
> +	return -EIO;
> +}
> +
>  /*
>   * parse extra results
>   */
> @@ -570,6 +593,8 @@ static int parse_reply_info_extra(void **p, void *end,
>  		return parse_reply_info_readdir(p, end, info, features);
>  	else if (op == CEPH_MDS_OP_CREATE)
>  		return parse_reply_info_create(p, end, info, features, s);
> +	else if (op == CEPH_MDS_OP_GETVXATTR)
> +		return parse_reply_info_getvxattr(p, end, info, features);
>  	else
>  		return -EIO;
>  }
> @@ -615,7 +640,7 @@ static int parse_reply_info(struct ceph_mds_session *s, struct ceph_msg *msg,
>  
>  	if (p != end)
>  		goto bad;
> -	return 0;
> +	return err;
>  
>  bad:
>  	err = -EIO;
> diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
> index 97c7f7bfa55f..f2a8e5af3c2e 100644
> --- a/fs/ceph/mds_client.h
> +++ b/fs/ceph/mds_client.h
> @@ -29,8 +29,10 @@ enum ceph_feature_type {
>  	CEPHFS_FEATURE_MULTI_RECONNECT,
>  	CEPHFS_FEATURE_DELEG_INO,
>  	CEPHFS_FEATURE_METRIC_COLLECT,
> +	CEPHFS_FEATURE_ALTERNATE_NAME,
> +	CEPHFS_FEATURE_GETVXATTR,
>  
> -	CEPHFS_FEATURE_MAX = CEPHFS_FEATURE_METRIC_COLLECT,
> +	CEPHFS_FEATURE_MAX = CEPHFS_FEATURE_GETVXATTR,
>  };
>  
>  /*
> @@ -45,6 +47,8 @@ enum ceph_feature_type {
>  	CEPHFS_FEATURE_MULTI_RECONNECT,		\
>  	CEPHFS_FEATURE_DELEG_INO,		\
>  	CEPHFS_FEATURE_METRIC_COLLECT,		\
> +	CEPHFS_FEATURE_ALTERNATE_NAME,		\
> +	CEPHFS_FEATURE_GETVXATTR,		\
>  						\
>  	CEPHFS_FEATURE_MAX,			\
>  }
> @@ -100,6 +104,11 @@ struct ceph_mds_reply_dir_entry {
>  	loff_t			      offset;
>  };
>  
> +struct ceph_mds_reply_xattr {
> +	char *xattr_value;
> +	size_t xattr_value_len;
> +};
> +
>  /*
>   * parsed info about an mds reply, including information about
>   * either: 1) the target inode and/or its parent directory and dentry,
> @@ -115,6 +124,7 @@ struct ceph_mds_reply_info_parsed {
>  	char                          *dname;
>  	u32                           dname_len;
>  	struct ceph_mds_reply_lease   *dlease;
> +	struct ceph_mds_reply_xattr   xattr_info;
>  
>  	/* extra */
>  	union {
> diff --git a/fs/ceph/strings.c b/fs/ceph/strings.c
> index 573bb9556fb5..e36e8948e728 100644
> --- a/fs/ceph/strings.c
> +++ b/fs/ceph/strings.c
> @@ -60,6 +60,7 @@ const char *ceph_mds_op_name(int op)
>  	case CEPH_MDS_OP_LOOKUPINO:  return "lookupino";
>  	case CEPH_MDS_OP_LOOKUPNAME:  return "lookupname";
>  	case CEPH_MDS_OP_GETATTR:  return "getattr";
> +	case CEPH_MDS_OP_GETVXATTR:  return "getvxattr";
>  	case CEPH_MDS_OP_SETXATTR: return "setxattr";
>  	case CEPH_MDS_OP_SETATTR: return "setattr";
>  	case CEPH_MDS_OP_RMXATTR: return "rmxattr";
> diff --git a/fs/ceph/super.h b/fs/ceph/super.h
> index ac331aa07cfa..a627fa69668e 100644
> --- a/fs/ceph/super.h
> +++ b/fs/ceph/super.h
> @@ -1043,6 +1043,7 @@ static inline bool ceph_inode_is_shutdown(struct inode *inode)
>  
>  /* xattr.c */
>  int __ceph_setxattr(struct inode *, const char *, const void *, size_t, int);
> +int ceph_do_getvxattr(struct inode *inode, const char *name, void *value, size_t size);
>  ssize_t __ceph_getxattr(struct inode *, const char *, void *, size_t);
>  extern ssize_t ceph_listxattr(struct dentry *, char *, size_t);
>  extern struct ceph_buffer *__ceph_build_xattrs_blob(struct ceph_inode_info *ci);
> diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c
> index fcf7dfdecf96..dc32876a541a 100644
> --- a/fs/ceph/xattr.c
> +++ b/fs/ceph/xattr.c
> @@ -918,6 +918,30 @@ static inline int __get_request_mask(struct inode *in) {
>  	return mask;
>  }
>  
> +/* check if the entire cluster supports the given feature */
> +static inline bool ceph_cluster_has_feature(struct inode *inode, int feature_bit)
> +{
> +	int64_t i;
> +	struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
> +	struct ceph_mds_session **sessions = fsc->mdsc->sessions;
> +	int64_t num_sessions = atomic_read(&fsc->mdsc->num_sessions);
> +
> +	if (fsc->mdsc->stopping)
> +		return false;
> +
> +	if (!sessions)
> +		return false;
> +
> +	for (i = 0; i < num_sessions; i++) {
> +		struct ceph_mds_session *session = sessions[i];
> +		if (!session)
> +			return false;
> +		if (!test_bit(feature_bit, &session->s_features))
> +			return false;

What guarantee do you have that "session" will still be a valid pointer
by the time you get to dereferencing it here?

I think this loop needs some locking (as Xiubo pointed out in his
earlier review).

> +	}
> +	return true;
> +}
> +
>  ssize_t __ceph_getxattr(struct inode *inode, const char *name, void *value,
>  		      size_t size)
>  {
> @@ -927,6 +951,16 @@ ssize_t __ceph_getxattr(struct inode *inode, const char *name, void *value,
>  	int req_mask;
>  	ssize_t err;
>  
> +	if (!strncmp(name, XATTR_CEPH_PREFIX, XATTR_CEPH_PREFIX_LEN) &&
> +	    ceph_cluster_has_feature(inode, CEPHFS_FEATURE_GETVXATTR)) {
> +		err = ceph_do_getvxattr(inode, name, value, size);
> +		/* if cluster doesn't support xattr, we try to service it
> +		 * locally
> +		 */
> +		if (err >= 0)
> +			return err;
> +	}
> +

What is this? Why not always service this locally?

>  	/* let's see if a virtual xattr was requested */
>  	vxattr = ceph_match_vxattr(inode, name);
>  	if (vxattr) {
> diff --git a/include/linux/ceph/ceph_fs.h b/include/linux/ceph/ceph_fs.h
> index 7ad6c3d0db7d..66db21ac5f0c 100644
> --- a/include/linux/ceph/ceph_fs.h
> +++ b/include/linux/ceph/ceph_fs.h
> @@ -328,6 +328,7 @@ enum {
>  	CEPH_MDS_OP_LOOKUPPARENT = 0x00103,
>  	CEPH_MDS_OP_LOOKUPINO  = 0x00104,
>  	CEPH_MDS_OP_LOOKUPNAME = 0x00105,
> +	CEPH_MDS_OP_GETVXATTR  = 0x00106,
>  
>  	CEPH_MDS_OP_SETXATTR   = 0x01105,
>  	CEPH_MDS_OP_RMXATTR    = 0x01106,

-- 
Jeff Layton <jlayton@kernel.org>

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH v4 1/1] ceph: add getvxattr op
  2022-01-17 10:53   ` Jeff Layton
@ 2022-01-17 11:07     ` Milind Changire
  2022-01-17 12:50       ` Xiubo Li
  0 siblings, 1 reply; 13+ messages in thread
From: Milind Changire @ 2022-01-17 11:07 UTC (permalink / raw)
  To: Jeff Layton; +Cc: Ilya Dryomov, ceph-devel, Milind Changire

On Mon, Jan 17, 2022 at 4:23 PM Jeff Layton <jlayton@kernel.org> wrote:
>
> On Mon, 2022-01-17 at 08:51 +0000, Milind Changire wrote:
> > Problem:
> > Directory vxattrs like ceph.dir.pin* and ceph.dir.layout* may not be
> > propagated to the client as frequently to keep them updated. This
> > creates vxattr availability problems.
> >
> > Solution:
> > Adds new getvxattr op to fetch ceph.dir.pin*, ceph.dir.layout* and
> > ceph.file.layout* vxattrs.
> > If the entire layout for a dir or a file is being set, then it is
> > expected that the layout be set in standard JSON format. Individual
> > field value retrieval is not wrapped in JSON. The JSON format also
> > applies while setting the vxattr if the entire layout is being set in
> > one go.
> > As a temporary measure, setting a vxattr can also be done in the old
> > format. The old format will be deprecated in the future.
> >
> > URL: https://tracker.ceph.com/issues/51062
> > Signed-off-by: Milind Changire <mchangir@redhat.com>
> > ---
> >  fs/ceph/inode.c              | 51 ++++++++++++++++++++++++++++++++++++
> >  fs/ceph/mds_client.c         | 27 ++++++++++++++++++-
> >  fs/ceph/mds_client.h         | 12 ++++++++-
> >  fs/ceph/strings.c            |  1 +
> >  fs/ceph/super.h              |  1 +
> >  fs/ceph/xattr.c              | 34 ++++++++++++++++++++++++
> >  include/linux/ceph/ceph_fs.h |  1 +
> >  7 files changed, 125 insertions(+), 2 deletions(-)
> >
> > diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
> > index e3322fcb2e8d..efdce049b7f0 100644
> > --- a/fs/ceph/inode.c
> > +++ b/fs/ceph/inode.c
> > @@ -2291,6 +2291,57 @@ int __ceph_do_getattr(struct inode *inode, struct page *locked_page,
> >       return err;
> >  }
> >
> > +int ceph_do_getvxattr(struct inode *inode, const char *name, void *value,
> > +                   size_t size)
> > +{
> > +     struct ceph_fs_client *fsc = ceph_sb_to_client(inode->i_sb);
> > +     struct ceph_mds_client *mdsc = fsc->mdsc;
> > +     struct ceph_mds_request *req;
> > +     int mode = USE_AUTH_MDS;
> > +     int err;
> > +     char *xattr_value;
> > +     size_t xattr_value_len;
> > +
> > +     req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_GETVXATTR, mode);
> > +     if (IS_ERR(req)) {
> > +             err = -ENOMEM;
> > +             goto out;
> > +     }
> > +
> > +     req->r_path2 = kstrdup(name, GFP_NOFS);
> > +     if (!req->r_path2) {
> > +             err = -ENOMEM;
> > +             goto put;
> > +     }
> > +
> > +     ihold(inode);
> > +     req->r_inode = inode;
> > +     err = ceph_mdsc_do_request(mdsc, NULL, req);
> > +     if (err < 0)
> > +             goto put;
> > +
> > +     xattr_value = req->r_reply_info.xattr_info.xattr_value;
> > +     xattr_value_len = req->r_reply_info.xattr_info.xattr_value_len;
> > +
> > +     dout("do_getvxattr xattr_value_len:%zu, size:%zu\n", xattr_value_len, size);
> > +
> > +     err = (int)xattr_value_len;
> > +     if (size == 0)
> > +             goto put;
> > +
> > +     if (xattr_value_len > size) {
> > +             err = -ERANGE;
> > +             goto put;
> > +     }
> > +
> > +     memcpy(value, xattr_value, xattr_value_len);
> > +put:
> > +     ceph_mdsc_put_request(req);
> > +out:
> > +     dout("do_getvxattr result=%d\n", err);
> > +     return err;
> > +}
> > +
> >
> >  /*
> >   * Check inode permissions.  We verify we have a valid value for
> > diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
> > index c30eefc0ac19..a5eafc71d976 100644
> > --- a/fs/ceph/mds_client.c
> > +++ b/fs/ceph/mds_client.c
> > @@ -555,6 +555,29 @@ static int parse_reply_info_create(void **p, void *end,
> >       return -EIO;
> >  }
> >
> > +static int parse_reply_info_getvxattr(void **p, void *end,
> > +                                   struct ceph_mds_reply_info_parsed *info,
> > +                                   u64 features)
> > +{
> > +     u8 struct_v, struct_compat;
> > +     u32 struct_len;
> > +     u32 value_len;
> > +
> > +     ceph_decode_8_safe(p, end, struct_v, bad);
> > +     ceph_decode_8_safe(p, end, struct_compat, bad);
> > +     ceph_decode_32_safe(p, end, struct_len, bad);
> > +     ceph_decode_32_safe(p, end, value_len, bad);
> > +
> > +     if (value_len == end - *p) {
> > +       info->xattr_info.xattr_value = *p;
> > +       info->xattr_info.xattr_value_len = end - *p;
> > +       *p = end;
> > +       return info->xattr_info.xattr_value_len;
> > +     }
> > +bad:
> > +     return -EIO;
> > +}
> > +
> >  /*
> >   * parse extra results
> >   */
> > @@ -570,6 +593,8 @@ static int parse_reply_info_extra(void **p, void *end,
> >               return parse_reply_info_readdir(p, end, info, features);
> >       else if (op == CEPH_MDS_OP_CREATE)
> >               return parse_reply_info_create(p, end, info, features, s);
> > +     else if (op == CEPH_MDS_OP_GETVXATTR)
> > +             return parse_reply_info_getvxattr(p, end, info, features);
> >       else
> >               return -EIO;
> >  }
> > @@ -615,7 +640,7 @@ static int parse_reply_info(struct ceph_mds_session *s, struct ceph_msg *msg,
> >
> >       if (p != end)
> >               goto bad;
> > -     return 0;
> > +     return err;
> >
> >  bad:
> >       err = -EIO;
> > diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
> > index 97c7f7bfa55f..f2a8e5af3c2e 100644
> > --- a/fs/ceph/mds_client.h
> > +++ b/fs/ceph/mds_client.h
> > @@ -29,8 +29,10 @@ enum ceph_feature_type {
> >       CEPHFS_FEATURE_MULTI_RECONNECT,
> >       CEPHFS_FEATURE_DELEG_INO,
> >       CEPHFS_FEATURE_METRIC_COLLECT,
> > +     CEPHFS_FEATURE_ALTERNATE_NAME,
> > +     CEPHFS_FEATURE_GETVXATTR,
> >
> > -     CEPHFS_FEATURE_MAX = CEPHFS_FEATURE_METRIC_COLLECT,
> > +     CEPHFS_FEATURE_MAX = CEPHFS_FEATURE_GETVXATTR,
> >  };
> >
> >  /*
> > @@ -45,6 +47,8 @@ enum ceph_feature_type {
> >       CEPHFS_FEATURE_MULTI_RECONNECT,         \
> >       CEPHFS_FEATURE_DELEG_INO,               \
> >       CEPHFS_FEATURE_METRIC_COLLECT,          \
> > +     CEPHFS_FEATURE_ALTERNATE_NAME,          \
> > +     CEPHFS_FEATURE_GETVXATTR,               \
> >                                               \
> >       CEPHFS_FEATURE_MAX,                     \
> >  }
> > @@ -100,6 +104,11 @@ struct ceph_mds_reply_dir_entry {
> >       loff_t                        offset;
> >  };
> >
> > +struct ceph_mds_reply_xattr {
> > +     char *xattr_value;
> > +     size_t xattr_value_len;
> > +};
> > +
> >  /*
> >   * parsed info about an mds reply, including information about
> >   * either: 1) the target inode and/or its parent directory and dentry,
> > @@ -115,6 +124,7 @@ struct ceph_mds_reply_info_parsed {
> >       char                          *dname;
> >       u32                           dname_len;
> >       struct ceph_mds_reply_lease   *dlease;
> > +     struct ceph_mds_reply_xattr   xattr_info;
> >
> >       /* extra */
> >       union {
> > diff --git a/fs/ceph/strings.c b/fs/ceph/strings.c
> > index 573bb9556fb5..e36e8948e728 100644
> > --- a/fs/ceph/strings.c
> > +++ b/fs/ceph/strings.c
> > @@ -60,6 +60,7 @@ const char *ceph_mds_op_name(int op)
> >       case CEPH_MDS_OP_LOOKUPINO:  return "lookupino";
> >       case CEPH_MDS_OP_LOOKUPNAME:  return "lookupname";
> >       case CEPH_MDS_OP_GETATTR:  return "getattr";
> > +     case CEPH_MDS_OP_GETVXATTR:  return "getvxattr";
> >       case CEPH_MDS_OP_SETXATTR: return "setxattr";
> >       case CEPH_MDS_OP_SETATTR: return "setattr";
> >       case CEPH_MDS_OP_RMXATTR: return "rmxattr";
> > diff --git a/fs/ceph/super.h b/fs/ceph/super.h
> > index ac331aa07cfa..a627fa69668e 100644
> > --- a/fs/ceph/super.h
> > +++ b/fs/ceph/super.h
> > @@ -1043,6 +1043,7 @@ static inline bool ceph_inode_is_shutdown(struct inode *inode)
> >
> >  /* xattr.c */
> >  int __ceph_setxattr(struct inode *, const char *, const void *, size_t, int);
> > +int ceph_do_getvxattr(struct inode *inode, const char *name, void *value, size_t size);
> >  ssize_t __ceph_getxattr(struct inode *, const char *, void *, size_t);
> >  extern ssize_t ceph_listxattr(struct dentry *, char *, size_t);
> >  extern struct ceph_buffer *__ceph_build_xattrs_blob(struct ceph_inode_info *ci);
> > diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c
> > index fcf7dfdecf96..dc32876a541a 100644
> > --- a/fs/ceph/xattr.c
> > +++ b/fs/ceph/xattr.c
> > @@ -918,6 +918,30 @@ static inline int __get_request_mask(struct inode *in) {
> >       return mask;
> >  }
> >
> > +/* check if the entire cluster supports the given feature */
> > +static inline bool ceph_cluster_has_feature(struct inode *inode, int feature_bit)
> > +{
> > +     int64_t i;
> > +     struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
> > +     struct ceph_mds_session **sessions = fsc->mdsc->sessions;
> > +     int64_t num_sessions = atomic_read(&fsc->mdsc->num_sessions);
> > +
> > +     if (fsc->mdsc->stopping)
> > +             return false;
> > +
> > +     if (!sessions)
> > +             return false;
> > +
> > +     for (i = 0; i < num_sessions; i++) {
> > +             struct ceph_mds_session *session = sessions[i];
> > +             if (!session)
> > +                     return false;
> > +             if (!test_bit(feature_bit, &session->s_features))
> > +                     return false;
>
> What guarantee do you have that "session" will still be a valid pointer
> by the time you get to dereferencing it here?
>
> I think this loop needs some locking (as Xiubo pointed out in his
> earlier review).

yeah, thanks for pointing that out
I'm trying to wrap the entire processing of this function inside a
mutex_unlock(&mdsc->mutex) ... but the mount command fails
to mount if done so. If code is not wrapped in mutex lock...unlock
then the mount is successful.
It's a surprise that the code doesn't deadlock under the mutex
lock...unlock and gracefully fails with a message.
Any hints on what I could be missing.

>
> > +     }
> > +     return true;
> > +}
> > +
> >  ssize_t __ceph_getxattr(struct inode *inode, const char *name, void *value,
> >                     size_t size)
> >  {
> > @@ -927,6 +951,16 @@ ssize_t __ceph_getxattr(struct inode *inode, const char *name, void *value,
> >       int req_mask;
> >       ssize_t err;
> >
> > +     if (!strncmp(name, XATTR_CEPH_PREFIX, XATTR_CEPH_PREFIX_LEN) &&
> > +         ceph_cluster_has_feature(inode, CEPHFS_FEATURE_GETVXATTR)) {
> > +             err = ceph_do_getvxattr(inode, name, value, size);
> > +             /* if cluster doesn't support xattr, we try to service it
> > +              * locally
> > +              */
> > +             if (err >= 0)
> > +                     return err;
> > +     }
> > +
>
> What is this? Why not always service this locally?

vxattr handling is planned to be moved to the MDS side.
As I've pointed out to Xiubo, there's a few new things that have been done for
layout vxattr management. Also, as per your original tracker, ceph.dir.pin*
can't be handled locally.
getvxattr() currently handles:
1. ceph.dir.layout*
2. ceph.file.layout*
3. ceph.dir.pin*

If kclient is new and the cluster is old, then layout vxattr will be
handled the old
way, i.e. locally. ceph.dir.pin* will remain inaccessible.

>
> >       /* let's see if a virtual xattr was requested */
> >       vxattr = ceph_match_vxattr(inode, name);
> >       if (vxattr) {
> > diff --git a/include/linux/ceph/ceph_fs.h b/include/linux/ceph/ceph_fs.h
> > index 7ad6c3d0db7d..66db21ac5f0c 100644
> > --- a/include/linux/ceph/ceph_fs.h
> > +++ b/include/linux/ceph/ceph_fs.h
> > @@ -328,6 +328,7 @@ enum {
> >       CEPH_MDS_OP_LOOKUPPARENT = 0x00103,
> >       CEPH_MDS_OP_LOOKUPINO  = 0x00104,
> >       CEPH_MDS_OP_LOOKUPNAME = 0x00105,
> > +     CEPH_MDS_OP_GETVXATTR  = 0x00106,
> >
> >       CEPH_MDS_OP_SETXATTR   = 0x01105,
> >       CEPH_MDS_OP_RMXATTR    = 0x01106,
>
> --
> Jeff Layton <jlayton@kernel.org>

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH v4 1/1] ceph: add getvxattr op
  2022-01-17 11:07     ` Milind Changire
@ 2022-01-17 12:50       ` Xiubo Li
  2022-01-17 12:58         ` Milind Changire
  0 siblings, 1 reply; 13+ messages in thread
From: Xiubo Li @ 2022-01-17 12:50 UTC (permalink / raw)
  To: Milind Changire, Jeff Layton; +Cc: Ilya Dryomov, ceph-devel, Milind Changire


On 1/17/22 7:07 PM, Milind Changire wrote:
> On Mon, Jan 17, 2022 at 4:23 PM Jeff Layton <jlayton@kernel.org> wrote:
>> On Mon, 2022-01-17 at 08:51 +0000, Milind Changire wrote:
>>> Problem:
>>> Directory vxattrs like ceph.dir.pin* and ceph.dir.layout* may not be
>>> propagated to the client as frequently to keep them updated. This
>>> creates vxattr availability problems.
>>>
>>> Solution:
>>> Adds new getvxattr op to fetch ceph.dir.pin*, ceph.dir.layout* and
>>> ceph.file.layout* vxattrs.
>>> If the entire layout for a dir or a file is being set, then it is
>>> expected that the layout be set in standard JSON format. Individual
>>> field value retrieval is not wrapped in JSON. The JSON format also
>>> applies while setting the vxattr if the entire layout is being set in
>>> one go.
>>> As a temporary measure, setting a vxattr can also be done in the old
>>> format. The old format will be deprecated in the future.
>>>
>>> URL: https://tracker.ceph.com/issues/51062
>>> Signed-off-by: Milind Changire <mchangir@redhat.com>
>>> ---
>>>   fs/ceph/inode.c              | 51 ++++++++++++++++++++++++++++++++++++
>>>   fs/ceph/mds_client.c         | 27 ++++++++++++++++++-
>>>   fs/ceph/mds_client.h         | 12 ++++++++-
>>>   fs/ceph/strings.c            |  1 +
>>>   fs/ceph/super.h              |  1 +
>>>   fs/ceph/xattr.c              | 34 ++++++++++++++++++++++++
>>>   include/linux/ceph/ceph_fs.h |  1 +
>>>   7 files changed, 125 insertions(+), 2 deletions(-)
>>>
>>> diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
>>> index e3322fcb2e8d..efdce049b7f0 100644
>>> --- a/fs/ceph/inode.c
>>> +++ b/fs/ceph/inode.c
>>> @@ -2291,6 +2291,57 @@ int __ceph_do_getattr(struct inode *inode, struct page *locked_page,
>>>        return err;
>>>   }
>>>
>>> +int ceph_do_getvxattr(struct inode *inode, const char *name, void *value,
>>> +                   size_t size)
>>> +{
>>> +     struct ceph_fs_client *fsc = ceph_sb_to_client(inode->i_sb);
>>> +     struct ceph_mds_client *mdsc = fsc->mdsc;
>>> +     struct ceph_mds_request *req;
>>> +     int mode = USE_AUTH_MDS;
>>> +     int err;
>>> +     char *xattr_value;
>>> +     size_t xattr_value_len;
>>> +
>>> +     req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_GETVXATTR, mode);
>>> +     if (IS_ERR(req)) {
>>> +             err = -ENOMEM;
>>> +             goto out;
>>> +     }
>>> +
>>> +     req->r_path2 = kstrdup(name, GFP_NOFS);
>>> +     if (!req->r_path2) {
>>> +             err = -ENOMEM;
>>> +             goto put;
>>> +     }
>>> +
>>> +     ihold(inode);
>>> +     req->r_inode = inode;
>>> +     err = ceph_mdsc_do_request(mdsc, NULL, req);
>>> +     if (err < 0)
>>> +             goto put;
>>> +
>>> +     xattr_value = req->r_reply_info.xattr_info.xattr_value;
>>> +     xattr_value_len = req->r_reply_info.xattr_info.xattr_value_len;
>>> +
>>> +     dout("do_getvxattr xattr_value_len:%zu, size:%zu\n", xattr_value_len, size);
>>> +
>>> +     err = (int)xattr_value_len;
>>> +     if (size == 0)
>>> +             goto put;
>>> +
>>> +     if (xattr_value_len > size) {
>>> +             err = -ERANGE;
>>> +             goto put;
>>> +     }
>>> +
>>> +     memcpy(value, xattr_value, xattr_value_len);
>>> +put:
>>> +     ceph_mdsc_put_request(req);
>>> +out:
>>> +     dout("do_getvxattr result=%d\n", err);
>>> +     return err;
>>> +}
>>> +
>>>
>>>   /*
>>>    * Check inode permissions.  We verify we have a valid value for
>>> diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
>>> index c30eefc0ac19..a5eafc71d976 100644
>>> --- a/fs/ceph/mds_client.c
>>> +++ b/fs/ceph/mds_client.c
>>> @@ -555,6 +555,29 @@ static int parse_reply_info_create(void **p, void *end,
>>>        return -EIO;
>>>   }
>>>
>>> +static int parse_reply_info_getvxattr(void **p, void *end,
>>> +                                   struct ceph_mds_reply_info_parsed *info,
>>> +                                   u64 features)
>>> +{
>>> +     u8 struct_v, struct_compat;
>>> +     u32 struct_len;
>>> +     u32 value_len;
>>> +
>>> +     ceph_decode_8_safe(p, end, struct_v, bad);
>>> +     ceph_decode_8_safe(p, end, struct_compat, bad);
>>> +     ceph_decode_32_safe(p, end, struct_len, bad);
>>> +     ceph_decode_32_safe(p, end, value_len, bad);
>>> +
>>> +     if (value_len == end - *p) {
>>> +       info->xattr_info.xattr_value = *p;
>>> +       info->xattr_info.xattr_value_len = end - *p;
>>> +       *p = end;
>>> +       return info->xattr_info.xattr_value_len;
>>> +     }
>>> +bad:
>>> +     return -EIO;
>>> +}
>>> +
>>>   /*
>>>    * parse extra results
>>>    */
>>> @@ -570,6 +593,8 @@ static int parse_reply_info_extra(void **p, void *end,
>>>                return parse_reply_info_readdir(p, end, info, features);
>>>        else if (op == CEPH_MDS_OP_CREATE)
>>>                return parse_reply_info_create(p, end, info, features, s);
>>> +     else if (op == CEPH_MDS_OP_GETVXATTR)
>>> +             return parse_reply_info_getvxattr(p, end, info, features);
>>>        else
>>>                return -EIO;
>>>   }
>>> @@ -615,7 +640,7 @@ static int parse_reply_info(struct ceph_mds_session *s, struct ceph_msg *msg,
>>>
>>>        if (p != end)
>>>                goto bad;
>>> -     return 0;
>>> +     return err;
>>>
>>>   bad:
>>>        err = -EIO;
>>> diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
>>> index 97c7f7bfa55f..f2a8e5af3c2e 100644
>>> --- a/fs/ceph/mds_client.h
>>> +++ b/fs/ceph/mds_client.h
>>> @@ -29,8 +29,10 @@ enum ceph_feature_type {
>>>        CEPHFS_FEATURE_MULTI_RECONNECT,
>>>        CEPHFS_FEATURE_DELEG_INO,
>>>        CEPHFS_FEATURE_METRIC_COLLECT,
>>> +     CEPHFS_FEATURE_ALTERNATE_NAME,
>>> +     CEPHFS_FEATURE_GETVXATTR,
>>>
>>> -     CEPHFS_FEATURE_MAX = CEPHFS_FEATURE_METRIC_COLLECT,
>>> +     CEPHFS_FEATURE_MAX = CEPHFS_FEATURE_GETVXATTR,
>>>   };
>>>
>>>   /*
>>> @@ -45,6 +47,8 @@ enum ceph_feature_type {
>>>        CEPHFS_FEATURE_MULTI_RECONNECT,         \
>>>        CEPHFS_FEATURE_DELEG_INO,               \
>>>        CEPHFS_FEATURE_METRIC_COLLECT,          \
>>> +     CEPHFS_FEATURE_ALTERNATE_NAME,          \
>>> +     CEPHFS_FEATURE_GETVXATTR,               \
>>>                                                \
>>>        CEPHFS_FEATURE_MAX,                     \
>>>   }
>>> @@ -100,6 +104,11 @@ struct ceph_mds_reply_dir_entry {
>>>        loff_t                        offset;
>>>   };
>>>
>>> +struct ceph_mds_reply_xattr {
>>> +     char *xattr_value;
>>> +     size_t xattr_value_len;
>>> +};
>>> +
>>>   /*
>>>    * parsed info about an mds reply, including information about
>>>    * either: 1) the target inode and/or its parent directory and dentry,
>>> @@ -115,6 +124,7 @@ struct ceph_mds_reply_info_parsed {
>>>        char                          *dname;
>>>        u32                           dname_len;
>>>        struct ceph_mds_reply_lease   *dlease;
>>> +     struct ceph_mds_reply_xattr   xattr_info;
>>>
>>>        /* extra */
>>>        union {
>>> diff --git a/fs/ceph/strings.c b/fs/ceph/strings.c
>>> index 573bb9556fb5..e36e8948e728 100644
>>> --- a/fs/ceph/strings.c
>>> +++ b/fs/ceph/strings.c
>>> @@ -60,6 +60,7 @@ const char *ceph_mds_op_name(int op)
>>>        case CEPH_MDS_OP_LOOKUPINO:  return "lookupino";
>>>        case CEPH_MDS_OP_LOOKUPNAME:  return "lookupname";
>>>        case CEPH_MDS_OP_GETATTR:  return "getattr";
>>> +     case CEPH_MDS_OP_GETVXATTR:  return "getvxattr";
>>>        case CEPH_MDS_OP_SETXATTR: return "setxattr";
>>>        case CEPH_MDS_OP_SETATTR: return "setattr";
>>>        case CEPH_MDS_OP_RMXATTR: return "rmxattr";
>>> diff --git a/fs/ceph/super.h b/fs/ceph/super.h
>>> index ac331aa07cfa..a627fa69668e 100644
>>> --- a/fs/ceph/super.h
>>> +++ b/fs/ceph/super.h
>>> @@ -1043,6 +1043,7 @@ static inline bool ceph_inode_is_shutdown(struct inode *inode)
>>>
>>>   /* xattr.c */
>>>   int __ceph_setxattr(struct inode *, const char *, const void *, size_t, int);
>>> +int ceph_do_getvxattr(struct inode *inode, const char *name, void *value, size_t size);
>>>   ssize_t __ceph_getxattr(struct inode *, const char *, void *, size_t);
>>>   extern ssize_t ceph_listxattr(struct dentry *, char *, size_t);
>>>   extern struct ceph_buffer *__ceph_build_xattrs_blob(struct ceph_inode_info *ci);
>>> diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c
>>> index fcf7dfdecf96..dc32876a541a 100644
>>> --- a/fs/ceph/xattr.c
>>> +++ b/fs/ceph/xattr.c
>>> @@ -918,6 +918,30 @@ static inline int __get_request_mask(struct inode *in) {
>>>        return mask;
>>>   }
>>>
>>> +/* check if the entire cluster supports the given feature */
>>> +static inline bool ceph_cluster_has_feature(struct inode *inode, int feature_bit)
>>> +{
>>> +     int64_t i;
>>> +     struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
>>> +     struct ceph_mds_session **sessions = fsc->mdsc->sessions;
>>> +     int64_t num_sessions = atomic_read(&fsc->mdsc->num_sessions);
>>> +
>>> +     if (fsc->mdsc->stopping)
>>> +             return false;
>>> +
>>> +     if (!sessions)
>>> +             return false;
>>> +
>>> +     for (i = 0; i < num_sessions; i++) {
>>> +             struct ceph_mds_session *session = sessions[i];
>>> +             if (!session)
>>> +                     return false;
>>> +             if (!test_bit(feature_bit, &session->s_features))
>>> +                     return false;
>> What guarantee do you have that "session" will still be a valid pointer
>> by the time you get to dereferencing it here?
>>
>> I think this loop needs some locking (as Xiubo pointed out in his
>> earlier review).
> yeah, thanks for pointing that out
> I'm trying to wrap the entire processing of this function inside a
> mutex_unlock(&mdsc->mutex) ... but the mount command fails
> to mount if done so. If code is not wrapped in mutex lock...unlock
> then the mount is successful.
> It's a surprise that the code doesn't deadlock under the mutex
> lock...unlock and gracefully fails with a message.
> Any hints on what I could be missing.
>
>>> +     }
>>> +     return true;
>>> +}
>>> +
>>>   ssize_t __ceph_getxattr(struct inode *inode, const char *name, void *value,
>>>                      size_t size)
>>>   {
>>> @@ -927,6 +951,16 @@ ssize_t __ceph_getxattr(struct inode *inode, const char *name, void *value,
>>>        int req_mask;
>>>        ssize_t err;
>>>
>>> +     if (!strncmp(name, XATTR_CEPH_PREFIX, XATTR_CEPH_PREFIX_LEN) &&
>>> +         ceph_cluster_has_feature(inode, CEPHFS_FEATURE_GETVXATTR)) {
>>> +             err = ceph_do_getvxattr(inode, name, value, size);
>>> +             /* if cluster doesn't support xattr, we try to service it
>>> +              * locally
>>> +              */
>>> +             if (err >= 0)
>>> +                     return err;
>>> +     }
>>> +
>> What is this? Why not always service this locally?
> vxattr handling is planned to be moved to the MDS side.
> As I've pointed out to Xiubo, there's a few new things that have been done for
> layout vxattr management. Also, as per your original tracker, ceph.dir.pin*
> can't be handled locally.
> getvxattr() currently handles:
> 1. ceph.dir.layout*
> 2. ceph.file.layout*
> 3. ceph.dir.pin*

The above seems will include the 'ceph.dir.layout', 'ceph.file.layout' 
and 'ceph.dir.pin' ? All these have been handled in 'ceph_file_vxattrs' 
and 'ceph_dir_vxattrs'...

And the code above will always force kclient to get all the 'ceph.XXX' 
xattrs from MDS ?



> If kclient is new and the cluster is old, then layout vxattr will be
> handled the old
> way, i.e. locally. ceph.dir.pin* will remain inaccessible.
>
>>>        /* let's see if a virtual xattr was requested */
>>>        vxattr = ceph_match_vxattr(inode, name);
>>>        if (vxattr) {
>>> diff --git a/include/linux/ceph/ceph_fs.h b/include/linux/ceph/ceph_fs.h
>>> index 7ad6c3d0db7d..66db21ac5f0c 100644
>>> --- a/include/linux/ceph/ceph_fs.h
>>> +++ b/include/linux/ceph/ceph_fs.h
>>> @@ -328,6 +328,7 @@ enum {
>>>        CEPH_MDS_OP_LOOKUPPARENT = 0x00103,
>>>        CEPH_MDS_OP_LOOKUPINO  = 0x00104,
>>>        CEPH_MDS_OP_LOOKUPNAME = 0x00105,
>>> +     CEPH_MDS_OP_GETVXATTR  = 0x00106,
>>>
>>>        CEPH_MDS_OP_SETXATTR   = 0x01105,
>>>        CEPH_MDS_OP_RMXATTR    = 0x01106,
>> --
>> Jeff Layton <jlayton@kernel.org>


^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH v4 1/1] ceph: add getvxattr op
  2022-01-17 12:50       ` Xiubo Li
@ 2022-01-17 12:58         ` Milind Changire
  2022-01-17 13:09           ` Xiubo Li
  0 siblings, 1 reply; 13+ messages in thread
From: Milind Changire @ 2022-01-17 12:58 UTC (permalink / raw)
  To: Xiubo Li; +Cc: Jeff Layton, Ilya Dryomov, ceph-devel, Milind Changire

On Mon, Jan 17, 2022 at 6:20 PM Xiubo Li <xiubli@redhat.com> wrote:
>
>
> On 1/17/22 7:07 PM, Milind Changire wrote:
> > On Mon, Jan 17, 2022 at 4:23 PM Jeff Layton <jlayton@kernel.org> wrote:
> >> On Mon, 2022-01-17 at 08:51 +0000, Milind Changire wrote:
> >>> Problem:
> >>> Directory vxattrs like ceph.dir.pin* and ceph.dir.layout* may not be
> >>> propagated to the client as frequently to keep them updated. This
> >>> creates vxattr availability problems.
> >>>
> >>> Solution:
> >>> Adds new getvxattr op to fetch ceph.dir.pin*, ceph.dir.layout* and
> >>> ceph.file.layout* vxattrs.
> >>> If the entire layout for a dir or a file is being set, then it is
> >>> expected that the layout be set in standard JSON format. Individual
> >>> field value retrieval is not wrapped in JSON. The JSON format also
> >>> applies while setting the vxattr if the entire layout is being set in
> >>> one go.
> >>> As a temporary measure, setting a vxattr can also be done in the old
> >>> format. The old format will be deprecated in the future.
> >>>
> >>> URL: https://tracker.ceph.com/issues/51062
> >>> Signed-off-by: Milind Changire <mchangir@redhat.com>
> >>> ---
> >>>   fs/ceph/inode.c              | 51 ++++++++++++++++++++++++++++++++++++
> >>>   fs/ceph/mds_client.c         | 27 ++++++++++++++++++-
> >>>   fs/ceph/mds_client.h         | 12 ++++++++-
> >>>   fs/ceph/strings.c            |  1 +
> >>>   fs/ceph/super.h              |  1 +
> >>>   fs/ceph/xattr.c              | 34 ++++++++++++++++++++++++
> >>>   include/linux/ceph/ceph_fs.h |  1 +
> >>>   7 files changed, 125 insertions(+), 2 deletions(-)
> >>>
> >>> diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
> >>> index e3322fcb2e8d..efdce049b7f0 100644
> >>> --- a/fs/ceph/inode.c
> >>> +++ b/fs/ceph/inode.c
> >>> @@ -2291,6 +2291,57 @@ int __ceph_do_getattr(struct inode *inode, struct page *locked_page,
> >>>        return err;
> >>>   }
> >>>
> >>> +int ceph_do_getvxattr(struct inode *inode, const char *name, void *value,
> >>> +                   size_t size)
> >>> +{
> >>> +     struct ceph_fs_client *fsc = ceph_sb_to_client(inode->i_sb);
> >>> +     struct ceph_mds_client *mdsc = fsc->mdsc;
> >>> +     struct ceph_mds_request *req;
> >>> +     int mode = USE_AUTH_MDS;
> >>> +     int err;
> >>> +     char *xattr_value;
> >>> +     size_t xattr_value_len;
> >>> +
> >>> +     req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_GETVXATTR, mode);
> >>> +     if (IS_ERR(req)) {
> >>> +             err = -ENOMEM;
> >>> +             goto out;
> >>> +     }
> >>> +
> >>> +     req->r_path2 = kstrdup(name, GFP_NOFS);
> >>> +     if (!req->r_path2) {
> >>> +             err = -ENOMEM;
> >>> +             goto put;
> >>> +     }
> >>> +
> >>> +     ihold(inode);
> >>> +     req->r_inode = inode;
> >>> +     err = ceph_mdsc_do_request(mdsc, NULL, req);
> >>> +     if (err < 0)
> >>> +             goto put;
> >>> +
> >>> +     xattr_value = req->r_reply_info.xattr_info.xattr_value;
> >>> +     xattr_value_len = req->r_reply_info.xattr_info.xattr_value_len;
> >>> +
> >>> +     dout("do_getvxattr xattr_value_len:%zu, size:%zu\n", xattr_value_len, size);
> >>> +
> >>> +     err = (int)xattr_value_len;
> >>> +     if (size == 0)
> >>> +             goto put;
> >>> +
> >>> +     if (xattr_value_len > size) {
> >>> +             err = -ERANGE;
> >>> +             goto put;
> >>> +     }
> >>> +
> >>> +     memcpy(value, xattr_value, xattr_value_len);
> >>> +put:
> >>> +     ceph_mdsc_put_request(req);
> >>> +out:
> >>> +     dout("do_getvxattr result=%d\n", err);
> >>> +     return err;
> >>> +}
> >>> +
> >>>
> >>>   /*
> >>>    * Check inode permissions.  We verify we have a valid value for
> >>> diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
> >>> index c30eefc0ac19..a5eafc71d976 100644
> >>> --- a/fs/ceph/mds_client.c
> >>> +++ b/fs/ceph/mds_client.c
> >>> @@ -555,6 +555,29 @@ static int parse_reply_info_create(void **p, void *end,
> >>>        return -EIO;
> >>>   }
> >>>
> >>> +static int parse_reply_info_getvxattr(void **p, void *end,
> >>> +                                   struct ceph_mds_reply_info_parsed *info,
> >>> +                                   u64 features)
> >>> +{
> >>> +     u8 struct_v, struct_compat;
> >>> +     u32 struct_len;
> >>> +     u32 value_len;
> >>> +
> >>> +     ceph_decode_8_safe(p, end, struct_v, bad);
> >>> +     ceph_decode_8_safe(p, end, struct_compat, bad);
> >>> +     ceph_decode_32_safe(p, end, struct_len, bad);
> >>> +     ceph_decode_32_safe(p, end, value_len, bad);
> >>> +
> >>> +     if (value_len == end - *p) {
> >>> +       info->xattr_info.xattr_value = *p;
> >>> +       info->xattr_info.xattr_value_len = end - *p;
> >>> +       *p = end;
> >>> +       return info->xattr_info.xattr_value_len;
> >>> +     }
> >>> +bad:
> >>> +     return -EIO;
> >>> +}
> >>> +
> >>>   /*
> >>>    * parse extra results
> >>>    */
> >>> @@ -570,6 +593,8 @@ static int parse_reply_info_extra(void **p, void *end,
> >>>                return parse_reply_info_readdir(p, end, info, features);
> >>>        else if (op == CEPH_MDS_OP_CREATE)
> >>>                return parse_reply_info_create(p, end, info, features, s);
> >>> +     else if (op == CEPH_MDS_OP_GETVXATTR)
> >>> +             return parse_reply_info_getvxattr(p, end, info, features);
> >>>        else
> >>>                return -EIO;
> >>>   }
> >>> @@ -615,7 +640,7 @@ static int parse_reply_info(struct ceph_mds_session *s, struct ceph_msg *msg,
> >>>
> >>>        if (p != end)
> >>>                goto bad;
> >>> -     return 0;
> >>> +     return err;
> >>>
> >>>   bad:
> >>>        err = -EIO;
> >>> diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
> >>> index 97c7f7bfa55f..f2a8e5af3c2e 100644
> >>> --- a/fs/ceph/mds_client.h
> >>> +++ b/fs/ceph/mds_client.h
> >>> @@ -29,8 +29,10 @@ enum ceph_feature_type {
> >>>        CEPHFS_FEATURE_MULTI_RECONNECT,
> >>>        CEPHFS_FEATURE_DELEG_INO,
> >>>        CEPHFS_FEATURE_METRIC_COLLECT,
> >>> +     CEPHFS_FEATURE_ALTERNATE_NAME,
> >>> +     CEPHFS_FEATURE_GETVXATTR,
> >>>
> >>> -     CEPHFS_FEATURE_MAX = CEPHFS_FEATURE_METRIC_COLLECT,
> >>> +     CEPHFS_FEATURE_MAX = CEPHFS_FEATURE_GETVXATTR,
> >>>   };
> >>>
> >>>   /*
> >>> @@ -45,6 +47,8 @@ enum ceph_feature_type {
> >>>        CEPHFS_FEATURE_MULTI_RECONNECT,         \
> >>>        CEPHFS_FEATURE_DELEG_INO,               \
> >>>        CEPHFS_FEATURE_METRIC_COLLECT,          \
> >>> +     CEPHFS_FEATURE_ALTERNATE_NAME,          \
> >>> +     CEPHFS_FEATURE_GETVXATTR,               \
> >>>                                                \
> >>>        CEPHFS_FEATURE_MAX,                     \
> >>>   }
> >>> @@ -100,6 +104,11 @@ struct ceph_mds_reply_dir_entry {
> >>>        loff_t                        offset;
> >>>   };
> >>>
> >>> +struct ceph_mds_reply_xattr {
> >>> +     char *xattr_value;
> >>> +     size_t xattr_value_len;
> >>> +};
> >>> +
> >>>   /*
> >>>    * parsed info about an mds reply, including information about
> >>>    * either: 1) the target inode and/or its parent directory and dentry,
> >>> @@ -115,6 +124,7 @@ struct ceph_mds_reply_info_parsed {
> >>>        char                          *dname;
> >>>        u32                           dname_len;
> >>>        struct ceph_mds_reply_lease   *dlease;
> >>> +     struct ceph_mds_reply_xattr   xattr_info;
> >>>
> >>>        /* extra */
> >>>        union {
> >>> diff --git a/fs/ceph/strings.c b/fs/ceph/strings.c
> >>> index 573bb9556fb5..e36e8948e728 100644
> >>> --- a/fs/ceph/strings.c
> >>> +++ b/fs/ceph/strings.c
> >>> @@ -60,6 +60,7 @@ const char *ceph_mds_op_name(int op)
> >>>        case CEPH_MDS_OP_LOOKUPINO:  return "lookupino";
> >>>        case CEPH_MDS_OP_LOOKUPNAME:  return "lookupname";
> >>>        case CEPH_MDS_OP_GETATTR:  return "getattr";
> >>> +     case CEPH_MDS_OP_GETVXATTR:  return "getvxattr";
> >>>        case CEPH_MDS_OP_SETXATTR: return "setxattr";
> >>>        case CEPH_MDS_OP_SETATTR: return "setattr";
> >>>        case CEPH_MDS_OP_RMXATTR: return "rmxattr";
> >>> diff --git a/fs/ceph/super.h b/fs/ceph/super.h
> >>> index ac331aa07cfa..a627fa69668e 100644
> >>> --- a/fs/ceph/super.h
> >>> +++ b/fs/ceph/super.h
> >>> @@ -1043,6 +1043,7 @@ static inline bool ceph_inode_is_shutdown(struct inode *inode)
> >>>
> >>>   /* xattr.c */
> >>>   int __ceph_setxattr(struct inode *, const char *, const void *, size_t, int);
> >>> +int ceph_do_getvxattr(struct inode *inode, const char *name, void *value, size_t size);
> >>>   ssize_t __ceph_getxattr(struct inode *, const char *, void *, size_t);
> >>>   extern ssize_t ceph_listxattr(struct dentry *, char *, size_t);
> >>>   extern struct ceph_buffer *__ceph_build_xattrs_blob(struct ceph_inode_info *ci);
> >>> diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c
> >>> index fcf7dfdecf96..dc32876a541a 100644
> >>> --- a/fs/ceph/xattr.c
> >>> +++ b/fs/ceph/xattr.c
> >>> @@ -918,6 +918,30 @@ static inline int __get_request_mask(struct inode *in) {
> >>>        return mask;
> >>>   }
> >>>
> >>> +/* check if the entire cluster supports the given feature */
> >>> +static inline bool ceph_cluster_has_feature(struct inode *inode, int feature_bit)
> >>> +{
> >>> +     int64_t i;
> >>> +     struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
> >>> +     struct ceph_mds_session **sessions = fsc->mdsc->sessions;
> >>> +     int64_t num_sessions = atomic_read(&fsc->mdsc->num_sessions);
> >>> +
> >>> +     if (fsc->mdsc->stopping)
> >>> +             return false;
> >>> +
> >>> +     if (!sessions)
> >>> +             return false;
> >>> +
> >>> +     for (i = 0; i < num_sessions; i++) {
> >>> +             struct ceph_mds_session *session = sessions[i];
> >>> +             if (!session)
> >>> +                     return false;
> >>> +             if (!test_bit(feature_bit, &session->s_features))
> >>> +                     return false;
> >> What guarantee do you have that "session" will still be a valid pointer
> >> by the time you get to dereferencing it here?
> >>
> >> I think this loop needs some locking (as Xiubo pointed out in his
> >> earlier review).
> > yeah, thanks for pointing that out
> > I'm trying to wrap the entire processing of this function inside a
> > mutex_unlock(&mdsc->mutex) ... but the mount command fails
> > to mount if done so. If code is not wrapped in mutex lock...unlock
> > then the mount is successful.
> > It's a surprise that the code doesn't deadlock under the mutex
> > lock...unlock and gracefully fails with a message.
> > Any hints on what I could be missing.
> >
> >>> +     }
> >>> +     return true;
> >>> +}
> >>> +
> >>>   ssize_t __ceph_getxattr(struct inode *inode, const char *name, void *value,
> >>>                      size_t size)
> >>>   {
> >>> @@ -927,6 +951,16 @@ ssize_t __ceph_getxattr(struct inode *inode, const char *name, void *value,
> >>>        int req_mask;
> >>>        ssize_t err;
> >>>
> >>> +     if (!strncmp(name, XATTR_CEPH_PREFIX, XATTR_CEPH_PREFIX_LEN) &&
> >>> +         ceph_cluster_has_feature(inode, CEPHFS_FEATURE_GETVXATTR)) {
> >>> +             err = ceph_do_getvxattr(inode, name, value, size);
> >>> +             /* if cluster doesn't support xattr, we try to service it
> >>> +              * locally
> >>> +              */
> >>> +             if (err >= 0)
> >>> +                     return err;
> >>> +     }
> >>> +
> >> What is this? Why not always service this locally?
> > vxattr handling is planned to be moved to the MDS side.
> > As I've pointed out to Xiubo, there's a few new things that have been done for
> > layout vxattr management. Also, as per your original tracker, ceph.dir.pin*
> > can't be handled locally.
> > getvxattr() currently handles:
> > 1. ceph.dir.layout*
> > 2. ceph.file.layout*
> > 3. ceph.dir.pin*
>
> The above seems will include the 'ceph.dir.layout', 'ceph.file.layout'
> and 'ceph.dir.pin' ? All these have been handled in 'ceph_file_vxattrs'
> and 'ceph_dir_vxattrs'...
>
> And the code above will always force kclient to get all the 'ceph.XXX'
> xattrs from MDS ?

yes, the kclient will always get vxattr values from MDS
for old cluster, op will be handled locally

Jeff has a proposal to expose the JSON output variety of layout
vxattr vlaue via new vxattr altogether
eg. ceph.dir.layout_json and ceph.file.layout_json

>
>
>
> > If kclient is new and the cluster is old, then layout vxattr will be
> > handled the old
> > way, i.e. locally. ceph.dir.pin* will remain inaccessible.
> >
> >>>        /* let's see if a virtual xattr was requested */
> >>>        vxattr = ceph_match_vxattr(inode, name);
> >>>        if (vxattr) {
> >>> diff --git a/include/linux/ceph/ceph_fs.h b/include/linux/ceph/ceph_fs.h
> >>> index 7ad6c3d0db7d..66db21ac5f0c 100644
> >>> --- a/include/linux/ceph/ceph_fs.h
> >>> +++ b/include/linux/ceph/ceph_fs.h
> >>> @@ -328,6 +328,7 @@ enum {
> >>>        CEPH_MDS_OP_LOOKUPPARENT = 0x00103,
> >>>        CEPH_MDS_OP_LOOKUPINO  = 0x00104,
> >>>        CEPH_MDS_OP_LOOKUPNAME = 0x00105,
> >>> +     CEPH_MDS_OP_GETVXATTR  = 0x00106,
> >>>
> >>>        CEPH_MDS_OP_SETXATTR   = 0x01105,
> >>>        CEPH_MDS_OP_RMXATTR    = 0x01106,
> >> --
> >> Jeff Layton <jlayton@kernel.org>
>

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH v4 1/1] ceph: add getvxattr op
  2022-01-17 12:58         ` Milind Changire
@ 2022-01-17 13:09           ` Xiubo Li
  2022-01-17 13:27             ` Jeff Layton
  0 siblings, 1 reply; 13+ messages in thread
From: Xiubo Li @ 2022-01-17 13:09 UTC (permalink / raw)
  To: Milind Changire; +Cc: Jeff Layton, Ilya Dryomov, ceph-devel, Milind Changire


On 1/17/22 8:58 PM, Milind Changire wrote:
> On Mon, Jan 17, 2022 at 6:20 PM Xiubo Li <xiubli@redhat.com> wrote:
>>
>> On 1/17/22 7:07 PM, Milind Changire wrote:
>>> On Mon, Jan 17, 2022 at 4:23 PM Jeff Layton <jlayton@kernel.org> wrote:
>>>> On Mon, 2022-01-17 at 08:51 +0000, Milind Changire wrote:
>>>>> Problem:
>>>>> Directory vxattrs like ceph.dir.pin* and ceph.dir.layout* may not be
>>>>> propagated to the client as frequently to keep them updated. This
>>>>> creates vxattr availability problems.
>>>>>
>>>>> Solution:
>>>>> Adds new getvxattr op to fetch ceph.dir.pin*, ceph.dir.layout* and
>>>>> ceph.file.layout* vxattrs.
>>>>> If the entire layout for a dir or a file is being set, then it is
>>>>> expected that the layout be set in standard JSON format. Individual
>>>>> field value retrieval is not wrapped in JSON. The JSON format also
>>>>> applies while setting the vxattr if the entire layout is being set in
>>>>> one go.
>>>>> As a temporary measure, setting a vxattr can also be done in the old
>>>>> format. The old format will be deprecated in the future.
>>>>>
>>>>> URL: https://tracker.ceph.com/issues/51062
>>>>> Signed-off-by: Milind Changire <mchangir@redhat.com>
>>>>> ---
>>>>>    fs/ceph/inode.c              | 51 ++++++++++++++++++++++++++++++++++++
>>>>>    fs/ceph/mds_client.c         | 27 ++++++++++++++++++-
>>>>>    fs/ceph/mds_client.h         | 12 ++++++++-
>>>>>    fs/ceph/strings.c            |  1 +
>>>>>    fs/ceph/super.h              |  1 +
>>>>>    fs/ceph/xattr.c              | 34 ++++++++++++++++++++++++
>>>>>    include/linux/ceph/ceph_fs.h |  1 +
>>>>>    7 files changed, 125 insertions(+), 2 deletions(-)
>>>>>
>>>>> diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
>>>>> index e3322fcb2e8d..efdce049b7f0 100644
>>>>> --- a/fs/ceph/inode.c
>>>>> +++ b/fs/ceph/inode.c
>>>>> @@ -2291,6 +2291,57 @@ int __ceph_do_getattr(struct inode *inode, struct page *locked_page,
>>>>>         return err;
>>>>>    }
>>>>>
>>>>> +int ceph_do_getvxattr(struct inode *inode, const char *name, void *value,
>>>>> +                   size_t size)
>>>>> +{
>>>>> +     struct ceph_fs_client *fsc = ceph_sb_to_client(inode->i_sb);
>>>>> +     struct ceph_mds_client *mdsc = fsc->mdsc;
>>>>> +     struct ceph_mds_request *req;
>>>>> +     int mode = USE_AUTH_MDS;
>>>>> +     int err;
>>>>> +     char *xattr_value;
>>>>> +     size_t xattr_value_len;
>>>>> +
>>>>> +     req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_GETVXATTR, mode);
>>>>> +     if (IS_ERR(req)) {
>>>>> +             err = -ENOMEM;
>>>>> +             goto out;
>>>>> +     }
>>>>> +
>>>>> +     req->r_path2 = kstrdup(name, GFP_NOFS);
>>>>> +     if (!req->r_path2) {
>>>>> +             err = -ENOMEM;
>>>>> +             goto put;
>>>>> +     }
>>>>> +
>>>>> +     ihold(inode);
>>>>> +     req->r_inode = inode;
>>>>> +     err = ceph_mdsc_do_request(mdsc, NULL, req);
>>>>> +     if (err < 0)
>>>>> +             goto put;
>>>>> +
>>>>> +     xattr_value = req->r_reply_info.xattr_info.xattr_value;
>>>>> +     xattr_value_len = req->r_reply_info.xattr_info.xattr_value_len;
>>>>> +
>>>>> +     dout("do_getvxattr xattr_value_len:%zu, size:%zu\n", xattr_value_len, size);
>>>>> +
>>>>> +     err = (int)xattr_value_len;
>>>>> +     if (size == 0)
>>>>> +             goto put;
>>>>> +
>>>>> +     if (xattr_value_len > size) {
>>>>> +             err = -ERANGE;
>>>>> +             goto put;
>>>>> +     }
>>>>> +
>>>>> +     memcpy(value, xattr_value, xattr_value_len);
>>>>> +put:
>>>>> +     ceph_mdsc_put_request(req);
>>>>> +out:
>>>>> +     dout("do_getvxattr result=%d\n", err);
>>>>> +     return err;
>>>>> +}
>>>>> +
>>>>>
>>>>>    /*
>>>>>     * Check inode permissions.  We verify we have a valid value for
>>>>> diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
>>>>> index c30eefc0ac19..a5eafc71d976 100644
>>>>> --- a/fs/ceph/mds_client.c
>>>>> +++ b/fs/ceph/mds_client.c
>>>>> @@ -555,6 +555,29 @@ static int parse_reply_info_create(void **p, void *end,
>>>>>         return -EIO;
>>>>>    }
>>>>>
>>>>> +static int parse_reply_info_getvxattr(void **p, void *end,
>>>>> +                                   struct ceph_mds_reply_info_parsed *info,
>>>>> +                                   u64 features)
>>>>> +{
>>>>> +     u8 struct_v, struct_compat;
>>>>> +     u32 struct_len;
>>>>> +     u32 value_len;
>>>>> +
>>>>> +     ceph_decode_8_safe(p, end, struct_v, bad);
>>>>> +     ceph_decode_8_safe(p, end, struct_compat, bad);
>>>>> +     ceph_decode_32_safe(p, end, struct_len, bad);
>>>>> +     ceph_decode_32_safe(p, end, value_len, bad);
>>>>> +
>>>>> +     if (value_len == end - *p) {
>>>>> +       info->xattr_info.xattr_value = *p;
>>>>> +       info->xattr_info.xattr_value_len = end - *p;
>>>>> +       *p = end;
>>>>> +       return info->xattr_info.xattr_value_len;
>>>>> +     }
>>>>> +bad:
>>>>> +     return -EIO;
>>>>> +}
>>>>> +
>>>>>    /*
>>>>>     * parse extra results
>>>>>     */
>>>>> @@ -570,6 +593,8 @@ static int parse_reply_info_extra(void **p, void *end,
>>>>>                 return parse_reply_info_readdir(p, end, info, features);
>>>>>         else if (op == CEPH_MDS_OP_CREATE)
>>>>>                 return parse_reply_info_create(p, end, info, features, s);
>>>>> +     else if (op == CEPH_MDS_OP_GETVXATTR)
>>>>> +             return parse_reply_info_getvxattr(p, end, info, features);
>>>>>         else
>>>>>                 return -EIO;
>>>>>    }
>>>>> @@ -615,7 +640,7 @@ static int parse_reply_info(struct ceph_mds_session *s, struct ceph_msg *msg,
>>>>>
>>>>>         if (p != end)
>>>>>                 goto bad;
>>>>> -     return 0;
>>>>> +     return err;
>>>>>
>>>>>    bad:
>>>>>         err = -EIO;
>>>>> diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
>>>>> index 97c7f7bfa55f..f2a8e5af3c2e 100644
>>>>> --- a/fs/ceph/mds_client.h
>>>>> +++ b/fs/ceph/mds_client.h
>>>>> @@ -29,8 +29,10 @@ enum ceph_feature_type {
>>>>>         CEPHFS_FEATURE_MULTI_RECONNECT,
>>>>>         CEPHFS_FEATURE_DELEG_INO,
>>>>>         CEPHFS_FEATURE_METRIC_COLLECT,
>>>>> +     CEPHFS_FEATURE_ALTERNATE_NAME,
>>>>> +     CEPHFS_FEATURE_GETVXATTR,
>>>>>
>>>>> -     CEPHFS_FEATURE_MAX = CEPHFS_FEATURE_METRIC_COLLECT,
>>>>> +     CEPHFS_FEATURE_MAX = CEPHFS_FEATURE_GETVXATTR,
>>>>>    };
>>>>>
>>>>>    /*
>>>>> @@ -45,6 +47,8 @@ enum ceph_feature_type {
>>>>>         CEPHFS_FEATURE_MULTI_RECONNECT,         \
>>>>>         CEPHFS_FEATURE_DELEG_INO,               \
>>>>>         CEPHFS_FEATURE_METRIC_COLLECT,          \
>>>>> +     CEPHFS_FEATURE_ALTERNATE_NAME,          \
>>>>> +     CEPHFS_FEATURE_GETVXATTR,               \
>>>>>                                                 \
>>>>>         CEPHFS_FEATURE_MAX,                     \
>>>>>    }
>>>>> @@ -100,6 +104,11 @@ struct ceph_mds_reply_dir_entry {
>>>>>         loff_t                        offset;
>>>>>    };
>>>>>
>>>>> +struct ceph_mds_reply_xattr {
>>>>> +     char *xattr_value;
>>>>> +     size_t xattr_value_len;
>>>>> +};
>>>>> +
>>>>>    /*
>>>>>     * parsed info about an mds reply, including information about
>>>>>     * either: 1) the target inode and/or its parent directory and dentry,
>>>>> @@ -115,6 +124,7 @@ struct ceph_mds_reply_info_parsed {
>>>>>         char                          *dname;
>>>>>         u32                           dname_len;
>>>>>         struct ceph_mds_reply_lease   *dlease;
>>>>> +     struct ceph_mds_reply_xattr   xattr_info;
>>>>>
>>>>>         /* extra */
>>>>>         union {
>>>>> diff --git a/fs/ceph/strings.c b/fs/ceph/strings.c
>>>>> index 573bb9556fb5..e36e8948e728 100644
>>>>> --- a/fs/ceph/strings.c
>>>>> +++ b/fs/ceph/strings.c
>>>>> @@ -60,6 +60,7 @@ const char *ceph_mds_op_name(int op)
>>>>>         case CEPH_MDS_OP_LOOKUPINO:  return "lookupino";
>>>>>         case CEPH_MDS_OP_LOOKUPNAME:  return "lookupname";
>>>>>         case CEPH_MDS_OP_GETATTR:  return "getattr";
>>>>> +     case CEPH_MDS_OP_GETVXATTR:  return "getvxattr";
>>>>>         case CEPH_MDS_OP_SETXATTR: return "setxattr";
>>>>>         case CEPH_MDS_OP_SETATTR: return "setattr";
>>>>>         case CEPH_MDS_OP_RMXATTR: return "rmxattr";
>>>>> diff --git a/fs/ceph/super.h b/fs/ceph/super.h
>>>>> index ac331aa07cfa..a627fa69668e 100644
>>>>> --- a/fs/ceph/super.h
>>>>> +++ b/fs/ceph/super.h
>>>>> @@ -1043,6 +1043,7 @@ static inline bool ceph_inode_is_shutdown(struct inode *inode)
>>>>>
>>>>>    /* xattr.c */
>>>>>    int __ceph_setxattr(struct inode *, const char *, const void *, size_t, int);
>>>>> +int ceph_do_getvxattr(struct inode *inode, const char *name, void *value, size_t size);
>>>>>    ssize_t __ceph_getxattr(struct inode *, const char *, void *, size_t);
>>>>>    extern ssize_t ceph_listxattr(struct dentry *, char *, size_t);
>>>>>    extern struct ceph_buffer *__ceph_build_xattrs_blob(struct ceph_inode_info *ci);
>>>>> diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c
>>>>> index fcf7dfdecf96..dc32876a541a 100644
>>>>> --- a/fs/ceph/xattr.c
>>>>> +++ b/fs/ceph/xattr.c
>>>>> @@ -918,6 +918,30 @@ static inline int __get_request_mask(struct inode *in) {
>>>>>         return mask;
>>>>>    }
>>>>>
>>>>> +/* check if the entire cluster supports the given feature */
>>>>> +static inline bool ceph_cluster_has_feature(struct inode *inode, int feature_bit)
>>>>> +{
>>>>> +     int64_t i;
>>>>> +     struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
>>>>> +     struct ceph_mds_session **sessions = fsc->mdsc->sessions;
>>>>> +     int64_t num_sessions = atomic_read(&fsc->mdsc->num_sessions);
>>>>> +
>>>>> +     if (fsc->mdsc->stopping)
>>>>> +             return false;
>>>>> +
>>>>> +     if (!sessions)
>>>>> +             return false;
>>>>> +
>>>>> +     for (i = 0; i < num_sessions; i++) {
>>>>> +             struct ceph_mds_session *session = sessions[i];
>>>>> +             if (!session)
>>>>> +                     return false;
>>>>> +             if (!test_bit(feature_bit, &session->s_features))
>>>>> +                     return false;
>>>> What guarantee do you have that "session" will still be a valid pointer
>>>> by the time you get to dereferencing it here?
>>>>
>>>> I think this loop needs some locking (as Xiubo pointed out in his
>>>> earlier review).
>>> yeah, thanks for pointing that out
>>> I'm trying to wrap the entire processing of this function inside a
>>> mutex_unlock(&mdsc->mutex) ... but the mount command fails
>>> to mount if done so. If code is not wrapped in mutex lock...unlock
>>> then the mount is successful.
>>> It's a surprise that the code doesn't deadlock under the mutex
>>> lock...unlock and gracefully fails with a message.
>>> Any hints on what I could be missing.
>>>
>>>>> +     }
>>>>> +     return true;
>>>>> +}
>>>>> +
>>>>>    ssize_t __ceph_getxattr(struct inode *inode, const char *name, void *value,
>>>>>                       size_t size)
>>>>>    {
>>>>> @@ -927,6 +951,16 @@ ssize_t __ceph_getxattr(struct inode *inode, const char *name, void *value,
>>>>>         int req_mask;
>>>>>         ssize_t err;
>>>>>
>>>>> +     if (!strncmp(name, XATTR_CEPH_PREFIX, XATTR_CEPH_PREFIX_LEN) &&
>>>>> +         ceph_cluster_has_feature(inode, CEPHFS_FEATURE_GETVXATTR)) {
>>>>> +             err = ceph_do_getvxattr(inode, name, value, size);
>>>>> +             /* if cluster doesn't support xattr, we try to service it
>>>>> +              * locally
>>>>> +              */
>>>>> +             if (err >= 0)
>>>>> +                     return err;
>>>>> +     }
>>>>> +
>>>> What is this? Why not always service this locally?
>>> vxattr handling is planned to be moved to the MDS side.
>>> As I've pointed out to Xiubo, there's a few new things that have been done for
>>> layout vxattr management. Also, as per your original tracker, ceph.dir.pin*
>>> can't be handled locally.
>>> getvxattr() currently handles:
>>> 1. ceph.dir.layout*
>>> 2. ceph.file.layout*
>>> 3. ceph.dir.pin*
>> The above seems will include the 'ceph.dir.layout', 'ceph.file.layout'
>> and 'ceph.dir.pin' ? All these have been handled in 'ceph_file_vxattrs'
>> and 'ceph_dir_vxattrs'...
>>
>> And the code above will always force kclient to get all the 'ceph.XXX'
>> xattrs from MDS ?
> yes, the kclient will always get vxattr values from MDS
> for old cluster, op will be handled locally
>
> Jeff has a proposal to expose the JSON output variety of layout
> vxattr vlaue via new vxattr altogether
> eg. ceph.dir.layout_json and ceph.file.layout_json
>
Yeah, sounds good. Or maybe just adding the '.json' instead of '_json'.


>>
>>
>>> If kclient is new and the cluster is old, then layout vxattr will be
>>> handled the old
>>> way, i.e. locally. ceph.dir.pin* will remain inaccessible.
>>>
>>>>>         /* let's see if a virtual xattr was requested */
>>>>>         vxattr = ceph_match_vxattr(inode, name);
>>>>>         if (vxattr) {
>>>>> diff --git a/include/linux/ceph/ceph_fs.h b/include/linux/ceph/ceph_fs.h
>>>>> index 7ad6c3d0db7d..66db21ac5f0c 100644
>>>>> --- a/include/linux/ceph/ceph_fs.h
>>>>> +++ b/include/linux/ceph/ceph_fs.h
>>>>> @@ -328,6 +328,7 @@ enum {
>>>>>         CEPH_MDS_OP_LOOKUPPARENT = 0x00103,
>>>>>         CEPH_MDS_OP_LOOKUPINO  = 0x00104,
>>>>>         CEPH_MDS_OP_LOOKUPNAME = 0x00105,
>>>>> +     CEPH_MDS_OP_GETVXATTR  = 0x00106,
>>>>>
>>>>>         CEPH_MDS_OP_SETXATTR   = 0x01105,
>>>>>         CEPH_MDS_OP_RMXATTR    = 0x01106,
>>>> --
>>>> Jeff Layton <jlayton@kernel.org>


^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH v4 1/1] ceph: add getvxattr op
  2022-01-17 13:09           ` Xiubo Li
@ 2022-01-17 13:27             ` Jeff Layton
  2022-01-17 13:56               ` Xiubo Li
  2022-01-17 14:50               ` Gregory Farnum
  0 siblings, 2 replies; 13+ messages in thread
From: Jeff Layton @ 2022-01-17 13:27 UTC (permalink / raw)
  To: Xiubo Li, Milind Changire; +Cc: Ilya Dryomov, ceph-devel, Milind Changire

On Mon, 2022-01-17 at 21:09 +0800, Xiubo Li wrote:
> On 1/17/22 8:58 PM, Milind Changire wrote:
> > On Mon, Jan 17, 2022 at 6:20 PM Xiubo Li <xiubli@redhat.com> wrote:
> > > 
> > > On 1/17/22 7:07 PM, Milind Changire wrote:
> > > > On Mon, Jan 17, 2022 at 4:23 PM Jeff Layton <jlayton@kernel.org> wrote:
> > > > > On Mon, 2022-01-17 at 08:51 +0000, Milind Changire wrote:
> > > > > > Problem:
> > > > > > Directory vxattrs like ceph.dir.pin* and ceph.dir.layout* may not be
> > > > > > propagated to the client as frequently to keep them updated. This
> > > > > > creates vxattr availability problems.
> > > > > > 
> > > > > > Solution:
> > > > > > Adds new getvxattr op to fetch ceph.dir.pin*, ceph.dir.layout* and
> > > > > > ceph.file.layout* vxattrs.
> > > > > > If the entire layout for a dir or a file is being set, then it is
> > > > > > expected that the layout be set in standard JSON format. Individual
> > > > > > field value retrieval is not wrapped in JSON. The JSON format also
> > > > > > applies while setting the vxattr if the entire layout is being set in
> > > > > > one go.
> > > > > > As a temporary measure, setting a vxattr can also be done in the old
> > > > > > format. The old format will be deprecated in the future.
> > > > > > 
> > > > > > URL: https://tracker.ceph.com/issues/51062
> > > > > > Signed-off-by: Milind Changire <mchangir@redhat.com>
> > > > > > ---
> > > > > >    fs/ceph/inode.c              | 51 ++++++++++++++++++++++++++++++++++++
> > > > > >    fs/ceph/mds_client.c         | 27 ++++++++++++++++++-
> > > > > >    fs/ceph/mds_client.h         | 12 ++++++++-
> > > > > >    fs/ceph/strings.c            |  1 +
> > > > > >    fs/ceph/super.h              |  1 +
> > > > > >    fs/ceph/xattr.c              | 34 ++++++++++++++++++++++++
> > > > > >    include/linux/ceph/ceph_fs.h |  1 +
> > > > > >    7 files changed, 125 insertions(+), 2 deletions(-)
> > > > > > 
> > > > > > diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
> > > > > > index e3322fcb2e8d..efdce049b7f0 100644
> > > > > > --- a/fs/ceph/inode.c
> > > > > > +++ b/fs/ceph/inode.c
> > > > > > @@ -2291,6 +2291,57 @@ int __ceph_do_getattr(struct inode *inode, struct page *locked_page,
> > > > > >         return err;
> > > > > >    }
> > > > > > 
> > > > > > +int ceph_do_getvxattr(struct inode *inode, const char *name, void *value,
> > > > > > +                   size_t size)
> > > > > > +{
> > > > > > +     struct ceph_fs_client *fsc = ceph_sb_to_client(inode->i_sb);
> > > > > > +     struct ceph_mds_client *mdsc = fsc->mdsc;
> > > > > > +     struct ceph_mds_request *req;
> > > > > > +     int mode = USE_AUTH_MDS;
> > > > > > +     int err;
> > > > > > +     char *xattr_value;
> > > > > > +     size_t xattr_value_len;
> > > > > > +
> > > > > > +     req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_GETVXATTR, mode);
> > > > > > +     if (IS_ERR(req)) {
> > > > > > +             err = -ENOMEM;
> > > > > > +             goto out;
> > > > > > +     }
> > > > > > +
> > > > > > +     req->r_path2 = kstrdup(name, GFP_NOFS);
> > > > > > +     if (!req->r_path2) {
> > > > > > +             err = -ENOMEM;
> > > > > > +             goto put;
> > > > > > +     }
> > > > > > +
> > > > > > +     ihold(inode);
> > > > > > +     req->r_inode = inode;
> > > > > > +     err = ceph_mdsc_do_request(mdsc, NULL, req);
> > > > > > +     if (err < 0)
> > > > > > +             goto put;
> > > > > > +
> > > > > > +     xattr_value = req->r_reply_info.xattr_info.xattr_value;
> > > > > > +     xattr_value_len = req->r_reply_info.xattr_info.xattr_value_len;
> > > > > > +
> > > > > > +     dout("do_getvxattr xattr_value_len:%zu, size:%zu\n", xattr_value_len, size);
> > > > > > +
> > > > > > +     err = (int)xattr_value_len;
> > > > > > +     if (size == 0)
> > > > > > +             goto put;
> > > > > > +
> > > > > > +     if (xattr_value_len > size) {
> > > > > > +             err = -ERANGE;
> > > > > > +             goto put;
> > > > > > +     }
> > > > > > +
> > > > > > +     memcpy(value, xattr_value, xattr_value_len);
> > > > > > +put:
> > > > > > +     ceph_mdsc_put_request(req);
> > > > > > +out:
> > > > > > +     dout("do_getvxattr result=%d\n", err);
> > > > > > +     return err;
> > > > > > +}
> > > > > > +
> > > > > > 
> > > > > >    /*
> > > > > >     * Check inode permissions.  We verify we have a valid value for
> > > > > > diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
> > > > > > index c30eefc0ac19..a5eafc71d976 100644
> > > > > > --- a/fs/ceph/mds_client.c
> > > > > > +++ b/fs/ceph/mds_client.c
> > > > > > @@ -555,6 +555,29 @@ static int parse_reply_info_create(void **p, void *end,
> > > > > >         return -EIO;
> > > > > >    }
> > > > > > 
> > > > > > +static int parse_reply_info_getvxattr(void **p, void *end,
> > > > > > +                                   struct ceph_mds_reply_info_parsed *info,
> > > > > > +                                   u64 features)
> > > > > > +{
> > > > > > +     u8 struct_v, struct_compat;
> > > > > > +     u32 struct_len;
> > > > > > +     u32 value_len;
> > > > > > +
> > > > > > +     ceph_decode_8_safe(p, end, struct_v, bad);
> > > > > > +     ceph_decode_8_safe(p, end, struct_compat, bad);
> > > > > > +     ceph_decode_32_safe(p, end, struct_len, bad);
> > > > > > +     ceph_decode_32_safe(p, end, value_len, bad);
> > > > > > +
> > > > > > +     if (value_len == end - *p) {
> > > > > > +       info->xattr_info.xattr_value = *p;
> > > > > > +       info->xattr_info.xattr_value_len = end - *p;
> > > > > > +       *p = end;
> > > > > > +       return info->xattr_info.xattr_value_len;
> > > > > > +     }
> > > > > > +bad:
> > > > > > +     return -EIO;
> > > > > > +}
> > > > > > +
> > > > > >    /*
> > > > > >     * parse extra results
> > > > > >     */
> > > > > > @@ -570,6 +593,8 @@ static int parse_reply_info_extra(void **p, void *end,
> > > > > >                 return parse_reply_info_readdir(p, end, info, features);
> > > > > >         else if (op == CEPH_MDS_OP_CREATE)
> > > > > >                 return parse_reply_info_create(p, end, info, features, s);
> > > > > > +     else if (op == CEPH_MDS_OP_GETVXATTR)
> > > > > > +             return parse_reply_info_getvxattr(p, end, info, features);
> > > > > >         else
> > > > > >                 return -EIO;
> > > > > >    }
> > > > > > @@ -615,7 +640,7 @@ static int parse_reply_info(struct ceph_mds_session *s, struct ceph_msg *msg,
> > > > > > 
> > > > > >         if (p != end)
> > > > > >                 goto bad;
> > > > > > -     return 0;
> > > > > > +     return err;
> > > > > > 
> > > > > >    bad:
> > > > > >         err = -EIO;
> > > > > > diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
> > > > > > index 97c7f7bfa55f..f2a8e5af3c2e 100644
> > > > > > --- a/fs/ceph/mds_client.h
> > > > > > +++ b/fs/ceph/mds_client.h
> > > > > > @@ -29,8 +29,10 @@ enum ceph_feature_type {
> > > > > >         CEPHFS_FEATURE_MULTI_RECONNECT,
> > > > > >         CEPHFS_FEATURE_DELEG_INO,
> > > > > >         CEPHFS_FEATURE_METRIC_COLLECT,
> > > > > > +     CEPHFS_FEATURE_ALTERNATE_NAME,
> > > > > > +     CEPHFS_FEATURE_GETVXATTR,
> > > > > > 
> > > > > > -     CEPHFS_FEATURE_MAX = CEPHFS_FEATURE_METRIC_COLLECT,
> > > > > > +     CEPHFS_FEATURE_MAX = CEPHFS_FEATURE_GETVXATTR,
> > > > > >    };
> > > > > > 
> > > > > >    /*
> > > > > > @@ -45,6 +47,8 @@ enum ceph_feature_type {
> > > > > >         CEPHFS_FEATURE_MULTI_RECONNECT,         \
> > > > > >         CEPHFS_FEATURE_DELEG_INO,               \
> > > > > >         CEPHFS_FEATURE_METRIC_COLLECT,          \
> > > > > > +     CEPHFS_FEATURE_ALTERNATE_NAME,          \
> > > > > > +     CEPHFS_FEATURE_GETVXATTR,               \
> > > > > >                                                 \
> > > > > >         CEPHFS_FEATURE_MAX,                     \
> > > > > >    }
> > > > > > @@ -100,6 +104,11 @@ struct ceph_mds_reply_dir_entry {
> > > > > >         loff_t                        offset;
> > > > > >    };
> > > > > > 
> > > > > > +struct ceph_mds_reply_xattr {
> > > > > > +     char *xattr_value;
> > > > > > +     size_t xattr_value_len;
> > > > > > +};
> > > > > > +
> > > > > >    /*
> > > > > >     * parsed info about an mds reply, including information about
> > > > > >     * either: 1) the target inode and/or its parent directory and dentry,
> > > > > > @@ -115,6 +124,7 @@ struct ceph_mds_reply_info_parsed {
> > > > > >         char                          *dname;
> > > > > >         u32                           dname_len;
> > > > > >         struct ceph_mds_reply_lease   *dlease;
> > > > > > +     struct ceph_mds_reply_xattr   xattr_info;
> > > > > > 
> > > > > >         /* extra */
> > > > > >         union {
> > > > > > diff --git a/fs/ceph/strings.c b/fs/ceph/strings.c
> > > > > > index 573bb9556fb5..e36e8948e728 100644
> > > > > > --- a/fs/ceph/strings.c
> > > > > > +++ b/fs/ceph/strings.c
> > > > > > @@ -60,6 +60,7 @@ const char *ceph_mds_op_name(int op)
> > > > > >         case CEPH_MDS_OP_LOOKUPINO:  return "lookupino";
> > > > > >         case CEPH_MDS_OP_LOOKUPNAME:  return "lookupname";
> > > > > >         case CEPH_MDS_OP_GETATTR:  return "getattr";
> > > > > > +     case CEPH_MDS_OP_GETVXATTR:  return "getvxattr";
> > > > > >         case CEPH_MDS_OP_SETXATTR: return "setxattr";
> > > > > >         case CEPH_MDS_OP_SETATTR: return "setattr";
> > > > > >         case CEPH_MDS_OP_RMXATTR: return "rmxattr";
> > > > > > diff --git a/fs/ceph/super.h b/fs/ceph/super.h
> > > > > > index ac331aa07cfa..a627fa69668e 100644
> > > > > > --- a/fs/ceph/super.h
> > > > > > +++ b/fs/ceph/super.h
> > > > > > @@ -1043,6 +1043,7 @@ static inline bool ceph_inode_is_shutdown(struct inode *inode)
> > > > > > 
> > > > > >    /* xattr.c */
> > > > > >    int __ceph_setxattr(struct inode *, const char *, const void *, size_t, int);
> > > > > > +int ceph_do_getvxattr(struct inode *inode, const char *name, void *value, size_t size);
> > > > > >    ssize_t __ceph_getxattr(struct inode *, const char *, void *, size_t);
> > > > > >    extern ssize_t ceph_listxattr(struct dentry *, char *, size_t);
> > > > > >    extern struct ceph_buffer *__ceph_build_xattrs_blob(struct ceph_inode_info *ci);
> > > > > > diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c
> > > > > > index fcf7dfdecf96..dc32876a541a 100644
> > > > > > --- a/fs/ceph/xattr.c
> > > > > > +++ b/fs/ceph/xattr.c
> > > > > > @@ -918,6 +918,30 @@ static inline int __get_request_mask(struct inode *in) {
> > > > > >         return mask;
> > > > > >    }
> > > > > > 
> > > > > > +/* check if the entire cluster supports the given feature */
> > > > > > +static inline bool ceph_cluster_has_feature(struct inode *inode, int feature_bit)
> > > > > > +{
> > > > > > +     int64_t i;
> > > > > > +     struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
> > > > > > +     struct ceph_mds_session **sessions = fsc->mdsc->sessions;
> > > > > > +     int64_t num_sessions = atomic_read(&fsc->mdsc->num_sessions);
> > > > > > +
> > > > > > +     if (fsc->mdsc->stopping)
> > > > > > +             return false;
> > > > > > +
> > > > > > +     if (!sessions)
> > > > > > +             return false;
> > > > > > +
> > > > > > +     for (i = 0; i < num_sessions; i++) {
> > > > > > +             struct ceph_mds_session *session = sessions[i];
> > > > > > +             if (!session)
> > > > > > +                     return false;
> > > > > > +             if (!test_bit(feature_bit, &session->s_features))
> > > > > > +                     return false;
> > > > > What guarantee do you have that "session" will still be a valid pointer
> > > > > by the time you get to dereferencing it here?
> > > > > 
> > > > > I think this loop needs some locking (as Xiubo pointed out in his
> > > > > earlier review).
> > > > yeah, thanks for pointing that out
> > > > I'm trying to wrap the entire processing of this function inside a
> > > > mutex_unlock(&mdsc->mutex) ... but the mount command fails
> > > > to mount if done so. If code is not wrapped in mutex lock...unlock
> > > > then the mount is successful.
> > > > It's a surprise that the code doesn't deadlock under the mutex
> > > > lock...unlock and gracefully fails with a message.
> > > > Any hints on what I could be missing.
> > > > 
> > > > > > +     }
> > > > > > +     return true;
> > > > > > +}
> > > > > > +
> > > > > >    ssize_t __ceph_getxattr(struct inode *inode, const char *name, void *value,
> > > > > >                       size_t size)
> > > > > >    {
> > > > > > @@ -927,6 +951,16 @@ ssize_t __ceph_getxattr(struct inode *inode, const char *name, void *value,
> > > > > >         int req_mask;
> > > > > >         ssize_t err;
> > > > > > 
> > > > > > +     if (!strncmp(name, XATTR_CEPH_PREFIX, XATTR_CEPH_PREFIX_LEN) &&
> > > > > > +         ceph_cluster_has_feature(inode, CEPHFS_FEATURE_GETVXATTR)) {
> > > > > > +             err = ceph_do_getvxattr(inode, name, value, size);
> > > > > > +             /* if cluster doesn't support xattr, we try to service it
> > > > > > +              * locally
> > > > > > +              */
> > > > > > +             if (err >= 0)
> > > > > > +                     return err;
> > > > > > +     }
> > > > > > +
> > > > > What is this? Why not always service this locally?
> > > > vxattr handling is planned to be moved to the MDS side.
> > > > As I've pointed out to Xiubo, there's a few new things that have been done for
> > > > layout vxattr management. Also, as per your original tracker, ceph.dir.pin*
> > > > can't be handled locally.
> > > > getvxattr() currently handles:
> > > > 1. ceph.dir.layout*
> > > > 2. ceph.file.layout*
> > > > 3. ceph.dir.pin*
> > > The above seems will include the 'ceph.dir.layout', 'ceph.file.layout'
> > > and 'ceph.dir.pin' ? All these have been handled in 'ceph_file_vxattrs'
> > > and 'ceph_dir_vxattrs'...
> > > 
> > > And the code above will always force kclient to get all the 'ceph.XXX'
> > > xattrs from MDS ?
> > yes, the kclient will always get vxattr values from MDS
> > for old cluster, op will be handled locally
> > 
> > Jeff has a proposal to expose the JSON output variety of layout
> > vxattr vlaue via new vxattr altogether
> > eg. ceph.dir.layout_json and ceph.file.layout_json
> > 
> Yeah, sounds good. Or maybe just adding the '.json' instead of '_json'.
> 

+1 -- that looks cleaner than mixing up delimiters

To be clear, I think this should wholly be a fallback mechanism for when
the client doesn't recognize a vxattr name. IOW, we should only call the
MDS after ceph_match_vxattr doesn't match an xattr name.

> 
> > > 
> > > 
> > > > If kclient is new and the cluster is old, then layout vxattr will be
> > > > handled the old
> > > > way, i.e. locally. ceph.dir.pin* will remain inaccessible.
> > > > 
> > > > > >         /* let's see if a virtual xattr was requested */
> > > > > >         vxattr = ceph_match_vxattr(inode, name);
> > > > > >         if (vxattr) {
> > > > > > diff --git a/include/linux/ceph/ceph_fs.h b/include/linux/ceph/ceph_fs.h
> > > > > > index 7ad6c3d0db7d..66db21ac5f0c 100644
> > > > > > --- a/include/linux/ceph/ceph_fs.h
> > > > > > +++ b/include/linux/ceph/ceph_fs.h
> > > > > > @@ -328,6 +328,7 @@ enum {
> > > > > >         CEPH_MDS_OP_LOOKUPPARENT = 0x00103,
> > > > > >         CEPH_MDS_OP_LOOKUPINO  = 0x00104,
> > > > > >         CEPH_MDS_OP_LOOKUPNAME = 0x00105,
> > > > > > +     CEPH_MDS_OP_GETVXATTR  = 0x00106,
> > > > > > 
> > > > > >         CEPH_MDS_OP_SETXATTR   = 0x01105,
> > > > > >         CEPH_MDS_OP_RMXATTR    = 0x01106,
> > > > > --
> > > > > Jeff Layton <jlayton@kernel.org>
> 

-- 
Jeff Layton <jlayton@kernel.org>

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH v4 1/1] ceph: add getvxattr op
  2022-01-17 13:27             ` Jeff Layton
@ 2022-01-17 13:56               ` Xiubo Li
  2022-01-17 14:50               ` Gregory Farnum
  1 sibling, 0 replies; 13+ messages in thread
From: Xiubo Li @ 2022-01-17 13:56 UTC (permalink / raw)
  To: Jeff Layton, Milind Changire; +Cc: Ilya Dryomov, ceph-devel, Milind Changire


On 1/17/22 9:27 PM, Jeff Layton wrote:
> On Mon, 2022-01-17 at 21:09 +0800, Xiubo Li wrote:
>> On 1/17/22 8:58 PM, Milind Changire wrote:
>>> On Mon, Jan 17, 2022 at 6:20 PM Xiubo Li <xiubli@redhat.com> wrote:
>>>> On 1/17/22 7:07 PM, Milind Changire wrote:
>>>>> On Mon, Jan 17, 2022 at 4:23 PM Jeff Layton <jlayton@kernel.org> wrote:
>>>>>> On Mon, 2022-01-17 at 08:51 +0000, Milind Changire wrote:
>>>>>>> Problem:
>>>>>>> Directory vxattrs like ceph.dir.pin* and ceph.dir.layout* may not be
>>>>>>> propagated to the client as frequently to keep them updated. This
>>>>>>> creates vxattr availability problems.
>>>>>>>
>>>>>>> Solution:
>>>>>>> Adds new getvxattr op to fetch ceph.dir.pin*, ceph.dir.layout* and
>>>>>>> ceph.file.layout* vxattrs.
>>>>>>> If the entire layout for a dir or a file is being set, then it is
>>>>>>> expected that the layout be set in standard JSON format. Individual
>>>>>>> field value retrieval is not wrapped in JSON. The JSON format also
>>>>>>> applies while setting the vxattr if the entire layout is being set in
>>>>>>> one go.
>>>>>>> As a temporary measure, setting a vxattr can also be done in the old
>>>>>>> format. The old format will be deprecated in the future.
>>>>>>>
>>>>>>> URL: https://tracker.ceph.com/issues/51062
>>>>>>> Signed-off-by: Milind Changire <mchangir@redhat.com>
>>>>>>> ---
>>>>>>>     fs/ceph/inode.c              | 51 ++++++++++++++++++++++++++++++++++++
>>>>>>>     fs/ceph/mds_client.c         | 27 ++++++++++++++++++-
>>>>>>>     fs/ceph/mds_client.h         | 12 ++++++++-
>>>>>>>     fs/ceph/strings.c            |  1 +
>>>>>>>     fs/ceph/super.h              |  1 +
>>>>>>>     fs/ceph/xattr.c              | 34 ++++++++++++++++++++++++
>>>>>>>     include/linux/ceph/ceph_fs.h |  1 +
>>>>>>>     7 files changed, 125 insertions(+), 2 deletions(-)
>>>>>>>
>>>>>>> diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
>>>>>>> index e3322fcb2e8d..efdce049b7f0 100644
>>>>>>> --- a/fs/ceph/inode.c
>>>>>>> +++ b/fs/ceph/inode.c
>>>>>>> @@ -2291,6 +2291,57 @@ int __ceph_do_getattr(struct inode *inode, struct page *locked_page,
>>>>>>>          return err;
>>>>>>>     }
>>>>>>>
>>>>>>> +int ceph_do_getvxattr(struct inode *inode, const char *name, void *value,
>>>>>>> +                   size_t size)
>>>>>>> +{
>>>>>>> +     struct ceph_fs_client *fsc = ceph_sb_to_client(inode->i_sb);
>>>>>>> +     struct ceph_mds_client *mdsc = fsc->mdsc;
>>>>>>> +     struct ceph_mds_request *req;
>>>>>>> +     int mode = USE_AUTH_MDS;
>>>>>>> +     int err;
>>>>>>> +     char *xattr_value;
>>>>>>> +     size_t xattr_value_len;
>>>>>>> +
>>>>>>> +     req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_GETVXATTR, mode);
>>>>>>> +     if (IS_ERR(req)) {
>>>>>>> +             err = -ENOMEM;
>>>>>>> +             goto out;
>>>>>>> +     }
>>>>>>> +
>>>>>>> +     req->r_path2 = kstrdup(name, GFP_NOFS);
>>>>>>> +     if (!req->r_path2) {
>>>>>>> +             err = -ENOMEM;
>>>>>>> +             goto put;
>>>>>>> +     }
>>>>>>> +
>>>>>>> +     ihold(inode);
>>>>>>> +     req->r_inode = inode;
>>>>>>> +     err = ceph_mdsc_do_request(mdsc, NULL, req);
>>>>>>> +     if (err < 0)
>>>>>>> +             goto put;
>>>>>>> +
>>>>>>> +     xattr_value = req->r_reply_info.xattr_info.xattr_value;
>>>>>>> +     xattr_value_len = req->r_reply_info.xattr_info.xattr_value_len;
>>>>>>> +
>>>>>>> +     dout("do_getvxattr xattr_value_len:%zu, size:%zu\n", xattr_value_len, size);
>>>>>>> +
>>>>>>> +     err = (int)xattr_value_len;
>>>>>>> +     if (size == 0)
>>>>>>> +             goto put;
>>>>>>> +
>>>>>>> +     if (xattr_value_len > size) {
>>>>>>> +             err = -ERANGE;
>>>>>>> +             goto put;
>>>>>>> +     }
>>>>>>> +
>>>>>>> +     memcpy(value, xattr_value, xattr_value_len);
>>>>>>> +put:
>>>>>>> +     ceph_mdsc_put_request(req);
>>>>>>> +out:
>>>>>>> +     dout("do_getvxattr result=%d\n", err);
>>>>>>> +     return err;
>>>>>>> +}
>>>>>>> +
>>>>>>>
>>>>>>>     /*
>>>>>>>      * Check inode permissions.  We verify we have a valid value for
>>>>>>> diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
>>>>>>> index c30eefc0ac19..a5eafc71d976 100644
>>>>>>> --- a/fs/ceph/mds_client.c
>>>>>>> +++ b/fs/ceph/mds_client.c
>>>>>>> @@ -555,6 +555,29 @@ static int parse_reply_info_create(void **p, void *end,
>>>>>>>          return -EIO;
>>>>>>>     }
>>>>>>>
>>>>>>> +static int parse_reply_info_getvxattr(void **p, void *end,
>>>>>>> +                                   struct ceph_mds_reply_info_parsed *info,
>>>>>>> +                                   u64 features)
>>>>>>> +{
>>>>>>> +     u8 struct_v, struct_compat;
>>>>>>> +     u32 struct_len;
>>>>>>> +     u32 value_len;
>>>>>>> +
>>>>>>> +     ceph_decode_8_safe(p, end, struct_v, bad);
>>>>>>> +     ceph_decode_8_safe(p, end, struct_compat, bad);
>>>>>>> +     ceph_decode_32_safe(p, end, struct_len, bad);
>>>>>>> +     ceph_decode_32_safe(p, end, value_len, bad);
>>>>>>> +
>>>>>>> +     if (value_len == end - *p) {
>>>>>>> +       info->xattr_info.xattr_value = *p;
>>>>>>> +       info->xattr_info.xattr_value_len = end - *p;
>>>>>>> +       *p = end;
>>>>>>> +       return info->xattr_info.xattr_value_len;
>>>>>>> +     }
>>>>>>> +bad:
>>>>>>> +     return -EIO;
>>>>>>> +}
>>>>>>> +
>>>>>>>     /*
>>>>>>>      * parse extra results
>>>>>>>      */
>>>>>>> @@ -570,6 +593,8 @@ static int parse_reply_info_extra(void **p, void *end,
>>>>>>>                  return parse_reply_info_readdir(p, end, info, features);
>>>>>>>          else if (op == CEPH_MDS_OP_CREATE)
>>>>>>>                  return parse_reply_info_create(p, end, info, features, s);
>>>>>>> +     else if (op == CEPH_MDS_OP_GETVXATTR)
>>>>>>> +             return parse_reply_info_getvxattr(p, end, info, features);
>>>>>>>          else
>>>>>>>                  return -EIO;
>>>>>>>     }
>>>>>>> @@ -615,7 +640,7 @@ static int parse_reply_info(struct ceph_mds_session *s, struct ceph_msg *msg,
>>>>>>>
>>>>>>>          if (p != end)
>>>>>>>                  goto bad;
>>>>>>> -     return 0;
>>>>>>> +     return err;
>>>>>>>
>>>>>>>     bad:
>>>>>>>          err = -EIO;
>>>>>>> diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
>>>>>>> index 97c7f7bfa55f..f2a8e5af3c2e 100644
>>>>>>> --- a/fs/ceph/mds_client.h
>>>>>>> +++ b/fs/ceph/mds_client.h
>>>>>>> @@ -29,8 +29,10 @@ enum ceph_feature_type {
>>>>>>>          CEPHFS_FEATURE_MULTI_RECONNECT,
>>>>>>>          CEPHFS_FEATURE_DELEG_INO,
>>>>>>>          CEPHFS_FEATURE_METRIC_COLLECT,
>>>>>>> +     CEPHFS_FEATURE_ALTERNATE_NAME,
>>>>>>> +     CEPHFS_FEATURE_GETVXATTR,
>>>>>>>
>>>>>>> -     CEPHFS_FEATURE_MAX = CEPHFS_FEATURE_METRIC_COLLECT,
>>>>>>> +     CEPHFS_FEATURE_MAX = CEPHFS_FEATURE_GETVXATTR,
>>>>>>>     };
>>>>>>>
>>>>>>>     /*
>>>>>>> @@ -45,6 +47,8 @@ enum ceph_feature_type {
>>>>>>>          CEPHFS_FEATURE_MULTI_RECONNECT,         \
>>>>>>>          CEPHFS_FEATURE_DELEG_INO,               \
>>>>>>>          CEPHFS_FEATURE_METRIC_COLLECT,          \
>>>>>>> +     CEPHFS_FEATURE_ALTERNATE_NAME,          \
>>>>>>> +     CEPHFS_FEATURE_GETVXATTR,               \
>>>>>>>                                                  \
>>>>>>>          CEPHFS_FEATURE_MAX,                     \
>>>>>>>     }
>>>>>>> @@ -100,6 +104,11 @@ struct ceph_mds_reply_dir_entry {
>>>>>>>          loff_t                        offset;
>>>>>>>     };
>>>>>>>
>>>>>>> +struct ceph_mds_reply_xattr {
>>>>>>> +     char *xattr_value;
>>>>>>> +     size_t xattr_value_len;
>>>>>>> +};
>>>>>>> +
>>>>>>>     /*
>>>>>>>      * parsed info about an mds reply, including information about
>>>>>>>      * either: 1) the target inode and/or its parent directory and dentry,
>>>>>>> @@ -115,6 +124,7 @@ struct ceph_mds_reply_info_parsed {
>>>>>>>          char                          *dname;
>>>>>>>          u32                           dname_len;
>>>>>>>          struct ceph_mds_reply_lease   *dlease;
>>>>>>> +     struct ceph_mds_reply_xattr   xattr_info;
>>>>>>>
>>>>>>>          /* extra */
>>>>>>>          union {
>>>>>>> diff --git a/fs/ceph/strings.c b/fs/ceph/strings.c
>>>>>>> index 573bb9556fb5..e36e8948e728 100644
>>>>>>> --- a/fs/ceph/strings.c
>>>>>>> +++ b/fs/ceph/strings.c
>>>>>>> @@ -60,6 +60,7 @@ const char *ceph_mds_op_name(int op)
>>>>>>>          case CEPH_MDS_OP_LOOKUPINO:  return "lookupino";
>>>>>>>          case CEPH_MDS_OP_LOOKUPNAME:  return "lookupname";
>>>>>>>          case CEPH_MDS_OP_GETATTR:  return "getattr";
>>>>>>> +     case CEPH_MDS_OP_GETVXATTR:  return "getvxattr";
>>>>>>>          case CEPH_MDS_OP_SETXATTR: return "setxattr";
>>>>>>>          case CEPH_MDS_OP_SETATTR: return "setattr";
>>>>>>>          case CEPH_MDS_OP_RMXATTR: return "rmxattr";
>>>>>>> diff --git a/fs/ceph/super.h b/fs/ceph/super.h
>>>>>>> index ac331aa07cfa..a627fa69668e 100644
>>>>>>> --- a/fs/ceph/super.h
>>>>>>> +++ b/fs/ceph/super.h
>>>>>>> @@ -1043,6 +1043,7 @@ static inline bool ceph_inode_is_shutdown(struct inode *inode)
>>>>>>>
>>>>>>>     /* xattr.c */
>>>>>>>     int __ceph_setxattr(struct inode *, const char *, const void *, size_t, int);
>>>>>>> +int ceph_do_getvxattr(struct inode *inode, const char *name, void *value, size_t size);
>>>>>>>     ssize_t __ceph_getxattr(struct inode *, const char *, void *, size_t);
>>>>>>>     extern ssize_t ceph_listxattr(struct dentry *, char *, size_t);
>>>>>>>     extern struct ceph_buffer *__ceph_build_xattrs_blob(struct ceph_inode_info *ci);
>>>>>>> diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c
>>>>>>> index fcf7dfdecf96..dc32876a541a 100644
>>>>>>> --- a/fs/ceph/xattr.c
>>>>>>> +++ b/fs/ceph/xattr.c
>>>>>>> @@ -918,6 +918,30 @@ static inline int __get_request_mask(struct inode *in) {
>>>>>>>          return mask;
>>>>>>>     }
>>>>>>>
>>>>>>> +/* check if the entire cluster supports the given feature */
>>>>>>> +static inline bool ceph_cluster_has_feature(struct inode *inode, int feature_bit)
>>>>>>> +{
>>>>>>> +     int64_t i;
>>>>>>> +     struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
>>>>>>> +     struct ceph_mds_session **sessions = fsc->mdsc->sessions;
>>>>>>> +     int64_t num_sessions = atomic_read(&fsc->mdsc->num_sessions);
>>>>>>> +
>>>>>>> +     if (fsc->mdsc->stopping)
>>>>>>> +             return false;
>>>>>>> +
>>>>>>> +     if (!sessions)
>>>>>>> +             return false;
>>>>>>> +
>>>>>>> +     for (i = 0; i < num_sessions; i++) {
>>>>>>> +             struct ceph_mds_session *session = sessions[i];
>>>>>>> +             if (!session)
>>>>>>> +                     return false;
>>>>>>> +             if (!test_bit(feature_bit, &session->s_features))
>>>>>>> +                     return false;
>>>>>> What guarantee do you have that "session" will still be a valid pointer
>>>>>> by the time you get to dereferencing it here?
>>>>>>
>>>>>> I think this loop needs some locking (as Xiubo pointed out in his
>>>>>> earlier review).
>>>>> yeah, thanks for pointing that out
>>>>> I'm trying to wrap the entire processing of this function inside a
>>>>> mutex_unlock(&mdsc->mutex) ... but the mount command fails
>>>>> to mount if done so. If code is not wrapped in mutex lock...unlock
>>>>> then the mount is successful.
>>>>> It's a surprise that the code doesn't deadlock under the mutex
>>>>> lock...unlock and gracefully fails with a message.
>>>>> Any hints on what I could be missing.
>>>>>
>>>>>>> +     }
>>>>>>> +     return true;
>>>>>>> +}
>>>>>>> +
>>>>>>>     ssize_t __ceph_getxattr(struct inode *inode, const char *name, void *value,
>>>>>>>                        size_t size)
>>>>>>>     {
>>>>>>> @@ -927,6 +951,16 @@ ssize_t __ceph_getxattr(struct inode *inode, const char *name, void *value,
>>>>>>>          int req_mask;
>>>>>>>          ssize_t err;
>>>>>>>
>>>>>>> +     if (!strncmp(name, XATTR_CEPH_PREFIX, XATTR_CEPH_PREFIX_LEN) &&
>>>>>>> +         ceph_cluster_has_feature(inode, CEPHFS_FEATURE_GETVXATTR)) {
>>>>>>> +             err = ceph_do_getvxattr(inode, name, value, size);
>>>>>>> +             /* if cluster doesn't support xattr, we try to service it
>>>>>>> +              * locally
>>>>>>> +              */
>>>>>>> +             if (err >= 0)
>>>>>>> +                     return err;
>>>>>>> +     }
>>>>>>> +
>>>>>> What is this? Why not always service this locally?
>>>>> vxattr handling is planned to be moved to the MDS side.
>>>>> As I've pointed out to Xiubo, there's a few new things that have been done for
>>>>> layout vxattr management. Also, as per your original tracker, ceph.dir.pin*
>>>>> can't be handled locally.
>>>>> getvxattr() currently handles:
>>>>> 1. ceph.dir.layout*
>>>>> 2. ceph.file.layout*
>>>>> 3. ceph.dir.pin*
>>>> The above seems will include the 'ceph.dir.layout', 'ceph.file.layout'
>>>> and 'ceph.dir.pin' ? All these have been handled in 'ceph_file_vxattrs'
>>>> and 'ceph_dir_vxattrs'...
>>>>
>>>> And the code above will always force kclient to get all the 'ceph.XXX'
>>>> xattrs from MDS ?
>>> yes, the kclient will always get vxattr values from MDS
>>> for old cluster, op will be handled locally
>>>
>>> Jeff has a proposal to expose the JSON output variety of layout
>>> vxattr vlaue via new vxattr altogether
>>> eg. ceph.dir.layout_json and ceph.file.layout_json
>>>
>> Yeah, sounds good. Or maybe just adding the '.json' instead of '_json'.
>>
> +1 -- that looks cleaner than mixing up delimiters
>
> To be clear, I think this should wholly be a fallback mechanism for when
> the client doesn't recognize a vxattr name. IOW, we should only call the
> MDS after ceph_match_vxattr doesn't match an xattr name.

Sounds good.


>>>>
>>>>> If kclient is new and the cluster is old, then layout vxattr will be
>>>>> handled the old
>>>>> way, i.e. locally. ceph.dir.pin* will remain inaccessible.
>>>>>
>>>>>>>          /* let's see if a virtual xattr was requested */
>>>>>>>          vxattr = ceph_match_vxattr(inode, name);
>>>>>>>          if (vxattr) {
>>>>>>> diff --git a/include/linux/ceph/ceph_fs.h b/include/linux/ceph/ceph_fs.h
>>>>>>> index 7ad6c3d0db7d..66db21ac5f0c 100644
>>>>>>> --- a/include/linux/ceph/ceph_fs.h
>>>>>>> +++ b/include/linux/ceph/ceph_fs.h
>>>>>>> @@ -328,6 +328,7 @@ enum {
>>>>>>>          CEPH_MDS_OP_LOOKUPPARENT = 0x00103,
>>>>>>>          CEPH_MDS_OP_LOOKUPINO  = 0x00104,
>>>>>>>          CEPH_MDS_OP_LOOKUPNAME = 0x00105,
>>>>>>> +     CEPH_MDS_OP_GETVXATTR  = 0x00106,
>>>>>>>
>>>>>>>          CEPH_MDS_OP_SETXATTR   = 0x01105,
>>>>>>>          CEPH_MDS_OP_RMXATTR    = 0x01106,
>>>>>> --
>>>>>> Jeff Layton <jlayton@kernel.org>


^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH v4 1/1] ceph: add getvxattr op
  2022-01-17 13:27             ` Jeff Layton
  2022-01-17 13:56               ` Xiubo Li
@ 2022-01-17 14:50               ` Gregory Farnum
  2022-01-17 18:09                 ` Patrick Donnelly
  1 sibling, 1 reply; 13+ messages in thread
From: Gregory Farnum @ 2022-01-17 14:50 UTC (permalink / raw)
  To: Jeff Layton, Patrick Donnelly, Venky Shankar
  Cc: Xiubo Li, Milind Changire, Ilya Dryomov, ceph-devel, Milind Changire

On Mon, Jan 17, 2022 at 5:28 AM Jeff Layton <jlayton@kernel.org> wrote:
>
> On Mon, 2022-01-17 at 21:09 +0800, Xiubo Li wrote:
> > On 1/17/22 8:58 PM, Milind Changire wrote:
> > > On Mon, Jan 17, 2022 at 6:20 PM Xiubo Li <xiubli@redhat.com> wrote:
> > > >
> > > > On 1/17/22 7:07 PM, Milind Changire wrote:
> > > > > On Mon, Jan 17, 2022 at 4:23 PM Jeff Layton <jlayton@kernel.org> wrote:
> > > > > > On Mon, 2022-01-17 at 08:51 +0000, Milind Changire wrote:
> > > > > > > Problem:
> > > > > > > Directory vxattrs like ceph.dir.pin* and ceph.dir.layout* may not be
> > > > > > > propagated to the client as frequently to keep them updated. This
> > > > > > > creates vxattr availability problems.
> > > > > > >
> > > > > > > Solution:
> > > > > > > Adds new getvxattr op to fetch ceph.dir.pin*, ceph.dir.layout* and
> > > > > > > ceph.file.layout* vxattrs.
> > > > > > > If the entire layout for a dir or a file is being set, then it is
> > > > > > > expected that the layout be set in standard JSON format. Individual
> > > > > > > field value retrieval is not wrapped in JSON. The JSON format also
> > > > > > > applies while setting the vxattr if the entire layout is being set in
> > > > > > > one go.
> > > > > > > As a temporary measure, setting a vxattr can also be done in the old
> > > > > > > format. The old format will be deprecated in the future.
> > > > > > >
> > > > > > > URL: https://tracker.ceph.com/issues/51062
> > > > > > > Signed-off-by: Milind Changire <mchangir@redhat.com>
> > > > > > > ---
> > > > > > >    fs/ceph/inode.c              | 51 ++++++++++++++++++++++++++++++++++++
> > > > > > >    fs/ceph/mds_client.c         | 27 ++++++++++++++++++-
> > > > > > >    fs/ceph/mds_client.h         | 12 ++++++++-
> > > > > > >    fs/ceph/strings.c            |  1 +
> > > > > > >    fs/ceph/super.h              |  1 +
> > > > > > >    fs/ceph/xattr.c              | 34 ++++++++++++++++++++++++
> > > > > > >    include/linux/ceph/ceph_fs.h |  1 +
> > > > > > >    7 files changed, 125 insertions(+), 2 deletions(-)
> > > > > > >
> > > > > > > diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
> > > > > > > index e3322fcb2e8d..efdce049b7f0 100644
> > > > > > > --- a/fs/ceph/inode.c
> > > > > > > +++ b/fs/ceph/inode.c
> > > > > > > @@ -2291,6 +2291,57 @@ int __ceph_do_getattr(struct inode *inode, struct page *locked_page,
> > > > > > >         return err;
> > > > > > >    }
> > > > > > >
> > > > > > > +int ceph_do_getvxattr(struct inode *inode, const char *name, void *value,
> > > > > > > +                   size_t size)
> > > > > > > +{
> > > > > > > +     struct ceph_fs_client *fsc = ceph_sb_to_client(inode->i_sb);
> > > > > > > +     struct ceph_mds_client *mdsc = fsc->mdsc;
> > > > > > > +     struct ceph_mds_request *req;
> > > > > > > +     int mode = USE_AUTH_MDS;
> > > > > > > +     int err;
> > > > > > > +     char *xattr_value;
> > > > > > > +     size_t xattr_value_len;
> > > > > > > +
> > > > > > > +     req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_GETVXATTR, mode);
> > > > > > > +     if (IS_ERR(req)) {
> > > > > > > +             err = -ENOMEM;
> > > > > > > +             goto out;
> > > > > > > +     }
> > > > > > > +
> > > > > > > +     req->r_path2 = kstrdup(name, GFP_NOFS);
> > > > > > > +     if (!req->r_path2) {
> > > > > > > +             err = -ENOMEM;
> > > > > > > +             goto put;
> > > > > > > +     }
> > > > > > > +
> > > > > > > +     ihold(inode);
> > > > > > > +     req->r_inode = inode;
> > > > > > > +     err = ceph_mdsc_do_request(mdsc, NULL, req);
> > > > > > > +     if (err < 0)
> > > > > > > +             goto put;
> > > > > > > +
> > > > > > > +     xattr_value = req->r_reply_info.xattr_info.xattr_value;
> > > > > > > +     xattr_value_len = req->r_reply_info.xattr_info.xattr_value_len;
> > > > > > > +
> > > > > > > +     dout("do_getvxattr xattr_value_len:%zu, size:%zu\n", xattr_value_len, size);
> > > > > > > +
> > > > > > > +     err = (int)xattr_value_len;
> > > > > > > +     if (size == 0)
> > > > > > > +             goto put;
> > > > > > > +
> > > > > > > +     if (xattr_value_len > size) {
> > > > > > > +             err = -ERANGE;
> > > > > > > +             goto put;
> > > > > > > +     }
> > > > > > > +
> > > > > > > +     memcpy(value, xattr_value, xattr_value_len);
> > > > > > > +put:
> > > > > > > +     ceph_mdsc_put_request(req);
> > > > > > > +out:
> > > > > > > +     dout("do_getvxattr result=%d\n", err);
> > > > > > > +     return err;
> > > > > > > +}
> > > > > > > +
> > > > > > >
> > > > > > >    /*
> > > > > > >     * Check inode permissions.  We verify we have a valid value for
> > > > > > > diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
> > > > > > > index c30eefc0ac19..a5eafc71d976 100644
> > > > > > > --- a/fs/ceph/mds_client.c
> > > > > > > +++ b/fs/ceph/mds_client.c
> > > > > > > @@ -555,6 +555,29 @@ static int parse_reply_info_create(void **p, void *end,
> > > > > > >         return -EIO;
> > > > > > >    }
> > > > > > >
> > > > > > > +static int parse_reply_info_getvxattr(void **p, void *end,
> > > > > > > +                                   struct ceph_mds_reply_info_parsed *info,
> > > > > > > +                                   u64 features)
> > > > > > > +{
> > > > > > > +     u8 struct_v, struct_compat;
> > > > > > > +     u32 struct_len;
> > > > > > > +     u32 value_len;
> > > > > > > +
> > > > > > > +     ceph_decode_8_safe(p, end, struct_v, bad);
> > > > > > > +     ceph_decode_8_safe(p, end, struct_compat, bad);
> > > > > > > +     ceph_decode_32_safe(p, end, struct_len, bad);
> > > > > > > +     ceph_decode_32_safe(p, end, value_len, bad);
> > > > > > > +
> > > > > > > +     if (value_len == end - *p) {
> > > > > > > +       info->xattr_info.xattr_value = *p;
> > > > > > > +       info->xattr_info.xattr_value_len = end - *p;
> > > > > > > +       *p = end;
> > > > > > > +       return info->xattr_info.xattr_value_len;
> > > > > > > +     }
> > > > > > > +bad:
> > > > > > > +     return -EIO;
> > > > > > > +}
> > > > > > > +
> > > > > > >    /*
> > > > > > >     * parse extra results
> > > > > > >     */
> > > > > > > @@ -570,6 +593,8 @@ static int parse_reply_info_extra(void **p, void *end,
> > > > > > >                 return parse_reply_info_readdir(p, end, info, features);
> > > > > > >         else if (op == CEPH_MDS_OP_CREATE)
> > > > > > >                 return parse_reply_info_create(p, end, info, features, s);
> > > > > > > +     else if (op == CEPH_MDS_OP_GETVXATTR)
> > > > > > > +             return parse_reply_info_getvxattr(p, end, info, features);
> > > > > > >         else
> > > > > > >                 return -EIO;
> > > > > > >    }
> > > > > > > @@ -615,7 +640,7 @@ static int parse_reply_info(struct ceph_mds_session *s, struct ceph_msg *msg,
> > > > > > >
> > > > > > >         if (p != end)
> > > > > > >                 goto bad;
> > > > > > > -     return 0;
> > > > > > > +     return err;
> > > > > > >
> > > > > > >    bad:
> > > > > > >         err = -EIO;
> > > > > > > diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
> > > > > > > index 97c7f7bfa55f..f2a8e5af3c2e 100644
> > > > > > > --- a/fs/ceph/mds_client.h
> > > > > > > +++ b/fs/ceph/mds_client.h
> > > > > > > @@ -29,8 +29,10 @@ enum ceph_feature_type {
> > > > > > >         CEPHFS_FEATURE_MULTI_RECONNECT,
> > > > > > >         CEPHFS_FEATURE_DELEG_INO,
> > > > > > >         CEPHFS_FEATURE_METRIC_COLLECT,
> > > > > > > +     CEPHFS_FEATURE_ALTERNATE_NAME,
> > > > > > > +     CEPHFS_FEATURE_GETVXATTR,
> > > > > > >
> > > > > > > -     CEPHFS_FEATURE_MAX = CEPHFS_FEATURE_METRIC_COLLECT,
> > > > > > > +     CEPHFS_FEATURE_MAX = CEPHFS_FEATURE_GETVXATTR,
> > > > > > >    };
> > > > > > >
> > > > > > >    /*
> > > > > > > @@ -45,6 +47,8 @@ enum ceph_feature_type {
> > > > > > >         CEPHFS_FEATURE_MULTI_RECONNECT,         \
> > > > > > >         CEPHFS_FEATURE_DELEG_INO,               \
> > > > > > >         CEPHFS_FEATURE_METRIC_COLLECT,          \
> > > > > > > +     CEPHFS_FEATURE_ALTERNATE_NAME,          \
> > > > > > > +     CEPHFS_FEATURE_GETVXATTR,               \
> > > > > > >                                                 \
> > > > > > >         CEPHFS_FEATURE_MAX,                     \
> > > > > > >    }
> > > > > > > @@ -100,6 +104,11 @@ struct ceph_mds_reply_dir_entry {
> > > > > > >         loff_t                        offset;
> > > > > > >    };
> > > > > > >
> > > > > > > +struct ceph_mds_reply_xattr {
> > > > > > > +     char *xattr_value;
> > > > > > > +     size_t xattr_value_len;
> > > > > > > +};
> > > > > > > +
> > > > > > >    /*
> > > > > > >     * parsed info about an mds reply, including information about
> > > > > > >     * either: 1) the target inode and/or its parent directory and dentry,
> > > > > > > @@ -115,6 +124,7 @@ struct ceph_mds_reply_info_parsed {
> > > > > > >         char                          *dname;
> > > > > > >         u32                           dname_len;
> > > > > > >         struct ceph_mds_reply_lease   *dlease;
> > > > > > > +     struct ceph_mds_reply_xattr   xattr_info;
> > > > > > >
> > > > > > >         /* extra */
> > > > > > >         union {
> > > > > > > diff --git a/fs/ceph/strings.c b/fs/ceph/strings.c
> > > > > > > index 573bb9556fb5..e36e8948e728 100644
> > > > > > > --- a/fs/ceph/strings.c
> > > > > > > +++ b/fs/ceph/strings.c
> > > > > > > @@ -60,6 +60,7 @@ const char *ceph_mds_op_name(int op)
> > > > > > >         case CEPH_MDS_OP_LOOKUPINO:  return "lookupino";
> > > > > > >         case CEPH_MDS_OP_LOOKUPNAME:  return "lookupname";
> > > > > > >         case CEPH_MDS_OP_GETATTR:  return "getattr";
> > > > > > > +     case CEPH_MDS_OP_GETVXATTR:  return "getvxattr";
> > > > > > >         case CEPH_MDS_OP_SETXATTR: return "setxattr";
> > > > > > >         case CEPH_MDS_OP_SETATTR: return "setattr";
> > > > > > >         case CEPH_MDS_OP_RMXATTR: return "rmxattr";
> > > > > > > diff --git a/fs/ceph/super.h b/fs/ceph/super.h
> > > > > > > index ac331aa07cfa..a627fa69668e 100644
> > > > > > > --- a/fs/ceph/super.h
> > > > > > > +++ b/fs/ceph/super.h
> > > > > > > @@ -1043,6 +1043,7 @@ static inline bool ceph_inode_is_shutdown(struct inode *inode)
> > > > > > >
> > > > > > >    /* xattr.c */
> > > > > > >    int __ceph_setxattr(struct inode *, const char *, const void *, size_t, int);
> > > > > > > +int ceph_do_getvxattr(struct inode *inode, const char *name, void *value, size_t size);
> > > > > > >    ssize_t __ceph_getxattr(struct inode *, const char *, void *, size_t);
> > > > > > >    extern ssize_t ceph_listxattr(struct dentry *, char *, size_t);
> > > > > > >    extern struct ceph_buffer *__ceph_build_xattrs_blob(struct ceph_inode_info *ci);
> > > > > > > diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c
> > > > > > > index fcf7dfdecf96..dc32876a541a 100644
> > > > > > > --- a/fs/ceph/xattr.c
> > > > > > > +++ b/fs/ceph/xattr.c
> > > > > > > @@ -918,6 +918,30 @@ static inline int __get_request_mask(struct inode *in) {
> > > > > > >         return mask;
> > > > > > >    }
> > > > > > >
> > > > > > > +/* check if the entire cluster supports the given feature */
> > > > > > > +static inline bool ceph_cluster_has_feature(struct inode *inode, int feature_bit)
> > > > > > > +{
> > > > > > > +     int64_t i;
> > > > > > > +     struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
> > > > > > > +     struct ceph_mds_session **sessions = fsc->mdsc->sessions;
> > > > > > > +     int64_t num_sessions = atomic_read(&fsc->mdsc->num_sessions);
> > > > > > > +
> > > > > > > +     if (fsc->mdsc->stopping)
> > > > > > > +             return false;
> > > > > > > +
> > > > > > > +     if (!sessions)
> > > > > > > +             return false;
> > > > > > > +
> > > > > > > +     for (i = 0; i < num_sessions; i++) {
> > > > > > > +             struct ceph_mds_session *session = sessions[i];
> > > > > > > +             if (!session)
> > > > > > > +                     return false;
> > > > > > > +             if (!test_bit(feature_bit, &session->s_features))
> > > > > > > +                     return false;
> > > > > > What guarantee do you have that "session" will still be a valid pointer
> > > > > > by the time you get to dereferencing it here?
> > > > > >
> > > > > > I think this loop needs some locking (as Xiubo pointed out in his
> > > > > > earlier review).
> > > > > yeah, thanks for pointing that out
> > > > > I'm trying to wrap the entire processing of this function inside a
> > > > > mutex_unlock(&mdsc->mutex) ... but the mount command fails
> > > > > to mount if done so. If code is not wrapped in mutex lock...unlock
> > > > > then the mount is successful.
> > > > > It's a surprise that the code doesn't deadlock under the mutex
> > > > > lock...unlock and gracefully fails with a message.
> > > > > Any hints on what I could be missing.
> > > > >
> > > > > > > +     }
> > > > > > > +     return true;
> > > > > > > +}
> > > > > > > +
> > > > > > >    ssize_t __ceph_getxattr(struct inode *inode, const char *name, void *value,
> > > > > > >                       size_t size)
> > > > > > >    {
> > > > > > > @@ -927,6 +951,16 @@ ssize_t __ceph_getxattr(struct inode *inode, const char *name, void *value,
> > > > > > >         int req_mask;
> > > > > > >         ssize_t err;
> > > > > > >
> > > > > > > +     if (!strncmp(name, XATTR_CEPH_PREFIX, XATTR_CEPH_PREFIX_LEN) &&
> > > > > > > +         ceph_cluster_has_feature(inode, CEPHFS_FEATURE_GETVXATTR)) {
> > > > > > > +             err = ceph_do_getvxattr(inode, name, value, size);
> > > > > > > +             /* if cluster doesn't support xattr, we try to service it
> > > > > > > +              * locally
> > > > > > > +              */
> > > > > > > +             if (err >= 0)
> > > > > > > +                     return err;
> > > > > > > +     }
> > > > > > > +
> > > > > > What is this? Why not always service this locally?
> > > > > vxattr handling is planned to be moved to the MDS side.
> > > > > As I've pointed out to Xiubo, there's a few new things that have been done for
> > > > > layout vxattr management. Also, as per your original tracker, ceph.dir.pin*
> > > > > can't be handled locally.
> > > > > getvxattr() currently handles:
> > > > > 1. ceph.dir.layout*
> > > > > 2. ceph.file.layout*
> > > > > 3. ceph.dir.pin*
> > > > The above seems will include the 'ceph.dir.layout', 'ceph.file.layout'
> > > > and 'ceph.dir.pin' ? All these have been handled in 'ceph_file_vxattrs'
> > > > and 'ceph_dir_vxattrs'...
> > > >
> > > > And the code above will always force kclient to get all the 'ceph.XXX'
> > > > xattrs from MDS ?
> > > yes, the kclient will always get vxattr values from MDS
> > > for old cluster, op will be handled locally
> > >
> > > Jeff has a proposal to expose the JSON output variety of layout
> > > vxattr vlaue via new vxattr altogether
> > > eg. ceph.dir.layout_json and ceph.file.layout_json
> > >
> > Yeah, sounds good. Or maybe just adding the '.json' instead of '_json'.
> >
>
> +1 -- that looks cleaner than mixing up delimiters
>
> To be clear, I think this should wholly be a fallback mechanism for when
> the client doesn't recognize a vxattr name. IOW, we should only call the
> MDS after ceph_match_vxattr doesn't match an xattr name.

I'm with you, but this set of changes (and its Client.cc equivalent)
is by request from Patrick and Venky (added), so there may be some
subtleties we're missing. And we should follow the same pattern for
who gets to resolve vxattrs in both clients.
-Greg

>
> >
> > > >
> > > >
> > > > > If kclient is new and the cluster is old, then layout vxattr will be
> > > > > handled the old
> > > > > way, i.e. locally. ceph.dir.pin* will remain inaccessible.
> > > > >
> > > > > > >         /* let's see if a virtual xattr was requested */
> > > > > > >         vxattr = ceph_match_vxattr(inode, name);
> > > > > > >         if (vxattr) {
> > > > > > > diff --git a/include/linux/ceph/ceph_fs.h b/include/linux/ceph/ceph_fs.h
> > > > > > > index 7ad6c3d0db7d..66db21ac5f0c 100644
> > > > > > > --- a/include/linux/ceph/ceph_fs.h
> > > > > > > +++ b/include/linux/ceph/ceph_fs.h
> > > > > > > @@ -328,6 +328,7 @@ enum {
> > > > > > >         CEPH_MDS_OP_LOOKUPPARENT = 0x00103,
> > > > > > >         CEPH_MDS_OP_LOOKUPINO  = 0x00104,
> > > > > > >         CEPH_MDS_OP_LOOKUPNAME = 0x00105,
> > > > > > > +     CEPH_MDS_OP_GETVXATTR  = 0x00106,
> > > > > > >
> > > > > > >         CEPH_MDS_OP_SETXATTR   = 0x01105,
> > > > > > >         CEPH_MDS_OP_RMXATTR    = 0x01106,
> > > > > > --
> > > > > > Jeff Layton <jlayton@kernel.org>
> >
>
> --
> Jeff Layton <jlayton@kernel.org>
>


^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH v4 1/1] ceph: add getvxattr op
  2022-01-17 14:50               ` Gregory Farnum
@ 2022-01-17 18:09                 ` Patrick Donnelly
  2022-01-17 18:15                   ` Jeff Layton
  0 siblings, 1 reply; 13+ messages in thread
From: Patrick Donnelly @ 2022-01-17 18:09 UTC (permalink / raw)
  To: Gregory Farnum
  Cc: Jeff Layton, Venky Shankar, Xiubo Li, Milind Changire,
	Ilya Dryomov, ceph-devel, Milind Changire

On Mon, Jan 17, 2022 at 9:50 AM Gregory Farnum <gfarnum@redhat.com> wrote:
>
> On Mon, Jan 17, 2022 at 5:28 AM Jeff Layton <jlayton@kernel.org> wrote:
> >
> > On Mon, 2022-01-17 at 21:09 +0800, Xiubo Li wrote:
> > > On 1/17/22 8:58 PM, Milind Changire wrote:
> > > > On Mon, Jan 17, 2022 at 6:20 PM Xiubo Li <xiubli@redhat.com> wrote:
> > > > >
> > > > > On 1/17/22 7:07 PM, Milind Changire wrote:
> > > > > > On Mon, Jan 17, 2022 at 4:23 PM Jeff Layton <jlayton@kernel.org> wrote:
> > > > > > > On Mon, 2022-01-17 at 08:51 +0000, Milind Changire wrote:
> > > > > > > > Problem:
> > > > > > > > Directory vxattrs like ceph.dir.pin* and ceph.dir.layout* may not be
> > > > > > > > propagated to the client as frequently to keep them updated. This
> > > > > > > > creates vxattr availability problems.
> > > > > > > >
> > > > > > > > Solution:
> > > > > > > > Adds new getvxattr op to fetch ceph.dir.pin*, ceph.dir.layout* and
> > > > > > > > ceph.file.layout* vxattrs.
> > > > > > > > If the entire layout for a dir or a file is being set, then it is
> > > > > > > > expected that the layout be set in standard JSON format. Individual
> > > > > > > > field value retrieval is not wrapped in JSON. The JSON format also
> > > > > > > > applies while setting the vxattr if the entire layout is being set in
> > > > > > > > one go.
> > > > > > > > As a temporary measure, setting a vxattr can also be done in the old
> > > > > > > > format. The old format will be deprecated in the future.
> > > > > > > >
> > > > > > > > URL: https://tracker.ceph.com/issues/51062
> > > > > > > > Signed-off-by: Milind Changire <mchangir@redhat.com>
> > > > > > > > ---
> > > > > > > >    fs/ceph/inode.c              | 51 ++++++++++++++++++++++++++++++++++++
> > > > > > > >    fs/ceph/mds_client.c         | 27 ++++++++++++++++++-
> > > > > > > >    fs/ceph/mds_client.h         | 12 ++++++++-
> > > > > > > >    fs/ceph/strings.c            |  1 +
> > > > > > > >    fs/ceph/super.h              |  1 +
> > > > > > > >    fs/ceph/xattr.c              | 34 ++++++++++++++++++++++++
> > > > > > > >    include/linux/ceph/ceph_fs.h |  1 +
> > > > > > > >    7 files changed, 125 insertions(+), 2 deletions(-)
> > > > > > > >
> > > > > > > > diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
> > > > > > > > index e3322fcb2e8d..efdce049b7f0 100644
> > > > > > > > --- a/fs/ceph/inode.c
> > > > > > > > +++ b/fs/ceph/inode.c
> > > > > > > > @@ -2291,6 +2291,57 @@ int __ceph_do_getattr(struct inode *inode, struct page *locked_page,
> > > > > > > >         return err;
> > > > > > > >    }
> > > > > > > >
> > > > > > > > +int ceph_do_getvxattr(struct inode *inode, const char *name, void *value,
> > > > > > > > +                   size_t size)
> > > > > > > > +{
> > > > > > > > +     struct ceph_fs_client *fsc = ceph_sb_to_client(inode->i_sb);
> > > > > > > > +     struct ceph_mds_client *mdsc = fsc->mdsc;
> > > > > > > > +     struct ceph_mds_request *req;
> > > > > > > > +     int mode = USE_AUTH_MDS;
> > > > > > > > +     int err;
> > > > > > > > +     char *xattr_value;
> > > > > > > > +     size_t xattr_value_len;
> > > > > > > > +
> > > > > > > > +     req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_GETVXATTR, mode);
> > > > > > > > +     if (IS_ERR(req)) {
> > > > > > > > +             err = -ENOMEM;
> > > > > > > > +             goto out;
> > > > > > > > +     }
> > > > > > > > +
> > > > > > > > +     req->r_path2 = kstrdup(name, GFP_NOFS);
> > > > > > > > +     if (!req->r_path2) {
> > > > > > > > +             err = -ENOMEM;
> > > > > > > > +             goto put;
> > > > > > > > +     }
> > > > > > > > +
> > > > > > > > +     ihold(inode);
> > > > > > > > +     req->r_inode = inode;
> > > > > > > > +     err = ceph_mdsc_do_request(mdsc, NULL, req);
> > > > > > > > +     if (err < 0)
> > > > > > > > +             goto put;
> > > > > > > > +
> > > > > > > > +     xattr_value = req->r_reply_info.xattr_info.xattr_value;
> > > > > > > > +     xattr_value_len = req->r_reply_info.xattr_info.xattr_value_len;
> > > > > > > > +
> > > > > > > > +     dout("do_getvxattr xattr_value_len:%zu, size:%zu\n", xattr_value_len, size);
> > > > > > > > +
> > > > > > > > +     err = (int)xattr_value_len;
> > > > > > > > +     if (size == 0)
> > > > > > > > +             goto put;
> > > > > > > > +
> > > > > > > > +     if (xattr_value_len > size) {
> > > > > > > > +             err = -ERANGE;
> > > > > > > > +             goto put;
> > > > > > > > +     }
> > > > > > > > +
> > > > > > > > +     memcpy(value, xattr_value, xattr_value_len);
> > > > > > > > +put:
> > > > > > > > +     ceph_mdsc_put_request(req);
> > > > > > > > +out:
> > > > > > > > +     dout("do_getvxattr result=%d\n", err);
> > > > > > > > +     return err;
> > > > > > > > +}
> > > > > > > > +
> > > > > > > >
> > > > > > > >    /*
> > > > > > > >     * Check inode permissions.  We verify we have a valid value for
> > > > > > > > diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
> > > > > > > > index c30eefc0ac19..a5eafc71d976 100644
> > > > > > > > --- a/fs/ceph/mds_client.c
> > > > > > > > +++ b/fs/ceph/mds_client.c
> > > > > > > > @@ -555,6 +555,29 @@ static int parse_reply_info_create(void **p, void *end,
> > > > > > > >         return -EIO;
> > > > > > > >    }
> > > > > > > >
> > > > > > > > +static int parse_reply_info_getvxattr(void **p, void *end,
> > > > > > > > +                                   struct ceph_mds_reply_info_parsed *info,
> > > > > > > > +                                   u64 features)
> > > > > > > > +{
> > > > > > > > +     u8 struct_v, struct_compat;
> > > > > > > > +     u32 struct_len;
> > > > > > > > +     u32 value_len;
> > > > > > > > +
> > > > > > > > +     ceph_decode_8_safe(p, end, struct_v, bad);
> > > > > > > > +     ceph_decode_8_safe(p, end, struct_compat, bad);
> > > > > > > > +     ceph_decode_32_safe(p, end, struct_len, bad);
> > > > > > > > +     ceph_decode_32_safe(p, end, value_len, bad);
> > > > > > > > +
> > > > > > > > +     if (value_len == end - *p) {
> > > > > > > > +       info->xattr_info.xattr_value = *p;
> > > > > > > > +       info->xattr_info.xattr_value_len = end - *p;
> > > > > > > > +       *p = end;
> > > > > > > > +       return info->xattr_info.xattr_value_len;
> > > > > > > > +     }
> > > > > > > > +bad:
> > > > > > > > +     return -EIO;
> > > > > > > > +}
> > > > > > > > +
> > > > > > > >    /*
> > > > > > > >     * parse extra results
> > > > > > > >     */
> > > > > > > > @@ -570,6 +593,8 @@ static int parse_reply_info_extra(void **p, void *end,
> > > > > > > >                 return parse_reply_info_readdir(p, end, info, features);
> > > > > > > >         else if (op == CEPH_MDS_OP_CREATE)
> > > > > > > >                 return parse_reply_info_create(p, end, info, features, s);
> > > > > > > > +     else if (op == CEPH_MDS_OP_GETVXATTR)
> > > > > > > > +             return parse_reply_info_getvxattr(p, end, info, features);
> > > > > > > >         else
> > > > > > > >                 return -EIO;
> > > > > > > >    }
> > > > > > > > @@ -615,7 +640,7 @@ static int parse_reply_info(struct ceph_mds_session *s, struct ceph_msg *msg,
> > > > > > > >
> > > > > > > >         if (p != end)
> > > > > > > >                 goto bad;
> > > > > > > > -     return 0;
> > > > > > > > +     return err;
> > > > > > > >
> > > > > > > >    bad:
> > > > > > > >         err = -EIO;
> > > > > > > > diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
> > > > > > > > index 97c7f7bfa55f..f2a8e5af3c2e 100644
> > > > > > > > --- a/fs/ceph/mds_client.h
> > > > > > > > +++ b/fs/ceph/mds_client.h
> > > > > > > > @@ -29,8 +29,10 @@ enum ceph_feature_type {
> > > > > > > >         CEPHFS_FEATURE_MULTI_RECONNECT,
> > > > > > > >         CEPHFS_FEATURE_DELEG_INO,
> > > > > > > >         CEPHFS_FEATURE_METRIC_COLLECT,
> > > > > > > > +     CEPHFS_FEATURE_ALTERNATE_NAME,
> > > > > > > > +     CEPHFS_FEATURE_GETVXATTR,
> > > > > > > >
> > > > > > > > -     CEPHFS_FEATURE_MAX = CEPHFS_FEATURE_METRIC_COLLECT,
> > > > > > > > +     CEPHFS_FEATURE_MAX = CEPHFS_FEATURE_GETVXATTR,
> > > > > > > >    };
> > > > > > > >
> > > > > > > >    /*
> > > > > > > > @@ -45,6 +47,8 @@ enum ceph_feature_type {
> > > > > > > >         CEPHFS_FEATURE_MULTI_RECONNECT,         \
> > > > > > > >         CEPHFS_FEATURE_DELEG_INO,               \
> > > > > > > >         CEPHFS_FEATURE_METRIC_COLLECT,          \
> > > > > > > > +     CEPHFS_FEATURE_ALTERNATE_NAME,          \
> > > > > > > > +     CEPHFS_FEATURE_GETVXATTR,               \
> > > > > > > >                                                 \
> > > > > > > >         CEPHFS_FEATURE_MAX,                     \
> > > > > > > >    }
> > > > > > > > @@ -100,6 +104,11 @@ struct ceph_mds_reply_dir_entry {
> > > > > > > >         loff_t                        offset;
> > > > > > > >    };
> > > > > > > >
> > > > > > > > +struct ceph_mds_reply_xattr {
> > > > > > > > +     char *xattr_value;
> > > > > > > > +     size_t xattr_value_len;
> > > > > > > > +};
> > > > > > > > +
> > > > > > > >    /*
> > > > > > > >     * parsed info about an mds reply, including information about
> > > > > > > >     * either: 1) the target inode and/or its parent directory and dentry,
> > > > > > > > @@ -115,6 +124,7 @@ struct ceph_mds_reply_info_parsed {
> > > > > > > >         char                          *dname;
> > > > > > > >         u32                           dname_len;
> > > > > > > >         struct ceph_mds_reply_lease   *dlease;
> > > > > > > > +     struct ceph_mds_reply_xattr   xattr_info;
> > > > > > > >
> > > > > > > >         /* extra */
> > > > > > > >         union {
> > > > > > > > diff --git a/fs/ceph/strings.c b/fs/ceph/strings.c
> > > > > > > > index 573bb9556fb5..e36e8948e728 100644
> > > > > > > > --- a/fs/ceph/strings.c
> > > > > > > > +++ b/fs/ceph/strings.c
> > > > > > > > @@ -60,6 +60,7 @@ const char *ceph_mds_op_name(int op)
> > > > > > > >         case CEPH_MDS_OP_LOOKUPINO:  return "lookupino";
> > > > > > > >         case CEPH_MDS_OP_LOOKUPNAME:  return "lookupname";
> > > > > > > >         case CEPH_MDS_OP_GETATTR:  return "getattr";
> > > > > > > > +     case CEPH_MDS_OP_GETVXATTR:  return "getvxattr";
> > > > > > > >         case CEPH_MDS_OP_SETXATTR: return "setxattr";
> > > > > > > >         case CEPH_MDS_OP_SETATTR: return "setattr";
> > > > > > > >         case CEPH_MDS_OP_RMXATTR: return "rmxattr";
> > > > > > > > diff --git a/fs/ceph/super.h b/fs/ceph/super.h
> > > > > > > > index ac331aa07cfa..a627fa69668e 100644
> > > > > > > > --- a/fs/ceph/super.h
> > > > > > > > +++ b/fs/ceph/super.h
> > > > > > > > @@ -1043,6 +1043,7 @@ static inline bool ceph_inode_is_shutdown(struct inode *inode)
> > > > > > > >
> > > > > > > >    /* xattr.c */
> > > > > > > >    int __ceph_setxattr(struct inode *, const char *, const void *, size_t, int);
> > > > > > > > +int ceph_do_getvxattr(struct inode *inode, const char *name, void *value, size_t size);
> > > > > > > >    ssize_t __ceph_getxattr(struct inode *, const char *, void *, size_t);
> > > > > > > >    extern ssize_t ceph_listxattr(struct dentry *, char *, size_t);
> > > > > > > >    extern struct ceph_buffer *__ceph_build_xattrs_blob(struct ceph_inode_info *ci);
> > > > > > > > diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c
> > > > > > > > index fcf7dfdecf96..dc32876a541a 100644
> > > > > > > > --- a/fs/ceph/xattr.c
> > > > > > > > +++ b/fs/ceph/xattr.c
> > > > > > > > @@ -918,6 +918,30 @@ static inline int __get_request_mask(struct inode *in) {
> > > > > > > >         return mask;
> > > > > > > >    }
> > > > > > > >
> > > > > > > > +/* check if the entire cluster supports the given feature */
> > > > > > > > +static inline bool ceph_cluster_has_feature(struct inode *inode, int feature_bit)
> > > > > > > > +{
> > > > > > > > +     int64_t i;
> > > > > > > > +     struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
> > > > > > > > +     struct ceph_mds_session **sessions = fsc->mdsc->sessions;
> > > > > > > > +     int64_t num_sessions = atomic_read(&fsc->mdsc->num_sessions);
> > > > > > > > +
> > > > > > > > +     if (fsc->mdsc->stopping)
> > > > > > > > +             return false;
> > > > > > > > +
> > > > > > > > +     if (!sessions)
> > > > > > > > +             return false;
> > > > > > > > +
> > > > > > > > +     for (i = 0; i < num_sessions; i++) {
> > > > > > > > +             struct ceph_mds_session *session = sessions[i];
> > > > > > > > +             if (!session)
> > > > > > > > +                     return false;
> > > > > > > > +             if (!test_bit(feature_bit, &session->s_features))
> > > > > > > > +                     return false;
> > > > > > > What guarantee do you have that "session" will still be a valid pointer
> > > > > > > by the time you get to dereferencing it here?
> > > > > > >
> > > > > > > I think this loop needs some locking (as Xiubo pointed out in his
> > > > > > > earlier review).
> > > > > > yeah, thanks for pointing that out
> > > > > > I'm trying to wrap the entire processing of this function inside a
> > > > > > mutex_unlock(&mdsc->mutex) ... but the mount command fails
> > > > > > to mount if done so. If code is not wrapped in mutex lock...unlock
> > > > > > then the mount is successful.
> > > > > > It's a surprise that the code doesn't deadlock under the mutex
> > > > > > lock...unlock and gracefully fails with a message.
> > > > > > Any hints on what I could be missing.
> > > > > >
> > > > > > > > +     }
> > > > > > > > +     return true;
> > > > > > > > +}
> > > > > > > > +
> > > > > > > >    ssize_t __ceph_getxattr(struct inode *inode, const char *name, void *value,
> > > > > > > >                       size_t size)
> > > > > > > >    {
> > > > > > > > @@ -927,6 +951,16 @@ ssize_t __ceph_getxattr(struct inode *inode, const char *name, void *value,
> > > > > > > >         int req_mask;
> > > > > > > >         ssize_t err;
> > > > > > > >
> > > > > > > > +     if (!strncmp(name, XATTR_CEPH_PREFIX, XATTR_CEPH_PREFIX_LEN) &&
> > > > > > > > +         ceph_cluster_has_feature(inode, CEPHFS_FEATURE_GETVXATTR)) {
> > > > > > > > +             err = ceph_do_getvxattr(inode, name, value, size);
> > > > > > > > +             /* if cluster doesn't support xattr, we try to service it
> > > > > > > > +              * locally
> > > > > > > > +              */
> > > > > > > > +             if (err >= 0)
> > > > > > > > +                     return err;
> > > > > > > > +     }
> > > > > > > > +
> > > > > > > What is this? Why not always service this locally?
> > > > > > vxattr handling is planned to be moved to the MDS side.
> > > > > > As I've pointed out to Xiubo, there's a few new things that have been done for
> > > > > > layout vxattr management. Also, as per your original tracker, ceph.dir.pin*
> > > > > > can't be handled locally.
> > > > > > getvxattr() currently handles:
> > > > > > 1. ceph.dir.layout*
> > > > > > 2. ceph.file.layout*
> > > > > > 3. ceph.dir.pin*
> > > > > The above seems will include the 'ceph.dir.layout', 'ceph.file.layout'
> > > > > and 'ceph.dir.pin' ? All these have been handled in 'ceph_file_vxattrs'
> > > > > and 'ceph_dir_vxattrs'...
> > > > >
> > > > > And the code above will always force kclient to get all the 'ceph.XXX'
> > > > > xattrs from MDS ?
> > > > yes, the kclient will always get vxattr values from MDS
> > > > for old cluster, op will be handled locally
> > > >
> > > > Jeff has a proposal to expose the JSON output variety of layout
> > > > vxattr vlaue via new vxattr altogether
> > > > eg. ceph.dir.layout_json and ceph.file.layout_json
> > > >
> > > Yeah, sounds good. Or maybe just adding the '.json' instead of '_json'.
> > >
> >
> > +1 -- that looks cleaner than mixing up delimiters
> >
> > To be clear, I think this should wholly be a fallback mechanism for when
> > the client doesn't recognize a vxattr name. IOW, we should only call the
> > MDS after ceph_match_vxattr doesn't match an xattr name.
>
> I'm with you, but this set of changes (and its Client.cc equivalent)
> is by request from Patrick and Venky (added), so there may be some
> subtleties we're missing. And we should follow the same pattern for
> who gets to resolve vxattrs in both clients.
> -Greg

I had incorrectly recalled that the client did return json for the
layout xattrs. So, yes, we can't just switch to json for these
existing vxattrs. I agree adding a "json" extension cleanly avoids the
issue. We can continue handling the existing layout vxattrs without
the suffix in the client (for now).

Although, I do think ".json" is a little weird. I had privately
expressed to Jeff/Milind that "!json" or similar as a suffix could be
more appropriate. It emphasizes it's not a derivative xattr.

-- 
Patrick Donnelly, Ph.D.
He / Him / His
Principal Software Engineer
Red Hat, Inc.
GPG: 19F28A586F808C2402351B93C3301A3E258DD79D


^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH v4 1/1] ceph: add getvxattr op
  2022-01-17 18:09                 ` Patrick Donnelly
@ 2022-01-17 18:15                   ` Jeff Layton
  2022-01-17 18:20                     ` Patrick Donnelly
  0 siblings, 1 reply; 13+ messages in thread
From: Jeff Layton @ 2022-01-17 18:15 UTC (permalink / raw)
  To: Patrick Donnelly, Gregory Farnum
  Cc: Venky Shankar, Xiubo Li, Milind Changire, Ilya Dryomov,
	ceph-devel, Milind Changire

On Mon, 2022-01-17 at 13:09 -0500, Patrick Donnelly wrote:
> On Mon, Jan 17, 2022 at 9:50 AM Gregory Farnum <gfarnum@redhat.com> wrote:
> > 
> > On Mon, Jan 17, 2022 at 5:28 AM Jeff Layton <jlayton@kernel.org> wrote:
> > > 
> > > On Mon, 2022-01-17 at 21:09 +0800, Xiubo Li wrote:
> > > > On 1/17/22 8:58 PM, Milind Changire wrote:
> > > > > On Mon, Jan 17, 2022 at 6:20 PM Xiubo Li <xiubli@redhat.com> wrote:
> > > > > > 
> > > > > > On 1/17/22 7:07 PM, Milind Changire wrote:
> > > > > > > On Mon, Jan 17, 2022 at 4:23 PM Jeff Layton <jlayton@kernel.org> wrote:
> > > > > > > > On Mon, 2022-01-17 at 08:51 +0000, Milind Changire wrote:
> > > > > > > > > Problem:
> > > > > > > > > Directory vxattrs like ceph.dir.pin* and ceph.dir.layout* may not be
> > > > > > > > > propagated to the client as frequently to keep them updated. This
> > > > > > > > > creates vxattr availability problems.
> > > > > > > > > 
> > > > > > > > > Solution:
> > > > > > > > > Adds new getvxattr op to fetch ceph.dir.pin*, ceph.dir.layout* and
> > > > > > > > > ceph.file.layout* vxattrs.
> > > > > > > > > If the entire layout for a dir or a file is being set, then it is
> > > > > > > > > expected that the layout be set in standard JSON format. Individual
> > > > > > > > > field value retrieval is not wrapped in JSON. The JSON format also
> > > > > > > > > applies while setting the vxattr if the entire layout is being set in
> > > > > > > > > one go.
> > > > > > > > > As a temporary measure, setting a vxattr can also be done in the old
> > > > > > > > > format. The old format will be deprecated in the future.
> > > > > > > > > 
> > > > > > > > > URL: https://tracker.ceph.com/issues/51062
> > > > > > > > > Signed-off-by: Milind Changire <mchangir@redhat.com>
> > > > > > > > > ---
> > > > > > > > >    fs/ceph/inode.c              | 51 ++++++++++++++++++++++++++++++++++++
> > > > > > > > >    fs/ceph/mds_client.c         | 27 ++++++++++++++++++-
> > > > > > > > >    fs/ceph/mds_client.h         | 12 ++++++++-
> > > > > > > > >    fs/ceph/strings.c            |  1 +
> > > > > > > > >    fs/ceph/super.h              |  1 +
> > > > > > > > >    fs/ceph/xattr.c              | 34 ++++++++++++++++++++++++
> > > > > > > > >    include/linux/ceph/ceph_fs.h |  1 +
> > > > > > > > >    7 files changed, 125 insertions(+), 2 deletions(-)
> > > > > > > > > 
> > > > > > > > > diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
> > > > > > > > > index e3322fcb2e8d..efdce049b7f0 100644
> > > > > > > > > --- a/fs/ceph/inode.c
> > > > > > > > > +++ b/fs/ceph/inode.c
> > > > > > > > > @@ -2291,6 +2291,57 @@ int __ceph_do_getattr(struct inode *inode, struct page *locked_page,
> > > > > > > > >         return err;
> > > > > > > > >    }
> > > > > > > > > 
> > > > > > > > > +int ceph_do_getvxattr(struct inode *inode, const char *name, void *value,
> > > > > > > > > +                   size_t size)
> > > > > > > > > +{
> > > > > > > > > +     struct ceph_fs_client *fsc = ceph_sb_to_client(inode->i_sb);
> > > > > > > > > +     struct ceph_mds_client *mdsc = fsc->mdsc;
> > > > > > > > > +     struct ceph_mds_request *req;
> > > > > > > > > +     int mode = USE_AUTH_MDS;
> > > > > > > > > +     int err;
> > > > > > > > > +     char *xattr_value;
> > > > > > > > > +     size_t xattr_value_len;
> > > > > > > > > +
> > > > > > > > > +     req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_GETVXATTR, mode);
> > > > > > > > > +     if (IS_ERR(req)) {
> > > > > > > > > +             err = -ENOMEM;
> > > > > > > > > +             goto out;
> > > > > > > > > +     }
> > > > > > > > > +
> > > > > > > > > +     req->r_path2 = kstrdup(name, GFP_NOFS);
> > > > > > > > > +     if (!req->r_path2) {
> > > > > > > > > +             err = -ENOMEM;
> > > > > > > > > +             goto put;
> > > > > > > > > +     }
> > > > > > > > > +
> > > > > > > > > +     ihold(inode);
> > > > > > > > > +     req->r_inode = inode;
> > > > > > > > > +     err = ceph_mdsc_do_request(mdsc, NULL, req);
> > > > > > > > > +     if (err < 0)
> > > > > > > > > +             goto put;
> > > > > > > > > +
> > > > > > > > > +     xattr_value = req->r_reply_info.xattr_info.xattr_value;
> > > > > > > > > +     xattr_value_len = req->r_reply_info.xattr_info.xattr_value_len;
> > > > > > > > > +
> > > > > > > > > +     dout("do_getvxattr xattr_value_len:%zu, size:%zu\n", xattr_value_len, size);
> > > > > > > > > +
> > > > > > > > > +     err = (int)xattr_value_len;
> > > > > > > > > +     if (size == 0)
> > > > > > > > > +             goto put;
> > > > > > > > > +
> > > > > > > > > +     if (xattr_value_len > size) {
> > > > > > > > > +             err = -ERANGE;
> > > > > > > > > +             goto put;
> > > > > > > > > +     }
> > > > > > > > > +
> > > > > > > > > +     memcpy(value, xattr_value, xattr_value_len);
> > > > > > > > > +put:
> > > > > > > > > +     ceph_mdsc_put_request(req);
> > > > > > > > > +out:
> > > > > > > > > +     dout("do_getvxattr result=%d\n", err);
> > > > > > > > > +     return err;
> > > > > > > > > +}
> > > > > > > > > +
> > > > > > > > > 
> > > > > > > > >    /*
> > > > > > > > >     * Check inode permissions.  We verify we have a valid value for
> > > > > > > > > diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
> > > > > > > > > index c30eefc0ac19..a5eafc71d976 100644
> > > > > > > > > --- a/fs/ceph/mds_client.c
> > > > > > > > > +++ b/fs/ceph/mds_client.c
> > > > > > > > > @@ -555,6 +555,29 @@ static int parse_reply_info_create(void **p, void *end,
> > > > > > > > >         return -EIO;
> > > > > > > > >    }
> > > > > > > > > 
> > > > > > > > > +static int parse_reply_info_getvxattr(void **p, void *end,
> > > > > > > > > +                                   struct ceph_mds_reply_info_parsed *info,
> > > > > > > > > +                                   u64 features)
> > > > > > > > > +{
> > > > > > > > > +     u8 struct_v, struct_compat;
> > > > > > > > > +     u32 struct_len;
> > > > > > > > > +     u32 value_len;
> > > > > > > > > +
> > > > > > > > > +     ceph_decode_8_safe(p, end, struct_v, bad);
> > > > > > > > > +     ceph_decode_8_safe(p, end, struct_compat, bad);
> > > > > > > > > +     ceph_decode_32_safe(p, end, struct_len, bad);
> > > > > > > > > +     ceph_decode_32_safe(p, end, value_len, bad);
> > > > > > > > > +
> > > > > > > > > +     if (value_len == end - *p) {
> > > > > > > > > +       info->xattr_info.xattr_value = *p;
> > > > > > > > > +       info->xattr_info.xattr_value_len = end - *p;
> > > > > > > > > +       *p = end;
> > > > > > > > > +       return info->xattr_info.xattr_value_len;
> > > > > > > > > +     }
> > > > > > > > > +bad:
> > > > > > > > > +     return -EIO;
> > > > > > > > > +}
> > > > > > > > > +
> > > > > > > > >    /*
> > > > > > > > >     * parse extra results
> > > > > > > > >     */
> > > > > > > > > @@ -570,6 +593,8 @@ static int parse_reply_info_extra(void **p, void *end,
> > > > > > > > >                 return parse_reply_info_readdir(p, end, info, features);
> > > > > > > > >         else if (op == CEPH_MDS_OP_CREATE)
> > > > > > > > >                 return parse_reply_info_create(p, end, info, features, s);
> > > > > > > > > +     else if (op == CEPH_MDS_OP_GETVXATTR)
> > > > > > > > > +             return parse_reply_info_getvxattr(p, end, info, features);
> > > > > > > > >         else
> > > > > > > > >                 return -EIO;
> > > > > > > > >    }
> > > > > > > > > @@ -615,7 +640,7 @@ static int parse_reply_info(struct ceph_mds_session *s, struct ceph_msg *msg,
> > > > > > > > > 
> > > > > > > > >         if (p != end)
> > > > > > > > >                 goto bad;
> > > > > > > > > -     return 0;
> > > > > > > > > +     return err;
> > > > > > > > > 
> > > > > > > > >    bad:
> > > > > > > > >         err = -EIO;
> > > > > > > > > diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
> > > > > > > > > index 97c7f7bfa55f..f2a8e5af3c2e 100644
> > > > > > > > > --- a/fs/ceph/mds_client.h
> > > > > > > > > +++ b/fs/ceph/mds_client.h
> > > > > > > > > @@ -29,8 +29,10 @@ enum ceph_feature_type {
> > > > > > > > >         CEPHFS_FEATURE_MULTI_RECONNECT,
> > > > > > > > >         CEPHFS_FEATURE_DELEG_INO,
> > > > > > > > >         CEPHFS_FEATURE_METRIC_COLLECT,
> > > > > > > > > +     CEPHFS_FEATURE_ALTERNATE_NAME,
> > > > > > > > > +     CEPHFS_FEATURE_GETVXATTR,
> > > > > > > > > 
> > > > > > > > > -     CEPHFS_FEATURE_MAX = CEPHFS_FEATURE_METRIC_COLLECT,
> > > > > > > > > +     CEPHFS_FEATURE_MAX = CEPHFS_FEATURE_GETVXATTR,
> > > > > > > > >    };
> > > > > > > > > 
> > > > > > > > >    /*
> > > > > > > > > @@ -45,6 +47,8 @@ enum ceph_feature_type {
> > > > > > > > >         CEPHFS_FEATURE_MULTI_RECONNECT,         \
> > > > > > > > >         CEPHFS_FEATURE_DELEG_INO,               \
> > > > > > > > >         CEPHFS_FEATURE_METRIC_COLLECT,          \
> > > > > > > > > +     CEPHFS_FEATURE_ALTERNATE_NAME,          \
> > > > > > > > > +     CEPHFS_FEATURE_GETVXATTR,               \
> > > > > > > > >                                                 \
> > > > > > > > >         CEPHFS_FEATURE_MAX,                     \
> > > > > > > > >    }
> > > > > > > > > @@ -100,6 +104,11 @@ struct ceph_mds_reply_dir_entry {
> > > > > > > > >         loff_t                        offset;
> > > > > > > > >    };
> > > > > > > > > 
> > > > > > > > > +struct ceph_mds_reply_xattr {
> > > > > > > > > +     char *xattr_value;
> > > > > > > > > +     size_t xattr_value_len;
> > > > > > > > > +};
> > > > > > > > > +
> > > > > > > > >    /*
> > > > > > > > >     * parsed info about an mds reply, including information about
> > > > > > > > >     * either: 1) the target inode and/or its parent directory and dentry,
> > > > > > > > > @@ -115,6 +124,7 @@ struct ceph_mds_reply_info_parsed {
> > > > > > > > >         char                          *dname;
> > > > > > > > >         u32                           dname_len;
> > > > > > > > >         struct ceph_mds_reply_lease   *dlease;
> > > > > > > > > +     struct ceph_mds_reply_xattr   xattr_info;
> > > > > > > > > 
> > > > > > > > >         /* extra */
> > > > > > > > >         union {
> > > > > > > > > diff --git a/fs/ceph/strings.c b/fs/ceph/strings.c
> > > > > > > > > index 573bb9556fb5..e36e8948e728 100644
> > > > > > > > > --- a/fs/ceph/strings.c
> > > > > > > > > +++ b/fs/ceph/strings.c
> > > > > > > > > @@ -60,6 +60,7 @@ const char *ceph_mds_op_name(int op)
> > > > > > > > >         case CEPH_MDS_OP_LOOKUPINO:  return "lookupino";
> > > > > > > > >         case CEPH_MDS_OP_LOOKUPNAME:  return "lookupname";
> > > > > > > > >         case CEPH_MDS_OP_GETATTR:  return "getattr";
> > > > > > > > > +     case CEPH_MDS_OP_GETVXATTR:  return "getvxattr";
> > > > > > > > >         case CEPH_MDS_OP_SETXATTR: return "setxattr";
> > > > > > > > >         case CEPH_MDS_OP_SETATTR: return "setattr";
> > > > > > > > >         case CEPH_MDS_OP_RMXATTR: return "rmxattr";
> > > > > > > > > diff --git a/fs/ceph/super.h b/fs/ceph/super.h
> > > > > > > > > index ac331aa07cfa..a627fa69668e 100644
> > > > > > > > > --- a/fs/ceph/super.h
> > > > > > > > > +++ b/fs/ceph/super.h
> > > > > > > > > @@ -1043,6 +1043,7 @@ static inline bool ceph_inode_is_shutdown(struct inode *inode)
> > > > > > > > > 
> > > > > > > > >    /* xattr.c */
> > > > > > > > >    int __ceph_setxattr(struct inode *, const char *, const void *, size_t, int);
> > > > > > > > > +int ceph_do_getvxattr(struct inode *inode, const char *name, void *value, size_t size);
> > > > > > > > >    ssize_t __ceph_getxattr(struct inode *, const char *, void *, size_t);
> > > > > > > > >    extern ssize_t ceph_listxattr(struct dentry *, char *, size_t);
> > > > > > > > >    extern struct ceph_buffer *__ceph_build_xattrs_blob(struct ceph_inode_info *ci);
> > > > > > > > > diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c
> > > > > > > > > index fcf7dfdecf96..dc32876a541a 100644
> > > > > > > > > --- a/fs/ceph/xattr.c
> > > > > > > > > +++ b/fs/ceph/xattr.c
> > > > > > > > > @@ -918,6 +918,30 @@ static inline int __get_request_mask(struct inode *in) {
> > > > > > > > >         return mask;
> > > > > > > > >    }
> > > > > > > > > 
> > > > > > > > > +/* check if the entire cluster supports the given feature */
> > > > > > > > > +static inline bool ceph_cluster_has_feature(struct inode *inode, int feature_bit)
> > > > > > > > > +{
> > > > > > > > > +     int64_t i;
> > > > > > > > > +     struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
> > > > > > > > > +     struct ceph_mds_session **sessions = fsc->mdsc->sessions;
> > > > > > > > > +     int64_t num_sessions = atomic_read(&fsc->mdsc->num_sessions);
> > > > > > > > > +
> > > > > > > > > +     if (fsc->mdsc->stopping)
> > > > > > > > > +             return false;
> > > > > > > > > +
> > > > > > > > > +     if (!sessions)
> > > > > > > > > +             return false;
> > > > > > > > > +
> > > > > > > > > +     for (i = 0; i < num_sessions; i++) {
> > > > > > > > > +             struct ceph_mds_session *session = sessions[i];
> > > > > > > > > +             if (!session)
> > > > > > > > > +                     return false;
> > > > > > > > > +             if (!test_bit(feature_bit, &session->s_features))
> > > > > > > > > +                     return false;
> > > > > > > > What guarantee do you have that "session" will still be a valid pointer
> > > > > > > > by the time you get to dereferencing it here?
> > > > > > > > 
> > > > > > > > I think this loop needs some locking (as Xiubo pointed out in his
> > > > > > > > earlier review).
> > > > > > > yeah, thanks for pointing that out
> > > > > > > I'm trying to wrap the entire processing of this function inside a
> > > > > > > mutex_unlock(&mdsc->mutex) ... but the mount command fails
> > > > > > > to mount if done so. If code is not wrapped in mutex lock...unlock
> > > > > > > then the mount is successful.
> > > > > > > It's a surprise that the code doesn't deadlock under the mutex
> > > > > > > lock...unlock and gracefully fails with a message.
> > > > > > > Any hints on what I could be missing.
> > > > > > > 
> > > > > > > > > +     }
> > > > > > > > > +     return true;
> > > > > > > > > +}
> > > > > > > > > +
> > > > > > > > >    ssize_t __ceph_getxattr(struct inode *inode, const char *name, void *value,
> > > > > > > > >                       size_t size)
> > > > > > > > >    {
> > > > > > > > > @@ -927,6 +951,16 @@ ssize_t __ceph_getxattr(struct inode *inode, const char *name, void *value,
> > > > > > > > >         int req_mask;
> > > > > > > > >         ssize_t err;
> > > > > > > > > 
> > > > > > > > > +     if (!strncmp(name, XATTR_CEPH_PREFIX, XATTR_CEPH_PREFIX_LEN) &&
> > > > > > > > > +         ceph_cluster_has_feature(inode, CEPHFS_FEATURE_GETVXATTR)) {
> > > > > > > > > +             err = ceph_do_getvxattr(inode, name, value, size);
> > > > > > > > > +             /* if cluster doesn't support xattr, we try to service it
> > > > > > > > > +              * locally
> > > > > > > > > +              */
> > > > > > > > > +             if (err >= 0)
> > > > > > > > > +                     return err;
> > > > > > > > > +     }
> > > > > > > > > +
> > > > > > > > What is this? Why not always service this locally?
> > > > > > > vxattr handling is planned to be moved to the MDS side.
> > > > > > > As I've pointed out to Xiubo, there's a few new things that have been done for
> > > > > > > layout vxattr management. Also, as per your original tracker, ceph.dir.pin*
> > > > > > > can't be handled locally.
> > > > > > > getvxattr() currently handles:
> > > > > > > 1. ceph.dir.layout*
> > > > > > > 2. ceph.file.layout*
> > > > > > > 3. ceph.dir.pin*
> > > > > > The above seems will include the 'ceph.dir.layout', 'ceph.file.layout'
> > > > > > and 'ceph.dir.pin' ? All these have been handled in 'ceph_file_vxattrs'
> > > > > > and 'ceph_dir_vxattrs'...
> > > > > > 
> > > > > > And the code above will always force kclient to get all the 'ceph.XXX'
> > > > > > xattrs from MDS ?
> > > > > yes, the kclient will always get vxattr values from MDS
> > > > > for old cluster, op will be handled locally
> > > > > 
> > > > > Jeff has a proposal to expose the JSON output variety of layout
> > > > > vxattr vlaue via new vxattr altogether
> > > > > eg. ceph.dir.layout_json and ceph.file.layout_json
> > > > > 
> > > > Yeah, sounds good. Or maybe just adding the '.json' instead of '_json'.
> > > > 
> > > 
> > > +1 -- that looks cleaner than mixing up delimiters
> > > 
> > > To be clear, I think this should wholly be a fallback mechanism for when
> > > the client doesn't recognize a vxattr name. IOW, we should only call the
> > > MDS after ceph_match_vxattr doesn't match an xattr name.
> > 
> > I'm with you, but this set of changes (and its Client.cc equivalent)
> > is by request from Patrick and Venky (added), so there may be some
> > subtleties we're missing. And we should follow the same pattern for
> > who gets to resolve vxattrs in both clients.
> > -Greg
> 
> I had incorrectly recalled that the client did return json for the
> layout xattrs. So, yes, we can't just switch to json for these
> existing vxattrs. I agree adding a "json" extension cleanly avoids the
> issue. We can continue handling the existing layout vxattrs without
> the suffix in the client (for now).
> 
> Although, I do think ".json" is a little weird. I had privately
> expressed to Jeff/Milind that "!json" or similar as a suffix could be
> more appropriate. It emphasizes it's not a derivative xattr.
> 

The mixed delimiters are also weird, and I don't think it's really
outside the scope of the existing ceph.file.layout.* xattrs.

Also, I have to wonder -- why json? Isn't yaml more en vogue these days?
-- 
Jeff Layton <jlayton@kernel.org>

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH v4 1/1] ceph: add getvxattr op
  2022-01-17 18:15                   ` Jeff Layton
@ 2022-01-17 18:20                     ` Patrick Donnelly
  0 siblings, 0 replies; 13+ messages in thread
From: Patrick Donnelly @ 2022-01-17 18:20 UTC (permalink / raw)
  To: Jeff Layton
  Cc: Gregory Farnum, Venky Shankar, Xiubo Li, Milind Changire,
	Ilya Dryomov, ceph-devel, Milind Changire

On Mon, Jan 17, 2022 at 1:15 PM Jeff Layton <jlayton@kernel.org> wrote:
>
> On Mon, 2022-01-17 at 13:09 -0500, Patrick Donnelly wrote:
> > On Mon, Jan 17, 2022 at 9:50 AM Gregory Farnum <gfarnum@redhat.com> wrote:
> > >
> > > On Mon, Jan 17, 2022 at 5:28 AM Jeff Layton <jlayton@kernel.org> wrote:
> > > >
> > > > On Mon, 2022-01-17 at 21:09 +0800, Xiubo Li wrote:
> > > > > On 1/17/22 8:58 PM, Milind Changire wrote:
> > > > > > On Mon, Jan 17, 2022 at 6:20 PM Xiubo Li <xiubli@redhat.com> wrote:
> > > > > > >
> > > > > > > On 1/17/22 7:07 PM, Milind Changire wrote:
> > > > > > > > On Mon, Jan 17, 2022 at 4:23 PM Jeff Layton <jlayton@kernel.org> wrote:
> > > > > > > > > On Mon, 2022-01-17 at 08:51 +0000, Milind Changire wrote:
> > > > > > > > > > Problem:
> > > > > > > > > > Directory vxattrs like ceph.dir.pin* and ceph.dir.layout* may not be
> > > > > > > > > > propagated to the client as frequently to keep them updated. This
> > > > > > > > > > creates vxattr availability problems.
> > > > > > > > > >
> > > > > > > > > > Solution:
> > > > > > > > > > Adds new getvxattr op to fetch ceph.dir.pin*, ceph.dir.layout* and
> > > > > > > > > > ceph.file.layout* vxattrs.
> > > > > > > > > > If the entire layout for a dir or a file is being set, then it is
> > > > > > > > > > expected that the layout be set in standard JSON format. Individual
> > > > > > > > > > field value retrieval is not wrapped in JSON. The JSON format also
> > > > > > > > > > applies while setting the vxattr if the entire layout is being set in
> > > > > > > > > > one go.
> > > > > > > > > > As a temporary measure, setting a vxattr can also be done in the old
> > > > > > > > > > format. The old format will be deprecated in the future.
> > > > > > > > > >
> > > > > > > > > > URL: https://tracker.ceph.com/issues/51062
> > > > > > > > > > Signed-off-by: Milind Changire <mchangir@redhat.com>
> > > > > > > > > > ---
> > > > > > > > > >    fs/ceph/inode.c              | 51 ++++++++++++++++++++++++++++++++++++
> > > > > > > > > >    fs/ceph/mds_client.c         | 27 ++++++++++++++++++-
> > > > > > > > > >    fs/ceph/mds_client.h         | 12 ++++++++-
> > > > > > > > > >    fs/ceph/strings.c            |  1 +
> > > > > > > > > >    fs/ceph/super.h              |  1 +
> > > > > > > > > >    fs/ceph/xattr.c              | 34 ++++++++++++++++++++++++
> > > > > > > > > >    include/linux/ceph/ceph_fs.h |  1 +
> > > > > > > > > >    7 files changed, 125 insertions(+), 2 deletions(-)
> > > > > > > > > >
> > > > > > > > > > diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
> > > > > > > > > > index e3322fcb2e8d..efdce049b7f0 100644
> > > > > > > > > > --- a/fs/ceph/inode.c
> > > > > > > > > > +++ b/fs/ceph/inode.c
> > > > > > > > > > @@ -2291,6 +2291,57 @@ int __ceph_do_getattr(struct inode *inode, struct page *locked_page,
> > > > > > > > > >         return err;
> > > > > > > > > >    }
> > > > > > > > > >
> > > > > > > > > > +int ceph_do_getvxattr(struct inode *inode, const char *name, void *value,
> > > > > > > > > > +                   size_t size)
> > > > > > > > > > +{
> > > > > > > > > > +     struct ceph_fs_client *fsc = ceph_sb_to_client(inode->i_sb);
> > > > > > > > > > +     struct ceph_mds_client *mdsc = fsc->mdsc;
> > > > > > > > > > +     struct ceph_mds_request *req;
> > > > > > > > > > +     int mode = USE_AUTH_MDS;
> > > > > > > > > > +     int err;
> > > > > > > > > > +     char *xattr_value;
> > > > > > > > > > +     size_t xattr_value_len;
> > > > > > > > > > +
> > > > > > > > > > +     req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_GETVXATTR, mode);
> > > > > > > > > > +     if (IS_ERR(req)) {
> > > > > > > > > > +             err = -ENOMEM;
> > > > > > > > > > +             goto out;
> > > > > > > > > > +     }
> > > > > > > > > > +
> > > > > > > > > > +     req->r_path2 = kstrdup(name, GFP_NOFS);
> > > > > > > > > > +     if (!req->r_path2) {
> > > > > > > > > > +             err = -ENOMEM;
> > > > > > > > > > +             goto put;
> > > > > > > > > > +     }
> > > > > > > > > > +
> > > > > > > > > > +     ihold(inode);
> > > > > > > > > > +     req->r_inode = inode;
> > > > > > > > > > +     err = ceph_mdsc_do_request(mdsc, NULL, req);
> > > > > > > > > > +     if (err < 0)
> > > > > > > > > > +             goto put;
> > > > > > > > > > +
> > > > > > > > > > +     xattr_value = req->r_reply_info.xattr_info.xattr_value;
> > > > > > > > > > +     xattr_value_len = req->r_reply_info.xattr_info.xattr_value_len;
> > > > > > > > > > +
> > > > > > > > > > +     dout("do_getvxattr xattr_value_len:%zu, size:%zu\n", xattr_value_len, size);
> > > > > > > > > > +
> > > > > > > > > > +     err = (int)xattr_value_len;
> > > > > > > > > > +     if (size == 0)
> > > > > > > > > > +             goto put;
> > > > > > > > > > +
> > > > > > > > > > +     if (xattr_value_len > size) {
> > > > > > > > > > +             err = -ERANGE;
> > > > > > > > > > +             goto put;
> > > > > > > > > > +     }
> > > > > > > > > > +
> > > > > > > > > > +     memcpy(value, xattr_value, xattr_value_len);
> > > > > > > > > > +put:
> > > > > > > > > > +     ceph_mdsc_put_request(req);
> > > > > > > > > > +out:
> > > > > > > > > > +     dout("do_getvxattr result=%d\n", err);
> > > > > > > > > > +     return err;
> > > > > > > > > > +}
> > > > > > > > > > +
> > > > > > > > > >
> > > > > > > > > >    /*
> > > > > > > > > >     * Check inode permissions.  We verify we have a valid value for
> > > > > > > > > > diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
> > > > > > > > > > index c30eefc0ac19..a5eafc71d976 100644
> > > > > > > > > > --- a/fs/ceph/mds_client.c
> > > > > > > > > > +++ b/fs/ceph/mds_client.c
> > > > > > > > > > @@ -555,6 +555,29 @@ static int parse_reply_info_create(void **p, void *end,
> > > > > > > > > >         return -EIO;
> > > > > > > > > >    }
> > > > > > > > > >
> > > > > > > > > > +static int parse_reply_info_getvxattr(void **p, void *end,
> > > > > > > > > > +                                   struct ceph_mds_reply_info_parsed *info,
> > > > > > > > > > +                                   u64 features)
> > > > > > > > > > +{
> > > > > > > > > > +     u8 struct_v, struct_compat;
> > > > > > > > > > +     u32 struct_len;
> > > > > > > > > > +     u32 value_len;
> > > > > > > > > > +
> > > > > > > > > > +     ceph_decode_8_safe(p, end, struct_v, bad);
> > > > > > > > > > +     ceph_decode_8_safe(p, end, struct_compat, bad);
> > > > > > > > > > +     ceph_decode_32_safe(p, end, struct_len, bad);
> > > > > > > > > > +     ceph_decode_32_safe(p, end, value_len, bad);
> > > > > > > > > > +
> > > > > > > > > > +     if (value_len == end - *p) {
> > > > > > > > > > +       info->xattr_info.xattr_value = *p;
> > > > > > > > > > +       info->xattr_info.xattr_value_len = end - *p;
> > > > > > > > > > +       *p = end;
> > > > > > > > > > +       return info->xattr_info.xattr_value_len;
> > > > > > > > > > +     }
> > > > > > > > > > +bad:
> > > > > > > > > > +     return -EIO;
> > > > > > > > > > +}
> > > > > > > > > > +
> > > > > > > > > >    /*
> > > > > > > > > >     * parse extra results
> > > > > > > > > >     */
> > > > > > > > > > @@ -570,6 +593,8 @@ static int parse_reply_info_extra(void **p, void *end,
> > > > > > > > > >                 return parse_reply_info_readdir(p, end, info, features);
> > > > > > > > > >         else if (op == CEPH_MDS_OP_CREATE)
> > > > > > > > > >                 return parse_reply_info_create(p, end, info, features, s);
> > > > > > > > > > +     else if (op == CEPH_MDS_OP_GETVXATTR)
> > > > > > > > > > +             return parse_reply_info_getvxattr(p, end, info, features);
> > > > > > > > > >         else
> > > > > > > > > >                 return -EIO;
> > > > > > > > > >    }
> > > > > > > > > > @@ -615,7 +640,7 @@ static int parse_reply_info(struct ceph_mds_session *s, struct ceph_msg *msg,
> > > > > > > > > >
> > > > > > > > > >         if (p != end)
> > > > > > > > > >                 goto bad;
> > > > > > > > > > -     return 0;
> > > > > > > > > > +     return err;
> > > > > > > > > >
> > > > > > > > > >    bad:
> > > > > > > > > >         err = -EIO;
> > > > > > > > > > diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
> > > > > > > > > > index 97c7f7bfa55f..f2a8e5af3c2e 100644
> > > > > > > > > > --- a/fs/ceph/mds_client.h
> > > > > > > > > > +++ b/fs/ceph/mds_client.h
> > > > > > > > > > @@ -29,8 +29,10 @@ enum ceph_feature_type {
> > > > > > > > > >         CEPHFS_FEATURE_MULTI_RECONNECT,
> > > > > > > > > >         CEPHFS_FEATURE_DELEG_INO,
> > > > > > > > > >         CEPHFS_FEATURE_METRIC_COLLECT,
> > > > > > > > > > +     CEPHFS_FEATURE_ALTERNATE_NAME,
> > > > > > > > > > +     CEPHFS_FEATURE_GETVXATTR,
> > > > > > > > > >
> > > > > > > > > > -     CEPHFS_FEATURE_MAX = CEPHFS_FEATURE_METRIC_COLLECT,
> > > > > > > > > > +     CEPHFS_FEATURE_MAX = CEPHFS_FEATURE_GETVXATTR,
> > > > > > > > > >    };
> > > > > > > > > >
> > > > > > > > > >    /*
> > > > > > > > > > @@ -45,6 +47,8 @@ enum ceph_feature_type {
> > > > > > > > > >         CEPHFS_FEATURE_MULTI_RECONNECT,         \
> > > > > > > > > >         CEPHFS_FEATURE_DELEG_INO,               \
> > > > > > > > > >         CEPHFS_FEATURE_METRIC_COLLECT,          \
> > > > > > > > > > +     CEPHFS_FEATURE_ALTERNATE_NAME,          \
> > > > > > > > > > +     CEPHFS_FEATURE_GETVXATTR,               \
> > > > > > > > > >                                                 \
> > > > > > > > > >         CEPHFS_FEATURE_MAX,                     \
> > > > > > > > > >    }
> > > > > > > > > > @@ -100,6 +104,11 @@ struct ceph_mds_reply_dir_entry {
> > > > > > > > > >         loff_t                        offset;
> > > > > > > > > >    };
> > > > > > > > > >
> > > > > > > > > > +struct ceph_mds_reply_xattr {
> > > > > > > > > > +     char *xattr_value;
> > > > > > > > > > +     size_t xattr_value_len;
> > > > > > > > > > +};
> > > > > > > > > > +
> > > > > > > > > >    /*
> > > > > > > > > >     * parsed info about an mds reply, including information about
> > > > > > > > > >     * either: 1) the target inode and/or its parent directory and dentry,
> > > > > > > > > > @@ -115,6 +124,7 @@ struct ceph_mds_reply_info_parsed {
> > > > > > > > > >         char                          *dname;
> > > > > > > > > >         u32                           dname_len;
> > > > > > > > > >         struct ceph_mds_reply_lease   *dlease;
> > > > > > > > > > +     struct ceph_mds_reply_xattr   xattr_info;
> > > > > > > > > >
> > > > > > > > > >         /* extra */
> > > > > > > > > >         union {
> > > > > > > > > > diff --git a/fs/ceph/strings.c b/fs/ceph/strings.c
> > > > > > > > > > index 573bb9556fb5..e36e8948e728 100644
> > > > > > > > > > --- a/fs/ceph/strings.c
> > > > > > > > > > +++ b/fs/ceph/strings.c
> > > > > > > > > > @@ -60,6 +60,7 @@ const char *ceph_mds_op_name(int op)
> > > > > > > > > >         case CEPH_MDS_OP_LOOKUPINO:  return "lookupino";
> > > > > > > > > >         case CEPH_MDS_OP_LOOKUPNAME:  return "lookupname";
> > > > > > > > > >         case CEPH_MDS_OP_GETATTR:  return "getattr";
> > > > > > > > > > +     case CEPH_MDS_OP_GETVXATTR:  return "getvxattr";
> > > > > > > > > >         case CEPH_MDS_OP_SETXATTR: return "setxattr";
> > > > > > > > > >         case CEPH_MDS_OP_SETATTR: return "setattr";
> > > > > > > > > >         case CEPH_MDS_OP_RMXATTR: return "rmxattr";
> > > > > > > > > > diff --git a/fs/ceph/super.h b/fs/ceph/super.h
> > > > > > > > > > index ac331aa07cfa..a627fa69668e 100644
> > > > > > > > > > --- a/fs/ceph/super.h
> > > > > > > > > > +++ b/fs/ceph/super.h
> > > > > > > > > > @@ -1043,6 +1043,7 @@ static inline bool ceph_inode_is_shutdown(struct inode *inode)
> > > > > > > > > >
> > > > > > > > > >    /* xattr.c */
> > > > > > > > > >    int __ceph_setxattr(struct inode *, const char *, const void *, size_t, int);
> > > > > > > > > > +int ceph_do_getvxattr(struct inode *inode, const char *name, void *value, size_t size);
> > > > > > > > > >    ssize_t __ceph_getxattr(struct inode *, const char *, void *, size_t);
> > > > > > > > > >    extern ssize_t ceph_listxattr(struct dentry *, char *, size_t);
> > > > > > > > > >    extern struct ceph_buffer *__ceph_build_xattrs_blob(struct ceph_inode_info *ci);
> > > > > > > > > > diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c
> > > > > > > > > > index fcf7dfdecf96..dc32876a541a 100644
> > > > > > > > > > --- a/fs/ceph/xattr.c
> > > > > > > > > > +++ b/fs/ceph/xattr.c
> > > > > > > > > > @@ -918,6 +918,30 @@ static inline int __get_request_mask(struct inode *in) {
> > > > > > > > > >         return mask;
> > > > > > > > > >    }
> > > > > > > > > >
> > > > > > > > > > +/* check if the entire cluster supports the given feature */
> > > > > > > > > > +static inline bool ceph_cluster_has_feature(struct inode *inode, int feature_bit)
> > > > > > > > > > +{
> > > > > > > > > > +     int64_t i;
> > > > > > > > > > +     struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
> > > > > > > > > > +     struct ceph_mds_session **sessions = fsc->mdsc->sessions;
> > > > > > > > > > +     int64_t num_sessions = atomic_read(&fsc->mdsc->num_sessions);
> > > > > > > > > > +
> > > > > > > > > > +     if (fsc->mdsc->stopping)
> > > > > > > > > > +             return false;
> > > > > > > > > > +
> > > > > > > > > > +     if (!sessions)
> > > > > > > > > > +             return false;
> > > > > > > > > > +
> > > > > > > > > > +     for (i = 0; i < num_sessions; i++) {
> > > > > > > > > > +             struct ceph_mds_session *session = sessions[i];
> > > > > > > > > > +             if (!session)
> > > > > > > > > > +                     return false;
> > > > > > > > > > +             if (!test_bit(feature_bit, &session->s_features))
> > > > > > > > > > +                     return false;
> > > > > > > > > What guarantee do you have that "session" will still be a valid pointer
> > > > > > > > > by the time you get to dereferencing it here?
> > > > > > > > >
> > > > > > > > > I think this loop needs some locking (as Xiubo pointed out in his
> > > > > > > > > earlier review).
> > > > > > > > yeah, thanks for pointing that out
> > > > > > > > I'm trying to wrap the entire processing of this function inside a
> > > > > > > > mutex_unlock(&mdsc->mutex) ... but the mount command fails
> > > > > > > > to mount if done so. If code is not wrapped in mutex lock...unlock
> > > > > > > > then the mount is successful.
> > > > > > > > It's a surprise that the code doesn't deadlock under the mutex
> > > > > > > > lock...unlock and gracefully fails with a message.
> > > > > > > > Any hints on what I could be missing.
> > > > > > > >
> > > > > > > > > > +     }
> > > > > > > > > > +     return true;
> > > > > > > > > > +}
> > > > > > > > > > +
> > > > > > > > > >    ssize_t __ceph_getxattr(struct inode *inode, const char *name, void *value,
> > > > > > > > > >                       size_t size)
> > > > > > > > > >    {
> > > > > > > > > > @@ -927,6 +951,16 @@ ssize_t __ceph_getxattr(struct inode *inode, const char *name, void *value,
> > > > > > > > > >         int req_mask;
> > > > > > > > > >         ssize_t err;
> > > > > > > > > >
> > > > > > > > > > +     if (!strncmp(name, XATTR_CEPH_PREFIX, XATTR_CEPH_PREFIX_LEN) &&
> > > > > > > > > > +         ceph_cluster_has_feature(inode, CEPHFS_FEATURE_GETVXATTR)) {
> > > > > > > > > > +             err = ceph_do_getvxattr(inode, name, value, size);
> > > > > > > > > > +             /* if cluster doesn't support xattr, we try to service it
> > > > > > > > > > +              * locally
> > > > > > > > > > +              */
> > > > > > > > > > +             if (err >= 0)
> > > > > > > > > > +                     return err;
> > > > > > > > > > +     }
> > > > > > > > > > +
> > > > > > > > > What is this? Why not always service this locally?
> > > > > > > > vxattr handling is planned to be moved to the MDS side.
> > > > > > > > As I've pointed out to Xiubo, there's a few new things that have been done for
> > > > > > > > layout vxattr management. Also, as per your original tracker, ceph.dir.pin*
> > > > > > > > can't be handled locally.
> > > > > > > > getvxattr() currently handles:
> > > > > > > > 1. ceph.dir.layout*
> > > > > > > > 2. ceph.file.layout*
> > > > > > > > 3. ceph.dir.pin*
> > > > > > > The above seems will include the 'ceph.dir.layout', 'ceph.file.layout'
> > > > > > > and 'ceph.dir.pin' ? All these have been handled in 'ceph_file_vxattrs'
> > > > > > > and 'ceph_dir_vxattrs'...
> > > > > > >
> > > > > > > And the code above will always force kclient to get all the 'ceph.XXX'
> > > > > > > xattrs from MDS ?
> > > > > > yes, the kclient will always get vxattr values from MDS
> > > > > > for old cluster, op will be handled locally
> > > > > >
> > > > > > Jeff has a proposal to expose the JSON output variety of layout
> > > > > > vxattr vlaue via new vxattr altogether
> > > > > > eg. ceph.dir.layout_json and ceph.file.layout_json
> > > > > >
> > > > > Yeah, sounds good. Or maybe just adding the '.json' instead of '_json'.
> > > > >
> > > >
> > > > +1 -- that looks cleaner than mixing up delimiters
> > > >
> > > > To be clear, I think this should wholly be a fallback mechanism for when
> > > > the client doesn't recognize a vxattr name. IOW, we should only call the
> > > > MDS after ceph_match_vxattr doesn't match an xattr name.
> > >
> > > I'm with you, but this set of changes (and its Client.cc equivalent)
> > > is by request from Patrick and Venky (added), so there may be some
> > > subtleties we're missing. And we should follow the same pattern for
> > > who gets to resolve vxattrs in both clients.
> > > -Greg
> >
> > I had incorrectly recalled that the client did return json for the
> > layout xattrs. So, yes, we can't just switch to json for these
> > existing vxattrs. I agree adding a "json" extension cleanly avoids the
> > issue. We can continue handling the existing layout vxattrs without
> > the suffix in the client (for now).
> >
> > Although, I do think ".json" is a little weird. I had privately
> > expressed to Jeff/Milind that "!json" or similar as a suffix could be
> > more appropriate. It emphasizes it's not a derivative xattr.
> >
>
> The mixed delimiters are also weird, and I don't think it's really
> outside the scope of the existing ceph.file.layout.* xattrs.

Both are weird to me. I don't have a strong opinion either way.

> Also, I have to wonder -- why json? Isn't yaml more en vogue these days?

json is a common format for all ceph commands / utilities.

-- 
Patrick Donnelly, Ph.D.
He / Him / His
Principal Software Engineer
Red Hat, Inc.
GPG: 19F28A586F808C2402351B93C3301A3E258DD79D


^ permalink raw reply	[flat|nested] 13+ messages in thread

end of thread, other threads:[~2022-01-17 18:21 UTC | newest]

Thread overview: 13+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2022-01-17  8:51 [PATCH v4 0/1] ceph: add support for getvxattr op Milind Changire
2022-01-17  8:51 ` [PATCH v4 1/1] ceph: add " Milind Changire
2022-01-17 10:53   ` Jeff Layton
2022-01-17 11:07     ` Milind Changire
2022-01-17 12:50       ` Xiubo Li
2022-01-17 12:58         ` Milind Changire
2022-01-17 13:09           ` Xiubo Li
2022-01-17 13:27             ` Jeff Layton
2022-01-17 13:56               ` Xiubo Li
2022-01-17 14:50               ` Gregory Farnum
2022-01-17 18:09                 ` Patrick Donnelly
2022-01-17 18:15                   ` Jeff Layton
2022-01-17 18:20                     ` Patrick Donnelly

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.