* [PATCH v2 0/6] Rados pool namespace support
@ 2016-02-06  7:00 Yan, Zheng
  2016-02-06  7:00 ` [PATCH v2 1/6] libceph: define new ceph_file_layout structure Yan, Zheng
                   ` (5 more replies)
  0 siblings, 6 replies; 33+ messages in thread
From: Yan, Zheng @ 2016-02-06  7:00 UTC (permalink / raw)
  To: ceph-devel; +Cc: sweil, idryomov, Yan, Zheng

This series adds pool namespace support for libceph and cephfs. I have also
pushed the code to https://github.com/ceph/ceph-client/tree/wip-fs-filelayout2.
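
For reference, the end state of the series is roughly the following in-memory
layout, with the pool namespace carried as a reference-counted, RCU-managed
string. This is only a sketch assembled from the patches below (patches 1-3
contain the real definitions):

	struct ceph_string;	/* refcounted string, introduced in patch 2 */

	struct ceph_file_layout {
		/* file -> object mapping */
		u32 stripe_unit;	/* stripe unit, in bytes */
		u32 stripe_count;	/* over this many objects */
		u32 object_size;	/* until objects are this big */
		s64 pool_id;		/* rados pool id */
		struct ceph_string __rcu *pool_ns; /* rados pool namespace */
	};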

Changes since v1:
- rename the pool namespace cache to a string table and make it generic
- fix an RCU-related sparse warning

--
Yan, Zheng (6):
  libceph: define new ceph_file_layout structure
  libceph: introduce reference counted string
  libceph: rados pool namespace support
  libceph: make sure redirect does not change namespace
  libceph: calculate request message size according to namespace
  ceph: rados pool namespace support

 drivers/block/rbd.c                |  17 +++---
 fs/ceph/addr.c                     |  81 +++++++++++++++++++------
 fs/ceph/caps.c                     |  54 +++++++++++++----
 fs/ceph/file.c                     |  15 +++--
 fs/ceph/inode.c                    |  22 ++++++-
 fs/ceph/ioctl.c                    |  22 +++----
 fs/ceph/mds_client.c               |  11 ++++
 fs/ceph/mds_client.h               |   6 +-
 fs/ceph/xattr.c                    |  90 +++++++++++++++++----------
 include/linux/ceph/ceph_features.h |   1 +
 include/linux/ceph/ceph_fs.h       |  52 +++++++---------
 include/linux/ceph/libceph.h       |   1 +
 include/linux/ceph/osd_client.h    |   2 +
 include/linux/ceph/osdmap.h        |   1 +
 include/linux/ceph/string_table.h  |  58 ++++++++++++++++++
 net/ceph/Makefile                  |   2 +-
 net/ceph/ceph_common.c             |   1 +
 net/ceph/ceph_fs.c                 |  31 +++++++++-
 net/ceph/osd_client.c              |  68 ++++++++++++++++-----
 net/ceph/osdmap.c                  |  39 +++++++++---
 net/ceph/string_table.c            | 121 +++++++++++++++++++++++++++++++++++++
 21 files changed, 546 insertions(+), 149 deletions(-)
 create mode 100644 include/linux/ceph/string_table.h
 create mode 100644 net/ceph/string_table.c

-- 
2.5.0



* [PATCH v2 1/6] libceph: define new ceph_file_layout structure
  2016-02-06  7:00 [PATCH v2 0/6] Rados pool namespace support Yan, Zheng
@ 2016-02-06  7:00 ` Yan, Zheng
  2016-02-06  7:00 ` [PATCH v2 2/6] libceph: introduce reference counted string Yan, Zheng
                   ` (4 subsequent siblings)
  5 siblings, 0 replies; 33+ messages in thread
From: Yan, Zheng @ 2016-02-06  7:00 UTC (permalink / raw)
  To: ceph-devel; +Cc: sweil, idryomov, Yan, Zheng

Define a new ceph_file_layout structure and rename the old ceph_file_layout
to ceph_file_layout_legacy. This is preparation for adding a namespace field
to the ceph_file_layout structure.
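
For callers, the practical effect is that layout fields are read directly in
host byte order instead of going through the le32 accessor macros, with a
one-time conversion from the wire (legacy) format. Roughly (illustrative only;
the real call sites are in the diff below):

	/* before: wire-format struct, read through le32 accessor macros */
	s64 pool = ceph_file_layout_pg_pool(ci->i_layout);

	/* after: convert the legacy struct once when the MDS supplies it,
	 * then read plain host-endian fields */
	ceph_file_layout_from_legacy(&ci->i_layout, &info->layout);
	pool = ci->i_layout.pool_id;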

Signed-off-by: Yan, Zheng <zyan@redhat.com>
---
 drivers/block/rbd.c          | 12 +++++------
 fs/ceph/addr.c               | 18 ++++++++--------
 fs/ceph/caps.c               |  6 +++++-
 fs/ceph/file.c               |  6 +++---
 fs/ceph/inode.c              |  5 +++--
 fs/ceph/ioctl.c              | 22 +++++++++----------
 fs/ceph/mds_client.h         |  2 +-
 fs/ceph/xattr.c              | 28 ++++++++++---------------
 include/linux/ceph/ceph_fs.h | 50 +++++++++++++++++++-------------------------
 net/ceph/ceph_fs.c           | 30 +++++++++++++++++++++++---
 net/ceph/osd_client.c        |  4 ++--
 net/ceph/osdmap.c            |  6 +++---
 12 files changed, 102 insertions(+), 87 deletions(-)

diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index 22cfc01..b0bcb2d 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -1967,7 +1967,7 @@ static struct ceph_osd_request *rbd_osd_req_create(
 	osd_req->r_callback = rbd_osd_req_callback;
 	osd_req->r_priv = obj_request;
 
-	osd_req->r_base_oloc.pool = ceph_file_layout_pg_pool(rbd_dev->layout);
+	osd_req->r_base_oloc.pool = rbd_dev->layout.pool_id;
 	ceph_oid_set_name(&osd_req->r_base_oid, obj_request->object_name);
 
 	return osd_req;
@@ -2012,7 +2012,7 @@ rbd_osd_req_create_copyup(struct rbd_obj_request *obj_request)
 	osd_req->r_callback = rbd_osd_req_callback;
 	osd_req->r_priv = obj_request;
 
-	osd_req->r_base_oloc.pool = ceph_file_layout_pg_pool(rbd_dev->layout);
+	osd_req->r_base_oloc.pool = rbd_dev->layout.pool_id;
 	ceph_oid_set_name(&osd_req->r_base_oid, obj_request->object_name);
 
 	return osd_req;
@@ -4084,10 +4084,10 @@ static struct rbd_device *rbd_dev_create(struct rbd_client *rbdc,
 
 	/* Initialize the layout used for all rbd requests */
 
-	rbd_dev->layout.fl_stripe_unit = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER);
-	rbd_dev->layout.fl_stripe_count = cpu_to_le32(1);
-	rbd_dev->layout.fl_object_size = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER);
-	rbd_dev->layout.fl_pg_pool = cpu_to_le32((u32) spec->pool_id);
+	rbd_dev->layout.stripe_unit = 1 << RBD_MAX_OBJ_ORDER;
+	rbd_dev->layout.stripe_count = 1;
+	rbd_dev->layout.object_size = 1 << RBD_MAX_OBJ_ORDER;
+	rbd_dev->layout.pool_id = spec->pool_id;
 
 	/*
 	 * If this is a mapping rbd_dev (as opposed to a parent one),
diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
index 5b3a857..6047395 100644
--- a/fs/ceph/addr.c
+++ b/fs/ceph/addr.c
@@ -1701,7 +1701,7 @@ enum {
 	POOL_WRITE	= 2,
 };
 
-static int __ceph_pool_perm_get(struct ceph_inode_info *ci, u32 pool)
+static int __ceph_pool_perm_get(struct ceph_inode_info *ci, s64 pool)
 {
 	struct ceph_fs_client *fsc = ceph_inode_to_client(&ci->vfs_inode);
 	struct ceph_mds_client *mdsc = fsc->mdsc;
@@ -1728,7 +1728,7 @@ static int __ceph_pool_perm_get(struct ceph_inode_info *ci, u32 pool)
 	if (*p)
 		goto out;
 
-	dout("__ceph_pool_perm_get pool %u no perm cached\n", pool);
+	dout("__ceph_pool_perm_get pool %lld no perm cached\n", pool);
 
 	down_write(&mdsc->pool_perm_rwsem);
 	parent = NULL;
@@ -1833,13 +1833,13 @@ out_unlock:
 out:
 	if (!err)
 		err = have;
-	dout("__ceph_pool_perm_get pool %u result = %d\n", pool, err);
+	dout("__ceph_pool_perm_get pool %lld result = %d\n", pool, err);
 	return err;
 }
 
 int ceph_pool_perm_check(struct ceph_inode_info *ci, int need)
 {
-	u32 pool;
+	s64 pool;
 	int ret, flags;
 
 	if (ceph_test_mount_opt(ceph_inode_to_client(&ci->vfs_inode),
@@ -1848,17 +1848,17 @@ int ceph_pool_perm_check(struct ceph_inode_info *ci, int need)
 
 	spin_lock(&ci->i_ceph_lock);
 	flags = ci->i_ceph_flags;
-	pool = ceph_file_layout_pg_pool(ci->i_layout);
+	pool = ci->i_layout.pool_id;
 	spin_unlock(&ci->i_ceph_lock);
 check:
 	if (flags & CEPH_I_POOL_PERM) {
 		if ((need & CEPH_CAP_FILE_RD) && !(flags & CEPH_I_POOL_RD)) {
-			dout("ceph_pool_perm_check pool %u no read perm\n",
+			dout("ceph_pool_perm_check pool %lld no read perm\n",
 			     pool);
 			return -EPERM;
 		}
 		if ((need & CEPH_CAP_FILE_WR) && !(flags & CEPH_I_POOL_WR)) {
-			dout("ceph_pool_perm_check pool %u no write perm\n",
+			dout("ceph_pool_perm_check pool %lld no write perm\n",
 			     pool);
 			return -EPERM;
 		}
@@ -1876,10 +1876,10 @@ check:
 		flags |= CEPH_I_POOL_WR;
 
 	spin_lock(&ci->i_ceph_lock);
-	if (pool == ceph_file_layout_pg_pool(ci->i_layout)) {
+	if (pool == ci->i_layout.pool_id) {
 		ci->i_ceph_flags = flags;
         } else {
-		pool = ceph_file_layout_pg_pool(ci->i_layout);
+		pool = ci->i_layout.pool_id;
 		flags = ci->i_ceph_flags;
 	}
 	spin_unlock(&ci->i_ceph_lock);
diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index 6094d0e..b68be57 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -2875,7 +2875,11 @@ static void handle_cap_grant(struct ceph_mds_client *mdsc,
 
 	if (newcaps & (CEPH_CAP_ANY_FILE_RD | CEPH_CAP_ANY_FILE_WR)) {
 		/* file layout may have changed */
-		ci->i_layout = grant->layout;
+		s64 old_pool = ci->i_layout.pool_id;
+		ceph_file_layout_from_legacy(&ci->i_layout, &grant->layout);
+		if (ci->i_layout.pool_id != old_pool)
+			ci->i_ceph_flags &= ~CEPH_I_POOL_PERM;
+
 		/* size/truncate_seq? */
 		queue_trunc = ceph_fill_file_size(inode, issued,
 					le32_to_cpu(grant->truncate_seq),
diff --git a/fs/ceph/file.c b/fs/ceph/file.c
index 389adac..dc6dc5b 100644
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -1537,9 +1537,9 @@ static int ceph_zero_objects(struct inode *inode, loff_t offset, loff_t length)
 {
 	int ret = 0;
 	struct ceph_inode_info *ci = ceph_inode(inode);
-	s32 stripe_unit = ceph_file_layout_su(ci->i_layout);
-	s32 stripe_count = ceph_file_layout_stripe_count(ci->i_layout);
-	s32 object_size = ceph_file_layout_object_size(ci->i_layout);
+	s32 stripe_unit = ci->i_layout.stripe_unit;
+	s32 stripe_count = ci->i_layout.stripe_count;
+	s32 object_size = ci->i_layout.object_size;
 	u64 object_set_size = object_size * stripe_count;
 	u64 nearly, t;
 
diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
index 63d0198..b0ad53d 100644
--- a/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@ -753,9 +753,10 @@ static int fill_inode(struct inode *inode, struct page *locked_page,
 
 	if (new_version ||
 	    (new_issued & (CEPH_CAP_ANY_FILE_RD | CEPH_CAP_ANY_FILE_WR))) {
-		if (ci->i_layout.fl_pg_pool != info->layout.fl_pg_pool)
+		s64 old_pool = ci->i_layout.pool_id;
+		ceph_file_layout_from_legacy(&ci->i_layout, &info->layout);
+		if (ci->i_layout.pool_id != old_pool)
 			ci->i_ceph_flags &= ~CEPH_I_POOL_PERM;
-		ci->i_layout = info->layout;
 
 		queue_trunc = ceph_fill_file_size(inode, issued,
 					le32_to_cpu(info->truncate_seq),
diff --git a/fs/ceph/ioctl.c b/fs/ceph/ioctl.c
index f851d8d..bd4a755 100644
--- a/fs/ceph/ioctl.c
+++ b/fs/ceph/ioctl.c
@@ -21,10 +21,10 @@ static long ceph_ioctl_get_layout(struct file *file, void __user *arg)
 
 	err = ceph_do_getattr(file_inode(file), CEPH_STAT_CAP_LAYOUT, false);
 	if (!err) {
-		l.stripe_unit = ceph_file_layout_su(ci->i_layout);
-		l.stripe_count = ceph_file_layout_stripe_count(ci->i_layout);
-		l.object_size = ceph_file_layout_object_size(ci->i_layout);
-		l.data_pool = le32_to_cpu(ci->i_layout.fl_pg_pool);
+		l.stripe_unit = ci->i_layout.stripe_unit;
+		l.stripe_count = ci->i_layout.stripe_count;
+		l.object_size = ci->i_layout.object_size;
+		l.data_pool = ci->i_layout.pool_id;
 		l.preferred_osd = (s32)-1;
 		if (copy_to_user(arg, &l, sizeof(l)))
 			return -EFAULT;
@@ -82,19 +82,19 @@ static long ceph_ioctl_set_layout(struct file *file, void __user *arg)
 	if (l.stripe_count)
 		nl.stripe_count = l.stripe_count;
 	else
-		nl.stripe_count = ceph_file_layout_stripe_count(ci->i_layout);
+		nl.stripe_count = ci->i_layout.stripe_count;
 	if (l.stripe_unit)
 		nl.stripe_unit = l.stripe_unit;
 	else
-		nl.stripe_unit = ceph_file_layout_su(ci->i_layout);
+		nl.stripe_unit = ci->i_layout.stripe_unit;
 	if (l.object_size)
 		nl.object_size = l.object_size;
 	else
-		nl.object_size = ceph_file_layout_object_size(ci->i_layout);
+		nl.object_size = ci->i_layout.object_size;
 	if (l.data_pool)
 		nl.data_pool = l.data_pool;
 	else
-		nl.data_pool = ceph_file_layout_pg_pool(ci->i_layout);
+		nl.data_pool = ci->i_layout.pool_id;
 
 	/* this is obsolete, and always -1 */
 	nl.preferred_osd = le64_to_cpu(-1);
@@ -202,8 +202,8 @@ static long ceph_ioctl_get_dataloc(struct file *file, void __user *arg)
 		return -EIO;
 	}
 	dl.file_offset -= dl.object_offset;
-	dl.object_size = ceph_file_layout_object_size(ci->i_layout);
-	dl.block_size = ceph_file_layout_su(ci->i_layout);
+	dl.object_size = ci->i_layout.object_size;
+	dl.block_size = ci->i_layout.stripe_unit;
 
 	/* block_offset = object_offset % block_size */
 	tmp = dl.object_offset;
@@ -212,7 +212,7 @@ static long ceph_ioctl_get_dataloc(struct file *file, void __user *arg)
 	snprintf(dl.object_name, sizeof(dl.object_name), "%llx.%08llx",
 		 ceph_ino(inode), dl.object_no);
 
-	oloc.pool = ceph_file_layout_pg_pool(ci->i_layout);
+	oloc.pool = ci->i_layout.pool_id;
 	ceph_oid_set_name(&oid, dl.object_name);
 
 	r = ceph_oloc_oid_to_pg(osdc->osdmap, &oloc, &oid, &pgid);
diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
index ccf11ef..6eb331c 100644
--- a/fs/ceph/mds_client.h
+++ b/fs/ceph/mds_client.h
@@ -267,8 +267,8 @@ struct ceph_mds_request {
 
 struct ceph_pool_perm {
 	struct rb_node node;
-	u32 pool;
 	int perm;
+	s64 pool;
 };
 
 /*
diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c
index 1e1c00a..c431f4c 100644
--- a/fs/ceph/xattr.c
+++ b/fs/ceph/xattr.c
@@ -70,7 +70,7 @@ static size_t ceph_vxattrcb_layout(struct ceph_inode_info *ci, char *val,
 	int ret;
 	struct ceph_fs_client *fsc = ceph_sb_to_client(ci->vfs_inode.i_sb);
 	struct ceph_osd_client *osdc = &fsc->client->osdc;
-	s64 pool = ceph_file_layout_pg_pool(ci->i_layout);
+	s64 pool = ci->i_layout.pool_id;
 	const char *pool_name;
 	char buf[128];
 
@@ -80,10 +80,9 @@ static size_t ceph_vxattrcb_layout(struct ceph_inode_info *ci, char *val,
 	if (pool_name) {
 		size_t len = strlen(pool_name);
 		ret = snprintf(buf, sizeof(buf),
-		"stripe_unit=%lld stripe_count=%lld object_size=%lld pool=",
-		(unsigned long long)ceph_file_layout_su(ci->i_layout),
-		(unsigned long long)ceph_file_layout_stripe_count(ci->i_layout),
-	        (unsigned long long)ceph_file_layout_object_size(ci->i_layout));
+		"stripe_unit=%u stripe_count=%u object_size=%u pool=",
+		ci->i_layout.stripe_unit, ci->i_layout.stripe_count,
+	        ci->i_layout.object_size);
 		if (!size) {
 			ret += len;
 		} else if (ret + len > size) {
@@ -95,11 +94,9 @@ static size_t ceph_vxattrcb_layout(struct ceph_inode_info *ci, char *val,
 		}
 	} else {
 		ret = snprintf(buf, sizeof(buf),
-		"stripe_unit=%lld stripe_count=%lld object_size=%lld pool=%lld",
-		(unsigned long long)ceph_file_layout_su(ci->i_layout),
-		(unsigned long long)ceph_file_layout_stripe_count(ci->i_layout),
-	        (unsigned long long)ceph_file_layout_object_size(ci->i_layout),
-		(unsigned long long)pool);
+		"stripe_unit=%u stripe_count=%u object_size=%u pool=%lld",
+		ci->i_layout.stripe_unit, ci->i_layout.stripe_count,
+	        ci->i_layout.object_size, (unsigned long long)pool);
 		if (size) {
 			if (ret <= size)
 				memcpy(val, buf, ret);
@@ -114,22 +111,19 @@ static size_t ceph_vxattrcb_layout(struct ceph_inode_info *ci, char *val,
 static size_t ceph_vxattrcb_layout_stripe_unit(struct ceph_inode_info *ci,
 					       char *val, size_t size)
 {
-	return snprintf(val, size, "%lld",
-			(unsigned long long)ceph_file_layout_su(ci->i_layout));
+	return snprintf(val, size, "%u", ci->i_layout.stripe_unit);
 }
 
 static size_t ceph_vxattrcb_layout_stripe_count(struct ceph_inode_info *ci,
 						char *val, size_t size)
 {
-	return snprintf(val, size, "%lld",
-	       (unsigned long long)ceph_file_layout_stripe_count(ci->i_layout));
+	return snprintf(val, size, "%u", ci->i_layout.stripe_count);
 }
 
 static size_t ceph_vxattrcb_layout_object_size(struct ceph_inode_info *ci,
 					       char *val, size_t size)
 {
-	return snprintf(val, size, "%lld",
-	       (unsigned long long)ceph_file_layout_object_size(ci->i_layout));
+	return snprintf(val, size, "%u", ci->i_layout.object_size);
 }
 
 static size_t ceph_vxattrcb_layout_pool(struct ceph_inode_info *ci,
@@ -138,7 +132,7 @@ static size_t ceph_vxattrcb_layout_pool(struct ceph_inode_info *ci,
 	int ret;
 	struct ceph_fs_client *fsc = ceph_sb_to_client(ci->vfs_inode.i_sb);
 	struct ceph_osd_client *osdc = &fsc->client->osdc;
-	s64 pool = ceph_file_layout_pg_pool(ci->i_layout);
+	s64 pool = ci->i_layout.pool_id;
 	const char *pool_name;
 
 	down_read(&osdc->map_sem);
diff --git a/include/linux/ceph/ceph_fs.h b/include/linux/ceph/ceph_fs.h
index bf74005..7d8728e 100644
--- a/include/linux/ceph/ceph_fs.h
+++ b/include/linux/ceph/ceph_fs.h
@@ -34,9 +34,9 @@
 #define CEPH_MAX_MON   31
 
 /*
- * ceph_file_layout - describe data layout for a file/inode
+ * legacy ceph_file_layout
  */
-struct ceph_file_layout {
+struct ceph_file_layout_legacy {
 	/* file -> object mapping */
 	__le32 fl_stripe_unit;     /* stripe unit, in bytes.  must be multiple
 				      of page size. */
@@ -53,33 +53,25 @@ struct ceph_file_layout {
 	__le32 fl_pg_pool;      /* namespace, crush ruleset, rep level */
 } __attribute__ ((packed));
 
-#define ceph_file_layout_su(l) ((__s32)le32_to_cpu((l).fl_stripe_unit))
-#define ceph_file_layout_stripe_count(l) \
-	((__s32)le32_to_cpu((l).fl_stripe_count))
-#define ceph_file_layout_object_size(l) ((__s32)le32_to_cpu((l).fl_object_size))
-#define ceph_file_layout_cas_hash(l) ((__s32)le32_to_cpu((l).fl_cas_hash))
-#define ceph_file_layout_object_su(l) \
-	((__s32)le32_to_cpu((l).fl_object_stripe_unit))
-#define ceph_file_layout_pg_pool(l) \
-	((__s32)le32_to_cpu((l).fl_pg_pool))
-
-static inline unsigned ceph_file_layout_stripe_width(struct ceph_file_layout *l)
-{
-	return le32_to_cpu(l->fl_stripe_unit) *
-		le32_to_cpu(l->fl_stripe_count);
-}
-
-/* "period" == bytes before i start on a new set of objects */
-static inline unsigned ceph_file_layout_period(struct ceph_file_layout *l)
-{
-	return le32_to_cpu(l->fl_object_size) *
-		le32_to_cpu(l->fl_stripe_count);
-}
+/*
+ * ceph_file_layout - describe data layout for a file/inode
+ */
+struct ceph_file_layout {
+	/* file -> object mapping */
+	u32 stripe_unit;   /* stripe unit, in bytes */
+	u32 stripe_count;  /* over this many objects */
+	u32 object_size;   /* until objects are this big */
+	s64 pool_id;        /* rados pool id */
+};
+
+extern int ceph_file_layout_is_valid(const struct ceph_file_layout *layout);
+extern void ceph_file_layout_from_legacy(struct ceph_file_layout *fl,
+				struct ceph_file_layout_legacy *legacy);
+extern void ceph_file_layout_to_legacy(struct ceph_file_layout *fl,
+				struct ceph_file_layout_legacy *legacy);
 
 #define CEPH_MIN_STRIPE_UNIT 65536
 
-int ceph_file_layout_is_valid(const struct ceph_file_layout *layout);
-
 struct ceph_dir_layout {
 	__u8   dl_dir_hash;   /* see ceph_hash.h for ids */
 	__u8   dl_unused1;
@@ -382,7 +374,7 @@ union ceph_mds_request_args {
 		__le32 flags;
 	} __attribute__ ((packed)) setxattr;
 	struct {
-		struct ceph_file_layout layout;
+		struct ceph_file_layout_legacy layout;
 	} __attribute__ ((packed)) setlayout;
 	struct {
 		__u8 rule; /* currently fcntl or flock */
@@ -461,7 +453,7 @@ struct ceph_mds_reply_inode {
 	__le64 version;                /* inode version */
 	__le64 xattr_version;          /* version for xattr blob */
 	struct ceph_mds_reply_cap cap; /* caps issued for this inode */
-	struct ceph_file_layout layout;
+	struct ceph_file_layout_legacy layout;
 	struct ceph_timespec ctime, mtime, atime;
 	__le32 time_warp_seq;
 	__le64 size, max_size, truncate_size;
@@ -656,7 +648,7 @@ struct ceph_mds_caps {
 	__le64 size, max_size, truncate_size;
 	__le32 truncate_seq;
 	struct ceph_timespec mtime, atime, ctime;
-	struct ceph_file_layout layout;
+	struct ceph_file_layout_legacy layout;
 	__le32 time_warp_seq;
 } __attribute__ ((packed));
 
diff --git a/net/ceph/ceph_fs.c b/net/ceph/ceph_fs.c
index 41466cc..52c8264 100644
--- a/net/ceph/ceph_fs.c
+++ b/net/ceph/ceph_fs.c
@@ -9,9 +9,9 @@
  */
 int ceph_file_layout_is_valid(const struct ceph_file_layout *layout)
 {
-	__u32 su = le32_to_cpu(layout->fl_stripe_unit);
-	__u32 sc = le32_to_cpu(layout->fl_stripe_count);
-	__u32 os = le32_to_cpu(layout->fl_object_size);
+	__u32 su = layout->stripe_unit;
+	__u32 sc = layout->stripe_count;
+	__u32 os = layout->object_size;
 
 	/* stripe unit, object size must be non-zero, 64k increment */
 	if (!su || (su & (CEPH_MIN_STRIPE_UNIT-1)))
@@ -27,6 +27,30 @@ int ceph_file_layout_is_valid(const struct ceph_file_layout *layout)
 	return 1;
 }
 
+void ceph_file_layout_from_legacy(struct ceph_file_layout *fl,
+				  struct ceph_file_layout_legacy *legacy)
+{       
+	fl->stripe_unit = le32_to_cpu(legacy->fl_stripe_unit);
+	fl->stripe_count = le32_to_cpu(legacy->fl_stripe_count);
+	fl->object_size = le32_to_cpu(legacy->fl_object_size);
+	fl->pool_id = le64_to_cpu(legacy->fl_pg_pool);
+	if (fl->pool_id == 0) 
+		fl->pool_id = -1;
+}
+EXPORT_SYMBOL(ceph_file_layout_from_legacy);
+
+void ceph_file_layout_to_legacy(struct ceph_file_layout *fl,
+				struct ceph_file_layout_legacy *legacy)
+{       
+	legacy->fl_stripe_unit = cpu_to_le32(fl->stripe_unit);
+	legacy->fl_stripe_count = cpu_to_le32(fl->stripe_count);
+	legacy->fl_object_size = cpu_to_le32(fl->object_size);
+	if (fl->pool_id >= 0)
+		legacy->fl_pg_pool = cpu_to_le64(fl->pool_id);
+	else    
+		legacy->fl_pg_pool = 0;
+}
+EXPORT_SYMBOL(ceph_file_layout_to_legacy);
 
 int ceph_flags_to_mode(int flags)
 {
diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c
index fb9da42..450955e 100644
--- a/net/ceph/osd_client.c
+++ b/net/ceph/osd_client.c
@@ -848,7 +848,7 @@ struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc,
 	if (opcode == CEPH_OSD_OP_CREATE || opcode == CEPH_OSD_OP_DELETE) {
 		osd_req_op_init(req, which, opcode, 0);
 	} else {
-		u32 object_size = le32_to_cpu(layout->fl_object_size);
+		u32 object_size = layout->object_size;
 		u32 object_base = off - objoff;
 		if (!(truncate_seq == 1 && truncate_size == -1ULL)) {
 			if (truncate_size <= object_base) {
@@ -863,7 +863,7 @@ struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc,
 				       truncate_size, truncate_seq);
 	}
 
-	req->r_base_oloc.pool = ceph_file_layout_pg_pool(*layout);
+	req->r_base_oloc.pool = layout->pool_id;
 
 	snprintf(req->r_base_oid.name, sizeof(req->r_base_oid.name),
 		 "%llx.%08llx", vino.ino, objnum);
diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c
index 243574c..f033ca5 100644
--- a/net/ceph/osdmap.c
+++ b/net/ceph/osdmap.c
@@ -1397,9 +1397,9 @@ int ceph_calc_file_object_mapping(struct ceph_file_layout *layout,
 				   u64 *ono,
 				   u64 *oxoff, u64 *oxlen)
 {
-	u32 osize = le32_to_cpu(layout->fl_object_size);
-	u32 su = le32_to_cpu(layout->fl_stripe_unit);
-	u32 sc = le32_to_cpu(layout->fl_stripe_count);
+	u32 osize = layout->object_size;
+	u32 su = layout->stripe_unit;
+	u32 sc = layout->stripe_count;
 	u32 bl, stripeno, stripepos, objsetno;
 	u32 su_per_object;
 	u64 t, su_offset;
-- 
2.5.0



* [PATCH v2 2/6] libceph: introduce reference counted string
  2016-02-06  7:00 [PATCH v2 0/6] Rados pool namespace support Yan, Zheng
  2016-02-06  7:00 ` [PATCH v2 1/6] libceph: define new ceph_file_layout structure Yan, Zheng
@ 2016-02-06  7:00 ` Yan, Zheng
  2016-03-22  6:05   ` Ilya Dryomov
  2016-03-23 15:39   ` Douglas Fuller
  2016-02-06  7:00 ` [PATCH v2 3/6] libceph: rados pool namespace support Yan, Zheng
                   ` (3 subsequent siblings)
  5 siblings, 2 replies; 33+ messages in thread
From: Yan, Zheng @ 2016-02-06  7:00 UTC (permalink / raw)
  To: ceph-devel; +Cc: sweil, idryomov, Yan, Zheng

This data structure stores namespace strings. It allows a namespace string
to be shared by cephfs inodes with the same layout.
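
A minimal usage sketch of the interface added below (the function names are
the ones introduced by this patch; error handling trimmed). RCU readers use
the ceph_try_get_string() macro, which only takes a reference if the kref has
not already dropped to zero:

	/* intern a string: identical strings share one refcounted entry */
	struct ceph_string *ns = ceph_find_or_create_string("ns1", 3);
	if (ns) {
		/* a second lookup returns the same entry with its kref bumped */
		struct ceph_string *ns2 = ceph_find_or_create_string("ns1", 3);

		pr_info("len=%zu str=%s shared=%d\n",
			ns->len, ns->str, ns == ns2);

		ceph_put_string(ns2);	/* drop the extra reference */
		ceph_put_string(ns);	/* last put erases the rbtree entry and
					 * frees the string after a grace period */
	}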

Signed-off-by: Yan, Zheng <zyan@redhat.com>
---
 include/linux/ceph/libceph.h      |   1 +
 include/linux/ceph/string_table.h |  58 ++++++++++++++++++
 net/ceph/Makefile                 |   2 +-
 net/ceph/ceph_common.c            |   1 +
 net/ceph/string_table.c           | 121 ++++++++++++++++++++++++++++++++++++++
 5 files changed, 182 insertions(+), 1 deletion(-)
 create mode 100644 include/linux/ceph/string_table.h
 create mode 100644 net/ceph/string_table.c

diff --git a/include/linux/ceph/libceph.h b/include/linux/ceph/libceph.h
index e7975e4..8e9868d 100644
--- a/include/linux/ceph/libceph.h
+++ b/include/linux/ceph/libceph.h
@@ -21,6 +21,7 @@
 #include <linux/ceph/mon_client.h>
 #include <linux/ceph/osd_client.h>
 #include <linux/ceph/ceph_fs.h>
+#include <linux/ceph/string_table.h>
 
 /*
  * mount options
diff --git a/include/linux/ceph/string_table.h b/include/linux/ceph/string_table.h
new file mode 100644
index 0000000..427b6f9
--- /dev/null
+++ b/include/linux/ceph/string_table.h
@@ -0,0 +1,58 @@
+#ifndef _FS_CEPH_STRING_TABLE_H
+#define _FS_CEPH_STRING_TABLE_H
+
+#include <linux/types.h>
+#include <linux/kref.h>
+#include <linux/rbtree.h>
+#include <linux/rcupdate.h>
+
+struct ceph_string {
+	struct kref kref;
+	union {
+		struct rb_node node;
+		struct rcu_head rcu;
+	};
+	size_t len;
+	char str[];
+};
+
+extern void ceph_release_string(struct kref *ref);
+extern struct ceph_string *ceph_find_or_create_string(const char *str,
+						      size_t len);
+extern void ceph_string_cleanup(void);
+
+static inline void ceph_get_string(struct ceph_string *str)
+{
+	kref_get(&str->kref);
+}
+
+static inline void ceph_put_string(struct ceph_string *str)
+{
+	if (!str)
+		return;
+	kref_put(&str->kref, ceph_release_string);
+}
+
+static inline int ceph_compare_string(struct ceph_string *cs,
+				      const char* str, size_t len)
+{
+	size_t cs_len = cs ? cs->len : 0;
+	if (cs_len != len)
+		return cs_len - len;
+	if (len == 0)
+		return 0;
+	return strncmp(cs->str, str, len);
+}
+
+#define ceph_try_get_string(x)					\
+({								\
+	struct ceph_string *___str;				\
+	rcu_read_lock();					\
+	___str = rcu_dereference(x);				\
+	if (___str && !kref_get_unless_zero(&___str->kref))	\
+		___str = NULL;					\
+	rcu_read_unlock();					\
+	(___str);						\
+})
+
+#endif
diff --git a/net/ceph/Makefile b/net/ceph/Makefile
index 958d9856..84cbed6 100644
--- a/net/ceph/Makefile
+++ b/net/ceph/Makefile
@@ -11,5 +11,5 @@ libceph-y := ceph_common.o messenger.o msgpool.o buffer.o pagelist.o \
 	crypto.o armor.o \
 	auth_x.o \
 	ceph_fs.o ceph_strings.o ceph_hash.o \
-	pagevec.o snapshot.o
+	pagevec.o snapshot.o string_table.o
 
diff --git a/net/ceph/ceph_common.c b/net/ceph/ceph_common.c
index dcc18c6..baca649 100644
--- a/net/ceph/ceph_common.c
+++ b/net/ceph/ceph_common.c
@@ -751,6 +751,7 @@ static void __exit exit_ceph_lib(void)
 	ceph_msgr_exit();
 	ceph_crypto_shutdown();
 	ceph_debugfs_cleanup();
+	ceph_string_cleanup();
 }
 
 module_init(init_ceph_lib);
diff --git a/net/ceph/string_table.c b/net/ceph/string_table.c
new file mode 100644
index 0000000..bb49bd7
--- /dev/null
+++ b/net/ceph/string_table.c
@@ -0,0 +1,121 @@
+#include <linux/slab.h>
+#include <linux/gfp.h>
+#include <linux/string.h>
+#include <linux/spinlock.h>
+#include <linux/ceph/string_table.h>
+
+static DEFINE_SPINLOCK(string_tree_lock);
+static struct rb_root string_tree = RB_ROOT;
+
+struct ceph_string *ceph_find_or_create_string(const char* str, size_t len)
+{
+	struct ceph_string *cs, *exist;
+	struct rb_node **p, *parent;
+	int ret;
+
+	exist = NULL;
+	spin_lock(&string_tree_lock);
+	p = &string_tree.rb_node;
+	while (*p) {
+		exist = rb_entry(*p, struct ceph_string, node);
+		ret = ceph_compare_string(exist, str, len);
+		if (ret > 0)
+			p = &(*p)->rb_left;
+		else if (ret < 0)
+			p = &(*p)->rb_right;
+		else
+			break;
+		exist = NULL;
+	}
+	if (exist && !kref_get_unless_zero(&exist->kref)) {
+		rb_erase(&exist->node, &string_tree);
+		RB_CLEAR_NODE(&exist->node);
+		exist = NULL;
+	}
+	spin_unlock(&string_tree_lock);
+	if (exist)
+		return exist;
+
+	cs = kmalloc(sizeof(*cs) + len + 1, GFP_NOFS);
+	if (!cs)
+		return NULL;
+
+	kref_init(&cs->kref);
+	cs->len = len;
+	memcpy(cs->str, str, len);
+	cs->str[len] = 0;
+
+retry:
+	exist = NULL;
+	parent = NULL;
+	p = &string_tree.rb_node;
+	spin_lock(&string_tree_lock);
+	while (*p) {
+		parent = *p;
+		exist = rb_entry(*p, struct ceph_string, node);
+		ret = ceph_compare_string(exist, str, len);
+		if (ret > 0)
+			p = &(*p)->rb_left;
+		else if (ret < 0)
+			p = &(*p)->rb_right;
+		else
+			break;
+		exist = NULL;
+	}
+	ret = 0;
+	if (!exist) {
+		rb_link_node(&cs->node, parent, p);
+		rb_insert_color(&cs->node, &string_tree);
+	} else if (!kref_get_unless_zero(&exist->kref)) {
+		rb_erase(&exist->node, &string_tree);
+		RB_CLEAR_NODE(&exist->node);
+		ret = -EAGAIN;
+	}
+	spin_unlock(&string_tree_lock);
+	if (ret == -EAGAIN)
+		goto retry;
+
+	if (exist) {
+		kfree(cs);
+		cs = exist;
+	}
+
+	return cs;
+}
+EXPORT_SYMBOL(ceph_find_or_create_string);
+
+static void ceph_free_string(struct rcu_head *head)
+{
+	struct ceph_string *cs = container_of(head, struct ceph_string, rcu);
+	kfree(cs);
+}
+
+void ceph_release_string(struct kref *ref)
+{
+	struct ceph_string *cs = container_of(ref, struct ceph_string, kref);
+
+	spin_lock(&string_tree_lock);
+	if (!RB_EMPTY_NODE(&cs->node)) {
+		rb_erase(&cs->node, &string_tree);
+		RB_CLEAR_NODE(&cs->node);
+	}
+	spin_unlock(&string_tree_lock);
+
+	call_rcu(&cs->rcu, ceph_free_string);
+}
+EXPORT_SYMBOL(ceph_release_string);
+
+void ceph_string_cleanup(void)
+{
+	struct rb_node *p;
+	struct ceph_string *cs;
+	if (RB_EMPTY_ROOT(&string_tree))
+		return;
+
+	pr_err("libceph: detect string leaks\n");
+	while ((p = rb_first(&string_tree))) {
+		cs = rb_entry(p, struct ceph_string, node);
+		rb_erase(p, &string_tree);
+		kfree(cs);
+	}
+}
-- 
2.5.0



* [PATCH v2 3/6] libceph: rados pool namespace support
  2016-02-06  7:00 [PATCH v2 0/6] Rados pool namespace support Yan, Zheng
  2016-02-06  7:00 ` [PATCH v2 1/6] libceph: define new ceph_file_layout structure Yan, Zheng
  2016-02-06  7:00 ` [PATCH v2 2/6] libceph: introduce reference counted string Yan, Zheng
@ 2016-02-06  7:00 ` Yan, Zheng
  2016-03-22  6:11   ` Ilya Dryomov
  2016-02-06  7:00 ` [PATCH v2 4/6] libceph: make sure redirect does not change namespace Yan, Zheng
                   ` (2 subsequent siblings)
  5 siblings, 1 reply; 33+ messages in thread
From: Yan, Zheng @ 2016-02-06  7:00 UTC (permalink / raw)
  To: ceph-devel; +Cc: sweil, idryomov, Yan, Zheng

Signed-off-by: Yan, Zheng <zyan@redhat.com>
---
 drivers/block/rbd.c          |  1 +
 fs/ceph/inode.c              |  3 +++
 include/linux/ceph/ceph_fs.h |  2 ++
 include/linux/ceph/osdmap.h  |  2 ++
 net/ceph/osd_client.c        | 37 ++++++++++++++++++++++++++-----------
 net/ceph/osdmap.c            | 33 +++++++++++++++++++++++++++------
 6 files changed, 61 insertions(+), 17 deletions(-)

diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index b0bcb2d..0423493 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -4088,6 +4088,7 @@ static struct rbd_device *rbd_dev_create(struct rbd_client *rbdc,
 	rbd_dev->layout.stripe_count = 1;
 	rbd_dev->layout.object_size = 1 << RBD_MAX_OBJ_ORDER;
 	rbd_dev->layout.pool_id = spec->pool_id;
+	RCU_INIT_POINTER(rbd_dev->layout.pool_ns, NULL);
 
 	/*
 	 * If this is a mapping rbd_dev (as opposed to a parent one),
diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
index b0ad53d..3c220f1 100644
--- a/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@ -396,6 +396,7 @@ struct inode *ceph_alloc_inode(struct super_block *sb)
 	ci->i_symlink = NULL;
 
 	memset(&ci->i_dir_layout, 0, sizeof(ci->i_dir_layout));
+	RCU_INIT_POINTER(ci->i_layout.pool_ns, NULL);
 
 	ci->i_fragtree = RB_ROOT;
 	mutex_init(&ci->i_fragtree_mutex);
@@ -518,6 +519,8 @@ void ceph_destroy_inode(struct inode *inode)
 	if (ci->i_xattrs.prealloc_blob)
 		ceph_buffer_put(ci->i_xattrs.prealloc_blob);
 
+	ceph_put_string(ci->i_layout.pool_ns);
+
 	call_rcu(&inode->i_rcu, ceph_i_callback);
 }
 
diff --git a/include/linux/ceph/ceph_fs.h b/include/linux/ceph/ceph_fs.h
index 7d8728e..3858923 100644
--- a/include/linux/ceph/ceph_fs.h
+++ b/include/linux/ceph/ceph_fs.h
@@ -53,6 +53,7 @@ struct ceph_file_layout_legacy {
 	__le32 fl_pg_pool;      /* namespace, crush ruleset, rep level */
 } __attribute__ ((packed));
 
+struct ceph_string;
 /*
  * ceph_file_layout - describe data layout for a file/inode
  */
@@ -62,6 +63,7 @@ struct ceph_file_layout {
 	u32 stripe_count;  /* over this many objects */
 	u32 object_size;   /* until objects are this big */
 	s64 pool_id;        /* rados pool id */
+	struct ceph_string __rcu *pool_ns; /* rados pool namespace */
 };
 
 extern int ceph_file_layout_is_valid(const struct ceph_file_layout *layout);
diff --git a/include/linux/ceph/osdmap.h b/include/linux/ceph/osdmap.h
index e55c08b..3d59d6c 100644
--- a/include/linux/ceph/osdmap.h
+++ b/include/linux/ceph/osdmap.h
@@ -55,6 +55,7 @@ static inline bool ceph_can_shift_osds(struct ceph_pg_pool_info *pool)
 
 struct ceph_object_locator {
 	s64 pool;
+	struct ceph_string *pool_ns;
 };
 
 /*
@@ -63,6 +64,7 @@ struct ceph_object_locator {
  * (probably outdated: must be >= RBD_MAX_MD_NAME_LEN -- currently 100)
  */
 #define CEPH_MAX_OID_NAME_LEN 100
+#define CEPH_MAX_NAMESPACE_LEN 100
 
 struct ceph_object_id {
 	char name[CEPH_MAX_OID_NAME_LEN];
diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c
index 450955e..68e7f68 100644
--- a/net/ceph/osd_client.c
+++ b/net/ceph/osd_client.c
@@ -339,6 +339,8 @@ static void ceph_osdc_release_request(struct kref *kref)
 		kfree(req->r_ops);
 
 	ceph_put_snap_context(req->r_snapc);
+	ceph_put_string(req->r_base_oloc.pool_ns);
+
 	if (req->r_mempool)
 		mempool_free(req, req->r_osdc->req_mempool);
 	else
@@ -388,6 +390,9 @@ struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc,
 	req->r_num_ops = 0;
 	req->r_max_ops = num_ops;
 
+	req->r_base_oloc.pool = -1;
+	req->r_target_oloc.pool = -1;
+
 	if (num_ops <= CEPH_OSD_INITIAL_OP) {
 		req->r_ops = req->r_inline_ops;
 	} else {
@@ -409,9 +414,6 @@ struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc,
 	INIT_LIST_HEAD(&req->r_req_lru_item);
 	INIT_LIST_HEAD(&req->r_osd_item);
 
-	req->r_base_oloc.pool = -1;
-	req->r_target_oloc.pool = -1;
-
 	/* create reply message */
 	msg_size = OSD_OPREPLY_FRONT_LEN;
 	if (num_ops > CEPH_OSD_INITIAL_OP) {
@@ -433,7 +435,7 @@ struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc,
 
 	/* create request message; allow space for oid */
 	msg_size = 4 + 4 + 8 + 8 + 4 + 8;
-	msg_size += 2 + 4 + 8 + 4 + 4; /* oloc */
+	msg_size += 2 + 4 + 8 + 4 + 4 + 4 + CEPH_MAX_NAMESPACE_LEN; /* oloc */
 	msg_size += 1 + 8 + 4 + 4;     /* pg_t */
 	msg_size += 4 + CEPH_MAX_OID_NAME_LEN; /* oid */
 	msg_size += 2 + num_ops * sizeof(struct ceph_osd_op);
@@ -864,6 +866,7 @@ struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc,
 	}
 
 	req->r_base_oloc.pool = layout->pool_id;
+	req->r_base_oloc.pool_ns = ceph_try_get_string(layout->pool_ns);
 
 	snprintf(req->r_base_oid.name, sizeof(req->r_base_oid.name),
 		 "%llx.%08llx", vino.ino, objnum);
@@ -1719,10 +1722,10 @@ static int ceph_oloc_decode(void **p, void *end,
 	}
 
 	if (struct_v >= 5) {
-		len = ceph_decode_32(p);
-		if (len > 0) {
-			pr_warn("ceph_object_locator::nspace is set\n");
-			goto e_inval;
+		u32 ns_len = ceph_decode_32(p);
+		if (ns_len > 0) {
+			ceph_decode_need(p, end, ns_len, e_inval);
+			*p += ns_len;
 		}
 	}
 
@@ -1907,7 +1910,7 @@ static void handle_reply(struct ceph_osd_client *osdc, struct ceph_msg *msg)
 
 		__unregister_request(osdc, req);
 
-		req->r_target_oloc = redir.oloc; /* struct */
+		req->r_target_oloc.pool = redir.oloc.pool;
 
 		/*
 		 * Start redirect requests with nofail=true.  If
@@ -2459,6 +2462,7 @@ void ceph_osdc_build_request(struct ceph_osd_request *req, u64 off,
 				struct timespec *mtime)
 {
 	struct ceph_msg *msg = req->r_request;
+	struct ceph_string *pool_ns;
 	void *p;
 	size_t msg_size;
 	int flags = req->r_flags;
@@ -2483,14 +2487,25 @@ void ceph_osdc_build_request(struct ceph_osd_request *req, u64 off,
 	req->r_request_reassert_version = p;
 	p += sizeof(struct ceph_eversion); /* will get filled in */
 
+	if (req->r_base_oloc.pool_ns)
+		pool_ns = req->r_base_oloc.pool_ns;
+	else
+		pool_ns = NULL;
+
 	/* oloc */
+	ceph_encode_8(&p, 5);
 	ceph_encode_8(&p, 4);
-	ceph_encode_8(&p, 4);
-	ceph_encode_32(&p, 8 + 4 + 4);
+	ceph_encode_32(&p, 8 + 4 + 4 + 4 + (pool_ns ? pool_ns->len : 0));
 	req->r_request_pool = p;
 	p += 8;
 	ceph_encode_32(&p, -1);  /* preferred */
 	ceph_encode_32(&p, 0);   /* key len */
+	if (pool_ns) {
+		ceph_encode_32(&p, pool_ns->len);
+		ceph_encode_copy(&p, pool_ns->str, pool_ns->len);
+	} else {
+		ceph_encode_32(&p, 0);
+	}
 
 	ceph_encode_8(&p, 1);
 	req->r_request_pgid = p;
diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c
index f033ca5..f117848 100644
--- a/net/ceph/osdmap.c
+++ b/net/ceph/osdmap.c
@@ -1470,12 +1470,33 @@ int ceph_oloc_oid_to_pg(struct ceph_osdmap *osdmap,
 	if (!pi)
 		return -EIO;
 
-	pg_out->pool = oloc->pool;
-	pg_out->seed = ceph_str_hash(pi->object_hash, oid->name,
-				     oid->name_len);
-
-	dout("%s '%.*s' pgid %llu.%x\n", __func__, oid->name_len, oid->name,
-	     pg_out->pool, pg_out->seed);
+	if (!oloc->pool_ns) {
+		pg_out->pool = oloc->pool;
+		pg_out->seed = ceph_str_hash(pi->object_hash, oid->name,
+					     oid->name_len);
+		dout("%s '%.*s' pgid %llu.%x\n", __func__,
+		     oid->name_len, oid->name, pg_out->pool, pg_out->seed);
+	} else {
+		char stack_buf[256];
+		char *buf = stack_buf;
+		int nsl = oloc->pool_ns->len;
+		size_t total = nsl + 1 + oid->name_len;
+		if (total > sizeof(stack_buf)) {
+			buf = kmalloc(total, GFP_NOFS);
+			if (!buf)
+				return -ENOMEM;
+		}
+		memcpy(buf, oloc->pool_ns->str, nsl);
+		buf[nsl] = '\037';
+		memcpy(buf + nsl + 1, oid->name, oid->name_len);
+		pg_out->pool = oloc->pool;
+		pg_out->seed = ceph_str_hash(pi->object_hash, buf, total);
+		if (buf != stack_buf)
+			kfree(buf);
+		dout("%s '%.*s' ns '%.*s' pgid %llu.%x\n", __func__,
+		     oid->name_len, oid->name, nsl, oloc->pool_ns->str,
+		     pg_out->pool, pg_out->seed);
+	}
 	return 0;
 }
 EXPORT_SYMBOL(ceph_oloc_oid_to_pg);
-- 
2.5.0



* [PATCH v2 4/6] libceph: make sure redirect does not change namespace
  2016-02-06  7:00 [PATCH v2 0/6] Rados pool namespace support Yan, Zheng
                   ` (2 preceding siblings ...)
  2016-02-06  7:00 ` [PATCH v2 3/6] libceph: rados pool namespace support Yan, Zheng
@ 2016-02-06  7:00 ` Yan, Zheng
  2016-02-06  7:00 ` [PATCH v2 5/6] libceph: calculate request message size according to namespace Yan, Zheng
  2016-02-06  7:00 ` [PATCH v2 6/6] ceph: rados pool namespace support Yan, Zheng
  5 siblings, 0 replies; 33+ messages in thread
From: Yan, Zheng @ 2016-02-06  7:00 UTC (permalink / raw)
  To: ceph-devel; +Cc: sweil, idryomov, Yan, Zheng

Signed-off-by: Yan, Zheng <zyan@redhat.com>
---
 net/ceph/osd_client.c | 15 +++++++++++++++
 1 file changed, 15 insertions(+)

diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c
index 68e7f68..136245d 100644
--- a/net/ceph/osd_client.c
+++ b/net/ceph/osd_client.c
@@ -1722,10 +1722,23 @@ static int ceph_oloc_decode(void **p, void *end,
 	}
 
 	if (struct_v >= 5) {
+		bool changed = false;
 		u32 ns_len = ceph_decode_32(p);
 		if (ns_len > 0) {
 			ceph_decode_need(p, end, ns_len, e_inval);
+			if (oloc->pool != -1 &&
+			    (!oloc->pool_ns ||
+			     ceph_compare_string(oloc->pool_ns, *p, ns_len)))
+				changed = true;
 			*p += ns_len;
+		} else {
+			if (oloc->pool != -1 && oloc->pool_ns)
+				changed = true;
+		}
+		if (changed) {
+			/* redirect changes namespace */
+			pr_warn("ceph_object_locator::nspace is changed\n");
+			goto e_inval;
 		}
 	}
 
@@ -1898,7 +1911,9 @@ static void handle_reply(struct ceph_osd_client *osdc, struct ceph_msg *msg)
 	}
 
 	if (decode_redir) {
+		redir.oloc.pool_ns = req->r_target_oloc.pool_ns;
 		err = ceph_redirect_decode(&p, end, &redir);
+		redir.oloc.pool_ns = NULL;
 		if (err)
 			goto bad_put;
 	} else {
-- 
2.5.0



* [PATCH v2 5/6] libceph: calculate request message size according to namespace
  2016-02-06  7:00 [PATCH v2 0/6] Rados pool namespace support Yan, Zheng
                   ` (3 preceding siblings ...)
  2016-02-06  7:00 ` [PATCH v2 4/6] libceph: make sure redirect does not change namespace Yan, Zheng
@ 2016-02-06  7:00 ` Yan, Zheng
  2016-02-06  7:00 ` [PATCH v2 6/6] ceph: rados pool namespace support Yan, Zheng
  5 siblings, 0 replies; 33+ messages in thread
From: Yan, Zheng @ 2016-02-06  7:00 UTC (permalink / raw)
  To: ceph-devel; +Cc: sweil, idryomov, Yan, Zheng

Signed-off-by: Yan, Zheng <zyan@redhat.com>
---
 drivers/block/rbd.c             |  4 ++--
 fs/ceph/addr.c                  |  4 ++--
 fs/ceph/file.c                  |  5 +++--
 include/linux/ceph/osd_client.h |  2 ++
 include/linux/ceph/osdmap.h     |  1 -
 net/ceph/osd_client.c           | 20 +++++++++++++-------
 6 files changed, 22 insertions(+), 14 deletions(-)

diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index 0423493..669fc70 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -1954,7 +1954,7 @@ static struct ceph_osd_request *rbd_osd_req_create(
 	/* Allocate and initialize the request, for the num_ops ops */
 
 	osdc = &rbd_dev->rbd_client->client->osdc;
-	osd_req = ceph_osdc_alloc_request(osdc, snapc, num_ops, false,
+	osd_req = ceph_osdc_alloc_request(osdc, snapc, NULL, num_ops, false,
 					  GFP_ATOMIC);
 	if (!osd_req)
 		return NULL;	/* ENOMEM */
@@ -2003,7 +2003,7 @@ rbd_osd_req_create_copyup(struct rbd_obj_request *obj_request)
 	snapc = img_request->snapc;
 	rbd_dev = img_request->rbd_dev;
 	osdc = &rbd_dev->rbd_client->client->osdc;
-	osd_req = ceph_osdc_alloc_request(osdc, snapc, num_osd_ops,
+	osd_req = ceph_osdc_alloc_request(osdc, snapc, NULL, num_osd_ops,
 						false, GFP_ATOMIC);
 	if (!osd_req)
 		return NULL;	/* ENOMEM */
diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
index 6047395..d5ea244 100644
--- a/fs/ceph/addr.c
+++ b/fs/ceph/addr.c
@@ -1750,7 +1750,7 @@ static int __ceph_pool_perm_get(struct ceph_inode_info *ci, s64 pool)
 	}
 
 	rd_req = ceph_osdc_alloc_request(&fsc->client->osdc,
-					 ceph_empty_snapc,
+					 ceph_empty_snapc, NULL,
 					 1, false, GFP_NOFS);
 	if (!rd_req) {
 		err = -ENOMEM;
@@ -1765,7 +1765,7 @@ static int __ceph_pool_perm_get(struct ceph_inode_info *ci, s64 pool)
 	rd_req->r_base_oid.name_len = strlen(rd_req->r_base_oid.name);
 
 	wr_req = ceph_osdc_alloc_request(&fsc->client->osdc,
-					 ceph_empty_snapc,
+					 ceph_empty_snapc, NULL,
 					 1, false, GFP_NOFS);
 	if (!wr_req) {
 		err = -ENOMEM;
diff --git a/fs/ceph/file.c b/fs/ceph/file.c
index dc6dc5b..a5565f7 100644
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -696,8 +696,9 @@ static void ceph_aio_retry_work(struct work_struct *work)
 	}
 	spin_unlock(&ci->i_ceph_lock);
 
-	req = ceph_osdc_alloc_request(orig_req->r_osdc, snapc, 2,
-			false, GFP_NOFS);
+	req = ceph_osdc_alloc_request(orig_req->r_osdc, snapc,
+				      orig_req->r_base_oloc.pool_ns,
+				      2, false, GFP_NOFS);
 	if (!req) {
 		ret = -ENOMEM;
 		req = orig_req;
diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h
index 4df2fc5..4afb23e 100644
--- a/include/linux/ceph/osd_client.h
+++ b/include/linux/ceph/osd_client.h
@@ -14,6 +14,7 @@
 
 struct ceph_msg;
 struct ceph_snap_context;
+struct ceph_string;
 struct ceph_osd_request;
 struct ceph_osd_client;
 struct ceph_authorizer;
@@ -321,6 +322,7 @@ extern void osd_req_op_alloc_hint_init(struct ceph_osd_request *osd_req,
 
 extern struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc,
 					       struct ceph_snap_context *snapc,
+					       struct ceph_string *pool_ns,
 					       unsigned int num_ops,
 					       bool use_mempool,
 					       gfp_t gfp_flags);
diff --git a/include/linux/ceph/osdmap.h b/include/linux/ceph/osdmap.h
index 3d59d6c..c09970f 100644
--- a/include/linux/ceph/osdmap.h
+++ b/include/linux/ceph/osdmap.h
@@ -64,7 +64,6 @@ struct ceph_object_locator {
  * (probably outdated: must be >= RBD_MAX_MD_NAME_LEN -- currently 100)
  */
 #define CEPH_MAX_OID_NAME_LEN 100
-#define CEPH_MAX_NAMESPACE_LEN 100
 
 struct ceph_object_id {
 	char name[CEPH_MAX_OID_NAME_LEN];
diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c
index 136245d..b21d774 100644
--- a/net/ceph/osd_client.c
+++ b/net/ceph/osd_client.c
@@ -365,6 +365,7 @@ EXPORT_SYMBOL(ceph_osdc_put_request);
 
 struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc,
 					       struct ceph_snap_context *snapc,
+					       struct ceph_string *pool_ns,
 					       unsigned int num_ops,
 					       bool use_mempool,
 					       gfp_t gfp_flags)
@@ -435,7 +436,8 @@ struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc,
 
 	/* create request message; allow space for oid */
 	msg_size = 4 + 4 + 8 + 8 + 4 + 8;
-	msg_size += 2 + 4 + 8 + 4 + 4 + 4 + CEPH_MAX_NAMESPACE_LEN; /* oloc */
+	msg_size += 2 + 4 + 8 + 4 + 4 + 4 +
+		    (pool_ns ? pool_ns->len : 0); /* oloc */
 	msg_size += 1 + 8 + 4 + 4;     /* pg_t */
 	msg_size += 4 + CEPH_MAX_OID_NAME_LEN; /* oid */
 	msg_size += 2 + num_ops * sizeof(struct ceph_osd_op);
@@ -824,6 +826,7 @@ struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc,
 					       bool use_mempool)
 {
 	struct ceph_osd_request *req;
+	struct ceph_string *pool_ns;
 	u64 objnum = 0;
 	u64 objoff = 0;
 	u64 objlen = 0;
@@ -833,12 +836,18 @@ struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc,
 	       opcode != CEPH_OSD_OP_ZERO && opcode != CEPH_OSD_OP_TRUNCATE &&
 	       opcode != CEPH_OSD_OP_CREATE && opcode != CEPH_OSD_OP_DELETE);
 
-	req = ceph_osdc_alloc_request(osdc, snapc, num_ops, use_mempool,
-					GFP_NOFS);
-	if (!req)
+	pool_ns = ceph_try_get_string(layout->pool_ns);
+
+	req = ceph_osdc_alloc_request(osdc, snapc, pool_ns, num_ops,
+				      use_mempool, GFP_NOFS);
+	if (!req) {
+		ceph_put_string(pool_ns);
 		return ERR_PTR(-ENOMEM);
+	}
 
 	req->r_flags = flags;
+	req->r_base_oloc.pool = layout->pool_id;
+	req->r_base_oloc.pool_ns = pool_ns;
 
 	/* calculate max write size */
 	r = calc_layout(layout, off, plen, &objnum, &objoff, &objlen);
@@ -865,9 +874,6 @@ struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc,
 				       truncate_size, truncate_seq);
 	}
 
-	req->r_base_oloc.pool = layout->pool_id;
-	req->r_base_oloc.pool_ns = ceph_try_get_string(layout->pool_ns);
-
 	snprintf(req->r_base_oid.name, sizeof(req->r_base_oid.name),
 		 "%llx.%08llx", vino.ino, objnum);
 	req->r_base_oid.name_len = strlen(req->r_base_oid.name);
-- 
2.5.0



* [PATCH v2 6/6] ceph: rados pool namespace support
  2016-02-06  7:00 [PATCH v2 0/6] Rados pool namespace support Yan, Zheng
                   ` (4 preceding siblings ...)
  2016-02-06  7:00 ` [PATCH v2 5/6] libceph: calculate request message size according to namespace Yan, Zheng
@ 2016-02-06  7:00 ` Yan, Zheng
  5 siblings, 0 replies; 33+ messages in thread
From: Yan, Zheng @ 2016-02-06  7:00 UTC (permalink / raw)
  To: ceph-devel; +Cc: sweil, idryomov, Yan, Zheng

Signed-off-by: Yan, Zheng <zyan@redhat.com>
---
 fs/ceph/addr.c                     | 71 +++++++++++++++++++++++++++++++-------
 fs/ceph/caps.c                     | 50 ++++++++++++++++++++-------
 fs/ceph/file.c                     |  4 ++-
 fs/ceph/inode.c                    | 16 ++++++++-
 fs/ceph/mds_client.c               | 11 ++++++
 fs/ceph/mds_client.h               |  4 +++
 fs/ceph/xattr.c                    | 66 +++++++++++++++++++++++++----------
 include/linux/ceph/ceph_features.h |  1 +
 net/ceph/ceph_fs.c                 |  1 +
 9 files changed, 179 insertions(+), 45 deletions(-)

diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
index d5ea244..a14b148 100644
--- a/fs/ceph/addr.c
+++ b/fs/ceph/addr.c
@@ -1701,7 +1701,8 @@ enum {
 	POOL_WRITE	= 2,
 };
 
-static int __ceph_pool_perm_get(struct ceph_inode_info *ci, s64 pool)
+static int __ceph_pool_perm_get(struct ceph_inode_info *ci,
+				s64 pool, struct ceph_string *pool_ns)
 {
 	struct ceph_fs_client *fsc = ceph_inode_to_client(&ci->vfs_inode);
 	struct ceph_mds_client *mdsc = fsc->mdsc;
@@ -1709,6 +1710,7 @@ static int __ceph_pool_perm_get(struct ceph_inode_info *ci, s64 pool)
 	struct rb_node **p, *parent;
 	struct ceph_pool_perm *perm;
 	struct page **pages;
+	size_t pool_ns_len;
 	int err = 0, err2 = 0, have = 0;
 
 	down_read(&mdsc->pool_perm_rwsem);
@@ -1720,17 +1722,31 @@ static int __ceph_pool_perm_get(struct ceph_inode_info *ci, s64 pool)
 		else if (pool > perm->pool)
 			p = &(*p)->rb_right;
 		else {
-			have = perm->perm;
-			break;
+			int ret = ceph_compare_string(pool_ns,
+						perm->pool_ns,
+						perm->pool_ns_len);
+			if (ret < 0)
+				p = &(*p)->rb_left;
+			else if (ret > 0)
+				p = &(*p)->rb_right;
+			else {
+				have = perm->perm;
+				break;
+			}
 		}
 	}
 	up_read(&mdsc->pool_perm_rwsem);
 	if (*p)
 		goto out;
 
-	dout("__ceph_pool_perm_get pool %lld no perm cached\n", pool);
+	if (pool_ns)
+		dout("__ceph_pool_perm_get pool %lld ns %.*s no perm cached\n",
+		     pool, (int)pool_ns->len, pool_ns->str);
+	else
+		dout("__ceph_pool_perm_get pool %lld no perm cached\n", pool);
 
 	down_write(&mdsc->pool_perm_rwsem);
+	p = &mdsc->pool_perm_tree.rb_node;
 	parent = NULL;
 	while (*p) {
 		parent = *p;
@@ -1740,8 +1756,17 @@ static int __ceph_pool_perm_get(struct ceph_inode_info *ci, s64 pool)
 		else if (pool > perm->pool)
 			p = &(*p)->rb_right;
 		else {
-			have = perm->perm;
-			break;
+			int ret = ceph_compare_string(pool_ns,
+						perm->pool_ns,
+						perm->pool_ns_len);
+			if (ret < 0)
+				p = &(*p)->rb_left;
+			else if (ret > 0)
+				p = &(*p)->rb_right;
+			else {
+				have = perm->perm;
+				break;
+			}
 		}
 	}
 	if (*p) {
@@ -1750,7 +1775,7 @@ static int __ceph_pool_perm_get(struct ceph_inode_info *ci, s64 pool)
 	}
 
 	rd_req = ceph_osdc_alloc_request(&fsc->client->osdc,
-					 ceph_empty_snapc, NULL,
+					 ceph_empty_snapc, pool_ns,
 					 1, false, GFP_NOFS);
 	if (!rd_req) {
 		err = -ENOMEM;
@@ -1760,12 +1785,15 @@ static int __ceph_pool_perm_get(struct ceph_inode_info *ci, s64 pool)
 	rd_req->r_flags = CEPH_OSD_FLAG_READ;
 	osd_req_op_init(rd_req, 0, CEPH_OSD_OP_STAT, 0);
 	rd_req->r_base_oloc.pool = pool;
+	rd_req->r_base_oloc.pool_ns = pool_ns;
+	if (pool_ns)
+		ceph_get_string(pool_ns);
 	snprintf(rd_req->r_base_oid.name, sizeof(rd_req->r_base_oid.name),
 		 "%llx.00000000", ci->i_vino.ino);
 	rd_req->r_base_oid.name_len = strlen(rd_req->r_base_oid.name);
 
 	wr_req = ceph_osdc_alloc_request(&fsc->client->osdc,
-					 ceph_empty_snapc, NULL,
+					 ceph_empty_snapc, pool_ns,
 					 1, false, GFP_NOFS);
 	if (!wr_req) {
 		err = -ENOMEM;
@@ -1776,6 +1804,9 @@ static int __ceph_pool_perm_get(struct ceph_inode_info *ci, s64 pool)
 			  CEPH_OSD_FLAG_ACK | CEPH_OSD_FLAG_ONDISK;
 	osd_req_op_init(wr_req, 0, CEPH_OSD_OP_CREATE, CEPH_OSD_OP_FLAG_EXCL);
 	wr_req->r_base_oloc.pool = pool;
+	wr_req->r_base_oloc.pool_ns = pool_ns;
+	if (pool_ns)
+		ceph_get_string(pool_ns);
 	wr_req->r_base_oid = rd_req->r_base_oid;
 
 	/* one page should be large enough for STAT data */
@@ -1812,7 +1843,8 @@ static int __ceph_pool_perm_get(struct ceph_inode_info *ci, s64 pool)
 		goto out_unlock;
 	}
 
-	perm = kmalloc(sizeof(*perm), GFP_NOFS);
+	pool_ns_len = pool_ns ? pool_ns->len : 0;
+	perm = kmalloc(sizeof(*perm) + pool_ns_len + 1, GFP_NOFS);
 	if (!perm) {
 		err = -ENOMEM;
 		goto out_unlock;
@@ -1820,6 +1852,11 @@ static int __ceph_pool_perm_get(struct ceph_inode_info *ci, s64 pool)
 
 	perm->pool = pool;
 	perm->perm = have;
+	perm->pool_ns_len = pool_ns_len;
+	if (pool_ns_len > 0)
+		memcpy(perm->pool_ns, pool_ns->str, pool_ns_len);
+	perm->pool_ns[pool_ns_len] = 0;
+
 	rb_link_node(&perm->node, parent, p);
 	rb_insert_color(&perm->node, &mdsc->pool_perm_tree);
 	err = 0;
@@ -1833,13 +1870,18 @@ out_unlock:
 out:
 	if (!err)
 		err = have;
-	dout("__ceph_pool_perm_get pool %lld result = %d\n", pool, err);
+	if (pool_ns)
+		dout("__ceph_pool_perm_get pool %lld ns %.*s result = %d\n",
+		     pool, (int)pool_ns->len, pool_ns->str, err);
+	else
+		dout("__ceph_pool_perm_get pool %lld result = %d\n", pool, err);
 	return err;
 }
 
 int ceph_pool_perm_check(struct ceph_inode_info *ci, int need)
 {
 	s64 pool;
+	struct ceph_string *pool_ns;
 	int ret, flags;
 
 	if (ceph_test_mount_opt(ceph_inode_to_client(&ci->vfs_inode),
@@ -1865,7 +1907,9 @@ check:
 		return 0;
 	}
 
-	ret = __ceph_pool_perm_get(ci, pool);
+	pool_ns = ceph_try_get_string(ci->i_layout.pool_ns);
+	ret = __ceph_pool_perm_get(ci, pool, pool_ns);
+	ceph_put_string(pool_ns);
 	if (ret < 0)
 		return ret;
 
@@ -1876,8 +1920,9 @@ check:
 		flags |= CEPH_I_POOL_WR;
 
 	spin_lock(&ci->i_ceph_lock);
-	if (pool == ci->i_layout.pool_id) {
-		ci->i_ceph_flags = flags;
+	if (pool == ci->i_layout.pool_id &&
+	    pool_ns == ci->i_layout.pool_ns) {
+		ci->i_ceph_flags |= flags;
         } else {
 		pool = ci->i_layout.pool_id;
 		flags = ci->i_ceph_flags;
diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index b68be57..1430bb9 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -2752,8 +2752,8 @@ static void invalidate_aliases(struct inode *inode)
  */
 static void handle_cap_grant(struct ceph_mds_client *mdsc,
 			     struct inode *inode, struct ceph_mds_caps *grant,
-			     u64 inline_version,
-			     void *inline_data, int inline_len,
+			     struct ceph_string **pns, u64 inline_version,
+			     void *inline_data, u32 inline_len,
 			     struct ceph_buffer *xattr_buf,
 			     struct ceph_mds_session *session,
 			     struct ceph_cap *cap, int issued)
@@ -2876,10 +2876,18 @@ static void handle_cap_grant(struct ceph_mds_client *mdsc,
 	if (newcaps & (CEPH_CAP_ANY_FILE_RD | CEPH_CAP_ANY_FILE_WR)) {
 		/* file layout may have changed */
 		s64 old_pool = ci->i_layout.pool_id;
+		struct ceph_string *old_ns;
+
 		ceph_file_layout_from_legacy(&ci->i_layout, &grant->layout);
-		if (ci->i_layout.pool_id != old_pool)
+
+		old_ns = rcu_dereference(ci->i_layout.pool_ns);
+		rcu_assign_pointer(ci->i_layout.pool_ns, *pns);
+
+		if (ci->i_layout.pool_id != old_pool || *pns != old_ns)
 			ci->i_ceph_flags &= ~CEPH_I_POOL_PERM;
 
+		*pns = old_ns;
+
 		/* size/truncate_seq? */
 		queue_trunc = ceph_fill_file_size(inode, issued,
 					le32_to_cpu(grant->truncate_seq),
@@ -3405,13 +3413,12 @@ void ceph_handle_caps(struct ceph_mds_session *session,
 	struct ceph_cap *cap;
 	struct ceph_mds_caps *h;
 	struct ceph_mds_cap_peer *peer = NULL;
-	struct ceph_snap_realm *realm;
+	struct ceph_snap_realm *realm = NULL;
+	struct ceph_string *pool_ns = NULL;
 	int mds = session->s_mds;
 	int op, issued;
 	u32 seq, mseq;
 	struct ceph_vino vino;
-	u64 cap_id;
-	u64 size, max_size;
 	u64 tid;
 	u64 inline_version = 0;
 	void *inline_data = NULL;
@@ -3431,11 +3438,8 @@ void ceph_handle_caps(struct ceph_mds_session *session,
 	op = le32_to_cpu(h->op);
 	vino.ino = le64_to_cpu(h->ino);
 	vino.snap = CEPH_NOSNAP;
-	cap_id = le64_to_cpu(h->cap_id);
 	seq = le32_to_cpu(h->seq);
 	mseq = le32_to_cpu(h->migrate_seq);
-	size = le64_to_cpu(h->size);
-	max_size = le64_to_cpu(h->max_size);
 
 	snaptrace = h + 1;
 	snaptrace_len = le32_to_cpu(h->snap_trace_len);
@@ -3470,6 +3474,27 @@ void ceph_handle_caps(struct ceph_mds_session *session,
 		p += inline_len;
 	}
 
+	if (le16_to_cpu(msg->hdr.version) >= 8) {
+		u64 flush_tid;
+		u32 caller_uid, caller_gid;
+		u32 osd_epoch_barrier;
+		u32 pool_ns_len;
+		/* version >= 5 */
+		ceph_decode_32_safe(&p, end, osd_epoch_barrier, bad);
+		/* version >= 6 */
+		ceph_decode_64_safe(&p, end, flush_tid, bad);
+		/* version >= 7 */
+		ceph_decode_32_safe(&p, end, caller_uid, bad);
+		ceph_decode_32_safe(&p, end, caller_gid, bad);
+		/* version >= 8 */
+		ceph_decode_32_safe(&p, end, pool_ns_len, bad);
+		if (pool_ns_len > 0) {
+			ceph_decode_need(&p, end, pool_ns_len, bad);
+			pool_ns = ceph_find_or_create_string(p, pool_ns_len);
+			p += pool_ns_len;
+		}
+	}
+
 	/* lookup ino */
 	inode = ceph_find_inode(sb, vino);
 	ci = ceph_inode(inode);
@@ -3488,7 +3513,7 @@ void ceph_handle_caps(struct ceph_mds_session *session,
 			cap = ceph_get_cap(mdsc, NULL);
 			cap->cap_ino = vino.ino;
 			cap->queue_release = 1;
-			cap->cap_id = cap_id;
+			cap->cap_id = le64_to_cpu(h->cap_id);
 			cap->mseq = mseq;
 			cap->seq = seq;
 			spin_lock(&session->s_cap_lock);
@@ -3523,7 +3548,7 @@ void ceph_handle_caps(struct ceph_mds_session *session,
 		}
 		handle_cap_import(mdsc, inode, h, peer, session,
 				  &cap, &issued);
-		handle_cap_grant(mdsc, inode, h,
+		handle_cap_grant(mdsc, inode, h, &pool_ns,
 				 inline_version, inline_data, inline_len,
 				 msg->middle, session, cap, issued);
 		if (realm)
@@ -3547,7 +3572,7 @@ void ceph_handle_caps(struct ceph_mds_session *session,
 	case CEPH_CAP_OP_GRANT:
 		__ceph_caps_issued(ci, &issued);
 		issued |= __ceph_caps_dirty(ci);
-		handle_cap_grant(mdsc, inode, h,
+		handle_cap_grant(mdsc, inode, h, &pool_ns,
 				 inline_version, inline_data, inline_len,
 				 msg->middle, session, cap, issued);
 		goto done_unlocked;
@@ -3580,6 +3605,7 @@ done:
 	mutex_unlock(&session->s_mutex);
 done_unlocked:
 	iput(inode);
+	ceph_put_string(pool_ns);
 	return;
 
 bad:
diff --git a/fs/ceph/file.c b/fs/ceph/file.c
index a5565f7..3994ec4 100644
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -708,8 +708,10 @@ static void ceph_aio_retry_work(struct work_struct *work)
 	req->r_flags =	CEPH_OSD_FLAG_ORDERSNAP |
 			CEPH_OSD_FLAG_ONDISK |
 			CEPH_OSD_FLAG_WRITE;
-	req->r_base_oloc = orig_req->r_base_oloc;
 	req->r_base_oid = orig_req->r_base_oid;
+	req->r_base_oloc = orig_req->r_base_oloc;
+	if (req->r_base_oloc.pool_ns)
+		ceph_get_string(req->r_base_oloc.pool_ns);
 
 	req->r_ops[0] = orig_req->r_ops[0];
 	osd_req_op_init(req, 1, CEPH_OSD_OP_STARTSYNC, 0);
diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
index 3c220f1..84b0c58 100644
--- a/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@ -675,6 +675,7 @@ static int fill_inode(struct inode *inode, struct page *locked_page,
 	int issued = 0, implemented, new_issued;
 	struct timespec mtime, atime, ctime;
 	struct ceph_buffer *xattr_blob = NULL;
+	struct ceph_string *pool_ns = NULL;
 	struct ceph_cap *new_cap = NULL;
 	int err = 0;
 	bool wake = false;
@@ -702,6 +703,10 @@ static int fill_inode(struct inode *inode, struct page *locked_page,
 			       iinfo->xattr_len);
 	}
 
+	if (iinfo->pool_ns_len > 0)
+		pool_ns = ceph_find_or_create_string(iinfo->pool_ns_data,
+						     iinfo->pool_ns_len);
+
 	spin_lock(&ci->i_ceph_lock);
 
 	/*
@@ -757,10 +762,18 @@ static int fill_inode(struct inode *inode, struct page *locked_page,
 	if (new_version ||
 	    (new_issued & (CEPH_CAP_ANY_FILE_RD | CEPH_CAP_ANY_FILE_WR))) {
 		s64 old_pool = ci->i_layout.pool_id;
+		struct ceph_string *old_ns;
+
 		ceph_file_layout_from_legacy(&ci->i_layout, &info->layout);
-		if (ci->i_layout.pool_id != old_pool)
+
+		old_ns = rcu_dereference(ci->i_layout.pool_ns);
+		rcu_assign_pointer(ci->i_layout.pool_ns, pool_ns);
+
+		if (ci->i_layout.pool_id != old_pool || pool_ns != old_ns)
 			ci->i_ceph_flags &= ~CEPH_I_POOL_PERM;
 
+		pool_ns = old_ns;
+
 		queue_trunc = ceph_fill_file_size(inode, issued,
 					le32_to_cpu(info->truncate_seq),
 					le64_to_cpu(info->truncate_size),
@@ -923,6 +936,7 @@ out:
 		ceph_put_cap(mdsc, new_cap);
 	if (xattr_blob)
 		ceph_buffer_put(xattr_blob);
+	ceph_put_string(pool_ns);
 	return err;
 }
 
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index b6bec29..d60601a 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -100,6 +100,17 @@ static int parse_reply_info_in(void **p, void *end,
 	} else
 		info->inline_version = CEPH_INLINE_NONE;
 
+	info->pool_ns_len = 0;
+	info->pool_ns_data = NULL;
+	if (features & CEPH_FEATURE_FS_FILE_LAYOUT_V2) {
+		ceph_decode_32_safe(p, end, info->pool_ns_len, bad);
+		if (info->pool_ns_len > 0) {
+			ceph_decode_need(p, end, info->pool_ns_len, bad);
+			info->pool_ns_data = *p;
+			*p += info->pool_ns_len;
+		}
+	}
+
 	return 0;
 bad:
 	return err;
diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
index 6eb331c..3e04a4f 100644
--- a/fs/ceph/mds_client.h
+++ b/fs/ceph/mds_client.h
@@ -44,6 +44,8 @@ struct ceph_mds_reply_info_in {
 	u64 inline_version;
 	u32 inline_len;
 	char *inline_data;
+	u32 pool_ns_len;
+	char *pool_ns_data;
 };
 
 /*
@@ -269,6 +271,8 @@ struct ceph_pool_perm {
 	struct rb_node node;
 	int perm;
 	s64 pool;
+	size_t pool_ns_len;
+	char pool_ns[];
 };
 
 /*
diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c
index c431f4c..465b775 100644
--- a/fs/ceph/xattr.c
+++ b/fs/ceph/xattr.c
@@ -67,44 +67,60 @@ static bool ceph_vxattrcb_layout_exists(struct ceph_inode_info *ci)
 static size_t ceph_vxattrcb_layout(struct ceph_inode_info *ci, char *val,
 				   size_t size)
 {
-	int ret;
 	struct ceph_fs_client *fsc = ceph_sb_to_client(ci->vfs_inode.i_sb);
 	struct ceph_osd_client *osdc = &fsc->client->osdc;
+	struct ceph_string *pool_ns;
 	s64 pool = ci->i_layout.pool_id;
 	const char *pool_name;
+	const char *ns_field = " pool_namespace=";
 	char buf[128];
+	size_t len, total_len = 0;
+	int ret;
+
+	pool_ns = ceph_try_get_string(ci->i_layout.pool_ns);
 
 	dout("ceph_vxattrcb_layout %p\n", &ci->vfs_inode);
 	down_read(&osdc->map_sem);
 	pool_name = ceph_pg_pool_name_by_id(osdc->osdmap, pool);
 	if (pool_name) {
-		size_t len = strlen(pool_name);
-		ret = snprintf(buf, sizeof(buf),
+		len = snprintf(buf, sizeof(buf),
 		"stripe_unit=%u stripe_count=%u object_size=%u pool=",
 		ci->i_layout.stripe_unit, ci->i_layout.stripe_count,
 	        ci->i_layout.object_size);
-		if (!size) {
-			ret += len;
-		} else if (ret + len > size) {
-			ret = -ERANGE;
-		} else {
-			memcpy(val, buf, ret);
-			memcpy(val + ret, pool_name, len);
-			ret += len;
-		}
+		total_len = len + strlen(pool_name);
 	} else {
-		ret = snprintf(buf, sizeof(buf),
+		len = snprintf(buf, sizeof(buf),
 		"stripe_unit=%u stripe_count=%u object_size=%u pool=%lld",
 		ci->i_layout.stripe_unit, ci->i_layout.stripe_count,
 	        ci->i_layout.object_size, (unsigned long long)pool);
-		if (size) {
-			if (ret <= size)
-				memcpy(val, buf, ret);
-			else
-				ret = -ERANGE;
+		total_len = len;
+	}
+
+	if (pool_ns)
+		total_len += strlen(ns_field) + pool_ns->len;
+
+	if (!size) {
+		ret = total_len;
+	} else if (total_len > size) {
+		ret = -ERANGE;
+	} else {
+		memcpy(val, buf, len);
+		ret = len;
+		if (pool_name) {
+			len = strlen(pool_name);
+			memcpy(val + ret, pool_name, len);
+			ret += len;
+		}
+		if (pool_ns) {
+			len = strlen(ns_field);
+			memcpy(val + ret, ns_field, len);
+			ret += len;
+			memcpy(val + ret, pool_ns->str, pool_ns->len);
+			ret += pool_ns->len;
 		}
 	}
 	up_read(&osdc->map_sem);
+	ceph_put_string(pool_ns);
 	return ret;
 }
 
@@ -145,6 +161,18 @@ static size_t ceph_vxattrcb_layout_pool(struct ceph_inode_info *ci,
 	return ret;
 }
 
+static size_t ceph_vxattrcb_layout_pool_namespace(struct ceph_inode_info *ci,
+						  char *val, size_t size)
+{
+	int ret = 0;
+	struct ceph_string *ns = ceph_try_get_string(ci->i_layout.pool_ns);
+	if (ns) {
+		ret = snprintf(val, size, "%.*s", (int)ns->len, ns->str);
+		ceph_put_string(ns);
+	}
+	return ret;
+}
+
 /* directories */
 
 static size_t ceph_vxattrcb_dir_entries(struct ceph_inode_info *ci, char *val,
@@ -233,6 +261,7 @@ static struct ceph_vxattr ceph_dir_vxattrs[] = {
 	XATTR_LAYOUT_FIELD(dir, layout, stripe_count),
 	XATTR_LAYOUT_FIELD(dir, layout, object_size),
 	XATTR_LAYOUT_FIELD(dir, layout, pool),
+	XATTR_LAYOUT_FIELD(dir, layout, pool_namespace),
 	XATTR_NAME_CEPH(dir, entries),
 	XATTR_NAME_CEPH(dir, files),
 	XATTR_NAME_CEPH(dir, subdirs),
@@ -260,6 +289,7 @@ static struct ceph_vxattr ceph_file_vxattrs[] = {
 	XATTR_LAYOUT_FIELD(file, layout, stripe_count),
 	XATTR_LAYOUT_FIELD(file, layout, object_size),
 	XATTR_LAYOUT_FIELD(file, layout, pool),
+	XATTR_LAYOUT_FIELD(file, layout, pool_namespace),
 	{ .name = NULL, 0 }	/* Required table terminator */
 };
 static size_t ceph_file_vxattrs_name_size;	/* total size of all names */
diff --git a/include/linux/ceph/ceph_features.h b/include/linux/ceph/ceph_features.h
index d2d7886..9cddad3 100644
--- a/include/linux/ceph/ceph_features.h
+++ b/include/linux/ceph/ceph_features.h
@@ -75,6 +75,7 @@
 #define CEPH_FEATURE_CRUSH_TUNABLES5	(1ULL<<58) /* chooseleaf stable mode */
 // duplicated since it was introduced at the same time as CEPH_FEATURE_CRUSH_TUNABLES5
 #define CEPH_FEATURE_NEW_OSDOPREPLY_ENCODING   (1ULL<<58) /* New, v7 encoding */
+#define CEPH_FEATURE_FS_FILE_LAYOUT_V2  (1ULL<<58) /* overlap with tunables5 */
 
 /*
  * The introduction of CEPH_FEATURE_OSD_SNAPMAPPER caused the feature
diff --git a/net/ceph/ceph_fs.c b/net/ceph/ceph_fs.c
index 52c8264..43f6706 100644
--- a/net/ceph/ceph_fs.c
+++ b/net/ceph/ceph_fs.c
@@ -3,6 +3,7 @@
  */
 #include <linux/module.h>
 #include <linux/ceph/types.h>
+#include <linux/ceph/string_table.h>
 
 /*
  * return true if @layout appears to be valid
-- 
2.5.0


^ permalink raw reply related	[flat|nested] 33+ messages in thread

* Re: [PATCH v2 2/6] libceph: introduce reference counted string
  2016-02-06  7:00 ` [PATCH v2 2/6] libceph: introduce reference counted string Yan, Zheng
@ 2016-03-22  6:05   ` Ilya Dryomov
  2016-03-22  9:17     ` Yan, Zheng
  2016-03-23 15:39   ` Douglas Fuller
  1 sibling, 1 reply; 33+ messages in thread
From: Ilya Dryomov @ 2016-03-22  6:05 UTC (permalink / raw)
  To: Yan, Zheng; +Cc: Ceph Development, Sage Weil

On Sat, Feb 6, 2016 at 8:00 AM, Yan, Zheng <zyan@redhat.com> wrote:
> The data structure is for storing a namespace string. It allows the namespace
> string to be shared by cephfs inodes with the same layout.
>
> Signed-off-by: Yan, Zheng <zyan@redhat.com>
> ---
>  include/linux/ceph/libceph.h      |   1 +
>  include/linux/ceph/string_table.h |  58 ++++++++++++++++++
>  net/ceph/Makefile                 |   2 +-
>  net/ceph/ceph_common.c            |   1 +
>  net/ceph/string_table.c           | 121 ++++++++++++++++++++++++++++++++++++++
>  5 files changed, 182 insertions(+), 1 deletion(-)
>  create mode 100644 include/linux/ceph/string_table.h
>  create mode 100644 net/ceph/string_table.c
>
> diff --git a/include/linux/ceph/libceph.h b/include/linux/ceph/libceph.h
> index e7975e4..8e9868d 100644
> --- a/include/linux/ceph/libceph.h
> +++ b/include/linux/ceph/libceph.h
> @@ -21,6 +21,7 @@
>  #include <linux/ceph/mon_client.h>
>  #include <linux/ceph/osd_client.h>
>  #include <linux/ceph/ceph_fs.h>
> +#include <linux/ceph/string_table.h>
>
>  /*
>   * mount options
> diff --git a/include/linux/ceph/string_table.h b/include/linux/ceph/string_table.h
> new file mode 100644
> index 0000000..427b6f9
> --- /dev/null
> +++ b/include/linux/ceph/string_table.h
> @@ -0,0 +1,58 @@
> +#ifndef _FS_CEPH_STRING_TABLE_H
> +#define _FS_CEPH_STRING_TABLE_H
> +
> +#include <linux/types.h>
> +#include <linux/kref.h>
> +#include <linux/rbtree.h>
> +#include <linux/rcupdate.h>
> +
> +struct ceph_string {
> +       struct kref kref;
> +       union {
> +               struct rb_node node;
> +               struct rcu_head rcu;
> +       };
> +       size_t len;
> +       char str[];
> +};
> +
> +extern void ceph_release_string(struct kref *ref);
> +extern struct ceph_string *ceph_find_or_create_string(const char *str,
> +                                                     size_t len);
> +extern void ceph_string_cleanup(void);
> +
> +static inline void ceph_get_string(struct ceph_string *str)
> +{
> +       kref_get(&str->kref);
> +}
> +
> +static inline void ceph_put_string(struct ceph_string *str)
> +{
> +       if (!str)
> +               return;
> +       kref_put(&str->kref, ceph_release_string);
> +}
> +
> +static inline int ceph_compare_string(struct ceph_string *cs,
> +                                     const char* str, size_t len)
> +{
> +       size_t cs_len = cs ? cs->len : 0;
> +       if (cs_len != len)
> +               return cs_len - len;
> +       if (len == 0)
> +               return 0;
> +       return strncmp(cs->str, str, len);
> +}
> +
> +#define ceph_try_get_string(x)                                 \
> +({                                                             \
> +       struct ceph_string *___str;                             \
> +       rcu_read_lock();                                        \
> +       ___str = rcu_dereference(x);                            \
> +       if (___str && !kref_get_unless_zero(&___str->kref))     \
> +               ___str = NULL;                                  \
> +       rcu_read_unlock();                                      \
> +       (___str);                                               \
> +})
> +
> +#endif
> diff --git a/net/ceph/Makefile b/net/ceph/Makefile
> index 958d9856..84cbed6 100644
> --- a/net/ceph/Makefile
> +++ b/net/ceph/Makefile
> @@ -11,5 +11,5 @@ libceph-y := ceph_common.o messenger.o msgpool.o buffer.o pagelist.o \
>         crypto.o armor.o \
>         auth_x.o \
>         ceph_fs.o ceph_strings.o ceph_hash.o \
> -       pagevec.o snapshot.o
> +       pagevec.o snapshot.o string_table.o
>
> diff --git a/net/ceph/ceph_common.c b/net/ceph/ceph_common.c
> index dcc18c6..baca649 100644
> --- a/net/ceph/ceph_common.c
> +++ b/net/ceph/ceph_common.c
> @@ -751,6 +751,7 @@ static void __exit exit_ceph_lib(void)
>         ceph_msgr_exit();
>         ceph_crypto_shutdown();
>         ceph_debugfs_cleanup();
> +       ceph_string_cleanup();
>  }
>
>  module_init(init_ceph_lib);
> diff --git a/net/ceph/string_table.c b/net/ceph/string_table.c
> new file mode 100644
> index 0000000..bb49bd7
> --- /dev/null
> +++ b/net/ceph/string_table.c
> @@ -0,0 +1,121 @@
> +#include <linux/slab.h>
> +#include <linux/gfp.h>
> +#include <linux/string.h>
> +#include <linux/spinlock.h>
> +#include <linux/ceph/string_table.h>
> +
> +static DEFINE_SPINLOCK(string_tree_lock);
> +static struct rb_root string_tree = RB_ROOT;
> +
> +struct ceph_string *ceph_find_or_create_string(const char* str, size_t len)
> +{
> +       struct ceph_string *cs, *exist;
> +       struct rb_node **p, *parent;
> +       int ret;
> +
> +       exist = NULL;
> +       spin_lock(&string_tree_lock);
> +       p = &string_tree.rb_node;
> +       while (*p) {
> +               exist = rb_entry(*p, struct ceph_string, node);
> +               ret = ceph_compare_string(exist, str, len);
> +               if (ret > 0)
> +                       p = &(*p)->rb_left;
> +               else if (ret < 0)
> +                       p = &(*p)->rb_right;
> +               else
> +                       break;
> +               exist = NULL;
> +       }
> +       if (exist && !kref_get_unless_zero(&exist->kref)) {
> +               rb_erase(&exist->node, &string_tree);
> +               RB_CLEAR_NODE(&exist->node);
> +               exist = NULL;
> +       }
> +       spin_unlock(&string_tree_lock);
> +       if (exist)
> +               return exist;
> +
> +       cs = kmalloc(sizeof(*cs) + len + 1, GFP_NOFS);
> +       if (!cs)
> +               return NULL;
> +
> +       kref_init(&cs->kref);
> +       cs->len = len;
> +       memcpy(cs->str, str, len);
> +       cs->str[len] = 0;
> +
> +retry:
> +       exist = NULL;
> +       parent = NULL;
> +       p = &string_tree.rb_node;
> +       spin_lock(&string_tree_lock);
> +       while (*p) {
> +               parent = *p;
> +               exist = rb_entry(*p, struct ceph_string, node);
> +               ret = ceph_compare_string(exist, str, len);
> +               if (ret > 0)
> +                       p = &(*p)->rb_left;
> +               else if (ret < 0)
> +                       p = &(*p)->rb_right;
> +               else
> +                       break;
> +               exist = NULL;
> +       }
> +       ret = 0;
> +       if (!exist) {
> +               rb_link_node(&cs->node, parent, p);
> +               rb_insert_color(&cs->node, &string_tree);
> +       } else if (!kref_get_unless_zero(&exist->kref)) {
> +               rb_erase(&exist->node, &string_tree);
> +               RB_CLEAR_NODE(&exist->node);
> +               ret = -EAGAIN;
> +       }
> +       spin_unlock(&string_tree_lock);
> +       if (ret == -EAGAIN)
> +               goto retry;
> +
> +       if (exist) {
> +               kfree(cs);
> +               cs = exist;
> +       }
> +
> +       return cs;
> +}
> +EXPORT_SYMBOL(ceph_find_or_create_string);
> +
> +static void ceph_free_string(struct rcu_head *head)
> +{
> +       struct ceph_string *cs = container_of(head, struct ceph_string, rcu);
> +       kfree(cs);
> +}
> +
> +void ceph_release_string(struct kref *ref)
> +{
> +       struct ceph_string *cs = container_of(ref, struct ceph_string, kref);
> +
> +       spin_lock(&string_tree_lock);
> +       if (!RB_EMPTY_NODE(&cs->node)) {
> +               rb_erase(&cs->node, &string_tree);
> +               RB_CLEAR_NODE(&cs->node);
> +       }
> +       spin_unlock(&string_tree_lock);
> +
> +       call_rcu(&cs->rcu, ceph_free_string);
> +}
> +EXPORT_SYMBOL(ceph_release_string);
> +
> +void ceph_string_cleanup(void)
> +{
> +       struct rb_node *p;
> +       struct ceph_string *cs;
> +       if (RB_EMPTY_ROOT(&string_tree))
> +               return;
> +
> +       pr_err("libceph: detect string leaks\n");
> +       while ((p = rb_first(&string_tree))) {
> +               cs = rb_entry(p, struct ceph_string, node);
> +               rb_erase(p, &string_tree);
> +               kfree(cs);
> +       }
> +}

Hi Zheng,

A one and a half line commit message and an equally short cover letter
for a series such as this isn't enough.  I *happen* to know that the
basic use case for namespaces in cephfs is going to be restricting
users to different parts of the directory tree, with the enforcement
happening in ceph on the server side, as opposed to in ceph on the
client side, but I would appreciate some details on what the actual
namespace names are going to be, whether it's user input or not,
whether there are plans to use namespaces for anything else, etc.

I don't like the idea of sharing namespace name strings between libceph
and ceph modules, especially with the strings themselves hosted in
libceph.  rbd has no use for namespaces, so libceph can live with
namespace names embedded into ceph_osd_request by value or with
a simple non-owning pointer, leaving reference counting to the outside
modules, if one of the use cases is "one namespace with a long name for
the entire directory tree" or something close to it.

I think the sharing infrastructure should be moved into cephfs, and
probably changed to share entire file layouts along the way.  I don't
know this code well enough to be sure, but it seems that by sharing
file layouts and making ci->i_layout an owning pointer you might be
able to piggy back on i_ceph_lock and considerably simplify the whole
thing by dropping RCU and eliminating numerous get/put calls.

Thanks,

                Ilya

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [PATCH v2 3/6] libceph: rados pool namesapce support
  2016-02-06  7:00 ` [PATCH v2 3/6] libceph: rados pool namesapce support Yan, Zheng
@ 2016-03-22  6:11   ` Ilya Dryomov
       [not found]     ` <D7ADB539-766B-4E6F-B996-E90C5080D418@redhat.com>
  0 siblings, 1 reply; 33+ messages in thread
From: Ilya Dryomov @ 2016-03-22  6:11 UTC (permalink / raw)
  To: Yan, Zheng; +Cc: Ceph Development, Sage Weil

On Sat, Feb 6, 2016 at 8:00 AM, Yan, Zheng <zyan@redhat.com> wrote:
> Signed-off-by: Yan, Zheng <zyan@redhat.com>
> ---
>  drivers/block/rbd.c          |  1 +
>  fs/ceph/inode.c              |  3 +++
>  include/linux/ceph/ceph_fs.h |  2 ++
>  include/linux/ceph/osdmap.h  |  2 ++
>  net/ceph/osd_client.c        | 37 ++++++++++++++++++++++++++-----------
>  net/ceph/osdmap.c            | 33 +++++++++++++++++++++++++++------
>  6 files changed, 61 insertions(+), 17 deletions(-)
>
> diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
> index b0bcb2d..0423493 100644
> --- a/drivers/block/rbd.c
> +++ b/drivers/block/rbd.c
> @@ -4088,6 +4088,7 @@ static struct rbd_device *rbd_dev_create(struct rbd_client *rbdc,
>         rbd_dev->layout.stripe_count = 1;
>         rbd_dev->layout.object_size = 1 << RBD_MAX_OBJ_ORDER;
>         rbd_dev->layout.pool_id = spec->pool_id;
> +       RCU_INIT_POINTER(rbd_dev->layout.pool_ns, NULL);
>
>         /*
>          * If this is a mapping rbd_dev (as opposed to a parent one),
> diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
> index b0ad53d..3c220f1 100644
> --- a/fs/ceph/inode.c
> +++ b/fs/ceph/inode.c
> @@ -396,6 +396,7 @@ struct inode *ceph_alloc_inode(struct super_block *sb)
>         ci->i_symlink = NULL;
>
>         memset(&ci->i_dir_layout, 0, sizeof(ci->i_dir_layout));
> +       RCU_INIT_POINTER(ci->i_layout.pool_ns, NULL);
>
>         ci->i_fragtree = RB_ROOT;
>         mutex_init(&ci->i_fragtree_mutex);
> @@ -518,6 +519,8 @@ void ceph_destroy_inode(struct inode *inode)
>         if (ci->i_xattrs.prealloc_blob)
>                 ceph_buffer_put(ci->i_xattrs.prealloc_blob);
>
> +       ceph_put_string(ci->i_layout.pool_ns);
> +
>         call_rcu(&inode->i_rcu, ceph_i_callback);
>  }
>
> diff --git a/include/linux/ceph/ceph_fs.h b/include/linux/ceph/ceph_fs.h
> index 7d8728e..3858923 100644
> --- a/include/linux/ceph/ceph_fs.h
> +++ b/include/linux/ceph/ceph_fs.h
> @@ -53,6 +53,7 @@ struct ceph_file_layout_legacy {
>         __le32 fl_pg_pool;      /* namespace, crush ruleset, rep level */
>  } __attribute__ ((packed));
>
> +struct ceph_string;
>  /*
>   * ceph_file_layout - describe data layout for a file/inode
>   */
> @@ -62,6 +63,7 @@ struct ceph_file_layout {
>         u32 stripe_count;  /* over this many objects */
>         u32 object_size;   /* until objects are this big */
>         s64 pool_id;        /* rados pool id */
> +       struct ceph_string __rcu *pool_ns; /* rados pool namespace */
>  };
>
>  extern int ceph_file_layout_is_valid(const struct ceph_file_layout *layout);
> diff --git a/include/linux/ceph/osdmap.h b/include/linux/ceph/osdmap.h
> index e55c08b..3d59d6c 100644
> --- a/include/linux/ceph/osdmap.h
> +++ b/include/linux/ceph/osdmap.h
> @@ -55,6 +55,7 @@ static inline bool ceph_can_shift_osds(struct ceph_pg_pool_info *pool)
>
>  struct ceph_object_locator {
>         s64 pool;
> +       struct ceph_string *pool_ns;
>  };
>
>  /*
> @@ -63,6 +64,7 @@ struct ceph_object_locator {
>   * (probably outdated: must be >= RBD_MAX_MD_NAME_LEN -- currently 100)
>   */
>  #define CEPH_MAX_OID_NAME_LEN 100
> +#define CEPH_MAX_NAMESPACE_LEN 100
>
>  struct ceph_object_id {
>         char name[CEPH_MAX_OID_NAME_LEN];
> diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c
> index 450955e..68e7f68 100644
> --- a/net/ceph/osd_client.c
> +++ b/net/ceph/osd_client.c
> @@ -339,6 +339,8 @@ static void ceph_osdc_release_request(struct kref *kref)
>                 kfree(req->r_ops);
>
>         ceph_put_snap_context(req->r_snapc);
> +       ceph_put_string(req->r_base_oloc.pool_ns);
> +
>         if (req->r_mempool)
>                 mempool_free(req, req->r_osdc->req_mempool);
>         else
> @@ -388,6 +390,9 @@ struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc,
>         req->r_num_ops = 0;
>         req->r_max_ops = num_ops;
>
> +       req->r_base_oloc.pool = -1;
> +       req->r_target_oloc.pool = -1;
> +
>         if (num_ops <= CEPH_OSD_INITIAL_OP) {
>                 req->r_ops = req->r_inline_ops;
>         } else {
> @@ -409,9 +414,6 @@ struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc,
>         INIT_LIST_HEAD(&req->r_req_lru_item);
>         INIT_LIST_HEAD(&req->r_osd_item);
>
> -       req->r_base_oloc.pool = -1;
> -       req->r_target_oloc.pool = -1;
> -
>         /* create reply message */
>         msg_size = OSD_OPREPLY_FRONT_LEN;
>         if (num_ops > CEPH_OSD_INITIAL_OP) {
> @@ -433,7 +435,7 @@ struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc,
>
>         /* create request message; allow space for oid */
>         msg_size = 4 + 4 + 8 + 8 + 4 + 8;
> -       msg_size += 2 + 4 + 8 + 4 + 4; /* oloc */
> +       msg_size += 2 + 4 + 8 + 4 + 4 + 4 + CEPH_MAX_NAMESPACE_LEN; /* oloc */
>         msg_size += 1 + 8 + 4 + 4;     /* pg_t */
>         msg_size += 4 + CEPH_MAX_OID_NAME_LEN; /* oid */
>         msg_size += 2 + num_ops * sizeof(struct ceph_osd_op);
> @@ -864,6 +866,7 @@ struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc,
>         }
>
>         req->r_base_oloc.pool = layout->pool_id;
> +       req->r_base_oloc.pool_ns = ceph_try_get_string(layout->pool_ns);
>
>         snprintf(req->r_base_oid.name, sizeof(req->r_base_oid.name),
>                  "%llx.%08llx", vino.ino, objnum);
> @@ -1719,10 +1722,10 @@ static int ceph_oloc_decode(void **p, void *end,
>         }
>
>         if (struct_v >= 5) {
> -               len = ceph_decode_32(p);
> -               if (len > 0) {
> -                       pr_warn("ceph_object_locator::nspace is set\n");
> -                       goto e_inval;
> +               u32 ns_len = ceph_decode_32(p);
> +               if (ns_len > 0) {
> +                       ceph_decode_need(p, end, ns_len, e_inval);
> +                       *p += ns_len;
>                 }
>         }
>
> @@ -1907,7 +1910,7 @@ static void handle_reply(struct ceph_osd_client *osdc, struct ceph_msg *msg)
>
>                 __unregister_request(osdc, req);
>
> -               req->r_target_oloc = redir.oloc; /* struct */
> +               req->r_target_oloc.pool = redir.oloc.pool;
>
>                 /*
>                  * Start redirect requests with nofail=true.  If
> @@ -2459,6 +2462,7 @@ void ceph_osdc_build_request(struct ceph_osd_request *req, u64 off,
>                                 struct timespec *mtime)
>  {
>         struct ceph_msg *msg = req->r_request;
> +       struct ceph_string *pool_ns;
>         void *p;
>         size_t msg_size;
>         int flags = req->r_flags;
> @@ -2483,14 +2487,25 @@ void ceph_osdc_build_request(struct ceph_osd_request *req, u64 off,
>         req->r_request_reassert_version = p;
>         p += sizeof(struct ceph_eversion); /* will get filled in */
>
> +       if (req->r_base_oloc.pool_ns)
> +               pool_ns = req->r_base_oloc.pool_ns;
> +       else
> +               pool_ns = NULL;
> +
>         /* oloc */
> +       ceph_encode_8(&p, 5);
>         ceph_encode_8(&p, 4);
> -       ceph_encode_8(&p, 4);
> -       ceph_encode_32(&p, 8 + 4 + 4);
> +       ceph_encode_32(&p, 8 + 4 + 4 + 4 + (pool_ns ? pool_ns->len : 0));
>         req->r_request_pool = p;
>         p += 8;
>         ceph_encode_32(&p, -1);  /* preferred */
>         ceph_encode_32(&p, 0);   /* key len */
> +       if (pool_ns) {
> +               ceph_encode_32(&p, pool_ns->len);
> +               ceph_encode_copy(&p, pool_ns->str, pool_ns->len);
> +       } else {
> +               ceph_encode_32(&p, 0);
> +       }
>
>         ceph_encode_8(&p, 1);
>         req->r_request_pgid = p;
> diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c
> index f033ca5..f117848 100644
> --- a/net/ceph/osdmap.c
> +++ b/net/ceph/osdmap.c
> @@ -1470,12 +1470,33 @@ int ceph_oloc_oid_to_pg(struct ceph_osdmap *osdmap,
>         if (!pi)
>                 return -EIO;
>
> -       pg_out->pool = oloc->pool;
> -       pg_out->seed = ceph_str_hash(pi->object_hash, oid->name,
> -                                    oid->name_len);
> -
> -       dout("%s '%.*s' pgid %llu.%x\n", __func__, oid->name_len, oid->name,
> -            pg_out->pool, pg_out->seed);
> +       if (!oloc->pool_ns) {
> +               pg_out->pool = oloc->pool;
> +               pg_out->seed = ceph_str_hash(pi->object_hash, oid->name,
> +                                            oid->name_len);
> +               dout("%s '%.*s' pgid %llu.%x\n", __func__,
> +                    oid->name_len, oid->name, pg_out->pool, pg_out->seed);
> +       } else {
> +               char stack_buf[256];
> +               char *buf = stack_buf;
> +               int nsl = oloc->pool_ns->len;
> +               size_t total = nsl + 1 + oid->name_len;
> +               if (total > sizeof(stack_buf)) {
> +                       buf = kmalloc(total, GFP_NOFS);
> +                       if (!buf)
> +                               return -ENOMEM;
> +               }

This ties into my question about how namespaces are going to be used
and how long the namespace name is allowed to be.

CEPH_MAX_NAMESPACE_LEN is defined to 100 above, but that definition is
removed in patch 5.  That needs fixing, and if the 100 char limit is
real, then buf can just be

    CEPH_MAX_OID_NAME_LEN + CEPH_MAX_NAMESPACE_LEN + 1

with no need for a kmalloc().
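
A minimal sketch of that fixed-size variant, assuming the two 100-char
limits stay in place; the '\037' separator below is an assumption about
how the namespace and oid get concatenated, not something taken from
this series:

    char buf[CEPH_MAX_OID_NAME_LEN + CEPH_MAX_NAMESPACE_LEN + 1];
    size_t nsl = oloc->pool_ns->len;
    size_t total = nsl + 1 + oid->name_len;

    /* total is bounded by the two limits plus a separator, no kmalloc() */
    memcpy(buf, oloc->pool_ns->str, nsl);
    buf[nsl] = '\037';                    /* assumed separator byte */
    memcpy(buf + nsl + 1, oid->name, oid->name_len);
    pg_out->pool = oloc->pool;
    pg_out->seed = ceph_str_hash(pi->object_hash, buf, total);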

Thanks,

                Ilya

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [PATCH v2 2/6] libceph: introduce reference counted string
  2016-03-22  6:05   ` Ilya Dryomov
@ 2016-03-22  9:17     ` Yan, Zheng
  2016-03-22 11:00       ` Ilya Dryomov
  0 siblings, 1 reply; 33+ messages in thread
From: Yan, Zheng @ 2016-03-22  9:17 UTC (permalink / raw)
  To: Ilya Dryomov; +Cc: Ceph Development, Sage Weil


> On Mar 22, 2016, at 14:05, Ilya Dryomov <idryomov@gmail.com> wrote:
> 
> On Sat, Feb 6, 2016 at 8:00 AM, Yan, Zheng <zyan@redhat.com> wrote:
>> The data structure is for storing a namespace string. It allows the namespace
>> string to be shared by cephfs inodes with the same layout.
>> 
>> Signed-off-by: Yan, Zheng <zyan@redhat.com>
>> ---
>> include/linux/ceph/libceph.h      |   1 +
>> include/linux/ceph/string_table.h |  58 ++++++++++++++++++
>> net/ceph/Makefile                 |   2 +-
>> net/ceph/ceph_common.c            |   1 +
>> net/ceph/string_table.c           | 121 ++++++++++++++++++++++++++++++++++++++
>> 5 files changed, 182 insertions(+), 1 deletion(-)
>> create mode 100644 include/linux/ceph/string_table.h
>> create mode 100644 net/ceph/string_table.c
>> 
>> diff --git a/include/linux/ceph/libceph.h b/include/linux/ceph/libceph.h
>> index e7975e4..8e9868d 100644
>> --- a/include/linux/ceph/libceph.h
>> +++ b/include/linux/ceph/libceph.h
>> @@ -21,6 +21,7 @@
>> #include <linux/ceph/mon_client.h>
>> #include <linux/ceph/osd_client.h>
>> #include <linux/ceph/ceph_fs.h>
>> +#include <linux/ceph/string_table.h>
>> 
>> /*
>>  * mount options
>> diff --git a/include/linux/ceph/string_table.h b/include/linux/ceph/string_table.h
>> new file mode 100644
>> index 0000000..427b6f9
>> --- /dev/null
>> +++ b/include/linux/ceph/string_table.h
>> @@ -0,0 +1,58 @@
>> +#ifndef _FS_CEPH_STRING_TABLE_H
>> +#define _FS_CEPH_STRING_TABLE_H
>> +
>> +#include <linux/types.h>
>> +#include <linux/kref.h>
>> +#include <linux/rbtree.h>
>> +#include <linux/rcupdate.h>
>> +
>> +struct ceph_string {
>> +       struct kref kref;
>> +       union {
>> +               struct rb_node node;
>> +               struct rcu_head rcu;
>> +       };
>> +       size_t len;
>> +       char str[];
>> +};
>> +
>> +extern void ceph_release_string(struct kref *ref);
>> +extern struct ceph_string *ceph_find_or_create_string(const char *str,
>> +                                                     size_t len);
>> +extern void ceph_string_cleanup(void);
>> +
>> +static inline void ceph_get_string(struct ceph_string *str)
>> +{
>> +       kref_get(&str->kref);
>> +}
>> +
>> +static inline void ceph_put_string(struct ceph_string *str)
>> +{
>> +       if (!str)
>> +               return;
>> +       kref_put(&str->kref, ceph_release_string);
>> +}
>> +
>> +static inline int ceph_compare_string(struct ceph_string *cs,
>> +                                     const char* str, size_t len)
>> +{
>> +       size_t cs_len = cs ? cs->len : 0;
>> +       if (cs_len != len)
>> +               return cs_len - len;
>> +       if (len == 0)
>> +               return 0;
>> +       return strncmp(cs->str, str, len);
>> +}
>> +
>> +#define ceph_try_get_string(x)                                 \
>> +({                                                             \
>> +       struct ceph_string *___str;                             \
>> +       rcu_read_lock();                                        \
>> +       ___str = rcu_dereference(x);                            \
>> +       if (___str && !kref_get_unless_zero(&___str->kref))     \
>> +               ___str = NULL;                                  \
>> +       rcu_read_unlock();                                      \
>> +       (___str);                                               \
>> +})
>> +
>> +#endif
>> diff --git a/net/ceph/Makefile b/net/ceph/Makefile
>> index 958d9856..84cbed6 100644
>> --- a/net/ceph/Makefile
>> +++ b/net/ceph/Makefile
>> @@ -11,5 +11,5 @@ libceph-y := ceph_common.o messenger.o msgpool.o buffer.o pagelist.o \
>>        crypto.o armor.o \
>>        auth_x.o \
>>        ceph_fs.o ceph_strings.o ceph_hash.o \
>> -       pagevec.o snapshot.o
>> +       pagevec.o snapshot.o string_table.o
>> 
>> diff --git a/net/ceph/ceph_common.c b/net/ceph/ceph_common.c
>> index dcc18c6..baca649 100644
>> --- a/net/ceph/ceph_common.c
>> +++ b/net/ceph/ceph_common.c
>> @@ -751,6 +751,7 @@ static void __exit exit_ceph_lib(void)
>>        ceph_msgr_exit();
>>        ceph_crypto_shutdown();
>>        ceph_debugfs_cleanup();
>> +       ceph_string_cleanup();
>> }
>> 
>> module_init(init_ceph_lib);
>> diff --git a/net/ceph/string_table.c b/net/ceph/string_table.c
>> new file mode 100644
>> index 0000000..bb49bd7
>> --- /dev/null
>> +++ b/net/ceph/string_table.c
>> @@ -0,0 +1,121 @@
>> +#include <linux/slab.h>
>> +#include <linux/gfp.h>
>> +#include <linux/string.h>
>> +#include <linux/spinlock.h>
>> +#include <linux/ceph/string_table.h>
>> +
>> +static DEFINE_SPINLOCK(string_tree_lock);
>> +static struct rb_root string_tree = RB_ROOT;
>> +
>> +struct ceph_string *ceph_find_or_create_string(const char* str, size_t len)
>> +{
>> +       struct ceph_string *cs, *exist;
>> +       struct rb_node **p, *parent;
>> +       int ret;
>> +
>> +       exist = NULL;
>> +       spin_lock(&string_tree_lock);
>> +       p = &string_tree.rb_node;
>> +       while (*p) {
>> +               exist = rb_entry(*p, struct ceph_string, node);
>> +               ret = ceph_compare_string(exist, str, len);
>> +               if (ret > 0)
>> +                       p = &(*p)->rb_left;
>> +               else if (ret < 0)
>> +                       p = &(*p)->rb_right;
>> +               else
>> +                       break;
>> +               exist = NULL;
>> +       }
>> +       if (exist && !kref_get_unless_zero(&exist->kref)) {
>> +               rb_erase(&exist->node, &string_tree);
>> +               RB_CLEAR_NODE(&exist->node);
>> +               exist = NULL;
>> +       }
>> +       spin_unlock(&string_tree_lock);
>> +       if (exist)
>> +               return exist;
>> +
>> +       cs = kmalloc(sizeof(*cs) + len + 1, GFP_NOFS);
>> +       if (!cs)
>> +               return NULL;
>> +
>> +       kref_init(&cs->kref);
>> +       cs->len = len;
>> +       memcpy(cs->str, str, len);
>> +       cs->str[len] = 0;
>> +
>> +retry:
>> +       exist = NULL;
>> +       parent = NULL;
>> +       p = &string_tree.rb_node;
>> +       spin_lock(&string_tree_lock);
>> +       while (*p) {
>> +               parent = *p;
>> +               exist = rb_entry(*p, struct ceph_string, node);
>> +               ret = ceph_compare_string(exist, str, len);
>> +               if (ret > 0)
>> +                       p = &(*p)->rb_left;
>> +               else if (ret < 0)
>> +                       p = &(*p)->rb_right;
>> +               else
>> +                       break;
>> +               exist = NULL;
>> +       }
>> +       ret = 0;
>> +       if (!exist) {
>> +               rb_link_node(&cs->node, parent, p);
>> +               rb_insert_color(&cs->node, &string_tree);
>> +       } else if (!kref_get_unless_zero(&exist->kref)) {
>> +               rb_erase(&exist->node, &string_tree);
>> +               RB_CLEAR_NODE(&exist->node);
>> +               ret = -EAGAIN;
>> +       }
>> +       spin_unlock(&string_tree_lock);
>> +       if (ret == -EAGAIN)
>> +               goto retry;
>> +
>> +       if (exist) {
>> +               kfree(cs);
>> +               cs = exist;
>> +       }
>> +
>> +       return cs;
>> +}
>> +EXPORT_SYMBOL(ceph_find_or_create_string);
>> +
>> +static void ceph_free_string(struct rcu_head *head)
>> +{
>> +       struct ceph_string *cs = container_of(head, struct ceph_string, rcu);
>> +       kfree(cs);
>> +}
>> +
>> +void ceph_release_string(struct kref *ref)
>> +{
>> +       struct ceph_string *cs = container_of(ref, struct ceph_string, kref);
>> +
>> +       spin_lock(&string_tree_lock);
>> +       if (!RB_EMPTY_NODE(&cs->node)) {
>> +               rb_erase(&cs->node, &string_tree);
>> +               RB_CLEAR_NODE(&cs->node);
>> +       }
>> +       spin_unlock(&string_tree_lock);
>> +
>> +       call_rcu(&cs->rcu, ceph_free_string);
>> +}
>> +EXPORT_SYMBOL(ceph_release_string);
>> +
>> +void ceph_string_cleanup(void)
>> +{
>> +       struct rb_node *p;
>> +       struct ceph_string *cs;
>> +       if (RB_EMPTY_ROOT(&string_tree))
>> +               return;
>> +
>> +       pr_err("libceph: detect string leaks\n");
>> +       while ((p = rb_first(&string_tree))) {
>> +               cs = rb_entry(p, struct ceph_string, node);
>> +               rb_erase(p, &string_tree);
>> +               kfree(cs);
>> +       }
>> +}
> 
> Hi Zheng,
> 
> A one and a half line commit message and an equally short cover letter
> for a series such as this isn't enough.  I *happen* to know that the
> basic use case for namespaces in cephfs is going to be restricting
> users to different parts of the directory tree, with the enforcement
> happening in ceph on the server side, as opposed to in ceph on the
> client side, but I would appreciate some details on what the actual
> namespace names are going to be, whether it's user input or not,
> whether there are plans to use namespaces for anything else, etc.
> 

The namespace restriction you mentioned is for cephfs metadata. This namespace restricts users to different namespaces within the cephfs data pool. (At present, the only way to restrict data access is to create multiple cephfs data pools and restrict users to different data pools. Creating lots of pools is not efficient for the cluster.)


> I don't like the idea of sharing namespace name strings between libceph
> and ceph modules, especially with the strings themselves hosted in
> libceph.  rbd has no use for namespaces, so libceph can live with
> namespace names embedded into ceph_osd_request by value or with
> a simple non-owning pointer, leaving reference counting to the outside
> modules, if one of the use cases is "one namespace with a long name for
> the entire directory tree" or something close to it.
> 
> I think the sharing infrastructure should be moved into cephfs, and
> probably changed to share entire file layouts along the way.  I don't
> know this code well enough to be sure, but it seems that by sharing
> file layouts and making ci->i_layout an owning pointer you might be
> able to piggy back on i_ceph_lock and considerably simplify the whole
> thing by dropping RCU and eliminating numerous get/put calls.

RBD may use namespace later.
http://tracker.ceph.com/projects/ceph/wiki/Rbd_-_namespace_support

The reason I use RCU here is that ci->i_layout.pool_ns can change at any time. For the same reason, using a non-owning pointer for the namespace or the entire layout is not feasible. Embedding the namespace is not elegant either: when allocating a ceph_osd_request, cephfs would need to lock i_ceph_lock, copy the namespace into a temporary buffer, unlock i_ceph_lock, and then pass ci->i_layout and the temporary buffer to ceph_osdc_xxx_request().
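
To make the trade-off concrete, here is a rough sketch of that
copy-under-lock alternative; the buffer bound and the idea of passing
the copy into a modified ceph_osdc_new_request() are illustrative
assumptions, not code from this series:

    struct ceph_string *ns;
    char ns_buf[100];       /* illustrative bound on namespace length */
    size_t ns_len = 0;

    spin_lock(&ci->i_ceph_lock);
    ns = rcu_dereference_protected(ci->i_layout.pool_ns,
                                   lockdep_is_held(&ci->i_ceph_lock));
    if (ns) {
            ns_len = min_t(size_t, ns->len, sizeof(ns_buf));
            memcpy(ns_buf, ns->str, ns_len);
    }
    spin_unlock(&ci->i_ceph_lock);

    /* ns_buf/ns_len would then have to travel alongside &ci->i_layout
     * into ceph_osdc_new_request(), which would copy them once more
     * into the request.  The extra copies and the widened API are what
     * ceph_try_get_string() avoids: it takes a counted reference with
     * no lock at all, dropped when the request is freed. */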

Regards
Yan, Zheng



> Thanks,
> 
>                Ilya


^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [PATCH v2 3/6] libceph: rados pool namesapce support
       [not found]       ` <CAOi1vP8MnAQSr9HQiOWKML8YED-ADNktqTtcGg6LYy3wEXA8LA@mail.gmail.com>
@ 2016-03-22  9:19         ` Ilya Dryomov
       [not found]         ` <9031B3CD-2132-4A0E-A2C9-0105A61B202E@redhat.com>
  1 sibling, 0 replies; 33+ messages in thread
From: Ilya Dryomov @ 2016-03-22  9:19 UTC (permalink / raw)
  To: Yan, Zheng, Ceph Development

On Tue, Mar 22, 2016 at 10:17 AM, Ilya Dryomov <idryomov@gmail.com> wrote:
> On Tue, Mar 22, 2016 at 8:52 AM, Yan, Zheng <zyan@redhat.com> wrote:
>>
>>> On Mar 22, 2016, at 14:11, Ilya Dryomov <idryomov@gmail.com> wrote:
>>>
>>> On Sat, Feb 6, 2016 at 8:00 AM, Yan, Zheng <zyan@redhat.com> wrote:
>>>> Signed-off-by: Yan, Zheng <zyan@redhat.com>
>>>> ---
>>>> drivers/block/rbd.c          |  1 +
>>>> fs/ceph/inode.c              |  3 +++
>>>> include/linux/ceph/ceph_fs.h |  2 ++
>>>> include/linux/ceph/osdmap.h  |  2 ++
>>>> net/ceph/osd_client.c        | 37 ++++++++++++++++++++++++++-----------
>>>> net/ceph/osdmap.c            | 33 +++++++++++++++++++++++++++------
>>>> 6 files changed, 61 insertions(+), 17 deletions(-)
>>>>
>>>> diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
>>>> index b0bcb2d..0423493 100644
>>>> --- a/drivers/block/rbd.c
>>>> +++ b/drivers/block/rbd.c
>>>> @@ -4088,6 +4088,7 @@ static struct rbd_device *rbd_dev_create(struct rbd_client *rbdc,
>>>>        rbd_dev->layout.stripe_count = 1;
>>>>        rbd_dev->layout.object_size = 1 << RBD_MAX_OBJ_ORDER;
>>>>        rbd_dev->layout.pool_id = spec->pool_id;
>>>> +       RCU_INIT_POINTER(rbd_dev->layout.pool_ns, NULL);
>>>>
>>>>        /*
>>>>         * If this is a mapping rbd_dev (as opposed to a parent one),
>>>> diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
>>>> index b0ad53d..3c220f1 100644
>>>> --- a/fs/ceph/inode.c
>>>> +++ b/fs/ceph/inode.c
>>>> @@ -396,6 +396,7 @@ struct inode *ceph_alloc_inode(struct super_block *sb)
>>>>        ci->i_symlink = NULL;
>>>>
>>>>        memset(&ci->i_dir_layout, 0, sizeof(ci->i_dir_layout));
>>>> +       RCU_INIT_POINTER(ci->i_layout.pool_ns, NULL);
>>>>
>>>>        ci->i_fragtree = RB_ROOT;
>>>>        mutex_init(&ci->i_fragtree_mutex);
>>>> @@ -518,6 +519,8 @@ void ceph_destroy_inode(struct inode *inode)
>>>>        if (ci->i_xattrs.prealloc_blob)
>>>>                ceph_buffer_put(ci->i_xattrs.prealloc_blob);
>>>>
>>>> +       ceph_put_string(ci->i_layout.pool_ns);
>>>> +
>>>>        call_rcu(&inode->i_rcu, ceph_i_callback);
>>>> }
>>>>
>>>> diff --git a/include/linux/ceph/ceph_fs.h b/include/linux/ceph/ceph_fs.h
>>>> index 7d8728e..3858923 100644
>>>> --- a/include/linux/ceph/ceph_fs.h
>>>> +++ b/include/linux/ceph/ceph_fs.h
>>>> @@ -53,6 +53,7 @@ struct ceph_file_layout_legacy {
>>>>        __le32 fl_pg_pool;      /* namespace, crush ruleset, rep level */
>>>> } __attribute__ ((packed));
>>>>
>>>> +struct ceph_string;
>>>> /*
>>>>  * ceph_file_layout - describe data layout for a file/inode
>>>>  */
>>>> @@ -62,6 +63,7 @@ struct ceph_file_layout {
>>>>        u32 stripe_count;  /* over this many objects */
>>>>        u32 object_size;   /* until objects are this big */
>>>>        s64 pool_id;        /* rados pool id */
>>>> +       struct ceph_string __rcu *pool_ns; /* rados pool namespace */
>>>> };
>>>>
>>>> extern int ceph_file_layout_is_valid(const struct ceph_file_layout *layout);
>>>> diff --git a/include/linux/ceph/osdmap.h b/include/linux/ceph/osdmap.h
>>>> index e55c08b..3d59d6c 100644
>>>> --- a/include/linux/ceph/osdmap.h
>>>> +++ b/include/linux/ceph/osdmap.h
>>>> @@ -55,6 +55,7 @@ static inline bool ceph_can_shift_osds(struct ceph_pg_pool_info *pool)
>>>>
>>>> struct ceph_object_locator {
>>>>        s64 pool;
>>>> +       struct ceph_string *pool_ns;
>>>> };
>>>>
>>>> /*
>>>> @@ -63,6 +64,7 @@ struct ceph_object_locator {
>>>>  * (probably outdated: must be >= RBD_MAX_MD_NAME_LEN -- currently 100)
>>>>  */
>>>> #define CEPH_MAX_OID_NAME_LEN 100
>>>> +#define CEPH_MAX_NAMESPACE_LEN 100
>>>>
>>>> struct ceph_object_id {
>>>>        char name[CEPH_MAX_OID_NAME_LEN];
>>>> diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c
>>>> index 450955e..68e7f68 100644
>>>> --- a/net/ceph/osd_client.c
>>>> +++ b/net/ceph/osd_client.c
>>>> @@ -339,6 +339,8 @@ static void ceph_osdc_release_request(struct kref *kref)
>>>>                kfree(req->r_ops);
>>>>
>>>>        ceph_put_snap_context(req->r_snapc);
>>>> +       ceph_put_string(req->r_base_oloc.pool_ns);
>>>> +
>>>>        if (req->r_mempool)
>>>>                mempool_free(req, req->r_osdc->req_mempool);
>>>>        else
>>>> @@ -388,6 +390,9 @@ struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc,
>>>>        req->r_num_ops = 0;
>>>>        req->r_max_ops = num_ops;
>>>>
>>>> +       req->r_base_oloc.pool = -1;
>>>> +       req->r_target_oloc.pool = -1;
>>>> +
>>>>        if (num_ops <= CEPH_OSD_INITIAL_OP) {
>>>>                req->r_ops = req->r_inline_ops;
>>>>        } else {
>>>> @@ -409,9 +414,6 @@ struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc,
>>>>        INIT_LIST_HEAD(&req->r_req_lru_item);
>>>>        INIT_LIST_HEAD(&req->r_osd_item);
>>>>
>>>> -       req->r_base_oloc.pool = -1;
>>>> -       req->r_target_oloc.pool = -1;
>>>> -
>>>>        /* create reply message */
>>>>        msg_size = OSD_OPREPLY_FRONT_LEN;
>>>>        if (num_ops > CEPH_OSD_INITIAL_OP) {
>>>> @@ -433,7 +435,7 @@ struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc,
>>>>
>>>>        /* create request message; allow space for oid */
>>>>        msg_size = 4 + 4 + 8 + 8 + 4 + 8;
>>>> -       msg_size += 2 + 4 + 8 + 4 + 4; /* oloc */
>>>> +       msg_size += 2 + 4 + 8 + 4 + 4 + 4 + CEPH_MAX_NAMESPACE_LEN; /* oloc */
>>>>        msg_size += 1 + 8 + 4 + 4;     /* pg_t */
>>>>        msg_size += 4 + CEPH_MAX_OID_NAME_LEN; /* oid */
>>>>        msg_size += 2 + num_ops * sizeof(struct ceph_osd_op);
>>>> @@ -864,6 +866,7 @@ struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc,
>>>>        }
>>>>
>>>>        req->r_base_oloc.pool = layout->pool_id;
>>>> +       req->r_base_oloc.pool_ns = ceph_try_get_string(layout->pool_ns);
>>>>
>>>>        snprintf(req->r_base_oid.name, sizeof(req->r_base_oid.name),
>>>>                 "%llx.%08llx", vino.ino, objnum);
>>>> @@ -1719,10 +1722,10 @@ static int ceph_oloc_decode(void **p, void *end,
>>>>        }
>>>>
>>>>        if (struct_v >= 5) {
>>>> -               len = ceph_decode_32(p);
>>>> -               if (len > 0) {
>>>> -                       pr_warn("ceph_object_locator::nspace is set\n");
>>>> -                       goto e_inval;
>>>> +               u32 ns_len = ceph_decode_32(p);
>>>> +               if (ns_len > 0) {
>>>> +                       ceph_decode_need(p, end, ns_len, e_inval);
>>>> +                       *p += ns_len;
>>>>                }
>>>>        }
>>>>
>>>> @@ -1907,7 +1910,7 @@ static void handle_reply(struct ceph_osd_client *osdc, struct ceph_msg *msg)
>>>>
>>>>                __unregister_request(osdc, req);
>>>>
>>>> -               req->r_target_oloc = redir.oloc; /* struct */
>>>> +               req->r_target_oloc.pool = redir.oloc.pool;
>>>>
>>>>                /*
>>>>                 * Start redirect requests with nofail=true.  If
>>>> @@ -2459,6 +2462,7 @@ void ceph_osdc_build_request(struct ceph_osd_request *req, u64 off,
>>>>                                struct timespec *mtime)
>>>> {
>>>>        struct ceph_msg *msg = req->r_request;
>>>> +       struct ceph_string *pool_ns;
>>>>        void *p;
>>>>        size_t msg_size;
>>>>        int flags = req->r_flags;
>>>> @@ -2483,14 +2487,25 @@ void ceph_osdc_build_request(struct ceph_osd_request *req, u64 off,
>>>>        req->r_request_reassert_version = p;
>>>>        p += sizeof(struct ceph_eversion); /* will get filled in */
>>>>
>>>> +       if (req->r_base_oloc.pool_ns)
>>>> +               pool_ns = req->r_base_oloc.pool_ns;
>>>> +       else
>>>> +               pool_ns = NULL;
>>>> +
>>>>        /* oloc */
>>>> +       ceph_encode_8(&p, 5);
>>>>        ceph_encode_8(&p, 4);
>>>> -       ceph_encode_8(&p, 4);
>>>> -       ceph_encode_32(&p, 8 + 4 + 4);
>>>> +       ceph_encode_32(&p, 8 + 4 + 4 + 4 + (pool_ns ? pool_ns->len : 0));
>>>>        req->r_request_pool = p;
>>>>        p += 8;
>>>>        ceph_encode_32(&p, -1);  /* preferred */
>>>>        ceph_encode_32(&p, 0);   /* key len */
>>>> +       if (pool_ns) {
>>>> +               ceph_encode_32(&p, pool_ns->len);
>>>> +               ceph_encode_copy(&p, pool_ns->str, pool_ns->len);
>>>> +       } else {
>>>> +               ceph_encode_32(&p, 0);
>>>> +       }
>>>>
>>>>        ceph_encode_8(&p, 1);
>>>>        req->r_request_pgid = p;
>>>> diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c
>>>> index f033ca5..f117848 100644
>>>> --- a/net/ceph/osdmap.c
>>>> +++ b/net/ceph/osdmap.c
>>>> @@ -1470,12 +1470,33 @@ int ceph_oloc_oid_to_pg(struct ceph_osdmap *osdmap,
>>>>        if (!pi)
>>>>                return -EIO;
>>>>
>>>> -       pg_out->pool = oloc->pool;
>>>> -       pg_out->seed = ceph_str_hash(pi->object_hash, oid->name,
>>>> -                                    oid->name_len);
>>>> -
>>>> -       dout("%s '%.*s' pgid %llu.%x\n", __func__, oid->name_len, oid->name,
>>>> -            pg_out->pool, pg_out->seed);
>>>> +       if (!oloc->pool_ns) {
>>>> +               pg_out->pool = oloc->pool;
>>>> +               pg_out->seed = ceph_str_hash(pi->object_hash, oid->name,
>>>> +                                            oid->name_len);
>>>> +               dout("%s '%.*s' pgid %llu.%x\n", __func__,
>>>> +                    oid->name_len, oid->name, pg_out->pool, pg_out->seed);
>>>> +       } else {
>>>> +               char stack_buf[256];
>>>> +               char *buf = stack_buf;
>>>> +               int nsl = oloc->pool_ns->len;
>>>> +               size_t total = nsl + 1 + oid->name_len;
>>>> +               if (total > sizeof(stack_buf)) {
>>>> +                       buf = kmalloc(total, GFP_NOFS);
>>>> +                       if (!buf)
>>>> +                               return -ENOMEM;
>>>> +               }
>>>
>>> This ties into my question about how namespaces are going to be used
>>> and how long the namespace name is allowed to be.
>>>
>>> CEPH_MAX_NAMESPACE_LEN is defined to 100 above, but that definition is
>>> removed in patch 5.  That needs fixing, and if the 100 char limit is
>>> real, then buf can just be
>>>
>>>    CEPH_MAX_OID_NAME_LEN + CEPH_MAX_NAMESPACE_LEN + 1
>>>
>>> with no need for a kmalloc().
>>
>> CEPH_MAX_NAMESPACE_LEN is an intermediate definition for splitting the patches (so each individual patch can compile). As far as I know, there is no limit on namespace length.

(adding ceph-devel back)

>
> To me, that's an indication of a poorly structured series.
>
> I understand that it's just a std::string in userspace and so there
> isn't a limit as such.  Same goes for OIDs, but we do limit those in
> the kernel client.  Can we do the same for namespace names?

Thanks,

                Ilya

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [PATCH v2 2/6] libceph: introduce reference counted string
  2016-03-22  9:17     ` Yan, Zheng
@ 2016-03-22 11:00       ` Ilya Dryomov
  2016-03-22 13:57         ` Yan, Zheng
  0 siblings, 1 reply; 33+ messages in thread
From: Ilya Dryomov @ 2016-03-22 11:00 UTC (permalink / raw)
  To: Yan, Zheng; +Cc: Ceph Development, Sage Weil

On Tue, Mar 22, 2016 at 10:17 AM, Yan, Zheng <zyan@redhat.com> wrote:
>
>> On Mar 22, 2016, at 14:05, Ilya Dryomov <idryomov@gmail.com> wrote:

[ snip ]

>> Hi Zheng,
>>
>> A one and a half line commit message and an equally short cover letter
>> for a series such as this isn't enough.  I *happen* to know that the
>> basic use case for namespaces in cephfs is going to be restricting
>> users to different parts of the directory tree, with the enforcement
>> happening in ceph on the server side, as opposed to in ceph on the
>> client side, but I would appreciate some details on what the actual
>> namespace names are going to be, whether it's user input or not,
>> whether there are plans to use namespaces for anything else, etc.
>>
>
> The namespace restriction you mentioned is for cephfs metadata. This
> namespace restricts users to different namespaces within the cephfs data
> pool. (At present, the only way to restrict data access is to create
> multiple cephfs data pools and restrict users to different data pools.
> Creating lots of pools is not efficient for the cluster.)

What about the namespace names, who is generating them, how long are
they going to be?  Please describe in detail how this is going to work
for both data and metadata pools.

>
>
>> I don't like the idea of sharing namespace name strings between libceph
>> and ceph modules, especially with the strings themselves hosted in
>> libceph.  rbd has no use for namespaces, so libceph can live with
>> namespace names embedded into ceph_osd_request by value or with
>> a simple non-owning pointer, leaving reference counting to the outside
>> modules, if one of the use cases is "one namespace with a long name for
>> the entire directory tree" or something close to it.
>>
>> I think the sharing infrastructure should be moved into cephfs, and
>> probably changed to share entire file layouts along the way.  I don't
>> know this code well enough to be sure, but it seems that by sharing
>> file layouts and making ci->i_layout an owning pointer you might be
>> able to piggy back on i_ceph_lock and considerably simplify the whole
>> thing by dropping RCU and eliminating numerous get/put calls.
>
> RBD may use namespace later.
> http://tracker.ceph.com/projects/ceph/wiki/Rbd_-_namespace_support

Well, compared to cephfs, it's hard to call that "using" - in that
case, there will only ever be one namespace per image.  My point is
it's never going to use the string sharing infrastructure and is fine
with a non-owning pointer to a string in the file layout field of the
in-memory rbd image header.

>
> The reason I use RCU here is that ci->i_layout.pool_ns can change at
> any time. For the same reason, using non-owning pointer for namespace
> or entire layout is unfeasible. Using embedded namespace is not
> elegant either. When allocating ceph_osd_request, cephfs needs to
> lock i_ceph_lock, copy namespace to a temporary buffer, unlock
> i_ceph_lock, pass ci->i_layout and the temporary buffer to the
> ceph_osdc_xxx_request().

Hmm, RCU doesn't protect you from pool_ns or other file layout fields
changing while the OSD request is in flight.  As used above, it allows
ceph_try_get_string() to not take any locks and that's it.

Why exactly can't file layouts be shared?  ci->i_layout would become
reference counted and you would give libceph a pointer to the pool_ns
string (or entire layout) after bumping it.  It doesn't matter if
pool_ns or the rest of the layout changes due to a cap grant or revoke
while libceph is servicing the OSD request: you would unlink it from
the tree but the bumped reference will keep the layout around, to be
put in the OSD request completion callback or so.  Layout lookup would
have to happen in exactly two places: when newing an inode and handling
cap grant/revoke, in other places you would simply bump the count on
the basis of already holding a valid pointer.  You wouldn't have to
unlink in the destructor, so no hassle with kref_get_unless_zero() and
no need for RCU, with i_ceph_lock providing the exclusion around the
tree.
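
For illustration only, a bare-bones sketch of that kind of scheme;
every name below is hypothetical and nothing here comes from the
series:

    struct ceph_layout_ref {                 /* hypothetical */
            struct kref kref;
            struct rb_node node;             /* fs-wide tree, under i_ceph_lock */
            struct ceph_file_layout layout;  /* namespace stored by value */
    };

    /* fill_inode()/handle_cap_grant(): look up or insert the decoded
     * layout under i_ceph_lock, swap ci->i_layout (now an owning
     * pointer) and put the old entry once the lock is dropped. */

    /* I/O submission: the caller already holds a valid pointer, so it
     * only bumps the count and hands it to libceph ... */
    kref_get(&ci->i_layout->kref);
    req->r_layout_ref = ci->i_layout;        /* hypothetical field */

    /* ... and puts it from the OSD request completion callback. */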

Like I said, I'm not familiar with the code in question at the
necessary level of detail, so feel free to destroy this, but I don't
see a problem.

Thanks,

                Ilya

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [PATCH v2 3/6] libceph: rados pool namesapce support
       [not found]         ` <9031B3CD-2132-4A0E-A2C9-0105A61B202E@redhat.com>
@ 2016-03-22 11:03           ` Ilya Dryomov
  0 siblings, 0 replies; 33+ messages in thread
From: Ilya Dryomov @ 2016-03-22 11:03 UTC (permalink / raw)
  To: Yan, Zheng; +Cc: Ceph Development

On Tue, Mar 22, 2016 at 10:30 AM, Yan, Zheng <zyan@redhat.com> wrote:
>
>> On Mar 22, 2016, at 17:17, Ilya Dryomov <idryomov@gmail.com> wrote:
>>
>> On Tue, Mar 22, 2016 at 8:52 AM, Yan, Zheng <zyan@redhat.com> wrote:
>>>
>>>> On Mar 22, 2016, at 14:11, Ilya Dryomov <idryomov@gmail.com> wrote:

[ snip ]

>>>> This ties into my question about how namespaces are going to be used
>>>> and how long the namespace name is allowed to be.
>>>>
>>>> CEPH_MAX_NAMESPACE_LEN is defined to 100 above, but that definition is
>>>> removed in patch 5.  That needs fixing, and if the 100 char limit is
>>>> real, then buf can just be
>>>>
>>>>   CEPH_MAX_OID_NAME_LEN + CEPH_MAX_NAMESPACE_LEN + 1
>>>>
>>>> with no need for a kmalloc().
>>>
>>> CEPH_MAX_NAMESPACE_LEN is an intermediate definition for splitting the patches (so each individual patch can compile). As far as I know, there is no limit on namespace length.
>>
>> To me, that's an indication of a poorly structured series.
>>
>> I understand that it's just a std::string in userspace and so there
>> isn't a limit as such.  Same goes for OIDs, but we do limit those in
>> the kernel client.  Can we do the same for namespace names?
>
> We can, but it's irrelevant to this series; I can squash this patch and patch 5.

On the contrary, it's very relevant.  It answers whether embedding
namespace names by value into ceph_osd_request is feasible.

Thanks,

                Ilya

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [PATCH v2 2/6] libceph: introduce reference counted string
  2016-03-22 11:00       ` Ilya Dryomov
@ 2016-03-22 13:57         ` Yan, Zheng
  2016-03-22 16:13           ` Ilya Dryomov
  0 siblings, 1 reply; 33+ messages in thread
From: Yan, Zheng @ 2016-03-22 13:57 UTC (permalink / raw)
  To: Ilya Dryomov; +Cc: Ceph Development, Sage Weil


> On Mar 22, 2016, at 19:00, Ilya Dryomov <idryomov@gmail.com> wrote:
> 
> On Tue, Mar 22, 2016 at 10:17 AM, Yan, Zheng <zyan@redhat.com> wrote:
>> 
>>> On Mar 22, 2016, at 14:05, Ilya Dryomov <idryomov@gmail.com> wrote:
> 
> [ snip ]
> 
>>> Hi Zheng,
>>> 
>>> A one and a half line commit message and an equally short cover letter
>>> for a series such as this isn't enough.  I *happen* to know that the
>>> basic use case for namespaces in cephfs is going to be restricting
>>> users to different parts of the directory tree, with the enforcement
>>> happening in ceph on the server side, as opposed to in ceph on the
>>> client side, but I would appreciate some details on what the actual
>>> namespace names are going to be, whether it's user input or not,
>>> whether there are plans to use namespaces for anything else, etc.
>>> 
>> 
>> The namespace restriction you mentioned is for cephfs metadata. This
>> namespace is restricting users to different namespaces in cephfs data
>> pool. (At present, the only way to restrict data access is, creating
>> multiple cephfs data pools, restrict users to different data pool.
>> Creating lost of pools is not efficient for the cluster)
> 
> What about the namespace names, who is generating them, how long are
> they going to be?  Please describe in detail how this is going to work
> for both data and metadata pools.

For example, to restrict user foo to directory /foo_dir

// create auth caps for user foo.
ceph auth get-or-create client.foo mon 'allow r' mds 'allow r, allow rw path=/foo_dir' osd 'allow rw pool=data namespace=foo_ns'

// mount cephfs by user admin
mount -t ceph 10.0.0.1:/ /mnt/ceph_mount -o name=admin,secret=xxxx

// set directory’s layout.pool_namespace
setfattr -n ceph.dir.pool_namespace -v foo_ns /mnt/ceph_mount/foo_dir

The admin user chooses the namespace name. In most cases, the namespace name does not change.

> 
>> 
>> 
>>> I don't like the idea of sharing namespace name strings between libceph
>>> and ceph modules, especially with the strings themselves hosted in
>>> libceph.  rbd has no use for namespaces, so libceph can live with
>>> namespace names embedded into ceph_osd_request by value or with
>>> a simple non-owning pointer, leaving reference counting to the outside
>>> modules, if one of the use cases is "one namespace with a long name for
>>> the entire directory tree" or something close to it.
>>> 
>>> I think the sharing infrastructure should be moved into cephfs, and
>>> probably changed to share entire file layouts along the way.  I don't
>>> know this code well enough to be sure, but it seems that by sharing
>>> file layouts and making ci->i_layout an owning pointer you might be
>>> able to piggy back on i_ceph_lock and considerably simplify the whole
>>> thing by dropping RCU and eliminating numerous get/put calls.
>> 
>> RBD may use namespace later.
>> http://tracker.ceph.com/projects/ceph/wiki/Rbd_-_namespace_support
> 
> Well, compared to cephfs, it's hard to call that "using" - in that
> case, there will only ever be one namespace per image.  My point is
> it's never going to use the string sharing infrastructure and is fine
> with a non-owning pointer to a string in the file layout field of the
> in-memory rbd image header.
> 
>> 
>> The reason I use RCU here is that ci->i_layout.pool_ns can change at
>> any time. For the same reason, using non-owning pointer for namespace
>> or entire layout is unfeasible. Using embedded namespace is not
>> elegant either. When allocating ceph_osd_request, cephfs needs to
>> lock i_ceph_lock, copy namespace to a temporary buffer, unlock
>> i_ceph_lock, pass ci->i_layout and the temporary buffer to the
>> ceph_osdc_xxx_request().
> 
> Hmm, RCU doesn't protect you from pool_ns or other file layout fields
> changing while the OSD request is in flight.  As used above, it allows
> ceph_try_get_string() to not take any locks and that's it.

Yes.  But not taking the lock simplifies the code a lot.  We don't need to lock/unlock i_ceph_lock each time i_layout is used.
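
To make this concrete, a minimal sketch of the call site being described, assuming the ceph_string helpers from patch 2 (ceph_try_get_string() plus a matching, NULL-safe ceph_put_string()) and the pool_ns field in the new in-memory layout; the function name is illustrative only:

/* Sketch: grab the namespace for one request without taking i_ceph_lock. */
static void encode_request_ns(struct ceph_inode_info *ci)
{
        struct ceph_string *ns;

        /* bumps the kref under rcu_read_lock(); may return NULL */
        ns = ceph_try_get_string(ci->i_layout.pool_ns);

        /* ... encode ns (NULL means the default namespace) into the request ... */

        ceph_put_string(ns);    /* drop the reference once it is encoded */
}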

> 
> Why exactly can't file layouts be shared?  ci->i_layout would become
> reference counted and you would give libceph a pointer to the pool_ns
> string (or entire layout) after bumping it.  It doesn't matter if
> pool_ns or the rest of the layout changes due to a cap grant or revoke
> while libceph is servicing the OSD request: you would unlink it from
> the tree but the bumped reference will keep the layout around, to be
> put in the OSD request completion callback or so.  Layout lookup would
> have to happen in exactly two places: when newing an inode and handling
> cap grant/revoke, in other places you would simply bump the count on
> the basis of already holding a valid pointer.  You wouldn't have to
> unlink in the destructor, so no hassle with kref_get_unless_zero() and
> no need for RCU, with i_ceph_lock providing the exclusion around the
> tree.

This means cephfs needs to set r_callback for every ceph_osd_request, and ceph_osd_request also needs a new field to store the layout pointer. I don’t think that’s better/simpler than a reference counted namespace string.
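
Roughly what that would look like, as a purely hypothetical sketch: r_layout, ceph_get_layout() and ceph_put_layout() do not exist in this series, and the callback signature is simplified:

/* Hypothetical: each request carries an owning reference to the layout. */
static void req_done_put_layout(struct ceph_osd_request *req)
{
        ceph_put_layout(req->r_layout);                 /* drop the reference */
}

static void submit_with_layout(struct ceph_osd_request *req,
                               struct ceph_inode_info *ci)
{
        req->r_layout = ceph_get_layout(ci->i_layout);  /* take a reference */
        req->r_callback = req_done_put_layout;          /* put on completion */
}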

> 
> Like I said, I'm not familiar with the code in question at the
> necessary level of detail, so feel free to destroy this, but I don't
> see a problem.
> 
> Thanks,
> 
>                Ilya


^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [PATCH v2 2/6] libceph: introduce reference counted string
  2016-03-22 13:57         ` Yan, Zheng
@ 2016-03-22 16:13           ` Ilya Dryomov
  2016-03-22 18:46             ` Gregory Farnum
  2016-03-23  3:37             ` Yan, Zheng
  0 siblings, 2 replies; 33+ messages in thread
From: Ilya Dryomov @ 2016-03-22 16:13 UTC (permalink / raw)
  To: Yan, Zheng; +Cc: Ceph Development, Sage Weil

On Tue, Mar 22, 2016 at 2:57 PM, Yan, Zheng <zyan@redhat.com> wrote:
>
>> On Mar 22, 2016, at 19:00, Ilya Dryomov <idryomov@gmail.com> wrote:
>>
>> On Tue, Mar 22, 2016 at 10:17 AM, Yan, Zheng <zyan@redhat.com> wrote:
>>>
>>>> On Mar 22, 2016, at 14:05, Ilya Dryomov <idryomov@gmail.com> wrote:
>>
>> [ snip ]
>>
>>>> Hi Zheng,
>>>>
>>>> A one and a half line commit message and an equally short cover letter
>>>> for a series such as this isn't enough.  I *happen* to know that the
>>>> basic use case for namespaces in cephfs is going to be restricting
>>>> users to different parts of the directory tree, with the enforcement
>>>> happening in ceph on the server side, as opposed to in ceph on the
>>>> client side, but I would appreciate some details on what the actual
>>>> namespace names are going to be, whether it's user input or not,
>>>> whether there are plans to use namespaces for anything else, etc.
>>>>
>>>
>>> The namespace restriction you mentioned is for cephfs metadata. This
>>> namespace is restricting users to different namespaces in cephfs data
>>> pool. (At present, the only way to restrict data access is, creating
>>> multiple cephfs data pools, restrict users to different data pool.
>>> Creating lost of pools is not efficient for the cluster)
>>
>> What about the namespace names, who is generating them, how long are
>> they going to be?  Please describe in detail how this is going to work
>> for both data and metadata pools.
>
> For example, to restrict user foo to directory /foo_dir
>
> // create auth caps for user foo.
> ceph auth get-or-create client.foo mon 'allow r' mds 'allow r, allow rw path=/foo_dir' osd 'allow rw pool=data namespace=foo_ns’
>
> // mount cephfs by user admin
> mount -t ceph 10.0.0.1:/ /mnt/ceph_mount -o name=admin,secret=xxxx
>
> // set directory’s layout.pool_namespace
> setfattr -n ceph.dir.pool_namespace -v foo_ns /mnt/ceph_mount/foo_dir
>
> Admin user chooses namespace name. In most cases, namespace name does not change.

Good, I guess limiting it to 100 chars (or maybe even a smaller
number) is sensible then.

>
>>
>>>
>>>
>>>> I don't like the idea of sharing namespace name strings between libceph
>>>> and ceph modules, especially with the strings themselves hosted in
>>>> libceph.  rbd has no use for namespaces, so libceph can live with
>>>> namespace names embedded into ceph_osd_request by value or with
>>>> a simple non-owning pointer, leaving reference counting to the outside
>>>> modules, if one of the use cases is "one namespace with a long name for
>>>> the entire directory tree" or something close to it.
>>>>
>>>> I think the sharing infrastructure should be moved into cephfs, and
>>>> probably changed to share entire file layouts along the way.  I don't
>>>> know this code well enough to be sure, but it seems that by sharing
>>>> file layouts and making ci->i_layout an owning pointer you might be
>>>> able to piggy back on i_ceph_lock and considerably simplify the whole
>>>> thing by dropping RCU and eliminating numerous get/put calls.
>>>
>>> RBD may use namespace later.
>>> http://tracker.ceph.com/projects/ceph/wiki/Rbd_-_namespace_support
>>
>> Well, compared to cephfs, it's hard to call that "using" - in that
>> case, there will only ever be one namespace per image.  My point is
>> it's never going to use the string sharing infrastructure and is fine
>> with a non-owning pointer to a string in the file layout field of the
>> in-memory rbd image header.
>>
>>>
>>> The reason I use RCU here is that ci->i_layout.pool_ns can change at
>>> any time. For the same reason, using non-owning pointer for namespace
>>> or entire layout is unfeasible. Using embedded namespace is not
>>> elegant either. When allocating ceph_osd_request, cephfs needs to
>>> lock i_ceph_lock, copy namespace to a temporary buffer, unlock
>>> i_ceph_lock, pass ci->i_layout and the temporary buffer to the
>>> ceph_osdc_xxx_request().
>>
>> Hmm, RCU doesn't protect you from pool_ns or other file layout fields
>> changing while the OSD request is in flight.  As used above, it allows
>> ceph_try_get_string() to not take any locks and that's it.
>
> Yes.  But not taking lock simplify the code a lot.  we don't need to
> lock/unlock i_ceph_lock each time i_layout is used.

I wouldn't call a bunch of rcu_dereference_* variants sprinkled around
the code base a simplification, but, more importantly, is keeping the
pool_ns pointer valid really all you need?  Shouldn't there be some
kind of synchronization around "OK, I'm switching to a new layout for
this inode"?  As it is, pool_ns is grabbed in ceph_osdc_new_request(),
with two successive calls to ceph_osdc_new_request() potentially ending
up with two different namespaces, e.g. ceph_uninline_data().
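
A sketch of the window being described (illustrative only, not lifted from ceph_uninline_data()):

static void two_requests(struct ceph_inode_info *ci)
{
        struct ceph_string *ns1, *ns2;

        ns1 = ceph_try_get_string(ci->i_layout.pool_ns);  /* request 1 */
        /* a cap grant/revoke may install a new pool_ns here */
        ns2 = ceph_try_get_string(ci->i_layout.pool_ns);  /* request 2 */

        /* ns1 and ns2 can name two different namespaces */
        ceph_put_string(ns1);
        ceph_put_string(ns2);
}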

>
>>
>> Why exactly can't file layouts be shared?  ci->i_layout would become
>> reference counted and you would give libceph a pointer to the pool_ns
>> string (or entire layout) after bumping it.  It doesn't matter if
>> pool_ns or the rest of the layout changes due to a cap grant or revoke
>> while libceph is servicing the OSD request: you would unlink it from
>> the tree but the bumped reference will keep the layout around, to be
>> put in the OSD request completion callback or so.  Layout lookup would
>> have to happen in exactly two places: when newing an inode and handling
>> cap grant/revoke, in other places you would simply bump the count on
>> the basis of already holding a valid pointer.  You wouldn't have to
>> unlink in the destructor, so no hassle with kref_get_unless_zero() and
>> no need for RCU, with i_ceph_lock providing the exclusion around the
>> tree.
>
> This means cephfs needs to set r_callback for all ceph_osd_request,
> ceph_osd_request also needs a new field to store layout pointer.
> I don’t think it’s better/simpler than reference counted namespace
> string.

Not necessarily - you can put after ceph_osdc_wait_request() returns.
Somewhat unrelated, I'm working on refactoring osdc's handle_reply(),
and it'll probably be required that all OSD requests set one of the
callbacks, except for stateless fire-and-forget ones.
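
A hedged sketch of that synchronous shape, using the existing start/wait helpers; the request setup is elided and the only point is where the put lands:

static int sync_request(struct ceph_osd_client *osdc,
                        struct ceph_osd_request *req,
                        struct ceph_string *ns)
{
        int ret;

        ret = ceph_osdc_start_request(osdc, req, false);
        if (!ret)
                ret = ceph_osdc_wait_request(osdc, req);

        ceph_put_string(ns);    /* put after completion, no r_callback needed */
        ceph_osdc_put_request(req);
        return ret;
}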

Sharing ->i_layout as opposed to ->i_layout->pool_ns seemed sensible to
me because a) it naturally hangs off of ceph inode and b) logically,
it is entire layouts and not just namespaces that are shared across the
directory tree.  If you think reference counted pool_ns strings are
better, I won't argue with that, but, with cephfs being the only user
of either solution, it'll have to live in fs/ceph.

Separately, I think passing non-owning pool_ns pointers into libceph is
worth exploring, but if that doesn't easily map onto cephfs lifetime or
ownership rules, we will go with embedding namespace names by value into
ceph_osd_request (or, rather, ceph_object_locator).

Thanks,

                Ilya

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [PATCH v2 2/6] libceph: introduce reference counted string
  2016-03-22 16:13           ` Ilya Dryomov
@ 2016-03-22 18:46             ` Gregory Farnum
  2016-03-23  3:37             ` Yan, Zheng
  1 sibling, 0 replies; 33+ messages in thread
From: Gregory Farnum @ 2016-03-22 18:46 UTC (permalink / raw)
  To: Ilya Dryomov; +Cc: Yan, Zheng, Ceph Development, Sage Weil

On Tue, Mar 22, 2016 at 9:13 AM, Ilya Dryomov <idryomov@gmail.com> wrote:
> On Tue, Mar 22, 2016 at 2:57 PM, Yan, Zheng <zyan@redhat.com> wrote:
>>>>> I don't like the idea of sharing namespace name strings between libceph
>>>>> and ceph modules, especially with the strings themselves hosted in
>>>>> libceph.  rbd has no use for namespaces, so libceph can live with
>>>>> namespace names embedded into ceph_osd_request by value or with
>>>>> a simple non-owning pointer, leaving reference counting to the outside
>>>>> modules, if one of the use cases is "one namespace with a long name for
>>>>> the entire directory tree" or something close to it.
>>>>>
>>>>> I think the sharing infrastructure should be moved into cephfs, and
>>>>> probably changed to share entire file layouts along the way.  I don't
>>>>> know this code well enough to be sure, but it seems that by sharing
>>>>> file layouts and making ci->i_layout an owning pointer you might be
>>>>> able to piggy back on i_ceph_lock and considerably simplify the whole
>>>>> thing by dropping RCU and eliminating numerous get/put calls.
>>>>
>>>> RBD may use namespace later.
>>>> http://tracker.ceph.com/projects/ceph/wiki/Rbd_-_namespace_support
>>>
>>> Well, compared to cephfs, it's hard to call that "using" - in that
>>> case, there will only ever be one namespace per image.  My point is
>>> it's never going to use the string sharing infrastructure and is fine
>>> with a non-owning pointer to a string in the file layout field of the
>>> in-memory rbd image header.
>>>
>>>>
>>>> The reason I use RCU here is that ci->i_layout.pool_ns can change at
>>>> any time. For the same reason, using non-owning pointer for namespace
>>>> or entire layout is unfeasible. Using embedded namespace is not
>>>> elegant either. When allocating ceph_osd_request, cephfs needs to
>>>> lock i_ceph_lock, copy namespace to a temporary buffer, unlock
>>>> i_ceph_lock, pass ci->i_layout and the temporary buffer to the
>>>> ceph_osdc_xxx_request().
>>>
>>> Hmm, RCU doesn't protect you from pool_ns or other file layout fields
>>> changing while the OSD request is in flight.  As used above, it allows
>>> ceph_try_get_string() to not take any locks and that's it.
>>
>> Yes.  But not taking lock simplify the code a lot.  we don't need to
>> lock/unlock i_ceph_lock each time i_layout is used.
>
> I wouldn't call a bunch of rcu_dereference_* variants sprinkled around
> the code base a simplification, but, more importantly, is keeping the
> pool_ns pointer valid really all you need?  Shouldn't there be some
> kind of synchronization around "OK, I'm switching to a new layout for
> this inode"?  As it is, pool_ns is grabbed in ceph_osdc_new_request(),
> with two successive calls to ceph_osdc_new_request() potentially ending
> up with two different namespaces, e.g. ceph_uninline_data().

I haven't looked at the code at all, but this part is really confusing
me. Namespaces can't be changed arbitrarily, but when they are, a
subsequent write really needs to use the updated namespace! It needs
an atomic switch, not a lazy update. Is there some point at which
out-of-date users get updated prior to actually sending off OSD ops?
-Greg

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [PATCH v2 2/6] libceph: introduce reference counted string
  2016-03-22 16:13           ` Ilya Dryomov
  2016-03-22 18:46             ` Gregory Farnum
@ 2016-03-23  3:37             ` Yan, Zheng
  2016-03-23  9:51               ` Ilya Dryomov
  1 sibling, 1 reply; 33+ messages in thread
From: Yan, Zheng @ 2016-03-23  3:37 UTC (permalink / raw)
  To: Ilya Dryomov, Gregory Farnum; +Cc: Ceph Development, Sage Weil


> On Mar 23, 2016, at 00:13, Ilya Dryomov <idryomov@gmail.com> wrote:
> 
> On Tue, Mar 22, 2016 at 2:57 PM, Yan, Zheng <zyan@redhat.com> wrote:
>> 
>>> On Mar 22, 2016, at 19:00, Ilya Dryomov <idryomov@gmail.com> wrote:
>>> 
>>> On Tue, Mar 22, 2016 at 10:17 AM, Yan, Zheng <zyan@redhat.com> wrote:
>>>> 
>>>>> On Mar 22, 2016, at 14:05, Ilya Dryomov <idryomov@gmail.com> wrote:
>>> 
>>> [ snip ]
>>> 
>>>>> Hi Zheng,
>>>>> 
>>>>> A one and a half line commit message and an equally short cover letter
>>>>> for a series such as this isn't enough.  I *happen* to know that the
>>>>> basic use case for namespaces in cephfs is going to be restricting
>>>>> users to different parts of the directory tree, with the enforcement
>>>>> happening in ceph on the server side, as opposed to in ceph on the
>>>>> client side, but I would appreciate some details on what the actual
>>>>> namespace names are going to be, whether it's user input or not,
>>>>> whether there are plans to use namespaces for anything else, etc.
>>>>> 
>>>> 
>>>> The namespace restriction you mentioned is for cephfs metadata. This
>>>> namespace is restricting users to different namespaces in cephfs data
>>>> pool. (At present, the only way to restrict data access is, creating
>>>> multiple cephfs data pools, restrict users to different data pool.
>>>> Creating lost of pools is not efficient for the cluster)
>>> 
>>> What about the namespace names, who is generating them, how long are
>>> they going to be?  Please describe in detail how this is going to work
>>> for both data and metadata pools.
>> 
>> For example, to restrict user foo to directory /foo_dir
>> 
>> // create auth caps for user foo.
>> ceph auth get-or-create client.foo mon 'allow r' mds 'allow r, allow rw path=/foo_dir' osd 'allow rw pool=data namespace=foo_ns’
>> 
>> // mount cephfs by user admin
>> mount -t ceph 10.0.0.1:/ /mnt/ceph_mount -o name=admin,secret=xxxx
>> 
>> // set directory’s layout.pool_namespace
>> setfattr -n ceph.dir.pool_namespace -v foo_ns /mnt/ceph_mount/foo_dir
>> 
>> Admin user chooses namespace name. In most cases, namespace name does not change.
> 
> Good, I guess limiting it to 100 chars (or maybe even a smaller
> number) is sensible then.
> 
>> 
>>> 
>>>> 
>>>> 
>>>>> I don't like the idea of sharing namespace name strings between libceph
>>>>> and ceph modules, especially with the strings themselves hosted in
>>>>> libceph.  rbd has no use for namespaces, so libceph can live with
>>>>> namespace names embedded into ceph_osd_request by value or with
>>>>> a simple non-owning pointer, leaving reference counting to the outside
>>>>> modules, if one of the use cases is "one namespace with a long name for
>>>>> the entire directory tree" or something close to it.
>>>>> 
>>>>> I think the sharing infrastructure should be moved into cephfs, and
>>>>> probably changed to share entire file layouts along the way.  I don't
>>>>> know this code well enough to be sure, but it seems that by sharing
>>>>> file layouts and making ci->i_layout an owning pointer you might be
>>>>> able to piggy back on i_ceph_lock and considerably simplify the whole
>>>>> thing by dropping RCU and eliminating numerous get/put calls.
>>>> 
>>>> RBD may use namespace later.
>>>> http://tracker.ceph.com/projects/ceph/wiki/Rbd_-_namespace_support
>>> 
>>> Well, compared to cephfs, it's hard to call that "using" - in that
>>> case, there will only ever be one namespace per image.  My point is
>>> it's never going to use the string sharing infrastructure and is fine
>>> with a non-owning pointer to a string in the file layout field of the
>>> in-memory rbd image header.
>>> 
>>>> 
>>>> The reason I use RCU here is that ci->i_layout.pool_ns can change at
>>>> any time. For the same reason, using non-owning pointer for namespace
>>>> or entire layout is unfeasible. Using embedded namespace is not
>>>> elegant either. When allocating ceph_osd_request, cephfs needs to
>>>> lock i_ceph_lock, copy namespace to a temporary buffer, unlock
>>>> i_ceph_lock, pass ci->i_layout and the temporary buffer to the
>>>> ceph_osdc_xxx_request().
>>> 
>>> Hmm, RCU doesn't protect you from pool_ns or other file layout fields
>>> changing while the OSD request is in flight.  As used above, it allows
>>> ceph_try_get_string() to not take any locks and that's it.
>> 
>> Yes.  But not taking lock simplify the code a lot.  we don't need to
>> lock/unlock i_ceph_lock each time i_layout is used.
> 
> I wouldn't call a bunch of rcu_dereference_* variants sprinkled around
> the code base a simplification, but, more importantly, is keeping the
> pool_ns pointer valid really all you need?  Shouldn't there be some
> kind of synchronization around "OK, I'm switching to a new layout for
> this inode"?  As it is, pool_ns is grabbed in ceph_osdc_new_request(),
> with two successive calls to ceph_osdc_new_request() potentially ending
> up with two different namespaces, e.g. ceph_uninline_data().

There is synchronisation. When changing the file layout, the MDS revokes Frw caps from the client (block new read/write, for in-progress read/write). But this synchronisation is not completely reliable when the client session state is toggled between stale and active.

>>> 
>>> Why exactly can't file layouts be shared?  ci->i_layout would become
>>> reference counted and you would give libceph a pointer to the pool_ns
>>> string (or entire layout) after bumping it.  It doesn't matter if
>>> pool_ns or the rest of the layout changes due to a cap grant or revoke
>>> while libceph is servicing the OSD request: you would unlink it from
>>> the tree but the bumped reference will keep the layout around, to be
>>> put in the OSD request completion callback or so.  Layout lookup would
>>> have to happen in exactly two places: when newing an inode and handling
>>> cap grant/revoke, in other places you would simply bump the count on
>>> the basis of already holding a valid pointer.  You wouldn't have to
>>> unlink in the destructor, so no hassle with kref_get_unless_zero() and
>>> no need for RCU, with i_ceph_lock providing the exclusion around the
>>> tree.
>> 
>> This means cephfs needs to set r_callback for all ceph_osd_request,
>> ceph_osd_request also needs a new field to store layout pointer.
>> I don’t think it’s better/simpler than reference counted namespace
>> string.
> 
> Not necessarily - you can put after ceph_osdc_wait_request() returns.
> Somewhat unrelated, I'm working on refactoring osdc's handle_reply(),
> and it'll probably be required that all OSD requests set one of the
> callbacks, except for stateless fire-and-forget ones.

For the r_callback case (no wait case), without saving a pointer in ceph_osd_request, how can I know which layout to put?

> 
> Sharing ->i_layout as opposed to ->i_layout->pool_ns seemed sensible to
> me because a) it naturally hangs off of ceph inode and b) logically,
> it is entire layouts and not just namespaces that are shared across the
> directory tree.  If you think reference counted pool_ns strings are
> better, I won't argue with that, but, with cephfs being the only user
> of either solution, it'll have to live in fs/ceph.

I’m OK with both approaches. When sharing i_layout, we need to add a layout pointer to ceph_osd_request. After adding the layout pointer, why not let libceph release it when the request finishes?

> Separately, I think passing non-owning pool_ns pointers into libceph is
> worth exploring, but if that doesn't easily map onto cephfs lifetime or
> ownership rules, we will go with embedding namespace names by value into
> ceph_osd_request (or, rather, ceph_object_locator).
> 

As I stated in a previous mail, an embedded namespace is a nightmare for cephfs. Every time the namespace is used, cephfs needs to lock i_ceph_lock and copy the namespace to a temporary buffer.
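
For concreteness, a sketch of the copy-out pattern being objected to; the len/str fields of ceph_string and the buffer handling are assumptions for illustration:

/* Every caller would have to snapshot the namespace under i_ceph_lock. */
static size_t copy_ns_to_buf(struct ceph_inode_info *ci,
                             char *buf, size_t buflen)
{
        struct ceph_string *ns;
        size_t len = 0;

        spin_lock(&ci->i_ceph_lock);
        ns = rcu_dereference_protected(ci->i_layout.pool_ns,
                                       lockdep_is_held(&ci->i_ceph_lock));
        if (ns) {
                len = min_t(size_t, buflen - 1, ns->len);  /* assumed field */
                memcpy(buf, ns->str, len);                 /* assumed field */
        }
        buf[len] = '\0';
        spin_unlock(&ci->i_ceph_lock);
        return len;     /* buf is then passed down to ceph_osdc_*_request() */
}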

Regards
Yan, Zheng

> Thanks,
> 
>                Ilya


^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [PATCH v2 2/6] libceph: introduce reference counted string
  2016-03-23  3:37             ` Yan, Zheng
@ 2016-03-23  9:51               ` Ilya Dryomov
  2016-03-23 13:02                 ` Yan, Zheng
  0 siblings, 1 reply; 33+ messages in thread
From: Ilya Dryomov @ 2016-03-23  9:51 UTC (permalink / raw)
  To: Yan, Zheng; +Cc: Gregory Farnum, Ceph Development, Sage Weil

On Wed, Mar 23, 2016 at 4:37 AM, Yan, Zheng <zyan@redhat.com> wrote:
>
>> On Mar 23, 2016, at 00:13, Ilya Dryomov <idryomov@gmail.com> wrote:
>>
>> On Tue, Mar 22, 2016 at 2:57 PM, Yan, Zheng <zyan@redhat.com> wrote:
>>>
>>>> On Mar 22, 2016, at 19:00, Ilya Dryomov <idryomov@gmail.com> wrote:
>>>>
>>>> On Tue, Mar 22, 2016 at 10:17 AM, Yan, Zheng <zyan@redhat.com> wrote:
>>>>>
>>>>>> On Mar 22, 2016, at 14:05, Ilya Dryomov <idryomov@gmail.com> wrote:
>>>>
>>>> [ snip ]
>>>>
>>>>>> Hi Zheng,
>>>>>>
>>>>>> A one and a half line commit message and an equally short cover letter
>>>>>> for a series such as this isn't enough.  I *happen* to know that the
>>>>>> basic use case for namespaces in cephfs is going to be restricting
>>>>>> users to different parts of the directory tree, with the enforcement
>>>>>> happening in ceph on the server side, as opposed to in ceph on the
>>>>>> client side, but I would appreciate some details on what the actual
>>>>>> namespace names are going to be, whether it's user input or not,
>>>>>> whether there are plans to use namespaces for anything else, etc.
>>>>>>
>>>>>
>>>>> The namespace restriction you mentioned is for cephfs metadata. This
>>>>> namespace is restricting users to different namespaces in cephfs data
>>>>> pool. (At present, the only way to restrict data access is, creating
>>>>> multiple cephfs data pools, restrict users to different data pool.
>>>>> Creating lost of pools is not efficient for the cluster)
>>>>
>>>> What about the namespace names, who is generating them, how long are
>>>> they going to be?  Please describe in detail how this is going to work
>>>> for both data and metadata pools.
>>>
>>> For example, to restrict user foo to directory /foo_dir
>>>
>>> // create auth caps for user foo.
>>> ceph auth get-or-create client.foo mon 'allow r' mds 'allow r, allow rw path=/foo_dir' osd 'allow rw pool=data namespace=foo_ns’
>>>
>>> // mount cephfs by user admin
>>> mount -t ceph 10.0.0.1:/ /mnt/ceph_mount -o name=admin,secret=xxxx
>>>
>>> // set directory’s layout.pool_namespace
>>> setfattr -n ceph.dir.pool_namespace -v foo_ns /mnt/ceph_mount/foo_dir
>>>
>>> Admin user chooses namespace name. In most cases, namespace name does not change.
>>
>> Good, I guess limiting it to 100 chars (or maybe even a smaller
>> number) is sensible then.
>>
>>>
>>>>
>>>>>
>>>>>
>>>>>> I don't like the idea of sharing namespace name strings between libceph
>>>>>> and ceph modules, especially with the strings themselves hosted in
>>>>>> libceph.  rbd has no use for namespaces, so libceph can live with
>>>>>> namespace names embedded into ceph_osd_request by value or with
>>>>>> a simple non-owning pointer, leaving reference counting to the outside
>>>>>> modules, if one of the use cases is "one namespace with a long name for
>>>>>> the entire directory tree" or something close to it.
>>>>>>
>>>>>> I think the sharing infrastructure should be moved into cephfs, and
>>>>>> probably changed to share entire file layouts along the way.  I don't
>>>>>> know this code well enough to be sure, but it seems that by sharing
>>>>>> file layouts and making ci->i_layout an owning pointer you might be
>>>>>> able to piggy back on i_ceph_lock and considerably simplify the whole
>>>>>> thing by dropping RCU and eliminating numerous get/put calls.
>>>>>
>>>>> RBD may use namespace later.
>>>>> http://tracker.ceph.com/projects/ceph/wiki/Rbd_-_namespace_support
>>>>
>>>> Well, compared to cephfs, it's hard to call that "using" - in that
>>>> case, there will only ever be one namespace per image.  My point is
>>>> it's never going to use the string sharing infrastructure and is fine
>>>> with a non-owning pointer to a string in the file layout field of the
>>>> in-memory rbd image header.
>>>>
>>>>>
>>>>> The reason I use RCU here is that ci->i_layout.pool_ns can change at
>>>>> any time. For the same reason, using non-owning pointer for namespace
>>>>> or entire layout is unfeasible. Using embedded namespace is not
>>>>> elegant either. When allocating ceph_osd_request, cephfs needs to
>>>>> lock i_ceph_lock, copy namespace to a temporary buffer, unlock
>>>>> i_ceph_lock, pass ci->i_layout and the temporary buffer to the
>>>>> ceph_osdc_xxx_request().
>>>>
>>>> Hmm, RCU doesn't protect you from pool_ns or other file layout fields
>>>> changing while the OSD request is in flight.  As used above, it allows
>>>> ceph_try_get_string() to not take any locks and that's it.
>>>
>>> Yes.  But not taking lock simplify the code a lot.  we don't need to
>>> lock/unlock i_ceph_lock each time i_layout is used.
>>
>> I wouldn't call a bunch of rcu_dereference_* variants sprinkled around
>> the code base a simplification, but, more importantly, is keeping the
>> pool_ns pointer valid really all you need?  Shouldn't there be some
>> kind of synchronization around "OK, I'm switching to a new layout for
>> this inode"?  As it is, pool_ns is grabbed in ceph_osdc_new_request(),
>> with two successive calls to ceph_osdc_new_request() potentially ending
>> up with two different namespaces, e.g. ceph_uninline_data().
>
> There is synchronisation. When changing file layout, MDS revokes Frw caps from client (block new read/write, for in-progress read/write). But this synchronisation is not complete reliable when client session state is toggled between stale and active.

I have a bit of trouble parsing "block new read/write, for in-progress
read/write".  So the client will stop issuing requests as soon as it
learns that it no longer has a cap, but what happens with the in-flight
requests?

>
>>>>
>>>> Why exactly can't file layouts be shared?  ci->i_layout would become
>>>> reference counted and you would give libceph a pointer to the pool_ns
>>>> string (or entire layout) after bumping it.  It doesn't matter if
>>>> pool_ns or the rest of the layout changes due to a cap grant or revoke
>>>> while libceph is servicing the OSD request: you would unlink it from
>>>> the tree but the bumped reference will keep the layout around, to be
>>>> put in the OSD request completion callback or so.  Layout lookup would
>>>> have to happen in exactly two places: when newing an inode and handling
>>>> cap grant/revoke, in other places you would simply bump the count on
>>>> the basis of already holding a valid pointer.  You wouldn't have to
>>>> unlink in the destructor, so no hassle with kref_get_unless_zero() and
>>>> no need for RCU, with i_ceph_lock providing the exclusion around the
>>>> tree.
>>>
>>> This means cephfs needs to set r_callback for all ceph_osd_request,
>>> ceph_osd_request also needs a new field to store layout pointer.
>>> I don’t think it’s better/simpler than reference counted namespace
>>> string.
>>
>> Not necessarily - you can put after ceph_osdc_wait_request() returns.
>> Somewhat unrelated, I'm working on refactoring osdc's handle_reply(),
>> and it'll probably be required that all OSD requests set one of the
>> callbacks, except for stateless fire-and-forget ones.
>
> For the r_callback case (no wait case), without saving a pointer in ceph_osd_request, how can I know which layout to put?
>
>>
>> Sharing ->i_layout as opposed to ->i_layout->pool_ns seemed sensible to
>> me because a) it naturally hangs off of ceph inode and b) logically,
>> it is entire layouts and not just namespaces that are shared across the
>> directory tree.  If you think reference counted pool_ns strings are
>> better, I won't argue with that, but, with cephfs being the only user
>> of either solution, it'll have to live in fs/ceph.
>
> I’m OK with both approaches. When sharing i_layout, we need to add a layout pointer to ceph_osd_request. After adding the layout pointer, why not let libceph release it when request finishes.
>
>> Separately, I think passing non-owning pool_ns pointers into libceph is
>> worth exploring, but if that doesn't easily map onto cephfs lifetime or
>> ownership rules, we will go with embedding namespace names by value into
>> ceph_osd_request (or, rather, ceph_object_locator).
>>
>
> As I stated in previous mail, embedded namespace is nightmare for cephfs. Every time namespace is used,  cephfs needs to lock i_ceph_lock, copy namespace to a temporary buffer.

So you are maintaining that all that is needed is to keep the memory
valid and there is no locking around installing a new namespace for an
inode.  I didn't realize that when I suggested layout sharing; it makes
that approach much less attractive.

Thanks,

                Ilya

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [PATCH v2 2/6] libceph: introduce reference counted string
  2016-03-23  9:51               ` Ilya Dryomov
@ 2016-03-23 13:02                 ` Yan, Zheng
  2016-03-23 19:27                   ` Ilya Dryomov
  0 siblings, 1 reply; 33+ messages in thread
From: Yan, Zheng @ 2016-03-23 13:02 UTC (permalink / raw)
  To: Ilya Dryomov; +Cc: Gregory Farnum, Ceph Development, Sage Weil


> On Mar 23, 2016, at 17:51, Ilya Dryomov <idryomov@gmail.com> wrote:
> 
> On Wed, Mar 23, 2016 at 4:37 AM, Yan, Zheng <zyan@redhat.com> wrote:
>> 
>>> On Mar 23, 2016, at 00:13, Ilya Dryomov <idryomov@gmail.com> wrote:
>>> 
>>> On Tue, Mar 22, 2016 at 2:57 PM, Yan, Zheng <zyan@redhat.com> wrote:
>>>> 
>>>>> On Mar 22, 2016, at 19:00, Ilya Dryomov <idryomov@gmail.com> wrote:
>>>>> 
>>>>> On Tue, Mar 22, 2016 at 10:17 AM, Yan, Zheng <zyan@redhat.com> wrote:
>>>>>> 
>>>>>>> On Mar 22, 2016, at 14:05, Ilya Dryomov <idryomov@gmail.com> wrote:
>>>>> 
>>>>> [ snip ]
>>>>> 
>>>>>>> Hi Zheng,
>>>>>>> 
>>>>>>> A one and a half line commit message and an equally short cover letter
>>>>>>> for a series such as this isn't enough.  I *happen* to know that the
>>>>>>> basic use case for namespaces in cephfs is going to be restricting
>>>>>>> users to different parts of the directory tree, with the enforcement
>>>>>>> happening in ceph on the server side, as opposed to in ceph on the
>>>>>>> client side, but I would appreciate some details on what the actual
>>>>>>> namespace names are going to be, whether it's user input or not,
>>>>>>> whether there are plans to use namespaces for anything else, etc.
>>>>>>> 
>>>>>> 
>>>>>> The namespace restriction you mentioned is for cephfs metadata. This
>>>>>> namespace is restricting users to different namespaces in cephfs data
>>>>>> pool. (At present, the only way to restrict data access is, creating
>>>>>> multiple cephfs data pools, restrict users to different data pool.
>>>>>> Creating lost of pools is not efficient for the cluster)
>>>>> 
>>>>> What about the namespace names, who is generating them, how long are
>>>>> they going to be?  Please describe in detail how this is going to work
>>>>> for both data and metadata pools.
>>>> 
>>>> For example, to restrict user foo to directory /foo_dir
>>>> 
>>>> // create auth caps for user foo.
>>>> ceph auth get-or-create client.foo mon 'allow r' mds 'allow r, allow rw path=/foo_dir' osd 'allow rw pool=data namespace=foo_ns’
>>>> 
>>>> // mount cephfs by user admin
>>>> mount -t ceph 10.0.0.1:/ /mnt/ceph_mount -o name=admin,secret=xxxx
>>>> 
>>>> // set directory’s layout.pool_namespace
>>>> setfattr -n ceph.dir.pool_namespace -v foo_ns /mnt/ceph_mount/foo_dir
>>>> 
>>>> Admin user chooses namespace name. In most cases, namespace name does not change.
>>> 
>>> Good, I guess limiting it to 100 chars (or maybe even a smaller
>>> number) is sensible then.
>>> 
>>>> 
>>>>> 
>>>>>> 
>>>>>> 
>>>>>>> I don't like the idea of sharing namespace name strings between libceph
>>>>>>> and ceph modules, especially with the strings themselves hosted in
>>>>>>> libceph.  rbd has no use for namespaces, so libceph can live with
>>>>>>> namespace names embedded into ceph_osd_request by value or with
>>>>>>> a simple non-owning pointer, leaving reference counting to the outside
>>>>>>> modules, if one of the use cases is "one namespace with a long name for
>>>>>>> the entire directory tree" or something close to it.
>>>>>>> 
>>>>>>> I think the sharing infrastructure should be moved into cephfs, and
>>>>>>> probably changed to share entire file layouts along the way.  I don't
>>>>>>> know this code well enough to be sure, but it seems that by sharing
>>>>>>> file layouts and making ci->i_layout an owning pointer you might be
>>>>>>> able to piggy back on i_ceph_lock and considerably simplify the whole
>>>>>>> thing by dropping RCU and eliminating numerous get/put calls.
>>>>>> 
>>>>>> RBD may use namespace later.
>>>>>> http://tracker.ceph.com/projects/ceph/wiki/Rbd_-_namespace_support
>>>>> 
>>>>> Well, compared to cephfs, it's hard to call that "using" - in that
>>>>> case, there will only ever be one namespace per image.  My point is
>>>>> it's never going to use the string sharing infrastructure and is fine
>>>>> with a non-owning pointer to a string in the file layout field of the
>>>>> in-memory rbd image header.
>>>>> 
>>>>>> 
>>>>>> The reason I use RCU here is that ci->i_layout.pool_ns can change at
>>>>>> any time. For the same reason, using non-owning pointer for namespace
>>>>>> or entire layout is unfeasible. Using embedded namespace is not
>>>>>> elegant either. When allocating ceph_osd_request, cephfs needs to
>>>>>> lock i_ceph_lock, copy namespace to a temporary buffer, unlock
>>>>>> i_ceph_lock, pass ci->i_layout and the temporary buffer to the
>>>>>> ceph_osdc_xxx_request().
>>>>> 
>>>>> Hmm, RCU doesn't protect you from pool_ns or other file layout fields
>>>>> changing while the OSD request is in flight.  As used above, it allows
>>>>> ceph_try_get_string() to not take any locks and that's it.
>>>> 
>>>> Yes.  But not taking lock simplify the code a lot.  we don't need to
>>>> lock/unlock i_ceph_lock each time i_layout is used.
>>> 
>>> I wouldn't call a bunch of rcu_dereference_* variants sprinkled around
>>> the code base a simplification, but, more importantly, is keeping the
>>> pool_ns pointer valid really all you need?  Shouldn't there be some
>>> kind of synchronization around "OK, I'm switching to a new layout for
>>> this inode"?  As it is, pool_ns is grabbed in ceph_osdc_new_request(),
>>> with two successive calls to ceph_osdc_new_request() potentially ending
>>> up with two different namespaces, e.g. ceph_uninline_data().
>> 
>> There is synchronisation. When changing file layout, MDS revokes Frw caps from client (block new read/write, for in-progress read/write). But this synchronisation is not complete reliable when client session state is toggled between stale and active.
> 
> I have a bit of trouble parsing "block new read/write, for in-progress
> read/write".  So the client will stop issuing requests as soon as it
> learns that it no longer has a cap, but what happens with the in-flight
> requests?

When the client knows the MDS is revoking Frw caps, it stops issuing new requests and waits for the in-flight ones. After all in-flight requests complete, the client releases the Frw caps to the MDS.

> 
>> 
>>>>> 
>>>>> Why exactly can't file layouts be shared?  ci->i_layout would become
>>>>> reference counted and you would give libceph a pointer to the pool_ns
>>>>> string (or entire layout) after bumping it.  It doesn't matter if
>>>>> pool_ns or the rest of the layout changes due to a cap grant or revoke
>>>>> while libceph is servicing the OSD request: you would unlink it from
>>>>> the tree but the bumped reference will keep the layout around, to be
>>>>> put in the OSD request completion callback or so.  Layout lookup would
>>>>> have to happen in exactly two places: when newing an inode and handling
>>>>> cap grant/revoke, in other places you would simply bump the count on
>>>>> the basis of already holding a valid pointer.  You wouldn't have to
>>>>> unlink in the destructor, so no hassle with kref_get_unless_zero() and
>>>>> no need for RCU, with i_ceph_lock providing the exclusion around the
>>>>> tree.
>>>> 
>>>> This means cephfs needs to set r_callback for all ceph_osd_request,
>>>> ceph_osd_request also needs a new field to store layout pointer.
>>>> I don’t think it’s better/simpler than reference counted namespace
>>>> string.
>>> 
>>> Not necessarily - you can put after ceph_osdc_wait_request() returns.
>>> Somewhat unrelated, I'm working on refactoring osdc's handle_reply(),
>>> and it'll probably be required that all OSD requests set one of the
>>> callbacks, except for stateless fire-and-forget ones.
>> 
>> For the r_callback case (no wait case), without saving a pointer in ceph_osd_request, how can I know which layout to put?
>> 
>>> 
>>> Sharing ->i_layout as opposed to ->i_layout->pool_ns seemed sensible to
>>> me because a) it naturally hangs off of ceph inode and b) logically,
>>> it is entire layouts and not just namespaces that are shared across the
>>> directory tree.  If you think reference counted pool_ns strings are
>>> better, I won't argue with that, but, with cephfs being the only user
>>> of either solution, it'll have to live in fs/ceph.
>> 
>> I’m OK with both approaches. When sharing i_layout, we need to add a layout pointer to ceph_osd_request. After adding the layout pointer, why not let libceph release it when request finishes.
>> 
>>> Separately, I think passing non-owning pool_ns pointers into libceph is
>>> worth exploring, but if that doesn't easily map onto cephfs lifetime or
>>> ownership rules, we will go with embedding namespace names by value into
>>> ceph_osd_request (or, rather, ceph_object_locator).
>>> 
>> 
>> As I stated in previous mail, embedded namespace is nightmare for cephfs. Every time namespace is used,  cephfs needs to lock i_ceph_lock, copy namespace to a temporary buffer.
> 
> So you are maintaining that all that is needed is to keep the memory
> valid and there is no locking around installing a new namespace for an
> inode.  I didn't realize that when I suggested layout sharing, it makes
> it much less attractive.

Yes.  That’s the main reason I decided to use RCU.

Regards
Yan, Zheng

> 
> Thanks,
> 
>                Ilya


^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [PATCH v2 2/6] libceph: introduce reference counted string
  2016-02-06  7:00 ` [PATCH v2 2/6] libceph: introduce reference counted string Yan, Zheng
  2016-03-22  6:05   ` Ilya Dryomov
@ 2016-03-23 15:39   ` Douglas Fuller
  2016-03-24  1:39     ` Yan, Zheng
  1 sibling, 1 reply; 33+ messages in thread
From: Douglas Fuller @ 2016-03-23 15:39 UTC (permalink / raw)
  To: Yan, Zheng; +Cc: Ceph Development, Sage Weil, idryomov


> +#define ceph_try_get_string(x)					\
> +({								\
> +	struct ceph_string *___str;				\
> +	rcu_read_lock();					\
> +	___str = rcu_dereference(x);				\
> +	if (___str && !kref_get_unless_zero(&___str->kref))	\
> +		___str = NULL;					\
> +	rcu_read_unlock();					\
> +	(___str);						\

Isn’t this an unsafe use of ___str? As I understand it, it could be updated after rcu_read_unlock().

> +})


^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [PATCH v2 2/6] libceph: introduce reference counted string
  2016-03-23 13:02                 ` Yan, Zheng
@ 2016-03-23 19:27                   ` Ilya Dryomov
  2016-03-24  2:33                     ` Yan, Zheng
  0 siblings, 1 reply; 33+ messages in thread
From: Ilya Dryomov @ 2016-03-23 19:27 UTC (permalink / raw)
  To: Yan, Zheng; +Cc: Gregory Farnum, Ceph Development, Sage Weil

On Wed, Mar 23, 2016 at 2:02 PM, Yan, Zheng <zyan@redhat.com> wrote:
>
>> On Mar 23, 2016, at 17:51, Ilya Dryomov <idryomov@gmail.com> wrote:
>>
>> On Wed, Mar 23, 2016 at 4:37 AM, Yan, Zheng <zyan@redhat.com> wrote:
>>>
>>>> On Mar 23, 2016, at 00:13, Ilya Dryomov <idryomov@gmail.com> wrote:
>>>>
>>>> On Tue, Mar 22, 2016 at 2:57 PM, Yan, Zheng <zyan@redhat.com> wrote:
>>>>>
>>>>>> On Mar 22, 2016, at 19:00, Ilya Dryomov <idryomov@gmail.com> wrote:
>>>>>>
>>>>>> On Tue, Mar 22, 2016 at 10:17 AM, Yan, Zheng <zyan@redhat.com> wrote:
>>>>>>>
>>>>>>>> On Mar 22, 2016, at 14:05, Ilya Dryomov <idryomov@gmail.com> wrote:
>>>>>>
>>>>>> [ snip ]
>>>>>>
>>>>>>>> Hi Zheng,
>>>>>>>>
>>>>>>>> A one and a half line commit message and an equally short cover letter
>>>>>>>> for a series such as this isn't enough.  I *happen* to know that the
>>>>>>>> basic use case for namespaces in cephfs is going to be restricting
>>>>>>>> users to different parts of the directory tree, with the enforcement
>>>>>>>> happening in ceph on the server side, as opposed to in ceph on the
>>>>>>>> client side, but I would appreciate some details on what the actual
>>>>>>>> namespace names are going to be, whether it's user input or not,
>>>>>>>> whether there are plans to use namespaces for anything else, etc.
>>>>>>>>
>>>>>>>
>>>>>>> The namespace restriction you mentioned is for cephfs metadata. This
>>>>>>> namespace is restricting users to different namespaces in cephfs data
>>>>>>> pool. (At present, the only way to restrict data access is, creating
>>>>>>> multiple cephfs data pools, restrict users to different data pool.
>>>>>>> Creating lost of pools is not efficient for the cluster)
>>>>>>
>>>>>> What about the namespace names, who is generating them, how long are
>>>>>> they going to be?  Please describe in detail how this is going to work
>>>>>> for both data and metadata pools.
>>>>>
>>>>> For example, to restrict user foo to directory /foo_dir
>>>>>
>>>>> // create auth caps for user foo.
>>>>> ceph auth get-or-create client.foo mon 'allow r' mds 'allow r, allow rw path=/foo_dir' osd 'allow rw pool=data namespace=foo_ns’
>>>>>
>>>>> // mount cephfs by user admin
>>>>> mount -t ceph 10.0.0.1:/ /mnt/ceph_mount -o name=admin,secret=xxxx
>>>>>
>>>>> // set directory’s layout.pool_namespace
>>>>> setfattr -n ceph.dir.pool_namespace -v foo_ns /mnt/ceph_mount/foo_dir
>>>>>
>>>>> Admin user chooses namespace name. In most cases, namespace name does not change.
>>>>
>>>> Good, I guess limiting it to 100 chars (or maybe even a smaller
>>>> number) is sensible then.
>>>>
>>>>>
>>>>>>
>>>>>>>
>>>>>>>
>>>>>>>> I don't like the idea of sharing namespace name strings between libceph
>>>>>>>> and ceph modules, especially with the strings themselves hosted in
>>>>>>>> libceph.  rbd has no use for namespaces, so libceph can live with
>>>>>>>> namespace names embedded into ceph_osd_request by value or with
>>>>>>>> a simple non-owning pointer, leaving reference counting to the outside
>>>>>>>> modules, if one of the use cases is "one namespace with a long name for
>>>>>>>> the entire directory tree" or something close to it.
>>>>>>>>
>>>>>>>> I think the sharing infrastructure should be moved into cephfs, and
>>>>>>>> probably changed to share entire file layouts along the way.  I don't
>>>>>>>> know this code well enough to be sure, but it seems that by sharing
>>>>>>>> file layouts and making ci->i_layout an owning pointer you might be
>>>>>>>> able to piggy back on i_ceph_lock and considerably simplify the whole
>>>>>>>> thing by dropping RCU and eliminating numerous get/put calls.
>>>>>>>
>>>>>>> RBD may use namespace later.
>>>>>>> http://tracker.ceph.com/projects/ceph/wiki/Rbd_-_namespace_support
>>>>>>
>>>>>> Well, compared to cephfs, it's hard to call that "using" - in that
>>>>>> case, there will only ever be one namespace per image.  My point is
>>>>>> it's never going to use the string sharing infrastructure and is fine
>>>>>> with a non-owning pointer to a string in the file layout field of the
>>>>>> in-memory rbd image header.
>>>>>>
>>>>>>>
>>>>>>> The reason I use RCU here is that ci->i_layout.pool_ns can change at
>>>>>>> any time. For the same reason, using non-owning pointer for namespace
>>>>>>> or entire layout is unfeasible. Using embedded namespace is not
>>>>>>> elegant either. When allocating ceph_osd_request, cephfs needs to
>>>>>>> lock i_ceph_lock, copy namespace to a temporary buffer, unlock
>>>>>>> i_ceph_lock, pass ci->i_layout and the temporary buffer to the
>>>>>>> ceph_osdc_xxx_request().
>>>>>>
>>>>>> Hmm, RCU doesn't protect you from pool_ns or other file layout fields
>>>>>> changing while the OSD request is in flight.  As used above, it allows
>>>>>> ceph_try_get_string() to not take any locks and that's it.
>>>>>
>>>>> Yes.  But not taking lock simplify the code a lot.  we don't need to
>>>>> lock/unlock i_ceph_lock each time i_layout is used.
>>>>
>>>> I wouldn't call a bunch of rcu_dereference_* variants sprinkled around
>>>> the code base a simplification, but, more importantly, is keeping the
>>>> pool_ns pointer valid really all you need?  Shouldn't there be some
>>>> kind of synchronization around "OK, I'm switching to a new layout for
>>>> this inode"?  As it is, pool_ns is grabbed in ceph_osdc_new_request(),
>>>> with two successive calls to ceph_osdc_new_request() potentially ending
>>>> up with two different namespaces, e.g. ceph_uninline_data().
>>>
>>> There is synchronisation. When changing file layout, MDS revokes Frw caps from client (block new read/write, for in-progress read/write). But this synchronisation is not complete reliable when client session state is toggled between stale and active.
>>
>> I have a bit of trouble parsing "block new read/write, for in-progress
>> read/write".  So the client will stop issuing requests as soon as it
>> learns that it no longer has a cap, but what happens with the in-flight
>> requests?
>
> When client know MDS is revoking Frw caps, it stops issuing new request and waits for in-flight requests. After all in-flight requests completes, client releases Frw caps to MDS.
>
>>
>>>
>>>>>>
>>>>>> Why exactly can't file layouts be shared?  ci->i_layout would become
>>>>>> reference counted and you would give libceph a pointer to the pool_ns
>>>>>> string (or entire layout) after bumping it.  It doesn't matter if
>>>>>> pool_ns or the rest of the layout changes due to a cap grant or revoke
>>>>>> while libceph is servicing the OSD request: you would unlink it from
>>>>>> the tree but the bumped reference will keep the layout around, to be
>>>>>> put in the OSD request completion callback or so.  Layout lookup would
>>>>>> have to happen in exactly two places: when newing an inode and handling
>>>>>> cap grant/revoke, in other places you would simply bump the count on
>>>>>> the basis of already holding a valid pointer.  You wouldn't have to
>>>>>> unlink in the destructor, so no hassle with kref_get_unless_zero() and
>>>>>> no need for RCU, with i_ceph_lock providing the exclusion around the
>>>>>> tree.
>>>>>
>>>>> This means cephfs needs to set r_callback for all ceph_osd_request,
>>>>> ceph_osd_request also needs a new field to store layout pointer.
>>>>> I don’t think it’s better/simpler than reference counted namespace
>>>>> string.
>>>>
>>>> Not necessarily - you can put after ceph_osdc_wait_request() returns.
>>>> Somewhat unrelated, I'm working on refactoring osdc's handle_reply(),
>>>> and it'll probably be required that all OSD requests set one of the
>>>> callbacks, except for stateless fire-and-forget ones.
>>>
>>> For the r_callback case (no wait case), without saving a pointer in ceph_osd_request, how can I know which layout to put?
>>>
>>>>
>>>> Sharing ->i_layout as opposed to ->i_layout->pool_ns seemed sensible to
>>>> me because a) it naturally hangs off of ceph inode and b) logically,
>>>> it is entire layouts and not just namespaces that are shared across the
>>>> directory tree.  If you think reference counted pool_ns strings are
>>>> better, I won't argue with that, but, with cephfs being the only user
>>>> of either solution, it'll have to live in fs/ceph.
>>>
>>> I’m OK with both approaches. When sharing i_layout, we need to add a layout pointer to ceph_osd_request. After adding the layout pointer, why not let libceph release it when request finishes.
>>>
>>>> Separately, I think passing non-owning pool_ns pointers into libceph is
>>>> worth exploring, but if that doesn't easily map onto cephfs lifetime or
>>>> ownership rules, we will go with embedding namespace names by value into
>>>> ceph_osd_request (or, rather, ceph_object_locator).
>>>>
>>>
>>> As I stated in previous mail, embedded namespace is nightmare for cephfs. Every time namespace is used,  cephfs needs to lock i_ceph_lock, copy namespace to a temporary buffer.
>>
>> So you are maintaining that all that is needed is to keep the memory
>> valid and there is no locking around installing a new namespace for an
>> inode.  I didn't realize that when I suggested layout sharing, it makes
>> it much less attractive.
>
> Yes.  That’s the main reason I decided to use RCU.

For the record, I don't think it's a good design and I doubt the
implementation is going to work reliably, but that's your call.

Why would embedded namespaces in ceph_object_locator in libceph be
a nightmare for cephfs?  What do you refer to as a temporary buffer?
This kind of copying already occurs: you grab a ceph_string with
ceph_try_get_string() in ceph_osdc_new_request() and it's copied into
the request message, as part of encoding.  How is grabbing ceph_string
before calling into libceph and explicitly copying into object locator
different?

Thanks,

                Ilya

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [PATCH v2 2/6] libceph: introduce reference counted string
  2016-03-23 15:39   ` Douglas Fuller
@ 2016-03-24  1:39     ` Yan, Zheng
  2016-03-24 15:30       ` Douglas Fuller
  0 siblings, 1 reply; 33+ messages in thread
From: Yan, Zheng @ 2016-03-24  1:39 UTC (permalink / raw)
  To: Douglas Fuller; +Cc: Ceph Development, Sage Weil, idryomov


> On Mar 23, 2016, at 23:39, Douglas Fuller <dfuller@redhat.com> wrote:
> 
> 
>> +#define ceph_try_get_string(x)					\
>> +({								\
>> +	struct ceph_string *___str;				\
>> +	rcu_read_lock();					\
>> +	___str = rcu_dereference(x);				\
>> +	if (___str && !kref_get_unless_zero(&___str->kref))	\
>> +		___str = NULL;					\
>> +	rcu_read_unlock();					\
>> +	(___str);						\
> 
> Isn’t this an unsafe use of __str? As I understand it, it could be updated after rcu_read_unlock().

where does it get updated?
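
To spell out the pattern in question: the update side publishes the new string and only then drops its own reference, so a reader that won the kref_get_unless_zero() race keeps a counted, valid object even after rcu_read_unlock(). A rough sketch of that idiom (not necessarily the exact code in this patch; the field names and free path are assumptions):

static void set_pool_ns(struct ceph_inode_info *ci, struct ceph_string *new_ns)
{
        struct ceph_string *old_ns;

        spin_lock(&ci->i_ceph_lock);
        old_ns = rcu_dereference_protected(ci->i_layout.pool_ns,
                                           lockdep_is_held(&ci->i_ceph_lock));
        rcu_assign_pointer(ci->i_layout.pool_ns, new_ns);
        spin_unlock(&ci->i_ceph_lock);

        /*
         * Dropping the old reference here must not free the memory until a
         * grace period has passed (e.g. kfree_rcu() in the kref release
         * function), so concurrent ceph_try_get_string() callers stay safe.
         */
        ceph_put_string(old_ns);
}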

Regards
Yan, Zheng

> 
>> +})
> 


^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [PATCH v2 2/6] libceph: introduce reference counted string
  2016-03-23 19:27                   ` Ilya Dryomov
@ 2016-03-24  2:33                     ` Yan, Zheng
  2016-03-24 10:26                       ` Ilya Dryomov
  0 siblings, 1 reply; 33+ messages in thread
From: Yan, Zheng @ 2016-03-24  2:33 UTC (permalink / raw)
  To: Ilya Dryomov; +Cc: Gregory Farnum, Ceph Development, Sage Weil


> On Mar 24, 2016, at 03:27, Ilya Dryomov <idryomov@gmail.com> wrote:
> 
> On Wed, Mar 23, 2016 at 2:02 PM, Yan, Zheng <zyan@redhat.com> wrote:
>> 
>>> On Mar 23, 2016, at 17:51, Ilya Dryomov <idryomov@gmail.com> wrote:
>>> 
>>> On Wed, Mar 23, 2016 at 4:37 AM, Yan, Zheng <zyan@redhat.com> wrote:
>>>> 
>>>>> On Mar 23, 2016, at 00:13, Ilya Dryomov <idryomov@gmail.com> wrote:
>>>>> 
>>>>> On Tue, Mar 22, 2016 at 2:57 PM, Yan, Zheng <zyan@redhat.com> wrote:
>>>>>> 
>>>>>>> On Mar 22, 2016, at 19:00, Ilya Dryomov <idryomov@gmail.com> wrote:
>>>>>>> 
>>>>>>> On Tue, Mar 22, 2016 at 10:17 AM, Yan, Zheng <zyan@redhat.com> wrote:
>>>>>>>> 
>>>>>>>>> On Mar 22, 2016, at 14:05, Ilya Dryomov <idryomov@gmail.com> wrote:
>>>>>>> 
>>>>>>> [ snip ]
>>>>>>> 
>>>>>>>>> Hi Zheng,
>>>>>>>>> 
>>>>>>>>> A one and a half line commit message and an equally short cover letter
>>>>>>>>> for a series such as this isn't enough.  I *happen* to know that the
>>>>>>>>> basic use case for namespaces in cephfs is going to be restricting
>>>>>>>>> users to different parts of the directory tree, with the enforcement
>>>>>>>>> happening in ceph on the server side, as opposed to in ceph on the
>>>>>>>>> client side, but I would appreciate some details on what the actual
>>>>>>>>> namespace names are going to be, whether it's user input or not,
>>>>>>>>> whether there are plans to use namespaces for anything else, etc.
>>>>>>>>> 
>>>>>>>> 
>>>>>>>> The namespace restriction you mentioned is for cephfs metadata. This
>>>>>>>> namespace is restricting users to different namespaces in cephfs data
>>>>>>>> pool. (At present, the only way to restrict data access is, creating
>>>>>>>> multiple cephfs data pools, restrict users to different data pool.
>>>>>>>> Creating lost of pools is not efficient for the cluster)
>>>>>>> 
>>>>>>> What about the namespace names, who is generating them, how long are
>>>>>>> they going to be?  Please describe in detail how this is going to work
>>>>>>> for both data and metadata pools.
>>>>>> 
>>>>>> For example, to restrict user foo to directory /foo_dir
>>>>>> 
>>>>>> // create auth caps for user foo.
>>>>>> ceph auth get-or-create client.foo mon 'allow r' mds 'allow r, allow rw path=/foo_dir' osd 'allow rw pool=data namespace=foo_ns’
>>>>>> 
>>>>>> // mount cephfs by user admin
>>>>>> mount -t ceph 10.0.0.1:/ /mnt/ceph_mount -o name=admin,secret=xxxx
>>>>>> 
>>>>>> // set directory’s layout.pool_namespace
>>>>>> setfattr -n ceph.dir.pool_namespace -v foo_ns /mnt/ceph_mount/foo_dir
>>>>>> 
>>>>>> Admin user chooses namespace name. In most cases, namespace name does not change.
>>>>> 
>>>>> Good, I guess limiting it to 100 chars (or maybe even a smaller
>>>>> number) is sensible then.
>>>>> 
>>>>>> 
>>>>>>> 
>>>>>>>> 
>>>>>>>> 
>>>>>>>>> I don't like the idea of sharing namespace name strings between libceph
>>>>>>>>> and ceph modules, especially with the strings themselves hosted in
>>>>>>>>> libceph.  rbd has no use for namespaces, so libceph can live with
>>>>>>>>> namespace names embedded into ceph_osd_request by value or with
>>>>>>>>> a simple non-owning pointer, leaving reference counting to the outside
>>>>>>>>> modules, if one of the use cases is "one namespace with a long name for
>>>>>>>>> the entire directory tree" or something close to it.
>>>>>>>>> 
>>>>>>>>> I think the sharing infrastructure should be moved into cephfs, and
>>>>>>>>> probably changed to share entire file layouts along the way.  I don't
>>>>>>>>> know this code well enough to be sure, but it seems that by sharing
>>>>>>>>> file layouts and making ci->i_layout an owning pointer you might be
>>>>>>>>> able to piggy back on i_ceph_lock and considerably simplify the whole
>>>>>>>>> thing by dropping RCU and eliminating numerous get/put calls.
>>>>>>>> 
>>>>>>>> RBD may use namespace later.
>>>>>>>> http://tracker.ceph.com/projects/ceph/wiki/Rbd_-_namespace_support
>>>>>>> 
>>>>>>> Well, compared to cephfs, it's hard to call that "using" - in that
>>>>>>> case, there will only ever be one namespace per image.  My point is
>>>>>>> it's never going to use the string sharing infrastructure and is fine
>>>>>>> with a non-owning pointer to a string in the file layout field of the
>>>>>>> in-memory rbd image header.
>>>>>>> 
>>>>>>>> 
>>>>>>>> The reason I use RCU here is that ci->i_layout.pool_ns can change at
>>>>>>>> any time. For the same reason, using non-owning pointer for namespace
>>>>>>>> or entire layout is unfeasible. Using embedded namespace is not
>>>>>>>> elegant either. When allocating ceph_osd_request, cephfs needs to
>>>>>>>> lock i_ceph_lock, copy namespace to a temporary buffer, unlock
>>>>>>>> i_ceph_lock, pass ci->i_layout and the temporary buffer to the
>>>>>>>> ceph_osdc_xxx_request().
>>>>>>> 
>>>>>>> Hmm, RCU doesn't protect you from pool_ns or other file layout fields
>>>>>>> changing while the OSD request is in flight.  As used above, it allows
>>>>>>> ceph_try_get_string() to not take any locks and that's it.
>>>>>> 
>>>>>> Yes.  But not taking lock simplify the code a lot.  we don't need to
>>>>>> lock/unlock i_ceph_lock each time i_layout is used.
>>>>> 
>>>>> I wouldn't call a bunch of rcu_dereference_* variants sprinkled around
>>>>> the code base a simplification, but, more importantly, is keeping the
>>>>> pool_ns pointer valid really all you need?  Shouldn't there be some
>>>>> kind of synchronization around "OK, I'm switching to a new layout for
>>>>> this inode"?  As it is, pool_ns is grabbed in ceph_osdc_new_request(),
>>>>> with two successive calls to ceph_osdc_new_request() potentially ending
>>>>> up with two different namespaces, e.g. ceph_uninline_data().
>>>> 
>>>> There is synchronisation. When changing file layout, MDS revokes Frw caps from client (block new read/write, for in-progress read/write). But this synchronisation is not complete reliable when client session state is toggled between stale and active.
>>> 
>>> I have a bit of trouble parsing "block new read/write, for in-progress
>>> read/write".  So the client will stop issuing requests as soon as it
>>> learns that it no longer has a cap, but what happens with the in-flight
>>> requests?
>> 
>> When client know MDS is revoking Frw caps, it stops issuing new request and waits for in-flight requests. After all in-flight requests completes, client releases Frw caps to MDS.
>> 
>>> 
>>>> 
>>>>>>> 
>>>>>>> Why exactly can't file layouts be shared?  ci->i_layout would become
>>>>>>> reference counted and you would give libceph a pointer to the pool_ns
>>>>>>> string (or entire layout) after bumping it.  It doesn't matter if
>>>>>>> pool_ns or the rest of the layout changes due to a cap grant or revoke
>>>>>>> while libceph is servicing the OSD request: you would unlink it from
>>>>>>> the tree but the bumped reference will keep the layout around, to be
>>>>>>> put in the OSD request completion callback or so.  Layout lookup would
>>>>>>> have to happen in exactly two places: when newing an inode and handling
>>>>>>> cap grant/revoke, in other places you would simply bump the count on
>>>>>>> the basis of already holding a valid pointer.  You wouldn't have to
>>>>>>> unlink in the destructor, so no hassle with kref_get_unless_zero() and
>>>>>>> no need for RCU, with i_ceph_lock providing the exclusion around the
>>>>>>> tree.
>>>>>> 
>>>>>> This means cephfs needs to set r_callback for all ceph_osd_request,
>>>>>> ceph_osd_request also needs a new field to store layout pointer.
>>>>>> I don’t think it’s better/simpler than reference counted namespace
>>>>>> string.
>>>>> 
>>>>> Not necessarily - you can put after ceph_osdc_wait_request() returns.
>>>>> Somewhat unrelated, I'm working on refactoring osdc's handle_reply(),
>>>>> and it'll probably be required that all OSD requests set one of the
>>>>> callbacks, except for stateless fire-and-forget ones.
>>>> 
>>>> For the r_callback case (no wait case), without saving a pointer in ceph_osd_request, how can I know which layout to put?
>>>> 
>>>>> 
>>>>> Sharing ->i_layout as opposed to ->i_layout->pool_ns seemed sensible to
>>>>> me because a) it naturally hangs off of ceph inode and b) logically,
>>>>> it is entire layouts and not just namespaces that are shared across the
>>>>> directory tree.  If you think reference counted pool_ns strings are
>>>>> better, I won't argue with that, but, with cephfs being the only user
>>>>> of either solution, it'll have to live in fs/ceph.
>>>> 
>>>> I’m OK with both approaches. When sharing i_layout, we need to add a layout pointer to ceph_osd_request. After adding the layout pointer, why not let libceph release it when request finishes.
>>>> 
>>>>> Separately, I think passing non-owning pool_ns pointers into libceph is
>>>>> worth exploring, but if that doesn't easily map onto cephfs lifetime or
>>>>> ownership rules, we will go with embedding namespace names by value into
>>>>> ceph_osd_request (or, rather, ceph_object_locator).
>>>>> 
>>>> 
>>>> As I stated in previous mail, embedded namespace is nightmare for cephfs. Every time namespace is used,  cephfs needs to lock i_ceph_lock, copy namespace to a temporary buffer.
>>> 
>>> So you are maintaining that all that is needed is to keep the memory
>>> valid and there is no locking around installing a new namespace for an
>>> inode.  I didn't realize that when I suggested layout sharing, it makes
>>> it much less attractive.
>> 
>> Yes.  That’s the main reason I decided to use RCU.
> 
> For the record, I don't think it's a good design and I doubt the
> implementation is going to work reliably, but that's your call.
> 
> Why would embedded namespaces in ceph_object_locator in libceph be
> a nightmare for cephfs?  What do you refer to as a temporary buffer?
> This kind of copying already occurs: you grab a ceph_string with
> ceph_try_get_string() in ceph_osdc_new_request() and it's copied into
> the request message, as part of encoding.  How is grabbing ceph_string
> before calling into libceph and explicitly copying into object locator
> different?

I mean the case of not using the reference-counted ceph_string. 


Yes, we can make the cephfs code manage the reference counting, calling ceph_try_get_string() and ceph_put_string() each time we issue an OSD request. But this approach requires more code/overhead for both ceph and libceph (an extra parameter for the ceph_osdc_xxx_request() functions, code that copies the namespace into the embedded buffer, and lots of ceph_try_get_string()/ceph_put_string() pairs in the cephfs code).  All of this is just for removing one ceph_try_get_string() call and two ceph_put_string() calls in libceph. Is it worth the effort, and does it make the libceph code cleaner?

Regards
Yan, Zheng
--
To unsubscribe from this list: send the line "unsubscribe ceph-devel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [PATCH v2 2/6] libceph: introduce reference counted string
  2016-03-24  2:33                     ` Yan, Zheng
@ 2016-03-24 10:26                       ` Ilya Dryomov
  2016-03-24 12:19                         ` Ilya Dryomov
  2016-03-24 13:18                         ` Yan, Zheng
  0 siblings, 2 replies; 33+ messages in thread
From: Ilya Dryomov @ 2016-03-24 10:26 UTC (permalink / raw)
  To: Yan, Zheng; +Cc: Gregory Farnum, Ceph Development, Sage Weil

On Thu, Mar 24, 2016 at 3:33 AM, Yan, Zheng <zyan@redhat.com> wrote:
>
>> On Mar 24, 2016, at 03:27, Ilya Dryomov <idryomov@gmail.com> wrote:
>>
>> On Wed, Mar 23, 2016 at 2:02 PM, Yan, Zheng <zyan@redhat.com> wrote:
>>>
>>>> On Mar 23, 2016, at 17:51, Ilya Dryomov <idryomov@gmail.com> wrote:
>>>>
>>>> On Wed, Mar 23, 2016 at 4:37 AM, Yan, Zheng <zyan@redhat.com> wrote:
>>>>>
>>>>>> On Mar 23, 2016, at 00:13, Ilya Dryomov <idryomov@gmail.com> wrote:
>>>>>>
>>>>>> On Tue, Mar 22, 2016 at 2:57 PM, Yan, Zheng <zyan@redhat.com> wrote:
>>>>>>>
>>>>>>>> On Mar 22, 2016, at 19:00, Ilya Dryomov <idryomov@gmail.com> wrote:
>>>>>>>>
>>>>>>>> On Tue, Mar 22, 2016 at 10:17 AM, Yan, Zheng <zyan@redhat.com> wrote:
>>>>>>>>>
>>>>>>>>>> On Mar 22, 2016, at 14:05, Ilya Dryomov <idryomov@gmail.com> wrote:
>>>>>>>>
>>>>>>>> [ snip ]
>>>>>>>>
>>>>>>>>>> Hi Zheng,
>>>>>>>>>>
>>>>>>>>>> A one and a half line commit message and an equally short cover letter
>>>>>>>>>> for a series such as this isn't enough.  I *happen* to know that the
>>>>>>>>>> basic use case for namespaces in cephfs is going to be restricting
>>>>>>>>>> users to different parts of the directory tree, with the enforcement
>>>>>>>>>> happening in ceph on the server side, as opposed to in ceph on the
>>>>>>>>>> client side, but I would appreciate some details on what the actual
>>>>>>>>>> namespace names are going to be, whether it's user input or not,
>>>>>>>>>> whether there are plans to use namespaces for anything else, etc.
>>>>>>>>>>
>>>>>>>>>
>>>>>>>>> The namespace restriction you mentioned is for cephfs metadata. This
>>>>>>>>> namespace is restricting users to different namespaces in cephfs data
>>>>>>>>> pool. (At present, the only way to restrict data access is, creating
>>>>>>>>> multiple cephfs data pools, restrict users to different data pool.
>>>>>>>>> Creating lost of pools is not efficient for the cluster)
>>>>>>>>
>>>>>>>> What about the namespace names, who is generating them, how long are
>>>>>>>> they going to be?  Please describe in detail how this is going to work
>>>>>>>> for both data and metadata pools.
>>>>>>>
>>>>>>> For example, to restrict user foo to directory /foo_dir
>>>>>>>
>>>>>>> // create auth caps for user foo.
>>>>>>> ceph auth get-or-create client.foo mon 'allow r' mds 'allow r, allow rw path=/foo_dir' osd 'allow rw pool=data namespace=foo_ns’
>>>>>>>
>>>>>>> // mount cephfs by user admin
>>>>>>> mount -t ceph 10.0.0.1:/ /mnt/ceph_mount -o name=admin,secret=xxxx
>>>>>>>
>>>>>>> // set directory’s layout.pool_namespace
>>>>>>> setfattr -n ceph.dir.pool_namespace -v foo_ns /mnt/ceph_mount/foo_dir
>>>>>>>
>>>>>>> Admin user chooses namespace name. In most cases, namespace name does not change.
>>>>>>
>>>>>> Good, I guess limiting it to 100 chars (or maybe even a smaller
>>>>>> number) is sensible then.
>>>>>>
>>>>>>>
>>>>>>>>
>>>>>>>>>
>>>>>>>>>
>>>>>>>>>> I don't like the idea of sharing namespace name strings between libceph
>>>>>>>>>> and ceph modules, especially with the strings themselves hosted in
>>>>>>>>>> libceph.  rbd has no use for namespaces, so libceph can live with
>>>>>>>>>> namespace names embedded into ceph_osd_request by value or with
>>>>>>>>>> a simple non-owning pointer, leaving reference counting to the outside
>>>>>>>>>> modules, if one of the use cases is "one namespace with a long name for
>>>>>>>>>> the entire directory tree" or something close to it.
>>>>>>>>>>
>>>>>>>>>> I think the sharing infrastructure should be moved into cephfs, and
>>>>>>>>>> probably changed to share entire file layouts along the way.  I don't
>>>>>>>>>> know this code well enough to be sure, but it seems that by sharing
>>>>>>>>>> file layouts and making ci->i_layout an owning pointer you might be
>>>>>>>>>> able to piggy back on i_ceph_lock and considerably simplify the whole
>>>>>>>>>> thing by dropping RCU and eliminating numerous get/put calls.
>>>>>>>>>
>>>>>>>>> RBD may use namespace later.
>>>>>>>>> http://tracker.ceph.com/projects/ceph/wiki/Rbd_-_namespace_support
>>>>>>>>
>>>>>>>> Well, compared to cephfs, it's hard to call that "using" - in that
>>>>>>>> case, there will only ever be one namespace per image.  My point is
>>>>>>>> it's never going to use the string sharing infrastructure and is fine
>>>>>>>> with a non-owning pointer to a string in the file layout field of the
>>>>>>>> in-memory rbd image header.
>>>>>>>>
>>>>>>>>>
>>>>>>>>> The reason I use RCU here is that ci->i_layout.pool_ns can change at
>>>>>>>>> any time. For the same reason, using non-owning pointer for namespace
>>>>>>>>> or entire layout is unfeasible. Using embedded namespace is not
>>>>>>>>> elegant either. When allocating ceph_osd_request, cephfs needs to
>>>>>>>>> lock i_ceph_lock, copy namespace to a temporary buffer, unlock
>>>>>>>>> i_ceph_lock, pass ci->i_layout and the temporary buffer to the
>>>>>>>>> ceph_osdc_xxx_request().
>>>>>>>>
>>>>>>>> Hmm, RCU doesn't protect you from pool_ns or other file layout fields
>>>>>>>> changing while the OSD request is in flight.  As used above, it allows
>>>>>>>> ceph_try_get_string() to not take any locks and that's it.
>>>>>>>
>>>>>>> Yes.  But not taking lock simplify the code a lot.  we don't need to
>>>>>>> lock/unlock i_ceph_lock each time i_layout is used.
>>>>>>
>>>>>> I wouldn't call a bunch of rcu_dereference_* variants sprinkled around
>>>>>> the code base a simplification, but, more importantly, is keeping the
>>>>>> pool_ns pointer valid really all you need?  Shouldn't there be some
>>>>>> kind of synchronization around "OK, I'm switching to a new layout for
>>>>>> this inode"?  As it is, pool_ns is grabbed in ceph_osdc_new_request(),
>>>>>> with two successive calls to ceph_osdc_new_request() potentially ending
>>>>>> up with two different namespaces, e.g. ceph_uninline_data().
>>>>>
>>>>> There is synchronisation. When changing file layout, MDS revokes Frw caps from client (block new read/write, for in-progress read/write). But this synchronisation is not complete reliable when client session state is toggled between stale and active.
>>>>
>>>> I have a bit of trouble parsing "block new read/write, for in-progress
>>>> read/write".  So the client will stop issuing requests as soon as it
>>>> learns that it no longer has a cap, but what happens with the in-flight
>>>> requests?
>>>
>>> When client know MDS is revoking Frw caps, it stops issuing new request and waits for in-flight requests. After all in-flight requests completes, client releases Frw caps to MDS.
>>>
>>>>
>>>>>
>>>>>>>>
>>>>>>>> Why exactly can't file layouts be shared?  ci->i_layout would become
>>>>>>>> reference counted and you would give libceph a pointer to the pool_ns
>>>>>>>> string (or entire layout) after bumping it.  It doesn't matter if
>>>>>>>> pool_ns or the rest of the layout changes due to a cap grant or revoke
>>>>>>>> while libceph is servicing the OSD request: you would unlink it from
>>>>>>>> the tree but the bumped reference will keep the layout around, to be
>>>>>>>> put in the OSD request completion callback or so.  Layout lookup would
>>>>>>>> have to happen in exactly two places: when newing an inode and handling
>>>>>>>> cap grant/revoke, in other places you would simply bump the count on
>>>>>>>> the basis of already holding a valid pointer.  You wouldn't have to
>>>>>>>> unlink in the destructor, so no hassle with kref_get_unless_zero() and
>>>>>>>> no need for RCU, with i_ceph_lock providing the exclusion around the
>>>>>>>> tree.
>>>>>>>
>>>>>>> This means cephfs needs to set r_callback for all ceph_osd_request,
>>>>>>> ceph_osd_request also needs a new field to store layout pointer.
>>>>>>> I don’t think it’s better/simpler than reference counted namespace
>>>>>>> string.
>>>>>>
>>>>>> Not necessarily - you can put after ceph_osdc_wait_request() returns.
>>>>>> Somewhat unrelated, I'm working on refactoring osdc's handle_reply(),
>>>>>> and it'll probably be required that all OSD requests set one of the
>>>>>> callbacks, except for stateless fire-and-forget ones.
>>>>>
>>>>> For the r_callback case (no wait case), without saving a pointer in ceph_osd_request, how can I know which layout to put?
>>>>>
>>>>>>
>>>>>> Sharing ->i_layout as opposed to ->i_layout->pool_ns seemed sensible to
>>>>>> me because a) it naturally hangs off of ceph inode and b) logically,
>>>>>> it is entire layouts and not just namespaces that are shared across the
>>>>>> directory tree.  If you think reference counted pool_ns strings are
>>>>>> better, I won't argue with that, but, with cephfs being the only user
>>>>>> of either solution, it'll have to live in fs/ceph.
>>>>>
>>>>> I’m OK with both approaches. When sharing i_layout, we need to add a layout pointer to ceph_osd_request. After adding the layout pointer, why not let libceph release it when request finishes.
>>>>>
>>>>>> Separately, I think passing non-owning pool_ns pointers into libceph is
>>>>>> worth exploring, but if that doesn't easily map onto cephfs lifetime or
>>>>>> ownership rules, we will go with embedding namespace names by value into
>>>>>> ceph_osd_request (or, rather, ceph_object_locator).
>>>>>>
>>>>>
>>>>> As I stated in previous mail, embedded namespace is nightmare for cephfs. Every time namespace is used,  cephfs needs to lock i_ceph_lock, copy namespace to a temporary buffer.
>>>>
>>>> So you are maintaining that all that is needed is to keep the memory
>>>> valid and there is no locking around installing a new namespace for an
>>>> inode.  I didn't realize that when I suggested layout sharing, it makes
>>>> it much less attractive.
>>>
>>> Yes.  That’s the main reason I decided to use RCU.
>>
>> For the record, I don't think it's a good design and I doubt the
>> implementation is going to work reliably, but that's your call.
>>
>> Why would embedded namespaces in ceph_object_locator in libceph be
>> a nightmare for cephfs?  What do you refer to as a temporary buffer?
>> This kind of copying already occurs: you grab a ceph_string with
>> ceph_try_get_string() in ceph_osdc_new_request() and it's copied into
>> the request message, as part of encoding.  How is grabbing ceph_string
>> before calling into libceph and explicitly copying into object locator
>> different?
>
> I mean not using reference-counted ceph_string.
>
>
> Yes, we can make cephfs code manage the reference counting, call ceph_try_get_string() and ceph_put_string() each time we issue osd request. But this approach requires more code/overhead for both ceph and libceph (an extra parameter for ceph_osdc_xxx_request() functions, code that copies namespace to embedded buffer, lots of ceph_try_get_string()/ceph_put_string() pair in cephfs code).  All of this is just for removing one ceph_try_get_string() call and two ceph_put_string() call in libceph. does it worth the effort, does it make libceph code cleaner?

It'll be printed into the request after it's allocated, just like OID
is now, so e.g. ceph_osdc_alloc_request() won't need any changes.  You
may need to patch ceph_osdc_new_request(), but that's a cephfs-specific
function and should probably be moved out of libceph.  You won't need
to add lots of get/put pairs in cephfs either - a single helper around
filling in an object locator would do it.
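
Something along these lines, say - just a sketch to show the shape; the
by-value pool_ns_buf/pool_ns_len fields in ceph_object_locator are made up
for illustration:

    static void ceph_oloc_set_pool_ns(struct ceph_object_locator *oloc,
                                      struct ceph_inode_info *ci)
    {
            struct ceph_string *ns;

            /* pin the shared name just long enough to copy it */
            ns = ceph_try_get_string(ci->i_layout.pool_ns);
            if (!ns) {
                    oloc->pool_ns_len = 0;
                    return;
            }

            oloc->pool_ns_len = min_t(size_t, ns->len,
                                      sizeof(oloc->pool_ns_buf));
            memcpy(oloc->pool_ns_buf, ns->str, oloc->pool_ns_len);
            ceph_put_string(ns);
    }

cephfs would call that right after allocating the request, and libceph
would simply encode whatever ends up in the locator.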

The two disadvantages (extra strcpy() and larger OSD requests) are
real, but so are the benefits of modularity and keeping object locator
a simple value type.  I'm just trying to explore all available options.

TBH I still can't wrap my head around lockless layout/namespace pointer
updates and how that can ever work reliably...

Thanks,

                Ilya
--
To unsubscribe from this list: send the line "unsubscribe ceph-devel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [PATCH v2 2/6] libceph: introduce reference counted string
  2016-03-24 10:26                       ` Ilya Dryomov
@ 2016-03-24 12:19                         ` Ilya Dryomov
  2016-03-24 13:18                         ` Yan, Zheng
  1 sibling, 0 replies; 33+ messages in thread
From: Ilya Dryomov @ 2016-03-24 12:19 UTC (permalink / raw)
  To: Yan, Zheng; +Cc: Gregory Farnum, Ceph Development, Sage Weil

On Thu, Mar 24, 2016 at 11:26 AM, Ilya Dryomov <idryomov@gmail.com> wrote:
> On Thu, Mar 24, 2016 at 3:33 AM, Yan, Zheng <zyan@redhat.com> wrote:
>>
>>> On Mar 24, 2016, at 03:27, Ilya Dryomov <idryomov@gmail.com> wrote:
>>>
>>> On Wed, Mar 23, 2016 at 2:02 PM, Yan, Zheng <zyan@redhat.com> wrote:
>>>>
>>>>> On Mar 23, 2016, at 17:51, Ilya Dryomov <idryomov@gmail.com> wrote:
>>>>>
>>>>> On Wed, Mar 23, 2016 at 4:37 AM, Yan, Zheng <zyan@redhat.com> wrote:
>>>>>>
>>>>>>> On Mar 23, 2016, at 00:13, Ilya Dryomov <idryomov@gmail.com> wrote:
>>>>>>>
>>>>>>> On Tue, Mar 22, 2016 at 2:57 PM, Yan, Zheng <zyan@redhat.com> wrote:
>>>>>>>>
>>>>>>>>> On Mar 22, 2016, at 19:00, Ilya Dryomov <idryomov@gmail.com> wrote:
>>>>>>>>>
>>>>>>>>> On Tue, Mar 22, 2016 at 10:17 AM, Yan, Zheng <zyan@redhat.com> wrote:
>>>>>>>>>>
>>>>>>>>>>> On Mar 22, 2016, at 14:05, Ilya Dryomov <idryomov@gmail.com> wrote:
>>>>>>>>>
>>>>>>>>> [ snip ]
>>>>>>>>>
>>>>>>>>>>> Hi Zheng,
>>>>>>>>>>>
>>>>>>>>>>> A one and a half line commit message and an equally short cover letter
>>>>>>>>>>> for a series such as this isn't enough.  I *happen* to know that the
>>>>>>>>>>> basic use case for namespaces in cephfs is going to be restricting
>>>>>>>>>>> users to different parts of the directory tree, with the enforcement
>>>>>>>>>>> happening in ceph on the server side, as opposed to in ceph on the
>>>>>>>>>>> client side, but I would appreciate some details on what the actual
>>>>>>>>>>> namespace names are going to be, whether it's user input or not,
>>>>>>>>>>> whether there are plans to use namespaces for anything else, etc.
>>>>>>>>>>>
>>>>>>>>>>
>>>>>>>>>> The namespace restriction you mentioned is for cephfs metadata. This
>>>>>>>>>> namespace is restricting users to different namespaces in cephfs data
>>>>>>>>>> pool. (At present, the only way to restrict data access is, creating
>>>>>>>>>> multiple cephfs data pools, restrict users to different data pool.
>>>>>>>>>> Creating lost of pools is not efficient for the cluster)
>>>>>>>>>
>>>>>>>>> What about the namespace names, who is generating them, how long are
>>>>>>>>> they going to be?  Please describe in detail how this is going to work
>>>>>>>>> for both data and metadata pools.
>>>>>>>>
>>>>>>>> For example, to restrict user foo to directory /foo_dir
>>>>>>>>
>>>>>>>> // create auth caps for user foo.
>>>>>>>> ceph auth get-or-create client.foo mon 'allow r' mds 'allow r, allow rw path=/foo_dir' osd 'allow rw pool=data namespace=foo_ns’
>>>>>>>>
>>>>>>>> // mount cephfs by user admin
>>>>>>>> mount -t ceph 10.0.0.1:/ /mnt/ceph_mount -o name=admin,secret=xxxx
>>>>>>>>
>>>>>>>> // set directory’s layout.pool_namespace
>>>>>>>> setfattr -n ceph.dir.pool_namespace -v foo_ns /mnt/ceph_mount/foo_dir
>>>>>>>>
>>>>>>>> Admin user chooses namespace name. In most cases, namespace name does not change.
>>>>>>>
>>>>>>> Good, I guess limiting it to 100 chars (or maybe even a smaller
>>>>>>> number) is sensible then.
>>>>>>>
>>>>>>>>
>>>>>>>>>
>>>>>>>>>>
>>>>>>>>>>
>>>>>>>>>>> I don't like the idea of sharing namespace name strings between libceph
>>>>>>>>>>> and ceph modules, especially with the strings themselves hosted in
>>>>>>>>>>> libceph.  rbd has no use for namespaces, so libceph can live with
>>>>>>>>>>> namespace names embedded into ceph_osd_request by value or with
>>>>>>>>>>> a simple non-owning pointer, leaving reference counting to the outside
>>>>>>>>>>> modules, if one of the use cases is "one namespace with a long name for
>>>>>>>>>>> the entire directory tree" or something close to it.
>>>>>>>>>>>
>>>>>>>>>>> I think the sharing infrastructure should be moved into cephfs, and
>>>>>>>>>>> probably changed to share entire file layouts along the way.  I don't
>>>>>>>>>>> know this code well enough to be sure, but it seems that by sharing
>>>>>>>>>>> file layouts and making ci->i_layout an owning pointer you might be
>>>>>>>>>>> able to piggy back on i_ceph_lock and considerably simplify the whole
>>>>>>>>>>> thing by dropping RCU and eliminating numerous get/put calls.
>>>>>>>>>>
>>>>>>>>>> RBD may use namespace later.
>>>>>>>>>> http://tracker.ceph.com/projects/ceph/wiki/Rbd_-_namespace_support
>>>>>>>>>
>>>>>>>>> Well, compared to cephfs, it's hard to call that "using" - in that
>>>>>>>>> case, there will only ever be one namespace per image.  My point is
>>>>>>>>> it's never going to use the string sharing infrastructure and is fine
>>>>>>>>> with a non-owning pointer to a string in the file layout field of the
>>>>>>>>> in-memory rbd image header.
>>>>>>>>>
>>>>>>>>>>
>>>>>>>>>> The reason I use RCU here is that ci->i_layout.pool_ns can change at
>>>>>>>>>> any time. For the same reason, using non-owning pointer for namespace
>>>>>>>>>> or entire layout is unfeasible. Using embedded namespace is not
>>>>>>>>>> elegant either. When allocating ceph_osd_request, cephfs needs to
>>>>>>>>>> lock i_ceph_lock, copy namespace to a temporary buffer, unlock
>>>>>>>>>> i_ceph_lock, pass ci->i_layout and the temporary buffer to the
>>>>>>>>>> ceph_osdc_xxx_request().
>>>>>>>>>
>>>>>>>>> Hmm, RCU doesn't protect you from pool_ns or other file layout fields
>>>>>>>>> changing while the OSD request is in flight.  As used above, it allows
>>>>>>>>> ceph_try_get_string() to not take any locks and that's it.
>>>>>>>>
>>>>>>>> Yes.  But not taking lock simplify the code a lot.  we don't need to
>>>>>>>> lock/unlock i_ceph_lock each time i_layout is used.
>>>>>>>
>>>>>>> I wouldn't call a bunch of rcu_dereference_* variants sprinkled around
>>>>>>> the code base a simplification, but, more importantly, is keeping the
>>>>>>> pool_ns pointer valid really all you need?  Shouldn't there be some
>>>>>>> kind of synchronization around "OK, I'm switching to a new layout for
>>>>>>> this inode"?  As it is, pool_ns is grabbed in ceph_osdc_new_request(),
>>>>>>> with two successive calls to ceph_osdc_new_request() potentially ending
>>>>>>> up with two different namespaces, e.g. ceph_uninline_data().
>>>>>>
>>>>>> There is synchronisation. When changing file layout, MDS revokes Frw caps from client (block new read/write, for in-progress read/write). But this synchronisation is not complete reliable when client session state is toggled between stale and active.
>>>>>
>>>>> I have a bit of trouble parsing "block new read/write, for in-progress
>>>>> read/write".  So the client will stop issuing requests as soon as it
>>>>> learns that it no longer has a cap, but what happens with the in-flight
>>>>> requests?
>>>>
>>>> When client know MDS is revoking Frw caps, it stops issuing new request and waits for in-flight requests. After all in-flight requests completes, client releases Frw caps to MDS.
>>>>
>>>>>
>>>>>>
>>>>>>>>>
>>>>>>>>> Why exactly can't file layouts be shared?  ci->i_layout would become
>>>>>>>>> reference counted and you would give libceph a pointer to the pool_ns
>>>>>>>>> string (or entire layout) after bumping it.  It doesn't matter if
>>>>>>>>> pool_ns or the rest of the layout changes due to a cap grant or revoke
>>>>>>>>> while libceph is servicing the OSD request: you would unlink it from
>>>>>>>>> the tree but the bumped reference will keep the layout around, to be
>>>>>>>>> put in the OSD request completion callback or so.  Layout lookup would
>>>>>>>>> have to happen in exactly two places: when newing an inode and handling
>>>>>>>>> cap grant/revoke, in other places you would simply bump the count on
>>>>>>>>> the basis of already holding a valid pointer.  You wouldn't have to
>>>>>>>>> unlink in the destructor, so no hassle with kref_get_unless_zero() and
>>>>>>>>> no need for RCU, with i_ceph_lock providing the exclusion around the
>>>>>>>>> tree.
>>>>>>>>
>>>>>>>> This means cephfs needs to set r_callback for all ceph_osd_request,
>>>>>>>> ceph_osd_request also needs a new field to store layout pointer.
>>>>>>>> I don’t think it’s better/simpler than reference counted namespace
>>>>>>>> string.
>>>>>>>
>>>>>>> Not necessarily - you can put after ceph_osdc_wait_request() returns.
>>>>>>> Somewhat unrelated, I'm working on refactoring osdc's handle_reply(),
>>>>>>> and it'll probably be required that all OSD requests set one of the
>>>>>>> callbacks, except for stateless fire-and-forget ones.
>>>>>>
>>>>>> For the r_callback case (no wait case), without saving a pointer in ceph_osd_request, how can I know which layout to put?
>>>>>>
>>>>>>>
>>>>>>> Sharing ->i_layout as opposed to ->i_layout->pool_ns seemed sensible to
>>>>>>> me because a) it naturally hangs off of ceph inode and b) logically,
>>>>>>> it is entire layouts and not just namespaces that are shared across the
>>>>>>> directory tree.  If you think reference counted pool_ns strings are
>>>>>>> better, I won't argue with that, but, with cephfs being the only user
>>>>>>> of either solution, it'll have to live in fs/ceph.
>>>>>>
>>>>>> I’m OK with both approaches. When sharing i_layout, we need to add a layout pointer to ceph_osd_request. After adding the layout pointer, why not let libceph release it when request finishes.
>>>>>>
>>>>>>> Separately, I think passing non-owning pool_ns pointers into libceph is
>>>>>>> worth exploring, but if that doesn't easily map onto cephfs lifetime or
>>>>>>> ownership rules, we will go with embedding namespace names by value into
>>>>>>> ceph_osd_request (or, rather, ceph_object_locator).
>>>>>>>
>>>>>>
>>>>>> As I stated in previous mail, embedded namespace is nightmare for cephfs. Every time namespace is used,  cephfs needs to lock i_ceph_lock, copy namespace to a temporary buffer.
>>>>>
>>>>> So you are maintaining that all that is needed is to keep the memory
>>>>> valid and there is no locking around installing a new namespace for an
>>>>> inode.  I didn't realize that when I suggested layout sharing, it makes
>>>>> it much less attractive.
>>>>
>>>> Yes.  That’s the main reason I decided to use RCU.
>>>
>>> For the record, I don't think it's a good design and I doubt the
>>> implementation is going to work reliably, but that's your call.
>>>
>>> Why would embedded namespaces in ceph_object_locator in libceph be
>>> a nightmare for cephfs?  What do you refer to as a temporary buffer?
>>> This kind of copying already occurs: you grab a ceph_string with
>>> ceph_try_get_string() in ceph_osdc_new_request() and it's copied into
>>> the request message, as part of encoding.  How is grabbing ceph_string
>>> before calling into libceph and explicitly copying into object locator
>>> different?
>>
>> I mean not using reference-counted ceph_string.
>>
>>
>> Yes, we can make cephfs code manage the reference counting, call ceph_try_get_string() and ceph_put_string() each time we issue osd request. But this approach requires more code/overhead for both ceph and libceph (an extra parameter for ceph_osdc_xxx_request() functions, code that copies namespace to embedded buffer, lots of ceph_try_get_string()/ceph_put_string() pair in cephfs code).  All of this is just for removing one ceph_try_get_string() call and two ceph_put_string() call in libceph. does it worth the effort, does it make libceph code cleaner?
>
> It'll be printed into the request after it's allocated, just like OID
> is now, so e.g. ceph_osdc_alloc_request() won't need any changes.  You
> may need to patch ceph_osdc_new_request(), but that's a cephfs-specific
> function and should probably be moved out of libceph.  You won't need
> to add lots of get/put pairs in cephfs either - a single helper around
> filling in an object locator would do it.
>
> The two disadvantages (extra strcpy() and larger OSD requests) are
> real, but so are the benefits of modularity and keeping object locator
> a simple value type.  I'm just trying to explore all available options.
>
> TBH I still can't wrap my head around lockless layout/namespace pointer
> updates and how that can ever work reliably...

I also don't get the purpose of making it lockless.  Neither the MDS
client nor the data path are light on locks: there is more than half
a dozen of them, i_ceph_lock in particular is taken dozens of times,
including in cap management code.  Doesn't that translate into "there is
no way to start an I/O operation w/o i_ceph_lock being involved"?
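
To put it concretely: if the layout were a plain lock-protected value,
a call site would only need something like the below before building a
request (a sketch only - the by-value pool_ns_data/pool_ns_len fields are
assumptions, not fields that exist today):

    static size_t ceph_copy_pool_ns(struct ceph_inode_info *ci,
                                    char *buf, size_t buf_len)
    {
            size_t len;

            /* i_ceph_lock is already taken all over the I/O paths */
            spin_lock(&ci->i_ceph_lock);
            len = min_t(size_t, buf_len, ci->i_layout.pool_ns_len);
            memcpy(buf, ci->i_layout.pool_ns_data, len);
            spin_unlock(&ci->i_ceph_lock);

            return len;
    }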

caps grant/revoke code also happens to be one of the two places where
the namespace is set.  The other is (re-)filling an inode struct and
both of those necessarily revolve around i_ceph_lock.  Could you lay
out the reasoning behind the current approach in more detail?

Thanks,

                Ilya

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [PATCH v2 2/6] libceph: introduce reference counted string
  2016-03-24 10:26                       ` Ilya Dryomov
  2016-03-24 12:19                         ` Ilya Dryomov
@ 2016-03-24 13:18                         ` Yan, Zheng
  2016-03-24 13:55                           ` Ilya Dryomov
  1 sibling, 1 reply; 33+ messages in thread
From: Yan, Zheng @ 2016-03-24 13:18 UTC (permalink / raw)
  To: Ilya Dryomov; +Cc: Gregory Farnum, Ceph Development, Sage Weil


> On Mar 24, 2016, at 18:26, Ilya Dryomov <idryomov@gmail.com> wrote:
> 
> On Thu, Mar 24, 2016 at 3:33 AM, Yan, Zheng <zyan@redhat.com> wrote:
>> 
>>> On Mar 24, 2016, at 03:27, Ilya Dryomov <idryomov@gmail.com> wrote:
>>> 
>>> On Wed, Mar 23, 2016 at 2:02 PM, Yan, Zheng <zyan@redhat.com> wrote:
>>>> 
>>>>> On Mar 23, 2016, at 17:51, Ilya Dryomov <idryomov@gmail.com> wrote:
>>>>> 
>>>>> On Wed, Mar 23, 2016 at 4:37 AM, Yan, Zheng <zyan@redhat.com> wrote:
>>>>>> 
>>>>>>> On Mar 23, 2016, at 00:13, Ilya Dryomov <idryomov@gmail.com> wrote:
>>>>>>> 
>>>>>>> On Tue, Mar 22, 2016 at 2:57 PM, Yan, Zheng <zyan@redhat.com> wrote:
>>>>>>>> 
>>>>>>>>> On Mar 22, 2016, at 19:00, Ilya Dryomov <idryomov@gmail.com> wrote:
>>>>>>>>> 
>>>>>>>>> On Tue, Mar 22, 2016 at 10:17 AM, Yan, Zheng <zyan@redhat.com> wrote:
>>>>>>>>>> 
>>>>>>>>>>> On Mar 22, 2016, at 14:05, Ilya Dryomov <idryomov@gmail.com> wrote:
>>>>>>>>> 
>>>>>>>>> [ snip ]
>>>>>>>>> 
>>>>>>>>>>> Hi Zheng,
>>>>>>>>>>> 
>>>>>>>>>>> A one and a half line commit message and an equally short cover letter
>>>>>>>>>>> for a series such as this isn't enough.  I *happen* to know that the
>>>>>>>>>>> basic use case for namespaces in cephfs is going to be restricting
>>>>>>>>>>> users to different parts of the directory tree, with the enforcement
>>>>>>>>>>> happening in ceph on the server side, as opposed to in ceph on the
>>>>>>>>>>> client side, but I would appreciate some details on what the actual
>>>>>>>>>>> namespace names are going to be, whether it's user input or not,
>>>>>>>>>>> whether there are plans to use namespaces for anything else, etc.
>>>>>>>>>>> 
>>>>>>>>>> 
>>>>>>>>>> The namespace restriction you mentioned is for cephfs metadata. This
>>>>>>>>>> namespace is restricting users to different namespaces in cephfs data
>>>>>>>>>> pool. (At present, the only way to restrict data access is, creating
>>>>>>>>>> multiple cephfs data pools, restrict users to different data pool.
>>>>>>>>>> Creating lost of pools is not efficient for the cluster)
>>>>>>>>> 
>>>>>>>>> What about the namespace names, who is generating them, how long are
>>>>>>>>> they going to be?  Please describe in detail how this is going to work
>>>>>>>>> for both data and metadata pools.
>>>>>>>> 
>>>>>>>> For example, to restrict user foo to directory /foo_dir
>>>>>>>> 
>>>>>>>> // create auth caps for user foo.
>>>>>>>> ceph auth get-or-create client.foo mon 'allow r' mds 'allow r, allow rw path=/foo_dir' osd 'allow rw pool=data namespace=foo_ns’
>>>>>>>> 
>>>>>>>> // mount cephfs by user admin
>>>>>>>> mount -t ceph 10.0.0.1:/ /mnt/ceph_mount -o name=admin,secret=xxxx
>>>>>>>> 
>>>>>>>> // set directory’s layout.pool_namespace
>>>>>>>> setfattr -n ceph.dir.pool_namespace -v foo_ns /mnt/ceph_mount/foo_dir
>>>>>>>> 
>>>>>>>> Admin user chooses namespace name. In most cases, namespace name does not change.
>>>>>>> 
>>>>>>> Good, I guess limiting it to 100 chars (or maybe even a smaller
>>>>>>> number) is sensible then.
>>>>>>> 
>>>>>>>> 
>>>>>>>>> 
>>>>>>>>>> 
>>>>>>>>>> 
>>>>>>>>>>> I don't like the idea of sharing namespace name strings between libceph
>>>>>>>>>>> and ceph modules, especially with the strings themselves hosted in
>>>>>>>>>>> libceph.  rbd has no use for namespaces, so libceph can live with
>>>>>>>>>>> namespace names embedded into ceph_osd_request by value or with
>>>>>>>>>>> a simple non-owning pointer, leaving reference counting to the outside
>>>>>>>>>>> modules, if one of the use cases is "one namespace with a long name for
>>>>>>>>>>> the entire directory tree" or something close to it.
>>>>>>>>>>> 
>>>>>>>>>>> I think the sharing infrastructure should be moved into cephfs, and
>>>>>>>>>>> probably changed to share entire file layouts along the way.  I don't
>>>>>>>>>>> know this code well enough to be sure, but it seems that by sharing
>>>>>>>>>>> file layouts and making ci->i_layout an owning pointer you might be
>>>>>>>>>>> able to piggy back on i_ceph_lock and considerably simplify the whole
>>>>>>>>>>> thing by dropping RCU and eliminating numerous get/put calls.
>>>>>>>>>> 
>>>>>>>>>> RBD may use namespace later.
>>>>>>>>>> http://tracker.ceph.com/projects/ceph/wiki/Rbd_-_namespace_support
>>>>>>>>> 
>>>>>>>>> Well, compared to cephfs, it's hard to call that "using" - in that
>>>>>>>>> case, there will only ever be one namespace per image.  My point is
>>>>>>>>> it's never going to use the string sharing infrastructure and is fine
>>>>>>>>> with a non-owning pointer to a string in the file layout field of the
>>>>>>>>> in-memory rbd image header.
>>>>>>>>> 
>>>>>>>>>> 
>>>>>>>>>> The reason I use RCU here is that ci->i_layout.pool_ns can change at
>>>>>>>>>> any time. For the same reason, using non-owning pointer for namespace
>>>>>>>>>> or entire layout is unfeasible. Using embedded namespace is not
>>>>>>>>>> elegant either. When allocating ceph_osd_request, cephfs needs to
>>>>>>>>>> lock i_ceph_lock, copy namespace to a temporary buffer, unlock
>>>>>>>>>> i_ceph_lock, pass ci->i_layout and the temporary buffer to the
>>>>>>>>>> ceph_osdc_xxx_request().
>>>>>>>>> 
>>>>>>>>> Hmm, RCU doesn't protect you from pool_ns or other file layout fields
>>>>>>>>> changing while the OSD request is in flight.  As used above, it allows
>>>>>>>>> ceph_try_get_string() to not take any locks and that's it.
>>>>>>>> 
>>>>>>>> Yes.  But not taking lock simplify the code a lot.  we don't need to
>>>>>>>> lock/unlock i_ceph_lock each time i_layout is used.
>>>>>>> 
>>>>>>> I wouldn't call a bunch of rcu_dereference_* variants sprinkled around
>>>>>>> the code base a simplification, but, more importantly, is keeping the
>>>>>>> pool_ns pointer valid really all you need?  Shouldn't there be some
>>>>>>> kind of synchronization around "OK, I'm switching to a new layout for
>>>>>>> this inode"?  As it is, pool_ns is grabbed in ceph_osdc_new_request(),
>>>>>>> with two successive calls to ceph_osdc_new_request() potentially ending
>>>>>>> up with two different namespaces, e.g. ceph_uninline_data().
>>>>>> 
>>>>>> There is synchronisation. When changing file layout, MDS revokes Frw caps from client (block new read/write, for in-progress read/write). But this synchronisation is not complete reliable when client session state is toggled between stale and active.
>>>>> 
>>>>> I have a bit of trouble parsing "block new read/write, for in-progress
>>>>> read/write".  So the client will stop issuing requests as soon as it
>>>>> learns that it no longer has a cap, but what happens with the in-flight
>>>>> requests?
>>>> 
>>>> When client know MDS is revoking Frw caps, it stops issuing new request and waits for in-flight requests. After all in-flight requests completes, client releases Frw caps to MDS.
>>>> 
>>>>> 
>>>>>> 
>>>>>>>>> 
>>>>>>>>> Why exactly can't file layouts be shared?  ci->i_layout would become
>>>>>>>>> reference counted and you would give libceph a pointer to the pool_ns
>>>>>>>>> string (or entire layout) after bumping it.  It doesn't matter if
>>>>>>>>> pool_ns or the rest of the layout changes due to a cap grant or revoke
>>>>>>>>> while libceph is servicing the OSD request: you would unlink it from
>>>>>>>>> the tree but the bumped reference will keep the layout around, to be
>>>>>>>>> put in the OSD request completion callback or so.  Layout lookup would
>>>>>>>>> have to happen in exactly two places: when newing an inode and handling
>>>>>>>>> cap grant/revoke, in other places you would simply bump the count on
>>>>>>>>> the basis of already holding a valid pointer.  You wouldn't have to
>>>>>>>>> unlink in the destructor, so no hassle with kref_get_unless_zero() and
>>>>>>>>> no need for RCU, with i_ceph_lock providing the exclusion around the
>>>>>>>>> tree.
>>>>>>>> 
>>>>>>>> This means cephfs needs to set r_callback for all ceph_osd_request,
>>>>>>>> ceph_osd_request also needs a new field to store layout pointer.
>>>>>>>> I don’t think it’s better/simpler than reference counted namespace
>>>>>>>> string.
>>>>>>> 
>>>>>>> Not necessarily - you can put after ceph_osdc_wait_request() returns.
>>>>>>> Somewhat unrelated, I'm working on refactoring osdc's handle_reply(),
>>>>>>> and it'll probably be required that all OSD requests set one of the
>>>>>>> callbacks, except for stateless fire-and-forget ones.
>>>>>> 
>>>>>> For the r_callback case (no wait case), without saving a pointer in ceph_osd_request, how can I know which layout to put?
>>>>>> 
>>>>>>> 
>>>>>>> Sharing ->i_layout as opposed to ->i_layout->pool_ns seemed sensible to
>>>>>>> me because a) it naturally hangs off of ceph inode and b) logically,
>>>>>>> it is entire layouts and not just namespaces that are shared across the
>>>>>>> directory tree.  If you think reference counted pool_ns strings are
>>>>>>> better, I won't argue with that, but, with cephfs being the only user
>>>>>>> of either solution, it'll have to live in fs/ceph.
>>>>>> 
>>>>>> I’m OK with both approaches. When sharing i_layout, we need to add a layout pointer to ceph_osd_request. After adding the layout pointer, why not let libceph release it when request finishes.
>>>>>> 
>>>>>>> Separately, I think passing non-owning pool_ns pointers into libceph is
>>>>>>> worth exploring, but if that doesn't easily map onto cephfs lifetime or
>>>>>>> ownership rules, we will go with embedding namespace names by value into
>>>>>>> ceph_osd_request (or, rather, ceph_object_locator).
>>>>>>> 
>>>>>> 
>>>>>> As I stated in previous mail, embedded namespace is nightmare for cephfs. Every time namespace is used,  cephfs needs to lock i_ceph_lock, copy namespace to a temporary buffer.
>>>>> 
>>>>> So you are maintaining that all that is needed is to keep the memory
>>>>> valid and there is no locking around installing a new namespace for an
>>>>> inode.  I didn't realize that when I suggested layout sharing, it makes
>>>>> it much less attractive.
>>>> 
>>>> Yes.  That’s the main reason I decided to use RCU.
>>> 
>>> For the record, I don't think it's a good design and I doubt the
>>> implementation is going to work reliably, but that's your call.
>>> 
>>> Why would embedded namespaces in ceph_object_locator in libceph be
>>> a nightmare for cephfs?  What do you refer to as a temporary buffer?
>>> This kind of copying already occurs: you grab a ceph_string with
>>> ceph_try_get_string() in ceph_osdc_new_request() and it's copied into
>>> the request message, as part of encoding.  How is grabbing ceph_string
>>> before calling into libceph and explicitly copying into object locator
>>> different?
>> 
>> I mean not using reference-counted ceph_string.
>> 
>> 
>> Yes, we can make cephfs code manage the reference counting, call ceph_try_get_string() and ceph_put_string() each time we issue osd request. But this approach requires more code/overhead for both ceph and libceph (an extra parameter for ceph_osdc_xxx_request() functions, code that copies namespace to embedded buffer, lots of ceph_try_get_string()/ceph_put_string() pair in cephfs code).  All of this is just for removing one ceph_try_get_string() call and two ceph_put_string() call in libceph. does it worth the effort, does it make libceph code cleaner?
> 
> It'll be printed into the request after it's allocated, just like OID
> is now, so e.g. ceph_osdc_alloc_request() won't need any changes.  You
> may need to patch ceph_osdc_new_request(), but that's a cephfs-specific
> function and should probably be moved out of libceph.  You won't need
> to add lots of get/put pairs in cephfs either - a single helper around
> filling in an object locator would do it.
> 
> The two disadvantages (extra strcpy() and larger OSD requests) are
> real, but so are the benefits of modularity and keeping object locator
> a simple value type.  I'm just trying to explore all available options.

I agree that it’s easy for us to explore both options if we move ceph_osdc_new_request(), ceph_osdc_readpages() and ceph_osdc_writepages() into the fs/ceph directory.

> 
> TBH I still can't wrap my head around lockless layout/namespace pointer
> updates and how that can ever work reliably…

see Documentation/RCU/rcuref.txt. How RCU is used in my code is similar to the second example.
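
To spell out the pattern from that second example with the same primitives
the string code builds on (illustration only - the struct and function
names below are made up):

    /* needs <linux/kref.h>, <linux/rcupdate.h>, <linux/slab.h> */
    struct foo {
            struct kref kref;
            struct rcu_head rcu;
            char data[];
    };

    static struct foo *foo_get(struct foo __rcu **slot)
    {
            struct foo *f;

            rcu_read_lock();
            f = rcu_dereference(*slot);
            if (f && !kref_get_unless_zero(&f->kref))
                    f = NULL;       /* raced with the final put: treat as gone */
            rcu_read_unlock();

            /* the taken reference keeps *f alive past this point */
            return f;
    }

    static void foo_release(struct kref *kref)
    {
            struct foo *f = container_of(kref, struct foo, kref);

            /* defer the free so concurrent RCU readers stay safe */
            kfree_rcu(f, rcu);
    }

    static void foo_put(struct foo *f)
    {
            if (f)
                    kref_put(&f->kref, foo_release);
    }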

Regards
Yan, Zheng


> Thanks,
> 
>                Ilya

--
To unsubscribe from this list: send the line "unsubscribe ceph-devel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [PATCH v2 2/6] libceph: introduce reference counted string
  2016-03-24 13:18                         ` Yan, Zheng
@ 2016-03-24 13:55                           ` Ilya Dryomov
  2016-03-25  3:05                             ` Yan, Zheng
  0 siblings, 1 reply; 33+ messages in thread
From: Ilya Dryomov @ 2016-03-24 13:55 UTC (permalink / raw)
  To: Yan, Zheng; +Cc: Gregory Farnum, Ceph Development, Sage Weil

On Thu, Mar 24, 2016 at 2:18 PM, Yan, Zheng <zyan@redhat.com> wrote:
>
>> On Mar 24, 2016, at 18:26, Ilya Dryomov <idryomov@gmail.com> wrote:

[ snip ]

>> TBH I still can't wrap my head around lockless layout/namespace pointer
>> updates and how that can ever work reliably…
>
> see Documentation/RCU/rcuref.txt. How RCU is used in my code is similar to the second example.

It's the higher-level cephfs code and installing new namespaces in the
middle of whatever else that might be going on that I'm worried about.
See my other mail, I'd like to hear what exactly is being achieved with
this approach and have you considered reusing i_ceph_lock (or others,
of which there are plenty).

Thanks,

                Ilya
--
To unsubscribe from this list: send the line "unsubscribe ceph-devel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [PATCH v2 2/6] libceph: introduce reference counted string
  2016-03-24  1:39     ` Yan, Zheng
@ 2016-03-24 15:30       ` Douglas Fuller
  0 siblings, 0 replies; 33+ messages in thread
From: Douglas Fuller @ 2016-03-24 15:30 UTC (permalink / raw)
  To: Yan, Zheng; +Cc: Ceph Development, Sage Weil, idryomov


> On Mar 23, 2016, at 7:39 PM, Yan, Zheng <zyan@redhat.com> wrote:
> 
> 
>> On Mar 23, 2016, at 23:39, Douglas Fuller <dfuller@redhat.com> wrote:
>> 
>> 
>>> +#define ceph_try_get_string(x)					\
>>> +({								\
>>> +	struct ceph_string *___str;				\
>>> +	rcu_read_lock();					\
>>> +	___str = rcu_dereference(x);				\
>>> +	if (___str && !kref_get_unless_zero(&___str->kref))	\
>>> +		___str = NULL;					\
>>> +	rcu_read_unlock();					\
>>> +	(___str);						\
>> 
>> Isn’t this an unsafe use of __str? As I understand it, it could be updated after rcu_read_unlock().
> 
> where does it get updated?

By a later RCU update. As I understand it, an rcu_dereference()d pointer is only guaranteed stable inside rcu_read_lock()/rcu_read_unlock().

> 
> Regards
> Yan, Zheng
> 
>> 
>>> +})
>> 
> 

--
To unsubscribe from this list: send the line "unsubscribe ceph-devel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [PATCH v2 2/6] libceph: introduce reference counted string
  2016-03-24 13:55                           ` Ilya Dryomov
@ 2016-03-25  3:05                             ` Yan, Zheng
  2016-03-28 19:37                               ` Ilya Dryomov
  0 siblings, 1 reply; 33+ messages in thread
From: Yan, Zheng @ 2016-03-25  3:05 UTC (permalink / raw)
  To: Ilya Dryomov; +Cc: Gregory Farnum, Ceph Development, Sage Weil


> On Mar 24, 2016, at 21:55, Ilya Dryomov <idryomov@gmail.com> wrote:
> 
> On Thu, Mar 24, 2016 at 2:18 PM, Yan, Zheng <zyan@redhat.com> wrote:
>> 
>>> On Mar 24, 2016, at 18:26, Ilya Dryomov <idryomov@gmail.com> wrote:
> 
> [ snip ]
> 
>>> TBH I still can't wrap my head around lockless layout/namespace pointer
>>> updates and how that can ever work reliably…
>> 
>> see Documentation/RCU/rcuref.txt. How RCU is used in my code is similar to the second example.
> 
> It's the higher-level cephfs code and installing new namespaces in the
> middle of whatever else that might be going on that I'm worried about.
> See my other mail, I'd like to hear what exactly is being achieved with
> this approach and have you considered reusing i_ceph_lock (or others,
> of which there are plenty).

Updating the namespace pointer is protected by i_ceph_lock. For read access to the
namespace, using i_ceph_lock does not provide any extra guarantee. This approach
provides lockless access to the namespace, which is convenient for functions that
do not know about i_ceph_lock.
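
The update side is roughly the following (a simplified sketch of the idea,
not the exact patch code):

    /* swap in a new (already referenced) namespace string under
     * i_ceph_lock and drop the reference the old pointer held */
    static void ceph_set_pool_ns(struct ceph_inode_info *ci,
                                 struct ceph_string *new_ns)
    {
            struct ceph_string *old_ns;

            spin_lock(&ci->i_ceph_lock);
            old_ns = rcu_dereference_protected(ci->i_layout.pool_ns,
                                lockdep_is_held(&ci->i_ceph_lock));
            rcu_assign_pointer(ci->i_layout.pool_ns, new_ns);
            spin_unlock(&ci->i_ceph_lock);

            /* readers that already grabbed a reference stay safe */
            if (old_ns)
                    ceph_put_string(old_ns);
    }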

Regards
Yan, Zheng

> Thanks,
> 
>                Ilya

--
To unsubscribe from this list: send the line "unsubscribe ceph-devel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [PATCH v2 2/6] libceph: introduce reference counted string
  2016-03-25  3:05                             ` Yan, Zheng
@ 2016-03-28 19:37                               ` Ilya Dryomov
  2016-03-29  2:29                                 ` Yan, Zheng
  2016-03-29  9:56                                 ` Yan, Zheng
  0 siblings, 2 replies; 33+ messages in thread
From: Ilya Dryomov @ 2016-03-28 19:37 UTC (permalink / raw)
  To: Yan, Zheng; +Cc: Gregory Farnum, Ceph Development, Sage Weil

On Fri, Mar 25, 2016 at 4:05 AM, Yan, Zheng <zyan@redhat.com> wrote:
>
>> On Mar 24, 2016, at 21:55, Ilya Dryomov <idryomov@gmail.com> wrote:
>>
>> On Thu, Mar 24, 2016 at 2:18 PM, Yan, Zheng <zyan@redhat.com> wrote:
>>>
>>>> On Mar 24, 2016, at 18:26, Ilya Dryomov <idryomov@gmail.com> wrote:
>>
>> [ snip ]
>>
>>>> TBH I still can't wrap my head around lockless layout/namespace pointer
>>>> updates and how that can ever work reliably…
>>>
>>> see Documentation/RCU/rcuref.txt. How RCU is used in my code is similar to the second example.
>>
>> It's the higher-level cephfs code and installing new namespaces in the
>> middle of whatever else that might be going on that I'm worried about.
>> See my other mail, I'd like to hear what exactly is being achieved with
>> this approach and have you considered reusing i_ceph_lock (or others,
>> of which there are plenty).
>
> Updating namespace pointer is protected by i_ceph_lock, For read
> access of the namespace, using i_ceph_lock does not provide any extra
> guarantee. This approach provides lockless access of namespace, it’s
> convenient for functions that do not know i_ceph_lock.

Could you name some of these functions?

Thanks,

                Ilya
--
To unsubscribe from this list: send the line "unsubscribe ceph-devel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [PATCH v2 2/6] libceph: introduce reference counted string
  2016-03-28 19:37                               ` Ilya Dryomov
@ 2016-03-29  2:29                                 ` Yan, Zheng
  2016-03-29  9:56                                 ` Yan, Zheng
  1 sibling, 0 replies; 33+ messages in thread
From: Yan, Zheng @ 2016-03-29  2:29 UTC (permalink / raw)
  To: Ilya Dryomov; +Cc: Gregory Farnum, Ceph Development, Sage Weil


> On Mar 29, 2016, at 03:37, Ilya Dryomov <idryomov@gmail.com> wrote:
> 
> On Fri, Mar 25, 2016 at 4:05 AM, Yan, Zheng <zyan@redhat.com> wrote:
>> 
>>> On Mar 24, 2016, at 21:55, Ilya Dryomov <idryomov@gmail.com> wrote:
>>> 
>>> On Thu, Mar 24, 2016 at 2:18 PM, Yan, Zheng <zyan@redhat.com> wrote:
>>>> 
>>>>> On Mar 24, 2016, at 18:26, Ilya Dryomov <idryomov@gmail.com> wrote:
>>> 
>>> [ snip ]
>>> 
>>>>> TBH I still can't wrap my head around lockless layout/namespace pointer
>>>>> updates and how that can ever work reliably…
>>>> 
>>>> see Documentation/RCU/rcuref.txt. How RCU is used in my code is similar to the second example.
>>> 
>>> It's the higher-level cephfs code and installing new namespaces in the
>>> middle of whatever else that might be going on that I'm worried about.
>>> See my other mail, I'd like to hear what exactly is being achieved with
>>> this approach and have you considered reusing i_ceph_lock (or others,
>>> of which there are plenty).
>> 
>> Updating the namespace pointer is protected by i_ceph_lock. For read
>> access of the namespace, using i_ceph_lock does not provide any extra
>> guarantee. This approach provides lockless access to the namespace,
>> which is convenient for functions that do not know about i_ceph_lock.
> 
> Could you name some of these functions?

ceph_osdc_{new_request,readpages,writepages}
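
(For illustration, here is a minimal sketch of the rcuref.txt-style
pattern I mean -- the struct and helper names are made up for this
mail and are not the code in the patch.  Readers take a reference
under rcu_read_lock() without ever touching i_ceph_lock; the pointer
update itself stays serialized by i_ceph_lock.)

#include <linux/kref.h>
#include <linux/rcupdate.h>
#include <linux/slab.h>
#include <linux/spinlock.h>

struct ns_string {			/* hypothetical, not ceph_string */
	struct kref kref;
	struct rcu_head rcu;
	char str[];
};

static void ns_release(struct kref *kref)
{
	struct ns_string *ns = container_of(kref, struct ns_string, kref);

	kfree_rcu(ns, rcu);		/* free only after a grace period */
}

/* Read side, e.g. an osd_client entry point that never sees i_ceph_lock. */
static struct ns_string *ns_try_get(struct ns_string __rcu **nsp)
{
	struct ns_string *ns;

	rcu_read_lock();
	ns = rcu_dereference(*nsp);
	/* fails only while the last reference is being dropped */
	if (ns && !kref_get_unless_zero(&ns->kref))
		ns = NULL;
	rcu_read_unlock();
	return ns;
}

/* Write side: 'lock' stands in for i_ceph_lock. */
static void ns_replace(spinlock_t *lock, struct ns_string __rcu **nsp,
		       struct ns_string *new_ns)
{
	struct ns_string *old;

	spin_lock(lock);
	old = rcu_dereference_protected(*nsp, lockdep_is_held(lock));
	rcu_assign_pointer(*nsp, new_ns);
	spin_unlock(lock);
	if (old)
		kref_put(&old->kref, ns_release);	/* drop the pointer's ref */
}

This is the same shape as the second example in rcuref.txt: the grace
period provided by kfree_rcu() is what makes the unlocked
kref_get_unless_zero() safe.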

> 
> Thanks,
> 
>                Ilya



* Re: [PATCH v2 2/6] libceph: introduce reference counted string
  2016-03-28 19:37                               ` Ilya Dryomov
  2016-03-29  2:29                                 ` Yan, Zheng
@ 2016-03-29  9:56                                 ` Yan, Zheng
  2016-03-29 11:18                                   ` Ilya Dryomov
  1 sibling, 1 reply; 33+ messages in thread
From: Yan, Zheng @ 2016-03-29  9:56 UTC (permalink / raw)
  To: Ilya Dryomov; +Cc: Gregory Farnum, Ceph Development, Sage Weil

Here is code that shares i_layout and embeds the namespace in osd_request.
But the code is more complex and less efficient. I strongly doubt it's a
good design.

https://github.com/ceph/ceph-client/commits/wip-file-layout2
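
(To make the efficiency point concrete, a purely illustrative sketch
of the two variants -- all names are invented, this is neither the
posted patch nor the code in the branch above:)

#include <linux/errno.h>
#include <linux/kref.h>
#include <linux/slab.h>
#include <linux/string.h>

struct ns_ref {				/* hypothetical shared namespace string */
	struct kref kref;
	char str[];
};

/* (a) shared, refcounted string: attaching it to a request is one
 * atomic increment, no allocation, no copy. */
static void req_attach_ns_shared(struct ns_ref **dst, struct ns_ref *ns)
{
	kref_get(&ns->kref);
	*dst = ns;
}

/* (b) namespace embedded in every osd_request: each request pays for
 * an allocation and a copy of the string. */
static int req_attach_ns_embedded(char **dst, size_t *dst_len,
				  const char *ns, size_t ns_len)
{
	*dst = kmemdup(ns, ns_len, GFP_NOFS);
	if (!*dst)
		return -ENOMEM;
	*dst_len = ns_len;
	return 0;
}

Variant (b) needs no shared string infrastructure, which is why it
looks simpler, but the per-request allocation and copy is the "less
efficient" part mentioned above.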

Regards
Yan, Zheng



* Re: [PATCH v2 2/6] libceph: introduce reference counted string
  2016-03-29  9:56                                 ` Yan, Zheng
@ 2016-03-29 11:18                                   ` Ilya Dryomov
  0 siblings, 0 replies; 33+ messages in thread
From: Ilya Dryomov @ 2016-03-29 11:18 UTC (permalink / raw)
  To: Yan, Zheng; +Cc: Gregory Farnum, Ceph Development, Sage Weil

On Tue, Mar 29, 2016 at 11:56 AM, Yan, Zheng <zyan@redhat.com> wrote:
> Here is code that shares i_layout and embeds the namespace in osd_request.
> But the code is more complex and less efficient. I strongly doubt it's a
> good design.
>
> https://github.com/ceph/ceph-client/commits/wip-file-layout2

When I suggested it, I assumed namespace updates were more synchronous
than they actually are.  I said in one of my previous emails a few days
ago that it seemed less attractive, at least compared to the posted
patches, so I agree.

The string-sharing approach reaches deeply into osd_client though.
I have a *big* set of changes in the works for it - let me try to finish
it, prepare the ground for reference-counting for object locators and
ping you in a couple of weeks?  Discussing particulars like locking and
RCU usage should be easier then.
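
(Very roughly, and just as a sketch -- what I mean by reference-counting
for object locators could look something like the following; every name
below is invented, this is not struct ceph_object_locator:)

#include <linux/kref.h>
#include <linux/slab.h>
#include <linux/types.h>

struct ns_ref {				/* hypothetical refcounted namespace */
	struct kref kref;
	char str[];
};

static void ns_ref_release(struct kref *kref)
{
	kfree(container_of(kref, struct ns_ref, kref));
}

struct oloc_sketch {
	s64 pool;
	struct ns_ref *pool_ns;		/* shared, may be NULL */
};

/* Copying a locator shares the namespace string instead of duplicating it. */
static void oloc_sketch_copy(struct oloc_sketch *dst,
			     const struct oloc_sketch *src)
{
	dst->pool = src->pool;
	dst->pool_ns = src->pool_ns;
	if (dst->pool_ns)
		kref_get(&dst->pool_ns->kref);
}

static void oloc_sketch_destroy(struct oloc_sketch *oloc)
{
	if (oloc->pool_ns)
		kref_put(&oloc->pool_ns->kref, ns_ref_release);
	oloc->pool_ns = NULL;
}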

Thanks,

                Ilya



Thread overview: 33+ messages
2016-02-06  7:00 [PATCH v2 0/6] Rados pool namespace support Yan, Zheng
2016-02-06  7:00 ` [PATCH v2 1/6] libceph: define new ceph_file_layout structure Yan, Zheng
2016-02-06  7:00 ` [PATCH v2 2/6] libceph: introduce reference counted string Yan, Zheng
2016-03-22  6:05   ` Ilya Dryomov
2016-03-22  9:17     ` Yan, Zheng
2016-03-22 11:00       ` Ilya Dryomov
2016-03-22 13:57         ` Yan, Zheng
2016-03-22 16:13           ` Ilya Dryomov
2016-03-22 18:46             ` Gregory Farnum
2016-03-23  3:37             ` Yan, Zheng
2016-03-23  9:51               ` Ilya Dryomov
2016-03-23 13:02                 ` Yan, Zheng
2016-03-23 19:27                   ` Ilya Dryomov
2016-03-24  2:33                     ` Yan, Zheng
2016-03-24 10:26                       ` Ilya Dryomov
2016-03-24 12:19                         ` Ilya Dryomov
2016-03-24 13:18                         ` Yan, Zheng
2016-03-24 13:55                           ` Ilya Dryomov
2016-03-25  3:05                             ` Yan, Zheng
2016-03-28 19:37                               ` Ilya Dryomov
2016-03-29  2:29                                 ` Yan, Zheng
2016-03-29  9:56                                 ` Yan, Zheng
2016-03-29 11:18                                   ` Ilya Dryomov
2016-03-23 15:39   ` Douglas Fuller
2016-03-24  1:39     ` Yan, Zheng
2016-03-24 15:30       ` Douglas Fuller
2016-02-06  7:00 ` [PATCH v2 3/6] libceph: rados pool namesapce support Yan, Zheng
2016-03-22  6:11   ` Ilya Dryomov
     [not found]     ` <D7ADB539-766B-4E6F-B996-E90C5080D418@redhat.com>
     [not found]       ` <CAOi1vP8MnAQSr9HQiOWKML8YED-ADNktqTtcGg6LYy3wEXA8LA@mail.gmail.com>
2016-03-22  9:19         ` Ilya Dryomov
     [not found]         ` <9031B3CD-2132-4A0E-A2C9-0105A61B202E@redhat.com>
2016-03-22 11:03           ` Ilya Dryomov
2016-02-06  7:00 ` [PATCH v2 4/6] libceph: make sure redirect does not change namespace Yan, Zheng
2016-02-06  7:00 ` [PATCH v2 5/6] libceph: calculate request message size according to namespace Yan, Zheng
2016-02-06  7:00 ` [PATCH v2 6/6] ceph: rados pool namesapce support Yan, Zheng
