ceph-devel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [PATCH] ceph: fix inode number presentation on 32-bit platforms and s390x
@ 2020-08-18 16:23 Jeff Layton
  2020-08-18 16:36 ` Jeff Layton
                   ` (3 more replies)
  0 siblings, 4 replies; 14+ messages in thread
From: Jeff Layton @ 2020-08-18 16:23 UTC (permalink / raw)
  To: ceph-devel; +Cc: Ulrich.Weigand, Tuan.Hoang1

Tuan and Ulrich mentioned that they were hitting a problem on s390x,
which has a 32-bit ino_t value, even though it's a 64-bit arch (for
historical reasons).

I think the current handling of inode numbers in the ceph driver is
wrong. It tries to use 32-bit inode numbers on 32-bit arches, but that's
actually not a problem. 32-bit arches can deal with 64-bit inode numbers
just fine when userland code is compiled with LFS support (the common
case these days).

What we really want to do is just use 64-bit numbers everywhere, unless
someone has mounted with the ino32 mount option. In that case, we want
to ensure that we hash the inode number down to something that will fit
in 32 bits before presenting the value to userland.

Add new helper functions that do this, and only do the conversion before
presenting these values to userland in getattr, readdir, debugfs, and
(some) debug log messages.

The inode table hashvalue is changed to just cast the inode number to
unsigned long, as low-order bits are the most likely to vary anyway.

While it's not strictly required, we do want to put something in
inode->i_ino. Instead of basing it on BITS_PER_LONG, however, base it on
the size of the ino_t type.

Reported-by: Tuan Hoang1 <Tuan.Hoang1@ibm.com>
Reported-by: Ulrich Weigand <Ulrich.Weigand@de.ibm.com>
URL: https://tracker.ceph.com/issues/46828
Signed-off-by: Jeff Layton <jlayton@kernel.org>
---
 fs/ceph/caps.c       | 14 +++++-----
 fs/ceph/debugfs.c    |  4 +--
 fs/ceph/dir.c        | 23 +++++-----------
 fs/ceph/file.c       |  4 +--
 fs/ceph/inode.c      | 18 +++++++------
 fs/ceph/mds_client.h |  2 +-
 fs/ceph/quota.c      |  4 +--
 fs/ceph/super.h      | 63 +++++++++++++++++++-------------------------
 8 files changed, 57 insertions(+), 75 deletions(-)

diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index 55ccccf77cea..d5b1f781f398 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -887,8 +887,8 @@ int __ceph_caps_issued_mask(struct ceph_inode_info *ci, int mask, int touch)
 	int have = ci->i_snap_caps;
 
 	if ((have & mask) == mask) {
-		dout("__ceph_caps_issued_mask ino 0x%lx snap issued %s"
-		     " (mask %s)\n", ci->vfs_inode.i_ino,
+		dout("__ceph_caps_issued_mask ino 0x%llx snap issued %s"
+		     " (mask %s)\n", ceph_present_inode(&ci->vfs_inode),
 		     ceph_cap_string(have),
 		     ceph_cap_string(mask));
 		return 1;
@@ -899,8 +899,8 @@ int __ceph_caps_issued_mask(struct ceph_inode_info *ci, int mask, int touch)
 		if (!__cap_is_valid(cap))
 			continue;
 		if ((cap->issued & mask) == mask) {
-			dout("__ceph_caps_issued_mask ino 0x%lx cap %p issued %s"
-			     " (mask %s)\n", ci->vfs_inode.i_ino, cap,
+			dout("__ceph_caps_issued_mask ino 0x%llx cap %p issued %s"
+			     " (mask %s)\n", ceph_present_inode(&ci->vfs_inode), cap,
 			     ceph_cap_string(cap->issued),
 			     ceph_cap_string(mask));
 			if (touch)
@@ -911,8 +911,8 @@ int __ceph_caps_issued_mask(struct ceph_inode_info *ci, int mask, int touch)
 		/* does a combination of caps satisfy mask? */
 		have |= cap->issued;
 		if ((have & mask) == mask) {
-			dout("__ceph_caps_issued_mask ino 0x%lx combo issued %s"
-			     " (mask %s)\n", ci->vfs_inode.i_ino,
+			dout("__ceph_caps_issued_mask ino 0x%llx combo issued %s"
+			     " (mask %s)\n", ceph_present_inode(&ci->vfs_inode),
 			     ceph_cap_string(cap->issued),
 			     ceph_cap_string(mask));
 			if (touch) {
@@ -2872,7 +2872,7 @@ int ceph_get_caps(struct file *filp, int need, int want,
 			struct cap_wait cw;
 			DEFINE_WAIT_FUNC(wait, woken_wake_function);
 
-			cw.ino = inode->i_ino;
+			cw.ino = ceph_present_inode(inode);
 			cw.tgid = current->tgid;
 			cw.need = need;
 			cw.want = want;
diff --git a/fs/ceph/debugfs.c b/fs/ceph/debugfs.c
index 97539b497e4c..1cdab7d6b145 100644
--- a/fs/ceph/debugfs.c
+++ b/fs/ceph/debugfs.c
@@ -202,7 +202,7 @@ static int caps_show_cb(struct inode *inode, struct ceph_cap *cap, void *p)
 {
 	struct seq_file *s = p;
 
-	seq_printf(s, "0x%-17lx%-17s%-17s\n", inode->i_ino,
+	seq_printf(s, "0x%-17llx%-17s%-17s\n", ceph_present_inode(inode),
 		   ceph_cap_string(cap->issued),
 		   ceph_cap_string(cap->implemented));
 	return 0;
@@ -247,7 +247,7 @@ static int caps_show(struct seq_file *s, void *p)
 
 	spin_lock(&mdsc->caps_list_lock);
 	list_for_each_entry(cw, &mdsc->cap_wait_list, list) {
-		seq_printf(s, "%-13d0x%-17lx%-17s%-17s\n", cw->tgid, cw->ino,
+		seq_printf(s, "%-13d0x%-17llx%-17s%-17s\n", cw->tgid, cw->ino,
 				ceph_cap_string(cw->need),
 				ceph_cap_string(cw->want));
 	}
diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c
index 060bdcc5ce32..428110efbc01 100644
--- a/fs/ceph/dir.c
+++ b/fs/ceph/dir.c
@@ -259,9 +259,7 @@ static int __dcache_readdir(struct file *file,  struct dir_context *ctx,
 			     dentry, dentry, d_inode(dentry));
 			ctx->pos = di->offset;
 			if (!dir_emit(ctx, dentry->d_name.name,
-				      dentry->d_name.len,
-				      ceph_translate_ino(dentry->d_sb,
-							 d_inode(dentry)->i_ino),
+				      dentry->d_name.len, ceph_present_inode(d_inode(dentry)),
 				      d_inode(dentry)->i_mode >> 12)) {
 				dput(dentry);
 				err = 0;
@@ -324,17 +322,14 @@ static int ceph_readdir(struct file *file, struct dir_context *ctx)
 	/* always start with . and .. */
 	if (ctx->pos == 0) {
 		dout("readdir off 0 -> '.'\n");
-		if (!dir_emit(ctx, ".", 1, 
-			    ceph_translate_ino(inode->i_sb, inode->i_ino),
+		if (!dir_emit(ctx, ".", 1, ceph_present_inode(inode),
 			    inode->i_mode >> 12))
 			return 0;
 		ctx->pos = 1;
 	}
 	if (ctx->pos == 1) {
-		ino_t ino = parent_ino(file->f_path.dentry);
 		dout("readdir off 1 -> '..'\n");
-		if (!dir_emit(ctx, "..", 2,
-			    ceph_translate_ino(inode->i_sb, ino),
+		if (!dir_emit(ctx, "..", 2, ceph_present_inode(inode),
 			    inode->i_mode >> 12))
 			return 0;
 		ctx->pos = 2;
@@ -507,9 +502,6 @@ static int ceph_readdir(struct file *file, struct dir_context *ctx)
 	}
 	for (; i < rinfo->dir_nr; i++) {
 		struct ceph_mds_reply_dir_entry *rde = rinfo->dir_entries + i;
-		struct ceph_vino vino;
-		ino_t ino;
-		u32 ftype;
 
 		BUG_ON(rde->offset < ctx->pos);
 
@@ -519,13 +511,10 @@ static int ceph_readdir(struct file *file, struct dir_context *ctx)
 		     rde->name_len, rde->name, &rde->inode.in);
 
 		BUG_ON(!rde->inode.in);
-		ftype = le32_to_cpu(rde->inode.in->mode) >> 12;
-		vino.ino = le64_to_cpu(rde->inode.in->ino);
-		vino.snap = le64_to_cpu(rde->inode.in->snapid);
-		ino = ceph_vino_to_ino(vino);
 
 		if (!dir_emit(ctx, rde->name, rde->name_len,
-			      ceph_translate_ino(inode->i_sb, ino), ftype)) {
+			      ceph_present_ino(inode->i_sb, le64_to_cpu(rde->inode.in->ino)),
+			      le32_to_cpu(rde->inode.in->mode) >> 12)) {
 			dout("filldir stopping us...\n");
 			return 0;
 		}
@@ -1161,7 +1150,7 @@ static int ceph_unlink(struct inode *dir, struct dentry *dentry)
 
 	if (try_async && op == CEPH_MDS_OP_UNLINK &&
 	    (req->r_dir_caps = get_caps_for_async_unlink(dir, dentry))) {
-		dout("async unlink on %lu/%.*s caps=%s", dir->i_ino,
+		dout("async unlink on %llu/%.*s caps=%s", ceph_present_inode(dir),
 		     dentry->d_name.len, dentry->d_name.name,
 		     ceph_cap_string(req->r_dir_caps));
 		set_bit(CEPH_MDS_R_ASYNC, &req->r_req_flags);
diff --git a/fs/ceph/file.c b/fs/ceph/file.c
index 04ab99c0223a..d745d26a50f8 100644
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -628,8 +628,8 @@ static int ceph_finish_async_create(struct inode *dir, struct dentry *dentry,
 	} else {
 		struct dentry *dn;
 
-		dout("%s d_adding new inode 0x%llx to 0x%lx/%s\n", __func__,
-			vino.ino, dir->i_ino, dentry->d_name.name);
+		dout("%s d_adding new inode 0x%llx to 0x%llx/%s\n", __func__,
+			vino.ino, ceph_ino(dir), dentry->d_name.name);
 		ceph_dir_clear_ordered(dir);
 		ceph_init_inode_acls(inode, as_ctx);
 		if (inode->i_state & I_NEW) {
diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
index 357c937699d5..8dbb56d2edef 100644
--- a/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@ -41,8 +41,10 @@ static void ceph_inode_work(struct work_struct *work);
  */
 static int ceph_set_ino_cb(struct inode *inode, void *data)
 {
-	ceph_inode(inode)->i_vino = *(struct ceph_vino *)data;
-	inode->i_ino = ceph_vino_to_ino(*(struct ceph_vino *)data);
+	struct ceph_inode_info *ci = ceph_inode(inode);
+
+	ci->i_vino = *(struct ceph_vino *)data;
+	inode->i_ino = ceph_vino_to_ino(ci->i_vino);
 	inode_set_iversion_raw(inode, 0);
 	return 0;
 }
@@ -50,17 +52,17 @@ static int ceph_set_ino_cb(struct inode *inode, void *data)
 struct inode *ceph_get_inode(struct super_block *sb, struct ceph_vino vino)
 {
 	struct inode *inode;
-	ino_t t = ceph_vino_to_ino(vino);
 
-	inode = iget5_locked(sb, t, ceph_ino_compare, ceph_set_ino_cb, &vino);
+	inode = iget5_locked(sb, (unsigned long)vino.ino, ceph_ino_compare,
+			     ceph_set_ino_cb, &vino);
 	if (!inode)
 		return ERR_PTR(-ENOMEM);
 	if (inode->i_state & I_NEW)
 		dout("get_inode created new inode %p %llx.%llx ino %llx\n",
-		     inode, ceph_vinop(inode), (u64)inode->i_ino);
+		     inode, ceph_vinop(inode), ceph_present_inode(inode));
 
-	dout("get_inode on %lu=%llx.%llx got %p\n", inode->i_ino, vino.ino,
-	     vino.snap, inode);
+	dout("get_inode on %llu=%llx.%llx got %p\n", ceph_present_inode(inode),
+	     vino.ino, vino.snap, inode);
 	return inode;
 }
 
@@ -2378,7 +2380,7 @@ int ceph_getattr(const struct path *path, struct kstat *stat,
 	}
 
 	generic_fillattr(inode, stat);
-	stat->ino = ceph_translate_ino(inode->i_sb, inode->i_ino);
+	stat->ino = ceph_present_inode(inode);
 
 	/*
 	 * btime on newly-allocated inodes is 0, so if this is still set to
diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
index bc9e95937d7c..658800605bfb 100644
--- a/fs/ceph/mds_client.h
+++ b/fs/ceph/mds_client.h
@@ -372,7 +372,7 @@ struct ceph_quotarealm_inode {
 
 struct cap_wait {
 	struct list_head	list;
-	unsigned long		ino;
+	u64			ino;
 	pid_t			tgid;
 	int			need;
 	int			want;
diff --git a/fs/ceph/quota.c b/fs/ceph/quota.c
index 198ddde5c1e6..cc2c4d40b022 100644
--- a/fs/ceph/quota.c
+++ b/fs/ceph/quota.c
@@ -23,12 +23,12 @@ static inline bool ceph_has_realms_with_quotas(struct inode *inode)
 {
 	struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc;
 	struct super_block *sb = mdsc->fsc->sb;
+	struct inode *root = d_inode(sb->s_root);
 
 	if (atomic64_read(&mdsc->quotarealms_count) > 0)
 		return true;
 	/* if root is the real CephFS root, we don't have quota realms */
-	if (sb->s_root->d_inode &&
-	    (sb->s_root->d_inode->i_ino == CEPH_INO_ROOT))
+	if (root && ceph_ino(root) == CEPH_INO_ROOT)
 		return false;
 	/* otherwise, we can't know for sure */
 	return true;
diff --git a/fs/ceph/super.h b/fs/ceph/super.h
index 4c3c964b1c54..2a49330de8e8 100644
--- a/fs/ceph/super.h
+++ b/fs/ceph/super.h
@@ -457,15 +457,7 @@ ceph_vino(const struct inode *inode)
 	return ceph_inode(inode)->i_vino;
 }
 
-/*
- * ino_t is <64 bits on many architectures, blech.
- *
- *               i_ino (kernel inode)   st_ino (userspace)
- * i386          32                     32
- * x86_64+ino32  64                     32
- * x86_64        64                     64
- */
-static inline u32 ceph_ino_to_ino32(__u64 vino)
+static inline u32 ceph_ino_to_ino32(u64 vino)
 {
 	u32 ino = vino & 0xffffffff;
 	ino ^= vino >> 32;
@@ -474,36 +466,13 @@ static inline u32 ceph_ino_to_ino32(__u64 vino)
 	return ino;
 }
 
-/*
- * kernel i_ino value
- */
 static inline ino_t ceph_vino_to_ino(struct ceph_vino vino)
 {
-#if BITS_PER_LONG == 32
-	return ceph_ino_to_ino32(vino.ino);
-#else
+	if (sizeof(ino_t) == 32)
+		return ceph_ino_to_ino32(vino.ino);
 	return (ino_t)vino.ino;
-#endif
 }
 
-/*
- * user-visible ino (stat, filldir)
- */
-#if BITS_PER_LONG == 32
-static inline ino_t ceph_translate_ino(struct super_block *sb, ino_t ino)
-{
-	return ino;
-}
-#else
-static inline ino_t ceph_translate_ino(struct super_block *sb, ino_t ino)
-{
-	if (ceph_test_mount_opt(ceph_sb_to_client(sb), INO32))
-		ino = ceph_ino_to_ino32(ino);
-	return ino;
-}
-#endif
-
-
 /* for printf-style formatting */
 #define ceph_vinop(i) ceph_inode(i)->i_vino.ino, ceph_inode(i)->i_vino.snap
 
@@ -511,11 +480,34 @@ static inline u64 ceph_ino(struct inode *inode)
 {
 	return ceph_inode(inode)->i_vino.ino;
 }
+
 static inline u64 ceph_snap(struct inode *inode)
 {
 	return ceph_inode(inode)->i_vino.snap;
 }
 
+/**
+ * ceph_present_ino - format an inode number for presentation to userland
+ * @sb: superblock where the inode lives
+ * @ino: inode number to (possibly) convert
+ *
+ * If the user mounted with the ino32 option, then the 64-bit value needs
+ * to be converted to something that can fit inside 32 bits. Note that
+ * internal kernel code never uses this value, so this is entirely for
+ * userland consumption.
+ */
+static inline u64 ceph_present_ino(struct super_block *sb, u64 ino)
+{
+	if (unlikely(ceph_test_mount_opt(ceph_sb_to_client(sb), INO32)))
+		return ceph_ino_to_ino32(ino);
+	return ino;
+}
+
+static inline u64 ceph_present_inode(struct inode *inode)
+{
+	return ceph_present_ino(inode->i_sb, ceph_ino(inode));
+}
+
 static inline int ceph_ino_compare(struct inode *inode, void *data)
 {
 	struct ceph_vino *pvino = (struct ceph_vino *)data;
@@ -527,8 +519,7 @@ static inline int ceph_ino_compare(struct inode *inode, void *data)
 static inline struct inode *ceph_find_inode(struct super_block *sb,
 					    struct ceph_vino vino)
 {
-	ino_t t = ceph_vino_to_ino(vino);
-	return ilookup5(sb, t, ceph_ino_compare, &vino);
+	return ilookup5(sb, (unsigned long)vino.ino, ceph_ino_compare, &vino);
 }
 
 
-- 
2.26.2


^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH] ceph: fix inode number presentation on 32-bit platforms and s390x
  2020-08-18 16:23 [PATCH] ceph: fix inode number presentation on 32-bit platforms and s390x Jeff Layton
@ 2020-08-18 16:36 ` Jeff Layton
  2020-08-18 19:49 ` [PATCH v2] ceph: fix inode number handling on arches with 32-bit ino_t Jeff Layton
                   ` (2 subsequent siblings)
  3 siblings, 0 replies; 14+ messages in thread
From: Jeff Layton @ 2020-08-18 16:36 UTC (permalink / raw)
  To: ceph-devel; +Cc: Ulrich.Weigand, Tuan.Hoang1

On Tue, 2020-08-18 at 12:23 -0400, Jeff Layton wrote:
> Tuan and Ulrich mentioned that they were hitting a problem on s390x,
> which has a 32-bit ino_t value, even though it's a 64-bit arch (for
> historical reasons).
> 
> I think the current handling of inode numbers in the ceph driver is
> wrong. It tries to use 32-bit inode numbers on 32-bit arches, but that's
> actually not a problem. 32-bit arches can deal with 64-bit inode numbers
> just fine when userland code is compiled with LFS support (the common
> case these days).
> 
> What we really want to do is just use 64-bit numbers everywhere, unless
> someone has mounted with the ino32 mount option. In that case, we want
> to ensure that we hash the inode number down to something that will fit
> in 32 bits before presenting the value to userland.
> 
> Add new helper functions that do this, and only do the conversion before
> presenting these values to userland in getattr, readdir, debugfs, and
> (some) debug log messages.
> 
> The inode table hashvalue is changed to just cast the inode number to
> unsigned long, as low-order bits are the most likely to vary anyway.
> 
> While it's not strictly required, we do want to put something in
> inode->i_ino. Instead of basing it on BITS_PER_LONG, however, base it on
> the size of the ino_t type.
> 
> Reported-by: Tuan Hoang1 <Tuan.Hoang1@ibm.com>
> Reported-by: Ulrich Weigand <Ulrich.Weigand@de.ibm.com>
> URL: https://tracker.ceph.com/issues/46828
> Signed-off-by: Jeff Layton <jlayton@kernel.org>
> ---
>  fs/ceph/caps.c       | 14 +++++-----
>  fs/ceph/debugfs.c    |  4 +--
>  fs/ceph/dir.c        | 23 +++++-----------
>  fs/ceph/file.c       |  4 +--
>  fs/ceph/inode.c      | 18 +++++++------
>  fs/ceph/mds_client.h |  2 +-
>  fs/ceph/quota.c      |  4 +--
>  fs/ceph/super.h      | 63 +++++++++++++++++++-------------------------
>  8 files changed, 57 insertions(+), 75 deletions(-)
> 
> diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
> index 55ccccf77cea..d5b1f781f398 100644
> --- a/fs/ceph/caps.c
> +++ b/fs/ceph/caps.c
> @@ -887,8 +887,8 @@ int __ceph_caps_issued_mask(struct ceph_inode_info *ci, int mask, int touch)
>  	int have = ci->i_snap_caps;
>  
>  	if ((have & mask) == mask) {
> -		dout("__ceph_caps_issued_mask ino 0x%lx snap issued %s"
> -		     " (mask %s)\n", ci->vfs_inode.i_ino,
> +		dout("__ceph_caps_issued_mask ino 0x%llx snap issued %s"
> +		     " (mask %s)\n", ceph_present_inode(&ci->vfs_inode),
>  		     ceph_cap_string(have),
>  		     ceph_cap_string(mask));
>  		return 1;
> @@ -899,8 +899,8 @@ int __ceph_caps_issued_mask(struct ceph_inode_info *ci, int mask, int touch)
>  		if (!__cap_is_valid(cap))
>  			continue;
>  		if ((cap->issued & mask) == mask) {
> -			dout("__ceph_caps_issued_mask ino 0x%lx cap %p issued %s"
> -			     " (mask %s)\n", ci->vfs_inode.i_ino, cap,
> +			dout("__ceph_caps_issued_mask ino 0x%llx cap %p issued %s"
> +			     " (mask %s)\n", ceph_present_inode(&ci->vfs_inode), cap,
>  			     ceph_cap_string(cap->issued),
>  			     ceph_cap_string(mask));
>  			if (touch)
> @@ -911,8 +911,8 @@ int __ceph_caps_issued_mask(struct ceph_inode_info *ci, int mask, int touch)
>  		/* does a combination of caps satisfy mask? */
>  		have |= cap->issued;
>  		if ((have & mask) == mask) {
> -			dout("__ceph_caps_issued_mask ino 0x%lx combo issued %s"
> -			     " (mask %s)\n", ci->vfs_inode.i_ino,
> +			dout("__ceph_caps_issued_mask ino 0x%llx combo issued %s"
> +			     " (mask %s)\n", ceph_present_inode(&ci->vfs_inode),
>  			     ceph_cap_string(cap->issued),
>  			     ceph_cap_string(mask));
>  			if (touch) {
> @@ -2872,7 +2872,7 @@ int ceph_get_caps(struct file *filp, int need, int want,
>  			struct cap_wait cw;
>  			DEFINE_WAIT_FUNC(wait, woken_wake_function);
>  
> -			cw.ino = inode->i_ino;
> +			cw.ino = ceph_present_inode(inode);
>  			cw.tgid = current->tgid;
>  			cw.need = need;
>  			cw.want = want;
> diff --git a/fs/ceph/debugfs.c b/fs/ceph/debugfs.c
> index 97539b497e4c..1cdab7d6b145 100644
> --- a/fs/ceph/debugfs.c
> +++ b/fs/ceph/debugfs.c
> @@ -202,7 +202,7 @@ static int caps_show_cb(struct inode *inode, struct ceph_cap *cap, void *p)
>  {
>  	struct seq_file *s = p;
>  
> -	seq_printf(s, "0x%-17lx%-17s%-17s\n", inode->i_ino,
> +	seq_printf(s, "0x%-17llx%-17s%-17s\n", ceph_present_inode(inode),
>  		   ceph_cap_string(cap->issued),
>  		   ceph_cap_string(cap->implemented));
>  	return 0;
> @@ -247,7 +247,7 @@ static int caps_show(struct seq_file *s, void *p)
>  
>  	spin_lock(&mdsc->caps_list_lock);
>  	list_for_each_entry(cw, &mdsc->cap_wait_list, list) {
> -		seq_printf(s, "%-13d0x%-17lx%-17s%-17s\n", cw->tgid, cw->ino,
> +		seq_printf(s, "%-13d0x%-17llx%-17s%-17s\n", cw->tgid, cw->ino,
>  				ceph_cap_string(cw->need),
>  				ceph_cap_string(cw->want));
>  	}
> diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c
> index 060bdcc5ce32..428110efbc01 100644
> --- a/fs/ceph/dir.c
> +++ b/fs/ceph/dir.c
> @@ -259,9 +259,7 @@ static int __dcache_readdir(struct file *file,  struct dir_context *ctx,
>  			     dentry, dentry, d_inode(dentry));
>  			ctx->pos = di->offset;
>  			if (!dir_emit(ctx, dentry->d_name.name,
> -				      dentry->d_name.len,
> -				      ceph_translate_ino(dentry->d_sb,
> -							 d_inode(dentry)->i_ino),
> +				      dentry->d_name.len, ceph_present_inode(d_inode(dentry)),
>  				      d_inode(dentry)->i_mode >> 12)) {
>  				dput(dentry);
>  				err = 0;
> @@ -324,17 +322,14 @@ static int ceph_readdir(struct file *file, struct dir_context *ctx)
>  	/* always start with . and .. */
>  	if (ctx->pos == 0) {
>  		dout("readdir off 0 -> '.'\n");
> -		if (!dir_emit(ctx, ".", 1, 
> -			    ceph_translate_ino(inode->i_sb, inode->i_ino),
> +		if (!dir_emit(ctx, ".", 1, ceph_present_inode(inode),
>  			    inode->i_mode >> 12))
>  			return 0;
>  		ctx->pos = 1;
>  	}
>  	if (ctx->pos == 1) {
> -		ino_t ino = parent_ino(file->f_path.dentry);
>  		dout("readdir off 1 -> '..'\n");
> -		if (!dir_emit(ctx, "..", 2,
> -			    ceph_translate_ino(inode->i_sb, ino),
> +		if (!dir_emit(ctx, "..", 2, ceph_present_inode(inode),
>  			    inode->i_mode >> 12))
>  			return 0;

As Ulrich pointed out in the tracker bug, the above is wrong. This
needs to fetch the inode number of the parent. I'll send a v2 once I
fix this and give it a test.

>  		ctx->pos = 2;
> @@ -507,9 +502,6 @@ static int ceph_readdir(struct file *file, struct dir_context *ctx)
>  	}
>  	for (; i < rinfo->dir_nr; i++) {
>  		struct ceph_mds_reply_dir_entry *rde = rinfo->dir_entries + i;
> -		struct ceph_vino vino;
> -		ino_t ino;
> -		u32 ftype;
>  
>  		BUG_ON(rde->offset < ctx->pos);
>  
> @@ -519,13 +511,10 @@ static int ceph_readdir(struct file *file, struct dir_context *ctx)
>  		     rde->name_len, rde->name, &rde->inode.in);
>  
>  		BUG_ON(!rde->inode.in);
> -		ftype = le32_to_cpu(rde->inode.in->mode) >> 12;
> -		vino.ino = le64_to_cpu(rde->inode.in->ino);
> -		vino.snap = le64_to_cpu(rde->inode.in->snapid);
> -		ino = ceph_vino_to_ino(vino);
>  
>  		if (!dir_emit(ctx, rde->name, rde->name_len,
> -			      ceph_translate_ino(inode->i_sb, ino), ftype)) {
> +			      ceph_present_ino(inode->i_sb, le64_to_cpu(rde->inode.in->ino)),
> +			      le32_to_cpu(rde->inode.in->mode) >> 12)) {
>  			dout("filldir stopping us...\n");
>  			return 0;
>  		}
> @@ -1161,7 +1150,7 @@ static int ceph_unlink(struct inode *dir, struct dentry *dentry)
>  
>  	if (try_async && op == CEPH_MDS_OP_UNLINK &&
>  	    (req->r_dir_caps = get_caps_for_async_unlink(dir, dentry))) {
> -		dout("async unlink on %lu/%.*s caps=%s", dir->i_ino,
> +		dout("async unlink on %llu/%.*s caps=%s", ceph_present_inode(dir),
>  		     dentry->d_name.len, dentry->d_name.name,
>  		     ceph_cap_string(req->r_dir_caps));
>  		set_bit(CEPH_MDS_R_ASYNC, &req->r_req_flags);
> diff --git a/fs/ceph/file.c b/fs/ceph/file.c
> index 04ab99c0223a..d745d26a50f8 100644
> --- a/fs/ceph/file.c
> +++ b/fs/ceph/file.c
> @@ -628,8 +628,8 @@ static int ceph_finish_async_create(struct inode *dir, struct dentry *dentry,
>  	} else {
>  		struct dentry *dn;
>  
> -		dout("%s d_adding new inode 0x%llx to 0x%lx/%s\n", __func__,
> -			vino.ino, dir->i_ino, dentry->d_name.name);
> +		dout("%s d_adding new inode 0x%llx to 0x%llx/%s\n", __func__,
> +			vino.ino, ceph_ino(dir), dentry->d_name.name);
>  		ceph_dir_clear_ordered(dir);
>  		ceph_init_inode_acls(inode, as_ctx);
>  		if (inode->i_state & I_NEW) {
> diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
> index 357c937699d5..8dbb56d2edef 100644
> --- a/fs/ceph/inode.c
> +++ b/fs/ceph/inode.c
> @@ -41,8 +41,10 @@ static void ceph_inode_work(struct work_struct *work);
>   */
>  static int ceph_set_ino_cb(struct inode *inode, void *data)
>  {
> -	ceph_inode(inode)->i_vino = *(struct ceph_vino *)data;
> -	inode->i_ino = ceph_vino_to_ino(*(struct ceph_vino *)data);
> +	struct ceph_inode_info *ci = ceph_inode(inode);
> +
> +	ci->i_vino = *(struct ceph_vino *)data;
> +	inode->i_ino = ceph_vino_to_ino(ci->i_vino);
>  	inode_set_iversion_raw(inode, 0);
>  	return 0;
>  }
> @@ -50,17 +52,17 @@ static int ceph_set_ino_cb(struct inode *inode, void *data)
>  struct inode *ceph_get_inode(struct super_block *sb, struct ceph_vino vino)
>  {
>  	struct inode *inode;
> -	ino_t t = ceph_vino_to_ino(vino);
>  
> -	inode = iget5_locked(sb, t, ceph_ino_compare, ceph_set_ino_cb, &vino);
> +	inode = iget5_locked(sb, (unsigned long)vino.ino, ceph_ino_compare,
> +			     ceph_set_ino_cb, &vino);
>  	if (!inode)
>  		return ERR_PTR(-ENOMEM);
>  	if (inode->i_state & I_NEW)
>  		dout("get_inode created new inode %p %llx.%llx ino %llx\n",
> -		     inode, ceph_vinop(inode), (u64)inode->i_ino);
> +		     inode, ceph_vinop(inode), ceph_present_inode(inode));
>  
> -	dout("get_inode on %lu=%llx.%llx got %p\n", inode->i_ino, vino.ino,
> -	     vino.snap, inode);
> +	dout("get_inode on %llu=%llx.%llx got %p\n", ceph_present_inode(inode),
> +	     vino.ino, vino.snap, inode);
>  	return inode;
>  }
>  
> @@ -2378,7 +2380,7 @@ int ceph_getattr(const struct path *path, struct kstat *stat,
>  	}
>  
>  	generic_fillattr(inode, stat);
> -	stat->ino = ceph_translate_ino(inode->i_sb, inode->i_ino);
> +	stat->ino = ceph_present_inode(inode);
>  
>  	/*
>  	 * btime on newly-allocated inodes is 0, so if this is still set to
> diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
> index bc9e95937d7c..658800605bfb 100644
> --- a/fs/ceph/mds_client.h
> +++ b/fs/ceph/mds_client.h
> @@ -372,7 +372,7 @@ struct ceph_quotarealm_inode {
>  
>  struct cap_wait {
>  	struct list_head	list;
> -	unsigned long		ino;
> +	u64			ino;
>  	pid_t			tgid;
>  	int			need;
>  	int			want;
> diff --git a/fs/ceph/quota.c b/fs/ceph/quota.c
> index 198ddde5c1e6..cc2c4d40b022 100644
> --- a/fs/ceph/quota.c
> +++ b/fs/ceph/quota.c
> @@ -23,12 +23,12 @@ static inline bool ceph_has_realms_with_quotas(struct inode *inode)
>  {
>  	struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc;
>  	struct super_block *sb = mdsc->fsc->sb;
> +	struct inode *root = d_inode(sb->s_root);
>  
>  	if (atomic64_read(&mdsc->quotarealms_count) > 0)
>  		return true;
>  	/* if root is the real CephFS root, we don't have quota realms */
> -	if (sb->s_root->d_inode &&
> -	    (sb->s_root->d_inode->i_ino == CEPH_INO_ROOT))
> +	if (root && ceph_ino(root) == CEPH_INO_ROOT)
>  		return false;
>  	/* otherwise, we can't know for sure */
>  	return true;
> diff --git a/fs/ceph/super.h b/fs/ceph/super.h
> index 4c3c964b1c54..2a49330de8e8 100644
> --- a/fs/ceph/super.h
> +++ b/fs/ceph/super.h
> @@ -457,15 +457,7 @@ ceph_vino(const struct inode *inode)
>  	return ceph_inode(inode)->i_vino;
>  }
>  
> -/*
> - * ino_t is <64 bits on many architectures, blech.
> - *
> - *               i_ino (kernel inode)   st_ino (userspace)
> - * i386          32                     32
> - * x86_64+ino32  64                     32
> - * x86_64        64                     64
> - */
> -static inline u32 ceph_ino_to_ino32(__u64 vino)
> +static inline u32 ceph_ino_to_ino32(u64 vino)
>  {
>  	u32 ino = vino & 0xffffffff;
>  	ino ^= vino >> 32;
> @@ -474,36 +466,13 @@ static inline u32 ceph_ino_to_ino32(__u64 vino)
>  	return ino;
>  }
>  
> -/*
> - * kernel i_ino value
> - */
>  static inline ino_t ceph_vino_to_ino(struct ceph_vino vino)
>  {
> -#if BITS_PER_LONG == 32
> -	return ceph_ino_to_ino32(vino.ino);
> -#else
> +	if (sizeof(ino_t) == 32)
> +		return ceph_ino_to_ino32(vino.ino);
>  	return (ino_t)vino.ino;
> -#endif
>  }
>  
> -/*
> - * user-visible ino (stat, filldir)
> - */
> -#if BITS_PER_LONG == 32
> -static inline ino_t ceph_translate_ino(struct super_block *sb, ino_t ino)
> -{
> -	return ino;
> -}
> -#else
> -static inline ino_t ceph_translate_ino(struct super_block *sb, ino_t ino)
> -{
> -	if (ceph_test_mount_opt(ceph_sb_to_client(sb), INO32))
> -		ino = ceph_ino_to_ino32(ino);
> -	return ino;
> -}
> -#endif
> -
> -
>  /* for printf-style formatting */
>  #define ceph_vinop(i) ceph_inode(i)->i_vino.ino, ceph_inode(i)->i_vino.snap
>  
> @@ -511,11 +480,34 @@ static inline u64 ceph_ino(struct inode *inode)
>  {
>  	return ceph_inode(inode)->i_vino.ino;
>  }
> +
>  static inline u64 ceph_snap(struct inode *inode)
>  {
>  	return ceph_inode(inode)->i_vino.snap;
>  }
>  
> +/**
> + * ceph_present_ino - format an inode number for presentation to userland
> + * @sb: superblock where the inode lives
> + * @ino: inode number to (possibly) convert
> + *
> + * If the user mounted with the ino32 option, then the 64-bit value needs
> + * to be converted to something that can fit inside 32 bits. Note that
> + * internal kernel code never uses this value, so this is entirely for
> + * userland consumption.
> + */
> +static inline u64 ceph_present_ino(struct super_block *sb, u64 ino)
> +{
> +	if (unlikely(ceph_test_mount_opt(ceph_sb_to_client(sb), INO32)))
> +		return ceph_ino_to_ino32(ino);
> +	return ino;
> +}
> +
> +static inline u64 ceph_present_inode(struct inode *inode)
> +{
> +	return ceph_present_ino(inode->i_sb, ceph_ino(inode));
> +}
> +
>  static inline int ceph_ino_compare(struct inode *inode, void *data)
>  {
>  	struct ceph_vino *pvino = (struct ceph_vino *)data;
> @@ -527,8 +519,7 @@ static inline int ceph_ino_compare(struct inode *inode, void *data)
>  static inline struct inode *ceph_find_inode(struct super_block *sb,
>  					    struct ceph_vino vino)
>  {
> -	ino_t t = ceph_vino_to_ino(vino);
> -	return ilookup5(sb, t, ceph_ino_compare, &vino);
> +	return ilookup5(sb, (unsigned long)vino.ino, ceph_ino_compare, &vino);
>  }
>  
>  

-- 
Jeff Layton <jlayton@kernel.org>


^ permalink raw reply	[flat|nested] 14+ messages in thread

* [PATCH v2] ceph: fix inode number handling on arches with 32-bit ino_t
  2020-08-18 16:23 [PATCH] ceph: fix inode number presentation on 32-bit platforms and s390x Jeff Layton
  2020-08-18 16:36 ` Jeff Layton
@ 2020-08-18 19:49 ` Jeff Layton
  2020-08-19  1:58   ` Yan, Zheng
  2020-08-19  7:17   ` Ilya Dryomov
  2020-08-19 12:35 ` [PATCH v3] " Jeff Layton
  2020-08-19 15:16 ` [PATCH] " Jeff Layton
  3 siblings, 2 replies; 14+ messages in thread
From: Jeff Layton @ 2020-08-18 19:49 UTC (permalink / raw)
  To: ceph-devel; +Cc: Ulrich.Weigand, Tuan.Hoang1, idryomov

Tuan and Ulrich mentioned that they were hitting a problem on s390x,
which has a 32-bit ino_t value, even though it's a 64-bit arch (for
historical reasons).

I think the current handling of inode numbers in the ceph driver is
wrong. It tries to use 32-bit inode numbers on 32-bit arches, but that's
actually not a problem. 32-bit arches can deal with 64-bit inode numbers
just fine when userland code is compiled with LFS support (the common
case these days).

What we really want to do is just use 64-bit numbers everywhere, unless
someone has mounted with the ino32 mount option. In that case, we want
to ensure that we hash the inode number down to something that will fit
in 32 bits before presenting the value to userland.

Add new helper functions that do this, and only do the conversion before
presenting these values to userland in getattr, readdir, debugfs, and
(some) debug log messages.

The inode table hashvalue is changed to just cast the inode number to
unsigned long, as low-order bits are the most likely to vary anyway.

While it's not strictly required, we do want to put something in
inode->i_ino. Instead of basing it on BITS_PER_LONG, however, base it on
the size of the ino_t type.

Reported-by: Tuan Hoang1 <Tuan.Hoang1@ibm.com>
Reported-by: Ulrich Weigand <Ulrich.Weigand@de.ibm.com>
URL: https://tracker.ceph.com/issues/46828
Signed-off-by: Jeff Layton <jlayton@kernel.org>
---
 fs/ceph/caps.c       | 14 +++++-----
 fs/ceph/debugfs.c    |  4 +--
 fs/ceph/dir.c        | 31 +++++++++-------------
 fs/ceph/file.c       |  4 +--
 fs/ceph/inode.c      | 18 +++++++------
 fs/ceph/mds_client.h |  2 +-
 fs/ceph/quota.c      |  4 +--
 fs/ceph/super.h      | 63 +++++++++++++++++++-------------------------
 8 files changed, 64 insertions(+), 76 deletions(-)

v2:
- fix dir_emit inode number for ".."
- fix ino_t size test

diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index 55ccccf77cea..d5b1f781f398 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -887,8 +887,8 @@ int __ceph_caps_issued_mask(struct ceph_inode_info *ci, int mask, int touch)
 	int have = ci->i_snap_caps;
 
 	if ((have & mask) == mask) {
-		dout("__ceph_caps_issued_mask ino 0x%lx snap issued %s"
-		     " (mask %s)\n", ci->vfs_inode.i_ino,
+		dout("__ceph_caps_issued_mask ino 0x%llx snap issued %s"
+		     " (mask %s)\n", ceph_present_inode(&ci->vfs_inode),
 		     ceph_cap_string(have),
 		     ceph_cap_string(mask));
 		return 1;
@@ -899,8 +899,8 @@ int __ceph_caps_issued_mask(struct ceph_inode_info *ci, int mask, int touch)
 		if (!__cap_is_valid(cap))
 			continue;
 		if ((cap->issued & mask) == mask) {
-			dout("__ceph_caps_issued_mask ino 0x%lx cap %p issued %s"
-			     " (mask %s)\n", ci->vfs_inode.i_ino, cap,
+			dout("__ceph_caps_issued_mask ino 0x%llx cap %p issued %s"
+			     " (mask %s)\n", ceph_present_inode(&ci->vfs_inode), cap,
 			     ceph_cap_string(cap->issued),
 			     ceph_cap_string(mask));
 			if (touch)
@@ -911,8 +911,8 @@ int __ceph_caps_issued_mask(struct ceph_inode_info *ci, int mask, int touch)
 		/* does a combination of caps satisfy mask? */
 		have |= cap->issued;
 		if ((have & mask) == mask) {
-			dout("__ceph_caps_issued_mask ino 0x%lx combo issued %s"
-			     " (mask %s)\n", ci->vfs_inode.i_ino,
+			dout("__ceph_caps_issued_mask ino 0x%llx combo issued %s"
+			     " (mask %s)\n", ceph_present_inode(&ci->vfs_inode),
 			     ceph_cap_string(cap->issued),
 			     ceph_cap_string(mask));
 			if (touch) {
@@ -2872,7 +2872,7 @@ int ceph_get_caps(struct file *filp, int need, int want,
 			struct cap_wait cw;
 			DEFINE_WAIT_FUNC(wait, woken_wake_function);
 
-			cw.ino = inode->i_ino;
+			cw.ino = ceph_present_inode(inode);
 			cw.tgid = current->tgid;
 			cw.need = need;
 			cw.want = want;
diff --git a/fs/ceph/debugfs.c b/fs/ceph/debugfs.c
index 20043382d825..5eeca7987717 100644
--- a/fs/ceph/debugfs.c
+++ b/fs/ceph/debugfs.c
@@ -211,7 +211,7 @@ static int caps_show_cb(struct inode *inode, struct ceph_cap *cap, void *p)
 {
 	struct seq_file *s = p;
 
-	seq_printf(s, "0x%-17lx%-17s%-17s\n", inode->i_ino,
+	seq_printf(s, "0x%-17llx%-17s%-17s\n", ceph_present_inode(inode),
 		   ceph_cap_string(cap->issued),
 		   ceph_cap_string(cap->implemented));
 	return 0;
@@ -256,7 +256,7 @@ static int caps_show(struct seq_file *s, void *p)
 
 	spin_lock(&mdsc->caps_list_lock);
 	list_for_each_entry(cw, &mdsc->cap_wait_list, list) {
-		seq_printf(s, "%-13d0x%-17lx%-17s%-17s\n", cw->tgid, cw->ino,
+		seq_printf(s, "%-13d0x%-17llx%-17s%-17s\n", cw->tgid, cw->ino,
 				ceph_cap_string(cw->need),
 				ceph_cap_string(cw->want));
 	}
diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c
index 060bdcc5ce32..1a61a435a1cd 100644
--- a/fs/ceph/dir.c
+++ b/fs/ceph/dir.c
@@ -259,9 +259,7 @@ static int __dcache_readdir(struct file *file,  struct dir_context *ctx,
 			     dentry, dentry, d_inode(dentry));
 			ctx->pos = di->offset;
 			if (!dir_emit(ctx, dentry->d_name.name,
-				      dentry->d_name.len,
-				      ceph_translate_ino(dentry->d_sb,
-							 d_inode(dentry)->i_ino),
+				      dentry->d_name.len, ceph_present_inode(d_inode(dentry)),
 				      d_inode(dentry)->i_mode >> 12)) {
 				dput(dentry);
 				err = 0;
@@ -324,18 +322,21 @@ static int ceph_readdir(struct file *file, struct dir_context *ctx)
 	/* always start with . and .. */
 	if (ctx->pos == 0) {
 		dout("readdir off 0 -> '.'\n");
-		if (!dir_emit(ctx, ".", 1, 
-			    ceph_translate_ino(inode->i_sb, inode->i_ino),
+		if (!dir_emit(ctx, ".", 1, ceph_present_inode(inode),
 			    inode->i_mode >> 12))
 			return 0;
 		ctx->pos = 1;
 	}
 	if (ctx->pos == 1) {
-		ino_t ino = parent_ino(file->f_path.dentry);
+		u64 ino;
+		struct dentry *dentry = file->f_path.dentry;
+
+		spin_lock(&dentry->d_lock);
+		ino = ceph_present_inode(dentry->d_parent->d_inode);
+		spin_unlock(&dentry->d_lock);
+
 		dout("readdir off 1 -> '..'\n");
-		if (!dir_emit(ctx, "..", 2,
-			    ceph_translate_ino(inode->i_sb, ino),
-			    inode->i_mode >> 12))
+		if (!dir_emit(ctx, "..", 2, ino, inode->i_mode >> 12))
 			return 0;
 		ctx->pos = 2;
 	}
@@ -507,9 +508,6 @@ static int ceph_readdir(struct file *file, struct dir_context *ctx)
 	}
 	for (; i < rinfo->dir_nr; i++) {
 		struct ceph_mds_reply_dir_entry *rde = rinfo->dir_entries + i;
-		struct ceph_vino vino;
-		ino_t ino;
-		u32 ftype;
 
 		BUG_ON(rde->offset < ctx->pos);
 
@@ -519,13 +517,10 @@ static int ceph_readdir(struct file *file, struct dir_context *ctx)
 		     rde->name_len, rde->name, &rde->inode.in);
 
 		BUG_ON(!rde->inode.in);
-		ftype = le32_to_cpu(rde->inode.in->mode) >> 12;
-		vino.ino = le64_to_cpu(rde->inode.in->ino);
-		vino.snap = le64_to_cpu(rde->inode.in->snapid);
-		ino = ceph_vino_to_ino(vino);
 
 		if (!dir_emit(ctx, rde->name, rde->name_len,
-			      ceph_translate_ino(inode->i_sb, ino), ftype)) {
+			      ceph_present_ino(inode->i_sb, le64_to_cpu(rde->inode.in->ino)),
+			      le32_to_cpu(rde->inode.in->mode) >> 12)) {
 			dout("filldir stopping us...\n");
 			return 0;
 		}
@@ -1161,7 +1156,7 @@ static int ceph_unlink(struct inode *dir, struct dentry *dentry)
 
 	if (try_async && op == CEPH_MDS_OP_UNLINK &&
 	    (req->r_dir_caps = get_caps_for_async_unlink(dir, dentry))) {
-		dout("async unlink on %lu/%.*s caps=%s", dir->i_ino,
+		dout("async unlink on %llu/%.*s caps=%s", ceph_present_inode(dir),
 		     dentry->d_name.len, dentry->d_name.name,
 		     ceph_cap_string(req->r_dir_caps));
 		set_bit(CEPH_MDS_R_ASYNC, &req->r_req_flags);
diff --git a/fs/ceph/file.c b/fs/ceph/file.c
index 5fa28a620932..20e0c6f1d8f2 100644
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -642,8 +642,8 @@ static int ceph_finish_async_create(struct inode *dir, struct dentry *dentry,
 	} else {
 		struct dentry *dn;
 
-		dout("%s d_adding new inode 0x%llx to 0x%lx/%s\n", __func__,
-			vino.ino, dir->i_ino, dentry->d_name.name);
+		dout("%s d_adding new inode 0x%llx to 0x%llx/%s\n", __func__,
+			vino.ino, ceph_ino(dir), dentry->d_name.name);
 		ceph_dir_clear_ordered(dir);
 		ceph_init_inode_acls(inode, as_ctx);
 		if (inode->i_state & I_NEW) {
diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
index 357c937699d5..8dbb56d2edef 100644
--- a/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@ -41,8 +41,10 @@ static void ceph_inode_work(struct work_struct *work);
  */
 static int ceph_set_ino_cb(struct inode *inode, void *data)
 {
-	ceph_inode(inode)->i_vino = *(struct ceph_vino *)data;
-	inode->i_ino = ceph_vino_to_ino(*(struct ceph_vino *)data);
+	struct ceph_inode_info *ci = ceph_inode(inode);
+
+	ci->i_vino = *(struct ceph_vino *)data;
+	inode->i_ino = ceph_vino_to_ino(ci->i_vino);
 	inode_set_iversion_raw(inode, 0);
 	return 0;
 }
@@ -50,17 +52,17 @@ static int ceph_set_ino_cb(struct inode *inode, void *data)
 struct inode *ceph_get_inode(struct super_block *sb, struct ceph_vino vino)
 {
 	struct inode *inode;
-	ino_t t = ceph_vino_to_ino(vino);
 
-	inode = iget5_locked(sb, t, ceph_ino_compare, ceph_set_ino_cb, &vino);
+	inode = iget5_locked(sb, (unsigned long)vino.ino, ceph_ino_compare,
+			     ceph_set_ino_cb, &vino);
 	if (!inode)
 		return ERR_PTR(-ENOMEM);
 	if (inode->i_state & I_NEW)
 		dout("get_inode created new inode %p %llx.%llx ino %llx\n",
-		     inode, ceph_vinop(inode), (u64)inode->i_ino);
+		     inode, ceph_vinop(inode), ceph_present_inode(inode));
 
-	dout("get_inode on %lu=%llx.%llx got %p\n", inode->i_ino, vino.ino,
-	     vino.snap, inode);
+	dout("get_inode on %llu=%llx.%llx got %p\n", ceph_present_inode(inode),
+	     vino.ino, vino.snap, inode);
 	return inode;
 }
 
@@ -2378,7 +2380,7 @@ int ceph_getattr(const struct path *path, struct kstat *stat,
 	}
 
 	generic_fillattr(inode, stat);
-	stat->ino = ceph_translate_ino(inode->i_sb, inode->i_ino);
+	stat->ino = ceph_present_inode(inode);
 
 	/*
 	 * btime on newly-allocated inodes is 0, so if this is still set to
diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
index bc9e95937d7c..658800605bfb 100644
--- a/fs/ceph/mds_client.h
+++ b/fs/ceph/mds_client.h
@@ -372,7 +372,7 @@ struct ceph_quotarealm_inode {
 
 struct cap_wait {
 	struct list_head	list;
-	unsigned long		ino;
+	u64			ino;
 	pid_t			tgid;
 	int			need;
 	int			want;
diff --git a/fs/ceph/quota.c b/fs/ceph/quota.c
index 198ddde5c1e6..cc2c4d40b022 100644
--- a/fs/ceph/quota.c
+++ b/fs/ceph/quota.c
@@ -23,12 +23,12 @@ static inline bool ceph_has_realms_with_quotas(struct inode *inode)
 {
 	struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc;
 	struct super_block *sb = mdsc->fsc->sb;
+	struct inode *root = d_inode(sb->s_root);
 
 	if (atomic64_read(&mdsc->quotarealms_count) > 0)
 		return true;
 	/* if root is the real CephFS root, we don't have quota realms */
-	if (sb->s_root->d_inode &&
-	    (sb->s_root->d_inode->i_ino == CEPH_INO_ROOT))
+	if (root && ceph_ino(root) == CEPH_INO_ROOT)
 		return false;
 	/* otherwise, we can't know for sure */
 	return true;
diff --git a/fs/ceph/super.h b/fs/ceph/super.h
index 4c3c964b1c54..0dd272a4a410 100644
--- a/fs/ceph/super.h
+++ b/fs/ceph/super.h
@@ -457,15 +457,7 @@ ceph_vino(const struct inode *inode)
 	return ceph_inode(inode)->i_vino;
 }
 
-/*
- * ino_t is <64 bits on many architectures, blech.
- *
- *               i_ino (kernel inode)   st_ino (userspace)
- * i386          32                     32
- * x86_64+ino32  64                     32
- * x86_64        64                     64
- */
-static inline u32 ceph_ino_to_ino32(__u64 vino)
+static inline u32 ceph_ino_to_ino32(u64 vino)
 {
 	u32 ino = vino & 0xffffffff;
 	ino ^= vino >> 32;
@@ -474,36 +466,13 @@ static inline u32 ceph_ino_to_ino32(__u64 vino)
 	return ino;
 }
 
-/*
- * kernel i_ino value
- */
 static inline ino_t ceph_vino_to_ino(struct ceph_vino vino)
 {
-#if BITS_PER_LONG == 32
-	return ceph_ino_to_ino32(vino.ino);
-#else
+	if (sizeof(ino_t) == sizeof(u32))
+		return ceph_ino_to_ino32(vino.ino);
 	return (ino_t)vino.ino;
-#endif
 }
 
-/*
- * user-visible ino (stat, filldir)
- */
-#if BITS_PER_LONG == 32
-static inline ino_t ceph_translate_ino(struct super_block *sb, ino_t ino)
-{
-	return ino;
-}
-#else
-static inline ino_t ceph_translate_ino(struct super_block *sb, ino_t ino)
-{
-	if (ceph_test_mount_opt(ceph_sb_to_client(sb), INO32))
-		ino = ceph_ino_to_ino32(ino);
-	return ino;
-}
-#endif
-
-
 /* for printf-style formatting */
 #define ceph_vinop(i) ceph_inode(i)->i_vino.ino, ceph_inode(i)->i_vino.snap
 
@@ -511,11 +480,34 @@ static inline u64 ceph_ino(struct inode *inode)
 {
 	return ceph_inode(inode)->i_vino.ino;
 }
+
 static inline u64 ceph_snap(struct inode *inode)
 {
 	return ceph_inode(inode)->i_vino.snap;
 }
 
+/**
+ * ceph_present_ino - format an inode number for presentation to userland
+ * @sb: superblock where the inode lives
+ * @ino: inode number to (possibly) convert
+ *
+ * If the user mounted with the ino32 option, then the 64-bit value needs
+ * to be converted to something that can fit inside 32 bits. Note that
+ * internal kernel code never uses this value, so this is entirely for
+ * userland consumption.
+ */
+static inline u64 ceph_present_ino(struct super_block *sb, u64 ino)
+{
+	if (unlikely(ceph_test_mount_opt(ceph_sb_to_client(sb), INO32)))
+		return ceph_ino_to_ino32(ino);
+	return ino;
+}
+
+static inline u64 ceph_present_inode(struct inode *inode)
+{
+	return ceph_present_ino(inode->i_sb, ceph_ino(inode));
+}
+
 static inline int ceph_ino_compare(struct inode *inode, void *data)
 {
 	struct ceph_vino *pvino = (struct ceph_vino *)data;
@@ -527,8 +519,7 @@ static inline int ceph_ino_compare(struct inode *inode, void *data)
 static inline struct inode *ceph_find_inode(struct super_block *sb,
 					    struct ceph_vino vino)
 {
-	ino_t t = ceph_vino_to_ino(vino);
-	return ilookup5(sb, t, ceph_ino_compare, &vino);
+	return ilookup5(sb, (unsigned long)vino.ino, ceph_ino_compare, &vino);
 }
 
 
-- 
2.26.2


^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH v2] ceph: fix inode number handling on arches with 32-bit ino_t
  2020-08-18 19:49 ` [PATCH v2] ceph: fix inode number handling on arches with 32-bit ino_t Jeff Layton
@ 2020-08-19  1:58   ` Yan, Zheng
  2020-08-19  7:17   ` Ilya Dryomov
  1 sibling, 0 replies; 14+ messages in thread
From: Yan, Zheng @ 2020-08-19  1:58 UTC (permalink / raw)
  To: Jeff Layton; +Cc: ceph-devel, Ulrich.Weigand, Tuan.Hoang1, Ilya Dryomov

On Wed, Aug 19, 2020 at 3:52 AM Jeff Layton <jlayton@kernel.org> wrote:
>
> Tuan and Ulrich mentioned that they were hitting a problem on s390x,
> which has a 32-bit ino_t value, even though it's a 64-bit arch (for
> historical reasons).
>
> I think the current handling of inode numbers in the ceph driver is
> wrong. It tries to use 32-bit inode numbers on 32-bit arches, but that's
> actually not a problem. 32-bit arches can deal with 64-bit inode numbers
> just fine when userland code is compiled with LFS support (the common
> case these days).
>
> What we really want to do is just use 64-bit numbers everywhere, unless
> someone has mounted with the ino32 mount option. In that case, we want
> to ensure that we hash the inode number down to something that will fit
> in 32 bits before presenting the value to userland.
>
> Add new helper functions that do this, and only do the conversion before
> presenting these values to userland in getattr, readdir, debugfs, and
> (some) debug log messages.
>
> The inode table hashvalue is changed to just cast the inode number to
> unsigned long, as low-order bits are the most likely to vary anyway.
>
> While it's not strictly required, we do want to put something in
> inode->i_ino. Instead of basing it on BITS_PER_LONG, however, base it on
> the size of the ino_t type.
>
> Reported-by: Tuan Hoang1 <Tuan.Hoang1@ibm.com>
> Reported-by: Ulrich Weigand <Ulrich.Weigand@de.ibm.com>
> URL: https://tracker.ceph.com/issues/46828
> Signed-off-by: Jeff Layton <jlayton@kernel.org>
> ---
>  fs/ceph/caps.c       | 14 +++++-----
>  fs/ceph/debugfs.c    |  4 +--
>  fs/ceph/dir.c        | 31 +++++++++-------------
>  fs/ceph/file.c       |  4 +--
>  fs/ceph/inode.c      | 18 +++++++------
>  fs/ceph/mds_client.h |  2 +-
>  fs/ceph/quota.c      |  4 +--
>  fs/ceph/super.h      | 63 +++++++++++++++++++-------------------------
>  8 files changed, 64 insertions(+), 76 deletions(-)
>
> v2:
> - fix dir_emit inode number for ".."
> - fix ino_t size test
>
> diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
> index 55ccccf77cea..d5b1f781f398 100644
> --- a/fs/ceph/caps.c
> +++ b/fs/ceph/caps.c
> @@ -887,8 +887,8 @@ int __ceph_caps_issued_mask(struct ceph_inode_info *ci, int mask, int touch)
>         int have = ci->i_snap_caps;
>
>         if ((have & mask) == mask) {
> -               dout("__ceph_caps_issued_mask ino 0x%lx snap issued %s"
> -                    " (mask %s)\n", ci->vfs_inode.i_ino,
> +               dout("__ceph_caps_issued_mask ino 0x%llx snap issued %s"
> +                    " (mask %s)\n", ceph_present_inode(&ci->vfs_inode),

debug output should alway use 64 bits ino number

>                      ceph_cap_string(have),
>                      ceph_cap_string(mask));
>                 return 1;
> @@ -899,8 +899,8 @@ int __ceph_caps_issued_mask(struct ceph_inode_info *ci, int mask, int touch)
>                 if (!__cap_is_valid(cap))
>                         continue;
>                 if ((cap->issued & mask) == mask) {
> -                       dout("__ceph_caps_issued_mask ino 0x%lx cap %p issued %s"
> -                            " (mask %s)\n", ci->vfs_inode.i_ino, cap,
> +                       dout("__ceph_caps_issued_mask ino 0x%llx cap %p issued %s"
> +                            " (mask %s)\n", ceph_present_inode(&ci->vfs_inode), cap,
>                              ceph_cap_string(cap->issued),
>                              ceph_cap_string(mask));
>                         if (touch)
> @@ -911,8 +911,8 @@ int __ceph_caps_issued_mask(struct ceph_inode_info *ci, int mask, int touch)
>                 /* does a combination of caps satisfy mask? */
>                 have |= cap->issued;
>                 if ((have & mask) == mask) {
> -                       dout("__ceph_caps_issued_mask ino 0x%lx combo issued %s"
> -                            " (mask %s)\n", ci->vfs_inode.i_ino,
> +                       dout("__ceph_caps_issued_mask ino 0x%llx combo issued %s"
> +                            " (mask %s)\n", ceph_present_inode(&ci->vfs_inode),
>                              ceph_cap_string(cap->issued),
>                              ceph_cap_string(mask));
>                         if (touch) {
> @@ -2872,7 +2872,7 @@ int ceph_get_caps(struct file *filp, int need, int want,
>                         struct cap_wait cw;
>                         DEFINE_WAIT_FUNC(wait, woken_wake_function);
>
> -                       cw.ino = inode->i_ino;
> +                       cw.ino = ceph_present_inode(inode);

should always assign 64 bits inode number to it

>                         cw.tgid = current->tgid;
>                         cw.need = need;
>                         cw.want = want;
> diff --git a/fs/ceph/debugfs.c b/fs/ceph/debugfs.c
> index 20043382d825..5eeca7987717 100644
> --- a/fs/ceph/debugfs.c
> +++ b/fs/ceph/debugfs.c
> @@ -211,7 +211,7 @@ static int caps_show_cb(struct inode *inode, struct ceph_cap *cap, void *p)
>  {
>         struct seq_file *s = p;
>
> -       seq_printf(s, "0x%-17lx%-17s%-17s\n", inode->i_ino,
> +       seq_printf(s, "0x%-17llx%-17s%-17s\n", ceph_present_inode(inode),
>                    ceph_cap_string(cap->issued),
>                    ceph_cap_string(cap->implemented));
>         return 0;
> @@ -256,7 +256,7 @@ static int caps_show(struct seq_file *s, void *p)
>
>         spin_lock(&mdsc->caps_list_lock);
>         list_for_each_entry(cw, &mdsc->cap_wait_list, list) {
> -               seq_printf(s, "%-13d0x%-17lx%-17s%-17s\n", cw->tgid, cw->ino,
> +               seq_printf(s, "%-13d0x%-17llx%-17s%-17s\n", cw->tgid, cw->ino,
>                                 ceph_cap_string(cw->need),
>                                 ceph_cap_string(cw->want));
>         }
> diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c
> index 060bdcc5ce32..1a61a435a1cd 100644
> --- a/fs/ceph/dir.c
> +++ b/fs/ceph/dir.c
> @@ -259,9 +259,7 @@ static int __dcache_readdir(struct file *file,  struct dir_context *ctx,
>                              dentry, dentry, d_inode(dentry));
>                         ctx->pos = di->offset;
>                         if (!dir_emit(ctx, dentry->d_name.name,
> -                                     dentry->d_name.len,
> -                                     ceph_translate_ino(dentry->d_sb,
> -                                                        d_inode(dentry)->i_ino),
> +                                     dentry->d_name.len, ceph_present_inode(d_inode(dentry)),
>                                       d_inode(dentry)->i_mode >> 12)) {
>                                 dput(dentry);
>                                 err = 0;
> @@ -324,18 +322,21 @@ static int ceph_readdir(struct file *file, struct dir_context *ctx)
>         /* always start with . and .. */
>         if (ctx->pos == 0) {
>                 dout("readdir off 0 -> '.'\n");
> -               if (!dir_emit(ctx, ".", 1,
> -                           ceph_translate_ino(inode->i_sb, inode->i_ino),
> +               if (!dir_emit(ctx, ".", 1, ceph_present_inode(inode),
>                             inode->i_mode >> 12))
>                         return 0;
>                 ctx->pos = 1;
>         }
>         if (ctx->pos == 1) {
> -               ino_t ino = parent_ino(file->f_path.dentry);
> +               u64 ino;
> +               struct dentry *dentry = file->f_path.dentry;
> +
> +               spin_lock(&dentry->d_lock);
> +               ino = ceph_present_inode(dentry->d_parent->d_inode);
> +               spin_unlock(&dentry->d_lock);
> +
>                 dout("readdir off 1 -> '..'\n");
> -               if (!dir_emit(ctx, "..", 2,
> -                           ceph_translate_ino(inode->i_sb, ino),
> -                           inode->i_mode >> 12))
> +               if (!dir_emit(ctx, "..", 2, ino, inode->i_mode >> 12))
>                         return 0;
>                 ctx->pos = 2;
>         }
> @@ -507,9 +508,6 @@ static int ceph_readdir(struct file *file, struct dir_context *ctx)
>         }
>         for (; i < rinfo->dir_nr; i++) {
>                 struct ceph_mds_reply_dir_entry *rde = rinfo->dir_entries + i;
> -               struct ceph_vino vino;
> -               ino_t ino;
> -               u32 ftype;
>
>                 BUG_ON(rde->offset < ctx->pos);
>
> @@ -519,13 +517,10 @@ static int ceph_readdir(struct file *file, struct dir_context *ctx)
>                      rde->name_len, rde->name, &rde->inode.in);
>
>                 BUG_ON(!rde->inode.in);
> -               ftype = le32_to_cpu(rde->inode.in->mode) >> 12;
> -               vino.ino = le64_to_cpu(rde->inode.in->ino);
> -               vino.snap = le64_to_cpu(rde->inode.in->snapid);
> -               ino = ceph_vino_to_ino(vino);
>
>                 if (!dir_emit(ctx, rde->name, rde->name_len,
> -                             ceph_translate_ino(inode->i_sb, ino), ftype)) {
> +                             ceph_present_ino(inode->i_sb, le64_to_cpu(rde->inode.in->ino)),
> +                             le32_to_cpu(rde->inode.in->mode) >> 12)) {
>                         dout("filldir stopping us...\n");
>                         return 0;
>                 }
> @@ -1161,7 +1156,7 @@ static int ceph_unlink(struct inode *dir, struct dentry *dentry)
>
>         if (try_async && op == CEPH_MDS_OP_UNLINK &&
>             (req->r_dir_caps = get_caps_for_async_unlink(dir, dentry))) {
> -               dout("async unlink on %lu/%.*s caps=%s", dir->i_ino,
> +               dout("async unlink on %llu/%.*s caps=%s", ceph_present_inode(dir),
>                      dentry->d_name.len, dentry->d_name.name,
>                      ceph_cap_string(req->r_dir_caps));
>                 set_bit(CEPH_MDS_R_ASYNC, &req->r_req_flags);
> diff --git a/fs/ceph/file.c b/fs/ceph/file.c
> index 5fa28a620932..20e0c6f1d8f2 100644
> --- a/fs/ceph/file.c
> +++ b/fs/ceph/file.c
> @@ -642,8 +642,8 @@ static int ceph_finish_async_create(struct inode *dir, struct dentry *dentry,
>         } else {
>                 struct dentry *dn;
>
> -               dout("%s d_adding new inode 0x%llx to 0x%lx/%s\n", __func__,
> -                       vino.ino, dir->i_ino, dentry->d_name.name);
> +               dout("%s d_adding new inode 0x%llx to 0x%llx/%s\n", __func__,
> +                       vino.ino, ceph_ino(dir), dentry->d_name.name);
>                 ceph_dir_clear_ordered(dir);
>                 ceph_init_inode_acls(inode, as_ctx);
>                 if (inode->i_state & I_NEW) {
> diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
> index 357c937699d5..8dbb56d2edef 100644
> --- a/fs/ceph/inode.c
> +++ b/fs/ceph/inode.c
> @@ -41,8 +41,10 @@ static void ceph_inode_work(struct work_struct *work);
>   */
>  static int ceph_set_ino_cb(struct inode *inode, void *data)
>  {
> -       ceph_inode(inode)->i_vino = *(struct ceph_vino *)data;
> -       inode->i_ino = ceph_vino_to_ino(*(struct ceph_vino *)data);
> +       struct ceph_inode_info *ci = ceph_inode(inode);
> +
> +       ci->i_vino = *(struct ceph_vino *)data;
> +       inode->i_ino = ceph_vino_to_ino(ci->i_vino);
>         inode_set_iversion_raw(inode, 0);
>         return 0;
>  }
> @@ -50,17 +52,17 @@ static int ceph_set_ino_cb(struct inode *inode, void *data)
>  struct inode *ceph_get_inode(struct super_block *sb, struct ceph_vino vino)
>  {
>         struct inode *inode;
> -       ino_t t = ceph_vino_to_ino(vino);
>
> -       inode = iget5_locked(sb, t, ceph_ino_compare, ceph_set_ino_cb, &vino);
> +       inode = iget5_locked(sb, (unsigned long)vino.ino, ceph_ino_compare,
> +                            ceph_set_ino_cb, &vino);
>         if (!inode)
>                 return ERR_PTR(-ENOMEM);
>         if (inode->i_state & I_NEW)
>                 dout("get_inode created new inode %p %llx.%llx ino %llx\n",
> -                    inode, ceph_vinop(inode), (u64)inode->i_ino);
> +                    inode, ceph_vinop(inode), ceph_present_inode(inode));
>
> -       dout("get_inode on %lu=%llx.%llx got %p\n", inode->i_ino, vino.ino,
> -            vino.snap, inode);
> +       dout("get_inode on %llu=%llx.%llx got %p\n", ceph_present_inode(inode),
> +            vino.ino, vino.snap, inode);
>         return inode;
>  }
>
> @@ -2378,7 +2380,7 @@ int ceph_getattr(const struct path *path, struct kstat *stat,
>         }
>
>         generic_fillattr(inode, stat);
> -       stat->ino = ceph_translate_ino(inode->i_sb, inode->i_ino);
> +       stat->ino = ceph_present_inode(inode);
>
>         /*
>          * btime on newly-allocated inodes is 0, so if this is still set to
> diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
> index bc9e95937d7c..658800605bfb 100644
> --- a/fs/ceph/mds_client.h
> +++ b/fs/ceph/mds_client.h
> @@ -372,7 +372,7 @@ struct ceph_quotarealm_inode {
>
>  struct cap_wait {
>         struct list_head        list;
> -       unsigned long           ino;
> +       u64                     ino;
>         pid_t                   tgid;
>         int                     need;
>         int                     want;
> diff --git a/fs/ceph/quota.c b/fs/ceph/quota.c
> index 198ddde5c1e6..cc2c4d40b022 100644
> --- a/fs/ceph/quota.c
> +++ b/fs/ceph/quota.c
> @@ -23,12 +23,12 @@ static inline bool ceph_has_realms_with_quotas(struct inode *inode)
>  {
>         struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc;
>         struct super_block *sb = mdsc->fsc->sb;
> +       struct inode *root = d_inode(sb->s_root);
>
>         if (atomic64_read(&mdsc->quotarealms_count) > 0)
>                 return true;
>         /* if root is the real CephFS root, we don't have quota realms */
> -       if (sb->s_root->d_inode &&
> -           (sb->s_root->d_inode->i_ino == CEPH_INO_ROOT))
> +       if (root && ceph_ino(root) == CEPH_INO_ROOT)
>                 return false;
>         /* otherwise, we can't know for sure */
>         return true;
> diff --git a/fs/ceph/super.h b/fs/ceph/super.h
> index 4c3c964b1c54..0dd272a4a410 100644
> --- a/fs/ceph/super.h
> +++ b/fs/ceph/super.h
> @@ -457,15 +457,7 @@ ceph_vino(const struct inode *inode)
>         return ceph_inode(inode)->i_vino;
>  }
>
> -/*
> - * ino_t is <64 bits on many architectures, blech.
> - *
> - *               i_ino (kernel inode)   st_ino (userspace)
> - * i386          32                     32
> - * x86_64+ino32  64                     32
> - * x86_64        64                     64
> - */
> -static inline u32 ceph_ino_to_ino32(__u64 vino)
> +static inline u32 ceph_ino_to_ino32(u64 vino)
>  {
>         u32 ino = vino & 0xffffffff;
>         ino ^= vino >> 32;
> @@ -474,36 +466,13 @@ static inline u32 ceph_ino_to_ino32(__u64 vino)
>         return ino;
>  }
>
> -/*
> - * kernel i_ino value
> - */
>  static inline ino_t ceph_vino_to_ino(struct ceph_vino vino)
>  {
> -#if BITS_PER_LONG == 32
> -       return ceph_ino_to_ino32(vino.ino);
> -#else
> +       if (sizeof(ino_t) == sizeof(u32))
> +               return ceph_ino_to_ino32(vino.ino);
>         return (ino_t)vino.ino;
> -#endif
>  }
>
> -/*
> - * user-visible ino (stat, filldir)
> - */
> -#if BITS_PER_LONG == 32
> -static inline ino_t ceph_translate_ino(struct super_block *sb, ino_t ino)
> -{
> -       return ino;
> -}
> -#else
> -static inline ino_t ceph_translate_ino(struct super_block *sb, ino_t ino)
> -{
> -       if (ceph_test_mount_opt(ceph_sb_to_client(sb), INO32))
> -               ino = ceph_ino_to_ino32(ino);
> -       return ino;
> -}
> -#endif
> -
> -
>  /* for printf-style formatting */
>  #define ceph_vinop(i) ceph_inode(i)->i_vino.ino, ceph_inode(i)->i_vino.snap
>
> @@ -511,11 +480,34 @@ static inline u64 ceph_ino(struct inode *inode)
>  {
>         return ceph_inode(inode)->i_vino.ino;
>  }
> +
>  static inline u64 ceph_snap(struct inode *inode)
>  {
>         return ceph_inode(inode)->i_vino.snap;
>  }
>
> +/**
> + * ceph_present_ino - format an inode number for presentation to userland
> + * @sb: superblock where the inode lives
> + * @ino: inode number to (possibly) convert
> + *
> + * If the user mounted with the ino32 option, then the 64-bit value needs
> + * to be converted to something that can fit inside 32 bits. Note that
> + * internal kernel code never uses this value, so this is entirely for
> + * userland consumption.
> + */
> +static inline u64 ceph_present_ino(struct super_block *sb, u64 ino)
> +{
> +       if (unlikely(ceph_test_mount_opt(ceph_sb_to_client(sb), INO32)))
> +               return ceph_ino_to_ino32(ino);
> +       return ino;
> +}
> +
> +static inline u64 ceph_present_inode(struct inode *inode)
> +{
> +       return ceph_present_ino(inode->i_sb, ceph_ino(inode));
> +}
> +
>  static inline int ceph_ino_compare(struct inode *inode, void *data)
>  {
>         struct ceph_vino *pvino = (struct ceph_vino *)data;
> @@ -527,8 +519,7 @@ static inline int ceph_ino_compare(struct inode *inode, void *data)
>  static inline struct inode *ceph_find_inode(struct super_block *sb,
>                                             struct ceph_vino vino)
>  {
> -       ino_t t = ceph_vino_to_ino(vino);
> -       return ilookup5(sb, t, ceph_ino_compare, &vino);
> +       return ilookup5(sb, (unsigned long)vino.ino, ceph_ino_compare, &vino);
>  }
>
>
> --
> 2.26.2
>

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH v2] ceph: fix inode number handling on arches with 32-bit ino_t
  2020-08-18 19:49 ` [PATCH v2] ceph: fix inode number handling on arches with 32-bit ino_t Jeff Layton
  2020-08-19  1:58   ` Yan, Zheng
@ 2020-08-19  7:17   ` Ilya Dryomov
  2020-08-19 11:25     ` Jeff Layton
  1 sibling, 1 reply; 14+ messages in thread
From: Ilya Dryomov @ 2020-08-19  7:17 UTC (permalink / raw)
  To: Jeff Layton; +Cc: Ceph Development, Ulrich.Weigand, Tuan.Hoang1

On Tue, Aug 18, 2020 at 9:49 PM Jeff Layton <jlayton@kernel.org> wrote:
>
> Tuan and Ulrich mentioned that they were hitting a problem on s390x,
> which has a 32-bit ino_t value, even though it's a 64-bit arch (for
> historical reasons).
>
> I think the current handling of inode numbers in the ceph driver is
> wrong. It tries to use 32-bit inode numbers on 32-bit arches, but that's
> actually not a problem. 32-bit arches can deal with 64-bit inode numbers
> just fine when userland code is compiled with LFS support (the common
> case these days).
>
> What we really want to do is just use 64-bit numbers everywhere, unless
> someone has mounted with the ino32 mount option. In that case, we want
> to ensure that we hash the inode number down to something that will fit
> in 32 bits before presenting the value to userland.
>
> Add new helper functions that do this, and only do the conversion before
> presenting these values to userland in getattr, readdir, debugfs, and
> (some) debug log messages.
>
> The inode table hashvalue is changed to just cast the inode number to
> unsigned long, as low-order bits are the most likely to vary anyway.
>
> While it's not strictly required, we do want to put something in
> inode->i_ino. Instead of basing it on BITS_PER_LONG, however, base it on
> the size of the ino_t type.
>
> Reported-by: Tuan Hoang1 <Tuan.Hoang1@ibm.com>
> Reported-by: Ulrich Weigand <Ulrich.Weigand@de.ibm.com>
> URL: https://tracker.ceph.com/issues/46828
> Signed-off-by: Jeff Layton <jlayton@kernel.org>
> ---
>  fs/ceph/caps.c       | 14 +++++-----
>  fs/ceph/debugfs.c    |  4 +--
>  fs/ceph/dir.c        | 31 +++++++++-------------
>  fs/ceph/file.c       |  4 +--
>  fs/ceph/inode.c      | 18 +++++++------
>  fs/ceph/mds_client.h |  2 +-
>  fs/ceph/quota.c      |  4 +--
>  fs/ceph/super.h      | 63 +++++++++++++++++++-------------------------
>  8 files changed, 64 insertions(+), 76 deletions(-)
>
> v2:
> - fix dir_emit inode number for ".."
> - fix ino_t size test
>
> diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
> index 55ccccf77cea..d5b1f781f398 100644
> --- a/fs/ceph/caps.c
> +++ b/fs/ceph/caps.c
> @@ -887,8 +887,8 @@ int __ceph_caps_issued_mask(struct ceph_inode_info *ci, int mask, int touch)
>         int have = ci->i_snap_caps;
>
>         if ((have & mask) == mask) {
> -               dout("__ceph_caps_issued_mask ino 0x%lx snap issued %s"
> -                    " (mask %s)\n", ci->vfs_inode.i_ino,
> +               dout("__ceph_caps_issued_mask ino 0x%llx snap issued %s"
> +                    " (mask %s)\n", ceph_present_inode(&ci->vfs_inode),

Hi Jeff,

I agree with Zheng.  douts should output the real 64-bit values.
If the presentation value is of interest in some places, they should
output it alongside the real value.

>                      ceph_cap_string(have),
>                      ceph_cap_string(mask));
>                 return 1;
> @@ -899,8 +899,8 @@ int __ceph_caps_issued_mask(struct ceph_inode_info *ci, int mask, int touch)
>                 if (!__cap_is_valid(cap))
>                         continue;
>                 if ((cap->issued & mask) == mask) {
> -                       dout("__ceph_caps_issued_mask ino 0x%lx cap %p issued %s"
> -                            " (mask %s)\n", ci->vfs_inode.i_ino, cap,
> +                       dout("__ceph_caps_issued_mask ino 0x%llx cap %p issued %s"
> +                            " (mask %s)\n", ceph_present_inode(&ci->vfs_inode), cap,
>                              ceph_cap_string(cap->issued),
>                              ceph_cap_string(mask));
>                         if (touch)
> @@ -911,8 +911,8 @@ int __ceph_caps_issued_mask(struct ceph_inode_info *ci, int mask, int touch)
>                 /* does a combination of caps satisfy mask? */
>                 have |= cap->issued;
>                 if ((have & mask) == mask) {
> -                       dout("__ceph_caps_issued_mask ino 0x%lx combo issued %s"
> -                            " (mask %s)\n", ci->vfs_inode.i_ino,
> +                       dout("__ceph_caps_issued_mask ino 0x%llx combo issued %s"
> +                            " (mask %s)\n", ceph_present_inode(&ci->vfs_inode),
>                              ceph_cap_string(cap->issued),
>                              ceph_cap_string(mask));
>                         if (touch) {
> @@ -2872,7 +2872,7 @@ int ceph_get_caps(struct file *filp, int need, int want,
>                         struct cap_wait cw;
>                         DEFINE_WAIT_FUNC(wait, woken_wake_function);
>
> -                       cw.ino = inode->i_ino;
> +                       cw.ino = ceph_present_inode(inode);
>                         cw.tgid = current->tgid;
>                         cw.need = need;
>                         cw.want = want;
> diff --git a/fs/ceph/debugfs.c b/fs/ceph/debugfs.c
> index 20043382d825..5eeca7987717 100644
> --- a/fs/ceph/debugfs.c
> +++ b/fs/ceph/debugfs.c
> @@ -211,7 +211,7 @@ static int caps_show_cb(struct inode *inode, struct ceph_cap *cap, void *p)
>  {
>         struct seq_file *s = p;
>
> -       seq_printf(s, "0x%-17lx%-17s%-17s\n", inode->i_ino,
> +       seq_printf(s, "0x%-17llx%-17s%-17s\n", ceph_present_inode(inode),
>                    ceph_cap_string(cap->issued),
>                    ceph_cap_string(cap->implemented));
>         return 0;
> @@ -256,7 +256,7 @@ static int caps_show(struct seq_file *s, void *p)
>
>         spin_lock(&mdsc->caps_list_lock);
>         list_for_each_entry(cw, &mdsc->cap_wait_list, list) {
> -               seq_printf(s, "%-13d0x%-17lx%-17s%-17s\n", cw->tgid, cw->ino,
> +               seq_printf(s, "%-13d0x%-17llx%-17s%-17s\n", cw->tgid, cw->ino,
>                                 ceph_cap_string(cw->need),
>                                 ceph_cap_string(cw->want));
>         }
> diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c
> index 060bdcc5ce32..1a61a435a1cd 100644
> --- a/fs/ceph/dir.c
> +++ b/fs/ceph/dir.c
> @@ -259,9 +259,7 @@ static int __dcache_readdir(struct file *file,  struct dir_context *ctx,
>                              dentry, dentry, d_inode(dentry));
>                         ctx->pos = di->offset;
>                         if (!dir_emit(ctx, dentry->d_name.name,
> -                                     dentry->d_name.len,
> -                                     ceph_translate_ino(dentry->d_sb,
> -                                                        d_inode(dentry)->i_ino),
> +                                     dentry->d_name.len, ceph_present_inode(d_inode(dentry)),
>                                       d_inode(dentry)->i_mode >> 12)) {
>                                 dput(dentry);
>                                 err = 0;
> @@ -324,18 +322,21 @@ static int ceph_readdir(struct file *file, struct dir_context *ctx)
>         /* always start with . and .. */
>         if (ctx->pos == 0) {
>                 dout("readdir off 0 -> '.'\n");
> -               if (!dir_emit(ctx, ".", 1,
> -                           ceph_translate_ino(inode->i_sb, inode->i_ino),
> +               if (!dir_emit(ctx, ".", 1, ceph_present_inode(inode),
>                             inode->i_mode >> 12))
>                         return 0;
>                 ctx->pos = 1;
>         }
>         if (ctx->pos == 1) {
> -               ino_t ino = parent_ino(file->f_path.dentry);
> +               u64 ino;
> +               struct dentry *dentry = file->f_path.dentry;
> +
> +               spin_lock(&dentry->d_lock);
> +               ino = ceph_present_inode(dentry->d_parent->d_inode);
> +               spin_unlock(&dentry->d_lock);
> +
>                 dout("readdir off 1 -> '..'\n");
> -               if (!dir_emit(ctx, "..", 2,
> -                           ceph_translate_ino(inode->i_sb, ino),
> -                           inode->i_mode >> 12))
> +               if (!dir_emit(ctx, "..", 2, ino, inode->i_mode >> 12))
>                         return 0;
>                 ctx->pos = 2;
>         }
> @@ -507,9 +508,6 @@ static int ceph_readdir(struct file *file, struct dir_context *ctx)
>         }
>         for (; i < rinfo->dir_nr; i++) {
>                 struct ceph_mds_reply_dir_entry *rde = rinfo->dir_entries + i;
> -               struct ceph_vino vino;
> -               ino_t ino;
> -               u32 ftype;
>
>                 BUG_ON(rde->offset < ctx->pos);
>
> @@ -519,13 +517,10 @@ static int ceph_readdir(struct file *file, struct dir_context *ctx)
>                      rde->name_len, rde->name, &rde->inode.in);
>
>                 BUG_ON(!rde->inode.in);
> -               ftype = le32_to_cpu(rde->inode.in->mode) >> 12;
> -               vino.ino = le64_to_cpu(rde->inode.in->ino);
> -               vino.snap = le64_to_cpu(rde->inode.in->snapid);
> -               ino = ceph_vino_to_ino(vino);
>
>                 if (!dir_emit(ctx, rde->name, rde->name_len,
> -                             ceph_translate_ino(inode->i_sb, ino), ftype)) {
> +                             ceph_present_ino(inode->i_sb, le64_to_cpu(rde->inode.in->ino)),
> +                             le32_to_cpu(rde->inode.in->mode) >> 12)) {
>                         dout("filldir stopping us...\n");
>                         return 0;
>                 }
> @@ -1161,7 +1156,7 @@ static int ceph_unlink(struct inode *dir, struct dentry *dentry)
>
>         if (try_async && op == CEPH_MDS_OP_UNLINK &&
>             (req->r_dir_caps = get_caps_for_async_unlink(dir, dentry))) {
> -               dout("async unlink on %lu/%.*s caps=%s", dir->i_ino,
> +               dout("async unlink on %llu/%.*s caps=%s", ceph_present_inode(dir),
>                      dentry->d_name.len, dentry->d_name.name,
>                      ceph_cap_string(req->r_dir_caps));
>                 set_bit(CEPH_MDS_R_ASYNC, &req->r_req_flags);
> diff --git a/fs/ceph/file.c b/fs/ceph/file.c
> index 5fa28a620932..20e0c6f1d8f2 100644
> --- a/fs/ceph/file.c
> +++ b/fs/ceph/file.c
> @@ -642,8 +642,8 @@ static int ceph_finish_async_create(struct inode *dir, struct dentry *dentry,
>         } else {
>                 struct dentry *dn;
>
> -               dout("%s d_adding new inode 0x%llx to 0x%lx/%s\n", __func__,
> -                       vino.ino, dir->i_ino, dentry->d_name.name);
> +               dout("%s d_adding new inode 0x%llx to 0x%llx/%s\n", __func__,
> +                       vino.ino, ceph_ino(dir), dentry->d_name.name);
>                 ceph_dir_clear_ordered(dir);
>                 ceph_init_inode_acls(inode, as_ctx);
>                 if (inode->i_state & I_NEW) {
> diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
> index 357c937699d5..8dbb56d2edef 100644
> --- a/fs/ceph/inode.c
> +++ b/fs/ceph/inode.c
> @@ -41,8 +41,10 @@ static void ceph_inode_work(struct work_struct *work);
>   */
>  static int ceph_set_ino_cb(struct inode *inode, void *data)
>  {
> -       ceph_inode(inode)->i_vino = *(struct ceph_vino *)data;
> -       inode->i_ino = ceph_vino_to_ino(*(struct ceph_vino *)data);
> +       struct ceph_inode_info *ci = ceph_inode(inode);
> +
> +       ci->i_vino = *(struct ceph_vino *)data;
> +       inode->i_ino = ceph_vino_to_ino(ci->i_vino);
>         inode_set_iversion_raw(inode, 0);
>         return 0;
>  }
> @@ -50,17 +52,17 @@ static int ceph_set_ino_cb(struct inode *inode, void *data)
>  struct inode *ceph_get_inode(struct super_block *sb, struct ceph_vino vino)
>  {
>         struct inode *inode;
> -       ino_t t = ceph_vino_to_ino(vino);
>
> -       inode = iget5_locked(sb, t, ceph_ino_compare, ceph_set_ino_cb, &vino);
> +       inode = iget5_locked(sb, (unsigned long)vino.ino, ceph_ino_compare,
> +                            ceph_set_ino_cb, &vino);
>         if (!inode)
>                 return ERR_PTR(-ENOMEM);
>         if (inode->i_state & I_NEW)
>                 dout("get_inode created new inode %p %llx.%llx ino %llx\n",
> -                    inode, ceph_vinop(inode), (u64)inode->i_ino);
> +                    inode, ceph_vinop(inode), ceph_present_inode(inode));
>
> -       dout("get_inode on %lu=%llx.%llx got %p\n", inode->i_ino, vino.ino,
> -            vino.snap, inode);
> +       dout("get_inode on %llu=%llx.%llx got %p\n", ceph_present_inode(inode),
> +            vino.ino, vino.snap, inode);
>         return inode;
>  }
>
> @@ -2378,7 +2380,7 @@ int ceph_getattr(const struct path *path, struct kstat *stat,
>         }
>
>         generic_fillattr(inode, stat);
> -       stat->ino = ceph_translate_ino(inode->i_sb, inode->i_ino);
> +       stat->ino = ceph_present_inode(inode);
>
>         /*
>          * btime on newly-allocated inodes is 0, so if this is still set to
> diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
> index bc9e95937d7c..658800605bfb 100644
> --- a/fs/ceph/mds_client.h
> +++ b/fs/ceph/mds_client.h
> @@ -372,7 +372,7 @@ struct ceph_quotarealm_inode {
>
>  struct cap_wait {
>         struct list_head        list;
> -       unsigned long           ino;
> +       u64                     ino;
>         pid_t                   tgid;
>         int                     need;
>         int                     want;
> diff --git a/fs/ceph/quota.c b/fs/ceph/quota.c
> index 198ddde5c1e6..cc2c4d40b022 100644
> --- a/fs/ceph/quota.c
> +++ b/fs/ceph/quota.c
> @@ -23,12 +23,12 @@ static inline bool ceph_has_realms_with_quotas(struct inode *inode)
>  {
>         struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc;
>         struct super_block *sb = mdsc->fsc->sb;
> +       struct inode *root = d_inode(sb->s_root);
>
>         if (atomic64_read(&mdsc->quotarealms_count) > 0)
>                 return true;
>         /* if root is the real CephFS root, we don't have quota realms */
> -       if (sb->s_root->d_inode &&
> -           (sb->s_root->d_inode->i_ino == CEPH_INO_ROOT))
> +       if (root && ceph_ino(root) == CEPH_INO_ROOT)
>                 return false;
>         /* otherwise, we can't know for sure */
>         return true;
> diff --git a/fs/ceph/super.h b/fs/ceph/super.h
> index 4c3c964b1c54..0dd272a4a410 100644
> --- a/fs/ceph/super.h
> +++ b/fs/ceph/super.h
> @@ -457,15 +457,7 @@ ceph_vino(const struct inode *inode)
>         return ceph_inode(inode)->i_vino;
>  }
>
> -/*
> - * ino_t is <64 bits on many architectures, blech.
> - *
> - *               i_ino (kernel inode)   st_ino (userspace)
> - * i386          32                     32
> - * x86_64+ino32  64                     32
> - * x86_64        64                     64
> - */
> -static inline u32 ceph_ino_to_ino32(__u64 vino)
> +static inline u32 ceph_ino_to_ino32(u64 vino)
>  {
>         u32 ino = vino & 0xffffffff;
>         ino ^= vino >> 32;
> @@ -474,36 +466,13 @@ static inline u32 ceph_ino_to_ino32(__u64 vino)
>         return ino;
>  }
>
> -/*
> - * kernel i_ino value
> - */
>  static inline ino_t ceph_vino_to_ino(struct ceph_vino vino)
>  {
> -#if BITS_PER_LONG == 32
> -       return ceph_ino_to_ino32(vino.ino);
> -#else
> +       if (sizeof(ino_t) == sizeof(u32))
> +               return ceph_ino_to_ino32(vino.ino);
>         return (ino_t)vino.ino;
> -#endif
>  }
>
> -/*
> - * user-visible ino (stat, filldir)
> - */
> -#if BITS_PER_LONG == 32
> -static inline ino_t ceph_translate_ino(struct super_block *sb, ino_t ino)
> -{
> -       return ino;
> -}
> -#else
> -static inline ino_t ceph_translate_ino(struct super_block *sb, ino_t ino)
> -{
> -       if (ceph_test_mount_opt(ceph_sb_to_client(sb), INO32))
> -               ino = ceph_ino_to_ino32(ino);
> -       return ino;
> -}
> -#endif
> -
> -
>  /* for printf-style formatting */
>  #define ceph_vinop(i) ceph_inode(i)->i_vino.ino, ceph_inode(i)->i_vino.snap
>
> @@ -511,11 +480,34 @@ static inline u64 ceph_ino(struct inode *inode)
>  {
>         return ceph_inode(inode)->i_vino.ino;
>  }
> +
>  static inline u64 ceph_snap(struct inode *inode)
>  {
>         return ceph_inode(inode)->i_vino.snap;
>  }
>
> +/**
> + * ceph_present_ino - format an inode number for presentation to userland
> + * @sb: superblock where the inode lives
> + * @ino: inode number to (possibly) convert
> + *
> + * If the user mounted with the ino32 option, then the 64-bit value needs
> + * to be converted to something that can fit inside 32 bits. Note that
> + * internal kernel code never uses this value, so this is entirely for
> + * userland consumption.
> + */
> +static inline u64 ceph_present_ino(struct super_block *sb, u64 ino)
> +{
> +       if (unlikely(ceph_test_mount_opt(ceph_sb_to_client(sb), INO32)))
> +               return ceph_ino_to_ino32(ino);
> +       return ino;
> +}
> +
> +static inline u64 ceph_present_inode(struct inode *inode)
> +{
> +       return ceph_present_ino(inode->i_sb, ceph_ino(inode));
> +}
> +
>  static inline int ceph_ino_compare(struct inode *inode, void *data)
>  {
>         struct ceph_vino *pvino = (struct ceph_vino *)data;
> @@ -527,8 +519,7 @@ static inline int ceph_ino_compare(struct inode *inode, void *data)
>  static inline struct inode *ceph_find_inode(struct super_block *sb,
>                                             struct ceph_vino vino)
>  {
> -       ino_t t = ceph_vino_to_ino(vino);
> -       return ilookup5(sb, t, ceph_ino_compare, &vino);
> +       return ilookup5(sb, (unsigned long)vino.ino, ceph_ino_compare, &vino);

What is the rationale for this bit?  There is a paragraph in the
changelog that mentions that "low-order bits are the most likely
to vary anyway", but this way we seem to have three sets of values
instead of two: the real 64-bit values, the ino_t values and the
truncated values.

Thanks,

                Ilya

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH v2] ceph: fix inode number handling on arches with 32-bit ino_t
  2020-08-19  7:17   ` Ilya Dryomov
@ 2020-08-19 11:25     ` Jeff Layton
  2020-08-19 13:28       ` Ilya Dryomov
  0 siblings, 1 reply; 14+ messages in thread
From: Jeff Layton @ 2020-08-19 11:25 UTC (permalink / raw)
  To: Ilya Dryomov; +Cc: Ceph Development, Ulrich.Weigand, Tuan.Hoang1

On Wed, 2020-08-19 at 09:17 +0200, Ilya Dryomov wrote:
> On Tue, Aug 18, 2020 at 9:49 PM Jeff Layton <jlayton@kernel.org> wrote:
> > Tuan and Ulrich mentioned that they were hitting a problem on s390x,
> > which has a 32-bit ino_t value, even though it's a 64-bit arch (for
> > historical reasons).
> > 
> > I think the current handling of inode numbers in the ceph driver is
> > wrong. It tries to use 32-bit inode numbers on 32-bit arches, but that's
> > actually not a problem. 32-bit arches can deal with 64-bit inode numbers
> > just fine when userland code is compiled with LFS support (the common
> > case these days).
> > 
> > What we really want to do is just use 64-bit numbers everywhere, unless
> > someone has mounted with the ino32 mount option. In that case, we want
> > to ensure that we hash the inode number down to something that will fit
> > in 32 bits before presenting the value to userland.
> > 
> > Add new helper functions that do this, and only do the conversion before
> > presenting these values to userland in getattr, readdir, debugfs, and
> > (some) debug log messages.
> > 
> > The inode table hashvalue is changed to just cast the inode number to
> > unsigned long, as low-order bits are the most likely to vary anyway.
> > 
> > While it's not strictly required, we do want to put something in
> > inode->i_ino. Instead of basing it on BITS_PER_LONG, however, base it on
> > the size of the ino_t type.
> > 
> > Reported-by: Tuan Hoang1 <Tuan.Hoang1@ibm.com>
> > Reported-by: Ulrich Weigand <Ulrich.Weigand@de.ibm.com>
> > URL: https://tracker.ceph.com/issues/46828
> > Signed-off-by: Jeff Layton <jlayton@kernel.org>
> > ---
> >  fs/ceph/caps.c       | 14 +++++-----
> >  fs/ceph/debugfs.c    |  4 +--
> >  fs/ceph/dir.c        | 31 +++++++++-------------
> >  fs/ceph/file.c       |  4 +--
> >  fs/ceph/inode.c      | 18 +++++++------
> >  fs/ceph/mds_client.h |  2 +-
> >  fs/ceph/quota.c      |  4 +--
> >  fs/ceph/super.h      | 63 +++++++++++++++++++-------------------------
> >  8 files changed, 64 insertions(+), 76 deletions(-)
> > 
> > v2:
> > - fix dir_emit inode number for ".."
> > - fix ino_t size test
> > 
> > diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
> > index 55ccccf77cea..d5b1f781f398 100644
> > --- a/fs/ceph/caps.c
> > +++ b/fs/ceph/caps.c
> > @@ -887,8 +887,8 @@ int __ceph_caps_issued_mask(struct ceph_inode_info *ci, int mask, int touch)
> >         int have = ci->i_snap_caps;
> > 
> >         if ((have & mask) == mask) {
> > -               dout("__ceph_caps_issued_mask ino 0x%lx snap issued %s"
> > -                    " (mask %s)\n", ci->vfs_inode.i_ino,
> > +               dout("__ceph_caps_issued_mask ino 0x%llx snap issued %s"
> > +                    " (mask %s)\n", ceph_present_inode(&ci->vfs_inode),
> 
> Hi Jeff,
> 
> I agree with Zheng.  douts should output the real 64-bit values.
> If the presentation value is of interest in some places, they should
> output it alongside the real value.
> 

Ok.

FWIW, the rationale for using ceph_present_inode in these is that the
values would better match what stat() reports in st_ino. The debug
messages are really inconsistent here though, so I'm fine with moving
them to always print the 64-bit values. My guess is that hardly anyone
uses -o ino32 in this day and age anyway...


> >                      ceph_cap_string(have),
> >                      ceph_cap_string(mask));
> >                 return 1;
> > @@ -899,8 +899,8 @@ int __ceph_caps_issued_mask(struct ceph_inode_info *ci, int mask, int touch)
> >                 if (!__cap_is_valid(cap))
> >                         continue;
> >                 if ((cap->issued & mask) == mask) {
> > -                       dout("__ceph_caps_issued_mask ino 0x%lx cap %p issued %s"
> > -                            " (mask %s)\n", ci->vfs_inode.i_ino, cap,
> > +                       dout("__ceph_caps_issued_mask ino 0x%llx cap %p issued %s"
> > +                            " (mask %s)\n", ceph_present_inode(&ci->vfs_inode), cap,
> >                              ceph_cap_string(cap->issued),
> >                              ceph_cap_string(mask));
> >                         if (touch)
> > @@ -911,8 +911,8 @@ int __ceph_caps_issued_mask(struct ceph_inode_info *ci, int mask, int touch)
> >                 /* does a combination of caps satisfy mask? */
> >                 have |= cap->issued;
> >                 if ((have & mask) == mask) {
> > -                       dout("__ceph_caps_issued_mask ino 0x%lx combo issued %s"
> > -                            " (mask %s)\n", ci->vfs_inode.i_ino,
> > +                       dout("__ceph_caps_issued_mask ino 0x%llx combo issued %s"
> > +                            " (mask %s)\n", ceph_present_inode(&ci->vfs_inode),
> >                              ceph_cap_string(cap->issued),
> >                              ceph_cap_string(mask));
> >                         if (touch) {
> > @@ -2872,7 +2872,7 @@ int ceph_get_caps(struct file *filp, int need, int want,
> >                         struct cap_wait cw;
> >                         DEFINE_WAIT_FUNC(wait, woken_wake_function);
> > 
> > -                       cw.ino = inode->i_ino;
> > +                       cw.ino = ceph_present_inode(inode);
> >                         cw.tgid = current->tgid;
> >                         cw.need = need;
> >                         cw.want = want;
> > diff --git a/fs/ceph/debugfs.c b/fs/ceph/debugfs.c
> > index 20043382d825..5eeca7987717 100644
> > --- a/fs/ceph/debugfs.c
> > +++ b/fs/ceph/debugfs.c
> > @@ -211,7 +211,7 @@ static int caps_show_cb(struct inode *inode, struct ceph_cap *cap, void *p)
> >  {
> >         struct seq_file *s = p;
> > 
> > -       seq_printf(s, "0x%-17lx%-17s%-17s\n", inode->i_ino,
> > +       seq_printf(s, "0x%-17llx%-17s%-17s\n", ceph_present_inode(inode),
> >                    ceph_cap_string(cap->issued),
> >                    ceph_cap_string(cap->implemented));
> >         return 0;
> > @@ -256,7 +256,7 @@ static int caps_show(struct seq_file *s, void *p)
> > 
> >         spin_lock(&mdsc->caps_list_lock);
> >         list_for_each_entry(cw, &mdsc->cap_wait_list, list) {
> > -               seq_printf(s, "%-13d0x%-17lx%-17s%-17s\n", cw->tgid, cw->ino,
> > +               seq_printf(s, "%-13d0x%-17llx%-17s%-17s\n", cw->tgid, cw->ino,
> >                                 ceph_cap_string(cw->need),
> >                                 ceph_cap_string(cw->want));
> >         }
> > diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c
> > index 060bdcc5ce32..1a61a435a1cd 100644
> > --- a/fs/ceph/dir.c
> > +++ b/fs/ceph/dir.c
> > @@ -259,9 +259,7 @@ static int __dcache_readdir(struct file *file,  struct dir_context *ctx,
> >                              dentry, dentry, d_inode(dentry));
> >                         ctx->pos = di->offset;
> >                         if (!dir_emit(ctx, dentry->d_name.name,
> > -                                     dentry->d_name.len,
> > -                                     ceph_translate_ino(dentry->d_sb,
> > -                                                        d_inode(dentry)->i_ino),
> > +                                     dentry->d_name.len, ceph_present_inode(d_inode(dentry)),
> >                                       d_inode(dentry)->i_mode >> 12)) {
> >                                 dput(dentry);
> >                                 err = 0;
> > @@ -324,18 +322,21 @@ static int ceph_readdir(struct file *file, struct dir_context *ctx)
> >         /* always start with . and .. */
> >         if (ctx->pos == 0) {
> >                 dout("readdir off 0 -> '.'\n");
> > -               if (!dir_emit(ctx, ".", 1,
> > -                           ceph_translate_ino(inode->i_sb, inode->i_ino),
> > +               if (!dir_emit(ctx, ".", 1, ceph_present_inode(inode),
> >                             inode->i_mode >> 12))
> >                         return 0;
> >                 ctx->pos = 1;
> >         }
> >         if (ctx->pos == 1) {
> > -               ino_t ino = parent_ino(file->f_path.dentry);
> > +               u64 ino;
> > +               struct dentry *dentry = file->f_path.dentry;
> > +
> > +               spin_lock(&dentry->d_lock);
> > +               ino = ceph_present_inode(dentry->d_parent->d_inode);
> > +               spin_unlock(&dentry->d_lock);
> > +
> >                 dout("readdir off 1 -> '..'\n");
> > -               if (!dir_emit(ctx, "..", 2,
> > -                           ceph_translate_ino(inode->i_sb, ino),
> > -                           inode->i_mode >> 12))
> > +               if (!dir_emit(ctx, "..", 2, ino, inode->i_mode >> 12))
> >                         return 0;
> >                 ctx->pos = 2;
> >         }
> > @@ -507,9 +508,6 @@ static int ceph_readdir(struct file *file, struct dir_context *ctx)
> >         }
> >         for (; i < rinfo->dir_nr; i++) {
> >                 struct ceph_mds_reply_dir_entry *rde = rinfo->dir_entries + i;
> > -               struct ceph_vino vino;
> > -               ino_t ino;
> > -               u32 ftype;
> > 
> >                 BUG_ON(rde->offset < ctx->pos);
> > 
> > @@ -519,13 +517,10 @@ static int ceph_readdir(struct file *file, struct dir_context *ctx)
> >                      rde->name_len, rde->name, &rde->inode.in);
> > 
> >                 BUG_ON(!rde->inode.in);
> > -               ftype = le32_to_cpu(rde->inode.in->mode) >> 12;
> > -               vino.ino = le64_to_cpu(rde->inode.in->ino);
> > -               vino.snap = le64_to_cpu(rde->inode.in->snapid);
> > -               ino = ceph_vino_to_ino(vino);
> > 
> >                 if (!dir_emit(ctx, rde->name, rde->name_len,
> > -                             ceph_translate_ino(inode->i_sb, ino), ftype)) {
> > +                             ceph_present_ino(inode->i_sb, le64_to_cpu(rde->inode.in->ino)),
> > +                             le32_to_cpu(rde->inode.in->mode) >> 12)) {
> >                         dout("filldir stopping us...\n");
> >                         return 0;
> >                 }
> > @@ -1161,7 +1156,7 @@ static int ceph_unlink(struct inode *dir, struct dentry *dentry)
> > 
> >         if (try_async && op == CEPH_MDS_OP_UNLINK &&
> >             (req->r_dir_caps = get_caps_for_async_unlink(dir, dentry))) {
> > -               dout("async unlink on %lu/%.*s caps=%s", dir->i_ino,
> > +               dout("async unlink on %llu/%.*s caps=%s", ceph_present_inode(dir),
> >                      dentry->d_name.len, dentry->d_name.name,
> >                      ceph_cap_string(req->r_dir_caps));
> >                 set_bit(CEPH_MDS_R_ASYNC, &req->r_req_flags);
> > diff --git a/fs/ceph/file.c b/fs/ceph/file.c
> > index 5fa28a620932..20e0c6f1d8f2 100644
> > --- a/fs/ceph/file.c
> > +++ b/fs/ceph/file.c
> > @@ -642,8 +642,8 @@ static int ceph_finish_async_create(struct inode *dir, struct dentry *dentry,
> >         } else {
> >                 struct dentry *dn;
> > 
> > -               dout("%s d_adding new inode 0x%llx to 0x%lx/%s\n", __func__,
> > -                       vino.ino, dir->i_ino, dentry->d_name.name);
> > +               dout("%s d_adding new inode 0x%llx to 0x%llx/%s\n", __func__,
> > +                       vino.ino, ceph_ino(dir), dentry->d_name.name);
> >                 ceph_dir_clear_ordered(dir);
> >                 ceph_init_inode_acls(inode, as_ctx);
> >                 if (inode->i_state & I_NEW) {
> > diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
> > index 357c937699d5..8dbb56d2edef 100644
> > --- a/fs/ceph/inode.c
> > +++ b/fs/ceph/inode.c
> > @@ -41,8 +41,10 @@ static void ceph_inode_work(struct work_struct *work);
> >   */
> >  static int ceph_set_ino_cb(struct inode *inode, void *data)
> >  {
> > -       ceph_inode(inode)->i_vino = *(struct ceph_vino *)data;
> > -       inode->i_ino = ceph_vino_to_ino(*(struct ceph_vino *)data);
> > +       struct ceph_inode_info *ci = ceph_inode(inode);
> > +
> > +       ci->i_vino = *(struct ceph_vino *)data;
> > +       inode->i_ino = ceph_vino_to_ino(ci->i_vino);
> >         inode_set_iversion_raw(inode, 0);
> >         return 0;
> >  }
> > @@ -50,17 +52,17 @@ static int ceph_set_ino_cb(struct inode *inode, void *data)
> >  struct inode *ceph_get_inode(struct super_block *sb, struct ceph_vino vino)
> >  {
> >         struct inode *inode;
> > -       ino_t t = ceph_vino_to_ino(vino);
> > 
> > -       inode = iget5_locked(sb, t, ceph_ino_compare, ceph_set_ino_cb, &vino);
> > +       inode = iget5_locked(sb, (unsigned long)vino.ino, ceph_ino_compare,
> > +                            ceph_set_ino_cb, &vino);
> >         if (!inode)
> >                 return ERR_PTR(-ENOMEM);
> >         if (inode->i_state & I_NEW)
> >                 dout("get_inode created new inode %p %llx.%llx ino %llx\n",
> > -                    inode, ceph_vinop(inode), (u64)inode->i_ino);
> > +                    inode, ceph_vinop(inode), ceph_present_inode(inode));
> > 
> > -       dout("get_inode on %lu=%llx.%llx got %p\n", inode->i_ino, vino.ino,
> > -            vino.snap, inode);
> > +       dout("get_inode on %llu=%llx.%llx got %p\n", ceph_present_inode(inode),
> > +            vino.ino, vino.snap, inode);
> >         return inode;
> >  }
> > 
> > @@ -2378,7 +2380,7 @@ int ceph_getattr(const struct path *path, struct kstat *stat,
> >         }
> > 
> >         generic_fillattr(inode, stat);
> > -       stat->ino = ceph_translate_ino(inode->i_sb, inode->i_ino);
> > +       stat->ino = ceph_present_inode(inode);
> > 
> >         /*
> >          * btime on newly-allocated inodes is 0, so if this is still set to
> > diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
> > index bc9e95937d7c..658800605bfb 100644
> > --- a/fs/ceph/mds_client.h
> > +++ b/fs/ceph/mds_client.h
> > @@ -372,7 +372,7 @@ struct ceph_quotarealm_inode {
> > 
> >  struct cap_wait {
> >         struct list_head        list;
> > -       unsigned long           ino;
> > +       u64                     ino;
> >         pid_t                   tgid;
> >         int                     need;
> >         int                     want;
> > diff --git a/fs/ceph/quota.c b/fs/ceph/quota.c
> > index 198ddde5c1e6..cc2c4d40b022 100644
> > --- a/fs/ceph/quota.c
> > +++ b/fs/ceph/quota.c
> > @@ -23,12 +23,12 @@ static inline bool ceph_has_realms_with_quotas(struct inode *inode)
> >  {
> >         struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc;
> >         struct super_block *sb = mdsc->fsc->sb;
> > +       struct inode *root = d_inode(sb->s_root);
> > 
> >         if (atomic64_read(&mdsc->quotarealms_count) > 0)
> >                 return true;
> >         /* if root is the real CephFS root, we don't have quota realms */
> > -       if (sb->s_root->d_inode &&
> > -           (sb->s_root->d_inode->i_ino == CEPH_INO_ROOT))
> > +       if (root && ceph_ino(root) == CEPH_INO_ROOT)
> >                 return false;
> >         /* otherwise, we can't know for sure */
> >         return true;
> > diff --git a/fs/ceph/super.h b/fs/ceph/super.h
> > index 4c3c964b1c54..0dd272a4a410 100644
> > --- a/fs/ceph/super.h
> > +++ b/fs/ceph/super.h
> > @@ -457,15 +457,7 @@ ceph_vino(const struct inode *inode)
> >         return ceph_inode(inode)->i_vino;
> >  }
> > 
> > -/*
> > - * ino_t is <64 bits on many architectures, blech.
> > - *
> > - *               i_ino (kernel inode)   st_ino (userspace)
> > - * i386          32                     32
> > - * x86_64+ino32  64                     32
> > - * x86_64        64                     64
> > - */
> > -static inline u32 ceph_ino_to_ino32(__u64 vino)
> > +static inline u32 ceph_ino_to_ino32(u64 vino)
> >  {
> >         u32 ino = vino & 0xffffffff;
> >         ino ^= vino >> 32;
> > @@ -474,36 +466,13 @@ static inline u32 ceph_ino_to_ino32(__u64 vino)
> >         return ino;
> >  }
> > 
> > -/*
> > - * kernel i_ino value
> > - */
> >  static inline ino_t ceph_vino_to_ino(struct ceph_vino vino)
> >  {
> > -#if BITS_PER_LONG == 32
> > -       return ceph_ino_to_ino32(vino.ino);
> > -#else
> > +       if (sizeof(ino_t) == sizeof(u32))
> > +               return ceph_ino_to_ino32(vino.ino);
> >         return (ino_t)vino.ino;
> > -#endif
> >  }
> > 
> > -/*
> > - * user-visible ino (stat, filldir)
> > - */
> > -#if BITS_PER_LONG == 32
> > -static inline ino_t ceph_translate_ino(struct super_block *sb, ino_t ino)
> > -{
> > -       return ino;
> > -}
> > -#else
> > -static inline ino_t ceph_translate_ino(struct super_block *sb, ino_t ino)
> > -{
> > -       if (ceph_test_mount_opt(ceph_sb_to_client(sb), INO32))
> > -               ino = ceph_ino_to_ino32(ino);
> > -       return ino;
> > -}
> > -#endif
> > -
> > -
> >  /* for printf-style formatting */
> >  #define ceph_vinop(i) ceph_inode(i)->i_vino.ino, ceph_inode(i)->i_vino.snap
> > 
> > @@ -511,11 +480,34 @@ static inline u64 ceph_ino(struct inode *inode)
> >  {
> >         return ceph_inode(inode)->i_vino.ino;
> >  }
> > +
> >  static inline u64 ceph_snap(struct inode *inode)
> >  {
> >         return ceph_inode(inode)->i_vino.snap;
> >  }
> > 
> > +/**
> > + * ceph_present_ino - format an inode number for presentation to userland
> > + * @sb: superblock where the inode lives
> > + * @ino: inode number to (possibly) convert
> > + *
> > + * If the user mounted with the ino32 option, then the 64-bit value needs
> > + * to be converted to something that can fit inside 32 bits. Note that
> > + * internal kernel code never uses this value, so this is entirely for
> > + * userland consumption.
> > + */
> > +static inline u64 ceph_present_ino(struct super_block *sb, u64 ino)
> > +{
> > +       if (unlikely(ceph_test_mount_opt(ceph_sb_to_client(sb), INO32)))
> > +               return ceph_ino_to_ino32(ino);
> > +       return ino;
> > +}
> > +
> > +static inline u64 ceph_present_inode(struct inode *inode)
> > +{
> > +       return ceph_present_ino(inode->i_sb, ceph_ino(inode));
> > +}
> > +
> >  static inline int ceph_ino_compare(struct inode *inode, void *data)
> >  {
> >         struct ceph_vino *pvino = (struct ceph_vino *)data;
> > @@ -527,8 +519,7 @@ static inline int ceph_ino_compare(struct inode *inode, void *data)
> >  static inline struct inode *ceph_find_inode(struct super_block *sb,
> >                                             struct ceph_vino vino)
> >  {
> > -       ino_t t = ceph_vino_to_ino(vino);
> > -       return ilookup5(sb, t, ceph_ino_compare, &vino);
> > +       return ilookup5(sb, (unsigned long)vino.ino, ceph_ino_compare, &vino);
> 
> What is the rationale for this bit?  There is a paragraph in the
> changelog that mentions that "low-order bits are the most likely
> to vary anyway", but this way we seem to have three sets of values
> instead of two: the real 64-bit values, the ino_t values and the
> truncated values.
> 

Yep. It's a bit confusing, but we have:

1) the real 64-bit values: these are what we generally want to use
everywhere internally in ceph.ko

2) the presentation value: the values that get reported in getattr and
readdir to userland. This is usually the same as the real 64-bit value
unless you mount with -o ino32.

3) inode->i_ino: we'll no longer use this value for anything in ceph,
but we want to put _something_ in there so things like tracepoints that
display inode->i_ino don't show all 0's.

The second arg to ilookup5 is just the value that the ilookup5/iget5 use
to determine the hash bucket to use. This value won't ever be used for
anything beyond that. Most existing filesystems just use the inode
number for this.

So the rationale is that while we could just use the same value as what
ends up in inode->i_ino, that would mean that arches like s390x would
end up passing down a 32-bit value here when they could pass in the full
64 bits. 32 bit arches might end up just sending the low-order bits, but
this gets run through hash() anyway, so I doubt it will harm anything.

-- 
Jeff Layton <jlayton@kernel.org>


^ permalink raw reply	[flat|nested] 14+ messages in thread

* [PATCH v3] ceph: fix inode number handling on arches with 32-bit ino_t
  2020-08-18 16:23 [PATCH] ceph: fix inode number presentation on 32-bit platforms and s390x Jeff Layton
  2020-08-18 16:36 ` Jeff Layton
  2020-08-18 19:49 ` [PATCH v2] ceph: fix inode number handling on arches with 32-bit ino_t Jeff Layton
@ 2020-08-19 12:35 ` Jeff Layton
  2020-08-19 12:43   ` Jeff Layton
  2020-08-19 15:16 ` [PATCH] " Jeff Layton
  3 siblings, 1 reply; 14+ messages in thread
From: Jeff Layton @ 2020-08-19 12:35 UTC (permalink / raw)
  To: ceph-devel; +Cc: Ulrich.Weigand, Tuan.Hoang1, idryomov

Tuan and Ulrich mentioned that they were hitting a problem on s390x,
which has a 32-bit ino_t value, even though it's a 64-bit arch (for
historical reasons).

I think the current handling of inode numbers in the ceph driver is
wrong. It tries to use 32-bit inode numbers on 32-bit arches, but that's
actually not a problem. 32-bit arches can deal with 64-bit inode numbers
just fine when userland code is compiled with LFS support (the common
case these days).

What we really want to do is just use 64-bit numbers everywhere, unless
someone has mounted with the ino32 mount option. In that case, we want
to ensure that we hash the inode number down to something that will fit
in 32 bits before presenting the value to userland.

Add new helper functions that do this, and only do the conversion before
presenting these values to userland in getattr and readdir.

The inode table hashvalue is changed to just cast the inode number to
unsigned long, as low-order bits are the most likely to vary anyway.

While it's not strictly required, we do want to put something in
inode->i_ino. Instead of basing it on BITS_PER_LONG, however, base it on
the size of the ino_t type.

Reported-by: Tuan Hoang1 <Tuan.Hoang1@ibm.com>
Reported-by: Ulrich Weigand <Ulrich.Weigand@de.ibm.com>
URL: https://tracker.ceph.com/issues/46828
Signed-off-by: Jeff Layton <jlayton@kernel.org>
---
 fs/ceph/caps.c       | 14 +++++-----
 fs/ceph/debugfs.c    |  4 +--
 fs/ceph/dir.c        | 31 +++++++++-------------
 fs/ceph/file.c       |  4 +--
 fs/ceph/inode.c      | 18 +++++++------
 fs/ceph/mds_client.h |  2 +-
 fs/ceph/quota.c      |  4 +--
 fs/ceph/super.h      | 63 +++++++++++++++++++-------------------------
 8 files changed, 64 insertions(+), 76 deletions(-)

v3:
- use ceph_ino instead of ceph_present_ino in most dout() messages

v2:
- fix dir_emit inode number for ".."
- fix ino_t size test

diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index 55ccccf77cea..034b3f4fdd3a 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -887,8 +887,8 @@ int __ceph_caps_issued_mask(struct ceph_inode_info *ci, int mask, int touch)
 	int have = ci->i_snap_caps;
 
 	if ((have & mask) == mask) {
-		dout("__ceph_caps_issued_mask ino 0x%lx snap issued %s"
-		     " (mask %s)\n", ci->vfs_inode.i_ino,
+		dout("__ceph_caps_issued_mask ino 0x%llx snap issued %s"
+		     " (mask %s)\n", ceph_ino(&ci->vfs_inode),
 		     ceph_cap_string(have),
 		     ceph_cap_string(mask));
 		return 1;
@@ -899,8 +899,8 @@ int __ceph_caps_issued_mask(struct ceph_inode_info *ci, int mask, int touch)
 		if (!__cap_is_valid(cap))
 			continue;
 		if ((cap->issued & mask) == mask) {
-			dout("__ceph_caps_issued_mask ino 0x%lx cap %p issued %s"
-			     " (mask %s)\n", ci->vfs_inode.i_ino, cap,
+			dout("__ceph_caps_issued_mask ino 0x%llx cap %p issued %s"
+			     " (mask %s)\n", ceph_ino(&ci->vfs_inode), cap,
 			     ceph_cap_string(cap->issued),
 			     ceph_cap_string(mask));
 			if (touch)
@@ -911,8 +911,8 @@ int __ceph_caps_issued_mask(struct ceph_inode_info *ci, int mask, int touch)
 		/* does a combination of caps satisfy mask? */
 		have |= cap->issued;
 		if ((have & mask) == mask) {
-			dout("__ceph_caps_issued_mask ino 0x%lx combo issued %s"
-			     " (mask %s)\n", ci->vfs_inode.i_ino,
+			dout("__ceph_caps_issued_mask ino 0x%llx combo issued %s"
+			     " (mask %s)\n", ceph_ino(&ci->vfs_inode),
 			     ceph_cap_string(cap->issued),
 			     ceph_cap_string(mask));
 			if (touch) {
@@ -2872,7 +2872,7 @@ int ceph_get_caps(struct file *filp, int need, int want,
 			struct cap_wait cw;
 			DEFINE_WAIT_FUNC(wait, woken_wake_function);
 
-			cw.ino = inode->i_ino;
+			cw.ino = ceph_ino(inode);
 			cw.tgid = current->tgid;
 			cw.need = need;
 			cw.want = want;
diff --git a/fs/ceph/debugfs.c b/fs/ceph/debugfs.c
index 97539b497e4c..3e3fcda9b276 100644
--- a/fs/ceph/debugfs.c
+++ b/fs/ceph/debugfs.c
@@ -202,7 +202,7 @@ static int caps_show_cb(struct inode *inode, struct ceph_cap *cap, void *p)
 {
 	struct seq_file *s = p;
 
-	seq_printf(s, "0x%-17lx%-17s%-17s\n", inode->i_ino,
+	seq_printf(s, "0x%-17llx%-17s%-17s\n", ceph_ino(inode),
 		   ceph_cap_string(cap->issued),
 		   ceph_cap_string(cap->implemented));
 	return 0;
@@ -247,7 +247,7 @@ static int caps_show(struct seq_file *s, void *p)
 
 	spin_lock(&mdsc->caps_list_lock);
 	list_for_each_entry(cw, &mdsc->cap_wait_list, list) {
-		seq_printf(s, "%-13d0x%-17lx%-17s%-17s\n", cw->tgid, cw->ino,
+		seq_printf(s, "%-13d0x%-17llx%-17s%-17s\n", cw->tgid, cw->ino,
 				ceph_cap_string(cw->need),
 				ceph_cap_string(cw->want));
 	}
diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c
index 060bdcc5ce32..040eaad9d063 100644
--- a/fs/ceph/dir.c
+++ b/fs/ceph/dir.c
@@ -259,9 +259,7 @@ static int __dcache_readdir(struct file *file,  struct dir_context *ctx,
 			     dentry, dentry, d_inode(dentry));
 			ctx->pos = di->offset;
 			if (!dir_emit(ctx, dentry->d_name.name,
-				      dentry->d_name.len,
-				      ceph_translate_ino(dentry->d_sb,
-							 d_inode(dentry)->i_ino),
+				      dentry->d_name.len, ceph_present_inode(d_inode(dentry)),
 				      d_inode(dentry)->i_mode >> 12)) {
 				dput(dentry);
 				err = 0;
@@ -324,18 +322,21 @@ static int ceph_readdir(struct file *file, struct dir_context *ctx)
 	/* always start with . and .. */
 	if (ctx->pos == 0) {
 		dout("readdir off 0 -> '.'\n");
-		if (!dir_emit(ctx, ".", 1, 
-			    ceph_translate_ino(inode->i_sb, inode->i_ino),
+		if (!dir_emit(ctx, ".", 1, ceph_present_inode(inode),
 			    inode->i_mode >> 12))
 			return 0;
 		ctx->pos = 1;
 	}
 	if (ctx->pos == 1) {
-		ino_t ino = parent_ino(file->f_path.dentry);
+		u64 ino;
+		struct dentry *dentry = file->f_path.dentry;
+
+		spin_lock(&dentry->d_lock);
+		ino = ceph_present_inode(dentry->d_parent->d_inode);
+		spin_unlock(&dentry->d_lock);
+
 		dout("readdir off 1 -> '..'\n");
-		if (!dir_emit(ctx, "..", 2,
-			    ceph_translate_ino(inode->i_sb, ino),
-			    inode->i_mode >> 12))
+		if (!dir_emit(ctx, "..", 2, ino, inode->i_mode >> 12))
 			return 0;
 		ctx->pos = 2;
 	}
@@ -507,9 +508,6 @@ static int ceph_readdir(struct file *file, struct dir_context *ctx)
 	}
 	for (; i < rinfo->dir_nr; i++) {
 		struct ceph_mds_reply_dir_entry *rde = rinfo->dir_entries + i;
-		struct ceph_vino vino;
-		ino_t ino;
-		u32 ftype;
 
 		BUG_ON(rde->offset < ctx->pos);
 
@@ -519,13 +517,10 @@ static int ceph_readdir(struct file *file, struct dir_context *ctx)
 		     rde->name_len, rde->name, &rde->inode.in);
 
 		BUG_ON(!rde->inode.in);
-		ftype = le32_to_cpu(rde->inode.in->mode) >> 12;
-		vino.ino = le64_to_cpu(rde->inode.in->ino);
-		vino.snap = le64_to_cpu(rde->inode.in->snapid);
-		ino = ceph_vino_to_ino(vino);
 
 		if (!dir_emit(ctx, rde->name, rde->name_len,
-			      ceph_translate_ino(inode->i_sb, ino), ftype)) {
+			      ceph_present_ino(inode->i_sb, le64_to_cpu(rde->inode.in->ino)),
+			      le32_to_cpu(rde->inode.in->mode) >> 12)) {
 			dout("filldir stopping us...\n");
 			return 0;
 		}
@@ -1161,7 +1156,7 @@ static int ceph_unlink(struct inode *dir, struct dentry *dentry)
 
 	if (try_async && op == CEPH_MDS_OP_UNLINK &&
 	    (req->r_dir_caps = get_caps_for_async_unlink(dir, dentry))) {
-		dout("async unlink on %lu/%.*s caps=%s", dir->i_ino,
+		dout("async unlink on %llu/%.*s caps=%s", ceph_ino(dir),
 		     dentry->d_name.len, dentry->d_name.name,
 		     ceph_cap_string(req->r_dir_caps));
 		set_bit(CEPH_MDS_R_ASYNC, &req->r_req_flags);
diff --git a/fs/ceph/file.c b/fs/ceph/file.c
index 28714934ced7..27c7047c9383 100644
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -628,8 +628,8 @@ static int ceph_finish_async_create(struct inode *dir, struct dentry *dentry,
 	} else {
 		struct dentry *dn;
 
-		dout("%s d_adding new inode 0x%llx to 0x%lx/%s\n", __func__,
-			vino.ino, dir->i_ino, dentry->d_name.name);
+		dout("%s d_adding new inode 0x%llx to 0x%llx/%s\n", __func__,
+			vino.ino, ceph_ino(dir), dentry->d_name.name);
 		ceph_dir_clear_ordered(dir);
 		ceph_init_inode_acls(inode, as_ctx);
 		if (inode->i_state & I_NEW) {
diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
index 357c937699d5..772794fdd7cc 100644
--- a/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@ -41,8 +41,10 @@ static void ceph_inode_work(struct work_struct *work);
  */
 static int ceph_set_ino_cb(struct inode *inode, void *data)
 {
-	ceph_inode(inode)->i_vino = *(struct ceph_vino *)data;
-	inode->i_ino = ceph_vino_to_ino(*(struct ceph_vino *)data);
+	struct ceph_inode_info *ci = ceph_inode(inode);
+
+	ci->i_vino = *(struct ceph_vino *)data;
+	inode->i_ino = ceph_vino_to_ino(ci->i_vino);
 	inode_set_iversion_raw(inode, 0);
 	return 0;
 }
@@ -50,17 +52,17 @@ static int ceph_set_ino_cb(struct inode *inode, void *data)
 struct inode *ceph_get_inode(struct super_block *sb, struct ceph_vino vino)
 {
 	struct inode *inode;
-	ino_t t = ceph_vino_to_ino(vino);
 
-	inode = iget5_locked(sb, t, ceph_ino_compare, ceph_set_ino_cb, &vino);
+	inode = iget5_locked(sb, (unsigned long)vino.ino, ceph_ino_compare,
+			     ceph_set_ino_cb, &vino);
 	if (!inode)
 		return ERR_PTR(-ENOMEM);
 	if (inode->i_state & I_NEW)
 		dout("get_inode created new inode %p %llx.%llx ino %llx\n",
-		     inode, ceph_vinop(inode), (u64)inode->i_ino);
+		     inode, ceph_vinop(inode), ceph_present_inode(inode));
 
-	dout("get_inode on %lu=%llx.%llx got %p\n", inode->i_ino, vino.ino,
-	     vino.snap, inode);
+	dout("get_inode on %llu=%llx.%llx got %p\n", ceph_ino(inode),
+	     vino.ino, vino.snap, inode);
 	return inode;
 }
 
@@ -2378,7 +2380,7 @@ int ceph_getattr(const struct path *path, struct kstat *stat,
 	}
 
 	generic_fillattr(inode, stat);
-	stat->ino = ceph_translate_ino(inode->i_sb, inode->i_ino);
+	stat->ino = ceph_present_inode(inode);
 
 	/*
 	 * btime on newly-allocated inodes is 0, so if this is still set to
diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
index bc9e95937d7c..658800605bfb 100644
--- a/fs/ceph/mds_client.h
+++ b/fs/ceph/mds_client.h
@@ -372,7 +372,7 @@ struct ceph_quotarealm_inode {
 
 struct cap_wait {
 	struct list_head	list;
-	unsigned long		ino;
+	u64			ino;
 	pid_t			tgid;
 	int			need;
 	int			want;
diff --git a/fs/ceph/quota.c b/fs/ceph/quota.c
index 198ddde5c1e6..cc2c4d40b022 100644
--- a/fs/ceph/quota.c
+++ b/fs/ceph/quota.c
@@ -23,12 +23,12 @@ static inline bool ceph_has_realms_with_quotas(struct inode *inode)
 {
 	struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc;
 	struct super_block *sb = mdsc->fsc->sb;
+	struct inode *root = d_inode(sb->s_root);
 
 	if (atomic64_read(&mdsc->quotarealms_count) > 0)
 		return true;
 	/* if root is the real CephFS root, we don't have quota realms */
-	if (sb->s_root->d_inode &&
-	    (sb->s_root->d_inode->i_ino == CEPH_INO_ROOT))
+	if (root && ceph_ino(root) == CEPH_INO_ROOT)
 		return false;
 	/* otherwise, we can't know for sure */
 	return true;
diff --git a/fs/ceph/super.h b/fs/ceph/super.h
index 4c3c964b1c54..0dd272a4a410 100644
--- a/fs/ceph/super.h
+++ b/fs/ceph/super.h
@@ -457,15 +457,7 @@ ceph_vino(const struct inode *inode)
 	return ceph_inode(inode)->i_vino;
 }
 
-/*
- * ino_t is <64 bits on many architectures, blech.
- *
- *               i_ino (kernel inode)   st_ino (userspace)
- * i386          32                     32
- * x86_64+ino32  64                     32
- * x86_64        64                     64
- */
-static inline u32 ceph_ino_to_ino32(__u64 vino)
+static inline u32 ceph_ino_to_ino32(u64 vino)
 {
 	u32 ino = vino & 0xffffffff;
 	ino ^= vino >> 32;
@@ -474,36 +466,13 @@ static inline u32 ceph_ino_to_ino32(__u64 vino)
 	return ino;
 }
 
-/*
- * kernel i_ino value
- */
 static inline ino_t ceph_vino_to_ino(struct ceph_vino vino)
 {
-#if BITS_PER_LONG == 32
-	return ceph_ino_to_ino32(vino.ino);
-#else
+	if (sizeof(ino_t) == sizeof(u32))
+		return ceph_ino_to_ino32(vino.ino);
 	return (ino_t)vino.ino;
-#endif
 }
 
-/*
- * user-visible ino (stat, filldir)
- */
-#if BITS_PER_LONG == 32
-static inline ino_t ceph_translate_ino(struct super_block *sb, ino_t ino)
-{
-	return ino;
-}
-#else
-static inline ino_t ceph_translate_ino(struct super_block *sb, ino_t ino)
-{
-	if (ceph_test_mount_opt(ceph_sb_to_client(sb), INO32))
-		ino = ceph_ino_to_ino32(ino);
-	return ino;
-}
-#endif
-
-
 /* for printf-style formatting */
 #define ceph_vinop(i) ceph_inode(i)->i_vino.ino, ceph_inode(i)->i_vino.snap
 
@@ -511,11 +480,34 @@ static inline u64 ceph_ino(struct inode *inode)
 {
 	return ceph_inode(inode)->i_vino.ino;
 }
+
 static inline u64 ceph_snap(struct inode *inode)
 {
 	return ceph_inode(inode)->i_vino.snap;
 }
 
+/**
+ * ceph_present_ino - format an inode number for presentation to userland
+ * @sb: superblock where the inode lives
+ * @ino: inode number to (possibly) convert
+ *
+ * If the user mounted with the ino32 option, then the 64-bit value needs
+ * to be converted to something that can fit inside 32 bits. Note that
+ * internal kernel code never uses this value, so this is entirely for
+ * userland consumption.
+ */
+static inline u64 ceph_present_ino(struct super_block *sb, u64 ino)
+{
+	if (unlikely(ceph_test_mount_opt(ceph_sb_to_client(sb), INO32)))
+		return ceph_ino_to_ino32(ino);
+	return ino;
+}
+
+static inline u64 ceph_present_inode(struct inode *inode)
+{
+	return ceph_present_ino(inode->i_sb, ceph_ino(inode));
+}
+
 static inline int ceph_ino_compare(struct inode *inode, void *data)
 {
 	struct ceph_vino *pvino = (struct ceph_vino *)data;
@@ -527,8 +519,7 @@ static inline int ceph_ino_compare(struct inode *inode, void *data)
 static inline struct inode *ceph_find_inode(struct super_block *sb,
 					    struct ceph_vino vino)
 {
-	ino_t t = ceph_vino_to_ino(vino);
-	return ilookup5(sb, t, ceph_ino_compare, &vino);
+	return ilookup5(sb, (unsigned long)vino.ino, ceph_ino_compare, &vino);
 }
 
 
-- 
2.26.2


^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH v3] ceph: fix inode number handling on arches with 32-bit ino_t
  2020-08-19 12:35 ` [PATCH v3] " Jeff Layton
@ 2020-08-19 12:43   ` Jeff Layton
  0 siblings, 0 replies; 14+ messages in thread
From: Jeff Layton @ 2020-08-19 12:43 UTC (permalink / raw)
  To: ceph-devel; +Cc: Ulrich.Weigand, Tuan.Hoang1, idryomov

On Wed, 2020-08-19 at 08:35 -0400, Jeff Layton wrote:
> Tuan and Ulrich mentioned that they were hitting a problem on s390x,
> which has a 32-bit ino_t value, even though it's a 64-bit arch (for
> historical reasons).
> 
> I think the current handling of inode numbers in the ceph driver is
> wrong. It tries to use 32-bit inode numbers on 32-bit arches, but that's
> actually not a problem. 32-bit arches can deal with 64-bit inode numbers
> just fine when userland code is compiled with LFS support (the common
> case these days).
> 
> What we really want to do is just use 64-bit numbers everywhere, unless
> someone has mounted with the ino32 mount option. In that case, we want
> to ensure that we hash the inode number down to something that will fit
> in 32 bits before presenting the value to userland.
> 
> Add new helper functions that do this, and only do the conversion before
> presenting these values to userland in getattr and readdir.
> 
> The inode table hashvalue is changed to just cast the inode number to
> unsigned long, as low-order bits are the most likely to vary anyway.
> 
> While it's not strictly required, we do want to put something in
> inode->i_ino. Instead of basing it on BITS_PER_LONG, however, base it on
> the size of the ino_t type.
> 
> Reported-by: Tuan Hoang1 <Tuan.Hoang1@ibm.com>
> Reported-by: Ulrich Weigand <Ulrich.Weigand@de.ibm.com>
> URL: https://tracker.ceph.com/issues/46828
> Signed-off-by: Jeff Layton <jlayton@kernel.org>
> ---
>  fs/ceph/caps.c       | 14 +++++-----
>  fs/ceph/debugfs.c    |  4 +--
>  fs/ceph/dir.c        | 31 +++++++++-------------
>  fs/ceph/file.c       |  4 +--
>  fs/ceph/inode.c      | 18 +++++++------
>  fs/ceph/mds_client.h |  2 +-
>  fs/ceph/quota.c      |  4 +--
>  fs/ceph/super.h      | 63 +++++++++++++++++++-------------------------
>  8 files changed, 64 insertions(+), 76 deletions(-)
> 
> v3:
> - use ceph_ino instead of ceph_present_ino in most dout() messages
> 
> v2:
> - fix dir_emit inode number for ".."
> - fix ino_t size test
> 
> diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
> index 55ccccf77cea..034b3f4fdd3a 100644
> --- a/fs/ceph/caps.c
> +++ b/fs/ceph/caps.c
> @@ -887,8 +887,8 @@ int __ceph_caps_issued_mask(struct ceph_inode_info *ci, int mask, int touch)
>  	int have = ci->i_snap_caps;
>  
>  	if ((have & mask) == mask) {
> -		dout("__ceph_caps_issued_mask ino 0x%lx snap issued %s"
> -		     " (mask %s)\n", ci->vfs_inode.i_ino,
> +		dout("__ceph_caps_issued_mask ino 0x%llx snap issued %s"
> +		     " (mask %s)\n", ceph_ino(&ci->vfs_inode),
>  		     ceph_cap_string(have),
>  		     ceph_cap_string(mask));
>  		return 1;
> @@ -899,8 +899,8 @@ int __ceph_caps_issued_mask(struct ceph_inode_info *ci, int mask, int touch)
>  		if (!__cap_is_valid(cap))
>  			continue;
>  		if ((cap->issued & mask) == mask) {
> -			dout("__ceph_caps_issued_mask ino 0x%lx cap %p issued %s"
> -			     " (mask %s)\n", ci->vfs_inode.i_ino, cap,
> +			dout("__ceph_caps_issued_mask ino 0x%llx cap %p issued %s"
> +			     " (mask %s)\n", ceph_ino(&ci->vfs_inode), cap,
>  			     ceph_cap_string(cap->issued),
>  			     ceph_cap_string(mask));
>  			if (touch)
> @@ -911,8 +911,8 @@ int __ceph_caps_issued_mask(struct ceph_inode_info *ci, int mask, int touch)
>  		/* does a combination of caps satisfy mask? */
>  		have |= cap->issued;
>  		if ((have & mask) == mask) {
> -			dout("__ceph_caps_issued_mask ino 0x%lx combo issued %s"
> -			     " (mask %s)\n", ci->vfs_inode.i_ino,
> +			dout("__ceph_caps_issued_mask ino 0x%llx combo issued %s"
> +			     " (mask %s)\n", ceph_ino(&ci->vfs_inode),
>  			     ceph_cap_string(cap->issued),
>  			     ceph_cap_string(mask));
>  			if (touch) {
> @@ -2872,7 +2872,7 @@ int ceph_get_caps(struct file *filp, int need, int want,
>  			struct cap_wait cw;
>  			DEFINE_WAIT_FUNC(wait, woken_wake_function);
>  
> -			cw.ino = inode->i_ino;
> +			cw.ino = ceph_ino(inode);
>  			cw.tgid = current->tgid;
>  			cw.need = need;
>  			cw.want = want;
> diff --git a/fs/ceph/debugfs.c b/fs/ceph/debugfs.c
> index 97539b497e4c..3e3fcda9b276 100644
> --- a/fs/ceph/debugfs.c
> +++ b/fs/ceph/debugfs.c
> @@ -202,7 +202,7 @@ static int caps_show_cb(struct inode *inode, struct ceph_cap *cap, void *p)
>  {
>  	struct seq_file *s = p;
>  
> -	seq_printf(s, "0x%-17lx%-17s%-17s\n", inode->i_ino,
> +	seq_printf(s, "0x%-17llx%-17s%-17s\n", ceph_ino(inode),
>  		   ceph_cap_string(cap->issued),
>  		   ceph_cap_string(cap->implemented));
>  	return 0;
> @@ -247,7 +247,7 @@ static int caps_show(struct seq_file *s, void *p)
>  
>  	spin_lock(&mdsc->caps_list_lock);
>  	list_for_each_entry(cw, &mdsc->cap_wait_list, list) {
> -		seq_printf(s, "%-13d0x%-17lx%-17s%-17s\n", cw->tgid, cw->ino,
> +		seq_printf(s, "%-13d0x%-17llx%-17s%-17s\n", cw->tgid, cw->ino,
>  				ceph_cap_string(cw->need),
>  				ceph_cap_string(cw->want));
>  	}
> diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c
> index 060bdcc5ce32..040eaad9d063 100644
> --- a/fs/ceph/dir.c
> +++ b/fs/ceph/dir.c
> @@ -259,9 +259,7 @@ static int __dcache_readdir(struct file *file,  struct dir_context *ctx,
>  			     dentry, dentry, d_inode(dentry));
>  			ctx->pos = di->offset;
>  			if (!dir_emit(ctx, dentry->d_name.name,
> -				      dentry->d_name.len,
> -				      ceph_translate_ino(dentry->d_sb,
> -							 d_inode(dentry)->i_ino),
> +				      dentry->d_name.len, ceph_present_inode(d_inode(dentry)),
>  				      d_inode(dentry)->i_mode >> 12)) {
>  				dput(dentry);
>  				err = 0;
> @@ -324,18 +322,21 @@ static int ceph_readdir(struct file *file, struct dir_context *ctx)
>  	/* always start with . and .. */
>  	if (ctx->pos == 0) {
>  		dout("readdir off 0 -> '.'\n");
> -		if (!dir_emit(ctx, ".", 1, 
> -			    ceph_translate_ino(inode->i_sb, inode->i_ino),
> +		if (!dir_emit(ctx, ".", 1, ceph_present_inode(inode),
>  			    inode->i_mode >> 12))
>  			return 0;
>  		ctx->pos = 1;
>  	}
>  	if (ctx->pos == 1) {
> -		ino_t ino = parent_ino(file->f_path.dentry);
> +		u64 ino;
> +		struct dentry *dentry = file->f_path.dentry;
> +
> +		spin_lock(&dentry->d_lock);
> +		ino = ceph_present_inode(dentry->d_parent->d_inode);
> +		spin_unlock(&dentry->d_lock);
> +
>  		dout("readdir off 1 -> '..'\n");
> -		if (!dir_emit(ctx, "..", 2,
> -			    ceph_translate_ino(inode->i_sb, ino),
> -			    inode->i_mode >> 12))
> +		if (!dir_emit(ctx, "..", 2, ino, inode->i_mode >> 12))
>  			return 0;
>  		ctx->pos = 2;
>  	}
> @@ -507,9 +508,6 @@ static int ceph_readdir(struct file *file, struct dir_context *ctx)
>  	}
>  	for (; i < rinfo->dir_nr; i++) {
>  		struct ceph_mds_reply_dir_entry *rde = rinfo->dir_entries + i;
> -		struct ceph_vino vino;
> -		ino_t ino;
> -		u32 ftype;
>  
>  		BUG_ON(rde->offset < ctx->pos);
>  
> @@ -519,13 +517,10 @@ static int ceph_readdir(struct file *file, struct dir_context *ctx)
>  		     rde->name_len, rde->name, &rde->inode.in);
>  
>  		BUG_ON(!rde->inode.in);
> -		ftype = le32_to_cpu(rde->inode.in->mode) >> 12;
> -		vino.ino = le64_to_cpu(rde->inode.in->ino);
> -		vino.snap = le64_to_cpu(rde->inode.in->snapid);
> -		ino = ceph_vino_to_ino(vino);
>  
>  		if (!dir_emit(ctx, rde->name, rde->name_len,
> -			      ceph_translate_ino(inode->i_sb, ino), ftype)) {
> +			      ceph_present_ino(inode->i_sb, le64_to_cpu(rde->inode.in->ino)),
> +			      le32_to_cpu(rde->inode.in->mode) >> 12)) {
>  			dout("filldir stopping us...\n");
>  			return 0;
>  		}
> @@ -1161,7 +1156,7 @@ static int ceph_unlink(struct inode *dir, struct dentry *dentry)
>  
>  	if (try_async && op == CEPH_MDS_OP_UNLINK &&
>  	    (req->r_dir_caps = get_caps_for_async_unlink(dir, dentry))) {
> -		dout("async unlink on %lu/%.*s caps=%s", dir->i_ino,
> +		dout("async unlink on %llu/%.*s caps=%s", ceph_ino(dir),
>  		     dentry->d_name.len, dentry->d_name.name,
>  		     ceph_cap_string(req->r_dir_caps));
>  		set_bit(CEPH_MDS_R_ASYNC, &req->r_req_flags);
> diff --git a/fs/ceph/file.c b/fs/ceph/file.c
> index 28714934ced7..27c7047c9383 100644
> --- a/fs/ceph/file.c
> +++ b/fs/ceph/file.c
> @@ -628,8 +628,8 @@ static int ceph_finish_async_create(struct inode *dir, struct dentry *dentry,
>  	} else {
>  		struct dentry *dn;
>  
> -		dout("%s d_adding new inode 0x%llx to 0x%lx/%s\n", __func__,
> -			vino.ino, dir->i_ino, dentry->d_name.name);
> +		dout("%s d_adding new inode 0x%llx to 0x%llx/%s\n", __func__,
> +			vino.ino, ceph_ino(dir), dentry->d_name.name);
>  		ceph_dir_clear_ordered(dir);
>  		ceph_init_inode_acls(inode, as_ctx);
>  		if (inode->i_state & I_NEW) {
> diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
> index 357c937699d5..772794fdd7cc 100644
> --- a/fs/ceph/inode.c
> +++ b/fs/ceph/inode.c
> @@ -41,8 +41,10 @@ static void ceph_inode_work(struct work_struct *work);
>   */
>  static int ceph_set_ino_cb(struct inode *inode, void *data)
>  {
> -	ceph_inode(inode)->i_vino = *(struct ceph_vino *)data;
> -	inode->i_ino = ceph_vino_to_ino(*(struct ceph_vino *)data);
> +	struct ceph_inode_info *ci = ceph_inode(inode);
> +
> +	ci->i_vino = *(struct ceph_vino *)data;
> +	inode->i_ino = ceph_vino_to_ino(ci->i_vino);
>  	inode_set_iversion_raw(inode, 0);
>  	return 0;
>  }
> @@ -50,17 +52,17 @@ static int ceph_set_ino_cb(struct inode *inode, void *data)
>  struct inode *ceph_get_inode(struct super_block *sb, struct ceph_vino vino)
>  {
>  	struct inode *inode;
> -	ino_t t = ceph_vino_to_ino(vino);
>  
> -	inode = iget5_locked(sb, t, ceph_ino_compare, ceph_set_ino_cb, &vino);
> +	inode = iget5_locked(sb, (unsigned long)vino.ino, ceph_ino_compare,
> +			     ceph_set_ino_cb, &vino);
>  	if (!inode)
>  		return ERR_PTR(-ENOMEM);
>  	if (inode->i_state & I_NEW)
>  		dout("get_inode created new inode %p %llx.%llx ino %llx\n",
> -		     inode, ceph_vinop(inode), (u64)inode->i_ino);
> +		     inode, ceph_vinop(inode), ceph_present_inode(inode));
>  
> -	dout("get_inode on %lu=%llx.%llx got %p\n", inode->i_ino, vino.ino,
> -	     vino.snap, inode);
> +	dout("get_inode on %llu=%llx.%llx got %p\n", ceph_ino(inode),
> +	     vino.ino, vino.snap, inode);
>  	return inode;

The two douts above are almost identical, so I think I'm going to merge
them so that they look like this:

        dout("get_inode on %llu=%llx.%llx got %p new %d\n", ceph_present_inode(inode),
             ceph_vinop(inode), inode, !!(inode->i_state & I_NEW));                               

I won't bother resending for that though unless there are other things
that need to be addressed in here.

>  }
>  
> @@ -2378,7 +2380,7 @@ int ceph_getattr(const struct path *path, struct kstat *stat,
>  	}
>  
>  	generic_fillattr(inode, stat);
> -	stat->ino = ceph_translate_ino(inode->i_sb, inode->i_ino);
> +	stat->ino = ceph_present_inode(inode);
>  
>  	/*
>  	 * btime on newly-allocated inodes is 0, so if this is still set to
> diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
> index bc9e95937d7c..658800605bfb 100644
> --- a/fs/ceph/mds_client.h
> +++ b/fs/ceph/mds_client.h
> @@ -372,7 +372,7 @@ struct ceph_quotarealm_inode {
>  
>  struct cap_wait {
>  	struct list_head	list;
> -	unsigned long		ino;
> +	u64			ino;
>  	pid_t			tgid;
>  	int			need;
>  	int			want;
> diff --git a/fs/ceph/quota.c b/fs/ceph/quota.c
> index 198ddde5c1e6..cc2c4d40b022 100644
> --- a/fs/ceph/quota.c
> +++ b/fs/ceph/quota.c
> @@ -23,12 +23,12 @@ static inline bool ceph_has_realms_with_quotas(struct inode *inode)
>  {
>  	struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc;
>  	struct super_block *sb = mdsc->fsc->sb;
> +	struct inode *root = d_inode(sb->s_root);
>  
>  	if (atomic64_read(&mdsc->quotarealms_count) > 0)
>  		return true;
>  	/* if root is the real CephFS root, we don't have quota realms */
> -	if (sb->s_root->d_inode &&
> -	    (sb->s_root->d_inode->i_ino == CEPH_INO_ROOT))
> +	if (root && ceph_ino(root) == CEPH_INO_ROOT)
>  		return false;
>  	/* otherwise, we can't know for sure */
>  	return true;
> diff --git a/fs/ceph/super.h b/fs/ceph/super.h
> index 4c3c964b1c54..0dd272a4a410 100644
> --- a/fs/ceph/super.h
> +++ b/fs/ceph/super.h
> @@ -457,15 +457,7 @@ ceph_vino(const struct inode *inode)
>  	return ceph_inode(inode)->i_vino;
>  }
>  
> -/*
> - * ino_t is <64 bits on many architectures, blech.
> - *
> - *               i_ino (kernel inode)   st_ino (userspace)
> - * i386          32                     32
> - * x86_64+ino32  64                     32
> - * x86_64        64                     64
> - */
> -static inline u32 ceph_ino_to_ino32(__u64 vino)
> +static inline u32 ceph_ino_to_ino32(u64 vino)
>  {
>  	u32 ino = vino & 0xffffffff;
>  	ino ^= vino >> 32;
> @@ -474,36 +466,13 @@ static inline u32 ceph_ino_to_ino32(__u64 vino)
>  	return ino;
>  }
>  
> -/*
> - * kernel i_ino value
> - */
>  static inline ino_t ceph_vino_to_ino(struct ceph_vino vino)
>  {
> -#if BITS_PER_LONG == 32
> -	return ceph_ino_to_ino32(vino.ino);
> -#else
> +	if (sizeof(ino_t) == sizeof(u32))
> +		return ceph_ino_to_ino32(vino.ino);
>  	return (ino_t)vino.ino;
> -#endif
>  }
>  
> -/*
> - * user-visible ino (stat, filldir)
> - */
> -#if BITS_PER_LONG == 32
> -static inline ino_t ceph_translate_ino(struct super_block *sb, ino_t ino)
> -{
> -	return ino;
> -}
> -#else
> -static inline ino_t ceph_translate_ino(struct super_block *sb, ino_t ino)
> -{
> -	if (ceph_test_mount_opt(ceph_sb_to_client(sb), INO32))
> -		ino = ceph_ino_to_ino32(ino);
> -	return ino;
> -}
> -#endif
> -
> -
>  /* for printf-style formatting */
>  #define ceph_vinop(i) ceph_inode(i)->i_vino.ino, ceph_inode(i)->i_vino.snap
>  
> @@ -511,11 +480,34 @@ static inline u64 ceph_ino(struct inode *inode)
>  {
>  	return ceph_inode(inode)->i_vino.ino;
>  }
> +
>  static inline u64 ceph_snap(struct inode *inode)
>  {
>  	return ceph_inode(inode)->i_vino.snap;
>  }
>  
> +/**
> + * ceph_present_ino - format an inode number for presentation to userland
> + * @sb: superblock where the inode lives
> + * @ino: inode number to (possibly) convert
> + *
> + * If the user mounted with the ino32 option, then the 64-bit value needs
> + * to be converted to something that can fit inside 32 bits. Note that
> + * internal kernel code never uses this value, so this is entirely for
> + * userland consumption.
> + */
> +static inline u64 ceph_present_ino(struct super_block *sb, u64 ino)
> +{
> +	if (unlikely(ceph_test_mount_opt(ceph_sb_to_client(sb), INO32)))
> +		return ceph_ino_to_ino32(ino);
> +	return ino;
> +}
> +
> +static inline u64 ceph_present_inode(struct inode *inode)
> +{
> +	return ceph_present_ino(inode->i_sb, ceph_ino(inode));
> +}
> +
>  static inline int ceph_ino_compare(struct inode *inode, void *data)
>  {
>  	struct ceph_vino *pvino = (struct ceph_vino *)data;
> @@ -527,8 +519,7 @@ static inline int ceph_ino_compare(struct inode *inode, void *data)
>  static inline struct inode *ceph_find_inode(struct super_block *sb,
>  					    struct ceph_vino vino)
>  {
> -	ino_t t = ceph_vino_to_ino(vino);
> -	return ilookup5(sb, t, ceph_ino_compare, &vino);
> +	return ilookup5(sb, (unsigned long)vino.ino, ceph_ino_compare, &vino);
>  }
>  
>  

-- 
Jeff Layton <jlayton@kernel.org>


^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH v2] ceph: fix inode number handling on arches with 32-bit ino_t
  2020-08-19 11:25     ` Jeff Layton
@ 2020-08-19 13:28       ` Ilya Dryomov
  2020-08-19 13:57         ` Jeff Layton
  0 siblings, 1 reply; 14+ messages in thread
From: Ilya Dryomov @ 2020-08-19 13:28 UTC (permalink / raw)
  To: Jeff Layton; +Cc: Ceph Development, Ulrich.Weigand, Tuan.Hoang1

On Wed, Aug 19, 2020 at 1:25 PM Jeff Layton <jlayton@kernel.org> wrote:
>
> On Wed, 2020-08-19 at 09:17 +0200, Ilya Dryomov wrote:
> > On Tue, Aug 18, 2020 at 9:49 PM Jeff Layton <jlayton@kernel.org> wrote:
> > > Tuan and Ulrich mentioned that they were hitting a problem on s390x,
> > > which has a 32-bit ino_t value, even though it's a 64-bit arch (for
> > > historical reasons).
> > >
> > > I think the current handling of inode numbers in the ceph driver is
> > > wrong. It tries to use 32-bit inode numbers on 32-bit arches, but that's
> > > actually not a problem. 32-bit arches can deal with 64-bit inode numbers
> > > just fine when userland code is compiled with LFS support (the common
> > > case these days).
> > >
> > > What we really want to do is just use 64-bit numbers everywhere, unless
> > > someone has mounted with the ino32 mount option. In that case, we want
> > > to ensure that we hash the inode number down to something that will fit
> > > in 32 bits before presenting the value to userland.
> > >
> > > Add new helper functions that do this, and only do the conversion before
> > > presenting these values to userland in getattr, readdir, debugfs, and
> > > (some) debug log messages.
> > >
> > > The inode table hashvalue is changed to just cast the inode number to
> > > unsigned long, as low-order bits are the most likely to vary anyway.
> > >
> > > While it's not strictly required, we do want to put something in
> > > inode->i_ino. Instead of basing it on BITS_PER_LONG, however, base it on
> > > the size of the ino_t type.
> > >
> > > Reported-by: Tuan Hoang1 <Tuan.Hoang1@ibm.com>
> > > Reported-by: Ulrich Weigand <Ulrich.Weigand@de.ibm.com>
> > > URL: https://tracker.ceph.com/issues/46828
> > > Signed-off-by: Jeff Layton <jlayton@kernel.org>
> > > ---
> > >  fs/ceph/caps.c       | 14 +++++-----
> > >  fs/ceph/debugfs.c    |  4 +--
> > >  fs/ceph/dir.c        | 31 +++++++++-------------
> > >  fs/ceph/file.c       |  4 +--
> > >  fs/ceph/inode.c      | 18 +++++++------
> > >  fs/ceph/mds_client.h |  2 +-
> > >  fs/ceph/quota.c      |  4 +--
> > >  fs/ceph/super.h      | 63 +++++++++++++++++++-------------------------
> > >  8 files changed, 64 insertions(+), 76 deletions(-)
> > >
> > > v2:
> > > - fix dir_emit inode number for ".."
> > > - fix ino_t size test
> > >
> > > diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
> > > index 55ccccf77cea..d5b1f781f398 100644
> > > --- a/fs/ceph/caps.c
> > > +++ b/fs/ceph/caps.c
> > > @@ -887,8 +887,8 @@ int __ceph_caps_issued_mask(struct ceph_inode_info *ci, int mask, int touch)
> > >         int have = ci->i_snap_caps;
> > >
> > >         if ((have & mask) == mask) {
> > > -               dout("__ceph_caps_issued_mask ino 0x%lx snap issued %s"
> > > -                    " (mask %s)\n", ci->vfs_inode.i_ino,
> > > +               dout("__ceph_caps_issued_mask ino 0x%llx snap issued %s"
> > > +                    " (mask %s)\n", ceph_present_inode(&ci->vfs_inode),
> >
> > Hi Jeff,
> >
> > I agree with Zheng.  douts should output the real 64-bit values.
> > If the presentation value is of interest in some places, they should
> > output it alongside the real value.
> >
>
> Ok.
>
> FWIW, the rationale for using ceph_present_inode in these is that the
> values would better match what stat() reports in st_ino. The debug
> messages are really inconsistent here though, so I'm fine with moving
> them to always print the 64-bit values. My guess is that hardly anyone
> uses -o ino32 in this day and age anyway...
>
>
> > >                      ceph_cap_string(have),
> > >                      ceph_cap_string(mask));
> > >                 return 1;
> > > @@ -899,8 +899,8 @@ int __ceph_caps_issued_mask(struct ceph_inode_info *ci, int mask, int touch)
> > >                 if (!__cap_is_valid(cap))
> > >                         continue;
> > >                 if ((cap->issued & mask) == mask) {
> > > -                       dout("__ceph_caps_issued_mask ino 0x%lx cap %p issued %s"
> > > -                            " (mask %s)\n", ci->vfs_inode.i_ino, cap,
> > > +                       dout("__ceph_caps_issued_mask ino 0x%llx cap %p issued %s"
> > > +                            " (mask %s)\n", ceph_present_inode(&ci->vfs_inode), cap,
> > >                              ceph_cap_string(cap->issued),
> > >                              ceph_cap_string(mask));
> > >                         if (touch)
> > > @@ -911,8 +911,8 @@ int __ceph_caps_issued_mask(struct ceph_inode_info *ci, int mask, int touch)
> > >                 /* does a combination of caps satisfy mask? */
> > >                 have |= cap->issued;
> > >                 if ((have & mask) == mask) {
> > > -                       dout("__ceph_caps_issued_mask ino 0x%lx combo issued %s"
> > > -                            " (mask %s)\n", ci->vfs_inode.i_ino,
> > > +                       dout("__ceph_caps_issued_mask ino 0x%llx combo issued %s"
> > > +                            " (mask %s)\n", ceph_present_inode(&ci->vfs_inode),
> > >                              ceph_cap_string(cap->issued),
> > >                              ceph_cap_string(mask));
> > >                         if (touch) {
> > > @@ -2872,7 +2872,7 @@ int ceph_get_caps(struct file *filp, int need, int want,
> > >                         struct cap_wait cw;
> > >                         DEFINE_WAIT_FUNC(wait, woken_wake_function);
> > >
> > > -                       cw.ino = inode->i_ino;
> > > +                       cw.ino = ceph_present_inode(inode);
> > >                         cw.tgid = current->tgid;
> > >                         cw.need = need;
> > >                         cw.want = want;
> > > diff --git a/fs/ceph/debugfs.c b/fs/ceph/debugfs.c
> > > index 20043382d825..5eeca7987717 100644
> > > --- a/fs/ceph/debugfs.c
> > > +++ b/fs/ceph/debugfs.c
> > > @@ -211,7 +211,7 @@ static int caps_show_cb(struct inode *inode, struct ceph_cap *cap, void *p)
> > >  {
> > >         struct seq_file *s = p;
> > >
> > > -       seq_printf(s, "0x%-17lx%-17s%-17s\n", inode->i_ino,
> > > +       seq_printf(s, "0x%-17llx%-17s%-17s\n", ceph_present_inode(inode),
> > >                    ceph_cap_string(cap->issued),
> > >                    ceph_cap_string(cap->implemented));
> > >         return 0;
> > > @@ -256,7 +256,7 @@ static int caps_show(struct seq_file *s, void *p)
> > >
> > >         spin_lock(&mdsc->caps_list_lock);
> > >         list_for_each_entry(cw, &mdsc->cap_wait_list, list) {
> > > -               seq_printf(s, "%-13d0x%-17lx%-17s%-17s\n", cw->tgid, cw->ino,
> > > +               seq_printf(s, "%-13d0x%-17llx%-17s%-17s\n", cw->tgid, cw->ino,
> > >                                 ceph_cap_string(cw->need),
> > >                                 ceph_cap_string(cw->want));
> > >         }
> > > diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c
> > > index 060bdcc5ce32..1a61a435a1cd 100644
> > > --- a/fs/ceph/dir.c
> > > +++ b/fs/ceph/dir.c
> > > @@ -259,9 +259,7 @@ static int __dcache_readdir(struct file *file,  struct dir_context *ctx,
> > >                              dentry, dentry, d_inode(dentry));
> > >                         ctx->pos = di->offset;
> > >                         if (!dir_emit(ctx, dentry->d_name.name,
> > > -                                     dentry->d_name.len,
> > > -                                     ceph_translate_ino(dentry->d_sb,
> > > -                                                        d_inode(dentry)->i_ino),
> > > +                                     dentry->d_name.len, ceph_present_inode(d_inode(dentry)),
> > >                                       d_inode(dentry)->i_mode >> 12)) {
> > >                                 dput(dentry);
> > >                                 err = 0;
> > > @@ -324,18 +322,21 @@ static int ceph_readdir(struct file *file, struct dir_context *ctx)
> > >         /* always start with . and .. */
> > >         if (ctx->pos == 0) {
> > >                 dout("readdir off 0 -> '.'\n");
> > > -               if (!dir_emit(ctx, ".", 1,
> > > -                           ceph_translate_ino(inode->i_sb, inode->i_ino),
> > > +               if (!dir_emit(ctx, ".", 1, ceph_present_inode(inode),
> > >                             inode->i_mode >> 12))
> > >                         return 0;
> > >                 ctx->pos = 1;
> > >         }
> > >         if (ctx->pos == 1) {
> > > -               ino_t ino = parent_ino(file->f_path.dentry);
> > > +               u64 ino;
> > > +               struct dentry *dentry = file->f_path.dentry;
> > > +
> > > +               spin_lock(&dentry->d_lock);
> > > +               ino = ceph_present_inode(dentry->d_parent->d_inode);
> > > +               spin_unlock(&dentry->d_lock);
> > > +
> > >                 dout("readdir off 1 -> '..'\n");
> > > -               if (!dir_emit(ctx, "..", 2,
> > > -                           ceph_translate_ino(inode->i_sb, ino),
> > > -                           inode->i_mode >> 12))
> > > +               if (!dir_emit(ctx, "..", 2, ino, inode->i_mode >> 12))
> > >                         return 0;
> > >                 ctx->pos = 2;
> > >         }
> > > @@ -507,9 +508,6 @@ static int ceph_readdir(struct file *file, struct dir_context *ctx)
> > >         }
> > >         for (; i < rinfo->dir_nr; i++) {
> > >                 struct ceph_mds_reply_dir_entry *rde = rinfo->dir_entries + i;
> > > -               struct ceph_vino vino;
> > > -               ino_t ino;
> > > -               u32 ftype;
> > >
> > >                 BUG_ON(rde->offset < ctx->pos);
> > >
> > > @@ -519,13 +517,10 @@ static int ceph_readdir(struct file *file, struct dir_context *ctx)
> > >                      rde->name_len, rde->name, &rde->inode.in);
> > >
> > >                 BUG_ON(!rde->inode.in);
> > > -               ftype = le32_to_cpu(rde->inode.in->mode) >> 12;
> > > -               vino.ino = le64_to_cpu(rde->inode.in->ino);
> > > -               vino.snap = le64_to_cpu(rde->inode.in->snapid);
> > > -               ino = ceph_vino_to_ino(vino);
> > >
> > >                 if (!dir_emit(ctx, rde->name, rde->name_len,
> > > -                             ceph_translate_ino(inode->i_sb, ino), ftype)) {
> > > +                             ceph_present_ino(inode->i_sb, le64_to_cpu(rde->inode.in->ino)),
> > > +                             le32_to_cpu(rde->inode.in->mode) >> 12)) {
> > >                         dout("filldir stopping us...\n");
> > >                         return 0;
> > >                 }
> > > @@ -1161,7 +1156,7 @@ static int ceph_unlink(struct inode *dir, struct dentry *dentry)
> > >
> > >         if (try_async && op == CEPH_MDS_OP_UNLINK &&
> > >             (req->r_dir_caps = get_caps_for_async_unlink(dir, dentry))) {
> > > -               dout("async unlink on %lu/%.*s caps=%s", dir->i_ino,
> > > +               dout("async unlink on %llu/%.*s caps=%s", ceph_present_inode(dir),
> > >                      dentry->d_name.len, dentry->d_name.name,
> > >                      ceph_cap_string(req->r_dir_caps));
> > >                 set_bit(CEPH_MDS_R_ASYNC, &req->r_req_flags);
> > > diff --git a/fs/ceph/file.c b/fs/ceph/file.c
> > > index 5fa28a620932..20e0c6f1d8f2 100644
> > > --- a/fs/ceph/file.c
> > > +++ b/fs/ceph/file.c
> > > @@ -642,8 +642,8 @@ static int ceph_finish_async_create(struct inode *dir, struct dentry *dentry,
> > >         } else {
> > >                 struct dentry *dn;
> > >
> > > -               dout("%s d_adding new inode 0x%llx to 0x%lx/%s\n", __func__,
> > > -                       vino.ino, dir->i_ino, dentry->d_name.name);
> > > +               dout("%s d_adding new inode 0x%llx to 0x%llx/%s\n", __func__,
> > > +                       vino.ino, ceph_ino(dir), dentry->d_name.name);
> > >                 ceph_dir_clear_ordered(dir);
> > >                 ceph_init_inode_acls(inode, as_ctx);
> > >                 if (inode->i_state & I_NEW) {
> > > diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
> > > index 357c937699d5..8dbb56d2edef 100644
> > > --- a/fs/ceph/inode.c
> > > +++ b/fs/ceph/inode.c
> > > @@ -41,8 +41,10 @@ static void ceph_inode_work(struct work_struct *work);
> > >   */
> > >  static int ceph_set_ino_cb(struct inode *inode, void *data)
> > >  {
> > > -       ceph_inode(inode)->i_vino = *(struct ceph_vino *)data;
> > > -       inode->i_ino = ceph_vino_to_ino(*(struct ceph_vino *)data);
> > > +       struct ceph_inode_info *ci = ceph_inode(inode);
> > > +
> > > +       ci->i_vino = *(struct ceph_vino *)data;
> > > +       inode->i_ino = ceph_vino_to_ino(ci->i_vino);
> > >         inode_set_iversion_raw(inode, 0);
> > >         return 0;
> > >  }
> > > @@ -50,17 +52,17 @@ static int ceph_set_ino_cb(struct inode *inode, void *data)
> > >  struct inode *ceph_get_inode(struct super_block *sb, struct ceph_vino vino)
> > >  {
> > >         struct inode *inode;
> > > -       ino_t t = ceph_vino_to_ino(vino);
> > >
> > > -       inode = iget5_locked(sb, t, ceph_ino_compare, ceph_set_ino_cb, &vino);
> > > +       inode = iget5_locked(sb, (unsigned long)vino.ino, ceph_ino_compare,
> > > +                            ceph_set_ino_cb, &vino);
> > >         if (!inode)
> > >                 return ERR_PTR(-ENOMEM);
> > >         if (inode->i_state & I_NEW)
> > >                 dout("get_inode created new inode %p %llx.%llx ino %llx\n",
> > > -                    inode, ceph_vinop(inode), (u64)inode->i_ino);
> > > +                    inode, ceph_vinop(inode), ceph_present_inode(inode));
> > >
> > > -       dout("get_inode on %lu=%llx.%llx got %p\n", inode->i_ino, vino.ino,
> > > -            vino.snap, inode);
> > > +       dout("get_inode on %llu=%llx.%llx got %p\n", ceph_present_inode(inode),
> > > +            vino.ino, vino.snap, inode);
> > >         return inode;
> > >  }
> > >
> > > @@ -2378,7 +2380,7 @@ int ceph_getattr(const struct path *path, struct kstat *stat,
> > >         }
> > >
> > >         generic_fillattr(inode, stat);
> > > -       stat->ino = ceph_translate_ino(inode->i_sb, inode->i_ino);
> > > +       stat->ino = ceph_present_inode(inode);
> > >
> > >         /*
> > >          * btime on newly-allocated inodes is 0, so if this is still set to
> > > diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
> > > index bc9e95937d7c..658800605bfb 100644
> > > --- a/fs/ceph/mds_client.h
> > > +++ b/fs/ceph/mds_client.h
> > > @@ -372,7 +372,7 @@ struct ceph_quotarealm_inode {
> > >
> > >  struct cap_wait {
> > >         struct list_head        list;
> > > -       unsigned long           ino;
> > > +       u64                     ino;
> > >         pid_t                   tgid;
> > >         int                     need;
> > >         int                     want;
> > > diff --git a/fs/ceph/quota.c b/fs/ceph/quota.c
> > > index 198ddde5c1e6..cc2c4d40b022 100644
> > > --- a/fs/ceph/quota.c
> > > +++ b/fs/ceph/quota.c
> > > @@ -23,12 +23,12 @@ static inline bool ceph_has_realms_with_quotas(struct inode *inode)
> > >  {
> > >         struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc;
> > >         struct super_block *sb = mdsc->fsc->sb;
> > > +       struct inode *root = d_inode(sb->s_root);
> > >
> > >         if (atomic64_read(&mdsc->quotarealms_count) > 0)
> > >                 return true;
> > >         /* if root is the real CephFS root, we don't have quota realms */
> > > -       if (sb->s_root->d_inode &&
> > > -           (sb->s_root->d_inode->i_ino == CEPH_INO_ROOT))
> > > +       if (root && ceph_ino(root) == CEPH_INO_ROOT)
> > >                 return false;
> > >         /* otherwise, we can't know for sure */
> > >         return true;
> > > diff --git a/fs/ceph/super.h b/fs/ceph/super.h
> > > index 4c3c964b1c54..0dd272a4a410 100644
> > > --- a/fs/ceph/super.h
> > > +++ b/fs/ceph/super.h
> > > @@ -457,15 +457,7 @@ ceph_vino(const struct inode *inode)
> > >         return ceph_inode(inode)->i_vino;
> > >  }
> > >
> > > -/*
> > > - * ino_t is <64 bits on many architectures, blech.
> > > - *
> > > - *               i_ino (kernel inode)   st_ino (userspace)
> > > - * i386          32                     32
> > > - * x86_64+ino32  64                     32
> > > - * x86_64        64                     64
> > > - */
> > > -static inline u32 ceph_ino_to_ino32(__u64 vino)
> > > +static inline u32 ceph_ino_to_ino32(u64 vino)
> > >  {
> > >         u32 ino = vino & 0xffffffff;
> > >         ino ^= vino >> 32;
> > > @@ -474,36 +466,13 @@ static inline u32 ceph_ino_to_ino32(__u64 vino)
> > >         return ino;
> > >  }
> > >
> > > -/*
> > > - * kernel i_ino value
> > > - */
> > >  static inline ino_t ceph_vino_to_ino(struct ceph_vino vino)
> > >  {
> > > -#if BITS_PER_LONG == 32
> > > -       return ceph_ino_to_ino32(vino.ino);
> > > -#else
> > > +       if (sizeof(ino_t) == sizeof(u32))
> > > +               return ceph_ino_to_ino32(vino.ino);
> > >         return (ino_t)vino.ino;
> > > -#endif
> > >  }
> > >
> > > -/*
> > > - * user-visible ino (stat, filldir)
> > > - */
> > > -#if BITS_PER_LONG == 32
> > > -static inline ino_t ceph_translate_ino(struct super_block *sb, ino_t ino)
> > > -{
> > > -       return ino;
> > > -}
> > > -#else
> > > -static inline ino_t ceph_translate_ino(struct super_block *sb, ino_t ino)
> > > -{
> > > -       if (ceph_test_mount_opt(ceph_sb_to_client(sb), INO32))
> > > -               ino = ceph_ino_to_ino32(ino);
> > > -       return ino;
> > > -}
> > > -#endif
> > > -
> > > -
> > >  /* for printf-style formatting */
> > >  #define ceph_vinop(i) ceph_inode(i)->i_vino.ino, ceph_inode(i)->i_vino.snap
> > >
> > > @@ -511,11 +480,34 @@ static inline u64 ceph_ino(struct inode *inode)
> > >  {
> > >         return ceph_inode(inode)->i_vino.ino;
> > >  }
> > > +
> > >  static inline u64 ceph_snap(struct inode *inode)
> > >  {
> > >         return ceph_inode(inode)->i_vino.snap;
> > >  }
> > >
> > > +/**
> > > + * ceph_present_ino - format an inode number for presentation to userland
> > > + * @sb: superblock where the inode lives
> > > + * @ino: inode number to (possibly) convert
> > > + *
> > > + * If the user mounted with the ino32 option, then the 64-bit value needs
> > > + * to be converted to something that can fit inside 32 bits. Note that
> > > + * internal kernel code never uses this value, so this is entirely for
> > > + * userland consumption.
> > > + */
> > > +static inline u64 ceph_present_ino(struct super_block *sb, u64 ino)
> > > +{
> > > +       if (unlikely(ceph_test_mount_opt(ceph_sb_to_client(sb), INO32)))
> > > +               return ceph_ino_to_ino32(ino);
> > > +       return ino;
> > > +}
> > > +
> > > +static inline u64 ceph_present_inode(struct inode *inode)
> > > +{
> > > +       return ceph_present_ino(inode->i_sb, ceph_ino(inode));
> > > +}
> > > +
> > >  static inline int ceph_ino_compare(struct inode *inode, void *data)
> > >  {
> > >         struct ceph_vino *pvino = (struct ceph_vino *)data;
> > > @@ -527,8 +519,7 @@ static inline int ceph_ino_compare(struct inode *inode, void *data)
> > >  static inline struct inode *ceph_find_inode(struct super_block *sb,
> > >                                             struct ceph_vino vino)
> > >  {
> > > -       ino_t t = ceph_vino_to_ino(vino);
> > > -       return ilookup5(sb, t, ceph_ino_compare, &vino);
> > > +       return ilookup5(sb, (unsigned long)vino.ino, ceph_ino_compare, &vino);
> >
> > What is the rationale for this bit?  There is a paragraph in the
> > changelog that mentions that "low-order bits are the most likely
> > to vary anyway", but this way we seem to have three sets of values
> > instead of two: the real 64-bit values, the ino_t values and the
> > truncated values.
> >
>
> Yep. It's a bit confusing, but we have:
>
> 1) the real 64-bit values: these are what we generally want to use
> everywhere internally in ceph.ko
>
> 2) the presentation value: the values that get reported in getattr and
> readdir to userland. This is usually the same as the real 64-bit value
> unless you mount with -o ino32.
>
> 3) inode->i_ino: we'll no longer use this value for anything in ceph,
> but we want to put _something_ in there so things like tracepoints that
> display inode->i_ino don't show all 0's.
>
> The second arg to ilookup5 is just the value that the ilookup5/iget5 use
> to determine the hash bucket to use. This value won't ever be used for
> anything beyond that. Most existing filesystems just use the inode
> number for this.
>
> So the rationale is that while we could just use the same value as what
> ends up in inode->i_ino, that would mean that arches like s390x would
> end up passing down a 32-bit value here when they could pass in the full
> 64 bits. 32 bit arches might end up just sending the low-order bits, but
> this gets run through hash() anyway, so I doubt it will harm anything.

I guess I still I don't see a good reason to special case this for
64-bit arches with 32-bit ino_t (just s390x?) and I'd keep it as is.
It's just more consistent and less to keep in mind.

This is just a feeling and not a strong opinion though, ultimately
it doesn't seem to matter too much.  If you go ahead with truncating
to BITS_PER_LONG, I'd suggest turning your last paragraph into a
comment for ceph_find_inode().

Thanks,

                Ilya

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH v2] ceph: fix inode number handling on arches with 32-bit ino_t
  2020-08-19 13:28       ` Ilya Dryomov
@ 2020-08-19 13:57         ` Jeff Layton
  2020-08-19 14:25           ` Jeff Layton
  0 siblings, 1 reply; 14+ messages in thread
From: Jeff Layton @ 2020-08-19 13:57 UTC (permalink / raw)
  To: Ilya Dryomov; +Cc: Ceph Development, Ulrich.Weigand, Tuan.Hoang1

On Wed, 2020-08-19 at 15:28 +0200, Ilya Dryomov wrote:
> On Wed, Aug 19, 2020 at 1:25 PM Jeff Layton <jlayton@kernel.org> wrote:
> > On Wed, 2020-08-19 at 09:17 +0200, Ilya Dryomov wrote:
> > > On Tue, Aug 18, 2020 at 9:49 PM Jeff Layton <jlayton@kernel.org> wrote:
> > > > Tuan and Ulrich mentioned that they were hitting a problem on s390x,
> > > > which has a 32-bit ino_t value, even though it's a 64-bit arch (for
> > > > historical reasons).
> > > > 
> > > > I think the current handling of inode numbers in the ceph driver is
> > > > wrong. It tries to use 32-bit inode numbers on 32-bit arches, but that's
> > > > actually not a problem. 32-bit arches can deal with 64-bit inode numbers
> > > > just fine when userland code is compiled with LFS support (the common
> > > > case these days).
> > > > 
> > > > What we really want to do is just use 64-bit numbers everywhere, unless
> > > > someone has mounted with the ino32 mount option. In that case, we want
> > > > to ensure that we hash the inode number down to something that will fit
> > > > in 32 bits before presenting the value to userland.
> > > > 
> > > > Add new helper functions that do this, and only do the conversion before
> > > > presenting these values to userland in getattr, readdir, debugfs, and
> > > > (some) debug log messages.
> > > > 
> > > > The inode table hashvalue is changed to just cast the inode number to
> > > > unsigned long, as low-order bits are the most likely to vary anyway.
> > > > 
> > > > While it's not strictly required, we do want to put something in
> > > > inode->i_ino. Instead of basing it on BITS_PER_LONG, however, base it on
> > > > the size of the ino_t type.
> > > > 
> > > > Reported-by: Tuan Hoang1 <Tuan.Hoang1@ibm.com>
> > > > Reported-by: Ulrich Weigand <Ulrich.Weigand@de.ibm.com>
> > > > URL: https://tracker.ceph.com/issues/46828
> > > > Signed-off-by: Jeff Layton <jlayton@kernel.org>
> > > > ---
> > > >  fs/ceph/caps.c       | 14 +++++-----
> > > >  fs/ceph/debugfs.c    |  4 +--
> > > >  fs/ceph/dir.c        | 31 +++++++++-------------
> > > >  fs/ceph/file.c       |  4 +--
> > > >  fs/ceph/inode.c      | 18 +++++++------
> > > >  fs/ceph/mds_client.h |  2 +-
> > > >  fs/ceph/quota.c      |  4 +--
> > > >  fs/ceph/super.h      | 63 +++++++++++++++++++-------------------------
> > > >  8 files changed, 64 insertions(+), 76 deletions(-)
> > > > 
> > > > v2:
> > > > - fix dir_emit inode number for ".."
> > > > - fix ino_t size test
> > > > 
> > > > diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
> > > > index 55ccccf77cea..d5b1f781f398 100644
> > > > --- a/fs/ceph/caps.c
> > > > +++ b/fs/ceph/caps.c
> > > > @@ -887,8 +887,8 @@ int __ceph_caps_issued_mask(struct ceph_inode_info *ci, int mask, int touch)
> > > >         int have = ci->i_snap_caps;
> > > > 
> > > >         if ((have & mask) == mask) {
> > > > -               dout("__ceph_caps_issued_mask ino 0x%lx snap issued %s"
> > > > -                    " (mask %s)\n", ci->vfs_inode.i_ino,
> > > > +               dout("__ceph_caps_issued_mask ino 0x%llx snap issued %s"
> > > > +                    " (mask %s)\n", ceph_present_inode(&ci->vfs_inode),
> > > 
> > > Hi Jeff,
> > > 
> > > I agree with Zheng.  douts should output the real 64-bit values.
> > > If the presentation value is of interest in some places, they should
> > > output it alongside the real value.
> > > 
> > 
> > Ok.
> > 
> > FWIW, the rationale for using ceph_present_inode in these is that the
> > values would better match what stat() reports in st_ino. The debug
> > messages are really inconsistent here though, so I'm fine with moving
> > them to always print the 64-bit values. My guess is that hardly anyone
> > uses -o ino32 in this day and age anyway...
> > 
> > 
> > > >                      ceph_cap_string(have),
> > > >                      ceph_cap_string(mask));
> > > >                 return 1;
> > > > @@ -899,8 +899,8 @@ int __ceph_caps_issued_mask(struct ceph_inode_info *ci, int mask, int touch)
> > > >                 if (!__cap_is_valid(cap))
> > > >                         continue;
> > > >                 if ((cap->issued & mask) == mask) {
> > > > -                       dout("__ceph_caps_issued_mask ino 0x%lx cap %p issued %s"
> > > > -                            " (mask %s)\n", ci->vfs_inode.i_ino, cap,
> > > > +                       dout("__ceph_caps_issued_mask ino 0x%llx cap %p issued %s"
> > > > +                            " (mask %s)\n", ceph_present_inode(&ci->vfs_inode), cap,
> > > >                              ceph_cap_string(cap->issued),
> > > >                              ceph_cap_string(mask));
> > > >                         if (touch)
> > > > @@ -911,8 +911,8 @@ int __ceph_caps_issued_mask(struct ceph_inode_info *ci, int mask, int touch)
> > > >                 /* does a combination of caps satisfy mask? */
> > > >                 have |= cap->issued;
> > > >                 if ((have & mask) == mask) {
> > > > -                       dout("__ceph_caps_issued_mask ino 0x%lx combo issued %s"
> > > > -                            " (mask %s)\n", ci->vfs_inode.i_ino,
> > > > +                       dout("__ceph_caps_issued_mask ino 0x%llx combo issued %s"
> > > > +                            " (mask %s)\n", ceph_present_inode(&ci->vfs_inode),
> > > >                              ceph_cap_string(cap->issued),
> > > >                              ceph_cap_string(mask));
> > > >                         if (touch) {
> > > > @@ -2872,7 +2872,7 @@ int ceph_get_caps(struct file *filp, int need, int want,
> > > >                         struct cap_wait cw;
> > > >                         DEFINE_WAIT_FUNC(wait, woken_wake_function);
> > > > 
> > > > -                       cw.ino = inode->i_ino;
> > > > +                       cw.ino = ceph_present_inode(inode);
> > > >                         cw.tgid = current->tgid;
> > > >                         cw.need = need;
> > > >                         cw.want = want;
> > > > diff --git a/fs/ceph/debugfs.c b/fs/ceph/debugfs.c
> > > > index 20043382d825..5eeca7987717 100644
> > > > --- a/fs/ceph/debugfs.c
> > > > +++ b/fs/ceph/debugfs.c
> > > > @@ -211,7 +211,7 @@ static int caps_show_cb(struct inode *inode, struct ceph_cap *cap, void *p)
> > > >  {
> > > >         struct seq_file *s = p;
> > > > 
> > > > -       seq_printf(s, "0x%-17lx%-17s%-17s\n", inode->i_ino,
> > > > +       seq_printf(s, "0x%-17llx%-17s%-17s\n", ceph_present_inode(inode),
> > > >                    ceph_cap_string(cap->issued),
> > > >                    ceph_cap_string(cap->implemented));
> > > >         return 0;
> > > > @@ -256,7 +256,7 @@ static int caps_show(struct seq_file *s, void *p)
> > > > 
> > > >         spin_lock(&mdsc->caps_list_lock);
> > > >         list_for_each_entry(cw, &mdsc->cap_wait_list, list) {
> > > > -               seq_printf(s, "%-13d0x%-17lx%-17s%-17s\n", cw->tgid, cw->ino,
> > > > +               seq_printf(s, "%-13d0x%-17llx%-17s%-17s\n", cw->tgid, cw->ino,
> > > >                                 ceph_cap_string(cw->need),
> > > >                                 ceph_cap_string(cw->want));
> > > >         }
> > > > diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c
> > > > index 060bdcc5ce32..1a61a435a1cd 100644
> > > > --- a/fs/ceph/dir.c
> > > > +++ b/fs/ceph/dir.c
> > > > @@ -259,9 +259,7 @@ static int __dcache_readdir(struct file *file,  struct dir_context *ctx,
> > > >                              dentry, dentry, d_inode(dentry));
> > > >                         ctx->pos = di->offset;
> > > >                         if (!dir_emit(ctx, dentry->d_name.name,
> > > > -                                     dentry->d_name.len,
> > > > -                                     ceph_translate_ino(dentry->d_sb,
> > > > -                                                        d_inode(dentry)->i_ino),
> > > > +                                     dentry->d_name.len, ceph_present_inode(d_inode(dentry)),
> > > >                                       d_inode(dentry)->i_mode >> 12)) {
> > > >                                 dput(dentry);
> > > >                                 err = 0;
> > > > @@ -324,18 +322,21 @@ static int ceph_readdir(struct file *file, struct dir_context *ctx)
> > > >         /* always start with . and .. */
> > > >         if (ctx->pos == 0) {
> > > >                 dout("readdir off 0 -> '.'\n");
> > > > -               if (!dir_emit(ctx, ".", 1,
> > > > -                           ceph_translate_ino(inode->i_sb, inode->i_ino),
> > > > +               if (!dir_emit(ctx, ".", 1, ceph_present_inode(inode),
> > > >                             inode->i_mode >> 12))
> > > >                         return 0;
> > > >                 ctx->pos = 1;
> > > >         }
> > > >         if (ctx->pos == 1) {
> > > > -               ino_t ino = parent_ino(file->f_path.dentry);
> > > > +               u64 ino;
> > > > +               struct dentry *dentry = file->f_path.dentry;
> > > > +
> > > > +               spin_lock(&dentry->d_lock);
> > > > +               ino = ceph_present_inode(dentry->d_parent->d_inode);
> > > > +               spin_unlock(&dentry->d_lock);
> > > > +
> > > >                 dout("readdir off 1 -> '..'\n");
> > > > -               if (!dir_emit(ctx, "..", 2,
> > > > -                           ceph_translate_ino(inode->i_sb, ino),
> > > > -                           inode->i_mode >> 12))
> > > > +               if (!dir_emit(ctx, "..", 2, ino, inode->i_mode >> 12))
> > > >                         return 0;
> > > >                 ctx->pos = 2;
> > > >         }
> > > > @@ -507,9 +508,6 @@ static int ceph_readdir(struct file *file, struct dir_context *ctx)
> > > >         }
> > > >         for (; i < rinfo->dir_nr; i++) {
> > > >                 struct ceph_mds_reply_dir_entry *rde = rinfo->dir_entries + i;
> > > > -               struct ceph_vino vino;
> > > > -               ino_t ino;
> > > > -               u32 ftype;
> > > > 
> > > >                 BUG_ON(rde->offset < ctx->pos);
> > > > 
> > > > @@ -519,13 +517,10 @@ static int ceph_readdir(struct file *file, struct dir_context *ctx)
> > > >                      rde->name_len, rde->name, &rde->inode.in);
> > > > 
> > > >                 BUG_ON(!rde->inode.in);
> > > > -               ftype = le32_to_cpu(rde->inode.in->mode) >> 12;
> > > > -               vino.ino = le64_to_cpu(rde->inode.in->ino);
> > > > -               vino.snap = le64_to_cpu(rde->inode.in->snapid);
> > > > -               ino = ceph_vino_to_ino(vino);
> > > > 
> > > >                 if (!dir_emit(ctx, rde->name, rde->name_len,
> > > > -                             ceph_translate_ino(inode->i_sb, ino), ftype)) {
> > > > +                             ceph_present_ino(inode->i_sb, le64_to_cpu(rde->inode.in->ino)),
> > > > +                             le32_to_cpu(rde->inode.in->mode) >> 12)) {
> > > >                         dout("filldir stopping us...\n");
> > > >                         return 0;
> > > >                 }
> > > > @@ -1161,7 +1156,7 @@ static int ceph_unlink(struct inode *dir, struct dentry *dentry)
> > > > 
> > > >         if (try_async && op == CEPH_MDS_OP_UNLINK &&
> > > >             (req->r_dir_caps = get_caps_for_async_unlink(dir, dentry))) {
> > > > -               dout("async unlink on %lu/%.*s caps=%s", dir->i_ino,
> > > > +               dout("async unlink on %llu/%.*s caps=%s", ceph_present_inode(dir),
> > > >                      dentry->d_name.len, dentry->d_name.name,
> > > >                      ceph_cap_string(req->r_dir_caps));
> > > >                 set_bit(CEPH_MDS_R_ASYNC, &req->r_req_flags);
> > > > diff --git a/fs/ceph/file.c b/fs/ceph/file.c
> > > > index 5fa28a620932..20e0c6f1d8f2 100644
> > > > --- a/fs/ceph/file.c
> > > > +++ b/fs/ceph/file.c
> > > > @@ -642,8 +642,8 @@ static int ceph_finish_async_create(struct inode *dir, struct dentry *dentry,
> > > >         } else {
> > > >                 struct dentry *dn;
> > > > 
> > > > -               dout("%s d_adding new inode 0x%llx to 0x%lx/%s\n", __func__,
> > > > -                       vino.ino, dir->i_ino, dentry->d_name.name);
> > > > +               dout("%s d_adding new inode 0x%llx to 0x%llx/%s\n", __func__,
> > > > +                       vino.ino, ceph_ino(dir), dentry->d_name.name);
> > > >                 ceph_dir_clear_ordered(dir);
> > > >                 ceph_init_inode_acls(inode, as_ctx);
> > > >                 if (inode->i_state & I_NEW) {
> > > > diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
> > > > index 357c937699d5..8dbb56d2edef 100644
> > > > --- a/fs/ceph/inode.c
> > > > +++ b/fs/ceph/inode.c
> > > > @@ -41,8 +41,10 @@ static void ceph_inode_work(struct work_struct *work);
> > > >   */
> > > >  static int ceph_set_ino_cb(struct inode *inode, void *data)
> > > >  {
> > > > -       ceph_inode(inode)->i_vino = *(struct ceph_vino *)data;
> > > > -       inode->i_ino = ceph_vino_to_ino(*(struct ceph_vino *)data);
> > > > +       struct ceph_inode_info *ci = ceph_inode(inode);
> > > > +
> > > > +       ci->i_vino = *(struct ceph_vino *)data;
> > > > +       inode->i_ino = ceph_vino_to_ino(ci->i_vino);
> > > >         inode_set_iversion_raw(inode, 0);
> > > >         return 0;
> > > >  }
> > > > @@ -50,17 +52,17 @@ static int ceph_set_ino_cb(struct inode *inode, void *data)
> > > >  struct inode *ceph_get_inode(struct super_block *sb, struct ceph_vino vino)
> > > >  {
> > > >         struct inode *inode;
> > > > -       ino_t t = ceph_vino_to_ino(vino);
> > > > 
> > > > -       inode = iget5_locked(sb, t, ceph_ino_compare, ceph_set_ino_cb, &vino);
> > > > +       inode = iget5_locked(sb, (unsigned long)vino.ino, ceph_ino_compare,
> > > > +                            ceph_set_ino_cb, &vino);
> > > >         if (!inode)
> > > >                 return ERR_PTR(-ENOMEM);
> > > >         if (inode->i_state & I_NEW)
> > > >                 dout("get_inode created new inode %p %llx.%llx ino %llx\n",
> > > > -                    inode, ceph_vinop(inode), (u64)inode->i_ino);
> > > > +                    inode, ceph_vinop(inode), ceph_present_inode(inode));
> > > > 
> > > > -       dout("get_inode on %lu=%llx.%llx got %p\n", inode->i_ino, vino.ino,
> > > > -            vino.snap, inode);
> > > > +       dout("get_inode on %llu=%llx.%llx got %p\n", ceph_present_inode(inode),
> > > > +            vino.ino, vino.snap, inode);
> > > >         return inode;
> > > >  }
> > > > 
> > > > @@ -2378,7 +2380,7 @@ int ceph_getattr(const struct path *path, struct kstat *stat,
> > > >         }
> > > > 
> > > >         generic_fillattr(inode, stat);
> > > > -       stat->ino = ceph_translate_ino(inode->i_sb, inode->i_ino);
> > > > +       stat->ino = ceph_present_inode(inode);
> > > > 
> > > >         /*
> > > >          * btime on newly-allocated inodes is 0, so if this is still set to
> > > > diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
> > > > index bc9e95937d7c..658800605bfb 100644
> > > > --- a/fs/ceph/mds_client.h
> > > > +++ b/fs/ceph/mds_client.h
> > > > @@ -372,7 +372,7 @@ struct ceph_quotarealm_inode {
> > > > 
> > > >  struct cap_wait {
> > > >         struct list_head        list;
> > > > -       unsigned long           ino;
> > > > +       u64                     ino;
> > > >         pid_t                   tgid;
> > > >         int                     need;
> > > >         int                     want;
> > > > diff --git a/fs/ceph/quota.c b/fs/ceph/quota.c
> > > > index 198ddde5c1e6..cc2c4d40b022 100644
> > > > --- a/fs/ceph/quota.c
> > > > +++ b/fs/ceph/quota.c
> > > > @@ -23,12 +23,12 @@ static inline bool ceph_has_realms_with_quotas(struct inode *inode)
> > > >  {
> > > >         struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc;
> > > >         struct super_block *sb = mdsc->fsc->sb;
> > > > +       struct inode *root = d_inode(sb->s_root);
> > > > 
> > > >         if (atomic64_read(&mdsc->quotarealms_count) > 0)
> > > >                 return true;
> > > >         /* if root is the real CephFS root, we don't have quota realms */
> > > > -       if (sb->s_root->d_inode &&
> > > > -           (sb->s_root->d_inode->i_ino == CEPH_INO_ROOT))
> > > > +       if (root && ceph_ino(root) == CEPH_INO_ROOT)
> > > >                 return false;
> > > >         /* otherwise, we can't know for sure */
> > > >         return true;
> > > > diff --git a/fs/ceph/super.h b/fs/ceph/super.h
> > > > index 4c3c964b1c54..0dd272a4a410 100644
> > > > --- a/fs/ceph/super.h
> > > > +++ b/fs/ceph/super.h
> > > > @@ -457,15 +457,7 @@ ceph_vino(const struct inode *inode)
> > > >         return ceph_inode(inode)->i_vino;
> > > >  }
> > > > 
> > > > -/*
> > > > - * ino_t is <64 bits on many architectures, blech.
> > > > - *
> > > > - *               i_ino (kernel inode)   st_ino (userspace)
> > > > - * i386          32                     32
> > > > - * x86_64+ino32  64                     32
> > > > - * x86_64        64                     64
> > > > - */
> > > > -static inline u32 ceph_ino_to_ino32(__u64 vino)
> > > > +static inline u32 ceph_ino_to_ino32(u64 vino)
> > > >  {
> > > >         u32 ino = vino & 0xffffffff;
> > > >         ino ^= vino >> 32;
> > > > @@ -474,36 +466,13 @@ static inline u32 ceph_ino_to_ino32(__u64 vino)
> > > >         return ino;
> > > >  }
> > > > 
> > > > -/*
> > > > - * kernel i_ino value
> > > > - */
> > > >  static inline ino_t ceph_vino_to_ino(struct ceph_vino vino)
> > > >  {
> > > > -#if BITS_PER_LONG == 32
> > > > -       return ceph_ino_to_ino32(vino.ino);
> > > > -#else
> > > > +       if (sizeof(ino_t) == sizeof(u32))
> > > > +               return ceph_ino_to_ino32(vino.ino);
> > > >         return (ino_t)vino.ino;
> > > > -#endif
> > > >  }
> > > > 
> > > > -/*
> > > > - * user-visible ino (stat, filldir)
> > > > - */
> > > > -#if BITS_PER_LONG == 32
> > > > -static inline ino_t ceph_translate_ino(struct super_block *sb, ino_t ino)
> > > > -{
> > > > -       return ino;
> > > > -}
> > > > -#else
> > > > -static inline ino_t ceph_translate_ino(struct super_block *sb, ino_t ino)
> > > > -{
> > > > -       if (ceph_test_mount_opt(ceph_sb_to_client(sb), INO32))
> > > > -               ino = ceph_ino_to_ino32(ino);
> > > > -       return ino;
> > > > -}
> > > > -#endif
> > > > -
> > > > -
> > > >  /* for printf-style formatting */
> > > >  #define ceph_vinop(i) ceph_inode(i)->i_vino.ino, ceph_inode(i)->i_vino.snap
> > > > 
> > > > @@ -511,11 +480,34 @@ static inline u64 ceph_ino(struct inode *inode)
> > > >  {
> > > >         return ceph_inode(inode)->i_vino.ino;
> > > >  }
> > > > +
> > > >  static inline u64 ceph_snap(struct inode *inode)
> > > >  {
> > > >         return ceph_inode(inode)->i_vino.snap;
> > > >  }
> > > > 
> > > > +/**
> > > > + * ceph_present_ino - format an inode number for presentation to userland
> > > > + * @sb: superblock where the inode lives
> > > > + * @ino: inode number to (possibly) convert
> > > > + *
> > > > + * If the user mounted with the ino32 option, then the 64-bit value needs
> > > > + * to be converted to something that can fit inside 32 bits. Note that
> > > > + * internal kernel code never uses this value, so this is entirely for
> > > > + * userland consumption.
> > > > + */
> > > > +static inline u64 ceph_present_ino(struct super_block *sb, u64 ino)
> > > > +{
> > > > +       if (unlikely(ceph_test_mount_opt(ceph_sb_to_client(sb), INO32)))
> > > > +               return ceph_ino_to_ino32(ino);
> > > > +       return ino;
> > > > +}
> > > > +
> > > > +static inline u64 ceph_present_inode(struct inode *inode)
> > > > +{
> > > > +       return ceph_present_ino(inode->i_sb, ceph_ino(inode));
> > > > +}
> > > > +
> > > >  static inline int ceph_ino_compare(struct inode *inode, void *data)
> > > >  {
> > > >         struct ceph_vino *pvino = (struct ceph_vino *)data;
> > > > @@ -527,8 +519,7 @@ static inline int ceph_ino_compare(struct inode *inode, void *data)
> > > >  static inline struct inode *ceph_find_inode(struct super_block *sb,
> > > >                                             struct ceph_vino vino)
> > > >  {
> > > > -       ino_t t = ceph_vino_to_ino(vino);
> > > > -       return ilookup5(sb, t, ceph_ino_compare, &vino);
> > > > +       return ilookup5(sb, (unsigned long)vino.ino, ceph_ino_compare, &vino);
> > > 
> > > What is the rationale for this bit?  There is a paragraph in the
> > > changelog that mentions that "low-order bits are the most likely
> > > to vary anyway", but this way we seem to have three sets of values
> > > instead of two: the real 64-bit values, the ino_t values and the
> > > truncated values.
> > > 
> > 
> > Yep. It's a bit confusing, but we have:
> > 
> > 1) the real 64-bit values: these are what we generally want to use
> > everywhere internally in ceph.ko
> > 
> > 2) the presentation value: the values that get reported in getattr and
> > readdir to userland. This is usually the same as the real 64-bit value
> > unless you mount with -o ino32.
> > 
> > 3) inode->i_ino: we'll no longer use this value for anything in ceph,
> > but we want to put _something_ in there so things like tracepoints that
> > display inode->i_ino don't show all 0's.
> > 
> > The second arg to ilookup5 is just the value that the ilookup5/iget5 use
> > to determine the hash bucket to use. This value won't ever be used for
> > anything beyond that. Most existing filesystems just use the inode
> > number for this.
> > 
> > So the rationale is that while we could just use the same value as what
> > ends up in inode->i_ino, that would mean that arches like s390x would
> > end up passing down a 32-bit value here when they could pass in the full
> > 64 bits. 32 bit arches might end up just sending the low-order bits, but
> > this gets run through hash() anyway, so I doubt it will harm anything.
> 
> I guess I still I don't see a good reason to special case this for
> 64-bit arches with 32-bit ino_t (just s390x?) and I'd keep it as is.
> It's just more consistent and less to keep in mind.
> 
> This is just a feeling and not a strong opinion though, ultimately
> it doesn't seem to matter too much.  If you go ahead with truncating
> to BITS_PER_LONG, I'd suggest turning your last paragraph into a
> comment for ceph_find_inode().
> 

I guess it comes down to what makes a better hashval for the inode table
on arches with a 32-bit long? Should we just use the lowest bits of the
inode number or do a ceph_ino_to_ino32 conversion first?

My rationale was that we're going to end up hashing this value through
hash() in fs/inode.c anyway, so why bother hashing it down into 32 bits
first? Most inodes (at least for a single MDS) end up clustered together
so that the upper 32 bits are not generally significant.

We might end up with collisions when 2 MDS's generate 2 inode numbers
with the lower 32 bits all the same, but that's not a huge problem. The
hash chain just ends up being a little longer than is optimal.

-- 
Jeff Layton <jlayton@kernel.org>


^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH v2] ceph: fix inode number handling on arches with 32-bit ino_t
  2020-08-19 13:57         ` Jeff Layton
@ 2020-08-19 14:25           ` Jeff Layton
  0 siblings, 0 replies; 14+ messages in thread
From: Jeff Layton @ 2020-08-19 14:25 UTC (permalink / raw)
  To: Ilya Dryomov; +Cc: Ceph Development, Ulrich.Weigand, Tuan.Hoang1

On Wed, 2020-08-19 at 09:57 -0400, Jeff Layton wrote:
> On Wed, 2020-08-19 at 15:28 +0200, Ilya Dryomov wrote:
> > On Wed, Aug 19, 2020 at 1:25 PM Jeff Layton <jlayton@kernel.org> wrote:
> > > On Wed, 2020-08-19 at 09:17 +0200, Ilya Dryomov wrote:
> > > > On Tue, Aug 18, 2020 at 9:49 PM Jeff Layton <jlayton@kernel.org> wrote:
> > > > > Tuan and Ulrich mentioned that they were hitting a problem on s390x,
> > > > > which has a 32-bit ino_t value, even though it's a 64-bit arch (for
> > > > > historical reasons).
> > > > > 
> > > > > I think the current handling of inode numbers in the ceph driver is
> > > > > wrong. It tries to use 32-bit inode numbers on 32-bit arches, but that's
> > > > > actually not a problem. 32-bit arches can deal with 64-bit inode numbers
> > > > > just fine when userland code is compiled with LFS support (the common
> > > > > case these days).
> > > > > 
> > > > > What we really want to do is just use 64-bit numbers everywhere, unless
> > > > > someone has mounted with the ino32 mount option. In that case, we want
> > > > > to ensure that we hash the inode number down to something that will fit
> > > > > in 32 bits before presenting the value to userland.
> > > > > 
> > > > > Add new helper functions that do this, and only do the conversion before
> > > > > presenting these values to userland in getattr, readdir, debugfs, and
> > > > > (some) debug log messages.
> > > > > 
> > > > > The inode table hashvalue is changed to just cast the inode number to
> > > > > unsigned long, as low-order bits are the most likely to vary anyway.
> > > > > 
> > > > > While it's not strictly required, we do want to put something in
> > > > > inode->i_ino. Instead of basing it on BITS_PER_LONG, however, base it on
> > > > > the size of the ino_t type.
> > > > > 
> > > > > Reported-by: Tuan Hoang1 <Tuan.Hoang1@ibm.com>
> > > > > Reported-by: Ulrich Weigand <Ulrich.Weigand@de.ibm.com>
> > > > > URL: https://tracker.ceph.com/issues/46828
> > > > > Signed-off-by: Jeff Layton <jlayton@kernel.org>
> > > > > ---
> > > > >  fs/ceph/caps.c       | 14 +++++-----
> > > > >  fs/ceph/debugfs.c    |  4 +--
> > > > >  fs/ceph/dir.c        | 31 +++++++++-------------
> > > > >  fs/ceph/file.c       |  4 +--
> > > > >  fs/ceph/inode.c      | 18 +++++++------
> > > > >  fs/ceph/mds_client.h |  2 +-
> > > > >  fs/ceph/quota.c      |  4 +--
> > > > >  fs/ceph/super.h      | 63 +++++++++++++++++++-------------------------
> > > > >  8 files changed, 64 insertions(+), 76 deletions(-)
> > > > > 
> > > > > v2:
> > > > > - fix dir_emit inode number for ".."
> > > > > - fix ino_t size test
> > > > > 
> > > > > diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
> > > > > index 55ccccf77cea..d5b1f781f398 100644
> > > > > --- a/fs/ceph/caps.c
> > > > > +++ b/fs/ceph/caps.c
> > > > > @@ -887,8 +887,8 @@ int __ceph_caps_issued_mask(struct ceph_inode_info *ci, int mask, int touch)
> > > > >         int have = ci->i_snap_caps;
> > > > > 
> > > > >         if ((have & mask) == mask) {
> > > > > -               dout("__ceph_caps_issued_mask ino 0x%lx snap issued %s"
> > > > > -                    " (mask %s)\n", ci->vfs_inode.i_ino,
> > > > > +               dout("__ceph_caps_issued_mask ino 0x%llx snap issued %s"
> > > > > +                    " (mask %s)\n", ceph_present_inode(&ci->vfs_inode),
> > > > 
> > > > Hi Jeff,
> > > > 
> > > > I agree with Zheng.  douts should output the real 64-bit values.
> > > > If the presentation value is of interest in some places, they should
> > > > output it alongside the real value.
> > > > 
> > > 
> > > Ok.
> > > 
> > > FWIW, the rationale for using ceph_present_inode in these is that the
> > > values would better match what stat() reports in st_ino. The debug
> > > messages are really inconsistent here though, so I'm fine with moving
> > > them to always print the 64-bit values. My guess is that hardly anyone
> > > uses -o ino32 in this day and age anyway...
> > > 
> > > 
> > > > >                      ceph_cap_string(have),
> > > > >                      ceph_cap_string(mask));
> > > > >                 return 1;
> > > > > @@ -899,8 +899,8 @@ int __ceph_caps_issued_mask(struct ceph_inode_info *ci, int mask, int touch)
> > > > >                 if (!__cap_is_valid(cap))
> > > > >                         continue;
> > > > >                 if ((cap->issued & mask) == mask) {
> > > > > -                       dout("__ceph_caps_issued_mask ino 0x%lx cap %p issued %s"
> > > > > -                            " (mask %s)\n", ci->vfs_inode.i_ino, cap,
> > > > > +                       dout("__ceph_caps_issued_mask ino 0x%llx cap %p issued %s"
> > > > > +                            " (mask %s)\n", ceph_present_inode(&ci->vfs_inode), cap,
> > > > >                              ceph_cap_string(cap->issued),
> > > > >                              ceph_cap_string(mask));
> > > > >                         if (touch)
> > > > > @@ -911,8 +911,8 @@ int __ceph_caps_issued_mask(struct ceph_inode_info *ci, int mask, int touch)
> > > > >                 /* does a combination of caps satisfy mask? */
> > > > >                 have |= cap->issued;
> > > > >                 if ((have & mask) == mask) {
> > > > > -                       dout("__ceph_caps_issued_mask ino 0x%lx combo issued %s"
> > > > > -                            " (mask %s)\n", ci->vfs_inode.i_ino,
> > > > > +                       dout("__ceph_caps_issued_mask ino 0x%llx combo issued %s"
> > > > > +                            " (mask %s)\n", ceph_present_inode(&ci->vfs_inode),
> > > > >                              ceph_cap_string(cap->issued),
> > > > >                              ceph_cap_string(mask));
> > > > >                         if (touch) {
> > > > > @@ -2872,7 +2872,7 @@ int ceph_get_caps(struct file *filp, int need, int want,
> > > > >                         struct cap_wait cw;
> > > > >                         DEFINE_WAIT_FUNC(wait, woken_wake_function);
> > > > > 
> > > > > -                       cw.ino = inode->i_ino;
> > > > > +                       cw.ino = ceph_present_inode(inode);
> > > > >                         cw.tgid = current->tgid;
> > > > >                         cw.need = need;
> > > > >                         cw.want = want;
> > > > > diff --git a/fs/ceph/debugfs.c b/fs/ceph/debugfs.c
> > > > > index 20043382d825..5eeca7987717 100644
> > > > > --- a/fs/ceph/debugfs.c
> > > > > +++ b/fs/ceph/debugfs.c
> > > > > @@ -211,7 +211,7 @@ static int caps_show_cb(struct inode *inode, struct ceph_cap *cap, void *p)
> > > > >  {
> > > > >         struct seq_file *s = p;
> > > > > 
> > > > > -       seq_printf(s, "0x%-17lx%-17s%-17s\n", inode->i_ino,
> > > > > +       seq_printf(s, "0x%-17llx%-17s%-17s\n", ceph_present_inode(inode),
> > > > >                    ceph_cap_string(cap->issued),
> > > > >                    ceph_cap_string(cap->implemented));
> > > > >         return 0;
> > > > > @@ -256,7 +256,7 @@ static int caps_show(struct seq_file *s, void *p)
> > > > > 
> > > > >         spin_lock(&mdsc->caps_list_lock);
> > > > >         list_for_each_entry(cw, &mdsc->cap_wait_list, list) {
> > > > > -               seq_printf(s, "%-13d0x%-17lx%-17s%-17s\n", cw->tgid, cw->ino,
> > > > > +               seq_printf(s, "%-13d0x%-17llx%-17s%-17s\n", cw->tgid, cw->ino,
> > > > >                                 ceph_cap_string(cw->need),
> > > > >                                 ceph_cap_string(cw->want));
> > > > >         }
> > > > > diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c
> > > > > index 060bdcc5ce32..1a61a435a1cd 100644
> > > > > --- a/fs/ceph/dir.c
> > > > > +++ b/fs/ceph/dir.c
> > > > > @@ -259,9 +259,7 @@ static int __dcache_readdir(struct file *file,  struct dir_context *ctx,
> > > > >                              dentry, dentry, d_inode(dentry));
> > > > >                         ctx->pos = di->offset;
> > > > >                         if (!dir_emit(ctx, dentry->d_name.name,
> > > > > -                                     dentry->d_name.len,
> > > > > -                                     ceph_translate_ino(dentry->d_sb,
> > > > > -                                                        d_inode(dentry)->i_ino),
> > > > > +                                     dentry->d_name.len, ceph_present_inode(d_inode(dentry)),
> > > > >                                       d_inode(dentry)->i_mode >> 12)) {
> > > > >                                 dput(dentry);
> > > > >                                 err = 0;
> > > > > @@ -324,18 +322,21 @@ static int ceph_readdir(struct file *file, struct dir_context *ctx)
> > > > >         /* always start with . and .. */
> > > > >         if (ctx->pos == 0) {
> > > > >                 dout("readdir off 0 -> '.'\n");
> > > > > -               if (!dir_emit(ctx, ".", 1,
> > > > > -                           ceph_translate_ino(inode->i_sb, inode->i_ino),
> > > > > +               if (!dir_emit(ctx, ".", 1, ceph_present_inode(inode),
> > > > >                             inode->i_mode >> 12))
> > > > >                         return 0;
> > > > >                 ctx->pos = 1;
> > > > >         }
> > > > >         if (ctx->pos == 1) {
> > > > > -               ino_t ino = parent_ino(file->f_path.dentry);
> > > > > +               u64 ino;
> > > > > +               struct dentry *dentry = file->f_path.dentry;
> > > > > +
> > > > > +               spin_lock(&dentry->d_lock);
> > > > > +               ino = ceph_present_inode(dentry->d_parent->d_inode);
> > > > > +               spin_unlock(&dentry->d_lock);
> > > > > +
> > > > >                 dout("readdir off 1 -> '..'\n");
> > > > > -               if (!dir_emit(ctx, "..", 2,
> > > > > -                           ceph_translate_ino(inode->i_sb, ino),
> > > > > -                           inode->i_mode >> 12))
> > > > > +               if (!dir_emit(ctx, "..", 2, ino, inode->i_mode >> 12))
> > > > >                         return 0;
> > > > >                 ctx->pos = 2;
> > > > >         }
> > > > > @@ -507,9 +508,6 @@ static int ceph_readdir(struct file *file, struct dir_context *ctx)
> > > > >         }
> > > > >         for (; i < rinfo->dir_nr; i++) {
> > > > >                 struct ceph_mds_reply_dir_entry *rde = rinfo->dir_entries + i;
> > > > > -               struct ceph_vino vino;
> > > > > -               ino_t ino;
> > > > > -               u32 ftype;
> > > > > 
> > > > >                 BUG_ON(rde->offset < ctx->pos);
> > > > > 
> > > > > @@ -519,13 +517,10 @@ static int ceph_readdir(struct file *file, struct dir_context *ctx)
> > > > >                      rde->name_len, rde->name, &rde->inode.in);
> > > > > 
> > > > >                 BUG_ON(!rde->inode.in);
> > > > > -               ftype = le32_to_cpu(rde->inode.in->mode) >> 12;
> > > > > -               vino.ino = le64_to_cpu(rde->inode.in->ino);
> > > > > -               vino.snap = le64_to_cpu(rde->inode.in->snapid);
> > > > > -               ino = ceph_vino_to_ino(vino);
> > > > > 
> > > > >                 if (!dir_emit(ctx, rde->name, rde->name_len,
> > > > > -                             ceph_translate_ino(inode->i_sb, ino), ftype)) {
> > > > > +                             ceph_present_ino(inode->i_sb, le64_to_cpu(rde->inode.in->ino)),
> > > > > +                             le32_to_cpu(rde->inode.in->mode) >> 12)) {
> > > > >                         dout("filldir stopping us...\n");
> > > > >                         return 0;
> > > > >                 }
> > > > > @@ -1161,7 +1156,7 @@ static int ceph_unlink(struct inode *dir, struct dentry *dentry)
> > > > > 
> > > > >         if (try_async && op == CEPH_MDS_OP_UNLINK &&
> > > > >             (req->r_dir_caps = get_caps_for_async_unlink(dir, dentry))) {
> > > > > -               dout("async unlink on %lu/%.*s caps=%s", dir->i_ino,
> > > > > +               dout("async unlink on %llu/%.*s caps=%s", ceph_present_inode(dir),
> > > > >                      dentry->d_name.len, dentry->d_name.name,
> > > > >                      ceph_cap_string(req->r_dir_caps));
> > > > >                 set_bit(CEPH_MDS_R_ASYNC, &req->r_req_flags);
> > > > > diff --git a/fs/ceph/file.c b/fs/ceph/file.c
> > > > > index 5fa28a620932..20e0c6f1d8f2 100644
> > > > > --- a/fs/ceph/file.c
> > > > > +++ b/fs/ceph/file.c
> > > > > @@ -642,8 +642,8 @@ static int ceph_finish_async_create(struct inode *dir, struct dentry *dentry,
> > > > >         } else {
> > > > >                 struct dentry *dn;
> > > > > 
> > > > > -               dout("%s d_adding new inode 0x%llx to 0x%lx/%s\n", __func__,
> > > > > -                       vino.ino, dir->i_ino, dentry->d_name.name);
> > > > > +               dout("%s d_adding new inode 0x%llx to 0x%llx/%s\n", __func__,
> > > > > +                       vino.ino, ceph_ino(dir), dentry->d_name.name);
> > > > >                 ceph_dir_clear_ordered(dir);
> > > > >                 ceph_init_inode_acls(inode, as_ctx);
> > > > >                 if (inode->i_state & I_NEW) {
> > > > > diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
> > > > > index 357c937699d5..8dbb56d2edef 100644
> > > > > --- a/fs/ceph/inode.c
> > > > > +++ b/fs/ceph/inode.c
> > > > > @@ -41,8 +41,10 @@ static void ceph_inode_work(struct work_struct *work);
> > > > >   */
> > > > >  static int ceph_set_ino_cb(struct inode *inode, void *data)
> > > > >  {
> > > > > -       ceph_inode(inode)->i_vino = *(struct ceph_vino *)data;
> > > > > -       inode->i_ino = ceph_vino_to_ino(*(struct ceph_vino *)data);
> > > > > +       struct ceph_inode_info *ci = ceph_inode(inode);
> > > > > +
> > > > > +       ci->i_vino = *(struct ceph_vino *)data;
> > > > > +       inode->i_ino = ceph_vino_to_ino(ci->i_vino);
> > > > >         inode_set_iversion_raw(inode, 0);
> > > > >         return 0;
> > > > >  }
> > > > > @@ -50,17 +52,17 @@ static int ceph_set_ino_cb(struct inode *inode, void *data)
> > > > >  struct inode *ceph_get_inode(struct super_block *sb, struct ceph_vino vino)
> > > > >  {
> > > > >         struct inode *inode;
> > > > > -       ino_t t = ceph_vino_to_ino(vino);
> > > > > 
> > > > > -       inode = iget5_locked(sb, t, ceph_ino_compare, ceph_set_ino_cb, &vino);
> > > > > +       inode = iget5_locked(sb, (unsigned long)vino.ino, ceph_ino_compare,
> > > > > +                            ceph_set_ino_cb, &vino);
> > > > >         if (!inode)
> > > > >                 return ERR_PTR(-ENOMEM);
> > > > >         if (inode->i_state & I_NEW)
> > > > >                 dout("get_inode created new inode %p %llx.%llx ino %llx\n",
> > > > > -                    inode, ceph_vinop(inode), (u64)inode->i_ino);
> > > > > +                    inode, ceph_vinop(inode), ceph_present_inode(inode));
> > > > > 
> > > > > -       dout("get_inode on %lu=%llx.%llx got %p\n", inode->i_ino, vino.ino,
> > > > > -            vino.snap, inode);
> > > > > +       dout("get_inode on %llu=%llx.%llx got %p\n", ceph_present_inode(inode),
> > > > > +            vino.ino, vino.snap, inode);
> > > > >         return inode;
> > > > >  }
> > > > > 
> > > > > @@ -2378,7 +2380,7 @@ int ceph_getattr(const struct path *path, struct kstat *stat,
> > > > >         }
> > > > > 
> > > > >         generic_fillattr(inode, stat);
> > > > > -       stat->ino = ceph_translate_ino(inode->i_sb, inode->i_ino);
> > > > > +       stat->ino = ceph_present_inode(inode);
> > > > > 
> > > > >         /*
> > > > >          * btime on newly-allocated inodes is 0, so if this is still set to
> > > > > diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
> > > > > index bc9e95937d7c..658800605bfb 100644
> > > > > --- a/fs/ceph/mds_client.h
> > > > > +++ b/fs/ceph/mds_client.h
> > > > > @@ -372,7 +372,7 @@ struct ceph_quotarealm_inode {
> > > > > 
> > > > >  struct cap_wait {
> > > > >         struct list_head        list;
> > > > > -       unsigned long           ino;
> > > > > +       u64                     ino;
> > > > >         pid_t                   tgid;
> > > > >         int                     need;
> > > > >         int                     want;
> > > > > diff --git a/fs/ceph/quota.c b/fs/ceph/quota.c
> > > > > index 198ddde5c1e6..cc2c4d40b022 100644
> > > > > --- a/fs/ceph/quota.c
> > > > > +++ b/fs/ceph/quota.c
> > > > > @@ -23,12 +23,12 @@ static inline bool ceph_has_realms_with_quotas(struct inode *inode)
> > > > >  {
> > > > >         struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc;
> > > > >         struct super_block *sb = mdsc->fsc->sb;
> > > > > +       struct inode *root = d_inode(sb->s_root);
> > > > > 
> > > > >         if (atomic64_read(&mdsc->quotarealms_count) > 0)
> > > > >                 return true;
> > > > >         /* if root is the real CephFS root, we don't have quota realms */
> > > > > -       if (sb->s_root->d_inode &&
> > > > > -           (sb->s_root->d_inode->i_ino == CEPH_INO_ROOT))
> > > > > +       if (root && ceph_ino(root) == CEPH_INO_ROOT)
> > > > >                 return false;
> > > > >         /* otherwise, we can't know for sure */
> > > > >         return true;
> > > > > diff --git a/fs/ceph/super.h b/fs/ceph/super.h
> > > > > index 4c3c964b1c54..0dd272a4a410 100644
> > > > > --- a/fs/ceph/super.h
> > > > > +++ b/fs/ceph/super.h
> > > > > @@ -457,15 +457,7 @@ ceph_vino(const struct inode *inode)
> > > > >         return ceph_inode(inode)->i_vino;
> > > > >  }
> > > > > 
> > > > > -/*
> > > > > - * ino_t is <64 bits on many architectures, blech.
> > > > > - *
> > > > > - *               i_ino (kernel inode)   st_ino (userspace)
> > > > > - * i386          32                     32
> > > > > - * x86_64+ino32  64                     32
> > > > > - * x86_64        64                     64
> > > > > - */
> > > > > -static inline u32 ceph_ino_to_ino32(__u64 vino)
> > > > > +static inline u32 ceph_ino_to_ino32(u64 vino)
> > > > >  {
> > > > >         u32 ino = vino & 0xffffffff;
> > > > >         ino ^= vino >> 32;
> > > > > @@ -474,36 +466,13 @@ static inline u32 ceph_ino_to_ino32(__u64 vino)
> > > > >         return ino;
> > > > >  }
> > > > > 
> > > > > -/*
> > > > > - * kernel i_ino value
> > > > > - */
> > > > >  static inline ino_t ceph_vino_to_ino(struct ceph_vino vino)
> > > > >  {
> > > > > -#if BITS_PER_LONG == 32
> > > > > -       return ceph_ino_to_ino32(vino.ino);
> > > > > -#else
> > > > > +       if (sizeof(ino_t) == sizeof(u32))
> > > > > +               return ceph_ino_to_ino32(vino.ino);
> > > > >         return (ino_t)vino.ino;
> > > > > -#endif
> > > > >  }
> > > > > 
> > > > > -/*
> > > > > - * user-visible ino (stat, filldir)
> > > > > - */
> > > > > -#if BITS_PER_LONG == 32
> > > > > -static inline ino_t ceph_translate_ino(struct super_block *sb, ino_t ino)
> > > > > -{
> > > > > -       return ino;
> > > > > -}
> > > > > -#else
> > > > > -static inline ino_t ceph_translate_ino(struct super_block *sb, ino_t ino)
> > > > > -{
> > > > > -       if (ceph_test_mount_opt(ceph_sb_to_client(sb), INO32))
> > > > > -               ino = ceph_ino_to_ino32(ino);
> > > > > -       return ino;
> > > > > -}
> > > > > -#endif
> > > > > -
> > > > > -
> > > > >  /* for printf-style formatting */
> > > > >  #define ceph_vinop(i) ceph_inode(i)->i_vino.ino, ceph_inode(i)->i_vino.snap
> > > > > 
> > > > > @@ -511,11 +480,34 @@ static inline u64 ceph_ino(struct inode *inode)
> > > > >  {
> > > > >         return ceph_inode(inode)->i_vino.ino;
> > > > >  }
> > > > > +
> > > > >  static inline u64 ceph_snap(struct inode *inode)
> > > > >  {
> > > > >         return ceph_inode(inode)->i_vino.snap;
> > > > >  }
> > > > > 
> > > > > +/**
> > > > > + * ceph_present_ino - format an inode number for presentation to userland
> > > > > + * @sb: superblock where the inode lives
> > > > > + * @ino: inode number to (possibly) convert
> > > > > + *
> > > > > + * If the user mounted with the ino32 option, then the 64-bit value needs
> > > > > + * to be converted to something that can fit inside 32 bits. Note that
> > > > > + * internal kernel code never uses this value, so this is entirely for
> > > > > + * userland consumption.
> > > > > + */
> > > > > +static inline u64 ceph_present_ino(struct super_block *sb, u64 ino)
> > > > > +{
> > > > > +       if (unlikely(ceph_test_mount_opt(ceph_sb_to_client(sb), INO32)))
> > > > > +               return ceph_ino_to_ino32(ino);
> > > > > +       return ino;
> > > > > +}
> > > > > +
> > > > > +static inline u64 ceph_present_inode(struct inode *inode)
> > > > > +{
> > > > > +       return ceph_present_ino(inode->i_sb, ceph_ino(inode));
> > > > > +}
> > > > > +
> > > > >  static inline int ceph_ino_compare(struct inode *inode, void *data)
> > > > >  {
> > > > >         struct ceph_vino *pvino = (struct ceph_vino *)data;
> > > > > @@ -527,8 +519,7 @@ static inline int ceph_ino_compare(struct inode *inode, void *data)
> > > > >  static inline struct inode *ceph_find_inode(struct super_block *sb,
> > > > >                                             struct ceph_vino vino)
> > > > >  {
> > > > > -       ino_t t = ceph_vino_to_ino(vino);
> > > > > -       return ilookup5(sb, t, ceph_ino_compare, &vino);
> > > > > +       return ilookup5(sb, (unsigned long)vino.ino, ceph_ino_compare, &vino);
> > > > 
> > > > What is the rationale for this bit?  There is a paragraph in the
> > > > changelog that mentions that "low-order bits are the most likely
> > > > to vary anyway", but this way we seem to have three sets of values
> > > > instead of two: the real 64-bit values, the ino_t values and the
> > > > truncated values.
> > > > 
> > > 
> > > Yep. It's a bit confusing, but we have:
> > > 
> > > 1) the real 64-bit values: these are what we generally want to use
> > > everywhere internally in ceph.ko
> > > 
> > > 2) the presentation value: the values that get reported in getattr and
> > > readdir to userland. This is usually the same as the real 64-bit value
> > > unless you mount with -o ino32.
> > > 
> > > 3) inode->i_ino: we'll no longer use this value for anything in ceph,
> > > but we want to put _something_ in there so things like tracepoints that
> > > display inode->i_ino don't show all 0's.
> > > 
> > > The second arg to ilookup5 is just the value that the ilookup5/iget5 use
> > > to determine the hash bucket to use. This value won't ever be used for
> > > anything beyond that. Most existing filesystems just use the inode
> > > number for this.
> > > 
> > > So the rationale is that while we could just use the same value as what
> > > ends up in inode->i_ino, that would mean that arches like s390x would
> > > end up passing down a 32-bit value here when they could pass in the full
> > > 64 bits. 32 bit arches might end up just sending the low-order bits, but
> > > this gets run through hash() anyway, so I doubt it will harm anything.
> > 
> > I guess I still I don't see a good reason to special case this for
> > 64-bit arches with 32-bit ino_t (just s390x?) and I'd keep it as is.
> > It's just more consistent and less to keep in mind.
> > 
> > This is just a feeling and not a strong opinion though, ultimately
> > it doesn't seem to matter too much.  If you go ahead with truncating
> > to BITS_PER_LONG, I'd suggest turning your last paragraph into a
> > comment for ceph_find_inode().
> > 
> 
> I guess it comes down to what makes a better hashval for the inode table
> on arches with a 32-bit long? Should we just use the lowest bits of the
> inode number or do a ceph_ino_to_ino32 conversion first?
> 
> My rationale was that we're going to end up hashing this value through
> hash() in fs/inode.c anyway, so why bother hashing it down into 32 bits
> first? Most inodes (at least for a single MDS) end up clustered together
> so that the upper 32 bits are not generally significant.
> 
> We might end up with collisions when 2 MDS's generate 2 inode numbers
> with the lower 32 bits all the same, but that's not a huge problem. The
> hash chain just ends up being a little longer than is optimal.
> 

Sorry, Ilya...I didn't answer your original question as to whether this
is just for s390x. Ulrich said in a different email thread:

> The background is as follows: on s390x the kernel's ino_t is a 32-bit
> type, even though we're a 64-bit platform. This is an unfortunate
> historical accident that cannot easily be fixed now due to userspace
> ABI stability concerns. (Note that there is one other 64-bit platform,
> alpha, that has the same issue.)

So to be clear, this isn't special-casing s390x. This change allows
s390x (and I guess alpha) to use the full inode number as a hashval, as
that value is an unsigned long.

Arches with a 32-bit long are changed from using the 32-bit hashed
version of the inode number as a to just passing in the lowest bits of
the actual 64-bit inode number as a hashval. My sense is that that's
fine given that the value goes through a separate hashing step anyway
when we call ilookup5, iget5, etc.

This is all confusing though, so I'll add some comments resend a v4
version.

Thanks for the review so far!
-- 
Jeff Layton <jlayton@kernel.org>


^ permalink raw reply	[flat|nested] 14+ messages in thread

* [PATCH] ceph: fix inode number handling on arches with 32-bit ino_t
  2020-08-18 16:23 [PATCH] ceph: fix inode number presentation on 32-bit platforms and s390x Jeff Layton
                   ` (2 preceding siblings ...)
  2020-08-19 12:35 ` [PATCH v3] " Jeff Layton
@ 2020-08-19 15:16 ` Jeff Layton
  2020-08-20 21:30   ` Jeff Layton
  2020-08-24 13:38   ` Yan, Zheng
  3 siblings, 2 replies; 14+ messages in thread
From: Jeff Layton @ 2020-08-19 15:16 UTC (permalink / raw)
  To: ceph-devel; +Cc: Ulrich.Weigand, Tuan.Hoang1, idryomov

Tuan and Ulrich mentioned that they were hitting a problem on s390x,
which has a 32-bit ino_t value, even though it's a 64-bit arch (for
historical reasons).

I think the current handling of inode numbers in the ceph driver is
wrong. It tries to use 32-bit inode numbers on 32-bit arches, but that's
actually not a problem. 32-bit arches can deal with 64-bit inode numbers
just fine when userland code is compiled with LFS support (the common
case these days).

What we really want to do is just use 64-bit numbers everywhere, unless
someone has mounted with the ino32 mount option. In that case, we want
to ensure that we hash the inode number down to something that will fit
in 32 bits before presenting the value to userland.

Add new helper functions that do this, and only do the conversion before
presenting these values to userland in getattr and readdir.

The inode table hashvalue is changed to just cast the inode number to
unsigned long, as low-order bits are the most likely to vary anyway.

While it's not strictly required, we do want to put something in
inode->i_ino. Instead of basing it on BITS_PER_LONG, however, base it on
the size of the ino_t type.

Reported-by: Tuan Hoang1 <Tuan.Hoang1@ibm.com>
Reported-by: Ulrich Weigand <Ulrich.Weigand@de.ibm.com>
URL: https://tracker.ceph.com/issues/46828
Signed-off-by: Jeff Layton <jlayton@kernel.org>
---
 fs/ceph/caps.c       | 14 ++++-----
 fs/ceph/debugfs.c    |  4 +--
 fs/ceph/dir.c        | 31 ++++++++-----------
 fs/ceph/file.c       |  4 +--
 fs/ceph/inode.c      | 19 ++++++------
 fs/ceph/mds_client.h |  2 +-
 fs/ceph/quota.c      |  4 +--
 fs/ceph/super.h      | 73 +++++++++++++++++++++++---------------------
 8 files changed, 74 insertions(+), 77 deletions(-)

v4:
- flesh out comments in super.h
- merge dout messages in ceph_get_inode
- rename ceph_vino_to_ino to ceph_vino_to_ino_t

v3:
- use ceph_ino instead of ceph_present_ino in most dout() messages

v2:
- fix dir_emit inode number for ".."
- fix ino_t size test

diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index 55ccccf77cea..034b3f4fdd3a 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -887,8 +887,8 @@ int __ceph_caps_issued_mask(struct ceph_inode_info *ci, int mask, int touch)
 	int have = ci->i_snap_caps;
 
 	if ((have & mask) == mask) {
-		dout("__ceph_caps_issued_mask ino 0x%lx snap issued %s"
-		     " (mask %s)\n", ci->vfs_inode.i_ino,
+		dout("__ceph_caps_issued_mask ino 0x%llx snap issued %s"
+		     " (mask %s)\n", ceph_ino(&ci->vfs_inode),
 		     ceph_cap_string(have),
 		     ceph_cap_string(mask));
 		return 1;
@@ -899,8 +899,8 @@ int __ceph_caps_issued_mask(struct ceph_inode_info *ci, int mask, int touch)
 		if (!__cap_is_valid(cap))
 			continue;
 		if ((cap->issued & mask) == mask) {
-			dout("__ceph_caps_issued_mask ino 0x%lx cap %p issued %s"
-			     " (mask %s)\n", ci->vfs_inode.i_ino, cap,
+			dout("__ceph_caps_issued_mask ino 0x%llx cap %p issued %s"
+			     " (mask %s)\n", ceph_ino(&ci->vfs_inode), cap,
 			     ceph_cap_string(cap->issued),
 			     ceph_cap_string(mask));
 			if (touch)
@@ -911,8 +911,8 @@ int __ceph_caps_issued_mask(struct ceph_inode_info *ci, int mask, int touch)
 		/* does a combination of caps satisfy mask? */
 		have |= cap->issued;
 		if ((have & mask) == mask) {
-			dout("__ceph_caps_issued_mask ino 0x%lx combo issued %s"
-			     " (mask %s)\n", ci->vfs_inode.i_ino,
+			dout("__ceph_caps_issued_mask ino 0x%llx combo issued %s"
+			     " (mask %s)\n", ceph_ino(&ci->vfs_inode),
 			     ceph_cap_string(cap->issued),
 			     ceph_cap_string(mask));
 			if (touch) {
@@ -2872,7 +2872,7 @@ int ceph_get_caps(struct file *filp, int need, int want,
 			struct cap_wait cw;
 			DEFINE_WAIT_FUNC(wait, woken_wake_function);
 
-			cw.ino = inode->i_ino;
+			cw.ino = ceph_ino(inode);
 			cw.tgid = current->tgid;
 			cw.need = need;
 			cw.want = want;
diff --git a/fs/ceph/debugfs.c b/fs/ceph/debugfs.c
index 97539b497e4c..3e3fcda9b276 100644
--- a/fs/ceph/debugfs.c
+++ b/fs/ceph/debugfs.c
@@ -202,7 +202,7 @@ static int caps_show_cb(struct inode *inode, struct ceph_cap *cap, void *p)
 {
 	struct seq_file *s = p;
 
-	seq_printf(s, "0x%-17lx%-17s%-17s\n", inode->i_ino,
+	seq_printf(s, "0x%-17llx%-17s%-17s\n", ceph_ino(inode),
 		   ceph_cap_string(cap->issued),
 		   ceph_cap_string(cap->implemented));
 	return 0;
@@ -247,7 +247,7 @@ static int caps_show(struct seq_file *s, void *p)
 
 	spin_lock(&mdsc->caps_list_lock);
 	list_for_each_entry(cw, &mdsc->cap_wait_list, list) {
-		seq_printf(s, "%-13d0x%-17lx%-17s%-17s\n", cw->tgid, cw->ino,
+		seq_printf(s, "%-13d0x%-17llx%-17s%-17s\n", cw->tgid, cw->ino,
 				ceph_cap_string(cw->need),
 				ceph_cap_string(cw->want));
 	}
diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c
index 060bdcc5ce32..040eaad9d063 100644
--- a/fs/ceph/dir.c
+++ b/fs/ceph/dir.c
@@ -259,9 +259,7 @@ static int __dcache_readdir(struct file *file,  struct dir_context *ctx,
 			     dentry, dentry, d_inode(dentry));
 			ctx->pos = di->offset;
 			if (!dir_emit(ctx, dentry->d_name.name,
-				      dentry->d_name.len,
-				      ceph_translate_ino(dentry->d_sb,
-							 d_inode(dentry)->i_ino),
+				      dentry->d_name.len, ceph_present_inode(d_inode(dentry)),
 				      d_inode(dentry)->i_mode >> 12)) {
 				dput(dentry);
 				err = 0;
@@ -324,18 +322,21 @@ static int ceph_readdir(struct file *file, struct dir_context *ctx)
 	/* always start with . and .. */
 	if (ctx->pos == 0) {
 		dout("readdir off 0 -> '.'\n");
-		if (!dir_emit(ctx, ".", 1, 
-			    ceph_translate_ino(inode->i_sb, inode->i_ino),
+		if (!dir_emit(ctx, ".", 1, ceph_present_inode(inode),
 			    inode->i_mode >> 12))
 			return 0;
 		ctx->pos = 1;
 	}
 	if (ctx->pos == 1) {
-		ino_t ino = parent_ino(file->f_path.dentry);
+		u64 ino;
+		struct dentry *dentry = file->f_path.dentry;
+
+		spin_lock(&dentry->d_lock);
+		ino = ceph_present_inode(dentry->d_parent->d_inode);
+		spin_unlock(&dentry->d_lock);
+
 		dout("readdir off 1 -> '..'\n");
-		if (!dir_emit(ctx, "..", 2,
-			    ceph_translate_ino(inode->i_sb, ino),
-			    inode->i_mode >> 12))
+		if (!dir_emit(ctx, "..", 2, ino, inode->i_mode >> 12))
 			return 0;
 		ctx->pos = 2;
 	}
@@ -507,9 +508,6 @@ static int ceph_readdir(struct file *file, struct dir_context *ctx)
 	}
 	for (; i < rinfo->dir_nr; i++) {
 		struct ceph_mds_reply_dir_entry *rde = rinfo->dir_entries + i;
-		struct ceph_vino vino;
-		ino_t ino;
-		u32 ftype;
 
 		BUG_ON(rde->offset < ctx->pos);
 
@@ -519,13 +517,10 @@ static int ceph_readdir(struct file *file, struct dir_context *ctx)
 		     rde->name_len, rde->name, &rde->inode.in);
 
 		BUG_ON(!rde->inode.in);
-		ftype = le32_to_cpu(rde->inode.in->mode) >> 12;
-		vino.ino = le64_to_cpu(rde->inode.in->ino);
-		vino.snap = le64_to_cpu(rde->inode.in->snapid);
-		ino = ceph_vino_to_ino(vino);
 
 		if (!dir_emit(ctx, rde->name, rde->name_len,
-			      ceph_translate_ino(inode->i_sb, ino), ftype)) {
+			      ceph_present_ino(inode->i_sb, le64_to_cpu(rde->inode.in->ino)),
+			      le32_to_cpu(rde->inode.in->mode) >> 12)) {
 			dout("filldir stopping us...\n");
 			return 0;
 		}
@@ -1161,7 +1156,7 @@ static int ceph_unlink(struct inode *dir, struct dentry *dentry)
 
 	if (try_async && op == CEPH_MDS_OP_UNLINK &&
 	    (req->r_dir_caps = get_caps_for_async_unlink(dir, dentry))) {
-		dout("async unlink on %lu/%.*s caps=%s", dir->i_ino,
+		dout("async unlink on %llu/%.*s caps=%s", ceph_ino(dir),
 		     dentry->d_name.len, dentry->d_name.name,
 		     ceph_cap_string(req->r_dir_caps));
 		set_bit(CEPH_MDS_R_ASYNC, &req->r_req_flags);
diff --git a/fs/ceph/file.c b/fs/ceph/file.c
index 28714934ced7..27c7047c9383 100644
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -628,8 +628,8 @@ static int ceph_finish_async_create(struct inode *dir, struct dentry *dentry,
 	} else {
 		struct dentry *dn;
 
-		dout("%s d_adding new inode 0x%llx to 0x%lx/%s\n", __func__,
-			vino.ino, dir->i_ino, dentry->d_name.name);
+		dout("%s d_adding new inode 0x%llx to 0x%llx/%s\n", __func__,
+			vino.ino, ceph_ino(dir), dentry->d_name.name);
 		ceph_dir_clear_ordered(dir);
 		ceph_init_inode_acls(inode, as_ctx);
 		if (inode->i_state & I_NEW) {
diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
index 357c937699d5..d163fa96cb40 100644
--- a/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@ -41,8 +41,10 @@ static void ceph_inode_work(struct work_struct *work);
  */
 static int ceph_set_ino_cb(struct inode *inode, void *data)
 {
-	ceph_inode(inode)->i_vino = *(struct ceph_vino *)data;
-	inode->i_ino = ceph_vino_to_ino(*(struct ceph_vino *)data);
+	struct ceph_inode_info *ci = ceph_inode(inode);
+
+	ci->i_vino = *(struct ceph_vino *)data;
+	inode->i_ino = ceph_vino_to_ino_t(ci->i_vino);
 	inode_set_iversion_raw(inode, 0);
 	return 0;
 }
@@ -50,17 +52,14 @@ static int ceph_set_ino_cb(struct inode *inode, void *data)
 struct inode *ceph_get_inode(struct super_block *sb, struct ceph_vino vino)
 {
 	struct inode *inode;
-	ino_t t = ceph_vino_to_ino(vino);
 
-	inode = iget5_locked(sb, t, ceph_ino_compare, ceph_set_ino_cb, &vino);
+	inode = iget5_locked(sb, (unsigned long)vino.ino, ceph_ino_compare,
+			     ceph_set_ino_cb, &vino);
 	if (!inode)
 		return ERR_PTR(-ENOMEM);
-	if (inode->i_state & I_NEW)
-		dout("get_inode created new inode %p %llx.%llx ino %llx\n",
-		     inode, ceph_vinop(inode), (u64)inode->i_ino);
 
-	dout("get_inode on %lu=%llx.%llx got %p\n", inode->i_ino, vino.ino,
-	     vino.snap, inode);
+	dout("get_inode on %llu=%llx.%llx got %p new %d\n", ceph_present_inode(inode),
+	     ceph_vinop(inode), inode, !!(inode->i_state & I_NEW));
 	return inode;
 }
 
@@ -2378,7 +2377,7 @@ int ceph_getattr(const struct path *path, struct kstat *stat,
 	}
 
 	generic_fillattr(inode, stat);
-	stat->ino = ceph_translate_ino(inode->i_sb, inode->i_ino);
+	stat->ino = ceph_present_inode(inode);
 
 	/*
 	 * btime on newly-allocated inodes is 0, so if this is still set to
diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
index bc9e95937d7c..658800605bfb 100644
--- a/fs/ceph/mds_client.h
+++ b/fs/ceph/mds_client.h
@@ -372,7 +372,7 @@ struct ceph_quotarealm_inode {
 
 struct cap_wait {
 	struct list_head	list;
-	unsigned long		ino;
+	u64			ino;
 	pid_t			tgid;
 	int			need;
 	int			want;
diff --git a/fs/ceph/quota.c b/fs/ceph/quota.c
index 198ddde5c1e6..cc2c4d40b022 100644
--- a/fs/ceph/quota.c
+++ b/fs/ceph/quota.c
@@ -23,12 +23,12 @@ static inline bool ceph_has_realms_with_quotas(struct inode *inode)
 {
 	struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc;
 	struct super_block *sb = mdsc->fsc->sb;
+	struct inode *root = d_inode(sb->s_root);
 
 	if (atomic64_read(&mdsc->quotarealms_count) > 0)
 		return true;
 	/* if root is the real CephFS root, we don't have quota realms */
-	if (sb->s_root->d_inode &&
-	    (sb->s_root->d_inode->i_ino == CEPH_INO_ROOT))
+	if (root && ceph_ino(root) == CEPH_INO_ROOT)
 		return false;
 	/* otherwise, we can't know for sure */
 	return true;
diff --git a/fs/ceph/super.h b/fs/ceph/super.h
index 4c3c964b1c54..a3995ebe0623 100644
--- a/fs/ceph/super.h
+++ b/fs/ceph/super.h
@@ -457,15 +457,7 @@ ceph_vino(const struct inode *inode)
 	return ceph_inode(inode)->i_vino;
 }
 
-/*
- * ino_t is <64 bits on many architectures, blech.
- *
- *               i_ino (kernel inode)   st_ino (userspace)
- * i386          32                     32
- * x86_64+ino32  64                     32
- * x86_64        64                     64
- */
-static inline u32 ceph_ino_to_ino32(__u64 vino)
+static inline u32 ceph_ino_to_ino32(u64 vino)
 {
 	u32 ino = vino & 0xffffffff;
 	ino ^= vino >> 32;
@@ -475,34 +467,17 @@ static inline u32 ceph_ino_to_ino32(__u64 vino)
 }
 
 /*
- * kernel i_ino value
+ * Inode numbers in cephfs are 64 bits, but inode->i_ino is 32-bits on
+ * some arches. We generally do not use this value inside the ceph driver, but
+ * we do want to set it to something, so that generic vfs code has an
+ * appropriate value for tracepoints and the like.
  */
-static inline ino_t ceph_vino_to_ino(struct ceph_vino vino)
+static inline ino_t ceph_vino_to_ino_t(struct ceph_vino vino)
 {
-#if BITS_PER_LONG == 32
-	return ceph_ino_to_ino32(vino.ino);
-#else
+	if (sizeof(ino_t) == sizeof(u32))
+		return ceph_ino_to_ino32(vino.ino);
 	return (ino_t)vino.ino;
-#endif
-}
-
-/*
- * user-visible ino (stat, filldir)
- */
-#if BITS_PER_LONG == 32
-static inline ino_t ceph_translate_ino(struct super_block *sb, ino_t ino)
-{
-	return ino;
-}
-#else
-static inline ino_t ceph_translate_ino(struct super_block *sb, ino_t ino)
-{
-	if (ceph_test_mount_opt(ceph_sb_to_client(sb), INO32))
-		ino = ceph_ino_to_ino32(ino);
-	return ino;
 }
-#endif
-
 
 /* for printf-style formatting */
 #define ceph_vinop(i) ceph_inode(i)->i_vino.ino, ceph_inode(i)->i_vino.snap
@@ -511,11 +486,34 @@ static inline u64 ceph_ino(struct inode *inode)
 {
 	return ceph_inode(inode)->i_vino.ino;
 }
+
 static inline u64 ceph_snap(struct inode *inode)
 {
 	return ceph_inode(inode)->i_vino.snap;
 }
 
+/**
+ * ceph_present_ino - format an inode number for presentation to userland
+ * @sb: superblock where the inode lives
+ * @ino: inode number to (possibly) convert
+ *
+ * If the user mounted with the ino32 option, then the 64-bit value needs
+ * to be converted to something that can fit inside 32 bits. Note that
+ * internal kernel code never uses this value, so this is entirely for
+ * userland consumption.
+ */
+static inline u64 ceph_present_ino(struct super_block *sb, u64 ino)
+{
+	if (unlikely(ceph_test_mount_opt(ceph_sb_to_client(sb), INO32)))
+		return ceph_ino_to_ino32(ino);
+	return ino;
+}
+
+static inline u64 ceph_present_inode(struct inode *inode)
+{
+	return ceph_present_ino(inode->i_sb, ceph_ino(inode));
+}
+
 static inline int ceph_ino_compare(struct inode *inode, void *data)
 {
 	struct ceph_vino *pvino = (struct ceph_vino *)data;
@@ -524,11 +522,16 @@ static inline int ceph_ino_compare(struct inode *inode, void *data)
 		ci->i_vino.snap == pvino->snap;
 }
 
+
 static inline struct inode *ceph_find_inode(struct super_block *sb,
 					    struct ceph_vino vino)
 {
-	ino_t t = ceph_vino_to_ino(vino);
-	return ilookup5(sb, t, ceph_ino_compare, &vino);
+	/*
+	 * NB: The hashval will be run through the fs/inode.c hash function
+	 * anyway, so there is no need to squash the inode number down to
+	 * 32-bits first. Just use low-order bits on arches with 32-bit long.
+	 */
+	return ilookup5(sb, (unsigned long)vino.ino, ceph_ino_compare, &vino);
 }
 
 
-- 
2.26.2


^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH] ceph: fix inode number handling on arches with 32-bit ino_t
  2020-08-19 15:16 ` [PATCH] " Jeff Layton
@ 2020-08-20 21:30   ` Jeff Layton
  2020-08-24 13:38   ` Yan, Zheng
  1 sibling, 0 replies; 14+ messages in thread
From: Jeff Layton @ 2020-08-20 21:30 UTC (permalink / raw)
  To: ceph-devel; +Cc: Ulrich.Weigand, Tuan.Hoang1, idryomov

On Wed, 2020-08-19 at 11:16 -0400, Jeff Layton wrote:
> Tuan and Ulrich mentioned that they were hitting a problem on s390x,
> which has a 32-bit ino_t value, even though it's a 64-bit arch (for
> historical reasons).
> 
> I think the current handling of inode numbers in the ceph driver is
> wrong. It tries to use 32-bit inode numbers on 32-bit arches, but that's
> actually not a problem. 32-bit arches can deal with 64-bit inode numbers
> just fine when userland code is compiled with LFS support (the common
> case these days).
> 
> What we really want to do is just use 64-bit numbers everywhere, unless
> someone has mounted with the ino32 mount option. In that case, we want
> to ensure that we hash the inode number down to something that will fit
> in 32 bits before presenting the value to userland.
> 
> Add new helper functions that do this, and only do the conversion before
> presenting these values to userland in getattr and readdir.
> 
> The inode table hashvalue is changed to just cast the inode number to
> unsigned long, as low-order bits are the most likely to vary anyway.
> 
> While it's not strictly required, we do want to put something in
> inode->i_ino. Instead of basing it on BITS_PER_LONG, however, base it on
> the size of the ino_t type.
> 
> Reported-by: Tuan Hoang1 <Tuan.Hoang1@ibm.com>
> Reported-by: Ulrich Weigand <Ulrich.Weigand@de.ibm.com>
> URL: https://tracker.ceph.com/issues/46828
> Signed-off-by: Jeff Layton <jlayton@kernel.org>
> ---
>  fs/ceph/caps.c       | 14 ++++-----
>  fs/ceph/debugfs.c    |  4 +--
>  fs/ceph/dir.c        | 31 ++++++++-----------
>  fs/ceph/file.c       |  4 +--
>  fs/ceph/inode.c      | 19 ++++++------
>  fs/ceph/mds_client.h |  2 +-
>  fs/ceph/quota.c      |  4 +--
>  fs/ceph/super.h      | 73 +++++++++++++++++++++++---------------------
>  8 files changed, 74 insertions(+), 77 deletions(-)
> 
> v4:
> - flesh out comments in super.h
> - merge dout messages in ceph_get_inode
> - rename ceph_vino_to_ino to ceph_vino_to_ino_t
> 
> v3:
> - use ceph_ino instead of ceph_present_ino in most dout() messages
> 
> v2:
> - fix dir_emit inode number for ".."
> - fix ino_t size test
> 

FWIW, I built an i386 VM and ran a kernel with this patch through
xfstests and it seems to be ok.

To be clear though, this _will_ be a user-visible change on 32-bit
arches:

1/ inode numbers will be seen to have changed between kernel versions.
32-bit arches will see large inode numbers now instead of the hashed
ones they saw before.

2/ any really old software not built with LFS support may start failing
stat() calls with -EOVERFLOW on inode numbers >2^32. Nothing much we can
do about these, but hopefully the intersection of people running such
code on ceph will be very small.

The workaround for both problems will be to mount with "-o ino32".
-- 
Jeff Layton <jlayton@kernel.org>


^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH] ceph: fix inode number handling on arches with 32-bit ino_t
  2020-08-19 15:16 ` [PATCH] " Jeff Layton
  2020-08-20 21:30   ` Jeff Layton
@ 2020-08-24 13:38   ` Yan, Zheng
  1 sibling, 0 replies; 14+ messages in thread
From: Yan, Zheng @ 2020-08-24 13:38 UTC (permalink / raw)
  To: Jeff Layton; +Cc: ceph-devel, Ulrich.Weigand, Tuan.Hoang1, Ilya Dryomov

On Wed, Aug 19, 2020 at 11:20 PM Jeff Layton <jlayton@kernel.org> wrote:
>
> Tuan and Ulrich mentioned that they were hitting a problem on s390x,
> which has a 32-bit ino_t value, even though it's a 64-bit arch (for
> historical reasons).
>
> I think the current handling of inode numbers in the ceph driver is
> wrong. It tries to use 32-bit inode numbers on 32-bit arches, but that's
> actually not a problem. 32-bit arches can deal with 64-bit inode numbers
> just fine when userland code is compiled with LFS support (the common
> case these days).
>
> What we really want to do is just use 64-bit numbers everywhere, unless
> someone has mounted with the ino32 mount option. In that case, we want
> to ensure that we hash the inode number down to something that will fit
> in 32 bits before presenting the value to userland.
>
> Add new helper functions that do this, and only do the conversion before
> presenting these values to userland in getattr and readdir.
>
> The inode table hashvalue is changed to just cast the inode number to
> unsigned long, as low-order bits are the most likely to vary anyway.
>
> While it's not strictly required, we do want to put something in
> inode->i_ino. Instead of basing it on BITS_PER_LONG, however, base it on
> the size of the ino_t type.
>
> Reported-by: Tuan Hoang1 <Tuan.Hoang1@ibm.com>
> Reported-by: Ulrich Weigand <Ulrich.Weigand@de.ibm.com>
> URL: https://tracker.ceph.com/issues/46828
> Signed-off-by: Jeff Layton <jlayton@kernel.org>
> ---
>  fs/ceph/caps.c       | 14 ++++-----
>  fs/ceph/debugfs.c    |  4 +--
>  fs/ceph/dir.c        | 31 ++++++++-----------
>  fs/ceph/file.c       |  4 +--
>  fs/ceph/inode.c      | 19 ++++++------
>  fs/ceph/mds_client.h |  2 +-
>  fs/ceph/quota.c      |  4 +--
>  fs/ceph/super.h      | 73 +++++++++++++++++++++++---------------------
>  8 files changed, 74 insertions(+), 77 deletions(-)
>
> v4:
> - flesh out comments in super.h
> - merge dout messages in ceph_get_inode
> - rename ceph_vino_to_ino to ceph_vino_to_ino_t
>
> v3:
> - use ceph_ino instead of ceph_present_ino in most dout() messages
>
> v2:
> - fix dir_emit inode number for ".."
> - fix ino_t size test
>
> diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
> index 55ccccf77cea..034b3f4fdd3a 100644
> --- a/fs/ceph/caps.c
> +++ b/fs/ceph/caps.c
> @@ -887,8 +887,8 @@ int __ceph_caps_issued_mask(struct ceph_inode_info *ci, int mask, int touch)
>         int have = ci->i_snap_caps;
>
>         if ((have & mask) == mask) {
> -               dout("__ceph_caps_issued_mask ino 0x%lx snap issued %s"
> -                    " (mask %s)\n", ci->vfs_inode.i_ino,
> +               dout("__ceph_caps_issued_mask ino 0x%llx snap issued %s"
> +                    " (mask %s)\n", ceph_ino(&ci->vfs_inode),
>                      ceph_cap_string(have),
>                      ceph_cap_string(mask));
>                 return 1;
> @@ -899,8 +899,8 @@ int __ceph_caps_issued_mask(struct ceph_inode_info *ci, int mask, int touch)
>                 if (!__cap_is_valid(cap))
>                         continue;
>                 if ((cap->issued & mask) == mask) {
> -                       dout("__ceph_caps_issued_mask ino 0x%lx cap %p issued %s"
> -                            " (mask %s)\n", ci->vfs_inode.i_ino, cap,
> +                       dout("__ceph_caps_issued_mask ino 0x%llx cap %p issued %s"
> +                            " (mask %s)\n", ceph_ino(&ci->vfs_inode), cap,
>                              ceph_cap_string(cap->issued),
>                              ceph_cap_string(mask));
>                         if (touch)
> @@ -911,8 +911,8 @@ int __ceph_caps_issued_mask(struct ceph_inode_info *ci, int mask, int touch)
>                 /* does a combination of caps satisfy mask? */
>                 have |= cap->issued;
>                 if ((have & mask) == mask) {
> -                       dout("__ceph_caps_issued_mask ino 0x%lx combo issued %s"
> -                            " (mask %s)\n", ci->vfs_inode.i_ino,
> +                       dout("__ceph_caps_issued_mask ino 0x%llx combo issued %s"
> +                            " (mask %s)\n", ceph_ino(&ci->vfs_inode),
>                              ceph_cap_string(cap->issued),
>                              ceph_cap_string(mask));
>                         if (touch) {
> @@ -2872,7 +2872,7 @@ int ceph_get_caps(struct file *filp, int need, int want,
>                         struct cap_wait cw;
>                         DEFINE_WAIT_FUNC(wait, woken_wake_function);
>
> -                       cw.ino = inode->i_ino;
> +                       cw.ino = ceph_ino(inode);
>                         cw.tgid = current->tgid;
>                         cw.need = need;
>                         cw.want = want;
> diff --git a/fs/ceph/debugfs.c b/fs/ceph/debugfs.c
> index 97539b497e4c..3e3fcda9b276 100644
> --- a/fs/ceph/debugfs.c
> +++ b/fs/ceph/debugfs.c
> @@ -202,7 +202,7 @@ static int caps_show_cb(struct inode *inode, struct ceph_cap *cap, void *p)
>  {
>         struct seq_file *s = p;
>
> -       seq_printf(s, "0x%-17lx%-17s%-17s\n", inode->i_ino,
> +       seq_printf(s, "0x%-17llx%-17s%-17s\n", ceph_ino(inode),
>                    ceph_cap_string(cap->issued),
>                    ceph_cap_string(cap->implemented));
>         return 0;
> @@ -247,7 +247,7 @@ static int caps_show(struct seq_file *s, void *p)
>
>         spin_lock(&mdsc->caps_list_lock);
>         list_for_each_entry(cw, &mdsc->cap_wait_list, list) {
> -               seq_printf(s, "%-13d0x%-17lx%-17s%-17s\n", cw->tgid, cw->ino,
> +               seq_printf(s, "%-13d0x%-17llx%-17s%-17s\n", cw->tgid, cw->ino,
>                                 ceph_cap_string(cw->need),
>                                 ceph_cap_string(cw->want));
>         }
> diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c
> index 060bdcc5ce32..040eaad9d063 100644
> --- a/fs/ceph/dir.c
> +++ b/fs/ceph/dir.c
> @@ -259,9 +259,7 @@ static int __dcache_readdir(struct file *file,  struct dir_context *ctx,
>                              dentry, dentry, d_inode(dentry));
>                         ctx->pos = di->offset;
>                         if (!dir_emit(ctx, dentry->d_name.name,
> -                                     dentry->d_name.len,
> -                                     ceph_translate_ino(dentry->d_sb,
> -                                                        d_inode(dentry)->i_ino),
> +                                     dentry->d_name.len, ceph_present_inode(d_inode(dentry)),
>                                       d_inode(dentry)->i_mode >> 12)) {
>                                 dput(dentry);
>                                 err = 0;
> @@ -324,18 +322,21 @@ static int ceph_readdir(struct file *file, struct dir_context *ctx)
>         /* always start with . and .. */
>         if (ctx->pos == 0) {
>                 dout("readdir off 0 -> '.'\n");
> -               if (!dir_emit(ctx, ".", 1,
> -                           ceph_translate_ino(inode->i_sb, inode->i_ino),
> +               if (!dir_emit(ctx, ".", 1, ceph_present_inode(inode),
>                             inode->i_mode >> 12))
>                         return 0;
>                 ctx->pos = 1;
>         }
>         if (ctx->pos == 1) {
> -               ino_t ino = parent_ino(file->f_path.dentry);
> +               u64 ino;
> +               struct dentry *dentry = file->f_path.dentry;
> +
> +               spin_lock(&dentry->d_lock);
> +               ino = ceph_present_inode(dentry->d_parent->d_inode);
> +               spin_unlock(&dentry->d_lock);
> +
>                 dout("readdir off 1 -> '..'\n");
> -               if (!dir_emit(ctx, "..", 2,
> -                           ceph_translate_ino(inode->i_sb, ino),
> -                           inode->i_mode >> 12))
> +               if (!dir_emit(ctx, "..", 2, ino, inode->i_mode >> 12))
>                         return 0;
>                 ctx->pos = 2;
>         }
> @@ -507,9 +508,6 @@ static int ceph_readdir(struct file *file, struct dir_context *ctx)
>         }
>         for (; i < rinfo->dir_nr; i++) {
>                 struct ceph_mds_reply_dir_entry *rde = rinfo->dir_entries + i;
> -               struct ceph_vino vino;
> -               ino_t ino;
> -               u32 ftype;
>
>                 BUG_ON(rde->offset < ctx->pos);
>
> @@ -519,13 +517,10 @@ static int ceph_readdir(struct file *file, struct dir_context *ctx)
>                      rde->name_len, rde->name, &rde->inode.in);
>
>                 BUG_ON(!rde->inode.in);
> -               ftype = le32_to_cpu(rde->inode.in->mode) >> 12;
> -               vino.ino = le64_to_cpu(rde->inode.in->ino);
> -               vino.snap = le64_to_cpu(rde->inode.in->snapid);
> -               ino = ceph_vino_to_ino(vino);
>
>                 if (!dir_emit(ctx, rde->name, rde->name_len,
> -                             ceph_translate_ino(inode->i_sb, ino), ftype)) {
> +                             ceph_present_ino(inode->i_sb, le64_to_cpu(rde->inode.in->ino)),
> +                             le32_to_cpu(rde->inode.in->mode) >> 12)) {
>                         dout("filldir stopping us...\n");
>                         return 0;
>                 }
> @@ -1161,7 +1156,7 @@ static int ceph_unlink(struct inode *dir, struct dentry *dentry)
>
>         if (try_async && op == CEPH_MDS_OP_UNLINK &&
>             (req->r_dir_caps = get_caps_for_async_unlink(dir, dentry))) {
> -               dout("async unlink on %lu/%.*s caps=%s", dir->i_ino,
> +               dout("async unlink on %llu/%.*s caps=%s", ceph_ino(dir),
>                      dentry->d_name.len, dentry->d_name.name,
>                      ceph_cap_string(req->r_dir_caps));
>                 set_bit(CEPH_MDS_R_ASYNC, &req->r_req_flags);
> diff --git a/fs/ceph/file.c b/fs/ceph/file.c
> index 28714934ced7..27c7047c9383 100644
> --- a/fs/ceph/file.c
> +++ b/fs/ceph/file.c
> @@ -628,8 +628,8 @@ static int ceph_finish_async_create(struct inode *dir, struct dentry *dentry,
>         } else {
>                 struct dentry *dn;
>
> -               dout("%s d_adding new inode 0x%llx to 0x%lx/%s\n", __func__,
> -                       vino.ino, dir->i_ino, dentry->d_name.name);
> +               dout("%s d_adding new inode 0x%llx to 0x%llx/%s\n", __func__,
> +                       vino.ino, ceph_ino(dir), dentry->d_name.name);
>                 ceph_dir_clear_ordered(dir);
>                 ceph_init_inode_acls(inode, as_ctx);
>                 if (inode->i_state & I_NEW) {
> diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
> index 357c937699d5..d163fa96cb40 100644
> --- a/fs/ceph/inode.c
> +++ b/fs/ceph/inode.c
> @@ -41,8 +41,10 @@ static void ceph_inode_work(struct work_struct *work);
>   */
>  static int ceph_set_ino_cb(struct inode *inode, void *data)
>  {
> -       ceph_inode(inode)->i_vino = *(struct ceph_vino *)data;
> -       inode->i_ino = ceph_vino_to_ino(*(struct ceph_vino *)data);
> +       struct ceph_inode_info *ci = ceph_inode(inode);
> +
> +       ci->i_vino = *(struct ceph_vino *)data;
> +       inode->i_ino = ceph_vino_to_ino_t(ci->i_vino);
>         inode_set_iversion_raw(inode, 0);
>         return 0;
>  }
> @@ -50,17 +52,14 @@ static int ceph_set_ino_cb(struct inode *inode, void *data)
>  struct inode *ceph_get_inode(struct super_block *sb, struct ceph_vino vino)
>  {
>         struct inode *inode;
> -       ino_t t = ceph_vino_to_ino(vino);
>
> -       inode = iget5_locked(sb, t, ceph_ino_compare, ceph_set_ino_cb, &vino);
> +       inode = iget5_locked(sb, (unsigned long)vino.ino, ceph_ino_compare,
> +                            ceph_set_ino_cb, &vino);
>         if (!inode)
>                 return ERR_PTR(-ENOMEM);
> -       if (inode->i_state & I_NEW)
> -               dout("get_inode created new inode %p %llx.%llx ino %llx\n",
> -                    inode, ceph_vinop(inode), (u64)inode->i_ino);
>
> -       dout("get_inode on %lu=%llx.%llx got %p\n", inode->i_ino, vino.ino,
> -            vino.snap, inode);
> +       dout("get_inode on %llu=%llx.%llx got %p new %d\n", ceph_present_inode(inode),
> +            ceph_vinop(inode), inode, !!(inode->i_state & I_NEW));
>         return inode;
>  }
>
> @@ -2378,7 +2377,7 @@ int ceph_getattr(const struct path *path, struct kstat *stat,
>         }
>
>         generic_fillattr(inode, stat);
> -       stat->ino = ceph_translate_ino(inode->i_sb, inode->i_ino);
> +       stat->ino = ceph_present_inode(inode);
>
>         /*
>          * btime on newly-allocated inodes is 0, so if this is still set to
> diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
> index bc9e95937d7c..658800605bfb 100644
> --- a/fs/ceph/mds_client.h
> +++ b/fs/ceph/mds_client.h
> @@ -372,7 +372,7 @@ struct ceph_quotarealm_inode {
>
>  struct cap_wait {
>         struct list_head        list;
> -       unsigned long           ino;
> +       u64                     ino;
>         pid_t                   tgid;
>         int                     need;
>         int                     want;
> diff --git a/fs/ceph/quota.c b/fs/ceph/quota.c
> index 198ddde5c1e6..cc2c4d40b022 100644
> --- a/fs/ceph/quota.c
> +++ b/fs/ceph/quota.c
> @@ -23,12 +23,12 @@ static inline bool ceph_has_realms_with_quotas(struct inode *inode)
>  {
>         struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc;
>         struct super_block *sb = mdsc->fsc->sb;
> +       struct inode *root = d_inode(sb->s_root);
>
>         if (atomic64_read(&mdsc->quotarealms_count) > 0)
>                 return true;
>         /* if root is the real CephFS root, we don't have quota realms */
> -       if (sb->s_root->d_inode &&
> -           (sb->s_root->d_inode->i_ino == CEPH_INO_ROOT))
> +       if (root && ceph_ino(root) == CEPH_INO_ROOT)
>                 return false;
>         /* otherwise, we can't know for sure */
>         return true;
> diff --git a/fs/ceph/super.h b/fs/ceph/super.h
> index 4c3c964b1c54..a3995ebe0623 100644
> --- a/fs/ceph/super.h
> +++ b/fs/ceph/super.h
> @@ -457,15 +457,7 @@ ceph_vino(const struct inode *inode)
>         return ceph_inode(inode)->i_vino;
>  }
>
> -/*
> - * ino_t is <64 bits on many architectures, blech.
> - *
> - *               i_ino (kernel inode)   st_ino (userspace)
> - * i386          32                     32
> - * x86_64+ino32  64                     32
> - * x86_64        64                     64
> - */
> -static inline u32 ceph_ino_to_ino32(__u64 vino)
> +static inline u32 ceph_ino_to_ino32(u64 vino)
>  {
>         u32 ino = vino & 0xffffffff;
>         ino ^= vino >> 32;
> @@ -475,34 +467,17 @@ static inline u32 ceph_ino_to_ino32(__u64 vino)
>  }
>
>  /*
> - * kernel i_ino value
> + * Inode numbers in cephfs are 64 bits, but inode->i_ino is 32-bits on
> + * some arches. We generally do not use this value inside the ceph driver, but
> + * we do want to set it to something, so that generic vfs code has an
> + * appropriate value for tracepoints and the like.
>   */
> -static inline ino_t ceph_vino_to_ino(struct ceph_vino vino)
> +static inline ino_t ceph_vino_to_ino_t(struct ceph_vino vino)
>  {
> -#if BITS_PER_LONG == 32
> -       return ceph_ino_to_ino32(vino.ino);
> -#else
> +       if (sizeof(ino_t) == sizeof(u32))
> +               return ceph_ino_to_ino32(vino.ino);
>         return (ino_t)vino.ino;
> -#endif
> -}
> -
> -/*
> - * user-visible ino (stat, filldir)
> - */
> -#if BITS_PER_LONG == 32
> -static inline ino_t ceph_translate_ino(struct super_block *sb, ino_t ino)
> -{
> -       return ino;
> -}
> -#else
> -static inline ino_t ceph_translate_ino(struct super_block *sb, ino_t ino)
> -{
> -       if (ceph_test_mount_opt(ceph_sb_to_client(sb), INO32))
> -               ino = ceph_ino_to_ino32(ino);
> -       return ino;
>  }
> -#endif
> -
>
>  /* for printf-style formatting */
>  #define ceph_vinop(i) ceph_inode(i)->i_vino.ino, ceph_inode(i)->i_vino.snap
> @@ -511,11 +486,34 @@ static inline u64 ceph_ino(struct inode *inode)
>  {
>         return ceph_inode(inode)->i_vino.ino;
>  }
> +
>  static inline u64 ceph_snap(struct inode *inode)
>  {
>         return ceph_inode(inode)->i_vino.snap;
>  }
>
> +/**
> + * ceph_present_ino - format an inode number for presentation to userland
> + * @sb: superblock where the inode lives
> + * @ino: inode number to (possibly) convert
> + *
> + * If the user mounted with the ino32 option, then the 64-bit value needs
> + * to be converted to something that can fit inside 32 bits. Note that
> + * internal kernel code never uses this value, so this is entirely for
> + * userland consumption.
> + */
> +static inline u64 ceph_present_ino(struct super_block *sb, u64 ino)
> +{
> +       if (unlikely(ceph_test_mount_opt(ceph_sb_to_client(sb), INO32)))
> +               return ceph_ino_to_ino32(ino);
> +       return ino;
> +}
> +
> +static inline u64 ceph_present_inode(struct inode *inode)
> +{
> +       return ceph_present_ino(inode->i_sb, ceph_ino(inode));
> +}
> +
>  static inline int ceph_ino_compare(struct inode *inode, void *data)
>  {
>         struct ceph_vino *pvino = (struct ceph_vino *)data;
> @@ -524,11 +522,16 @@ static inline int ceph_ino_compare(struct inode *inode, void *data)
>                 ci->i_vino.snap == pvino->snap;
>  }
>
> +
>  static inline struct inode *ceph_find_inode(struct super_block *sb,
>                                             struct ceph_vino vino)
>  {
> -       ino_t t = ceph_vino_to_ino(vino);
> -       return ilookup5(sb, t, ceph_ino_compare, &vino);
> +       /*
> +        * NB: The hashval will be run through the fs/inode.c hash function
> +        * anyway, so there is no need to squash the inode number down to
> +        * 32-bits first. Just use low-order bits on arches with 32-bit long.
> +        */
> +       return ilookup5(sb, (unsigned long)vino.ino, ceph_ino_compare, &vino);
>  }
>
    Reviewed-by: "Yan, Zheng" <zyan@redhat.com>


>
> --
> 2.26.2
>

^ permalink raw reply	[flat|nested] 14+ messages in thread

end of thread, other threads:[~2020-08-24 13:40 UTC | newest]

Thread overview: 14+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2020-08-18 16:23 [PATCH] ceph: fix inode number presentation on 32-bit platforms and s390x Jeff Layton
2020-08-18 16:36 ` Jeff Layton
2020-08-18 19:49 ` [PATCH v2] ceph: fix inode number handling on arches with 32-bit ino_t Jeff Layton
2020-08-19  1:58   ` Yan, Zheng
2020-08-19  7:17   ` Ilya Dryomov
2020-08-19 11:25     ` Jeff Layton
2020-08-19 13:28       ` Ilya Dryomov
2020-08-19 13:57         ` Jeff Layton
2020-08-19 14:25           ` Jeff Layton
2020-08-19 12:35 ` [PATCH v3] " Jeff Layton
2020-08-19 12:43   ` Jeff Layton
2020-08-19 15:16 ` [PATCH] " Jeff Layton
2020-08-20 21:30   ` Jeff Layton
2020-08-24 13:38   ` Yan, Zheng

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).