[3/3] shmem: Add support for using full width of ino_t

Message ID 533d188802d292fa9f7c9e66f26068000346d6c1.1577456898.git.chris@chrisdown.name
State Superseded
Series
  • fs: inode: shmem: Reduce risk of inum overflow

Commit Message

Chris Down Dec. 27, 2019, 2:30 p.m. UTC
The new inode64 option now uses get_next_ino_full, which always uses the
full width of ino_t (as opposed to get_next_ino, which always uses
unsigned int).

Using inode64 makes inode number wraparound significantly less likely,
at the cost of making features that rely on the underlying filesystem
leaving the highest 32 bits unset (e.g. overlayfs' xino) unusable.
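
(For illustration, roughly what the two allocators boil down to -- a
simplified sketch that ignores the percpu batching the real
get_next_ino() uses; get_next_ino_full() is introduced earlier in this
series:)

    /* Simplified sketch only, not the exact implementation. */
    unsigned int get_next_ino(void)
    {
            static atomic_t last_ino;       /* shared 32-bit pool */

            return atomic_inc_return(&last_ino);
    }

    ino_t get_next_ino_full(void)
    {
            static atomic64_t last_ino_full;        /* full width of ino_t */

            return atomic64_inc_return(&last_ino_full);
    }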

Signed-off-by: Chris Down <chris@chrisdown.name>
Reported-by: Phyllipe Medeiros <phyllipe@fb.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Amir Goldstein <amir73il@gmail.com>
Cc: Jeff Layton <jlayton@kernel.org>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Tejun Heo <tj@kernel.org>
Cc: linux-fsdevel@vger.kernel.org
Cc: linux-kernel@vger.kernel.org
Cc: kernel-team@fb.com
---
 include/linux/shmem_fs.h |  1 +
 mm/shmem.c               | 41 ++++++++++++++++++++++++++++++++++++++--
 2 files changed, 40 insertions(+), 2 deletions(-)

Comments

Amir Goldstein Dec. 27, 2019, 3:26 p.m. UTC | #1
On Fri, Dec 27, 2019 at 4:30 PM Chris Down <chris@chrisdown.name> wrote:
>
> The new inode64 option now uses get_next_ino_full, which always uses the
> full width of ino_t (as opposed to get_next_ino, which always uses
> unsigned int).
>
> Using inode64 makes inode number wraparound significantly less likely,
> at the cost of making features that rely on the underlying filesystem
> leaving the highest 32 bits unset (e.g. overlayfs' xino) unusable.

That's not an accurate statement. overlayfs xino just needs some high
bits available. Therefore I never had any objection to having tmpfs use
64-bit ino values (from overlayfs' perspective). My only objection is to
using the same pool "irresponsibly" instead of a per-sb pool for the
heavy users.
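
(For context, xino packs an fs/layer identifier into the topmost bits
of the underlying inode number; a simplified sketch of the idea, not
the actual overlayfs code:)

    /* Simplified sketch of xino packing, not the real overlayfs code. */
    static u64 xino_pack(u64 real_ino, unsigned int fsid, unsigned int xinobits)
    {
            /* Only the top xinobits of real_ino need to be unused. */
            if (real_ino >> (64 - xinobits))
                    return 0;       /* high bits in use: no xino, fall back */

            return real_ino | ((u64)fsid << (64 - xinobits));
    }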

>
> Signed-off-by: Chris Down <chris@chrisdown.name>
> Reported-by: Phyllipe Medeiros <phyllipe@fb.com>
> Cc: Al Viro <viro@zeniv.linux.org.uk>
> Cc: Matthew Wilcox <willy@infradead.org>
> Cc: Amir Goldstein <amir73il@gmail.com>
> Cc: Jeff Layton <jlayton@kernel.org>
> Cc: Johannes Weiner <hannes@cmpxchg.org>
> Cc: Tejun Heo <tj@kernel.org>
> Cc: linux-fsdevel@vger.kernel.org
> Cc: linux-kernel@vger.kernel.org
> Cc: kernel-team@fb.com
> ---
>  include/linux/shmem_fs.h |  1 +
>  mm/shmem.c               | 41 ++++++++++++++++++++++++++++++++++++++--
>  2 files changed, 40 insertions(+), 2 deletions(-)
>
> diff --git a/include/linux/shmem_fs.h b/include/linux/shmem_fs.h
> index de8e4b71e3ba..d7727d0d687f 100644
> --- a/include/linux/shmem_fs.h
> +++ b/include/linux/shmem_fs.h
> @@ -35,6 +35,7 @@ struct shmem_sb_info {
>         unsigned char huge;         /* Whether to try for hugepages */
>         kuid_t uid;                 /* Mount uid for root directory */
>         kgid_t gid;                 /* Mount gid for root directory */
> +       bool small_inums;           /* i_ino width unsigned int or ino_t */
>         struct mempolicy *mpol;     /* default memory policy for mappings */
>         spinlock_t shrinklist_lock;   /* Protects shrinklist */
>         struct list_head shrinklist;  /* List of shinkable inodes */
> diff --git a/mm/shmem.c b/mm/shmem.c
> index ff041cb15550..56cf581ec66d 100644
> --- a/mm/shmem.c
> +++ b/mm/shmem.c
> @@ -115,11 +115,13 @@ struct shmem_options {
>         kuid_t uid;
>         kgid_t gid;
>         umode_t mode;
> +       bool small_inums;
>         int huge;
>         int seen;
>  #define SHMEM_SEEN_BLOCKS 1
>  #define SHMEM_SEEN_INODES 2
>  #define SHMEM_SEEN_HUGE 4
> +#define SHMEM_SEEN_INUMS 8
>  };
>
>  #ifdef CONFIG_TMPFS
> @@ -2248,8 +2250,12 @@ static struct inode *shmem_get_inode(struct super_block *sb, const struct inode
>         inode = new_inode(sb);
>         if (inode) {
>                 /* Recycle to avoid 32-bit wraparound where possible */
> -               if (!inode->i_ino)
> -                       inode->i_ino = get_next_ino();
> +               if (!inode->i_ino) {
> +                       if (sbinfo->small_inums)
> +                               inode->i_ino = get_next_ino();
> +                       else
> +                               inode->i_ino = get_next_ino_full();
> +               }

Ouch! You set yourself a trap in patch #1 and stepped into it here.
The shmem driver has a single shmem_inode_cachep serving all tmpfs
instances, so you cannot use different ino allocators and still recycle
ino numbers from the same inode cache pool.
Sorry I did not see it coming...
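
(Spelling the trap out, a hypothetical interleaving with patch #1's
slab recycling in place:)

    /*
     * Hypothetical interleaving that hands out clashing inums:
     *
     *   sb A (inode64): i_ino = get_next_ino_full() -> 0x100000001
     *   sb A:           inode destroyed; the slab object keeps i_ino
     *   sb B (inode32): new_inode() reuses that object; i_ino != 0, so
     *                   0x100000001 is "recycled" into a mount whose
     *                   other inums come from the 32-bit pool, which may
     *                   independently hand out 0x1 -- the same value
     *                   truncated to 32 bits.
     */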

I'm afraid that the recycling method cannot work alongside a per-sb
ino allocator :/ (unless get_next_ino() were a truncated version of the
same counter as get_next_ino_full()).
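
(i.e. something like the following, where a recycled inum stays valid
in either pool because both are views of one counter:)

    /* Hypothetical: both allocators driven by one shared counter. */
    unsigned int get_next_ino(void)
    {
            return (unsigned int)get_next_ino_full();
    }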

You could still apply the recycling method if the shmem inode cache
were never tainted by any other ino allocator, but that would lead to
unpredictable results for users and behavior that is quite hard to
document.

IOW, I don't see a decent way out besides making the shmem ino
allocator properly per-sb, and doing that *before* adding the per-sb
mount option inode64.

And because of this, I think the global get_next_ino_full() API is
a mistake.

Thanks,
Amir.
Chris Down Dec. 27, 2019, 4:35 p.m. UTC | #2
Amir Goldstein writes:
>On Fri, Dec 27, 2019 at 4:30 PM Chris Down <chris@chrisdown.name> wrote:
>>
>> The new inode64 option now uses get_next_ino_full, which always uses the
>> full width of ino_t (as opposed to get_next_ino, which always uses
>> unsigned int).
>>
>> Using inode64 makes inode number wraparound significantly less likely,
>> at the cost of making features that rely on the underlying filesystem
>> leaving the highest 32 bits unset (e.g. overlayfs' xino) unusable.
>
>That's not an accurate statement. overlayfs xino just needs some high
>bits available. Therefore I never had any objection to having tmpfs use
>64-bit ino values (from overlayfs' perspective). My only objection is to
>using the same pool "irresponsibly" instead of a per-sb pool for the
>heavy users.

Per-sb get_next_ino is fine, but seems less important if inode64 is used. Or is 
your point about people who would still be using inode32?

I think things have become quite unclear in previous discussions, so I want to 
make sure we're all on the same page here. Are you saying you would 
theoretically ack the following series?

1. Recycle volatile slabs in tmpfs/hugetlbfs
2. Make get_next_ino per-sb
3. Make get_next_ino_full (which is also per-sb)
4. Add inode{32,64} to tmpfs

To keep this thread as high-signal as possible, I'll avoid sending any
other patches until I hear back on that :-)

Thanks again,

Chris
Amir Goldstein Dec. 28, 2019, 4:20 a.m. UTC | #3
On Fri, Dec 27, 2019 at 6:35 PM Chris Down <chris@chrisdown.name> wrote:
>
> Amir Goldstein writes:
> >On Fri, Dec 27, 2019 at 4:30 PM Chris Down <chris@chrisdown.name> wrote:
> >>
> >> The new inode64 option now uses get_next_ino_full, which always uses the
> >> full width of ino_t (as opposed to get_next_ino, which always uses
> >> unsigned int).
> >>
> >> Using inode64 makes inode number wraparound significantly less likely,
> >> at the cost of making features that rely on the underlying filesystem
> >> leaving the highest 32 bits unset (e.g. overlayfs' xino) unusable.
> >
> >That's not an accurate statement. overlayfs xino just needs some high
> >bits available. Therefore I never had any objection to having tmpfs use
> >64-bit ino values (from overlayfs' perspective). My only objection is to
> >using the same pool "irresponsibly" instead of a per-sb pool for the
> >heavy users.
>
> Per-sb get_next_ino is fine, but seems less important if inode64 is used. Or is
> your point about people who would still be using inode32?
>
> I think things have become quite unclear in previous discussions, so I want to
> make sure we're all on the same page here. Are you saying you would
> theoretically ack the following series?
>
> 1. Recycle volatile slabs in tmpfs/hugetlbfs
> 2. Make get_next_ino per-sb
> 3. Make get_next_ino_full (which is also per-sb)
> 4. Add inode{32,64} to tmpfs

Not what I meant. On the contrary:
1. Recycling ino from the slab is a nice idea, but it is not applicable
    alongside a per-sb ino allocator, so you can't use it for tmpfs
2. Leave get_next_ino() alone - it is used by things like pipe(2)
    that you don't want to mess with
3. Don't create another global ino allocator
4. An inode{32,64} option for tmpfs is the only thing you need

We've made quite a big mess of a problem that is not really that big.

In this thread on zhenbin's patch you have the simple solution that
Google is using for your problem:
https://patchwork.kernel.org/patch/11254001/#23014383

The only thing keeping this solution out of upstream, according to the
tmpfs maintainer, is the concern about breaking legacy 32-bit apps.

If you make exposing the high ino bits opt-in via a mount and/or
Kconfig option, then this concern would be mitigated and Google's
private solution to the tmpfs ino problem could go upstream.

Hugh did not specify whether sbinfo->next_ino is incremented under
sbinfo->stat_lock or some other lock (maybe he can share a link to the
actual patch?), but shmem_reserve_inode() already takes that lock
anyway, so I don't see the need for any further micro-optimizations.
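
(A minimal sketch of that per-sb scheme, assuming a hypothetical
sbinfo->next_ino field and eliding the internal-mount special cases of
the real shmem_reserve_inode():)

    /* Sketch only: per-sb inum allocation under the existing lock. */
    static int shmem_reserve_inode(struct super_block *sb, ino_t *inop)
    {
            struct shmem_sb_info *sbinfo = SHMEM_SB(sb);

            spin_lock(&sbinfo->stat_lock);
            if (!sbinfo->free_inodes) {
                    spin_unlock(&sbinfo->stat_lock);
                    return -ENOSPC;
            }
            sbinfo->free_inodes--;
            *inop = ++sbinfo->next_ino;     /* hypothetical per-sb counter */
            spin_unlock(&sbinfo->stat_lock);
            return 0;
    }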

Chris, I hope the solution I am proposing is clear now and I hope I am
not leading you by mistake into another trap...

To be clear, the solution should be dead simple and contained to
tmpfs. If you like, you could clone the exact same solution to
hugetlbfs, but no new vfs helpers please.

Thanks,
Amir.

Patch

diff --git a/include/linux/shmem_fs.h b/include/linux/shmem_fs.h
index de8e4b71e3ba..d7727d0d687f 100644
--- a/include/linux/shmem_fs.h
+++ b/include/linux/shmem_fs.h
@@ -35,6 +35,7 @@  struct shmem_sb_info {
 	unsigned char huge;	    /* Whether to try for hugepages */
 	kuid_t uid;		    /* Mount uid for root directory */
 	kgid_t gid;		    /* Mount gid for root directory */
+	bool small_inums;	    /* i_ino width unsigned int or ino_t */
 	struct mempolicy *mpol;     /* default memory policy for mappings */
 	spinlock_t shrinklist_lock;   /* Protects shrinklist */
 	struct list_head shrinklist;  /* List of shinkable inodes */
diff --git a/mm/shmem.c b/mm/shmem.c
index ff041cb15550..56cf581ec66d 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -115,11 +115,13 @@  struct shmem_options {
 	kuid_t uid;
 	kgid_t gid;
 	umode_t mode;
+	bool small_inums;
 	int huge;
 	int seen;
 #define SHMEM_SEEN_BLOCKS 1
 #define SHMEM_SEEN_INODES 2
 #define SHMEM_SEEN_HUGE 4
+#define SHMEM_SEEN_INUMS 8
 };
 
 #ifdef CONFIG_TMPFS
@@ -2248,8 +2250,12 @@  static struct inode *shmem_get_inode(struct super_block *sb, const struct inode
 	inode = new_inode(sb);
 	if (inode) {
 		/* Recycle to avoid 32-bit wraparound where possible */
-		if (!inode->i_ino)
-			inode->i_ino = get_next_ino();
+		if (!inode->i_ino) {
+			if (sbinfo->small_inums)
+				inode->i_ino = get_next_ino();
+			else
+				inode->i_ino = get_next_ino_full();
+		}
 		inode_init_owner(inode, dir, mode);
 		inode->i_blocks = 0;
 		inode->i_atime = inode->i_mtime = inode->i_ctime = current_time(inode);
@@ -3380,6 +3386,8 @@  enum shmem_param {
 	Opt_nr_inodes,
 	Opt_size,
 	Opt_uid,
+	Opt_inode32,
+	Opt_inode64,
 };
 
 static const struct fs_parameter_spec shmem_param_specs[] = {
@@ -3391,6 +3399,8 @@  static const struct fs_parameter_spec shmem_param_specs[] = {
 	fsparam_string("nr_inodes",	Opt_nr_inodes),
 	fsparam_string("size",		Opt_size),
 	fsparam_u32   ("uid",		Opt_uid),
+	fsparam_flag  ("inode32",	Opt_inode32),
+	fsparam_flag  ("inode64",	Opt_inode64),
 	{}
 };
 
@@ -3415,6 +3425,7 @@  static int shmem_parse_one(struct fs_context *fc, struct fs_parameter *param)
 	unsigned long long size;
 	char *rest;
 	int opt;
+	const char *err;
 
 	opt = fs_parse(fc, &shmem_fs_parameters, param, &result);
 	if (opt < 0)
@@ -3476,6 +3487,18 @@  static int shmem_parse_one(struct fs_context *fc, struct fs_parameter *param)
 			break;
 		}
 		goto unsupported_parameter;
+	case Opt_inode32:
+		ctx->small_inums = true;
+		ctx->seen |= SHMEM_SEEN_INUMS;
+		break;
+	case Opt_inode64:
+		if (sizeof(ino_t) < 8) {
+			err = "Cannot use inode64 with <64bit inums in kernel";
+			goto err_msg;
+		}
+		ctx->small_inums = false;
+		ctx->seen |= SHMEM_SEEN_INUMS;
+		break;
 	}
 	return 0;
 
@@ -3483,6 +3506,8 @@  static int shmem_parse_one(struct fs_context *fc, struct fs_parameter *param)
 	return invalf(fc, "tmpfs: Unsupported parameter '%s'", param->key);
 bad_value:
 	return invalf(fc, "tmpfs: Bad value for '%s'", param->key);
+err_msg:
+	return invalf(fc, "tmpfs: %s", err);
 }
 
 static int shmem_parse_options(struct fs_context *fc, void *data)
@@ -3567,6 +3592,15 @@  static int shmem_reconfigure(struct fs_context *fc)
 		}
 	}
 
+	/*
+	 * get_next_ino and get_next_ino_full have different static counters, so
+	 * it's not safe to change once we started or we could get duplicates
+	 */
+	if (ctx->seen & SHMEM_SEEN_INUMS) {
+		err = "Cannot retroactively change inode number size";
+		goto out;
+	}
+
 	if (ctx->seen & SHMEM_SEEN_HUGE)
 		sbinfo->huge = ctx->huge;
 	if (ctx->seen & SHMEM_SEEN_BLOCKS)
@@ -3608,6 +3642,7 @@  static int shmem_show_options(struct seq_file *seq, struct dentry *root)
 	if (!gid_eq(sbinfo->gid, GLOBAL_ROOT_GID))
 		seq_printf(seq, ",gid=%u",
 				from_kgid_munged(&init_user_ns, sbinfo->gid));
+	seq_printf(seq, ",inode%d", (sbinfo->small_inums ? 32 : 64));
 #ifdef CONFIG_TRANSPARENT_HUGE_PAGECACHE
 	/* Rightly or wrongly, show huge mount option unmasked by shmem_huge */
 	if (sbinfo->huge)
@@ -3667,6 +3702,7 @@  static int shmem_fill_super(struct super_block *sb, struct fs_context *fc)
 	sbinfo->free_inodes = sbinfo->max_inodes = ctx->inodes;
 	sbinfo->uid = ctx->uid;
 	sbinfo->gid = ctx->gid;
+	sbinfo->small_inums = ctx->small_inums;
 	sbinfo->mode = ctx->mode;
 	sbinfo->huge = ctx->huge;
 	sbinfo->mpol = ctx->mpol;
@@ -3879,6 +3915,7 @@  int shmem_init_fs_context(struct fs_context *fc)
 	ctx->mode = 0777 | S_ISVTX;
 	ctx->uid = current_fsuid();
 	ctx->gid = current_fsgid();
+	ctx->small_inums = true;
 
 	fc->fs_private = ctx;
 	fc->ops = &shmem_fs_context_ops;
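
(With this patch applied, the option is selected at mount time;
expected usage, with a hypothetical mount point:)

    # select full-width inode numbers at mount time
    mount -t tmpfs -o inode64 tmpfs /mnt/tmp

    # a later attempt to flip the width is rejected by
    # shmem_reconfigure() with
    # "tmpfs: Cannot retroactively change inode number size"
    mount -o remount,inode32 /mnt/tmp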