All of lore.kernel.org
 help / color / mirror / Atom feed
From: "J. R. Okajima" <hooanon05g@gmail.com>
To: linux-fsdevel@vger.kernel.org, dchinner@redhat.com,
	viro@zeniv.linux.org.uk, Eric Dumazet <edumazet@google.com>,
	Hugh Dickins <hughd@google.com>, Christoph Hellwig <hch@lst.de>,
	Andreas Dilger <adilger@dilger.ca>, Jan Kara <jack@suse.cz>
Subject: [RFC PATCH v4 1/2] tmpfs: manage the inode-number by IDR, signed int inum
Date: Thu,  5 Jun 2014 21:27:53 +0900	[thread overview]
Message-ID: <1401971274-8075-2-git-send-email-hooanon05g@gmail.com> (raw)
In-Reply-To: <1401971274-8075-1-git-send-email-hooanon05g@gmail.com>

To ensure the uniqueness of the inode-number, manage it by IDR.
Also it tries using the lowest unused inode-number, so the value will
usually be smaller.
Another side effect is the type of the inode-number in tmpfs. By using
IDR, it is limited to signed int. But I don't think it is a big
problem. INT_MAX is big enough for the number of inodes in a single tmpfs.

Performance comparison:
- test program: see below
- version: 3.15.0-rc7
- before this commit
  1 procs, 1048575/1048575 file, do unlink, 43.023 secs (usr 1.029 + sys 40.981)
  2 procs, 1048574/1048574 file, do unlink, 24.047 secs (usr 1.048 + sys 45.886)
  1 procs, 524286/524286 file, do unlink, 21.476 secs (usr 0.529 + sys 20.441)
  2 procs, 524286/524286 file, do unlink, 12.029 secs (usr 0.554 + sys 22.880)
  1 procs, 32766/32766 file, do unlink, 1.345 secs (usr 0.035 + sys 1.279)
  2 procs, 32766/32766 file, do unlink, 0.753 secs (usr 0.030 + sys 1.439)
- after this commit
  1 procs, 1048575/1048575 file, do unlink, 45.178 secs (usr 1.183 + sys 43.005)
  2 procs, 1048574/1048574 file, do unlink, 25.328 secs (usr 1.126 + sys 48.481)
  1 procs, 524286/524286 file, do unlink, 22.668 secs (usr 0.367 + sys 21.806)
  2 procs, 524286/524286 file, do unlink, 12.639 secs (usr 0.591 + sys 24.137)
  1 procs, 32766/32766 file, do unlink, 1.414 secs (usr 0.028 + sys 1.356)
  2 procs, 32766/32766 file, do unlink, 0.787 secs (usr 0.036 + sys 1.500)

The overhead surely exists, but it appears to be around 5% or less.

Test programs.
------- tmpfs-idr.sh -------
#!/bin/sh

set -eu

# Run the benchmark once per thread count, from 1 up to the number of
# online CPUs, each time on a freshly mounted tmpfs.
#   $1    - mount point
#   $2... - extra arguments handed to mount (e.g. -o size=...,nr_inodes=...)
# All expansions are quoted so that paths and options are passed through
# verbatim instead of being word-split or glob-expanded.
f() # dir [opts]
{
	local dir="$1"
	shift
	seq "$(getconf _NPROCESSORS_ONLN)" |
	while read -r ncpu
	do
		# "seq 1" yields the single value 1, i.e. one pass with do_unlink=1
		seq 1 |
		while read -r do_unlink
		do
			sudo mount -v -t tmpfs "$@" tmpfs "$dir"
			#stat -f $dir
			# the value reported by stat is passed as the file count
			free_inodes=$(stat -f -c %d "$dir")
			/tmp/tmpfs-idr "$dir" "$ncpu" "$free_inodes" "$do_unlink"
			sudo umount "$dir"
		done
	done
}

# Per-run scratch mount point, made unique with the shell's PID.
dir=/tmp/tmpfs-$$
mkdir "$dir"

# Record the kernel and memory situation next to the results.
uname -a
free -m

# One benchmark series per inode limit; the three largest limits are
# left disabled.
#f $dir -o size=50%,nr_inodes=$((0x7fffffff))
#f $dir -o size=50%,nr_inodes=$((0x07ffffff))
#f $dir -o size=50%,nr_inodes=$((0x007fffff))
f "$dir" -o size=50%,nr_inodes=$((0x00100000))
f "$dir" -o size=50%,nr_inodes=$((0x0007ffff))
f "$dir" -o size=50%,nr_inodes=$((0x00007fff))

rm -fr "$dir"
------- tmpfs-idr.c -------
#define _GNU_SOURCE
#include <pthread.h>

#include <sys/resource.h>
#include <sys/stat.h>
#include <sys/time.h>
#include <sys/types.h>
#include <assert.h>
#include <errno.h>
#include <fcntl.h>
#include <limits.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <unistd.h>

#ifndef O_PATH
#define O_PATH		010000000
#endif

/* Barrier waited on by every worker in f() and by run() before the timed loop. */
pthread_barrier_t barrier;
/*
 * Run parameters, filled in by main() from the command line:
 *   rootfd    - descriptor opened on the directory named by argv[1]
 *   nproc     - number of worker threads (argv[2])
 *   nfile     - files each worker tries to create (argv[3] divided by nproc)
 *   do_unlink - non-zero: unlink each file right after creating it (argv[4])
 */
int rootfd, nproc, nfile, do_unlink;

/*
 * Parse a non-negative integer command-line argument.
 *
 * The base is auto-detected by strtol() (decimal, 0x-prefixed hex,
 * 0-prefixed octal).  Malformed, negative or out-of-int-range input
 * aborts via assert(), like every other failure in this test program.
 *
 * Returns the parsed value.
 */
static int argton(const char *s)
{
	char *end;
	long l;

	errno = 0;
	l = strtol(s, &end, 0);
	assert(errno != ERANGE);
	/* reject an empty string and trailing garbage such as "12abc" */
	assert(end != s && *end == '\0');
	assert(l >= 0);
	/* the value is handed back as int: refuse anything that would be truncated */
	assert(l <= INT_MAX);

	return (int)l;
}

/*
 * Worker thread body.
 *
 * arg carries the worker's index (cast through long).  The worker creates
 * a directory named after that index under rootfd, waits on the shared
 * barrier, then creates up to nfile files named "0", "1", ... in its
 * directory, unlinking each one right away when do_unlink is set.
 * The loop stops at the first openat() failure.
 *
 * Returns the number of files successfully created, cast to void *.
 */
void *f(void *arg)
{
	int err, dirfd, fd, i;
	char a[16];
	int id = (long)arg;

	snprintf(a, sizeof(a), "%d", id);
	err = mkdirat(rootfd, a, 0755);
	assert(!err);
	dirfd = openat(rootfd, a, O_RDONLY | O_PATH);
	assert(dirfd >= 0);

	err = pthread_barrier_wait(&barrier);
	assert(!err || err == PTHREAD_BARRIER_SERIAL_THREAD);

	for (i = 0; i < nfile; i++) {
		snprintf(a, sizeof(a), "%d", i);
		/* O_CREAT requires an explicit mode; leaving it out is undefined */
		fd = openat(dirfd, a, O_CREAT | O_WRONLY, 0644);
		if (fd < 0)
			break;
		if (do_unlink)
			unlinkat(dirfd, a, /*flags*/ 0);
		close(fd);
	}

	/* the directory handle is no longer needed once the loop is over */
	close(dirfd);

	return (void *)(long)i;
}

/*
 * One measurement sample.  perf() fills both members; run() also uses an
 * instance to hold the difference between two samples.
 */
struct perf {
	struct timespec ts;	/* set by clock_gettime(CLOCK_MONOTONIC) in perf() */
	struct rusage ru;	/* set by getrusage(RUSAGE_SELF) in perf() */
};

/*
 * Take a measurement sample into *perf: the current CLOCK_MONOTONIC time
 * and the RUSAGE_SELF resource usage.
 *
 * A failure of either call aborts via assert(), matching the rest of the
 * file; otherwise the caller would compute with unset values.
 */
void perf(struct perf *perf)
{
	int err;

	err = clock_gettime(CLOCK_MONOTONIC, &perf->ts);
	assert(!err);
	err = getrusage(RUSAGE_SELF, &perf->ru);
	assert(!err);
}

/*
 * Store a - b into *ans.  When the nanosecond difference is negative,
 * one second is borrowed so that tv_nsec ends up non-negative.
 */
void ts_subtract(struct timespec *ans, struct timespec *a, struct timespec *b)
{
	time_t sec = a->tv_sec - b->tv_sec;
	long nsec = a->tv_nsec - b->tv_nsec;

	if (nsec < 0) {
		/* borrow one second */
		nsec += 1000000000;
		sec -= 1;
	}

	ans->tv_sec = sec;
	ans->tv_nsec = nsec;
}

/*
 * Store a - b into *ans.  When the microsecond difference is negative,
 * one second is borrowed so that tv_usec ends up non-negative.
 */
void tv_subtract(struct timeval *ans, struct timeval *a, struct timeval *b)
{
	time_t sec = a->tv_sec - b->tv_sec;
	long usec = a->tv_usec - b->tv_usec;

	if (usec < 0) {
		/* borrow one second */
		usec += 1000000;
		sec -= 1;
	}

	ans->tv_sec = sec;
	ans->tv_usec = usec;
}

/* Capacity of the thread table in run(). */
#define MAX_NPROC 16
/*
 * Run one measurement and print a one-line summary.
 *
 * Starts nproc workers running f(), takes the first sample, joins the
 * barrier together with the workers, waits for all of them to finish and
 * takes the second sample.  The printed line shows the thread count, the
 * number of files actually created versus the number attempted, and the
 * elapsed, user and system times between the two samples.
 *
 * Any pthread failure aborts via assert().
 */
void run(void)
{
	int err, i, n;
	struct {
		pthread_t th;
		void *p;	/* worker's return value: files it created */
	} b[MAX_NPROC];
	struct perf s[3];	/* [0] start, [1] end, [2] end - start */

	/* nproc workers plus this thread */
	err = pthread_barrier_init(&barrier, NULL, nproc + 1);
	assert(!err);
	for (i = 0; i < nproc; i++) {
		err = pthread_create(&b[i].th, NULL, f, (void *)(long)i);
		assert(!err);
	}

	perf(s + 0);
	err = pthread_barrier_wait(&barrier);
	assert(!err || err == PTHREAD_BARRIER_SERIAL_THREAD);

	for (i = 0; i < nproc; i++) {
		err = pthread_join(b[i].th, &b[i].p);
		assert(!err);
	}
	perf(s + 1);

	/* every worker has been joined, so nobody can still be in the barrier */
	err = pthread_barrier_destroy(&barrier);
	assert(!err);

	n = 0;
	for (i = 0; i < nproc; i++)
		n += (long)b[i].p;

	/* only the fields printed below are computed in s[2] */
	ts_subtract(&s[2].ts, &s[1].ts, &s[0].ts);
	tv_subtract(&s[2].ru.ru_utime, &s[1].ru.ru_utime, &s[0].ru.ru_utime);
	tv_subtract(&s[2].ru.ru_stime, &s[1].ru.ru_stime, &s[0].ru.ru_stime);

	/*
	 * time_t and suseconds_t have no printf conversion of their own;
	 * cast to long so the arguments match the %ld conversions.
	 */
	printf("%d procs, %d/%d file, %s unlink, %ld.%03ld secs"
	       " (usr %ld.%03ld + sys %ld.%03ld)\n",
	       nproc, n, nfile * nproc, do_unlink ? "do" : "no",
	       (long)s[2].ts.tv_sec, (long)(s[2].ts.tv_nsec / 1000000),
	       (long)s[2].ru.ru_utime.tv_sec,
	       (long)(s[2].ru.ru_utime.tv_usec / 1000),
	       (long)s[2].ru.ru_stime.tv_sec,
	       (long)(s[2].ru.ru_stime.tv_usec / 1000));
}

/*
 * Usage: tmpfs-idr dir nproc nfile do_unlink
 *
 *   dir       - directory to create the files under
 *   nproc     - number of worker threads, 1..MAX_NPROC
 *   nfile     - total number of files; split evenly between the workers
 *   do_unlink - non-zero to unlink each file right after creating it
 *
 * Returns 0 after one measurement run, or EXIT_FAILURE when arguments
 * are missing.  Invalid values abort via assert().
 */
int main(int argc, char *argv[])
{
	if (argc < 5) {
		fprintf(stderr, "Usage: %s dir nproc nfile do_unlink\n",
			argc > 0 ? argv[0] : "tmpfs-idr");
		return EXIT_FAILURE;
	}

	rootfd = open(argv[1], O_RDONLY | O_PATH);
	assert(rootfd >= 0);
	nproc = argton(argv[2]);
	/* nproc is a divisor below and indexes a MAX_NPROC-entry table in run() */
	assert(nproc >= 1 && nproc <= MAX_NPROC);
	nfile = argton(argv[3]);
	nfile /= nproc;
	do_unlink = argton(argv[4]);
	run();

	return 0;
}

/*
 * Local variables: ;
 * compile-command: "gcc -g -Wall -UNDEBUG -pthread -o /tmp/tmpfs-idr tmpfs-idr.c -lrt";
 * End: ;
 */
----------------------------------------

Cc: Eric Dumazet <edumazet@google.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Christoph Hellwig <hch@lst.de>
Cc: Andreas Dilger <adilger@dilger.ca>
Cc: Jan Kara <jack@suse.cz>
Signed-off-by: J. R. Okajima <hooanon05g@gmail.com>
---
 include/linux/shmem_fs.h |    6 ++++--
 mm/shmem.c               |   37 +++++++++++++++++++++++++++++++------
 2 files changed, 35 insertions(+), 8 deletions(-)

diff --git a/include/linux/shmem_fs.h b/include/linux/shmem_fs.h
index 4d1771c..4ba8b43 100644
--- a/include/linux/shmem_fs.h
+++ b/include/linux/shmem_fs.h
@@ -24,10 +24,12 @@ struct shmem_inode_info {
 };
 
 struct shmem_sb_info {
+	struct mutex idr_lock;
+	struct idr idr;		    /* manages inode-number */
 	unsigned long max_blocks;   /* How many blocks are allowed */
 	struct percpu_counter used_blocks;  /* How many are allocated */
-	unsigned long max_inodes;   /* How many inodes are allowed */
-	unsigned long free_inodes;  /* How many are left for allocation */
+	int max_inodes;		    /* How many inodes are allowed */
+	int free_inodes;	    /* How many are left for allocation */
 	spinlock_t stat_lock;	    /* Serialize shmem_sb_info changes */
 	kuid_t uid;		    /* Mount uid for root directory */
 	kgid_t gid;		    /* Mount gid for root directory */
diff --git a/mm/shmem.c b/mm/shmem.c
index 368f314..3ac613d 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -107,9 +107,13 @@ static unsigned long shmem_default_max_blocks(void)
 	return totalram_pages / 2;
 }
 
-static unsigned long shmem_default_max_inodes(void)
+static int shmem_default_max_inodes(void)
 {
-	return min(totalram_pages - totalhigh_pages, totalram_pages / 2);
+	unsigned long ul;
+
+	ul = INT_MAX;
+	ul = min3(ul, totalram_pages - totalhigh_pages, totalram_pages / 2);
+	return ul;
 }
 #endif
 
@@ -569,6 +573,7 @@ static int shmem_setattr(struct dentry *dentry, struct iattr *attr)
 static void shmem_evict_inode(struct inode *inode)
 {
 	struct shmem_inode_info *info = SHMEM_I(inode);
+	struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
 
 	if (inode->i_mapping->a_ops == &shmem_aops) {
 		shmem_unacct_size(info->flags, inode->i_size);
@@ -584,6 +589,11 @@ static void shmem_evict_inode(struct inode *inode)
 
 	simple_xattrs_free(&info->xattrs);
 	WARN_ON(inode->i_blocks);
+	if (inode->i_ino) {
+		mutex_lock(&sbinfo->idr_lock);
+		idr_remove(&sbinfo->idr, inode->i_ino);
+		mutex_unlock(&sbinfo->idr_lock);
+	}
 	shmem_free_inode(inode->i_sb);
 	clear_inode(inode);
 }
@@ -1315,13 +1325,13 @@ static struct inode *shmem_get_inode(struct super_block *sb, const struct inode
 	struct inode *inode;
 	struct shmem_inode_info *info;
 	struct shmem_sb_info *sbinfo = SHMEM_SB(sb);
+	int ino;
 
 	if (shmem_reserve_inode(sb))
 		return NULL;
 
 	inode = new_inode(sb);
 	if (inode) {
-		inode->i_ino = get_next_ino();
 		inode_init_owner(inode, dir, mode);
 		inode->i_blocks = 0;
 		inode->i_mapping->backing_dev_info = &shmem_backing_dev_info;
@@ -1362,6 +1372,18 @@ static struct inode *shmem_get_inode(struct super_block *sb, const struct inode
 			mpol_shared_policy_init(&info->policy, NULL);
 			break;
 		}
+
+		/* inum 0 and 1 are unused */
+		mutex_lock(&sbinfo->idr_lock);
+		ino = idr_alloc(&sbinfo->idr, inode, 2, INT_MAX, GFP_NOFS);
+		if (ino > 0) {
+			inode->i_ino = ino;
+			mutex_unlock(&sbinfo->idr_lock);
+		} else {
+			mutex_unlock(&sbinfo->idr_lock);
+			iput(inode);	/* shmem_free_inode() will be called */
+			inode = NULL;
+		}
 	} else
 		shmem_free_inode(sb);
 	return inode;
@@ -2385,7 +2407,7 @@ static int shmem_parse_options(char *options, struct shmem_sb_info *sbinfo,
 				goto bad_val;
 		} else if (!strcmp(this_char,"nr_inodes")) {
 			sbinfo->max_inodes = memparse(value, &rest);
-			if (*rest)
+			if (*rest || sbinfo->max_inodes < 2)
 				goto bad_val;
 		} else if (!strcmp(this_char,"mode")) {
 			if (remount)
@@ -2438,7 +2460,7 @@ static int shmem_remount_fs(struct super_block *sb, int *flags, char *data)
 {
 	struct shmem_sb_info *sbinfo = SHMEM_SB(sb);
 	struct shmem_sb_info config = *sbinfo;
-	unsigned long inodes;
+	int inodes;
 	int error = -EINVAL;
 
 	config.mpol = NULL;
@@ -2486,7 +2508,7 @@ static int shmem_show_options(struct seq_file *seq, struct dentry *root)
 		seq_printf(seq, ",size=%luk",
 			sbinfo->max_blocks << (PAGE_CACHE_SHIFT - 10));
 	if (sbinfo->max_inodes != shmem_default_max_inodes())
-		seq_printf(seq, ",nr_inodes=%lu", sbinfo->max_inodes);
+		seq_printf(seq, ",nr_inodes=%d", sbinfo->max_inodes);
 	if (sbinfo->mode != (S_IRWXUGO | S_ISVTX))
 		seq_printf(seq, ",mode=%03ho", sbinfo->mode);
 	if (!uid_eq(sbinfo->uid, GLOBAL_ROOT_UID))
@@ -2504,6 +2526,7 @@ static void shmem_put_super(struct super_block *sb)
 {
 	struct shmem_sb_info *sbinfo = SHMEM_SB(sb);
 
+	idr_destroy(&sbinfo->idr);
 	percpu_counter_destroy(&sbinfo->used_blocks);
 	mpol_put(sbinfo->mpol);
 	kfree(sbinfo);
@@ -2522,6 +2545,8 @@ int shmem_fill_super(struct super_block *sb, void *data, int silent)
 	if (!sbinfo)
 		return -ENOMEM;
 
+	mutex_init(&sbinfo->idr_lock);
+	idr_init(&sbinfo->idr);
 	sbinfo->mode = S_IRWXUGO | S_ISVTX;
 	sbinfo->uid = current_fsuid();
 	sbinfo->gid = current_fsgid();
-- 
1.7.10.4


  reply	other threads:[~2014-06-05 12:27 UTC|newest]

Thread overview: 26+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2014-05-21 18:48 [RFC 0/3] vfs: get_next_ino(), never inum=0 and uniqueness hooanon05g
2014-05-21 18:48 ` [RFC 1/3] vfs: get_next_ino(), never inum=0 hooanon05g
2014-05-28  4:28   ` Hugh Dickins
2014-05-28  5:53     ` Eric Dumazet
2014-05-21 18:48 ` [RFC 2/3] vfs: get_next_ino(), support for the uniqueness hooanon05g
2014-05-22 11:56   ` Jan Kara
2014-05-22 15:03     ` J. R. Okajima
2014-05-22 15:12       ` Jan Kara
2014-05-22 15:14         ` Christoph Hellwig
2014-05-22 16:06           ` Jan Kara
2014-05-29 15:46           ` [PATCH v2 1/2] tmpfs: manage the inode-number by IDR J. R. Okajima
2014-05-29 15:46             ` [PATCH v2 2/2] tmpfs: refine a file handle for NFS-exporting J. R. Okajima
2014-05-31  2:43             ` [PATCH v2 1/2] tmpfs: manage the inode-number by IDR J. R. Okajima
2014-06-01 16:18           ` [RFC PATCH v3 0/2] the uniquness of tmpfs inode-number J. R. Okajima
2014-06-01 16:18             ` [RFC PATCH v3 1/2] tmpfs: manage the inode-number by IDR, signed int inum J. R. Okajima
2014-06-03  9:04               ` Jan Kara
2014-06-03 14:36                 ` J. R. Okajima
2014-06-05 12:27                 ` [RFC PATCH v4 0/2] tmpfs: manage the inode-number by IDR (performance measure) J. R. Okajima
2014-06-05 12:27                   ` J. R. Okajima [this message]
2014-06-05 12:27                   ` [RFC PATCH v4 2/2] tmpfs: refine a file handle for NFS-exporting J. R. Okajima
2014-06-01 16:18             ` [RFC PATCH v3 " J. R. Okajima
2014-05-21 18:49 ` [RFC 3/3] uniqueness of inode number, configfs, debugfs, procfs, ramfs and tmpfs hooanon05g
2014-05-22  1:03   ` J. R. Okajima
2014-05-22 11:53     ` Jan Kara
2014-05-22 14:58       ` J. R. Okajima
2014-05-22 15:09         ` Jan Kara

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=1401971274-8075-2-git-send-email-hooanon05g@gmail.com \
    --to=hooanon05g@gmail.com \
    --cc=adilger@dilger.ca \
    --cc=dchinner@redhat.com \
    --cc=edumazet@google.com \
    --cc=hch@lst.de \
    --cc=hughd@google.com \
    --cc=jack@suse.cz \
    --cc=linux-fsdevel@vger.kernel.org \
    --cc=viro@zeniv.linux.org.uk \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.