* [PATCH] shmem: avoid huge pages for small files
@ 2016-10-17 12:18 Kirill A. Shutemov
From: Kirill A. Shutemov @ 2016-10-17 12:18 UTC
  To: Hugh Dickins, Andrea Arcangeli, Andrew Morton
  Cc: Andi Kleen, linux-mm, linux-kernel, Kirill A. Shutemov

Huge pages are detrimental for small files: they cause noticeable
overhead on both allocation performance and memory footprint.

This patch aims to address the issue by avoiding huge pages until the
file has grown to a specified size. This covers most of the cases where
huge pages cause performance regressions.

By default, the minimal file size for allocating huge pages is equal to
the size of a huge page.

We add two knobs to specify the minimal file size for huge pages (a
usage sketch follows the notes below):

  - mount option 'huge_min_size';

  - sysfs file /sys/kernel/mm/transparent_hugepage/shmem_min_size for
    the in-kernel tmpfs mountpoint;

A few notes:

  - if shmem_enabled is set to 'force', the limit is ignored: we still
    want to generate as many huge pages as possible for functional
    testing;

  - the limit doesn't affect khugepaged behaviour: it can still collapse
    pages based on its settings;

  - remounting the filesystem doesn't affect previously allocated pages,
    but the limit is applied to new allocations.
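
For illustration only, here is a minimal userspace sketch of how these
knobs could be exercised. The 'huge_min_size' mount option and the
shmem_min_size sysfs file are the ones added by this patch; the mount
point, the 4M value and the error handling are arbitrary assumptions:

#include <fcntl.h>
#include <stdio.h>
#include <sys/mount.h>
#include <unistd.h>

int main(void)
{
	/* Mount a tmpfs that starts allocating huge pages only once a
	 * file has grown to 4M (mount point and size are arbitrary). */
	if (mount("tmpfs", "/mnt/test-tmpfs", "tmpfs", 0,
		  "huge=always,huge_min_size=4M") != 0)
		perror("mount");

	/* Adjust the limit for the in-kernel tmpfs mount via sysfs;
	 * the value is parsed with memparse(), so size suffixes work. */
	int fd = open("/sys/kernel/mm/transparent_hugepage/shmem_min_size",
		      O_WRONLY);
	if (fd >= 0) {
		if (write(fd, "4M\n", 3) < 0)
			perror("write");
		close(fd);
	} else {
		perror("open");
	}
	return 0;
}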

Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
---
 Documentation/vm/transhuge.txt |  6 +++++
 include/linux/huge_mm.h        |  1 +
 include/linux/shmem_fs.h       |  1 +
 mm/huge_memory.c               |  1 +
 mm/shmem.c                     | 56 ++++++++++++++++++++++++++++++++++++++----
 5 files changed, 60 insertions(+), 5 deletions(-)

diff --git a/Documentation/vm/transhuge.txt b/Documentation/vm/transhuge.txt
index 2ec6adb5a4ce..40006d193687 100644
--- a/Documentation/vm/transhuge.txt
+++ b/Documentation/vm/transhuge.txt
@@ -238,6 +238,12 @@ values:
   - "force":
     Force the huge option on for all - very useful for testing;
 
+There's a limit on the minimal file size before the kernel starts
+allocating huge pages for it. By default it is the size of a huge page.
+
+You can adjust the limit using the "huge_min_size=" mount option or
+/sys/kernel/mm/transparent_hugepage/shmem_min_size for the in-kernel mount.
+
 == Need of application restart ==
 
 The transparent_hugepage/enabled values and tmpfs mount option only affect
diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index 9b9f65d99873..515b96a5a592 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -52,6 +52,7 @@ extern ssize_t single_hugepage_flag_show(struct kobject *kobj,
 				struct kobj_attribute *attr, char *buf,
 				enum transparent_hugepage_flag flag);
 extern struct kobj_attribute shmem_enabled_attr;
+extern struct kobj_attribute shmem_min_size_attr;
 
 #define HPAGE_PMD_ORDER (HPAGE_PMD_SHIFT-PAGE_SHIFT)
 #define HPAGE_PMD_NR (1<<HPAGE_PMD_ORDER)
diff --git a/include/linux/shmem_fs.h b/include/linux/shmem_fs.h
index ff078e7043b6..e7c3bddc6335 100644
--- a/include/linux/shmem_fs.h
+++ b/include/linux/shmem_fs.h
@@ -31,6 +31,7 @@ struct shmem_sb_info {
 	spinlock_t stat_lock;	    /* Serialize shmem_sb_info changes */
 	umode_t mode;		    /* Mount mode for root directory */
 	unsigned char huge;	    /* Whether to try for hugepages */
+	loff_t huge_min_size;       /* No hugepages if i_size less than this */
 	kuid_t uid;		    /* Mount uid for root directory */
 	kgid_t gid;		    /* Mount gid for root directory */
 	struct mempolicy *mpol;     /* default memory policy for mappings */
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index cdcd25cb30fe..fa133eb5bf62 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -309,6 +309,7 @@ static struct attribute *hugepage_attr[] = {
 	&use_zero_page_attr.attr,
 #if defined(CONFIG_SHMEM) && defined(CONFIG_TRANSPARENT_HUGE_PAGECACHE)
 	&shmem_enabled_attr.attr,
+	&shmem_min_size_attr.attr,
 #endif
 #ifdef CONFIG_DEBUG_VM
 	&debug_cow_attr.attr,
diff --git a/mm/shmem.c b/mm/shmem.c
index ad7813d73ea7..c69047386e2f 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -369,6 +369,7 @@ static bool shmem_confirm_swap(struct address_space *mapping,
 /* ifdef here to avoid bloating shmem.o when not necessary */
 
 int shmem_huge __read_mostly;
+unsigned long long shmem_huge_min_size __read_mostly = HPAGE_PMD_SIZE;
 
 static int shmem_parse_huge(const char *str)
 {
@@ -1668,6 +1669,8 @@ static int shmem_getpage_gfp(struct inode *inode, pgoff_t index,
 		swap_free(swap);
 
 	} else {
+		loff_t i_size;
+
 		/* shmem_symlink() */
 		if (mapping->a_ops != &shmem_aops)
 			goto alloc_nohuge;
@@ -1675,14 +1678,17 @@ static int shmem_getpage_gfp(struct inode *inode, pgoff_t index,
 			goto alloc_nohuge;
 		if (shmem_huge == SHMEM_HUGE_FORCE)
 			goto alloc_huge;
+		i_size = i_size_read(inode);
+		if (i_size < sbinfo->huge_min_size &&
+				index < (sbinfo->huge_min_size >> PAGE_SHIFT))
+			goto alloc_nohuge;
 		switch (sbinfo->huge) {
-			loff_t i_size;
 			pgoff_t off;
 		case SHMEM_HUGE_NEVER:
 			goto alloc_nohuge;
 		case SHMEM_HUGE_WITHIN_SIZE:
 			off = round_up(index, HPAGE_PMD_NR);
-			i_size = round_up(i_size_read(inode), PAGE_SIZE);
+			i_size = round_up(i_size, PAGE_SIZE);
 			if (i_size >= HPAGE_PMD_SIZE &&
 					i_size >> PAGE_SHIFT >= off)
 				goto alloc_huge;
@@ -3349,6 +3355,10 @@ static int shmem_parse_options(char *options, struct shmem_sb_info *sbinfo,
 					huge != SHMEM_HUGE_NEVER)
 				goto bad_val;
 			sbinfo->huge = huge;
+		} else if (!strcmp(this_char, "huge_min_size")) {
+			sbinfo->huge_min_size = memparse(value, &rest);
+			if (*rest)
+				goto bad_val;
 #endif
 #ifdef CONFIG_NUMA
 		} else if (!strcmp(this_char,"mpol")) {
@@ -3382,6 +3392,8 @@ static int shmem_remount_fs(struct super_block *sb, int *flags, char *data)
 	int error = -EINVAL;
 
 	config.mpol = NULL;
+	if (IS_ENABLED(CONFIG_TRANSPARENT_HUGE_PAGECACHE))
+		config.huge_min_size = HPAGE_PMD_SIZE;
 	if (shmem_parse_options(data, &config, true))
 		return error;
 
@@ -3403,6 +3415,7 @@ static int shmem_remount_fs(struct super_block *sb, int *flags, char *data)
 
 	error = 0;
 	sbinfo->huge = config.huge;
+	sbinfo->huge_min_size = config.huge_min_size;
 	sbinfo->max_blocks  = config.max_blocks;
 	sbinfo->max_inodes  = config.max_inodes;
 	sbinfo->free_inodes = config.max_inodes - inodes;
@@ -3438,8 +3451,10 @@ static int shmem_show_options(struct seq_file *seq, struct dentry *root)
 				from_kgid_munged(&init_user_ns, sbinfo->gid));
 #ifdef CONFIG_TRANSPARENT_HUGE_PAGECACHE
 	/* Rightly or wrongly, show huge mount option unmasked by shmem_huge */
-	if (sbinfo->huge)
+	if (sbinfo->huge) {
 		seq_printf(seq, ",huge=%s", shmem_format_huge(sbinfo->huge));
+		seq_printf(seq, ",huge_min_size=%llu", sbinfo->huge_min_size);
+	}
 #endif
 	shmem_show_mpol(seq, sbinfo->mpol);
 	return 0;
@@ -3542,6 +3557,8 @@ int shmem_fill_super(struct super_block *sb, void *data, int silent)
 	sbinfo->mode = S_IRWXUGO | S_ISVTX;
 	sbinfo->uid = current_fsuid();
 	sbinfo->gid = current_fsgid();
+	if (IS_ENABLED(CONFIG_TRANSPARENT_HUGE_PAGECACHE))
+		sbinfo->huge_min_size = HPAGE_PMD_SIZE;
 	sb->s_fs_info = sbinfo;
 
 #ifdef CONFIG_TMPFS
@@ -3780,9 +3797,10 @@ int __init shmem_init(void)
 	}
 
 #ifdef CONFIG_TRANSPARENT_HUGE_PAGECACHE
-	if (has_transparent_hugepage() && shmem_huge < SHMEM_HUGE_DENY)
+	if (has_transparent_hugepage() && shmem_huge < SHMEM_HUGE_DENY) {
 		SHMEM_SB(shm_mnt->mnt_sb)->huge = shmem_huge;
-	else
+		SHMEM_SB(shm_mnt->mnt_sb)->huge_min_size = shmem_huge_min_size;
+	} else
 		shmem_huge = 0; /* just in case it was patched */
 #endif
 	return 0;
@@ -3848,6 +3866,34 @@ static ssize_t shmem_enabled_store(struct kobject *kobj,
 
 struct kobj_attribute shmem_enabled_attr =
 	__ATTR(shmem_enabled, 0644, shmem_enabled_show, shmem_enabled_store);
+
+static ssize_t shmem_min_size_show(struct kobject *kobj,
+		struct kobj_attribute *attr, char *buf)
+{
+	return sprintf(buf, "%llu\n", shmem_huge_min_size);
+}
+
+
+static ssize_t shmem_min_size_store(struct kobject *kobj,
+		struct kobj_attribute *attr, const char *buf, size_t count)
+{
+	unsigned long long size;
+	char *end;
+
+	size = memparse(buf, &end);
+	if (end == buf)
+		return  -EINVAL;
+	if (*end == '\n')
+		end++;
+	if (*end != '\0')
+		return -EINVAL;
+	shmem_huge_min_size = size;
+	SHMEM_SB(shm_mnt->mnt_sb)->huge_min_size = size;
+	return end - buf;
+}
+
+struct kobj_attribute shmem_min_size_attr =
+	__ATTR(shmem_min_size, 0644, shmem_min_size_show, shmem_min_size_store);
 #endif /* CONFIG_TRANSPARENT_HUGE_PAGECACHE && CONFIG_SYSFS */
 
 #ifdef CONFIG_TRANSPARENT_HUGE_PAGECACHE
-- 
2.9.3

* Re: [PATCHv4] shmem: avoid huge pages for small files
@ 2016-11-10 16:25 Kirill A. Shutemov
From: Kirill A. Shutemov @ 2016-11-10 16:25 UTC
  To: Hugh Dickins
  Cc: Kirill A. Shutemov, Andrea Arcangeli, Andrew Morton, Andi Kleen,
	Dave Chinner, Michal Hocko, linux-mm, linux-kernel

On Mon, Nov 07, 2016 at 03:17:11PM -0800, Hugh Dickins wrote:
> On Sat, 22 Oct 2016, Kirill A. Shutemov wrote:
> > 
> > Huge pages are detrimental for small files: they cause noticeable
> > overhead on both allocation performance and memory footprint.
> > 
> > This patch aims to address the issue by avoiding huge pages until the
> > file has grown to the size of a huge page. This covers most of the
> > cases where huge pages cause performance regressions.
> > 
> > A couple of notes:
> > 
> >   - if shmem_enabled is set to 'force', the limit is ignored. We still
> >     want to generate as many pages as possible for functional testing.
> > 
> >   - the limit doesn't affect khugepaged behaviour: it still can collapse
> >     pages based on its settings;
> > 
> > Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
> 
> Sorry, but NAK.  I was expecting a patch to tune within_size behaviour.
> 
> > ---
> >  Documentation/vm/transhuge.txt | 3 +++
> >  mm/shmem.c                     | 5 +++++
> >  2 files changed, 8 insertions(+)
> > 
> > diff --git a/Documentation/vm/transhuge.txt b/Documentation/vm/transhuge.txt
> > index 2ec6adb5a4ce..d1889c7c8c46 100644
> > --- a/Documentation/vm/transhuge.txt
> > +++ b/Documentation/vm/transhuge.txt
> > @@ -238,6 +238,9 @@ values:
> >    - "force":
> >      Force the huge option on for all - very useful for testing;
> >  
> > +To avoid overhead for small files, we don't allocate huge pages for a file
> > +until it grows to the size of a huge page.
> > +
> >  == Need of application restart ==
> >  
> >  The transparent_hugepage/enabled values and tmpfs mount option only affect
> > diff --git a/mm/shmem.c b/mm/shmem.c
> > index ad7813d73ea7..49618d2d6330 100644
> > --- a/mm/shmem.c
> > +++ b/mm/shmem.c
> > @@ -1692,6 +1692,11 @@ static int shmem_getpage_gfp(struct inode *inode, pgoff_t index,
> >  				goto alloc_huge;
> >  			/* TODO: implement fadvise() hints */
> >  			goto alloc_nohuge;
> > +		case SHMEM_HUGE_ALWAYS:
> > +			i_size = i_size_read(inode);
> > +			if (index < HPAGE_PMD_NR && i_size < HPAGE_PMD_SIZE)
> > +				goto alloc_nohuge;
> > +			break;
> >  		}
> >  
> >  alloc_huge:
> 
> So (eliding the SHMEM_HUGE_ADVISE case in between) you now have:
> 
> 		case SHMEM_HUGE_WITHIN_SIZE:
> 			off = round_up(index, HPAGE_PMD_NR);
> 			i_size = round_up(i_size_read(inode), PAGE_SIZE);
> 			if (i_size >= HPAGE_PMD_SIZE &&
> 					i_size >> PAGE_SHIFT >= off)
> 				goto alloc_huge;
> 			goto alloc_nohuge;
> 		case SHMEM_HUGE_ALWAYS:
> 			i_size = i_size_read(inode);
> 			if (index < HPAGE_PMD_NR && i_size < HPAGE_PMD_SIZE)
> 				goto alloc_nohuge;
> 			goto alloc_huge;
> 
> I'll concede that those two conditions are not the same; but again you're
> messing with huge=always to make it, not always, but conditional on size.
> 
> Please, keep huge=always as is: if I copy a 4MiB file into a huge tmpfs,
> I got ShmemHugePages 4096 kB before, which is what I wanted.  Whereas
> with this change I get only 2048 kB, just like with huge=within_size.

I don't think it's a problem really. We don't have guarantees anyway.
And we can collapse the page later.

But okay.
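
For reference, a rough sketch of how the ShmemHugePages difference can
be checked from userspace (run it before and after copying a file into
a huge=always tmpfs mount; nothing here is specific to this patch):

#include <stdio.h>
#include <string.h>

int main(void)
{
	char line[256];
	FILE *f = fopen("/proc/meminfo", "r");

	if (!f) {
		perror("fopen");
		return 1;
	}
	/* Print only the ShmemHugePages line from /proc/meminfo. */
	while (fgets(line, sizeof(line), f)) {
		if (strncmp(line, "ShmemHugePages:", 15) == 0)
			fputs(line, stdout);
	}
	fclose(f);
	return 0;
}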

> Treating the first extent differently is a hack, and does not respect
> that this is a filesystem, on which size is likely to increase.
> 
> By all means refine the condition for huge=within_size, and by all means
> warn in transhuge.txt that huge=always may tend to waste valuable huge
> pages if the filesystem is used for small files without good reason

Would it be okay if I just replaced the huge=within_size logic with what
I proposed here for huge=always?

That's not what I intended initially for this option, but...

> (but maybe the implementation needs to reclaim those more effectively).

It's more about the cost of allocation than memory pressure.

-----8<-----

From 287ab05c09bfd49c7356ca74b6fea36d8131edaf Mon Sep 17 00:00:00 2001
From: "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com>
Date: Mon, 17 Oct 2016 14:44:47 +0300
Subject: [PATCH] shmem: avoid huge pages for small files

Huge pages are detrimental for small files: they cause noticeable
overhead on both allocation performance and memory footprint.

This patch aims to address the issue by avoiding huge pages until the
file has grown to the size of a huge page, if the filesystem is mounted
with the huge=within_size option.

This covers most of the cases where huge pages cause performance
regressions.

The limit doesn't affect khugepaged behaviour: it can still collapse
pages based on its settings.
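
As a quick illustration of the new SHMEM_HUGE_WITHIN_SIZE condition,
here is a userspace model of the check (a sketch only: the helper name
is made up, and the 2MiB/4KiB page sizes are assumptions matching
x86-64):

#include <stdbool.h>
#include <stdio.h>

#define PAGE_SIZE	4096UL
#define HPAGE_PMD_SIZE	(2UL << 20)
#define HPAGE_PMD_NR	(HPAGE_PMD_SIZE / PAGE_SIZE)

/* Mirrors the patched within_size case: allocate a huge page once the
 * faulting index is past the first huge page, or once the file itself
 * has reached at least one huge page in size. */
static bool within_size_wants_huge(unsigned long index,
				   unsigned long long i_size)
{
	return index >= HPAGE_PMD_NR || i_size >= HPAGE_PMD_SIZE;
}

int main(void)
{
	printf("4KiB file, index 0:   huge=%d\n",
	       within_size_wants_huge(0, 4096));
	printf("2MiB file, index 0:   huge=%d\n",
	       within_size_wants_huge(0, HPAGE_PMD_SIZE));
	printf("4KiB file, index 600: huge=%d\n",
	       within_size_wants_huge(600, 4096));
	return 0;
}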

Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
---
 Documentation/vm/transhuge.txt | 7 ++++++-
 mm/shmem.c                     | 6 ++----
 2 files changed, 8 insertions(+), 5 deletions(-)

diff --git a/Documentation/vm/transhuge.txt b/Documentation/vm/transhuge.txt
index 2ec6adb5a4ce..14c911c56f4a 100644
--- a/Documentation/vm/transhuge.txt
+++ b/Documentation/vm/transhuge.txt
@@ -208,11 +208,16 @@ You can control hugepage allocation policy in tmpfs with mount option
   - "always":
     Attempt to allocate huge pages every time we need a new page;
 
+    This option can lead to significant overhead if the filesystem is used
+    to store small files.
+
   - "never":
     Do not allocate huge pages;
 
   - "within_size":
-    Only allocate huge page if it will be fully within i_size.
+    Only allocate huge pages if the size of the file is at least the size
+    of a huge page. This helps to avoid overhead for small files.
+
     Also respect fadvise()/madvise() hints;
 
   - "advise:
diff --git a/mm/shmem.c b/mm/shmem.c
index ad7813d73ea7..3589d36c7c63 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -1681,10 +1681,8 @@ static int shmem_getpage_gfp(struct inode *inode, pgoff_t index,
 		case SHMEM_HUGE_NEVER:
 			goto alloc_nohuge;
 		case SHMEM_HUGE_WITHIN_SIZE:
-			off = round_up(index, HPAGE_PMD_NR);
-			i_size = round_up(i_size_read(inode), PAGE_SIZE);
-			if (i_size >= HPAGE_PMD_SIZE &&
-					i_size >> PAGE_SHIFT >= off)
+			i_size = i_size_read(inode);
+			if (index >= HPAGE_PMD_NR || i_size >= HPAGE_PMD_SIZE)
 				goto alloc_huge;
 			/* fallthrough */
 		case SHMEM_HUGE_ADVISE:
-- 
 Kirill A. Shutemov


Thread overview: 20+ messages
2016-10-17 12:18 [PATCH] shmem: avoid huge pages for small files Kirill A. Shutemov
2016-10-17 12:30 ` Kirill A. Shutemov
2016-10-17 14:12   ` Michal Hocko
2016-10-17 14:55     ` Kirill A. Shutemov
2016-10-18 14:20       ` Michal Hocko
2016-10-18 14:32         ` Kirill A. Shutemov
2016-10-18 18:30           ` Michal Hocko
2016-10-19 18:13             ` Hugh Dickins
2016-10-20 10:39               ` Kirill A. Shutemov
2016-10-20 22:46                 ` Dave Chinner
2016-10-21  2:01                   ` Andi Kleen
2016-10-21  5:01                     ` Dave Chinner
2016-10-21 15:00                       ` Kirill A. Shutemov
2016-10-21 15:12                         ` Michal Hocko
2016-10-21 22:50                         ` Dave Chinner
2016-10-21 23:32                           ` Kirill A. Shutemov
2016-10-24 20:34                           ` Dave Hansen
2016-10-25  5:28                             ` Dave Chinner
2016-11-10 16:25 [PATCHv4] " Kirill A. Shutemov
2016-11-10 17:42 ` [PATCH] " kbuild test robot
2016-11-10 17:51   ` Kirill A. Shutemov
