From: Minchan Kim <minchan.kim@gmail.com>
To: Balbir Singh <balbir@linux.vnet.ibm.com>
Cc: linux-mm@kvack.org, akpm@linux-foundation.org, npiggin@kernel.dk,
	kvm@vger.kernel.org, linux-kernel@vger.kernel.org,
	kosaki.motohiro@jp.fujitsu.com, cl@linux.com,
	kamezawa.hiroyu@jp.fujitsu.com
Subject: Re: [PATCH 3/3] Provide control over unmapped pages (v4)
Date: Thu, 27 Jan 2011 08:12:02 +0900	[thread overview]
Message-ID: <AANLkTikHLw0Qg+odOB-bDtBSB-5UbTJ5ZOM-ZAdMqXgh@mail.gmail.com> (raw)
In-Reply-To: <20110125051015.13762.13429.stgit@localhost6.localdomain6>

Hi Balbir,

On Tue, Jan 25, 2011 at 2:10 PM, Balbir Singh <balbir@linux.vnet.ibm.com> wrote:
> Changelog v4
> 1. Add max_unmapped_ratio and use that as the upper limit
> to check when to shrink the unmapped page cache (Christoph
> Lameter)
>
> Changelog v2
> 1. Use a config option to enable the code (Andrew Morton)
> 2. Explain the magic tunables in the code or at least attempt
>   to explain them (General comment)
> 3. Hint uses of the boot parameter with unlikely (Andrew Morton)
> 4. Use better names (balanced is not a good naming convention)
>
> Provide control using zone_reclaim() and a boot parameter. The
> code reuses functionality from zone_reclaim() to isolate unmapped
> pages and reclaim them as a priority, ahead of mapped pages.
> A new sysctl for max_unmapped_ratio is provided and set to 16,
> indicating that once 16% of the total zone pages are unmapped, we
> start shrinking the unmapped page cache.
>
> Signed-off-by: Balbir Singh <balbir@linux.vnet.ibm.com>
> ---
>  Documentation/kernel-parameters.txt |    8 +++
>  include/linux/mmzone.h              |    5 ++
>  include/linux/swap.h                |   23 ++++++++-
>  init/Kconfig                        |   12 +++++
>  kernel/sysctl.c                     |   11 ++++
>  mm/page_alloc.c                     |   25 ++++++++++
>  mm/vmscan.c                         |   87 +++++++++++++++++++++++++++++++++++
>  7 files changed, 166 insertions(+), 5 deletions(-)
>
> diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
> index fee5f57..65a4ee6 100644
> --- a/Documentation/kernel-parameters.txt
> +++ b/Documentation/kernel-parameters.txt
> @@ -2500,6 +2500,14 @@ and is between 256 and 4096 characters. It is defined in the file
>                        [X86]
>                        Set unknown_nmi_panic=1 early on boot.
>
> +       unmapped_page_control
> +                       [KNL] Available if CONFIG_UNMAPPED_PAGECACHE_CONTROL
> +                       is enabled. It controls the amount of unmapped memory
> +                       that is present in the system. This boot option plus
> +                       vm.min_unmapped_ratio (sysctl) provide granular control
> +                       over how much unmapped page cache can exist in the system
> +                       before kswapd starts reclaiming unmapped page cache pages.
> +
>        usbcore.autosuspend=
>                        [USB] The autosuspend time delay (in seconds) used
>                        for newly-detected USB devices (default 2).  This
> diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
> index 2485acc..18f0f09 100644
> --- a/include/linux/mmzone.h
> +++ b/include/linux/mmzone.h
> @@ -306,7 +306,10 @@ struct zone {
>        /*
>         * zone reclaim becomes active if more unmapped pages exist.
>         */
> +#if defined(CONFIG_UNMAPPED_PAGECACHE_CONTROL) || defined(CONFIG_NUMA)
>        unsigned long           min_unmapped_pages;
> +       unsigned long           max_unmapped_pages;
> +#endif
>  #ifdef CONFIG_NUMA
>        int node;
>        unsigned long           min_slab_pages;
> @@ -773,6 +776,8 @@ int percpu_pagelist_fraction_sysctl_handler(struct ctl_table *, int,
>                                        void __user *, size_t *, loff_t *);
>  int sysctl_min_unmapped_ratio_sysctl_handler(struct ctl_table *, int,
>                        void __user *, size_t *, loff_t *);
> +int sysctl_max_unmapped_ratio_sysctl_handler(struct ctl_table *, int,
> +                       void __user *, size_t *, loff_t *);
>  int sysctl_min_slab_ratio_sysctl_handler(struct ctl_table *, int,
>                        void __user *, size_t *, loff_t *);
>
> diff --git a/include/linux/swap.h b/include/linux/swap.h
> index 7b75626..ae62a03 100644
> --- a/include/linux/swap.h
> +++ b/include/linux/swap.h
> @@ -255,19 +255,34 @@ extern int vm_swappiness;
>  extern int remove_mapping(struct address_space *mapping, struct page *page);
>  extern long vm_total_pages;
>
> +#if defined(CONFIG_UNMAPPED_PAGECACHE_CONTROL) || defined(CONFIG_NUMA)
>  extern int sysctl_min_unmapped_ratio;
> +extern int sysctl_max_unmapped_ratio;
> +
>  extern int zone_reclaim(struct zone *, gfp_t, unsigned int);
> -#ifdef CONFIG_NUMA
> -extern int zone_reclaim_mode;
> -extern int sysctl_min_slab_ratio;
>  #else
> -#define zone_reclaim_mode 0
>  static inline int zone_reclaim(struct zone *z, gfp_t mask, unsigned int order)
>  {
>        return 0;
>  }
>  #endif
>
> +#if defined(CONFIG_UNMAPPED_PAGECACHE_CONTROL)
> +extern bool should_reclaim_unmapped_pages(struct zone *zone);
> +#else
> +static inline bool should_reclaim_unmapped_pages(struct zone *zone)
> +{
> +       return false;
> +}
> +#endif
> +
> +#ifdef CONFIG_NUMA
> +extern int zone_reclaim_mode;
> +extern int sysctl_min_slab_ratio;
> +#else
> +#define zone_reclaim_mode 0
> +#endif
> +
>  extern int page_evictable(struct page *page, struct vm_area_struct *vma);
>  extern void scan_mapping_unevictable_pages(struct address_space *);
>
> diff --git a/init/Kconfig b/init/Kconfig
> index 4f6cdbf..2dfbc09 100644
> --- a/init/Kconfig
> +++ b/init/Kconfig
> @@ -828,6 +828,18 @@ config SCHED_AUTOGROUP
>  config MM_OWNER
>        bool
>
> +config UNMAPPED_PAGECACHE_CONTROL
> +       bool "Provide control over unmapped page cache"
> +       default n
> +       help
> +         This option adds support for controlling unmapped page cache
> +         via a boot parameter (unmapped_page_control). The boot parameter,
> +         together with the sysctl (vm.min_unmapped_ratio), controls the
> +         total number of unmapped pages in the system. This feature is
> +         useful if you want to limit the amount of unmapped page cache
> +         or reduce page cache duplication in a virtualized environment.
> +         If unsure, say 'N'.
> +
>  config SYSFS_DEPRECATED
>        bool "enable deprecated sysfs features to support old userspace tools"
>        depends on SYSFS
> diff --git a/kernel/sysctl.c b/kernel/sysctl.c
> index 12e8f26..63dbba6 100644
> --- a/kernel/sysctl.c
> +++ b/kernel/sysctl.c
> @@ -1224,6 +1224,7 @@ static struct ctl_table vm_table[] = {
>                .extra1         = &zero,
>        },
>  #endif
> +#if defined(CONFIG_UNMAPPED_PAGECACHE_CONTROL) || defined(CONFIG_NUMA)
>        {
>                .procname       = "min_unmapped_ratio",
>                .data           = &sysctl_min_unmapped_ratio,
> @@ -1233,6 +1234,16 @@ static struct ctl_table vm_table[] = {
>                .extra1         = &zero,
>                .extra2         = &one_hundred,
>        },
> +       {
> +               .procname       = "max_unmapped_ratio",
> +               .data           = &sysctl_max_unmapped_ratio,
> +               .maxlen         = sizeof(sysctl_max_unmapped_ratio),
> +               .mode           = 0644,
> +               .proc_handler   = sysctl_max_unmapped_ratio_sysctl_handler,
> +               .extra1         = &zero,
> +               .extra2         = &one_hundred,
> +       },
> +#endif
>  #ifdef CONFIG_NUMA
>        {
>                .procname       = "zone_reclaim_mode",
> diff --git a/mm/page_alloc.c b/mm/page_alloc.c
> index 7b56473..2ac8549 100644
> --- a/mm/page_alloc.c
> +++ b/mm/page_alloc.c
> @@ -1660,6 +1660,9 @@ zonelist_scan:
>                        unsigned long mark;
>                        int ret;
>
> +                       if (should_reclaim_unmapped_pages(zone))
> +                               wakeup_kswapd(zone, order, classzone_idx);
> +

Do we really need the check in the fastpath?
There are lots of callers of alloc_pages(),
and many of them are not related to mapped pages.
Could we move the check into add_to_page_cache_locked()?
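
For example, something along these lines is what I have in mind (an
untested sketch only; the placement and the wakeup arguments are my
guesses, not a real patch):

	/* in add_to_page_cache_locked(), before the radix tree insert */
	struct zone *zone = page_zone(page);

	/*
	 * Page cache insertion is where unmapped page cache actually
	 * grows, so this seems a more natural place to wake kswapd
	 * than the allocator fastpath.
	 */
	if (should_reclaim_unmapped_pages(zone))
		wakeup_kswapd(zone, 0, zone_idx(zone));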

>                        mark = zone->watermark[alloc_flags & ALLOC_WMARK_MASK];
>                        if (zone_watermark_ok(zone, order, mark,
>                                    classzone_idx, alloc_flags))
> @@ -4167,8 +4170,12 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat,
>
>                zone->spanned_pages = size;
>                zone->present_pages = realsize;
> +#if defined(CONFIG_UNMAPPED_PAGECACHE_CONTROL) || defined(CONFIG_NUMA)
>                zone->min_unmapped_pages = (realsize*sysctl_min_unmapped_ratio)
>                                                / 100;
> +               zone->max_unmapped_pages = (realsize*sysctl_max_unmapped_ratio)
> +                                               / 100;
> +#endif
>  #ifdef CONFIG_NUMA
>                zone->node = nid;
>                zone->min_slab_pages = (realsize * sysctl_min_slab_ratio) / 100;
> @@ -5084,6 +5091,7 @@ int min_free_kbytes_sysctl_handler(ctl_table *table, int write,
>        return 0;
>  }
>
> +#if defined(CONFIG_UNMAPPED_PAGECACHE_CONTROL) || defined(CONFIG_NUMA)
>  int sysctl_min_unmapped_ratio_sysctl_handler(ctl_table *table, int write,
>        void __user *buffer, size_t *length, loff_t *ppos)
>  {
> @@ -5100,6 +5108,23 @@ int sysctl_min_unmapped_ratio_sysctl_handler(ctl_table *table, int write,
>        return 0;
>  }
>
> +int sysctl_max_unmapped_ratio_sysctl_handler(ctl_table *table, int write,
> +       void __user *buffer, size_t *length, loff_t *ppos)
> +{
> +       struct zone *zone;
> +       int rc;
> +
> +       rc = proc_dointvec_minmax(table, write, buffer, length, ppos);
> +       if (rc)
> +               return rc;
> +
> +       for_each_zone(zone)
> +               zone->max_unmapped_pages = (zone->present_pages *
> +                               sysctl_max_unmapped_ratio) / 100;
> +       return 0;
> +}
> +#endif
> +
>  #ifdef CONFIG_NUMA
>  int sysctl_min_slab_ratio_sysctl_handler(ctl_table *table, int write,
>        void __user *buffer, size_t *length, loff_t *ppos)
> diff --git a/mm/vmscan.c b/mm/vmscan.c
> index 02cc82e..6377411 100644
> --- a/mm/vmscan.c
> +++ b/mm/vmscan.c
> @@ -159,6 +159,29 @@ static DECLARE_RWSEM(shrinker_rwsem);
>  #define scanning_global_lru(sc)        (1)
>  #endif
>
> +#if defined(CONFIG_UNMAPPED_PAGECACHE_CONTROL)
> +static unsigned long reclaim_unmapped_pages(int priority, struct zone *zone,
> +                                               struct scan_control *sc);
> +static int unmapped_page_control __read_mostly;
> +
> +static int __init unmapped_page_control_parm(char *str)
> +{
> +       unmapped_page_control = 1;
> +       /*
> +        * XXX: Should we tweak swappiness here?
> +        */
> +       return 1;
> +}
> +__setup("unmapped_page_control", unmapped_page_control_parm);
> +
> +#else /* !CONFIG_UNMAPPED_PAGECACHE_CONTROL */
> +static inline unsigned long reclaim_unmapped_pages(int priority,
> +                               struct zone *zone, struct scan_control *sc)
> +{
> +       return 0;
> +}
> +#endif
> +
>  static struct zone_reclaim_stat *get_reclaim_stat(struct zone *zone,
>                                                  struct scan_control *sc)
>  {
> @@ -2359,6 +2382,12 @@ loop_again:
>                                shrink_active_list(SWAP_CLUSTER_MAX, zone,
>                                                        &sc, priority, 0);
>
> +                       /*
> +                        * We do unmapped page reclaim once here and once
> +                        * below, so that we don't lose out
> +                        */
> +                       reclaim_unmapped_pages(priority, zone, &sc);
> +
>                        if (!zone_watermark_ok_safe(zone, order,
>                                        high_wmark_pages(zone), 0, 0)) {
>                                end_zone = i;
> @@ -2396,6 +2425,11 @@ loop_again:
>                                continue;
>
>                        sc.nr_scanned = 0;
> +                       /*
> +                        * Reclaim unmapped pages upfront, this should be
> +                        * really cheap
> +                        */
> +                       reclaim_unmapped_pages(priority, zone, &sc);

Why should we do it in two phases?
This is not the direct reclaim path, so it doesn't need to reclaim
tightly: if we can't reclaim enough, the next allocation will wake up
kswapd again and kswapd will retry.

And I have a concern I already pointed out: if memory pressure is heavy
and unmapped pages exceed our threshold, reclaim_unmapped_pages() can
move mapped pages from the tail of the inactive list to the head. That
disturbs LRU ordering, so the working set can be evicted.

Until now zone_reclaim has been used only on NUMA, but you are opening
it up to the whole world. I think it would be a good feature for
embedded systems, too, so I hope we take care of the working set
eviction problem.
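
For example (a rough sketch on top of your reclaim_unmapped_pages(),
assuming it ends up calling __zone_reclaim(); the DEF_PRIORITY guard
is just a hypothetical knob I haven't tested):

static unsigned long reclaim_unmapped_pages(int priority, struct zone *zone,
					    struct scan_control *sc)
{
	/*
	 * Back off when memory pressure is already heavy so that we
	 * don't churn mapped pages on the inactive list and evict the
	 * working set.
	 */
	if (unlikely(unmapped_page_control) &&
	    should_reclaim_unmapped_pages(zone) &&
	    priority == DEF_PRIORITY)
		return __zone_reclaim(zone, sc->gfp_mask, 0);

	return 0;
}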

-- 
Kind regards,
Minchan Kim

  parent reply	other threads:[~2011-01-26 23:12 UTC|newest]

Thread overview: 47+ messages
2011-01-25  5:10 [PATCH 1/2] Refactor zone_reclaim code (v4) Balbir Singh
2011-01-25  5:10 ` [PATCH 3/3] Provide control over unmapped pages (v4) Balbir Singh
2011-01-26 16:57   ` Christoph Lameter
2011-01-26 17:43     ` Balbir Singh
2011-01-26 23:12   ` Minchan Kim [this message]
2011-01-28  2:56     ` Balbir Singh
2011-01-28  5:44       ` Minchan Kim
2011-01-28  6:48         ` Balbir Singh
2011-01-28  7:24           ` Minchan Kim
2011-01-28  7:56             ` KAMEZAWA Hiroyuki
2011-01-28  8:19               ` Balbir Singh
2011-01-28  8:17                 ` KAMEZAWA Hiroyuki
2011-01-28 12:02                   ` Balbir Singh
2011-01-28 15:20               ` Christoph Lameter
2011-01-30 23:58                 ` KAMEZAWA Hiroyuki
2011-01-31  4:37                   ` Balbir Singh
2011-01-28 11:18             ` Balbir Singh
2011-02-10  5:33               ` Minchan Kim
2011-02-10  5:41                 ` Minchan Kim
2011-02-13 17:33                   ` Balbir Singh
2011-02-16 23:45                     ` Minchan Kim
2011-01-25  5:15 ` [PATCH 1/2] Refactor zone_reclaim code (v4) Balbir Singh