linux-kernel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [PATCH] Make SPLIT_RSS_COUNTING configurable
@ 2019-10-02 20:24 Daniel Colascione
  2019-10-02 20:28 ` Daniel Colascione
  0 siblings, 1 reply; 9+ messages in thread
From: Daniel Colascione @ 2019-10-02 20:24 UTC (permalink / raw)
  To: dancol, timmurray, surenb, linux-mm, linux-kernel

Using the new config option, users can disable SPLIT_RSS_COUNTING to
get increased accuracy in user-visible mm counters.

Signed-off-by: Daniel Colascione <dancol@google.com>
---
 include/linux/mm.h            |  4 ++--
 include/linux/mm_types_task.h |  5 ++---
 include/linux/sched.h         |  2 +-
 kernel/fork.c                 |  2 +-
 mm/Kconfig                    | 11 +++++++++++
 mm/memory.c                   |  6 +++---
 6 files changed, 20 insertions(+), 10 deletions(-)

diff --git a/include/linux/mm.h b/include/linux/mm.h
index cc292273e6ba..221395de3cb4 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1637,7 +1637,7 @@ static inline unsigned long get_mm_counter(struct mm_struct *mm, int member)
 {
 	long val = atomic_long_read(&mm->rss_stat.count[member]);
 
-#ifdef SPLIT_RSS_COUNTING
+#ifdef CONFIG_SPLIT_RSS_COUNTING
 	/*
 	 * counter is updated in asynchronous manner and may go to minus.
 	 * But it's never be expected number for users.
@@ -1723,7 +1723,7 @@ static inline void setmax_mm_hiwater_rss(unsigned long *maxrss,
 		*maxrss = hiwater_rss;
 }
 
-#if defined(SPLIT_RSS_COUNTING)
+#ifdef CONFIG_SPLIT_RSS_COUNTING
 void sync_mm_rss(struct mm_struct *mm);
 #else
 static inline void sync_mm_rss(struct mm_struct *mm)
diff --git a/include/linux/mm_types_task.h b/include/linux/mm_types_task.h
index c1bc6731125c..d2adc8057e65 100644
--- a/include/linux/mm_types_task.h
+++ b/include/linux/mm_types_task.h
@@ -48,14 +48,13 @@ enum {
 	NR_MM_COUNTERS
 };
 
-#if USE_SPLIT_PTE_PTLOCKS && defined(CONFIG_MMU)
-#define SPLIT_RSS_COUNTING
+#ifdef CONFIG_SPLIT_RSS_COUNTING
 /* per-thread cached information, */
 struct task_rss_stat {
 	int events;	/* for synchronization threshold */
 	int count[NR_MM_COUNTERS];
 };
-#endif /* USE_SPLIT_PTE_PTLOCKS */
+#endif /* CONFIG_SPLIT_RSS_COUNTING */
 
 struct mm_rss_stat {
 	atomic_long_t count[NR_MM_COUNTERS];
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 2c2e56bd8913..22f354774540 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -729,7 +729,7 @@ struct task_struct {
 	/* Per-thread vma caching: */
 	struct vmacache			vmacache;
 
-#ifdef SPLIT_RSS_COUNTING
+#ifdef CONFIG_SPLIT_RSS_COUNTING
 	struct task_rss_stat		rss_stat;
 #endif
 	int				exit_state;
diff --git a/kernel/fork.c b/kernel/fork.c
index f9572f416126..fc5e0889922b 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1917,7 +1917,7 @@ static __latent_entropy struct task_struct *copy_process(
 	p->vtime.state = VTIME_INACTIVE;
 #endif
 
-#if defined(SPLIT_RSS_COUNTING)
+#ifdef CONFIG_SPLIT_RSS_COUNTING
 	memset(&p->rss_stat, 0, sizeof(p->rss_stat));
 #endif
 
diff --git a/mm/Kconfig b/mm/Kconfig
index a5dae9a7eb51..372ef9449924 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -736,4 +736,15 @@ config ARCH_HAS_PTE_SPECIAL
 config ARCH_HAS_HUGEPD
 	bool
 
+config SPLIT_RSS_COUNTING
+	bool "Per-thread mm counter caching"
+	depends on MMU
+	default y if NR_CPUS >= SPLIT_PTLOCK_CPUS
+	help
+	  Cache mm counter updates in thread structures and
+	  flush them to visible per-process statistics in batches.
+	  Say Y here to slightly reduce cache contention in processes
+	  with many threads at the expense of decreasing the accuracy
+	  of memory statistics in /proc.
+	  
 endmenu
diff --git a/mm/memory.c b/mm/memory.c
index b1ca51a079f2..bf557ed5ba23 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -141,7 +141,7 @@ static int __init init_zero_pfn(void)
 core_initcall(init_zero_pfn);
 
 
-#if defined(SPLIT_RSS_COUNTING)
+#ifdef CONFIG_SPLIT_RSS_COUNTING
 
 void sync_mm_rss(struct mm_struct *mm)
 {
@@ -177,7 +177,7 @@ static void check_sync_rss_stat(struct task_struct *task)
 	if (unlikely(task->rss_stat.events++ > TASK_RSS_EVENTS_THRESH))
 		sync_mm_rss(task->mm);
 }
-#else /* SPLIT_RSS_COUNTING */
+#else /* CONFIG_SPLIT_RSS_COUNTING */
 
 #define inc_mm_counter_fast(mm, member) inc_mm_counter(mm, member)
 #define dec_mm_counter_fast(mm, member) dec_mm_counter(mm, member)
@@ -186,7 +186,7 @@ static void check_sync_rss_stat(struct task_struct *task)
 {
 }
 
-#endif /* SPLIT_RSS_COUNTING */
+#endif /* CONFIG_SPLIT_RSS_COUNTING */
 
 /*
  * Note: this doesn't free the actual pages themselves. That
-- 
2.23.0.581.g78d2f28ef7-goog


^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH] Make SPLIT_RSS_COUNTING configurable
  2019-10-02 20:24 [PATCH] Make SPLIT_RSS_COUNTING configurable Daniel Colascione
@ 2019-10-02 20:28 ` Daniel Colascione
  2019-10-03  1:56   ` Qian Cai
  0 siblings, 1 reply; 9+ messages in thread
From: Daniel Colascione @ 2019-10-02 20:28 UTC (permalink / raw)
  To: Daniel Colascione, Tim Murray, Suren Baghdasaryan, linux-mm,
	linux-kernel, linux-mm

Adding the correct linux-mm address.


On Wed, Oct 2, 2019 at 1:25 PM Daniel Colascione <dancol@google.com> wrote:
>
> Using the new config option, users can disable SPLIT_RSS_COUNTING to
> get increased accuracy in user-visible mm counters.
>
> Signed-off-by: Daniel Colascione <dancol@google.com>
> ---
>  include/linux/mm.h            |  4 ++--
>  include/linux/mm_types_task.h |  5 ++---
>  include/linux/sched.h         |  2 +-
>  kernel/fork.c                 |  2 +-
>  mm/Kconfig                    | 11 +++++++++++
>  mm/memory.c                   |  6 +++---
>  6 files changed, 20 insertions(+), 10 deletions(-)
>
> diff --git a/include/linux/mm.h b/include/linux/mm.h
> index cc292273e6ba..221395de3cb4 100644
> --- a/include/linux/mm.h
> +++ b/include/linux/mm.h
> @@ -1637,7 +1637,7 @@ static inline unsigned long get_mm_counter(struct mm_struct *mm, int member)
>  {
>         long val = atomic_long_read(&mm->rss_stat.count[member]);
>
> -#ifdef SPLIT_RSS_COUNTING
> +#ifdef CONFIG_SPLIT_RSS_COUNTING
>         /*
>          * counter is updated in asynchronous manner and may go to minus.
>          * But it's never be expected number for users.
> @@ -1723,7 +1723,7 @@ static inline void setmax_mm_hiwater_rss(unsigned long *maxrss,
>                 *maxrss = hiwater_rss;
>  }
>
> -#if defined(SPLIT_RSS_COUNTING)
> +#ifdef CONFIG_SPLIT_RSS_COUNTING
>  void sync_mm_rss(struct mm_struct *mm);
>  #else
>  static inline void sync_mm_rss(struct mm_struct *mm)
> diff --git a/include/linux/mm_types_task.h b/include/linux/mm_types_task.h
> index c1bc6731125c..d2adc8057e65 100644
> --- a/include/linux/mm_types_task.h
> +++ b/include/linux/mm_types_task.h
> @@ -48,14 +48,13 @@ enum {
>         NR_MM_COUNTERS
>  };
>
> -#if USE_SPLIT_PTE_PTLOCKS && defined(CONFIG_MMU)
> -#define SPLIT_RSS_COUNTING
> +#ifdef CONFIG_SPLIT_RSS_COUNTING
>  /* per-thread cached information, */
>  struct task_rss_stat {
>         int events;     /* for synchronization threshold */
>         int count[NR_MM_COUNTERS];
>  };
> -#endif /* USE_SPLIT_PTE_PTLOCKS */
> +#endif /* CONFIG_SPLIT_RSS_COUNTING */
>
>  struct mm_rss_stat {
>         atomic_long_t count[NR_MM_COUNTERS];
> diff --git a/include/linux/sched.h b/include/linux/sched.h
> index 2c2e56bd8913..22f354774540 100644
> --- a/include/linux/sched.h
> +++ b/include/linux/sched.h
> @@ -729,7 +729,7 @@ struct task_struct {
>         /* Per-thread vma caching: */
>         struct vmacache                 vmacache;
>
> -#ifdef SPLIT_RSS_COUNTING
> +#ifdef CONFIG_SPLIT_RSS_COUNTING
>         struct task_rss_stat            rss_stat;
>  #endif
>         int                             exit_state;
> diff --git a/kernel/fork.c b/kernel/fork.c
> index f9572f416126..fc5e0889922b 100644
> --- a/kernel/fork.c
> +++ b/kernel/fork.c
> @@ -1917,7 +1917,7 @@ static __latent_entropy struct task_struct *copy_process(
>         p->vtime.state = VTIME_INACTIVE;
>  #endif
>
> -#if defined(SPLIT_RSS_COUNTING)
> +#ifdef CONFIG_SPLIT_RSS_COUNTING
>         memset(&p->rss_stat, 0, sizeof(p->rss_stat));
>  #endif
>
> diff --git a/mm/Kconfig b/mm/Kconfig
> index a5dae9a7eb51..372ef9449924 100644
> --- a/mm/Kconfig
> +++ b/mm/Kconfig
> @@ -736,4 +736,15 @@ config ARCH_HAS_PTE_SPECIAL
>  config ARCH_HAS_HUGEPD
>         bool
>
> +config SPLIT_RSS_COUNTING
> +       bool "Per-thread mm counter caching"
> +       depends on MMU
> +       default y if NR_CPUS >= SPLIT_PTLOCK_CPUS
> +       help
> +         Cache mm counter updates in thread structures and
> +         flush them to visible per-process statistics in batches.
> +         Say Y here to slightly reduce cache contention in processes
> +         with many threads at the expense of decreasing the accuracy
> +         of memory statistics in /proc.
> +
>  endmenu
> diff --git a/mm/memory.c b/mm/memory.c
> index b1ca51a079f2..bf557ed5ba23 100644
> --- a/mm/memory.c
> +++ b/mm/memory.c
> @@ -141,7 +141,7 @@ static int __init init_zero_pfn(void)
>  core_initcall(init_zero_pfn);
>
>
> -#if defined(SPLIT_RSS_COUNTING)
> +#ifdef CONFIG_SPLIT_RSS_COUNTING
>
>  void sync_mm_rss(struct mm_struct *mm)
>  {
> @@ -177,7 +177,7 @@ static void check_sync_rss_stat(struct task_struct *task)
>         if (unlikely(task->rss_stat.events++ > TASK_RSS_EVENTS_THRESH))
>                 sync_mm_rss(task->mm);
>  }
> -#else /* SPLIT_RSS_COUNTING */
> +#else /* CONFIG_SPLIT_RSS_COUNTING */
>
>  #define inc_mm_counter_fast(mm, member) inc_mm_counter(mm, member)
>  #define dec_mm_counter_fast(mm, member) dec_mm_counter(mm, member)
> @@ -186,7 +186,7 @@ static void check_sync_rss_stat(struct task_struct *task)
>  {
>  }
>
> -#endif /* SPLIT_RSS_COUNTING */
> +#endif /* CONFIG_SPLIT_RSS_COUNTING */
>
>  /*
>   * Note: this doesn't free the actual pages themselves. That
> --
> 2.23.0.581.g78d2f28ef7-goog
>

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH] Make SPLIT_RSS_COUNTING configurable
  2019-10-02 20:28 ` Daniel Colascione
@ 2019-10-03  1:56   ` Qian Cai
  2019-10-03  2:08     ` Daniel Colascione
  0 siblings, 1 reply; 9+ messages in thread
From: Qian Cai @ 2019-10-03  1:56 UTC (permalink / raw)
  To: Daniel Colascione
  Cc: Tim Murray, Suren Baghdasaryan, linux-mm, linux-kernel, linux-mm



> On Oct 2, 2019, at 4:29 PM, Daniel Colascione <dancol@google.com> wrote:
> 
> Adding the correct linux-mm address.
> 
> 
>> On Wed, Oct 2, 2019 at 1:25 PM Daniel Colascione <dancol@google.com> wrote:
>> 
>> Using the new config option, users can disable SPLIT_RSS_COUNTING to
>> get increased accuracy in user-visible mm counters.
>> 
>> Signed-off-by: Daniel Colascione <dancol@google.com>
>> ---
>> include/linux/mm.h            |  4 ++--
>> include/linux/mm_types_task.h |  5 ++---
>> include/linux/sched.h         |  2 +-
>> kernel/fork.c                 |  2 +-
>> mm/Kconfig                    | 11 +++++++++++
>> mm/memory.c                   |  6 +++---
>> 6 files changed, 20 insertions(+), 10 deletions(-)
>> 
>> diff --git a/include/linux/mm.h b/include/linux/mm.h
>> index cc292273e6ba..221395de3cb4 100644
>> --- a/include/linux/mm.h
>> +++ b/include/linux/mm.h
>> @@ -1637,7 +1637,7 @@ static inline unsigned long get_mm_counter(struct mm_struct *mm, int member)
>> {
>>        long val = atomic_long_read(&mm->rss_stat.count[member]);
>> 
>> -#ifdef SPLIT_RSS_COUNTING
>> +#ifdef CONFIG_SPLIT_RSS_COUNTING
>>        /*
>>         * counter is updated in asynchronous manner and may go to minus.
>>         * But it's never be expected number for users.
>> @@ -1723,7 +1723,7 @@ static inline void setmax_mm_hiwater_rss(unsigned long *maxrss,
>>                *maxrss = hiwater_rss;
>> }
>> 
>> -#if defined(SPLIT_RSS_COUNTING)
>> +#ifdef CONFIG_SPLIT_RSS_COUNTING
>> void sync_mm_rss(struct mm_struct *mm);
>> #else
>> static inline void sync_mm_rss(struct mm_struct *mm)
>> diff --git a/include/linux/mm_types_task.h b/include/linux/mm_types_task.h
>> index c1bc6731125c..d2adc8057e65 100644
>> --- a/include/linux/mm_types_task.h
>> +++ b/include/linux/mm_types_task.h
>> @@ -48,14 +48,13 @@ enum {
>>        NR_MM_COUNTERS
>> };
>> 
>> -#if USE_SPLIT_PTE_PTLOCKS && defined(CONFIG_MMU)
>> -#define SPLIT_RSS_COUNTING
>> +#ifdef CONFIG_SPLIT_RSS_COUNTING
>> /* per-thread cached information, */
>> struct task_rss_stat {
>>        int events;     /* for synchronization threshold */
>>        int count[NR_MM_COUNTERS];
>> };
>> -#endif /* USE_SPLIT_PTE_PTLOCKS */
>> +#endif /* CONFIG_SPLIT_RSS_COUNTING */
>> 
>> struct mm_rss_stat {
>>        atomic_long_t count[NR_MM_COUNTERS];
>> diff --git a/include/linux/sched.h b/include/linux/sched.h
>> index 2c2e56bd8913..22f354774540 100644
>> --- a/include/linux/sched.h
>> +++ b/include/linux/sched.h
>> @@ -729,7 +729,7 @@ struct task_struct {
>>        /* Per-thread vma caching: */
>>        struct vmacache                 vmacache;
>> 
>> -#ifdef SPLIT_RSS_COUNTING
>> +#ifdef CONFIG_SPLIT_RSS_COUNTING
>>        struct task_rss_stat            rss_stat;
>> #endif
>>        int                             exit_state;
>> diff --git a/kernel/fork.c b/kernel/fork.c
>> index f9572f416126..fc5e0889922b 100644
>> --- a/kernel/fork.c
>> +++ b/kernel/fork.c
>> @@ -1917,7 +1917,7 @@ static __latent_entropy struct task_struct *copy_process(
>>        p->vtime.state = VTIME_INACTIVE;
>> #endif
>> 
>> -#if defined(SPLIT_RSS_COUNTING)
>> +#ifdef CONFIG_SPLIT_RSS_COUNTING
>>        memset(&p->rss_stat, 0, sizeof(p->rss_stat));
>> #endif
>> 
>> diff --git a/mm/Kconfig b/mm/Kconfig
>> index a5dae9a7eb51..372ef9449924 100644
>> --- a/mm/Kconfig
>> +++ b/mm/Kconfig
>> @@ -736,4 +736,15 @@ config ARCH_HAS_PTE_SPECIAL
>> config ARCH_HAS_HUGEPD
>>        bool
>> 
>> +config SPLIT_RSS_COUNTING
>> +       bool "Per-thread mm counter caching"
>> +       depends on MMU
>> +       default y if NR_CPUS >= SPLIT_PTLOCK_CPUS
>> +       help
>> +         Cache mm counter updates in thread structures and
>> +         flush them to visible per-process statistics in batches.
>> +         Say Y here to slightly reduce cache contention in processes
>> +         with many threads at the expense of decreasing the accuracy
>> +         of memory statistics in /proc.
>> +
>> endmenu

All those vague words are going to make developers almost impossible to decide the right selection here. It sounds like we should kill SPLIT_RSS_COUNTING at all to simplify the code as the benefit is so small vs the side-effect?

>> diff --git a/mm/memory.c b/mm/memory.c
>> index b1ca51a079f2..bf557ed5ba23 100644
>> --- a/mm/memory.c
>> +++ b/mm/memory.c
>> @@ -141,7 +141,7 @@ static int __init init_zero_pfn(void)
>> core_initcall(init_zero_pfn);
>> 
>> 
>> -#if defined(SPLIT_RSS_COUNTING)
>> +#ifdef CONFIG_SPLIT_RSS_COUNTING
>> 
>> void sync_mm_rss(struct mm_struct *mm)
>> {
>> @@ -177,7 +177,7 @@ static void check_sync_rss_stat(struct task_struct *task)
>>        if (unlikely(task->rss_stat.events++ > TASK_RSS_EVENTS_THRESH))
>>                sync_mm_rss(task->mm);
>> }
>> -#else /* SPLIT_RSS_COUNTING */
>> +#else /* CONFIG_SPLIT_RSS_COUNTING */
>> 
>> #define inc_mm_counter_fast(mm, member) inc_mm_counter(mm, member)
>> #define dec_mm_counter_fast(mm, member) dec_mm_counter(mm, member)
>> @@ -186,7 +186,7 @@ static void check_sync_rss_stat(struct task_struct *task)
>> {
>> }
>> 
>> -#endif /* SPLIT_RSS_COUNTING */
>> +#endif /* CONFIG_SPLIT_RSS_COUNTING */
>> 
>> /*
>>  * Note: this doesn't free the actual pages themselves. That
>> --
>> 2.23.0.581.g78d2f28ef7-goog
>> 

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH] Make SPLIT_RSS_COUNTING configurable
  2019-10-03  1:56   ` Qian Cai
@ 2019-10-03  2:08     ` Daniel Colascione
  2019-10-04 12:33       ` Michal Hocko
  0 siblings, 1 reply; 9+ messages in thread
From: Daniel Colascione @ 2019-10-03  2:08 UTC (permalink / raw)
  To: Qian Cai; +Cc: Tim Murray, Suren Baghdasaryan, linux-mm, linux-kernel, linux-mm

On Wed, Oct 2, 2019 at 6:56 PM Qian Cai <cai@lca.pw> wrote:
> > On Oct 2, 2019, at 4:29 PM, Daniel Colascione <dancol@google.com> wrote:
> >
> > Adding the correct linux-mm address.
> >
> >
> >> On Wed, Oct 2, 2019 at 1:25 PM Daniel Colascione <dancol@google.com> wrote:
> >>
> >> Using the new config option, users can disable SPLIT_RSS_COUNTING to
> >> get increased accuracy in user-visible mm counters.
> >>
> >> Signed-off-by: Daniel Colascione <dancol@google.com>
> >> ---
> >> include/linux/mm.h            |  4 ++--
> >> include/linux/mm_types_task.h |  5 ++---
> >> include/linux/sched.h         |  2 +-
> >> kernel/fork.c                 |  2 +-
> >> mm/Kconfig                    | 11 +++++++++++
> >> mm/memory.c                   |  6 +++---
> >> 6 files changed, 20 insertions(+), 10 deletions(-)
> >>
> >> diff --git a/include/linux/mm.h b/include/linux/mm.h
> >> index cc292273e6ba..221395de3cb4 100644
> >> --- a/include/linux/mm.h
> >> +++ b/include/linux/mm.h
> >> @@ -1637,7 +1637,7 @@ static inline unsigned long get_mm_counter(struct mm_struct *mm, int member)
> >> {
> >>        long val = atomic_long_read(&mm->rss_stat.count[member]);
> >>
> >> -#ifdef SPLIT_RSS_COUNTING
> >> +#ifdef CONFIG_SPLIT_RSS_COUNTING
> >>        /*
> >>         * counter is updated in asynchronous manner and may go to minus.
> >>         * But it's never be expected number for users.
> >> @@ -1723,7 +1723,7 @@ static inline void setmax_mm_hiwater_rss(unsigned long *maxrss,
> >>                *maxrss = hiwater_rss;
> >> }
> >>
> >> -#if defined(SPLIT_RSS_COUNTING)
> >> +#ifdef CONFIG_SPLIT_RSS_COUNTING
> >> void sync_mm_rss(struct mm_struct *mm);
> >> #else
> >> static inline void sync_mm_rss(struct mm_struct *mm)
> >> diff --git a/include/linux/mm_types_task.h b/include/linux/mm_types_task.h
> >> index c1bc6731125c..d2adc8057e65 100644
> >> --- a/include/linux/mm_types_task.h
> >> +++ b/include/linux/mm_types_task.h
> >> @@ -48,14 +48,13 @@ enum {
> >>        NR_MM_COUNTERS
> >> };
> >>
> >> -#if USE_SPLIT_PTE_PTLOCKS && defined(CONFIG_MMU)
> >> -#define SPLIT_RSS_COUNTING
> >> +#ifdef CONFIG_SPLIT_RSS_COUNTING
> >> /* per-thread cached information, */
> >> struct task_rss_stat {
> >>        int events;     /* for synchronization threshold */
> >>        int count[NR_MM_COUNTERS];
> >> };
> >> -#endif /* USE_SPLIT_PTE_PTLOCKS */
> >> +#endif /* CONFIG_SPLIT_RSS_COUNTING */
> >>
> >> struct mm_rss_stat {
> >>        atomic_long_t count[NR_MM_COUNTERS];
> >> diff --git a/include/linux/sched.h b/include/linux/sched.h
> >> index 2c2e56bd8913..22f354774540 100644
> >> --- a/include/linux/sched.h
> >> +++ b/include/linux/sched.h
> >> @@ -729,7 +729,7 @@ struct task_struct {
> >>        /* Per-thread vma caching: */
> >>        struct vmacache                 vmacache;
> >>
> >> -#ifdef SPLIT_RSS_COUNTING
> >> +#ifdef CONFIG_SPLIT_RSS_COUNTING
> >>        struct task_rss_stat            rss_stat;
> >> #endif
> >>        int                             exit_state;
> >> diff --git a/kernel/fork.c b/kernel/fork.c
> >> index f9572f416126..fc5e0889922b 100644
> >> --- a/kernel/fork.c
> >> +++ b/kernel/fork.c
> >> @@ -1917,7 +1917,7 @@ static __latent_entropy struct task_struct *copy_process(
> >>        p->vtime.state = VTIME_INACTIVE;
> >> #endif
> >>
> >> -#if defined(SPLIT_RSS_COUNTING)
> >> +#ifdef CONFIG_SPLIT_RSS_COUNTING
> >>        memset(&p->rss_stat, 0, sizeof(p->rss_stat));
> >> #endif
> >>
> >> diff --git a/mm/Kconfig b/mm/Kconfig
> >> index a5dae9a7eb51..372ef9449924 100644
> >> --- a/mm/Kconfig
> >> +++ b/mm/Kconfig
> >> @@ -736,4 +736,15 @@ config ARCH_HAS_PTE_SPECIAL
> >> config ARCH_HAS_HUGEPD
> >>        bool
> >>
> >> +config SPLIT_RSS_COUNTING
> >> +       bool "Per-thread mm counter caching"
> >> +       depends on MMU
> >> +       default y if NR_CPUS >= SPLIT_PTLOCK_CPUS
> >> +       help
> >> +         Cache mm counter updates in thread structures and
> >> +         flush them to visible per-process statistics in batches.
> >> +         Say Y here to slightly reduce cache contention in processes
> >> +         with many threads at the expense of decreasing the accuracy
> >> +         of memory statistics in /proc.
> >> +
> >> endmenu
>
> All those vague words are going to make developers almost impossible to decide the right selection here. It sounds like we should kill SPLIT_RSS_COUNTING at all to simplify the code as the benefit is so small vs the side-effect?

Killing SPLIT_RSS_COUNTING would be my first choice; IME, on mobile
and a basic desktop, it doesn't make a difference. I figured making it
a knob would help allay concerns about the performance impact in more
extreme configurations.

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH] Make SPLIT_RSS_COUNTING configurable
  2019-10-03  2:08     ` Daniel Colascione
@ 2019-10-04 12:33       ` Michal Hocko
  2019-10-04 13:26         ` Kirill A. Shutemov
  0 siblings, 1 reply; 9+ messages in thread
From: Michal Hocko @ 2019-10-04 12:33 UTC (permalink / raw)
  To: Daniel Colascione
  Cc: Qian Cai, Tim Murray, Suren Baghdasaryan, linux-mm, linux-kernel,
	linux-mm

On Wed 02-10-19 19:08:16, Daniel Colascione wrote:
> On Wed, Oct 2, 2019 at 6:56 PM Qian Cai <cai@lca.pw> wrote:
> > > On Oct 2, 2019, at 4:29 PM, Daniel Colascione <dancol@google.com> wrote:
> > >
> > > Adding the correct linux-mm address.
> > >
> > >
> > >> +config SPLIT_RSS_COUNTING
> > >> +       bool "Per-thread mm counter caching"
> > >> +       depends on MMU
> > >> +       default y if NR_CPUS >= SPLIT_PTLOCK_CPUS
> > >> +       help
> > >> +         Cache mm counter updates in thread structures and
> > >> +         flush them to visible per-process statistics in batches.
> > >> +         Say Y here to slightly reduce cache contention in processes
> > >> +         with many threads at the expense of decreasing the accuracy
> > >> +         of memory statistics in /proc.
> > >> +
> > >> endmenu
> >
> > All those vague words are going to make developers almost impossible to decide the right selection here. It sounds like we should kill SPLIT_RSS_COUNTING at all to simplify the code as the benefit is so small vs the side-effect?
> 
> Killing SPLIT_RSS_COUNTING would be my first choice; IME, on mobile
> and a basic desktop, it doesn't make a difference. I figured making it
> a knob would help allay concerns about the performance impact in more
> extreme configurations.

I do agree with Qian. Either it is really helpful (is it? probably on
the number of cpus) and it should be auto-enabled or it should be
dropped altogether. You cannot really expect people know how to enable
this without a deep understanding of the MM internals. Not to mention
all those users using distro kernels/configs.

A config option sounds like a bad way forward.

-- 
Michal Hocko
SUSE Labs

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH] Make SPLIT_RSS_COUNTING configurable
  2019-10-04 12:33       ` Michal Hocko
@ 2019-10-04 13:26         ` Kirill A. Shutemov
  2019-10-04 13:45           ` Daniel Colascione
  0 siblings, 1 reply; 9+ messages in thread
From: Kirill A. Shutemov @ 2019-10-04 13:26 UTC (permalink / raw)
  To: Michal Hocko
  Cc: Daniel Colascione, Qian Cai, Tim Murray, Suren Baghdasaryan,
	linux-mm, linux-kernel, linux-mm

On Fri, Oct 04, 2019 at 02:33:49PM +0200, Michal Hocko wrote:
> On Wed 02-10-19 19:08:16, Daniel Colascione wrote:
> > On Wed, Oct 2, 2019 at 6:56 PM Qian Cai <cai@lca.pw> wrote:
> > > > On Oct 2, 2019, at 4:29 PM, Daniel Colascione <dancol@google.com> wrote:
> > > >
> > > > Adding the correct linux-mm address.
> > > >
> > > >
> > > >> +config SPLIT_RSS_COUNTING
> > > >> +       bool "Per-thread mm counter caching"
> > > >> +       depends on MMU
> > > >> +       default y if NR_CPUS >= SPLIT_PTLOCK_CPUS
> > > >> +       help
> > > >> +         Cache mm counter updates in thread structures and
> > > >> +         flush them to visible per-process statistics in batches.
> > > >> +         Say Y here to slightly reduce cache contention in processes
> > > >> +         with many threads at the expense of decreasing the accuracy
> > > >> +         of memory statistics in /proc.
> > > >> +
> > > >> endmenu
> > >
> > > All those vague words are going to make developers almost impossible to decide the right selection here. It sounds like we should kill SPLIT_RSS_COUNTING at all to simplify the code as the benefit is so small vs the side-effect?
> > 
> > Killing SPLIT_RSS_COUNTING would be my first choice; IME, on mobile
> > and a basic desktop, it doesn't make a difference. I figured making it
> > a knob would help allay concerns about the performance impact in more
> > extreme configurations.
> 
> I do agree with Qian. Either it is really helpful (is it? probably on
> the number of cpus) and it should be auto-enabled or it should be
> dropped altogether. You cannot really expect people know how to enable
> this without a deep understanding of the MM internals. Not to mention
> all those users using distro kernels/configs.
> 
> A config option sounds like a bad way forward.

And I don't see much point anyway. Reading RSS counters from proc is
inherently racy. It can just either way after the read due to process
behaviour.

-- 
 Kirill A. Shutemov

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH] Make SPLIT_RSS_COUNTING configurable
  2019-10-04 13:26         ` Kirill A. Shutemov
@ 2019-10-04 13:45           ` Daniel Colascione
  2019-10-04 14:43             ` Michal Hocko
  2019-10-07  0:11             ` Kirill A. Shutemov
  0 siblings, 2 replies; 9+ messages in thread
From: Daniel Colascione @ 2019-10-04 13:45 UTC (permalink / raw)
  To: Kirill A. Shutemov
  Cc: Michal Hocko, Qian Cai, Tim Murray, Suren Baghdasaryan,
	linux-kernel, linux-mm

On Fri, Oct 4, 2019 at 6:26 AM Kirill A. Shutemov <kirill@shutemov.name> wrote:
> On Fri, Oct 04, 2019 at 02:33:49PM +0200, Michal Hocko wrote:
> > On Wed 02-10-19 19:08:16, Daniel Colascione wrote:
> > > On Wed, Oct 2, 2019 at 6:56 PM Qian Cai <cai@lca.pw> wrote:
> > > > > On Oct 2, 2019, at 4:29 PM, Daniel Colascione <dancol@google.com> wrote:
> > > > >
> > > > > Adding the correct linux-mm address.
> > > > >
> > > > >
> > > > >> +config SPLIT_RSS_COUNTING
> > > > >> +       bool "Per-thread mm counter caching"
> > > > >> +       depends on MMU
> > > > >> +       default y if NR_CPUS >= SPLIT_PTLOCK_CPUS
> > > > >> +       help
> > > > >> +         Cache mm counter updates in thread structures and
> > > > >> +         flush them to visible per-process statistics in batches.
> > > > >> +         Say Y here to slightly reduce cache contention in processes
> > > > >> +         with many threads at the expense of decreasing the accuracy
> > > > >> +         of memory statistics in /proc.
> > > > >> +
> > > > >> endmenu
> > > >
> > > > All those vague words are going to make developers almost impossible to decide the right selection here. It sounds like we should kill SPLIT_RSS_COUNTING at all to simplify the code as the benefit is so small vs the side-effect?
> > >
> > > Killing SPLIT_RSS_COUNTING would be my first choice; IME, on mobile
> > > and a basic desktop, it doesn't make a difference. I figured making it
> > > a knob would help allay concerns about the performance impact in more
> > > extreme configurations.
> >
> > I do agree with Qian. Either it is really helpful (is it? probably on
> > the number of cpus) and it should be auto-enabled or it should be
> > dropped altogether. You cannot really expect people know how to enable
> > this without a deep understanding of the MM internals. Not to mention
> > all those users using distro kernels/configs.
> >
> > A config option sounds like a bad way forward.
>
> And I don't see much point anyway. Reading RSS counters from proc is
> inherently racy. It can just either way after the read due to process
> behaviour.

Split RSS accounting doesn't make reading from mm counters racy. It
makes these counters *wrong*. We flush task mm counters to the
mm_struct once every 64 page faults that a task incurs or when that
task exits. That means that if a thread takes 63 page faults and then
sleeps for a week, that thread's process's mm counters are wrong by 63
pages *for a week*. And some processes have a lot of threads,
compounding the error. Split RSS accounting means that memory usage
numbers don't add up. I don't think it's unreasonable to want a mode
where memory counters to agree with other indicators of system
activity.

Nobody has demonstrated that split RSS accounting actually helps in
the real world. But I've described above, concretely, how split RSS
accounting hurts. I've been trying for over a year to either disable
split RSS accounting or to let people opt out of it. If you won't
remove split RSS accounting and you won't let me add a configuration
knob that lets people opt out of it, what will you accept?

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH] Make SPLIT_RSS_COUNTING configurable
  2019-10-04 13:45           ` Daniel Colascione
@ 2019-10-04 14:43             ` Michal Hocko
  2019-10-07  0:11             ` Kirill A. Shutemov
  1 sibling, 0 replies; 9+ messages in thread
From: Michal Hocko @ 2019-10-04 14:43 UTC (permalink / raw)
  To: Daniel Colascione
  Cc: Kirill A. Shutemov, Qian Cai, Tim Murray, Suren Baghdasaryan,
	linux-kernel, linux-mm

On Fri 04-10-19 06:45:21, Daniel Colascione wrote:
[...]
> Nobody has demonstrated that split RSS accounting actually helps in
> the real world. But I've described above, concretely, how split RSS
> accounting hurts. I've been trying for over a year to either disable
> split RSS accounting or to let people opt out of it. If you won't
> remove split RSS accounting and you won't let me add a configuration
> knob that lets people opt out of it, what will you accept?

What is an argument to keep it around?

-- 
Michal Hocko
SUSE Labs

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH] Make SPLIT_RSS_COUNTING configurable
  2019-10-04 13:45           ` Daniel Colascione
  2019-10-04 14:43             ` Michal Hocko
@ 2019-10-07  0:11             ` Kirill A. Shutemov
  1 sibling, 0 replies; 9+ messages in thread
From: Kirill A. Shutemov @ 2019-10-07  0:11 UTC (permalink / raw)
  To: Daniel Colascione
  Cc: Michal Hocko, Qian Cai, Tim Murray, Suren Baghdasaryan,
	linux-kernel, linux-mm

On Fri, Oct 04, 2019 at 06:45:21AM -0700, Daniel Colascione wrote:
> On Fri, Oct 4, 2019 at 6:26 AM Kirill A. Shutemov <kirill@shutemov.name> wrote:
> > On Fri, Oct 04, 2019 at 02:33:49PM +0200, Michal Hocko wrote:
> > > On Wed 02-10-19 19:08:16, Daniel Colascione wrote:
> > > > On Wed, Oct 2, 2019 at 6:56 PM Qian Cai <cai@lca.pw> wrote:
> > > > > > On Oct 2, 2019, at 4:29 PM, Daniel Colascione <dancol@google.com> wrote:
> > > > > >
> > > > > > Adding the correct linux-mm address.
> > > > > >
> > > > > >
> > > > > >> +config SPLIT_RSS_COUNTING
> > > > > >> +       bool "Per-thread mm counter caching"
> > > > > >> +       depends on MMU
> > > > > >> +       default y if NR_CPUS >= SPLIT_PTLOCK_CPUS
> > > > > >> +       help
> > > > > >> +         Cache mm counter updates in thread structures and
> > > > > >> +         flush them to visible per-process statistics in batches.
> > > > > >> +         Say Y here to slightly reduce cache contention in processes
> > > > > >> +         with many threads at the expense of decreasing the accuracy
> > > > > >> +         of memory statistics in /proc.
> > > > > >> +
> > > > > >> endmenu
> > > > >
> > > > > All those vague words are going to make developers almost
> > > > > impossible to decide the right selection here. It sounds like we
> > > > > should kill SPLIT_RSS_COUNTING at all to simplify the code as
> > > > > the benefit is so small vs the side-effect?
> > > >
> > > > Killing SPLIT_RSS_COUNTING would be my first choice; IME, on mobile
> > > > and a basic desktop, it doesn't make a difference. I figured making it
> > > > a knob would help allay concerns about the performance impact in more
> > > > extreme configurations.
> > >
> > > I do agree with Qian. Either it is really helpful (is it? probably on
> > > the number of cpus) and it should be auto-enabled or it should be
> > > dropped altogether. You cannot really expect people know how to enable
> > > this without a deep understanding of the MM internals. Not to mention
> > > all those users using distro kernels/configs.
> > >
> > > A config option sounds like a bad way forward.
> >
> > And I don't see much point anyway. Reading RSS counters from proc is
> > inherently racy. It can just either way after the read due to process
> > behaviour.
> 
> Split RSS accounting doesn't make reading from mm counters racy. It
> makes these counters *wrong*. We flush task mm counters to the
> mm_struct once every 64 page faults that a task incurs or when that
> task exits. That means that if a thread takes 63 page faults and then
> sleeps for a week, that thread's process's mm counters are wrong by 63
> pages *for a week*. And some processes have a lot of threads,
> compounding the error. Split RSS accounting means that memory usage
> numbers don't add up. I don't think it's unreasonable to want a mode
> where memory counters to agree with other indicators of system
> activity.

It's documented behaviour that is upstream for 9 years. Why is your workload
special?

The documentation suggests to use smaps if you want to have precise data.
Why would it not fly for you?

> Nobody has demonstrated that split RSS accounting actually helps in
> the real world.

The original commit 34e55232e59f ("mm: avoid false sharing of mm_counter")
shows numbers on cache misses. It's not a real world workload, but you
don't have any numbers at all to back your claim.

> But I've described above, concretely, how split RSS
> accounting hurts. I've been trying for over a year to either disable
> split RSS accounting or to let people opt out of it. If you won't
> remove split RSS accounting and you won't let me add a configuration
> knob that lets people opt out of it, what will you accept?

Keeping stats precise is welcome, but often expensive. It might be
negligible for small machine, but becomes a problem on multisocket
machine with dozens or hundreds of cores. We need to keep kernel scalable.

We have other stats that update asynchronously (i.e. /proc/vmstat). Would
you like to convert them too?

-- 
 Kirill A. Shutemov

^ permalink raw reply	[flat|nested] 9+ messages in thread

end of thread, other threads:[~2019-10-07  0:11 UTC | newest]

Thread overview: 9+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2019-10-02 20:24 [PATCH] Make SPLIT_RSS_COUNTING configurable Daniel Colascione
2019-10-02 20:28 ` Daniel Colascione
2019-10-03  1:56   ` Qian Cai
2019-10-03  2:08     ` Daniel Colascione
2019-10-04 12:33       ` Michal Hocko
2019-10-04 13:26         ` Kirill A. Shutemov
2019-10-04 13:45           ` Daniel Colascione
2019-10-04 14:43             ` Michal Hocko
2019-10-07  0:11             ` Kirill A. Shutemov

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).