* [PATCH] delayacct: track delays from ksm cow
@ 2022-03-16 13:34 cgel.zte
  2022-03-16 14:56 ` David Hildenbrand
From: cgel.zte @ 2022-03-16 13:34 UTC
  To: bsingharora, akpm; +Cc: yang.yang29, linux-kernel, linux-mm

From: Yang Yang <yang.yang29@zte.com.cn>

Delay accounting does not track the delay of ksm cow.  When tasks
have many ksm pages, it may spend a significant amount of time
waiting for ksm cow.

To get the impact of ksm cow on tasks, measure the delay when ksm
cow happens. This could help users to decide whether to use ksm
or not.

Also update tools/accounting/getdelays.c:

    / # ./getdelays -dl -p 231
    print delayacct stats ON
    listen forever
    PID     231

    CPU             count     real total  virtual total    delay total  delay average
                     6247     1859000000     2154070021     1674255063          0.268ms
    IO              count    delay total  delay average
                        0              0              0ms
    SWAP            count    delay total  delay average
                        0              0              0ms
    RECLAIM         count    delay total  delay average
                        0              0              0ms
    THRASHING       count    delay total  delay average
                        0              0              0ms
    KSM             count    delay total  delay average
                     3635      271567604              0ms

Signed-off-by: Yang Yang <yang.yang29@zte.com.cn>
---
 include/linux/delayacct.h      | 28 ++++++++++++++++++++++++++++
 include/uapi/linux/taskstats.h |  6 +++++-
 kernel/delayacct.c             | 16 ++++++++++++++++
 mm/memory.c                    | 25 ++++++++++++++++++++++---
 tools/accounting/getdelays.c   |  8 +++++++-
 5 files changed, 78 insertions(+), 5 deletions(-)

diff --git a/include/linux/delayacct.h b/include/linux/delayacct.h
index 6b16a6930a19..0fbe2cb25c23 100644
--- a/include/linux/delayacct.h
+++ b/include/linux/delayacct.h
@@ -45,9 +45,13 @@ struct task_delay_info {
 	u64 compact_start;
 	u64 compact_delay;	/* wait for memory compact */
 
+	u64 ksm_start;
+	u64 ksm_delay;	/* wait for ksm cow */
+
 	u32 freepages_count;	/* total count of memory reclaim */
 	u32 thrashing_count;	/* total count of thrash waits */
 	u32 compact_count;	/* total count of memory compact */
+	u32 ksm_count;	/* total count of ksm cow */
 };
 #endif
 
@@ -75,6 +79,8 @@ extern void __delayacct_swapin_start(void);
 extern void __delayacct_swapin_end(void);
 extern void __delayacct_compact_start(void);
 extern void __delayacct_compact_end(void);
+extern void __delayacct_ksm_start(void);
+extern void __delayacct_ksm_end(void);
 
 static inline void delayacct_tsk_init(struct task_struct *tsk)
 {
@@ -191,6 +197,24 @@ static inline void delayacct_compact_end(void)
 		__delayacct_compact_end();
 }
 
+static inline void delayacct_ksm_start(void)
+{
+	if (!static_branch_unlikely(&delayacct_key))
+		return;
+
+	if (current->delays)
+		__delayacct_ksm_start();
+}
+
+static inline void delayacct_ksm_end(void)
+{
+	if (!static_branch_unlikely(&delayacct_key))
+		return;
+
+	if (current->delays)
+		__delayacct_ksm_end();
+}
+
 #else
 static inline void delayacct_init(void)
 {}
@@ -225,6 +249,10 @@ static inline void delayacct_compact_start(void)
 {}
 static inline void delayacct_compact_end(void)
 {}
+static inline void delayacct_ksm_start(void)
+{}
+static inline void delayacct_ksm_end(void)
+{}
 
 #endif /* CONFIG_TASK_DELAY_ACCT */
 
diff --git a/include/uapi/linux/taskstats.h b/include/uapi/linux/taskstats.h
index 12327d32378f..a851c032dfb8 100644
--- a/include/uapi/linux/taskstats.h
+++ b/include/uapi/linux/taskstats.h
@@ -34,7 +34,7 @@
  */
 
 
-#define TASKSTATS_VERSION	11
+#define TASKSTATS_VERSION	12
 #define TS_COMM_LEN		32	/* should be >= TASK_COMM_LEN
 					 * in linux/sched.h */
 
@@ -176,6 +176,10 @@ struct taskstats {
 	/* Delay waiting for memory compact */
 	__u64	compact_count;
 	__u64	compact_delay_total;
+
+	/* Delay waiting for ksm cow */
+	__u64	ksm_count;
+	__u64	ksm_delay_total;
 };
 
 
diff --git a/kernel/delayacct.c b/kernel/delayacct.c
index 2c1e18f7c5cf..11accef0c2bd 100644
--- a/kernel/delayacct.c
+++ b/kernel/delayacct.c
@@ -177,11 +177,14 @@ int delayacct_add_tsk(struct taskstats *d, struct task_struct *tsk)
 	d->thrashing_delay_total = (tmp < d->thrashing_delay_total) ? 0 : tmp;
 	tmp = d->compact_delay_total + tsk->delays->compact_delay;
 	d->compact_delay_total = (tmp < d->compact_delay_total) ? 0 : tmp;
+	tmp = d->ksm_delay_total + tsk->delays->ksm_delay;
+	d->ksm_delay_total = (tmp < d->ksm_delay_total) ? 0 : tmp;
 	d->blkio_count += tsk->delays->blkio_count;
 	d->swapin_count += tsk->delays->swapin_count;
 	d->freepages_count += tsk->delays->freepages_count;
 	d->thrashing_count += tsk->delays->thrashing_count;
 	d->compact_count += tsk->delays->compact_count;
+	d->ksm_count += tsk->delays->ksm_count;
 	raw_spin_unlock_irqrestore(&tsk->delays->lock, flags);
 
 	return 0;
@@ -249,3 +252,16 @@ void __delayacct_compact_end(void)
 		      &current->delays->compact_delay,
 		      &current->delays->compact_count);
 }
+
+void __delayacct_ksm_start(void)
+{
+	current->delays->ksm_start = local_clock();
+}
+
+void __delayacct_ksm_end(void)
+{
+	delayacct_end(&current->delays->lock,
+		      &current->delays->ksm_start,
+		      &current->delays->ksm_delay,
+		      &current->delays->ksm_count);
+}
diff --git a/mm/memory.c b/mm/memory.c
index 4499cf09c21f..23dfc3b24d28 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -3249,6 +3249,8 @@ static vm_fault_t do_wp_page(struct vm_fault *vmf)
 	__releases(vmf->ptl)
 {
 	struct vm_area_struct *vma = vmf->vma;
+	vm_fault_t ret = 0;
+	bool delayacct = false;
 
 	if (userfaultfd_pte_wp(vma, *vmf->pte)) {
 		pte_unmap_unlock(vmf->pte, vmf->ptl);
@@ -3294,7 +3296,11 @@ static vm_fault_t do_wp_page(struct vm_fault *vmf)
 		 *
 		 * PageKsm() doesn't necessarily raise the page refcount.
 		 */
-		if (PageKsm(page) || page_count(page) > 3)
+		if (PageKsm(page)) {
+			delayacct = true;
+			goto copy;
+		}
+		if (page_count(page) > 3)
 			goto copy;
 		if (!PageLRU(page))
 			/*
@@ -3308,7 +3314,12 @@ static vm_fault_t do_wp_page(struct vm_fault *vmf)
 			goto copy;
 		if (PageSwapCache(page))
 			try_to_free_swap(page);
-		if (PageKsm(page) || page_count(page) != 1) {
+		if (PageKsm(page)) {
+			delayacct = true;
+			unlock_page(page);
+			goto copy;
+		}
+		if (page_count(page) != 1) {
 			unlock_page(page);
 			goto copy;
 		}
@@ -3328,10 +3339,18 @@ static vm_fault_t do_wp_page(struct vm_fault *vmf)
 	/*
 	 * Ok, we need to copy. Oh, well..
 	 */
+	if (delayacct)
+		delayacct_ksm_start();
+
 	get_page(vmf->page);
 
 	pte_unmap_unlock(vmf->pte, vmf->ptl);
-	return wp_page_copy(vmf);
+	ret = wp_page_copy(vmf);
+
+	if (delayacct)
+		delayacct_ksm_end();
+
+	return ret;
 }
 
 static void unmap_mapping_range_vma(struct vm_area_struct *vma,
diff --git a/tools/accounting/getdelays.c b/tools/accounting/getdelays.c
index 11e86739456d..3e77c9ff7fcf 100644
--- a/tools/accounting/getdelays.c
+++ b/tools/accounting/getdelays.c
@@ -207,6 +207,8 @@ static void print_delayacct(struct taskstats *t)
 	       "THRASHING%12s%15s%15s\n"
 	       "      %15llu%15llu%15llums\n"
 	       "COMPACT  %12s%15s%15s\n"
+	       "      %15llu%15llu%15llums\n"
+	       "KSM   %15s%15s%15s\n"
 	       "      %15llu%15llu%15llums\n",
 	       "count", "real total", "virtual total",
 	       "delay total", "delay average",
@@ -234,7 +236,11 @@ static void print_delayacct(struct taskstats *t)
 	       "count", "delay total", "delay average",
 	       (unsigned long long)t->compact_count,
 	       (unsigned long long)t->compact_delay_total,
-	       average_ms(t->compact_delay_total, t->compact_count));
+	       average_ms(t->compact_delay_total, t->compact_count),
+	       "count", "delay total", "delay average",
+	       (unsigned long long)t->ksm_count,
+	       (unsigned long long)t->ksm_delay_total,
+	       average_ms(t->ksm_delay_total, t->ksm_count));
 }
 
 static void task_context_switch_counts(struct taskstats *t)
-- 
2.25.1



* Re: [PATCH] delayacct: track delays from ksm cow
  2022-03-16 13:34 [PATCH] delayacct: track delays from ksm cow cgel.zte
@ 2022-03-16 14:56 ` David Hildenbrand
  2022-03-17  2:03   ` CGEL
From: David Hildenbrand @ 2022-03-16 14:56 UTC
  To: cgel.zte, bsingharora, akpm; +Cc: yang.yang29, linux-kernel, linux-mm

On 16.03.22 14:34, cgel.zte@gmail.com wrote:
> From: Yang Yang <yang.yang29@zte.com.cn>
> 
> Delay accounting does not track the delay of ksm cow.  When tasks
> have many ksm pages, it may spend a significant amount of time
> waiting for ksm cow.
> 
> To get the impact of ksm cow on tasks, measure the delay when ksm
> cow happens. This could help users to decide whether to use ksm
> or not.
> 
> Also update tools/accounting/getdelays.c:
> 
>     / # ./getdelays -dl -p 231
>     print delayacct stats ON
>     listen forever
>     PID     231
> 
>     CPU             count     real total  virtual total    delay total  delay average
>                      6247     1859000000     2154070021     1674255063          0.268ms
>     IO              count    delay total  delay average
>                         0              0              0ms
>     SWAP            count    delay total  delay average
>                         0              0              0ms
>     RECLAIM         count    delay total  delay average
>                         0              0              0ms
>     THRASHING       count    delay total  delay average
>                         0              0              0ms
>     KSM             count    delay total  delay average
>                      3635      271567604              0ms
> 

TBH I'm not sure how particularly helpful this is and if we want this.

[...]

>  	struct vm_area_struct *vma = vmf->vma;
> +	vm_fault_t ret = 0;
> +	bool delayacct = false;
>  
>  	if (userfaultfd_pte_wp(vma, *vmf->pte)) {
>  		pte_unmap_unlock(vmf->pte, vmf->ptl);
> @@ -3294,7 +3296,11 @@ static vm_fault_t do_wp_page(struct vm_fault *vmf)
>  		 *
>  		 * PageKsm() doesn't necessarily raise the page refcount.
>  		 */
> -		if (PageKsm(page) || page_count(page) > 3)
> +		if (PageKsm(page)) {
> +			delayacct = true;
> +			goto copy;
> +		}
> +		if (page_count(page) > 3)
>  			goto copy;
>  		if (!PageLRU(page))
>  			/*
> @@ -3308,7 +3314,12 @@ static vm_fault_t do_wp_page(struct vm_fault *vmf)
>  			goto copy;
>  		if (PageSwapCache(page))
>  			try_to_free_swap(page);
> -		if (PageKsm(page) || page_count(page) != 1) {
> +		if (PageKsm(page)) {
> +			delayacct = true;
> +			unlock_page(page);
> +			goto copy;
> +		}
> +		if (page_count(page) != 1) {
>  			unlock_page(page);
>  			goto copy;
>  		}
> @@ -3328,10 +3339,18 @@ static vm_fault_t do_wp_page(struct vm_fault *vmf)
>  	/*
>  	 * Ok, we need to copy. Oh, well..
>  	 */

Why not simply check for PageKsm() here? I dislike the added complexity
above.


-- 
Thanks,

David / dhildenb



* Re: [PATCH] delayacct: track delays from ksm cow
  2022-03-16 14:56 ` David Hildenbrand
@ 2022-03-17  2:03   ` CGEL
  2022-03-17  8:17     ` David Hildenbrand
From: CGEL @ 2022-03-17  2:03 UTC
  To: David Hildenbrand; +Cc: bsingharora, akpm, yang.yang29, linux-kernel, linux-mm

On Wed, Mar 16, 2022 at 03:56:23PM +0100, David Hildenbrand wrote:
> On 16.03.22 14:34, cgel.zte@gmail.com wrote:
> > From: Yang Yang <yang.yang29@zte.com.cn>
> > 
> > Delay accounting does not track the delay of ksm cow.  When tasks
> > have many ksm pages, it may spend a significant amount of time
> > waiting for ksm cow.
> >
> > To get the impact of ksm cow on tasks, measure the delay when ksm
> > cow happens. This could help users to decide whether to use ksm
> > or not.
> > 
> > Also update tools/accounting/getdelays.c:
> > 
> >     / # ./getdelays -dl -p 231
> >     print delayacct stats ON
> >     listen forever
> >     PID     231
> > 
> >     CPU             count     real total  virtual total    delay total  delay average
> >                      6247     1859000000     2154070021     1674255063          0.268ms
> >     IO              count    delay total  delay average
> >                         0              0              0ms
> >     SWAP            count    delay total  delay average
> >                         0              0              0ms
> >     RECLAIM         count    delay total  delay average
> >                         0              0              0ms
> >     THRASHING       count    delay total  delay average
> >                         0              0              0ms
> >     KSM             count    delay total  delay average
> >                      3635      271567604              0ms
> > 
> 
> TBH I'm not sure how particularly helpful this is and if we want this.
>
Thanks for replying.

Users may use ksm by calling madvise(, , MADV_MERGEABLE) when they want
to save memory; the tradeoff is suffering delays on ksm cow. Users can
get to know how much memory ksm saved by reading
/sys/kernel/mm/ksm/pages_sharing, but they don't know the cost of the
ksm cow delay, and this is important for some delay-sensitive tasks. If
users know both the saved memory and the ksm cow delay, they could make
better use of madvise(, , MADV_MERGEABLE).
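For reference, the application-side opt-in looks roughly like this (a
minimal sketch; the region size and data pattern are made up for
illustration):

    #include <stdio.h>
    #include <string.h>
    #include <sys/mman.h>

    int main(void)
    {
            size_t len = 64UL << 20;        /* 64 MiB anonymous region */
            char *buf = mmap(NULL, len, PROT_READ | PROT_WRITE,
                             MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

            if (buf == MAP_FAILED)
                    return 1;
            /* ask ksmd to scan and merge duplicate pages in [buf, buf + len) */
            if (madvise(buf, len, MADV_MERGEABLE))
                    perror("madvise(MADV_MERGEABLE)");
            memset(buf, 0x5a, len);         /* duplicate-heavy data for ksmd */
            /* later writes to merged pages are what trigger ksm cow */
            return 0;
    }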

> [...]
> 
> >  	struct vm_area_struct *vma = vmf->vma;
> > +	vm_fault_t ret = 0;
> > +	bool delayacct = false;
> >  
> >  	if (userfaultfd_pte_wp(vma, *vmf->pte)) {
> >  		pte_unmap_unlock(vmf->pte, vmf->ptl);
> > @@ -3294,7 +3296,11 @@ static vm_fault_t do_wp_page(struct vm_fault *vmf)
> >  		 *
> >  		 * PageKsm() doesn't necessarily raise the page refcount.
> >  		 */
> > -		if (PageKsm(page) || page_count(page) > 3)
> > +		if (PageKsm(page)) {
> > +			delayacct = true;
> > +			goto copy;
> > +		}
> > +		if (page_count(page) > 3)
> >  			goto copy;
> >  		if (!PageLRU(page))
> >  			/*
> > @@ -3308,7 +3314,12 @@ static vm_fault_t do_wp_page(struct vm_fault *vmf)
> >  			goto copy;
> >  		if (PageSwapCache(page))
> >  			try_to_free_swap(page);
> > -		if (PageKsm(page) || page_count(page) != 1) {
> > +		if (PageKsm(page)) {
> > +			delayacct = true;
> > +			unlock_page(page);
> > +			goto copy;
> > +		}
> > +		if (page_count(page) != 1) {
> >  			unlock_page(page);
> >  			goto copy;
> >  		}
> > @@ -3328,10 +3339,18 @@ static vm_fault_t do_wp_page(struct vm_fault *vmf)
> >  	/*
> >  	 * Ok, we need to copy. Oh, well..
> >  	 */
> 
> Why not simply check for PageKsm() here? I dislike the added complexity
> above.
> 
The original code checks PageKsm() twice; I just tried to keep the
original semantics.

If you think this patch is reasonable, I will try to find a better way
to realize this.
> 
> -- 
> Thanks,
> 
> David / dhildenb


* Re: [PATCH] delayacct: track delays from ksm cow
  2022-03-17  2:03   ` CGEL
@ 2022-03-17  8:17     ` David Hildenbrand
  2022-03-17  9:48       ` CGEL
From: David Hildenbrand @ 2022-03-17  8:17 UTC
  To: CGEL; +Cc: bsingharora, akpm, yang.yang29, linux-kernel, linux-mm

On 17.03.22 03:03, CGEL wrote:
> On Wed, Mar 16, 2022 at 03:56:23PM +0100, David Hildenbrand wrote:
>> On 16.03.22 14:34, cgel.zte@gmail.com wrote:
>>> From: Yang Yang <yang.yang29@zte.com.cn>
>>>
>>> Delay accounting does not track the delay of ksm cow.  When tasks
>>> have many ksm pages, it may spend a significant amount of time
>>> waiting for ksm cow.
>>>
>>> To get the impact of ksm cow on tasks, measure the delay when ksm
>>> cow happens. This could help users to decide whether to use ksm
>>> or not.
>>>
>>> Also update tools/accounting/getdelays.c:
>>>
>>>     / # ./getdelays -dl -p 231
>>>     print delayacct stats ON
>>>     listen forever
>>>     PID     231
>>>
>>>     CPU             count     real total  virtual total    delay total  delay average
>>>                      6247     1859000000     2154070021     1674255063          0.268ms
>>>     IO              count    delay total  delay average
>>>                         0              0              0ms
>>>     SWAP            count    delay total  delay average
>>>                         0              0              0ms
>>>     RECLAIM         count    delay total  delay average
>>>                         0              0              0ms
>>>     THRASHING       count    delay total  delay average
>>>                         0              0              0ms
>>>     KSM             count    delay total  delay average
>>>                      3635      271567604              0ms
>>>
>>
>> TBH I'm not sure how particularly helpful this is and if we want this.
>>
> Thanks for replying.
> 
> Users may use ksm by calling madvise(, , MADV_MERGEABLE) when they want
> to save memory; the tradeoff is suffering delays on ksm cow. Users can
> get to know how much memory ksm saved by reading
> /sys/kernel/mm/ksm/pages_sharing, but they don't know the cost of the
> ksm cow delay, and this is important for some delay-sensitive tasks. If
> users know both the saved memory and the ksm cow delay, they could make
> better use of madvise(, , MADV_MERGEABLE).

But that happens after the effects, no?

IOW a user already called madvise(, , MADV_MERGEABLE) and then gets the
results.

So how is this interface useful except for somebody writing an
application and simply being able to benchmark it with vs. without
MADV_MERGEABLE?

-- 
Thanks,

David / dhildenb



* Re: [PATCH] delayacct: track delays from ksm cow
  2022-03-17  8:17     ` David Hildenbrand
@ 2022-03-17  9:48       ` CGEL
  2022-03-17 10:05         ` David Hildenbrand
From: CGEL @ 2022-03-17  9:48 UTC
  To: David Hildenbrand; +Cc: bsingharora, akpm, yang.yang29, linux-kernel, linux-mm

On Thu, Mar 17, 2022 at 09:17:13AM +0100, David Hildenbrand wrote:
> On 17.03.22 03:03, CGEL wrote:
> > On Wed, Mar 16, 2022 at 03:56:23PM +0100, David Hildenbrand wrote:
> >> On 16.03.22 14:34, cgel.zte@gmail.com wrote:
> >>> From: Yang Yang <yang.yang29@zte.com.cn>
> >>>
> >>> Delay accounting does not track the delay of ksm cow.  When tasks
> >>> have many ksm pages, it may spend a significant amount of time
> >>> waiting for ksm cow.
> >>>
> >>> To get the impact of ksm cow on tasks, measure the delay when ksm
> >>> cow happens. This could help users to decide whether to use ksm
> >>> or not.
> >>>
> >>> Also update tools/accounting/getdelays.c:
> >>>
> >>>     / # ./getdelays -dl -p 231
> >>>     print delayacct stats ON
> >>>     listen forever
> >>>     PID     231
> >>>
> >>>     CPU             count     real total  virtual total    delay total  delay average
> >>>                      6247     1859000000     2154070021     1674255063          0.268ms
> >>>     IO              count    delay total  delay average
> >>>                         0              0              0ms
> >>>     SWAP            count    delay total  delay average
> >>>                         0              0              0ms
> >>>     RECLAIM         count    delay total  delay average
> >>>                         0              0              0ms
> >>>     THRASHING       count    delay total  delay average
> >>>                         0              0              0ms
> >>>     KSM             count    delay total  delay average
> >>>                      3635      271567604              0ms
> >>>
> >>
> >> TBH I'm not sure how particularly helpful this is and if we want this.
> >>
> > Thanks for replying.
> > 
> > Users may use ksm by calling madvise(, , MADV_MERGEABLE) when they want
> > to save memory; the tradeoff is suffering delays on ksm cow. Users can
> > get to know how much memory ksm saved by reading
> > /sys/kernel/mm/ksm/pages_sharing, but they don't know the cost of the
> > ksm cow delay, and this is important for some delay-sensitive tasks. If
> > users know both the saved memory and the ksm cow delay, they could make
> > better use of madvise(, , MADV_MERGEABLE).
> 
> But that happens after the effects, no?
> 
> IOW a user already called madvise(, , MADV_MERGEABLE) and then gets the
> results.
>
Imagine users are developing or porting their applications on an
experimental machine; they could take those benchmarks as feedback to
adjust whether to use madvise(, , MADV_MERGEABLE) or its range.

> So how is this interface useful except for somebody writing an
> application and simply being able to benchmark it with vs. without
> MADV_MERGEABLE?
>
> -- 
> Thanks,
> 
> David / dhildenb


* Re: [PATCH] delayacct: track delays from ksm cow
  2022-03-17  9:48       ` CGEL
@ 2022-03-17 10:05         ` David Hildenbrand
  2022-03-18  1:41           ` CGEL
From: David Hildenbrand @ 2022-03-17 10:05 UTC
  To: CGEL; +Cc: bsingharora, akpm, yang.yang29, linux-kernel, linux-mm

On 17.03.22 10:48, CGEL wrote:
> On Thu, Mar 17, 2022 at 09:17:13AM +0100, David Hildenbrand wrote:
>> On 17.03.22 03:03, CGEL wrote:
>>> On Wed, Mar 16, 2022 at 03:56:23PM +0100, David Hildenbrand wrote:
>>>> On 16.03.22 14:34, cgel.zte@gmail.com wrote:
>>>>> From: Yang Yang <yang.yang29@zte.com.cn>
>>>>>
>>>>> Delay accounting does not track the delay of ksm cow.  When tasks
>>>>> have many ksm pages, it may spend a significant amount of time
>>>>> waiting for ksm cow.
>>>>>
>>>>> To get the impact of ksm cow on tasks, measure the delay when ksm
>>>>> cow happens. This could help users to decide whether to use ksm
>>>>> or not.
>>>>>
>>>>> Also update tools/accounting/getdelays.c:
>>>>>
>>>>>     / # ./getdelays -dl -p 231
>>>>>     print delayacct stats ON
>>>>>     listen forever
>>>>>     PID     231
>>>>>
>>>>>     CPU             count     real total  virtual total    delay total  delay average
>>>>>                      6247     1859000000     2154070021     1674255063          0.268ms
>>>>>     IO              count    delay total  delay average
>>>>>                         0              0              0ms
>>>>>     SWAP            count    delay total  delay average
>>>>>                         0              0              0ms
>>>>>     RECLAIM         count    delay total  delay average
>>>>>                         0              0              0ms
>>>>>     THRASHING       count    delay total  delay average
>>>>>                         0              0              0ms
>>>>>     KSM             count    delay total  delay average
>>>>>                      3635      271567604              0ms
>>>>>
>>>>
>>>> TBH I'm not sure how particularly helpful this is and if we want this.
>>>>
>>> Thanks for replying.
>>>
>>> Users may use ksm by calling madvise(, , MADV_MERGEABLE) when they want
>>> to save memory; the tradeoff is suffering delays on ksm cow. Users can
>>> get to know how much memory ksm saved by reading
>>> /sys/kernel/mm/ksm/pages_sharing, but they don't know the cost of the
>>> ksm cow delay, and this is important for some delay-sensitive tasks. If
>>> users know both the saved memory and the ksm cow delay, they could make
>>> better use of madvise(, , MADV_MERGEABLE).
>>
>> But that happens after the effects, no?
>>
>> IOW a user already called madvise(, , MADV_MERGEABLE) and then gets the
>> results.
>>
> Imagine users are developing or porting their applications on an
> experimental machine; they could take those benchmarks as feedback to
> adjust whether to use madvise(, , MADV_MERGEABLE) or its range.

And why can't they run it with and without and observe performance using
existing metrics (or even application-specific metrics?)?


-- 
Thanks,

David / dhildenb



* Re: [PATCH] delayacct: track delays from ksm cow
  2022-03-17 10:05         ` David Hildenbrand
@ 2022-03-18  1:41           ` CGEL
  2022-03-18  8:24             ` David Hildenbrand
From: CGEL @ 2022-03-18  1:41 UTC
  To: David Hildenbrand; +Cc: bsingharora, akpm, yang.yang29, linux-kernel, linux-mm

On Thu, Mar 17, 2022 at 11:05:22AM +0100, David Hildenbrand wrote:
> On 17.03.22 10:48, CGEL wrote:
> > On Thu, Mar 17, 2022 at 09:17:13AM +0100, David Hildenbrand wrote:
> >> On 17.03.22 03:03, CGEL wrote:
> >>> On Wed, Mar 16, 2022 at 03:56:23PM +0100, David Hildenbrand wrote:
> >>>> On 16.03.22 14:34, cgel.zte@gmail.com wrote:
> >>>>> From: Yang Yang <yang.yang29@zte.com.cn>
> >>>>>
> >>>>> Delay accounting does not track the delay of ksm cow.  When tasks
> >>>>> have many ksm pages, it may spend a significant amount of time
> >>>>> waiting for ksm cow.
> >>>>>
> >>>>> To get the impact of ksm cow on tasks, measure the delay when ksm
> >>>>> cow happens. This could help users to decide whether to use ksm
> >>>>> or not.
> >>>>>
> >>>>> Also update tools/accounting/getdelays.c:
> >>>>>
> >>>>>     / # ./getdelays -dl -p 231
> >>>>>     print delayacct stats ON
> >>>>>     listen forever
> >>>>>     PID     231
> >>>>>
> >>>>>     CPU             count     real total  virtual total    delay total  delay average
> >>>>>                      6247     1859000000     2154070021     1674255063          0.268ms
> >>>>>     IO              count    delay total  delay average
> >>>>>                         0              0              0ms
> >>>>>     SWAP            count    delay total  delay average
> >>>>>                         0              0              0ms
> >>>>>     RECLAIM         count    delay total  delay average
> >>>>>                         0              0              0ms
> >>>>>     THRASHING       count    delay total  delay average
> >>>>>                         0              0              0ms
> >>>>>     KSM             count    delay total  delay average
> >>>>>                      3635      271567604              0ms
> >>>>>
> >>>>
> >>>> TBH I'm not sure how particularly helpful this is and if we want this.
> >>>>
> >>> Thanks for replying.
> >>>
> >>> Users may use ksm by calling madvise(, , MADV_MERGEABLE) when they want
> >>> to save memory; the tradeoff is suffering delays on ksm cow. Users can
> >>> get to know how much memory ksm saved by reading
> >>> /sys/kernel/mm/ksm/pages_sharing, but they don't know the cost of the
> >>> ksm cow delay, and this is important for some delay-sensitive tasks. If
> >>> users know both the saved memory and the ksm cow delay, they could make
> >>> better use of madvise(, , MADV_MERGEABLE).
> >>
> >> But that happens after the effects, no?
> >>
> >> IOW a user already called madvise(, , MADV_MERGEABLE) and then gets the
> >> results.
> >>
> > Imagine users are developing or porting their applications on an
> > experimental machine; they could take those benchmarks as feedback to
> > adjust whether to use madvise(, , MADV_MERGEABLE) or its range.
> 
> And why can't they run it with and without and observe performance using
> existing metrics (or even application-specific metrics?)?
> 
>
I think the reason why we need this patch is just like why we need the
swap, reclaim and thrashing getdelays information. When the system is
complex, it's hard to tell precisely which kernel activity impacts the
observed performance or application-specific metrics: preempt? cgroup
throttle? swap? reclaim? IO?

So if we could get a factor's precise impact data, tuning that factor
(for this patch it's ksm) is more efficient.

> -- 
> Thanks,
> 
> David / dhildenb


* Re: [PATCH] delayacct: track delays from ksm cow
  2022-03-18  1:41           ` CGEL
@ 2022-03-18  8:24             ` David Hildenbrand
  2022-03-20  6:13               ` CGEL
From: David Hildenbrand @ 2022-03-18  8:24 UTC
  To: CGEL; +Cc: bsingharora, akpm, yang.yang29, linux-kernel, linux-mm

On 18.03.22 02:41, CGEL wrote:
> On Thu, Mar 17, 2022 at 11:05:22AM +0100, David Hildenbrand wrote:
>> On 17.03.22 10:48, CGEL wrote:
>>> On Thu, Mar 17, 2022 at 09:17:13AM +0100, David Hildenbrand wrote:
>>>> On 17.03.22 03:03, CGEL wrote:
>>>>> On Wed, Mar 16, 2022 at 03:56:23PM +0100, David Hildenbrand wrote:
>>>>>> On 16.03.22 14:34, cgel.zte@gmail.com wrote:
>>>>>>> From: Yang Yang <yang.yang29@zte.com.cn>
>>>>>>>
>>>>>>> Delay accounting does not track the delay of ksm cow.  When tasks
>>>>>>> have many ksm pages, it may spend a significant amount of time
>>>>>>> waiting for ksm cow.
>>>>>>>
>>>>>>> To get the impact of ksm cow on tasks, measure the delay when ksm
>>>>>>> cow happens. This could help users to decide whether to use ksm
>>>>>>> or not.
>>>>>>>
>>>>>>> Also update tools/accounting/getdelays.c:
>>>>>>>
>>>>>>>     / # ./getdelays -dl -p 231
>>>>>>>     print delayacct stats ON
>>>>>>>     listen forever
>>>>>>>     PID     231
>>>>>>>
>>>>>>>     CPU             count     real total  virtual total    delay total  delay average
>>>>>>>                      6247     1859000000     2154070021     1674255063          0.268ms
>>>>>>>     IO              count    delay total  delay average
>>>>>>>                         0              0              0ms
>>>>>>>     SWAP            count    delay total  delay average
>>>>>>>                         0              0              0ms
>>>>>>>     RECLAIM         count    delay total  delay average
>>>>>>>                         0              0              0ms
>>>>>>>     THRASHING       count    delay total  delay average
>>>>>>>                         0              0              0ms
>>>>>>>     KSM             count    delay total  delay average
>>>>>>>                      3635      271567604              0ms
>>>>>>>
>>>>>>
>>>>>> TBH I'm not sure how particularly helpful this is and if we want this.
>>>>>>
>>>>> Thanks for replying.
>>>>>
>>>>> Users may use ksm by calling madvise(, , MADV_MERGEABLE) when they want
>>>>> to save memory; the tradeoff is suffering delays on ksm cow. Users can
>>>>> get to know how much memory ksm saved by reading
>>>>> /sys/kernel/mm/ksm/pages_sharing, but they don't know the cost of the
>>>>> ksm cow delay, and this is important for some delay-sensitive tasks. If
>>>>> users know both the saved memory and the ksm cow delay, they could make
>>>>> better use of madvise(, , MADV_MERGEABLE).
>>>>
>>>> But that happens after the effects, no?
>>>>
>>>> IOW a user already called madvise(, , MADV_MERGEABLE) and then gets the
>>>> results.
>>>>
>>> Imagine users are developing or porting their applications on an
>>> experimental machine; they could take those benchmarks as feedback to
>>> adjust whether to use madvise(, , MADV_MERGEABLE) or its range.
>>
>> And why can't they run it with and without and observe performance using
>> existing metrics (or even application-specific metrics?)?
>>
>>
> I think the reason why we need this patch is just like why we need the
> swap, reclaim and thrashing getdelays information. When the system is
> complex, it's hard to tell precisely which kernel activity impacts the
> observed performance or application-specific metrics: preempt? cgroup
> throttle? swap? reclaim? IO?
> 
> So if we could get a factor's precise impact data, tuning that factor
> (for this patch it's ksm) is more efficient.
> 

I'm not convinced that we want to make our write-fault handler more
complicated for such a corner case with an unclear, eventual use case.
IIRC, whenever using KSM you're already agreeing to eventually pay a
performance price, and the price heavily depends on other factors in the
system. Simply looking at the number of write-faults might already give
an indication what changed with KSM being enabled.

Having that said, I'd like to hear other opinions.

-- 
Thanks,

David / dhildenb



* Re: [PATCH] delayacct: track delays from ksm cow
  2022-03-18  8:24             ` David Hildenbrand
@ 2022-03-20  6:13               ` CGEL
  2022-03-21 15:45                 ` David Hildenbrand
From: CGEL @ 2022-03-20  6:13 UTC
  To: David Hildenbrand; +Cc: bsingharora, akpm, yang.yang29, linux-kernel, linux-mm

On Fri, Mar 18, 2022 at 09:24:44AM +0100, David Hildenbrand wrote:
> On 18.03.22 02:41, CGEL wrote:
> > On Thu, Mar 17, 2022 at 11:05:22AM +0100, David Hildenbrand wrote:
> >> On 17.03.22 10:48, CGEL wrote:
> >>> On Thu, Mar 17, 2022 at 09:17:13AM +0100, David Hildenbrand wrote:
> >>>> On 17.03.22 03:03, CGEL wrote:
> >>>>> On Wed, Mar 16, 2022 at 03:56:23PM +0100, David Hildenbrand wrote:
> >>>>>> On 16.03.22 14:34, cgel.zte@gmail.com wrote:
> >>>>>>> From: Yang Yang <yang.yang29@zte.com.cn>
> >>>>>>>
> >>>>>>> Delay accounting does not track the delay of ksm cow.  When tasks
> >>>>>>> have many ksm pages, it may spend a significant amount of time
> >>>>>>> waiting for ksm cow.
> >>>>>>>
> >>>>>>> To get the impact of ksm cow on tasks, measure the delay when ksm
> >>>>>>> cow happens. This could help users to decide whether to use ksm
> >>>>>>> or not.
> >>>>>>>
> >>>>>>> Also update tools/accounting/getdelays.c:
> >>>>>>>
> >>>>>>>     / # ./getdelays -dl -p 231
> >>>>>>>     print delayacct stats ON
> >>>>>>>     listen forever
> >>>>>>>     PID     231
> >>>>>>>
> >>>>>>>     CPU             count     real total  virtual total    delay total  delay average
> >>>>>>>                      6247     1859000000     2154070021     1674255063          0.268ms
> >>>>>>>     IO              count    delay total  delay average
> >>>>>>>                         0              0              0ms
> >>>>>>>     SWAP            count    delay total  delay average
> >>>>>>>                         0              0              0ms
> >>>>>>>     RECLAIM         count    delay total  delay average
> >>>>>>>                         0              0              0ms
> >>>>>>>     THRASHING       count    delay total  delay average
> >>>>>>>                         0              0              0ms
> >>>>>>>     KSM             count    delay total  delay average
> >>>>>>>                      3635      271567604              0ms
> >>>>>>>
> >>>>>>
> >>>>>> TBH I'm not sure how particularly helpful this is and if we want this.
> >>>>>>
> >>>>> Thanks for replying.
> >>>>>
> >>>>> Users may use ksm by calling madvise(, , MADV_MERGEABLE) when they want
> >>>>> to save memory; the tradeoff is suffering delays on ksm cow. Users can
> >>>>> get to know how much memory ksm saved by reading
> >>>>> /sys/kernel/mm/ksm/pages_sharing, but they don't know the cost of the
> >>>>> ksm cow delay, and this is important for some delay-sensitive tasks. If
> >>>>> users know both the saved memory and the ksm cow delay, they could make
> >>>>> better use of madvise(, , MADV_MERGEABLE).
> >>>>
> >>>> But that happens after the effects, no?
> >>>>
> >>>> IOW a user already called madvise(, , MADV_MERGEABLE) and then gets the
> >>>> results.
> >>>>
> >>> Imagine users are developing or porting their applications on an
> >>> experimental machine; they could take those benchmarks as feedback to
> >>> adjust whether to use madvise(, , MADV_MERGEABLE) or its range.
> >>
> >> And why can't they run it with and without and observe performance using
> >> existing metrics (or even application-specific metrics?)?
> >>
> >>
> > I think the reason why we need this patch is just like why we need the
> > swap, reclaim and thrashing getdelays information. When the system is
> > complex, it's hard to tell precisely which kernel activity impacts the
> > observed performance or application-specific metrics: preempt? cgroup
> > throttle? swap? reclaim? IO?
> >
> > So if we could get a factor's precise impact data, tuning that factor
> > (for this patch it's ksm) is more efficient.
> > 
> 
> I'm not convinced that we want to make our write-fault handler more
> complicated for such a corner case with an unclear, eventual use case.

IIRC, KSM is designed for VMs. But recently we found KSM works well for
systems with many containers (saving about 10%~20% of total memory), and
container technology is more popular today, so KSM may be used more.

To reduce the impact on the write-fault handler, we may write a new
function with ifdef CONFIG_KSM inside to do this job?
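Something like this, perhaps (a rough sketch only; the helper names are
invented and it is untested):

    /* mm/memory.c */
    #ifdef CONFIG_KSM
    static inline void wp_page_ksm_delay_start(bool delayacct)
    {
            if (delayacct)
                    delayacct_ksm_start();
    }

    static inline void wp_page_ksm_delay_end(bool delayacct)
    {
            if (delayacct)
                    delayacct_ksm_end();
    }
    #else
    static inline void wp_page_ksm_delay_start(bool delayacct) { }
    static inline void wp_page_ksm_delay_end(bool delayacct) { }
    #endif

do_wp_page() would then call these around the copy instead of
open-coding the checks.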

> IIRC, whenever using KSM you're already agreeing to eventually pay a
> performance price, and the price heavily depends on other factors in the
> system. Simply looking at the number of write-faults might already give
> an indication what changed with KSM being enabled.
> 
While saying "you're already agreeing to pay a performance price", I think
this is the shortcoming of KSM that puts off its wider use. It's not easy
for a user/app to decide how to use madvise(, , MADV_MERGEABLE).

Is there an easier way to use KSM, enjoying the memory saving while
minimizing the performance price, for containers? We think it's possible,
and are working on a new patch: provide a knob for cgroup to
enable/disable KSM for all tasks in the cgroup, so if your container is
delay sensitive just leave it off, and if not you can easily enable KSM
without modifying app code.

Before using the new knob, users might want to know the precise impact
of KSM. I think write-faults are an indirect metric; if indirect metrics
were good enough, why would we need taskstats and PSI? By the way,
getdelays supports container statistics.
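E.g. something like this (the cgroup path is only an example; see the
usage string of getdelays for the -C option):

    / # ./getdelays -C /sys/fs/cgroup/cpu/mycontainer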

Thanks.

> Having that said, I'd like to hear other opinions.
> 
> -- 
> Thanks,
> 
> David / dhildenb


* Re: [PATCH] delayacct: track delays from ksm cow
  2022-03-20  6:13               ` CGEL
@ 2022-03-21 15:45                 ` David Hildenbrand
  2022-03-22  3:12                   ` CGEL
From: David Hildenbrand @ 2022-03-21 15:45 UTC
  To: CGEL; +Cc: bsingharora, akpm, yang.yang29, linux-kernel, linux-mm

On 20.03.22 07:13, CGEL wrote:
> On Fri, Mar 18, 2022 at 09:24:44AM +0100, David Hildenbrand wrote:
>> On 18.03.22 02:41, CGEL wrote:
>>> On Thu, Mar 17, 2022 at 11:05:22AM +0100, David Hildenbrand wrote:
>>>> On 17.03.22 10:48, CGEL wrote:
>>>>> On Thu, Mar 17, 2022 at 09:17:13AM +0100, David Hildenbrand wrote:
>>>>>> On 17.03.22 03:03, CGEL wrote:
>>>>>>> On Wed, Mar 16, 2022 at 03:56:23PM +0100, David Hildenbrand wrote:
>>>>>>>> On 16.03.22 14:34, cgel.zte@gmail.com wrote:
>>>>>>>>> From: Yang Yang <yang.yang29@zte.com.cn>
>>>>>>>>>
>>>>>>>>> Delay accounting does not track the delay of ksm cow.  When tasks
>>>>>>>>> have many ksm pages, it may spend a significant amount of time
>>>>>>>>> waiting for ksm cow.
>>>>>>>>>
>>>>>>>>> To get the impact of ksm cow on tasks, measure the delay when ksm
>>>>>>>>> cow happens. This could help users to decide whether to use ksm
>>>>>>>>> or not.
>>>>>>>>>
>>>>>>>>> Also update tools/accounting/getdelays.c:
>>>>>>>>>
>>>>>>>>>     / # ./getdelays -dl -p 231
>>>>>>>>>     print delayacct stats ON
>>>>>>>>>     listen forever
>>>>>>>>>     PID     231
>>>>>>>>>
>>>>>>>>>     CPU             count     real total  virtual total    delay total  delay average
>>>>>>>>>                      6247     1859000000     2154070021     1674255063          0.268ms
>>>>>>>>>     IO              count    delay total  delay average
>>>>>>>>>                         0              0              0ms
>>>>>>>>>     SWAP            count    delay total  delay average
>>>>>>>>>                         0              0              0ms
>>>>>>>>>     RECLAIM         count    delay total  delay average
>>>>>>>>>                         0              0              0ms
>>>>>>>>>     THRASHING       count    delay total  delay average
>>>>>>>>>                         0              0              0ms
>>>>>>>>>     KSM             count    delay total  delay average
>>>>>>>>>                      3635      271567604              0ms
>>>>>>>>>
>>>>>>>>
>>>>>>>> TBH I'm not sure how particularly helpful this is and if we want this.
>>>>>>>>
>>>>>>> Thanks for replying.
>>>>>>>
>>>>>>> Users may use ksm by calling madvise(, , MADV_MERGEABLE) when they want
>>>>>>> to save memory; the tradeoff is suffering delays on ksm cow. Users can
>>>>>>> get to know how much memory ksm saved by reading
>>>>>>> /sys/kernel/mm/ksm/pages_sharing, but they don't know the cost of the
>>>>>>> ksm cow delay, and this is important for some delay-sensitive tasks. If
>>>>>>> users know both the saved memory and the ksm cow delay, they could make
>>>>>>> better use of madvise(, , MADV_MERGEABLE).
>>>>>>
>>>>>> But that happens after the effects, no?
>>>>>>
>>>>>> IOW a user already called madvise(, , MADV_MERGEABLE) and then gets the
>>>>>> results.
>>>>>>
>>>>> Imagine users are developing or porting their applications on an
>>>>> experimental machine; they could take those benchmarks as feedback to
>>>>> adjust whether to use madvise(, , MADV_MERGEABLE) or its range.
>>>>
>>>> And why can't they run it with and without and observe performance using
>>>> existing metrics (or even application-specific metrics?)?
>>>>
>>>>
>>> I think the reason why we need this patch is just like why we need the
>>> swap, reclaim and thrashing getdelays information. When the system is
>>> complex, it's hard to tell precisely which kernel activity impacts the
>>> observed performance or application-specific metrics: preempt? cgroup
>>> throttle? swap? reclaim? IO?
>>>
>>> So if we could get a factor's precise impact data, tuning that factor
>>> (for this patch it's ksm) is more efficient.
>>>
>>
>> I'm not convinced that we want to make our write-fault handler more
>> complicated for such a corner case with an unclear, eventual use case.
> 
> IIRC, KSM is designed for VMs. But recently we found KSM works well for
> systems with many containers (saving about 10%~20% of total memory), and
> container technology is more popular today, so KSM may be used more.
> 
> To reduce the impact on the write-fault handler, we may write a new
> function with ifdef CONFIG_KSM inside to do this job?

Maybe we just want to catch the impact of the write-fault handler when
copying more generally?

> 
>> IIRC, whenever using KSM you're already agreeing to eventually pay a
>> performance price, and the price heavily depends on other factors in the
>> system. Simply looking at the number of write-faults might already give
>> an indication what changed with KSM being enabled.
>>
> While saying "you're already agreeing to pay a performance price", I think
> this is the shortcoming of KSM that puts off its wider use. It's not easy
> for a user/app to decide how to use madvise(, , MADV_MERGEABLE).

... and my point is that the metric you're introducing might absolutely
not be expressive for such users playing with MADV_MERGEABLE. IMHO
people will look at actual application performance to figure out what
"harm" will be done, no?

But I do see value in capturing how many COW we have in general --
either via a counter or via a delay as proposed by you.

> 
> Is there an easier way to use KSM, enjoying the memory saving while
> minimizing the performance price, for containers? We think it's possible,
> and are working on a new patch: provide a knob for cgroup to
> enable/disable KSM for all tasks in the cgroup, so if your container is
> delay sensitive just leave it off, and if not you can easily enable KSM
> without modifying app code.
> 
> Before using the new knob, users might want to know the precise impact
> of KSM. I think write-faults are an indirect metric; if indirect metrics
> were good enough, why would we need taskstats and PSI? By the way,
> getdelays supports container statistics.

Would anything speak against making this more generic and capturing the
delay for any COW, not just for KSM?
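For illustration, "generic" could mean accounting around every
write-protect copy, roughly like this (the names are invented; this is
not an existing interface):

    /* do_wp_page() */
    delayacct_wpcopy_start();
    ret = wp_page_copy(vmf);
    delayacct_wpcopy_end();
    return ret;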

-- 
Thanks,

David / dhildenb



* Re: [PATCH] delayacct: track delays from ksm cow
  2022-03-21 15:45                 ` David Hildenbrand
@ 2022-03-22  3:12                   ` CGEL
  2022-03-22  7:55                     ` David Hildenbrand
From: CGEL @ 2022-03-22  3:12 UTC
  To: David Hildenbrand; +Cc: bsingharora, akpm, yang.yang29, linux-kernel, linux-mm

On Mon, Mar 21, 2022 at 04:45:40PM +0100, David Hildenbrand wrote:
> On 20.03.22 07:13, CGEL wrote:
> > On Fri, Mar 18, 2022 at 09:24:44AM +0100, David Hildenbrand wrote:
> >> On 18.03.22 02:41, CGEL wrote:
> >>> On Thu, Mar 17, 2022 at 11:05:22AM +0100, David Hildenbrand wrote:
> >>>> On 17.03.22 10:48, CGEL wrote:
> >>>>> On Thu, Mar 17, 2022 at 09:17:13AM +0100, David Hildenbrand wrote:
> >>>>>> On 17.03.22 03:03, CGEL wrote:
> >>>>>>> On Wed, Mar 16, 2022 at 03:56:23PM +0100, David Hildenbrand wrote:
> >>>>>>>> On 16.03.22 14:34, cgel.zte@gmail.com wrote:
> >>>>>>>>> From: Yang Yang <yang.yang29@zte.com.cn>
> >>>>>>>>>
> >>>>>>>>> Delay accounting does not track the delay of ksm cow.  When tasks
> >>>>>>>>> have many ksm pages, it may spend a significant amount of time
> >>>>>>>>> waiting for ksm cow.
> >>>>>>>>>
> >>>>>>>>> To get the impact of ksm cow on tasks, measure the delay when ksm
> >>>>>>>>> cow happens. This could help users to decide whether to use ksm
> >>>>>>>>> or not.
> >>>>>>>>>
> >>>>>>>>> Also update tools/accounting/getdelays.c:
> >>>>>>>>>
> >>>>>>>>>     / # ./getdelays -dl -p 231
> >>>>>>>>>     print delayacct stats ON
> >>>>>>>>>     listen forever
> >>>>>>>>>     PID     231
> >>>>>>>>>
> >>>>>>>>>     CPU             count     real total  virtual total    delay total  delay average
> >>>>>>>>>                      6247     1859000000     2154070021     1674255063          0.268ms
> >>>>>>>>>     IO              count    delay total  delay average
> >>>>>>>>>                         0              0              0ms
> >>>>>>>>>     SWAP            count    delay total  delay average
> >>>>>>>>>                         0              0              0ms
> >>>>>>>>>     RECLAIM         count    delay total  delay average
> >>>>>>>>>                         0              0              0ms
> >>>>>>>>>     THRASHING       count    delay total  delay average
> >>>>>>>>>                         0              0              0ms
> >>>>>>>>>     KSM             count    delay total  delay average
> >>>>>>>>>                      3635      271567604              0ms
> >>>>>>>>>
> >>>>>>>>
> >>>>>>>> TBH I'm not sure how particularly helpful this is and if we want this.
> >>>>>>>>
> >>>>>>> Thanks for replying.
> >>>>>>>
> >>>>>>> Users may use ksm by calling madvise(, , MADV_MERGEABLE) when they want
> >>>>>>> to save memory; the tradeoff is suffering delays on ksm cow. Users can
> >>>>>>> get to know how much memory ksm saved by reading
> >>>>>>> /sys/kernel/mm/ksm/pages_sharing, but they don't know the cost of the
> >>>>>>> ksm cow delay, and this is important for some delay-sensitive tasks. If
> >>>>>>> users know both the saved memory and the ksm cow delay, they could make
> >>>>>>> better use of madvise(, , MADV_MERGEABLE).
> >>>>>>
> >>>>>> But that happens after the effects, no?
> >>>>>>
> >>>>>> IOW a user already called madvise(, , MADV_MERGEABLE) and then gets the
> >>>>>> results.
> >>>>>>
> >>>>> Imagine users are developing or porting their applications on an
> >>>>> experimental machine; they could take those benchmarks as feedback to
> >>>>> adjust whether to use madvise(, , MADV_MERGEABLE) or its range.
> >>>>
> >>>> And why can't they run it with and without and observe performance using
> >>>> existing metrics (or even application-specific metrics?)?
> >>>>
> >>>>
> >>> I think the reason why we need this patch is just like why we need the
> >>> swap, reclaim and thrashing getdelays information. When the system is
> >>> complex, it's hard to tell precisely which kernel activity impacts the
> >>> observed performance or application-specific metrics: preempt? cgroup
> >>> throttle? swap? reclaim? IO?
> >>>
> >>> So if we could get a factor's precise impact data, tuning that factor
> >>> (for this patch it's ksm) is more efficient.
> >>>
> >>
> >> I'm not convinced that we want to make our write-fault handler more
> >> complicated for such a corner case with an unclear, eventual use case.
> > 
> > IIRC, KSM is designed for VMs. But recently we found KSM works well for
> > systems with many containers (saving about 10%~20% of total memory), and
> > container technology is more popular today, so KSM may be used more.
> >
> > To reduce the impact on the write-fault handler, we may write a new
> > function with ifdef CONFIG_KSM inside to do this job?
> 
> Maybe we just want to catch the impact of the write-fault handler when
> copying more generally?
>
We know the kernel has different kinds of COW, some of which are
transparent for the user. For example, a child process may cause COW,
and the user should not care about this performance impact, because it's
a kernel-internal mechanism the user can hardly do anything about. But
KSM is different: the user can do the policy tuning in userspace. If we
measure all COW, it may be noise, no?
> > 
> >> IIRC, whenever using KSM you're already agreeing to eventually pay a
> >> performance price, and the price heavily depends on other factors in the
> >> system. Simply looking at the number of write-faults might already give
> >> an indication what changed with KSM being enabled.
> >>
> > While saying "you're already agreeing to pay a performance price", I think
> > this is the shortcoming of KSM that puts off its wider use. It's not easy
> > for a user/app to decide how to use madvise(, , MADV_MERGEABLE).
> 
> ... and my point is that the metric you're introducing might absolutely
> not be expressive for such users playing with MADV_MERGEABLE. IMHO
> people will look at actual application performance to figure out what
> "harm" will be done, no?
> 
> But I do see value in capturing how many COW we have in general --
> either via a counter or via a delay as proposed by you.
> 
Thanks for your affirmation. As described above, should we instead add a
vm counter: KSM_COW?
> > 
> > Is there an easier way to use KSM, enjoying the memory saving while
> > minimizing the performance price, for containers? We think it's possible,
> > and are working on a new patch: provide a knob for cgroup to
> > enable/disable KSM for all tasks in the cgroup, so if your container is
> > delay sensitive just leave it off, and if not you can easily enable KSM
> > without modifying app code.
> >
> > Before using the new knob, users might want to know the precise impact
> > of KSM. I think write-faults are an indirect metric; if indirect metrics
> > were good enough, why would we need taskstats and PSI? By the way,
> > getdelays supports container statistics.
> 
> Would anything speak against making this more generic and capturing the
> delay for any COW, not just for KSM?
I think we'd better export data to userspace that is meaningful for the
user. Users may not need kernel-internal mechanism data.

Thanks.
> 
> -- 
> Thanks,
> 
> David / dhildenb


* Re: [PATCH] delayacct: track delays from ksm cow
  2022-03-22  3:12                   ` CGEL
@ 2022-03-22  7:55                     ` David Hildenbrand
  2022-03-22  9:09                       ` CGEL
From: David Hildenbrand @ 2022-03-22  7:55 UTC
  To: CGEL; +Cc: bsingharora, akpm, yang.yang29, linux-kernel, linux-mm

On 22.03.22 04:12, CGEL wrote:
> On Mon, Mar 21, 2022 at 04:45:40PM +0100, David Hildenbrand wrote:
>> On 20.03.22 07:13, CGEL wrote:
>>> On Fri, Mar 18, 2022 at 09:24:44AM +0100, David Hildenbrand wrote:
>>>> On 18.03.22 02:41, CGEL wrote:
>>>>> On Thu, Mar 17, 2022 at 11:05:22AM +0100, David Hildenbrand wrote:
>>>>>> On 17.03.22 10:48, CGEL wrote:
>>>>>>> On Thu, Mar 17, 2022 at 09:17:13AM +0100, David Hildenbrand wrote:
>>>>>>>> On 17.03.22 03:03, CGEL wrote:
>>>>>>>>> On Wed, Mar 16, 2022 at 03:56:23PM +0100, David Hildenbrand wrote:
>>>>>>>>>> On 16.03.22 14:34, cgel.zte@gmail.com wrote:
>>>>>>>>>>> From: Yang Yang <yang.yang29@zte.com.cn>
>>>>>>>>>>>
>>>>>>>>>>> Delay accounting does not track the delay of ksm cow.  When tasks
>>>>>>>>>>> have many ksm pages, it may spend a significant amount of time
>>>>>>>>>>> waiting for ksm cow.
>>>>>>>>>>>
>>>>>>>>>>> To get the impact of ksm cow on tasks, measure the delay when ksm
>>>>>>>>>>> cow happens. This could help users to decide whether to use ksm
>>>>>>>>>>> or not.
>>>>>>>>>>>
>>>>>>>>>>> Also update tools/accounting/getdelays.c:
>>>>>>>>>>>
>>>>>>>>>>>     / # ./getdelays -dl -p 231
>>>>>>>>>>>     print delayacct stats ON
>>>>>>>>>>>     listen forever
>>>>>>>>>>>     PID     231
>>>>>>>>>>>
>>>>>>>>>>>     CPU             count     real total  virtual total    delay total  delay average
>>>>>>>>>>>                      6247     1859000000     2154070021     1674255063          0.268ms
>>>>>>>>>>>     IO              count    delay total  delay average
>>>>>>>>>>>                         0              0              0ms
>>>>>>>>>>>     SWAP            count    delay total  delay average
>>>>>>>>>>>                         0              0              0ms
>>>>>>>>>>>     RECLAIM         count    delay total  delay average
>>>>>>>>>>>                         0              0              0ms
>>>>>>>>>>>     THRASHING       count    delay total  delay average
>>>>>>>>>>>                         0              0              0ms
>>>>>>>>>>>     KSM             count    delay total  delay average
>>>>>>>>>>>                      3635      271567604              0ms
>>>>>>>>>>>
>>>>>>>>>>
>>>>>>>>>> TBH I'm not sure how particularly helpful this is and if we want this.
>>>>>>>>>>
>>>>>>>>> Thanks for replying.
>>>>>>>>>
>>>>>>>>> Users may use ksm by calling madvise(, , MADV_MERGEABLE) when they want
>>>>>>>>> to save memory; the tradeoff is suffering delays on ksm cow. Users can
>>>>>>>>> get to know how much memory ksm saved by reading
>>>>>>>>> /sys/kernel/mm/ksm/pages_sharing, but they don't know the cost of the
>>>>>>>>> ksm cow delay, and this is important for some delay-sensitive tasks. If
>>>>>>>>> users know both the saved memory and the ksm cow delay, they could make
>>>>>>>>> better use of madvise(, , MADV_MERGEABLE).
>>>>>>>>
>>>>>>>> But that happens after the effects, no?
>>>>>>>>
>>>>>>>> IOW a user already called madvise(, , MADV_MERGEABLE) and then gets the
>>>>>>>> results.
>>>>>>>>
>>>>>>> Imagine users are developing or porting their applications on an
>>>>>>> experimental machine; they could take those benchmarks as feedback to
>>>>>>> adjust whether to use madvise(, , MADV_MERGEABLE) or its range.
>>>>>>
>>>>>> And why can't they run it with and without and observe performance using
>>>>>> existing metrics (or even application-specific metrics?)?
>>>>>>
>>>>>>
>>>>> I think the reason why we need this patch is just like why we need the
>>>>> swap, reclaim and thrashing getdelays information. When the system is
>>>>> complex, it's hard to tell precisely which kernel activity impacts the
>>>>> observed performance or application-specific metrics: preempt? cgroup
>>>>> throttle? swap? reclaim? IO?
>>>>>
>>>>> So if we could get a factor's precise impact data, tuning that factor
>>>>> (for this patch it's ksm) is more efficient.
>>>>>
>>>>
>>>> I'm not convinced that we want to make our write-fault handler more
>>>> complicated for such a corner case with an unclear, eventual use case.
>>>
>>> IIRC, KSM is designed for VMs. But recently we found KSM works well for
>>> systems with many containers (saving about 10%~20% of total memory), and
>>> container technology is more popular today, so KSM may be used more.
>>>
>>> To reduce the impact on the write-fault handler, we may write a new
>>> function with ifdef CONFIG_KSM inside to do this job?
>>
>> Maybe we just want to catch the impact of the write-fault handler when
>> copying more generally?
>>
> We know the kernel has different kinds of COW, some of which are
> transparent for the user. For example, a child process may cause COW,
> and the user should not care about this performance impact, because it's
> a kernel-internal mechanism the user can hardly do anything about. But
> KSM is different: the user can do the policy tuning in userspace. If we
> measure all COW, it may be noise, no?

Only to some degree I think. The other delays (e.g., SWAP, RECLAIM) are
also not completely transparent to the user, no? I mean, user space
might affect them to some degree with some tunables, but it's not
completely transparent for the user either.

IIRC, we have these sources of COW that result in a r/w anon page (->
MAP_PRIVATE):
(1) R/O-mapped, (possibly) shared anonymous page (fork() or KSM)
(2) R/O-mapped, shared zeropage (e.g., KSM, read-only access to
    unpopulated page in MAP_ANON)
(3) R/O-mapped, shared file/device/... page that requires a private copy
    on modifications (e.g., MAP_PRIVATE !MAP_ANON)

Note that your current patch won't catch when KSM placed the shared
zeropage (use_zero_page).
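
FWIW, a tiny userspace sketch (illustrative only, error handling
omitted) that triggers (1)-(3):

#include <fcntl.h>
#include <sys/mman.h>
#include <sys/wait.h>
#include <unistd.h>

int main(void)
{
	/* (1) shared anon page: populate, fork(), then write in the child */
	char *anon = mmap(NULL, 4096, PROT_READ | PROT_WRITE,
			  MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	anon[0] = 1;
	if (fork() == 0) {
		anon[0] = 2;		/* write fault copies the shared page */
		_exit(0);
	}
	wait(NULL);

	/* (2) shared zeropage: read an unpopulated page, then write it */
	volatile char *zero = mmap(NULL, 4096, PROT_READ | PROT_WRITE,
				   MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	(void)zero[0];			/* read fault maps the zeropage */
	zero[0] = 1;			/* write fault copies it */

	/* (3) private file page: write to a MAP_PRIVATE file mapping */
	int fd = open("/etc/passwd", O_RDONLY);
	char *file = mmap(NULL, 4096, PROT_READ | PROT_WRITE,
			  MAP_PRIVATE, fd, 0);
	file[0] = 'x';			/* write fault creates a private copy */
	return 0;
}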

Tracking the overall overhead might be of value I think, and it would
still allow for determining how much KSM is involved by measuring with
and without KSM enabled.

>>>
>>>> IIRC, whenever using KSM you're already agreeing to eventually pay a
>>>> performance price, and the price heavily depends on other factors in the
>>>> system. Simply looking at the number of write-faults might already give
>>>> an indication what changed with KSM being enabled.
>>>>
>>> As for "you're already agreeing to pay a performance price", I think
>>> this is the shortcoming of KSM that puts it off from being used more
>>> widely. It's not easy for a user/app to decide how to use
>>> madvise(, , MADV_MERGEABLE).
>>
>> ... and my point is that the metric you're introducing might absolutely
>> not be expressive for such users playing with MADV_MERGEABLE. IMHO
>> people will look at actual application performance to figure out what
>> "harm" will be done, no?
>>
>> But I do see value in capturing how many COWs we have in general --
>> either via a counter or via a delay as proposed by you.
>>
> Thanks for your affirmation. As described above, should we add a vm
> counter, KSM_COW?

As I'm messing with the COW logic lately (e.g., [1]) I'd welcome vm
counters for all different kind of COW-related events, especially

(1) COW of an anon, !KSM page
(2) COW of a KSM page
(3) COW of the shared zeropage
(4) Reuse instead of COW

I used some VM counters myself to debug/test some of my latest changes.
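
Roughly like this (sketch only; the event names are hypothetical):

/* include/linux/vm_event_item.h */
enum vm_event_item {
	/* ... existing events ... */
	COW_ANON,	/* (1) COW of an anon, !KSM page */
	COW_KSM,	/* (2) COW of a KSM page */
	COW_ZERO,	/* (3) COW of the shared zeropage */
	COW_REUSE,	/* (4) reuse instead of COW */
	NR_VM_EVENT_ITEMS
};

/* then bump the matching counter in the mm/memory.c wp-fault paths: */
count_vm_event(COW_KSM);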

>>>
>>> Is there an easier way to use KSM, enjoying the memory saving while
>>> minimizing the performance price for containers? We think it's possible,
>>> and we are working on a new patch: provide a knob for cgroups to
>>> enable/disable KSM for all tasks in a cgroup, so if your container is
>>> delay sensitive you just leave it off, and if not you can easily enable
>>> KSM without modifying the app code.
>>>
>>> Before using the new knob, users might want to know the precise impact of
>>> KSM. I think write-faults are only an indirect measure. If an indirect
>>> measure were good enough, why would we need taskstats and PSI? By the
>>> way, getdelays supports container statistics.
>>
>> Would anything speak against making this more generic and capturing the
>> delay for any COW, not just for KSM?
> I think we'd better export data to userspace that is meaningful for
> users. Users may not need data about kernel-internal mechanisms.

Reading Documentation/accounting/delay-accounting.rst I wonder what we
best put in there.

"Tasks encounter delays in execution when they wait for some kernel
resource to become available."

I mean, in any COW event we are waiting for the kernel to create a copy.


This could be of value even if we add separate VM counters.



[1]
https://lore.kernel.org/linux-mm/20220315104741.63071-2-david@redhat.com/T/

-- 
Thanks,

David / dhildenb


^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH] delayacct: track delays from ksm cow
  2022-03-22  7:55                     ` David Hildenbrand
@ 2022-03-22  9:09                       ` CGEL
  0 siblings, 0 replies; 13+ messages in thread
From: CGEL @ 2022-03-22  9:09 UTC (permalink / raw)
  To: David Hildenbrand; +Cc: bsingharora, akpm, yang.yang29, linux-kernel, linux-mm

On Tue, Mar 22, 2022 at 08:55:15AM +0100, David Hildenbrand wrote:
> On 22.03.22 04:12, CGEL wrote:
> > On Mon, Mar 21, 2022 at 04:45:40PM +0100, David Hildenbrand wrote:
> >> On 20.03.22 07:13, CGEL wrote:
> >>> On Fri, Mar 18, 2022 at 09:24:44AM +0100, David Hildenbrand wrote:
> >>>> On 18.03.22 02:41, CGEL wrote:
> >>>>> On Thu, Mar 17, 2022 at 11:05:22AM +0100, David Hildenbrand wrote:
> >>>>>> On 17.03.22 10:48, CGEL wrote:
> >>>>>>> On Thu, Mar 17, 2022 at 09:17:13AM +0100, David Hildenbrand wrote:
> >>>>>>>> On 17.03.22 03:03, CGEL wrote:
> >>>>>>>>> On Wed, Mar 16, 2022 at 03:56:23PM +0100, David Hildenbrand wrote:
> >>>>>>>>>> On 16.03.22 14:34, cgel.zte@gmail.com wrote:
> >>>>>>>>>>> From: Yang Yang <yang.yang29@zte.com.cn>
> >>>>>>>>>>>
> >>>>>>>>>>> Delay accounting does not track the delay of ksm cow.  When tasks
> >>>>>>>>>>> have many ksm pages, it may spend a amount of time waiting for ksm
> >>>>>>>>>>> cow.
> >>>>>>>>>>>
> >>>>>>>>>>> To get the impact of tasks in ksm cow, measure the delay when ksm
> >>>>>>>>>>> cow happens. This could help users to decide whether to user ksm
> >>>>>>>>>>> or not.
> >>>>>>>>>>>
> >>>>>>>>>>> Also update tools/accounting/getdelays.c:
> >>>>>>>>>>>
> >>>>>>>>>>>     / # ./getdelays -dl -p 231
> >>>>>>>>>>>     print delayacct stats ON
> >>>>>>>>>>>     listen forever
> >>>>>>>>>>>     PID     231
> >>>>>>>>>>>
> >>>>>>>>>>>     CPU             count     real total  virtual total    delay total  delay average
> >>>>>>>>>>>                      6247     1859000000     2154070021     1674255063          0.268ms
> >>>>>>>>>>>     IO              count    delay total  delay average
> >>>>>>>>>>>                         0              0              0ms
> >>>>>>>>>>>     SWAP            count    delay total  delay average
> >>>>>>>>>>>                         0              0              0ms
> >>>>>>>>>>>     RECLAIM         count    delay total  delay average
> >>>>>>>>>>>                         0              0              0ms
> >>>>>>>>>>>     THRASHING       count    delay total  delay average
> >>>>>>>>>>>                         0              0              0ms
> >>>>>>>>>>>     KSM             count    delay total  delay average
> >>>>>>>>>>>                      3635      271567604              0ms
> >>>>>>>>>>>
> >>>>>>>>>>
> >>>>>>>>>> TBH I'm not sure how particularly helpful this is and if we want this.
> >>>>>>>>>>
> >>>>>>>>> Thanks for replying.
> >>>>>>>>>
> >>>>>>>>> Users may use ksm by calling madvise(, , MADV_MERGEABLE) when they want
> >>>>>>>>> to save memory; the tradeoff is suffering delays from ksm cow. Users
> >>>>>>>>> can find out how much memory ksm saved by reading
> >>>>>>>>> /sys/kernel/mm/ksm/pages_sharing, but they don't know the cost of the
> >>>>>>>>> ksm cow delay, and this is important for delay-sensitive tasks. If
> >>>>>>>>> users know both the saved memory and the ksm cow delay, they can make
> >>>>>>>>> better use of madvise(, , MADV_MERGEABLE).
> >>>>>>>>
> >>>>>>>> But that happens after the effects, no?
> >>>>>>>>
> >>>>>>>> IOW a user already called madvise(, , MADV_MERGEABLE) and then gets the
> >>>>>>>> results.
> >>>>>>>>
> >>>>>>> Imagine users are developing or porting their applications on an
> >>>>>>> experiment machine; they could take those benchmarks as feedback to
> >>>>>>> adjust whether to use madvise(, , MADV_MERGEABLE), or its range.
> >>>>>>
> >>>>>> And why can't they run it with and without and observe performance using
> >>>>>> existing metrics (or even application-specific metrics?)?
> >>>>>>
> >>>>>>
> >>>>> I think the reason we need this patch is the same reason we need the
> >>>>> swap, reclaim and thrashing getdelay information. When a system is
> >>>>> complex, it's hard to tell precisely which kernel activity impacts the
> >>>>> observed performance or application-specific metrics: preemption?
> >>>>> cgroup throttling? swap? reclaim? IO?
> >>>>>
> >>>>> So if we can get a factor's precise impact data, tuning that factor
> >>>>> (for this patch, ksm) is more efficient.
> >>>>>
> >>>>
> >>>> I'm not convinced that we want to make our write-fault handler more
> >>>> complicated for such a corner case with an unclear, eventual use case.
> >>>
> >>> IIRC, KSM was designed for VMs. But recently we found KSM also works
> >>> well for systems with many containers (saving about 10%~20% of total
> >>> memory), and container technology is more popular today, so KSM may see
> >>> more use.
> >>>
> >>> To reduce the impact on the write-fault handler, could we write a new
> >>> function with #ifdef CONFIG_KSM inside to do this job?
> >>
> >> Maybe we just want to catch the impact of the write-fault handler when
> >> copying more generally?
> >>
> > We know the kernel has different kinds of COW, and some are transparent
> > to the user. For example, a child process may cause COW, and the user
> > should not care about this performance impact, because it's a
> > kernel-internal mechanism the user can hardly do anything about. But KSM
> > is different: the user can do the policy tuning in userspace. If we
> > measure all COW, wouldn't it be noise?
> 
> Only to some degree I think. The other delays (e.g., SWAP, RECLAIM) are
> also not completely transparent to the user, no? I mean, user space
> might affect them to some degree with some tunables, but it's not
> completely transparent for the user either.
> 
> IIRC, we have these sources of COW that result in a r/w anon page (->
> MAP_PRIVATE):
> (1) R/O-mapped, (possibly) shared anonymous page (fork() or KSM)
> (2) R/O-mapped, shared zeropage (e.g., KSM, read-only access to
>     unpopulated page in MAP_ANON)
> (3) R/O-mapped, shared file/device/... page that requires a private copy
>     on modifications (e.g., MAP_PRIVATE !MAP_ANON)
> 
> Note that your current patch won't catch when KSM placed the shared
> zeropage (use_zero_page).
> 
> Tracking the overall overhead might be of value I think, and it would
> still allow for determining how much KSM is involved by measuring with
> and without KSM enabled.
> 
> >>>
> >>>> IIRC, whenever using KSM you're already agreeing to eventually pay a
> >>>> performance price, and the price heavily depends on other factors in the
> >>>> system. Simply looking at the number of write-faults might already give
> >>>> an indication what changed with KSM being enabled.
> >>>>
> >>> As for "you're already agreeing to pay a performance price", I think
> >>> this is the shortcoming of KSM that puts it off from being used more
> >>> widely. It's not easy for a user/app to decide how to use
> >>> madvise(, , MADV_MERGEABLE).
> >>
> >> ... and my point is that the metric you're introducing might absolutely
> >> not be expressive for such users playing with MADV_MERGEABLE. IMHO
> >> people will look at actual application performance to figure out what
> >> "harm" will be done, no?
> >>
> >> But I do see value in capturing how many COWs we have in general --
> >> either via a counter or via a delay as proposed by you.
> >>
> > Thanks for your affirmation. As described above, should we add a vm
> > counter, KSM_COW?
> 
> As I'm messing with the COW logic lately (e.g., [1]) I'd welcome vm
> counters for all different kind of COW-related events, especially
> 
> (1) COW of an anon, !KSM page
> (2) COW of a KSM page
> (3) COW of the shared zeropage
> (4) Reuse instead of COW
> 
> I used some VM counters myself to debug/test some of my latest changes.
> 
> >>>
> >>> Is there an easier way to use KSM, enjoying the memory saving while
> >>> minimizing the performance price for containers? We think it's possible,
> >>> and we are working on a new patch: provide a knob for cgroups to
> >>> enable/disable KSM for all tasks in a cgroup, so if your container is
> >>> delay sensitive you just leave it off, and if not you can easily enable
> >>> KSM without modifying the app code.
> >>>
> >>> Before using the new knob, users might want to know the precise impact of
> >>> KSM. I think write-faults are only an indirect measure. If an indirect
> >>> measure were good enough, why would we need taskstats and PSI? By the
> >>> way, getdelays supports container statistics.
> >>
> >> Would anything speak against making this more generic and capturing the
> >> delay for any COW, not just for KSM?
> > I think we'd better export data to userspace that is meaningful for
> > users. Users may not need data about kernel-internal mechanisms.
> 
> Reading Documentation/accounting/delay-accounting.rst I wonder what we
> best put in there.
> 
> "Tasks encounter delays in execution when they wait for some kernel
> resource to become available."
> 
> I mean, in any COW event we are waiting for the kernel to create a copy.
> 
> 
> This could be of value even if we add separate VM counters.
>
I think your statement is good enough. I will modify this patch to count
the delay of all COW events, and submit a patch adding new VM counters
for the different kinds of COW.
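
Roughly like this (just a sketch with hypothetical names; the details
will be in v2):

/* mm/memory.c: wrap the actual copy in wp_page_copy() so that every
 * write-protect copy (ksm or not) is accounted: */
static vm_fault_t wp_page_copy(struct vm_fault *vmf)
{
	struct vm_area_struct *vma = vmf->vma;
	struct page *new_page;
	/* ... */
	delayacct_wpcopy_start();	/* hypothetical generalized hook */
	new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, vmf->address);
	if (new_page && vmf->page)
		copy_user_highpage(new_page, vmf->page, vmf->address, vma);
	delayacct_wpcopy_end();
	/* ... */
}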

Great thanks!
> 
> 
> [1]
> https://lore.kernel.org/linux-mm/20220315104741.63071-2-david@redhat.com/T/
> 
> -- 
> Thanks,
> 
> David / dhildenb

^ permalink raw reply	[flat|nested] 13+ messages in thread

end of thread, other threads:[~2022-03-22  9:09 UTC | newest]

Thread overview: 13+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2022-03-16 13:34 [PATCH] delayacct: track delays from ksm cow cgel.zte
2022-03-16 14:56 ` David Hildenbrand
2022-03-17  2:03   ` CGEL
2022-03-17  8:17     ` David Hildenbrand
2022-03-17  9:48       ` CGEL
2022-03-17 10:05         ` David Hildenbrand
2022-03-18  1:41           ` CGEL
2022-03-18  8:24             ` David Hildenbrand
2022-03-20  6:13               ` CGEL
2022-03-21 15:45                 ` David Hildenbrand
2022-03-22  3:12                   ` CGEL
2022-03-22  7:55                     ` David Hildenbrand
2022-03-22  9:09                       ` CGEL
