linux-mm.kvack.org archive mirror
* [PATCH] mm: make allocation counters per-order
@ 2017-07-06 13:04 Roman Gushchin
  2017-07-06 13:19 ` Mel Gorman
                   ` (3 more replies)
  0 siblings, 4 replies; 18+ messages in thread
From: Roman Gushchin @ 2017-07-06 13:04 UTC (permalink / raw)
  To: linux-mm
  Cc: Roman Gushchin, Andrew Morton, Mel Gorman, Johannes Weiner,
	Michal Hocko, Vladimir Davydov, Rik van Riel, kernel-team,
	linux-kernel

High-order allocations are obviously more costly, and it's very useful
to know how many of them happen, if there are any issues
(or suspicions) with memory fragmentation.

This commit changes existing per-zone allocation counters to be
per-zone per-order. These counters are displayed using a new
procfs interface (similar to /proc/buddyinfo):

$ cat /proc/allocinfo
     DMA          0          0          0          0          0 \
       0          0          0          0          0          0
   DMA32          3          0          1          0          0 \
       0          0          0          0          0          0
  Normal    4997056      23594      10902      23686        931 \
      23        122        786         17          1          0
 Movable          0          0          0          0          0 \
       0          0          0          0          0          0
  Device          0          0          0          0          0 \
       0          0          0          0          0          0

The existing vmstat interface remains untouched*, and still shows
the total number of single page allocations, so high-order allocations
are represented as a corresponding number of order-0 allocations.

$ cat /proc/vmstat | grep alloc
pgalloc_dma 0
pgalloc_dma32 7
pgalloc_normal 5461660
pgalloc_movable 0
pgalloc_device 0
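
As a cross-check (illustrative only: the rows above are wrapped with '\'
for readability, while the real file keeps each zone on a single line),
the folded total can be recomputed from the new interface:

$ # sums count(order) * 2^order for the Normal row
$ awk '$1 == "Normal" { t = 0; for (i = 2; i <= NF; i++) t += $i * 2 ^ (i - 2); print t }' /proc/allocinfo
5406252

The result won't match pgalloc_normal exactly, because the two files
are sampled at different moments.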

* I've added device zone for consistency with other zones,
and to avoid messy exclusion of this zone in the code.

Signed-off-by: Roman Gushchin <guro@fb.com>
Suggested-by: Johannes Weiner <hannes@cmpxchg.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Mel Gorman <mgorman@techsingularity.net>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Vladimir Davydov <vdavydov.dev@gmail.com>
Cc: Rik van Riel <riel@redhat.com>
Cc: kernel-team@fb.com
Cc: linux-mm@kvack.org
Cc: linux-kernel@vger.kernel.org
---
 arch/s390/appldata/appldata_mem.c |   7 +++
 include/linux/mmzone.h            |   2 +
 include/linux/vm_event_item.h     |  19 ++++--
 include/linux/vmstat.h            |  13 +++++
 mm/page_alloc.c                   |  11 +++-
 mm/vmstat.c                       | 120 +++++++++++++++++++++++++++++++++++---
 6 files changed, 158 insertions(+), 14 deletions(-)

diff --git a/arch/s390/appldata/appldata_mem.c b/arch/s390/appldata/appldata_mem.c
index 598df57..06216ff0 100644
--- a/arch/s390/appldata/appldata_mem.c
+++ b/arch/s390/appldata/appldata_mem.c
@@ -81,6 +81,7 @@ static void appldata_get_mem_data(void *data)
 	static struct sysinfo val;
 	unsigned long ev[NR_VM_EVENT_ITEMS];
 	struct appldata_mem_data *mem_data;
+	int order;
 
 	mem_data = data;
 	mem_data->sync_count_1++;
@@ -92,6 +93,12 @@ static void appldata_get_mem_data(void *data)
 	mem_data->pswpout    = ev[PSWPOUT];
 	mem_data->pgalloc    = ev[PGALLOC_NORMAL];
 	mem_data->pgalloc    += ev[PGALLOC_DMA];
+	for (order = 1; order < MAX_ORDER; ++order) {
+		mem_data->pgalloc +=
+			ev[PGALLOC_NORMAL + order * MAX_NR_ZONES] << order;
+		mem_data->pgalloc +=
+			 ev[PGALLOC_DMA + order * MAX_NR_ZONES] << order;
+	}
 	mem_data->pgfault    = ev[PGFAULT];
 	mem_data->pgmajfault = ev[PGMAJFAULT];
 
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 16532fa..6598285 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -66,6 +66,8 @@ enum migratetype {
 /* In mm/page_alloc.c; keep in sync also with show_migration_types() there */
 extern char * const migratetype_names[MIGRATE_TYPES];
 
+extern const char *zone_name(int idx);
+
 #ifdef CONFIG_CMA
 #  define is_migrate_cma(migratetype) unlikely((migratetype) == MIGRATE_CMA)
 #  define is_migrate_cma_page(_page) (get_pageblock_migratetype(_page) == MIGRATE_CMA)
diff --git a/include/linux/vm_event_item.h b/include/linux/vm_event_item.h
index 37e8d31..75bbac8 100644
--- a/include/linux/vm_event_item.h
+++ b/include/linux/vm_event_item.h
@@ -19,12 +19,23 @@
 #define HIGHMEM_ZONE(xx)
 #endif
 
-#define FOR_ALL_ZONES(xx) DMA_ZONE(xx) DMA32_ZONE(xx) xx##_NORMAL, HIGHMEM_ZONE(xx) xx##_MOVABLE
+#ifdef CONFIG_ZONE_DEVICE
+#define DEVICE_ZONE(xx) xx##_DEVICE,
+#else
+#define DEVICE_ZONE(xx)
+#endif
+
+#define FOR_ALL_ZONES(xx) DMA_ZONE(xx) DMA32_ZONE(xx) xx##_NORMAL, HIGHMEM_ZONE(xx) xx##_MOVABLE, DEVICE_ZONE(xx)
+
+#define PGALLOC_EVENTS_SIZE (MAX_NR_ZONES * MAX_ORDER)
+#define PGALLOC_EVENTS_CUT_SIZE (MAX_NR_ZONES * (MAX_ORDER - 1))
+#define PGALLOC_FIRST_ZONE (PGALLOC_NORMAL - ZONE_NORMAL)
 
 enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT,
-		FOR_ALL_ZONES(PGALLOC),
-		FOR_ALL_ZONES(ALLOCSTALL),
-		FOR_ALL_ZONES(PGSCAN_SKIP),
+		FOR_ALL_ZONES(PGALLOC)
+		__PGALLOC_LAST = PGALLOC_FIRST_ZONE + PGALLOC_EVENTS_SIZE - 1,
+		FOR_ALL_ZONES(ALLOCSTALL)
+		FOR_ALL_ZONES(PGSCAN_SKIP)
 		PGFREE, PGACTIVATE, PGDEACTIVATE, PGLAZYFREE,
 		PGFAULT, PGMAJFAULT,
 		PGLAZYFREED,
diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h
index b3d85f3..ec30215 100644
--- a/include/linux/vmstat.h
+++ b/include/linux/vmstat.h
@@ -103,6 +103,19 @@ static inline void vm_events_fold_cpu(int cpu)
 #define __count_zid_vm_events(item, zid, delta) \
 	__count_vm_events(item##_NORMAL - ZONE_NORMAL + zid, delta)
 
+static inline void __count_alloc_event(enum zone_type zid, unsigned int order)
+{
+	enum vm_event_item item;
+
+	if (unlikely(order >= MAX_ORDER)) {
+		WARN_ON_ONCE(1);
+		return;
+	}
+
+	item = PGALLOC_FIRST_ZONE + order * MAX_NR_ZONES + zid;
+	__count_vm_events(item, 1);
+}
+
 /*
  * Zone and node-based page accounting with per cpu differentials.
  */
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 80e4adb..e74b327 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -233,6 +233,13 @@ static char * const zone_names[MAX_NR_ZONES] = {
 #endif
 };
 
+const char *zone_name(int zid)
+{
+	if (zid < MAX_NR_ZONES)
+		return zone_names[zid];
+	return NULL;
+}
+
 char * const migratetype_names[MIGRATE_TYPES] = {
 	"Unmovable",
 	"Movable",
@@ -2779,7 +2786,7 @@ static struct page *rmqueue_pcplist(struct zone *preferred_zone,
 	list = &pcp->lists[migratetype];
 	page = __rmqueue_pcplist(zone,  migratetype, cold, pcp, list);
 	if (page) {
-		__count_zid_vm_events(PGALLOC, page_zonenum(page), 1 << order);
+		__count_alloc_event(page_zonenum(page), order);
 		zone_statistics(preferred_zone, zone);
 	}
 	local_irq_restore(flags);
@@ -2827,7 +2834,7 @@ struct page *rmqueue(struct zone *preferred_zone,
 	__mod_zone_freepage_state(zone, -(1 << order),
 				  get_pcppage_migratetype(page));
 
-	__count_zid_vm_events(PGALLOC, page_zonenum(page), 1 << order);
+	__count_alloc_event(page_zonenum(page), order);
 	zone_statistics(preferred_zone, zone);
 	local_irq_restore(flags);
 
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 9a4441b..cd465f6 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -27,6 +27,7 @@
 #include <linux/mm_inline.h>
 #include <linux/page_ext.h>
 #include <linux/page_owner.h>
+#include <linux/mmzone.h>
 
 #include "internal.h"
 
@@ -34,18 +35,18 @@
 DEFINE_PER_CPU(struct vm_event_state, vm_event_states) = {{0}};
 EXPORT_PER_CPU_SYMBOL(vm_event_states);
 
-static void sum_vm_events(unsigned long *ret)
+static void sum_vm_events(unsigned long *ret, int off, size_t nr_events)
 {
 	int cpu;
 	int i;
 
-	memset(ret, 0, NR_VM_EVENT_ITEMS * sizeof(unsigned long));
+	memset(ret, 0, nr_events * sizeof(unsigned long));
 
 	for_each_online_cpu(cpu) {
 		struct vm_event_state *this = &per_cpu(vm_event_states, cpu);
 
-		for (i = 0; i < NR_VM_EVENT_ITEMS; i++)
-			ret[i] += this->event[i];
+		for (i = 0; i < nr_events; i++)
+			ret[i] += this->event[off + i];
 	}
 }
 
@@ -57,7 +58,7 @@ static void sum_vm_events(unsigned long *ret)
 void all_vm_events(unsigned long *ret)
 {
 	get_online_cpus();
-	sum_vm_events(ret);
+	sum_vm_events(ret, 0, NR_VM_EVENT_ITEMS);
 	put_online_cpus();
 }
 EXPORT_SYMBOL_GPL(all_vm_events);
@@ -915,8 +916,15 @@ int fragmentation_index(struct zone *zone, unsigned int order)
 #define TEXT_FOR_HIGHMEM(xx)
 #endif
 
+#ifdef CONFIG_ZONE_DEVICE
+#define TEXT_FOR_DEVICE(xx) xx "_device",
+#else
+#define TEXT_FOR_DEVICE(xx)
+#endif
+
 #define TEXTS_FOR_ZONES(xx) TEXT_FOR_DMA(xx) TEXT_FOR_DMA32(xx) xx "_normal", \
-					TEXT_FOR_HIGHMEM(xx) xx "_movable",
+					TEXT_FOR_HIGHMEM(xx) xx "_movable", \
+					TEXT_FOR_DEVICE(xx)
 
 const char * const vmstat_text[] = {
 	/* enum zone_stat_item countes */
@@ -1480,12 +1488,86 @@ enum writeback_stat_item {
 	NR_VM_WRITEBACK_STAT_ITEMS,
 };
 
+static void sum_alloc_events(unsigned long *v)
+{
+	int zid, order, index;
+
+	for (zid = 0; zid < MAX_NR_ZONES; ++zid) {
+		for (order = 1; order < MAX_ORDER; order++) {
+			index = PGALLOC_FIRST_ZONE + zid;
+			v[index] += v[index + order * MAX_NR_ZONES] << order;
+		}
+	}
+}
+
+static int allocinfo_show(struct seq_file *m, void *arg)
+{
+	unsigned long allocs[PGALLOC_EVENTS_SIZE];
+	unsigned int order;
+	int zid;
+
+	if (arg != SEQ_START_TOKEN)
+		return 0;
+
+	get_online_cpus();
+	sum_vm_events(allocs, PGALLOC_FIRST_ZONE, PGALLOC_EVENTS_SIZE);
+	put_online_cpus();
+
+	for (zid = 0; zid < MAX_NR_ZONES; ++zid) {
+		seq_printf(m, "%8s ", zone_name(zid));
+
+		for (order = 0; order < MAX_ORDER; order++)
+			seq_printf(m, "%10lu ",
+				   allocs[zid + order * MAX_NR_ZONES]);
+
+		seq_putc(m, '\n');
+	}
+
+	return 0;
+}
+
+static void *allocinfo_start(struct seq_file *m, loff_t *pos)
+{
+	if (*pos)
+		return NULL;
+	return SEQ_START_TOKEN;
+}
+
+static void *allocinfo_next(struct seq_file *m, void *arg, loff_t *pos)
+{
+	++*pos;
+	return NULL;
+}
+
+static void allocinfo_stop(struct seq_file *m, void *arg)
+{
+}
+
+static const struct seq_operations allocinfo_op = {
+	.start	= allocinfo_start,
+	.next	= allocinfo_next,
+	.stop	= allocinfo_stop,
+	.show	= allocinfo_show,
+};
+
+static int allocinfo_open(struct inode *inode, struct file *file)
+{
+	return seq_open(file, &allocinfo_op);
+}
+
+static const struct file_operations allocinfo_file_operations = {
+	.open		= allocinfo_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= seq_release,
+};
+
 static void *vmstat_start(struct seq_file *m, loff_t *pos)
 {
 	unsigned long *v;
 	int i, stat_items_size;
 
-	if (*pos >= ARRAY_SIZE(vmstat_text))
+	if (*pos >= ARRAY_SIZE(vmstat_text) + PGALLOC_EVENTS_CUT_SIZE)
 		return NULL;
 	stat_items_size = NR_VM_ZONE_STAT_ITEMS * sizeof(unsigned long) +
 			  NR_VM_NODE_STAT_ITEMS * sizeof(unsigned long) +
@@ -1513,6 +1595,7 @@ static void *vmstat_start(struct seq_file *m, loff_t *pos)
 
 #ifdef CONFIG_VM_EVENT_COUNTERS
 	all_vm_events(v);
+	sum_alloc_events(v);
 	v[PGPGIN] /= 2;		/* sectors -> kbytes */
 	v[PGPGOUT] /= 2;
 #endif
@@ -1521,8 +1604,16 @@ static void *vmstat_start(struct seq_file *m, loff_t *pos)
 
 static void *vmstat_next(struct seq_file *m, void *arg, loff_t *pos)
 {
+	int alloc_event_start = NR_VM_ZONE_STAT_ITEMS +
+		NR_VM_NODE_STAT_ITEMS +
+		NR_VM_WRITEBACK_STAT_ITEMS +
+		PGALLOC_FIRST_ZONE;
+
 	(*pos)++;
-	if (*pos >= ARRAY_SIZE(vmstat_text))
+	if (*pos == alloc_event_start + MAX_NR_ZONES)
+		*(pos) += PGALLOC_EVENTS_CUT_SIZE;
+
+	if (*pos >= ARRAY_SIZE(vmstat_text) + PGALLOC_EVENTS_CUT_SIZE)
 		return NULL;
 	return (unsigned long *)m->private + *pos;
 }
@@ -1531,6 +1622,18 @@ static int vmstat_show(struct seq_file *m, void *arg)
 {
 	unsigned long *l = arg;
 	unsigned long off = l - (unsigned long *)m->private;
+	int alloc_event_start = NR_VM_ZONE_STAT_ITEMS +
+		NR_VM_NODE_STAT_ITEMS +
+		NR_VM_WRITEBACK_STAT_ITEMS +
+		PGALLOC_FIRST_ZONE;
+
+	if (off >= alloc_event_start + PGALLOC_EVENTS_SIZE)
+		off -= PGALLOC_EVENTS_CUT_SIZE;
+
+	if (unlikely(off >= ARRAY_SIZE(vmstat_text))) {
+		WARN_ON_ONCE(1);
+		return 0;
+	}
 
 	seq_puts(m, vmstat_text[off]);
 	seq_put_decimal_ull(m, " ", *l);
@@ -1790,6 +1893,7 @@ void __init init_mm_internals(void)
 #endif
 #ifdef CONFIG_PROC_FS
 	proc_create("buddyinfo", 0444, NULL, &buddyinfo_file_operations);
+	proc_create("allocinfo", 0444, NULL, &allocinfo_file_operations);
 	proc_create("pagetypeinfo", 0444, NULL, &pagetypeinfo_file_operations);
 	proc_create("vmstat", 0444, NULL, &vmstat_file_operations);
 	proc_create("zoneinfo", 0444, NULL, &zoneinfo_file_operations);
-- 
2.7.4


* Re: [PATCH] mm: make allocation counters per-order
  2017-07-06 13:04 [PATCH] mm: make allocation counters per-order Roman Gushchin
@ 2017-07-06 13:19 ` Mel Gorman
  2017-07-06 14:46   ` Roman Gushchin
  2017-07-06 14:54   ` [PATCH] " Debabrata Banerjee
  2017-07-06 15:02 ` Christoph Lameter
                   ` (2 subsequent siblings)
  3 siblings, 2 replies; 18+ messages in thread
From: Mel Gorman @ 2017-07-06 13:19 UTC (permalink / raw)
  To: Roman Gushchin
  Cc: linux-mm, Andrew Morton, Johannes Weiner, Michal Hocko,
	Vladimir Davydov, Rik van Riel, kernel-team, linux-kernel

On Thu, Jul 06, 2017 at 02:04:31PM +0100, Roman Gushchin wrote:
> High-order allocations are obviously more costly, and it's very useful
> to know how many of them happen, if there are any issues
> (or suspicions) with memory fragmentation.
> 
> This commit changes existing per-zone allocation counters to be
> per-zone per-order. These counters are displayed using a new
> procfs interface (similar to /proc/buddyinfo):
> 
> $ cat /proc/allocinfo
>      DMA          0          0          0          0          0 \
>        0          0          0          0          0          0
>    DMA32          3          0          1          0          0 \
>        0          0          0          0          0          0
>   Normal    4997056      23594      10902      23686        931 \
>       23        122        786         17          1          0
>  Movable          0          0          0          0          0 \
>        0          0          0          0          0          0
>   Device          0          0          0          0          0 \
>        0          0          0          0          0          0
> 
> The existing vmstat interface remains untouched*, and still shows
> the total number of single page allocations, so high-order allocations
> are represented as a corresponding number of order-0 allocations.
> 
> $ cat /proc/vmstat | grep alloc
> pgalloc_dma 0
> pgalloc_dma32 7
> pgalloc_normal 5461660
> pgalloc_movable 0
> pgalloc_device 0
> 
> * I've added device zone for consistency with other zones,
> and to avoid messy exclusion of this zone in the code.
> 

The alloc counter updates are themselves a surprisingly heavy cost to
the allocation path and this makes it worse for a debugging case that is
relatively rare. I'm extremely reluctant for such a patch to be added
given that the tracepoints can be used to assemble such a monitor even
if it means running a userspace daemon to keep track of it. Would such a
solution be suitable? Failing that if this is a severe issue, would it be
possible to at least make this a compile-time or static tracepoint option?
That way, only people that really need it have to take the penalty.
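
For example, a rough sketch (untested; assuming tracefs is mounted at
/sys/kernel/debug/tracing) that derives per-order allocation counts
from the existing mm_page_alloc tracepoint over a sampling window:

# echo 1 > /sys/kernel/debug/tracing/events/kmem/mm_page_alloc/enable
# # assumes tracefs at the usual mount point; 60s sampling window
# timeout 60 cat /sys/kernel/debug/tracing/trace_pipe | \
	grep -o 'order=[0-9]*' | sort | uniq -c

A small daemon doing roughly this would build the same per-order
histogram without adding anything to the allocator fast path.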

-- 
Mel Gorman
SUSE Labs


* Re: [PATCH] mm: make allocation counters per-order
  2017-07-06 13:19 ` Mel Gorman
@ 2017-07-06 14:46   ` Roman Gushchin
  2017-07-06 15:47     ` Mel Gorman
  2017-07-06 14:54   ` [PATCH] " Debabrata Banerjee
  1 sibling, 1 reply; 18+ messages in thread
From: Roman Gushchin @ 2017-07-06 14:46 UTC (permalink / raw)
  To: Mel Gorman
  Cc: linux-mm, Andrew Morton, Johannes Weiner, Michal Hocko,
	Vladimir Davydov, Rik van Riel, kernel-team, linux-kernel

On Thu, Jul 06, 2017 at 02:19:41PM +0100, Mel Gorman wrote:
> On Thu, Jul 06, 2017 at 02:04:31PM +0100, Roman Gushchin wrote:
> > High-order allocations are obviously more costly, and it's very useful
> > to know how many of them happen, if there are any issues
> > (or suspicions) with memory fragmentation.
> > 
> > This commit changes existing per-zone allocation counters to be
> > per-zone per-order. These counters are displayed using a new
> > procfs interface (similar to /proc/buddyinfo):
> > 
> > $ cat /proc/allocinfo
> >      DMA          0          0          0          0          0 \
> >        0          0          0          0          0          0
> >    DMA32          3          0          1          0          0 \
> >        0          0          0          0          0          0
> >   Normal    4997056      23594      10902      23686        931 \
> >       23        122        786         17          1          0
> >  Movable          0          0          0          0          0 \
> >        0          0          0          0          0          0
> >   Device          0          0          0          0          0 \
> >        0          0          0          0          0          0
> > 
> > The existing vmstat interface remains untouched*, and still shows
> > the total number of single page allocations, so high-order allocations
> > are represented as a corresponding number of order-0 allocations.
> > 
> > $ cat /proc/vmstat | grep alloc
> > pgalloc_dma 0
> > pgalloc_dma32 7
> > pgalloc_normal 5461660
> > pgalloc_movable 0
> > pgalloc_device 0
> > 
> > * I've added device zone for consistency with other zones,
> > and to avoid messy exclusion of this zone in the code.
> > 
> 
> The alloc counter updates are themselves a surprisingly heavy cost to
> the allocation path and this makes it worse for a debugging case that is
> relatively rare. I'm extremely reluctant for such a patch to be added
> given that the tracepoints can be used to assemble such a monitor even
> if it means running a userspace daemon to keep track of it. Would such a
> solution be suitable? Failing that if this is a severe issue, would it be
> possible to at least make this a compile-time or static tracepoint option?
> That way, only people that really need it have to take the penalty.

I've tried to measure the difference with my patch applied and without
any accounting at all (__count_alloc_event() redefined to an empty function),
and I wasn't able to find any measurable difference.
Can you please provide more details on the scenario in which the
alloc counters were costly?

As the new counters replace an existing one, and both are per-cpu counters,
I believe the difference should be really small.

If there is a case, when the difference is meaningful,
I'll, of course, make the whole thing a compile-time option.

Thank you!


* Re: [PATCH] mm: make allocation counters per-order
  2017-07-06 13:19 ` Mel Gorman
  2017-07-06 14:46   ` Roman Gushchin
@ 2017-07-06 14:54   ` Debabrata Banerjee
  2017-07-06 15:51     ` Mel Gorman
  1 sibling, 1 reply; 18+ messages in thread
From: Debabrata Banerjee @ 2017-07-06 14:54 UTC (permalink / raw)
  To: Mel Gorman
  Cc: Roman Gushchin, linux-mm, Andrew Morton, Johannes Weiner,
	Michal Hocko, Vladimir Davydov, Rik van Riel, kernel-team,
	linux-kernel

On Thu, Jul 6, 2017 at 9:19 AM, Mel Gorman <mgorman@techsingularity.net> wrote:

> The alloc counter updates are themselves a surprisingly heavy cost to
> the allocation path and this makes it worse for a debugging case that is
> relatively rare. I'm extremely reluctant for such a patch to be added
> given that the tracepoints can be used to assemble such a monitor even
> if it means running a userspace daemon to keep track of it. Would such a
> solution be suitable? Failing that if this is a severe issue, would it be
> possible to at least make this a compile-time or static tracepoint option?
> That way, only people that really need it have to take the penalty.
>
> --
> Mel Gorman

We (Akamai) have been struggling with memory fragmentation issues for
years, and especially the inability to track positive or negative
changes to fragmentation between allocator changes and kernels without
simply looking for how many allocations are failing. We've had someone
toying with trying to report the same data via scanning all pages at
report time versus keeping running stats, although we don't have
working code yet. If it did work it would avoid the runtime overhead.
I don't believe tracepoints are a workable solution for us, since we
would have to be collecting the data from boot, as well as continually
processing the data in userspace at high cost. Ultimately the
locations and other properties (merge-ability) of the allocations in
the buddy groups are also important, which would be interesting to add
on top of Roman's patch.


* Re: [PATCH] mm: make allocation counters per-order
  2017-07-06 13:04 [PATCH] mm: make allocation counters per-order Roman Gushchin
  2017-07-06 13:19 ` Mel Gorman
@ 2017-07-06 15:02 ` Christoph Lameter
  2017-07-07  1:54 ` kbuild test robot
  2017-07-07  6:06 ` kbuild test robot
  3 siblings, 0 replies; 18+ messages in thread
From: Christoph Lameter @ 2017-07-06 15:02 UTC (permalink / raw)
  To: Roman Gushchin
  Cc: linux-mm, Andrew Morton, Mel Gorman, Johannes Weiner,
	Michal Hocko, Vladimir Davydov, Rik van Riel, kernel-team,
	linux-kernel

On Thu, 6 Jul 2017, Roman Gushchin wrote:

> +#define PGALLOC_EVENTS_SIZE (MAX_NR_ZONES * MAX_ORDER)
> +#define PGALLOC_EVENTS_CUT_SIZE (MAX_NR_ZONES * (MAX_ORDER - 1))
> +#define PGALLOC_FIRST_ZONE (PGALLOC_NORMAL - ZONE_NORMAL)


You are significantly increasing the per cpu counters (ZONES *
MAX_ORDER * cpus!!!). This will increase the cache footprint of critical
functions significantly and thus lead to regressions.
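
A back-of-the-envelope sketch, assuming MAX_NR_ZONES == 5 and
MAX_ORDER == 11 as on a common x86_64 config:

$ # zones * orders * sizeof(unsigned long), per cpu
$ echo "$((5 * 11 * 8)) bytes of pgalloc counters per cpu (was $((5 * 8)))"
440 bytes of pgalloc counters per cpu (was 40)

and that is replicated for every possible cpu.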

Typically counters for zones are placed in the zone structures but
you would also significantly increase the per zone counters ...


* Re: [PATCH] mm: make allocation counters per-order
  2017-07-06 14:46   ` Roman Gushchin
@ 2017-07-06 15:47     ` Mel Gorman
  2017-07-06 16:43       ` Roman Gushchin
                         ` (2 more replies)
  0 siblings, 3 replies; 18+ messages in thread
From: Mel Gorman @ 2017-07-06 15:47 UTC (permalink / raw)
  To: Roman Gushchin
  Cc: linux-mm, Andrew Morton, Johannes Weiner, Michal Hocko,
	Vladimir Davydov, Rik van Riel, kernel-team, linux-kernel

On Thu, Jul 06, 2017 at 03:46:34PM +0100, Roman Gushchin wrote:
> > The alloc counter updates are themselves a surprisingly heavy cost to
> > the allocation path and this makes it worse for a debugging case that is
> > relatively rare. I'm extremely reluctant for such a patch to be added
> > given that the tracepoints can be used to assemble such a monitor even
> > if it means running a userspace daemon to keep track of it. Would such a
> > solution be suitable? Failing that if this is a severe issue, would it be
> > possible to at least make this a compile-time or static tracepoint option?
> > That way, only people that really need it have to take the penalty.
> 
> I've tried to measure the difference with my patch applied and without
> any accounting at all (__count_alloc_event() redefined to an empty function),
> and I wasn't able to find any measurable difference.
> Can you please provide more details on the scenario in which the
> alloc counters were costly?
> 

At the time I used a page allocator microbenchmark from mmtests to call
the allocator directly without zeroing pages. Triggering allocations from
userspace generally masks the overhead behind the zeroing costs. It's just a few
cycles, but given that the budget for the page allocator in some circumstances
is tiny, it was noticeable. perf was used to examine the cost.

> As the new counters replace an existing one, and both are per-cpu counters,
> I believe the difference should be really small.
> 

Minimally you add a new branch and a small number of computations. It's
small but it's there. The cache footprint of the counters is also increased.
That is hard to take given that it's overhead for everybody on the off-chance
it can debug something.

It's not a strong objection and I won't nak it on this basis. But given
that the same information can be easily obtained using tracepoints
(optionally with lower overhead via systemtap), that the information is rarely
going to be useful (no latency information, for example), and that there is an
increased maintenance cost, it does not seem worth adding.

Maybe it would be slightly more convincing if there was an example of
real problems in the field that can be debugged with this. For high-order
allocations, I previously found that it was the latency that was of the
most concern and not the absolute count that happened since the system
started. Granted, the same criticism could be leveled at the existing
alloc counters but at least by correlating that value with allocstall,
you can determine what percentage of allocations stalled recently and
optionally ftrace at that point to figure out why. The same steps would
indicate then if it's only high-order allocations that stall, add stack
tracing to figure out where they are coming from and go from there. Even if
the per-order counters exist, all the other debugging steps are necessary
so I'm struggling to see how I would use them properly.
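
As an illustration (plain counters, nothing exotic), a periodic
snapshot such as:

$ grep -E '^(pgalloc_|allocstall)' /proc/vmstat

taken a few seconds apart and diffed gives the recent allocation and
stall rates, which says far more than totals accumulated since boot.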

-- 
Mel Gorman
SUSE Labs


* Re: [PATCH] mm: make allocation counters per-order
  2017-07-06 14:54   ` [PATCH] " Debabrata Banerjee
@ 2017-07-06 15:51     ` Mel Gorman
  2017-07-06 16:12       ` Debabrata Banerjee
  0 siblings, 1 reply; 18+ messages in thread
From: Mel Gorman @ 2017-07-06 15:51 UTC (permalink / raw)
  To: Debabrata Banerjee
  Cc: Roman Gushchin, linux-mm, Andrew Morton, Johannes Weiner,
	Michal Hocko, Vladimir Davydov, Rik van Riel, kernel-team,
	linux-kernel

On Thu, Jul 06, 2017 at 10:54:24AM -0400, Debabrata Banerjee wrote:
> On Thu, Jul 6, 2017 at 9:19 AM, Mel Gorman <mgorman@techsingularity.net> wrote:
> 
> > The alloc counter updates are themselves a surprisingly heavy cost to
> > the allocation path and this makes it worse for a debugging case that is
> > relatively rare. I'm extremely reluctant for such a patch to be added
> > given that the tracepoints can be used to assemble such a monitor even
> > if it means running a userspace daemon to keep track of it. Would such a
> > solution be suitable? Failing that if this is a severe issue, would it be
> > possible to at least make this a compile-time or static tracepoint option?
> > That way, only people that really need it have to take the penalty.
> >
> > --
> > Mel Gorman
> 
> We (Akamai) have been struggling with memory fragmentation issues for
> years, and especially the inability to track positive or negative
> changes to fragmentation between allocator changes and kernels without
> simply looking for how many allocations are failing. We've had someone
> toying with trying to report the same data via scanning all pages at
> report time versus keeping running stats, although we don't have
> working code yet. If it did work it would avoid the runtime overhead.
> I don't believe tracepoints are a workable solution for us, since we
> would have to be collecting the data from boot, as well as continually
> processing the data in userspace at high cost. Ultimately the
> locations and other properties (merge-ability) of the allocations in
> the buddy groups are also important, which would be interesting to add
> on top of Roman's patch.

These counters do not actually help you solve that particular problem.
Knowing how many allocations happened since the system booted doesn't tell
you much about how many failed or why they failed. You don't even know
what frequency they occurred at unless you monitor it constantly so you're
back to square one whether this information is available from proc or not.
There even is a tracepoint that can be used to track information related
to events that degrade fragmentation (trace_mm_page_alloc_extfrag) although
the primary thing it tells you is that "the probability that an allocation
will fail due to fragmentation in the future is potentially higher".
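
That event can be watched directly; a minimal example, assuming the
usual tracefs mount point:

# echo 1 > /sys/kernel/debug/tracing/events/kmem/mm_page_alloc_extfrag/enable
# cat /sys/kernel/debug/tracing/trace_pipe

Each record includes the allocation and fallback orders and
migratetypes, which says considerably more about fragmentation than a
raw allocation count would.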

-- 
Mel Gorman
SUSE Labs


* Re: [PATCH] mm: make allocation counters per-order
  2017-07-06 15:51     ` Mel Gorman
@ 2017-07-06 16:12       ` Debabrata Banerjee
  2017-07-06 16:43         ` Mel Gorman
  0 siblings, 1 reply; 18+ messages in thread
From: Debabrata Banerjee @ 2017-07-06 16:12 UTC (permalink / raw)
  To: Mel Gorman
  Cc: Roman Gushchin, linux-mm, Andrew Morton, Johannes Weiner,
	Michal Hocko, Vladimir Davydov, Rik van Riel, kernel-team,
	linux-kernel

On Thu, Jul 6, 2017 at 11:51 AM, Mel Gorman <mgorman@techsingularity.net> wrote:
>
> These counters do not actually help you solve that particular problem.
> Knowing how many allocations happened since the system booted doesn't tell
> you much about how many failed or why they failed. You don't even know
> what frequency they occurred at unless you monitor it constantly so you're
> back to square one whether this information is available from proc or not.
> There even is a tracepoint that can be used to track information related
> to events that degrade fragmentation (trace_mm_page_alloc_extfrag) although
> the primary thing it tells you is that "the probability that an allocation
> will fail due to fragmentation in the future is potentially higher".

I agree these counters don't have enough information, but they're a
start toward a first-order approximation of the current state of memory.
buddyinfo and pagetypeinfo basically show no information now, because
they only involve the small amount of free memory under the watermark
and all our machines are in this state. As a second-order approximation,
it would be nice to be able to get answers like: "There are
reclaimable high-order allocations of at least this order" and "No
allocations of this order can become available due to unmovable and
unreclaimable allocations".


* Re: [PATCH] mm: make allocation counters per-order
  2017-07-06 15:47     ` Mel Gorman
@ 2017-07-06 16:43       ` Roman Gushchin
  2017-07-06 17:16         ` Mel Gorman
  2017-07-16 13:27       ` Roman Gushchin
  2017-07-16 13:29       ` Roman Gushchin
  2 siblings, 1 reply; 18+ messages in thread
From: Roman Gushchin @ 2017-07-06 16:43 UTC (permalink / raw)
  To: Mel Gorman
  Cc: linux-mm, Andrew Morton, Johannes Weiner, Michal Hocko,
	Vladimir Davydov, Rik van Riel, kernel-team, linux-kernel

On Thu, Jul 06, 2017 at 04:47:05PM +0100, Mel Gorman wrote:
> On Thu, Jul 06, 2017 at 03:46:34PM +0100, Roman Gushchin wrote:
> > > The alloc counter updates are themselves a surprisingly heavy cost to
> > > the allocation path and this makes it worse for a debugging case that is
> > > relatively rare. I'm extremely reluctant for such a patch to be added
> > > given that the tracepoints can be used to assemble such a monitor even
> > > if it means running a userspace daemon to keep track of it. Would such a
> > > solution be suitable? Failing that if this is a severe issue, would it be
> > > possible to at least make this a compile-time or static tracepoint option?
> > > That way, only people that really need it have to take the penalty.
> > 
> > I've tried to measure the difference with my patch applied and without
> > any accounting at all (__count_alloc_event() redefined to an empty function),
> > and I wasn't able to find any measurable difference.
> > Can you please provide more details on the scenario in which the
> > alloc counters were costly?
> > 
> 
> At the time I used a page allocator microbenchmark from mmtests to call
> the allocator directly without zeroing pages. Triggering allocations from
> userspace generally mask the overhead by the zeroing costs. It's just a few
> cycles but given the budget for the page allocator in some circumstances
> is tiny, it was noticable. perf was used to examine the cost.

I'll try to measure the difference with mmtests.

I agree that it's not a feature that's worth a significant performance penalty,
but if the overhead is small even in a targeted benchmark, I'd say it's acceptable.

> > As the new counters replace an existing one, and both are per-cpu counters,
> > I believe the difference should be really small.
> > 
> 
> Minimally you add a new branch and a small number of computations. It's
> small but it's there. The cache footprint of the counters is also increased.
> That is hard to take given that it's overhead for everybody on the off-chance
> it can debug something.
> 
> It's not a strong objection and I won't nak it on this basis. But given
> that the same information can be easily obtained using tracepoints
> (optionally with lower overhead via systemtap), that the information is rarely
> going to be useful (no latency information, for example), and that there is an
> increased maintenance cost, it does not seem worth adding.

Tracepoints are good for investigations on one machine, not so convenient
if we are talking about gathering stats from the fleet with production load.
Unfortunately, some memory fragmentation issues are hard to reproduce on
a single dev machine.

> Maybe it would be slightly more convincing if there was an example of
> real problems in the field that can be debugged with this. For high-order
> allocations, I previously found that it was the latency that was of the
> most concern and not the absolute count that happened since the system
> started.

We met an issue with compaction consuming too much CPU under some specific
conditions, and one of the suspicions was a significant number of high-order
allocations, requested by some third-party device drivers.

Knowing the number of allocations is especially helpful for comparing
different kernel versions in such a case, as it's hard to distinguish changes
in mm, changes in these drivers, or just workload/environment changes
that led to increased or decreased fragmentation.

Roman


* Re: [PATCH] mm: make allocation counters per-order
  2017-07-06 16:12       ` Debabrata Banerjee
@ 2017-07-06 16:43         ` Mel Gorman
  0 siblings, 0 replies; 18+ messages in thread
From: Mel Gorman @ 2017-07-06 16:43 UTC (permalink / raw)
  To: Debabrata Banerjee
  Cc: Roman Gushchin, linux-mm, Andrew Morton, Johannes Weiner,
	Michal Hocko, Vladimir Davydov, Rik van Riel, kernel-team,
	linux-kernel

On Thu, Jul 06, 2017 at 12:12:47PM -0400, Debabrata Banerjee wrote:
> On Thu, Jul 6, 2017 at 11:51 AM, Mel Gorman <mgorman@techsingularity.net> wrote:
> >
> > These counters do not actually help you solve that particular problem.
> > Knowing how many allocations happened since the system booted doesn't tell
> > you much about how many failed or why they failed. You don't even know
> > what frequency they occurred at unless you monitor it constantly so you're
> > back to square one whether this information is available from proc or not.
> > There even is a tracepoint that can be used to track information related
> > to events that degrade fragmentation (trace_mm_page_alloc_extfrag) although
> > the primary thing it tells you is that "the probability that an allocation
> > will fail due to fragmentation in the future is potentially higher".
> 
> I agree these counters don't have enough information, but they're a
> start toward a first-order approximation of the current state of memory.

That incurs a universal cost on the off-chance of debugging and ultimately
the debugging is only useful in combination with developing kernel patches
in which case it could be behind a kconfig option.

> buddyinfo and pagetypeinfo basically show no information now, because

They can be used to calculate a fragmentation index at a given point in
time. Admittedly, building a bigger picture requires a full scan of memory
(and that's what was required when fragmentation avoidance was first
being implemented).
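
With CONFIG_DEBUG_FS enabled, both indexes are already exported per
zone and per order:

$ cat /sys/kernel/debug/extfrag/extfrag_index
$ cat /sys/kernel/debug/extfrag/unusable_index

so a point-in-time view of fragmentation is available without adding
any new counters.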

> they only involve the small amount of free memory under the watermark
> and all our machines are in this state. As a second-order approximation,
> it would be nice to be able to get answers like: "There are
> reclaimable high-order allocations of at least this order" and "No
> allocations of this order can become available due to unmovable and
> unreclaimable allocations".

Which this patch doesn't provide, as what you are looking for requires
a full scan of memory to determine. I've done it in the past using a
severe abuse of systemtap to load a module that scans all of memory with
a variation of PAGE_OWNER to identify stack traces of pages that "don't
belong" within a pageblock.

Even *with* that information, your options for tuning an unmodified kernel
are basically limited to increasing min_free_kbytes, altering THP's level
of aggression when compacting or brute forcing with either drop_caches,
compact_node or both. All other options after that require kernel patches
-- altering annotations, altering fallback mechanisms, altering compaction,
improving support for pages that can be migrated etc.

-- 
Mel Gorman
SUSE Labs


* Re: [PATCH] mm: make allocation counters per-order
  2017-07-06 16:43       ` Roman Gushchin
@ 2017-07-06 17:16         ` Mel Gorman
  2017-07-06 18:00           ` Debabrata Banerjee
  0 siblings, 1 reply; 18+ messages in thread
From: Mel Gorman @ 2017-07-06 17:16 UTC (permalink / raw)
  To: Roman Gushchin
  Cc: linux-mm, Andrew Morton, Johannes Weiner, Michal Hocko,
	Vladimir Davydov, Rik van Riel, kernel-team, linux-kernel

On Thu, Jul 06, 2017 at 05:43:04PM +0100, Roman Gushchin wrote:
> > At the time I used a page allocator microbenchmark from mmtests to call
> > the allocator directly without zeroing pages. Triggering allocations from
> > userspace generally masks the overhead behind the zeroing costs. It's just a few
> > cycles, but given that the budget for the page allocator in some circumstances
> > is tiny, it was noticeable. perf was used to examine the cost.
> 
> I'll try to measure the difference with mmtests.
> 
> I agree that it's not a feature that's worth a significant performance penalty,
> but if the overhead is small even in a targeted benchmark, I'd say it's acceptable.
> 

Note that even if you keep the cycle overhead down, the CPU cache footprint
for such a large increase remains. That will be permanent and unfixable which
is why I would like a Kconfig option at the very least for the vast majority
of people that have no intention or ability to debug such a situation.

> > > As new counters replace an old one, and both are per-cpu counters, I believe,
> > > that the difference should be really small.
> > > 
> > 
> > Minimally you add a new branch and a small number of computations. It's
> > small but it's there. The cache footprint of the counters is also increased.
> > That is hard to take given that it's overhead for everybody on the off-chance
> > it can debug something.
> > 
> > It's not a strong objection and I won't nak it on this basis. But given
> > that the same information can be easily obtained using tracepoints
> > (optionally with lower overhead via systemtap), that the information is rarely
> > going to be useful (no latency information, for example), and that there is an
> > increased maintenance cost, it does not seem worth adding.
> 
> Tracepoints are good for investigations on one machine, not so convenient
> if we are talking about gathering stats from the fleet with production load.
> Unfortunately, some memory fragmentation issues are hard to reproduce on
> a single dev machine.
> 

Sure, but just knowing that some high-order allocations occurred in the
past doesn't help either.

> > Maybe it would be slightly more convincing if there was an example of
> > real problems in the field that can be debugged with this. For high-order
> > allocations, I previously found that it was the latency that was of the
> > most concern and not the absolute count that happened since the system
> > started.
> 
> We met an issue with compaction consuming too much CPU under some specific
> conditions, and one of the suspicions was a significant number of high-order
> allocations, requested by some third-party device drivers.
> 

Even if this was the suspicion, you would have to activate monitoring on
the machine under load at the time the problem is occurring to determine if
the high-order allocations are currently happening or happened in the past.
If you are continually logging this data then logging allocation stalls for
high-order allocations would give you similar information. If you have to
activate a monitor anyway (or an agent that monitors for high CPU usage),
then it might as well be ftrace based as well as anything else. Even a
basic systemtap script would be able to capture only stack traces for
allocation requests that take longer than a threshold to limit the amount
of data recorded. Even *if* you had these counters running on your grid,
they will tell you nothing about how long those allocations are or whether
compaction is involved and that is what is key to begin debugging the issue.

A basic monitor of /proc/vmstat for the compact_* can be used an indication
of excessive time spent in compaction although you're back to ftrace to
quantify how much of a problem it is in terms of time. For example,
rapidly increasing compact_fail combined with rapidly increasing
compact_migrate_scanned and compact_free_scanned will tell you that
compaction is active and failing with a comparison of the ratio of
compact_fail to compact_success telling you if it's persistent or slow
progress. You'd need top information to see if it's the compaction daemon
that is consuming all the CPU or processes. If it's the daemon then that
points you in the direction of what potentially needs fixing. If it's
processes then there is a greater problem and ftrace needs to be used to
establish *what* is doing the high-allocation requests and whether they
can be reduced somehow or whether it's a general fragmentation problem
(in which case your day takes a turn for the worse).
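
Concretely, something as simple as:

$ while sleep 10; do date; grep '^compact_' /proc/vmstat; done

run on an affected machine over the problem window captures compaction
activity and success/failure rates over time with no kernel changes at
all.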

What I'm trying to say is that in themselves, an high-order allocation
count doesn't help debug this class of problem as much as you'd think.
Hopefully the above information is more useful to you in helping debug
what's actually wrong.

> Knowing the number of allocations is especially helpful for comparing
> different kernel versions in such a case, as it's hard to distinguish changes
> in mm, changes in these drivers, or just workload/environment changes
> that led to increased or decreased fragmentation.
> 

I'm still struggling to see how counters help when an agent that monitors
for high CPU usage could be activated to capture tracing and see whether it's
allocation and compaction stalls that are contributing to the overall load
or "something else".

-- 
Mel Gorman
SUSE Labs


* Re: [PATCH] mm: make allocation counters per-order
  2017-07-06 17:16         ` Mel Gorman
@ 2017-07-06 18:00           ` Debabrata Banerjee
  2017-07-06 20:02             ` Mel Gorman
  0 siblings, 1 reply; 18+ messages in thread
From: Debabrata Banerjee @ 2017-07-06 18:00 UTC (permalink / raw)
  To: Mel Gorman
  Cc: Roman Gushchin, linux-mm, Andrew Morton, Johannes Weiner,
	Michal Hocko, Vladimir Davydov, Rik van Riel, kernel-team,
	linux-kernel

On Thu, Jul 6, 2017 at 1:16 PM, Mel Gorman <mgorman@techsingularity.net> wrote:
>
> I'm still struggling to see how counters help when an agent that monitors
> for high CPU usage could be activated
>

I suspect Roman has the same problem set as us: the CPU usage is
either always high, or high and service-critical precisely when something
interesting is happening.
and study the results statistically and with respect to time based on
kernel versions, build configs, hardware types, process types, load
patterns, etc, etc. Even finding good candidate machines and at the
right time of day to manually debug with ftrace is problematic.
Granted we could be utilizing existing counters like compact_fail
better. Ultimately the data either leads to dealing with certain bad
actors, different vm tunings, or patches to mm.


* Re: [PATCH] mm: make allocation counters per-order
  2017-07-06 18:00           ` Debabrata Banerjee
@ 2017-07-06 20:02             ` Mel Gorman
  0 siblings, 0 replies; 18+ messages in thread
From: Mel Gorman @ 2017-07-06 20:02 UTC (permalink / raw)
  To: Debabrata Banerjee
  Cc: Roman Gushchin, linux-mm, Andrew Morton, Johannes Weiner,
	Michal Hocko, Vladimir Davydov, Rik van Riel, kernel-team,
	linux-kernel

On Thu, Jul 06, 2017 at 02:00:00PM -0400, Debabrata Banerjee wrote:
> On Thu, Jul 6, 2017 at 1:16 PM, Mel Gorman <mgorman@techsingularity.net> wrote:
> >
> > I'm still struggling to see how counters help when an agent that monitors
> > for high CPU usage could be activated
> >
> 
> I suspect Roman has the same problem set as us: the CPU usage is
> either always high, or high and service-critical precisely when something
> interesting is happening.
> and study the results statistically and with respect to time based on
> kernel versions, build configs, hardware types, process types, load
> patterns, etc, etc. Even finding good candidate machines and at the
> right time of day to manually debug with ftrace is problematic.
> Granted we could be utilizing existing counters like compact_fail
> better. Ultimately the data either leads to dealing with certain bad
> actors, different vm tunings, or patches to mm.

Same issue as described in the other mail. The number of high-order
allocations that happened in the past or even the recent past does not
give you useful information for debugging high-order allocation stalls
or fragmentation-related issues. If the high-order allocations are
steady then two machines running similar workloads can both have similar
allocation counts but only one of them may be experiencing high latency.
Similarly, with high CPU usage, it may be due to compaction or a whole
variety of other factors. Even doing a statistical analysis is not going
to be enough unless all the relevant variables are accounted for and the
raw allocation count in isolation is one of the weakest variables to
draw conclusions from.

Correlating allocstall with compaction activity from just /proc/vmstat gives
a much better hint as to whether high CPU activity is due to high-order
allocations. Combining it with top will indicate whether it's direct or
indirect costs. If it really is high-order allocations then ftrace to
identify the source of the high-order allocations becomes relevant and if
it's due to fragmentation, it's a case of tracing the allocator itself to
determine why the fragmentation occurred.

The proc file with allocation counts is such a tiny part of debugging this
class of problem that it's almost irrelevant which is why minimally I think
it should be behind Kconfig at absolute minimum. If you want to activate
it across production machines then by all means go ahead and if so, I'd
be very interested in hearing what class of problem could be debugged and
either tuned or fixed without needing ftrace to gather more information. I
say "almost irrelevant" because technically, correlating high allocation
counts with a kernel version change may be a relevant factor if a kernel
introduced a new source of high-order allocations but I suspect that's
the exception. It would be much more interesting to correlate increased
latency with a kernel version because it's much more relevant. You may
be able to correlate high allocation counts with particular hardware
(particularly network hardware that cannot scatter/gather) *but* the
same proc file will not tell you if those increased requests are
actually a problem so the usefulness is diminished.

I'm not saying that fragmentation and high-order allocation stalls are not a
problem because they can be, but the proc file is unlikely to help and even
an extremely basic systemtap script would give you the same information,
work on much older kernels and with a trivial amount of additional work
it can gather latency information as well as counts.

-- 
Mel Gorman
SUSE Labs


* Re: [PATCH] mm: make allocation counters per-order
  2017-07-06 13:04 [PATCH] mm: make allocation counters per-order Roman Gushchin
  2017-07-06 13:19 ` Mel Gorman
  2017-07-06 15:02 ` Christoph Lameter
@ 2017-07-07  1:54 ` kbuild test robot
  2017-07-07  6:06 ` kbuild test robot
  3 siblings, 0 replies; 18+ messages in thread
From: kbuild test robot @ 2017-07-07  1:54 UTC (permalink / raw)
  To: Roman Gushchin
  Cc: kbuild-all, linux-mm, Andrew Morton, Mel Gorman, Johannes Weiner,
	Michal Hocko, Vladimir Davydov, Rik van Riel, kernel-team,
	linux-kernel

[-- Attachment #1: Type: text/plain, Size: 6134 bytes --]

Hi Roman,

[auto build test ERROR on mmotm/master]
[also build test ERROR on next-20170706]
[cannot apply to v4.12]
[if your patch is applied to the wrong git tree, please drop us a note to help improve the system]

url:    https://github.com/0day-ci/linux/commits/Roman-Gushchin/mm-make-allocation-counters-per-order/20170707-091630
base:   git://git.cmpxchg.org/linux-mmotm.git master
config: x86_64-randconfig-x019-201727 (attached as .config)
compiler: gcc-6 (Debian 6.2.0-3) 6.2.0 20160901
reproduce:
        # save the attached .config to linux build tree
        make ARCH=x86_64 

All errors (new ones prefixed by >>):

   mm/vmstat.c: In function 'allocinfo_show':
>> mm/vmstat.c:1513:2: error: implicit declaration of function 'sum_vm_events' [-Werror=implicit-function-declaration]
     sum_vm_events(allocs, PGALLOC_FIRST_ZONE, PGALLOC_EVENTS_SIZE);
     ^~~~~~~~~~~~~
   At top level:
   mm/vmstat.c:1491:13: warning: 'sum_alloc_events' defined but not used [-Wunused-function]
    static void sum_alloc_events(unsigned long *v)
                ^~~~~~~~~~~~~~~~
   Cyclomatic Complexity 5 include/linux/compiler.h:__read_once_size
   Cyclomatic Complexity 1 arch/x86/include/asm/bitops.h:fls64
   Cyclomatic Complexity 1 include/linux/log2.h:__ilog2_u64
   Cyclomatic Complexity 1 include/asm-generic/getorder.h:__get_order
   Cyclomatic Complexity 2 arch/x86/include/asm/jump_label.h:arch_static_branch
   Cyclomatic Complexity 1 arch/x86/include/asm/atomic64_64.h:atomic64_read
   Cyclomatic Complexity 1 include/asm-generic/atomic-long.h:atomic_long_read
   Cyclomatic Complexity 1 include/linux/math64.h:div_u64_rem
   Cyclomatic Complexity 1 include/linux/math64.h:div_u64
   Cyclomatic Complexity 1 include/linux/err.h:ERR_PTR
   Cyclomatic Complexity 1 include/linux/spinlock.h:spinlock_check
   Cyclomatic Complexity 1 include/linux/spinlock.h:spin_unlock_irqrestore
   Cyclomatic Complexity 1 include/linux/nodemask.h:node_state
   Cyclomatic Complexity 1 include/linux/mmzone.h:zone_end_pfn
   Cyclomatic Complexity 1 include/linux/mmzone.h:populated_zone
   Cyclomatic Complexity 2 include/linux/mmzone.h:__nr_to_section
   Cyclomatic Complexity 1 include/linux/mmzone.h:__section_mem_map_addr
   Cyclomatic Complexity 3 include/linux/mmzone.h:valid_section
   Cyclomatic Complexity 1 include/linux/mmzone.h:__pfn_to_section
   Cyclomatic Complexity 2 include/linux/mmzone.h:pfn_valid
   Cyclomatic Complexity 1 include/linux/mmzone.h:memmap_valid_within
   Cyclomatic Complexity 1 include/linux/mm.h:page_zonenum
   Cyclomatic Complexity 1 include/linux/mm.h:page_zone
   Cyclomatic Complexity 1 include/linux/mm.h:page_to_section
   Cyclomatic Complexity 1 include/linux/vmstat.h:global_page_state
   Cyclomatic Complexity 1 include/linux/vmstat.h:global_node_page_state
   Cyclomatic Complexity 1 include/linux/vmstat.h:zone_page_state
   Cyclomatic Complexity 28 include/linux/slab.h:kmalloc_index
   Cyclomatic Complexity 68 include/linux/slab.h:kmalloc_large
   Cyclomatic Complexity 5 include/linux/slab.h:kmalloc
   Cyclomatic Complexity 1 include/linux/cpu.h:cpus_read_lock
   Cyclomatic Complexity 1 include/linux/cpu.h:cpus_read_unlock
   Cyclomatic Complexity 1 include/linux/cpu.h:get_online_cpus
   Cyclomatic Complexity 1 include/linux/cpu.h:put_online_cpus
   Cyclomatic Complexity 1 include/linux/proc_fs.h:proc_create
   Cyclomatic Complexity 3 mm/vmstat.c:fill_contig_page_info
   Cyclomatic Complexity 3 mm/vmstat.c:__fragmentation_index
   Cyclomatic Complexity 1 mm/vmstat.c:frag_stop
   Cyclomatic Complexity 6 mm/vmstat.c:walk_zones_in_node
   Cyclomatic Complexity 1 mm/vmstat.c:frag_show
   Cyclomatic Complexity 3 mm/vmstat.c:is_zone_first_populated
   Cyclomatic Complexity 1 mm/vmstat.c:zoneinfo_show
   Cyclomatic Complexity 2 mm/vmstat.c:allocinfo_start
   Cyclomatic Complexity 1 mm/vmstat.c:allocinfo_next
   Cyclomatic Complexity 1 mm/vmstat.c:allocinfo_stop
   Cyclomatic Complexity 3 mm/vmstat.c:vmstat_next
   Cyclomatic Complexity 2 mm/vmstat.c:unusable_free_index
   Cyclomatic Complexity 2 mm/vmstat.c:unusable_show
   Cyclomatic Complexity 1 mm/vmstat.c:extfrag_show
   Cyclomatic Complexity 1 mm/vmstat.c:zoneinfo_open
   Cyclomatic Complexity 1 mm/vmstat.c:vmstat_open
   Cyclomatic Complexity 1 mm/vmstat.c:pagetypeinfo_open
   Cyclomatic Complexity 1 mm/vmstat.c:allocinfo_open
   Cyclomatic Complexity 1 mm/vmstat.c:fragmentation_open
   Cyclomatic Complexity 1 mm/vmstat.c:extfrag_open
   Cyclomatic Complexity 1 mm/vmstat.c:unusable_open
   Cyclomatic Complexity 7 mm/vmstat.c:zoneinfo_show_print
   Cyclomatic Complexity 2 mm/vmstat.c:pagetypeinfo_showfree
   Cyclomatic Complexity 4 mm/vmstat.c:pagetypeinfo_showfree_print
   Cyclomatic Complexity 2 mm/vmstat.c:pagetypeinfo_showblockcount
   Cyclomatic Complexity 2 mm/vmstat.c:frag_show_print
   Cyclomatic Complexity 2 mm/vmstat.c:extfrag_show_print
   Cyclomatic Complexity 2 mm/vmstat.c:unusable_show_print
   Cyclomatic Complexity 1 mm/vmstat.c:frag_next
   Cyclomatic Complexity 2 mm/vmstat.c:frag_start
   Cyclomatic Complexity 4 mm/vmstat.c:vmstat_show
   Cyclomatic Complexity 1 mm/vmstat.c:vmstat_stop
   Cyclomatic Complexity 5 mm/vmstat.c:vmstat_start
   Cyclomatic Complexity 8 mm/vmstat.c:pagetypeinfo_showblockcount_print
   Cyclomatic Complexity 3 mm/vmstat.c:pagetypeinfo_showmixedcount
   Cyclomatic Complexity 2 mm/vmstat.c:pagetypeinfo_show
   Cyclomatic Complexity 4 mm/vmstat.c:allocinfo_show
   Cyclomatic Complexity 4 mm/vmstat.c:extfrag_debug_init
   Cyclomatic Complexity 1 mm/vmstat.c:fragmentation_index
   Cyclomatic Complexity 1 mm/vmstat.c:init_mm_internals
   cc1: some warnings being treated as errors

vim +/sum_vm_events +1513 mm/vmstat.c

  1507		int zid;
  1508	
  1509		if (arg != SEQ_START_TOKEN)
  1510			return 0;
  1511	
  1512		get_online_cpus();
> 1513		sum_vm_events(allocs, PGALLOC_FIRST_ZONE, PGALLOC_EVENTS_SIZE);
  1514		put_online_cpus();
  1515	
  1516		for (zid = 0; zid < MAX_NR_ZONES; ++zid) {

---
0-DAY kernel test infrastructure                Open Source Technology Center
https://lists.01.org/pipermail/kbuild-all                   Intel Corporation

[-- Attachment #2: .config.gz --]
[-- Type: application/gzip, Size: 29631 bytes --]

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH] mm: make allocation counters per-order
  2017-07-06 13:04 [PATCH] mm: make allocation counters per-order Roman Gushchin
                   ` (2 preceding siblings ...)
  2017-07-07  1:54 ` kbuild test robot
@ 2017-07-07  6:06 ` kbuild test robot
  3 siblings, 0 replies; 18+ messages in thread
From: kbuild test robot @ 2017-07-07  6:06 UTC (permalink / raw)
  To: Roman Gushchin
  Cc: kbuild-all, linux-mm, Andrew Morton, Mel Gorman, Johannes Weiner,
	Michal Hocko, Vladimir Davydov, Rik van Riel, kernel-team,
	linux-kernel

[-- Attachment #1: Type: text/plain, Size: 3271 bytes --]

Hi Roman,

[auto build test WARNING on mmotm/master]
[also build test WARNING on next-20170706]
[cannot apply to v4.12]
[if your patch is applied to the wrong git tree, please drop us a note to help improve the system]

url:    https://github.com/0day-ci/linux/commits/Roman-Gushchin/mm-make-allocation-counters-per-order/20170707-091630
base:   git://git.cmpxchg.org/linux-mmotm.git master
config: s390-default_defconfig (attached as .config)
compiler: s390x-linux-gnu-gcc (Debian 6.1.1-9) 6.1.1 20160705
reproduce:
        wget https://raw.githubusercontent.com/01org/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross
        chmod +x ~/bin/make.cross
        # save the attached .config to linux build tree
        make.cross ARCH=s390 

All warnings (new ones prefixed by >>):

   mm/memcontrol.c: In function 'memory_stat_show':
>> mm/memcontrol.c:5262:1: warning: the frame size of 1136 bytes is larger than 1024 bytes [-Wframe-larger-than=]
    }
    ^

vim +5262 mm/memcontrol.c

f10eb7534 Roman Gushchin  2017-06-30  5246  		   events[PGSCAN_DIRECT]);
f10eb7534 Roman Gushchin  2017-06-30  5247  	seq_printf(m, "pgsteal %lu\n", events[PGSTEAL_KSWAPD] +
f10eb7534 Roman Gushchin  2017-06-30  5248  		   events[PGSTEAL_DIRECT]);
f10eb7534 Roman Gushchin  2017-06-30  5249  	seq_printf(m, "pgactivate %lu\n", events[PGACTIVATE]);
f10eb7534 Roman Gushchin  2017-06-30  5250  	seq_printf(m, "pgdeactivate %lu\n", events[PGDEACTIVATE]);
f10eb7534 Roman Gushchin  2017-06-30  5251  	seq_printf(m, "pglazyfree %lu\n", events[PGLAZYFREE]);
f10eb7534 Roman Gushchin  2017-06-30  5252  	seq_printf(m, "pglazyfreed %lu\n", events[PGLAZYFREED]);
f10eb7534 Roman Gushchin  2017-06-30  5253  
2a2e48854 Johannes Weiner 2017-05-03  5254  	seq_printf(m, "workingset_refault %lu\n",
71cd31135 Johannes Weiner 2017-05-03  5255  		   stat[WORKINGSET_REFAULT]);
2a2e48854 Johannes Weiner 2017-05-03  5256  	seq_printf(m, "workingset_activate %lu\n",
71cd31135 Johannes Weiner 2017-05-03  5257  		   stat[WORKINGSET_ACTIVATE]);
2a2e48854 Johannes Weiner 2017-05-03  5258  	seq_printf(m, "workingset_nodereclaim %lu\n",
71cd31135 Johannes Weiner 2017-05-03  5259  		   stat[WORKINGSET_NODERECLAIM]);
2a2e48854 Johannes Weiner 2017-05-03  5260  
587d9f726 Johannes Weiner 2016-01-20  5261  	return 0;
587d9f726 Johannes Weiner 2016-01-20 @5262  }
587d9f726 Johannes Weiner 2016-01-20  5263  
241994ed8 Johannes Weiner 2015-02-11  5264  static struct cftype memory_files[] = {
241994ed8 Johannes Weiner 2015-02-11  5265  	{
241994ed8 Johannes Weiner 2015-02-11  5266  		.name = "current",
f5fc3c5d8 Johannes Weiner 2015-11-05  5267  		.flags = CFTYPE_NOT_ON_ROOT,
241994ed8 Johannes Weiner 2015-02-11  5268  		.read_u64 = memory_current_read,
241994ed8 Johannes Weiner 2015-02-11  5269  	},
241994ed8 Johannes Weiner 2015-02-11  5270  	{

:::::: The code at line 5262 was first introduced by commit
:::::: 587d9f726aaec52157e4156e50363dbe6cb82bdb mm: memcontrol: basic memory statistics in cgroup2 memory controller

:::::: TO: Johannes Weiner <hannes@cmpxchg.org>
:::::: CC: Linus Torvalds <torvalds@linux-foundation.org>

---
0-DAY kernel test infrastructure                Open Source Technology Center
https://lists.01.org/pipermail/kbuild-all                   Intel Corporation

[-- Attachment #2: .config.gz --]
[-- Type: application/gzip, Size: 17324 bytes --]

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH] mm: make allocation counters per-order
  2017-07-06 15:47     ` Mel Gorman
  2017-07-06 16:43       ` Roman Gushchin
@ 2017-07-16 13:27       ` Roman Gushchin
  2017-07-16 13:29       ` Roman Gushchin
  2 siblings, 0 replies; 18+ messages in thread
From: Roman Gushchin @ 2017-07-16 13:27 UTC (permalink / raw)
  To: Mel Gorman
  Cc: linux-mm, Andrew Morton, Johannes Weiner, Michal Hocko,
	Vladimir Davydov, Rik van Riel, kernel-team, linux-kernel

On Thu, Jul 06, 2017 at 04:47:05PM +0100, Mel Gorman wrote:
> On Thu, Jul 06, 2017 at 03:46:34PM +0100, Roman Gushchin wrote:
> > > The alloc counter updates are themselves a surprisingly heavy cost to
> > > the allocation path and this makes it worse for a debugging case that is
> > > relatively rare. I'm extremely reluctant for such a patch to be added
> > > given that the tracepoints can be used to assemble such a monitor even
> > > if it means running a userspace daemon to keep track of it. Would such a
> > > solution be suitable? Failing that, if this is a severe issue, would it be
> > > possible to at least make this a compile-time or static tracepoint option?
> > > That way, only people that really need it have to take the penalty.
> > 
> > I've tried to measure the difference with my patch applied and without
> > any accounting at all (__count_alloc_event() redefined to an empty function),
> > and I wasn't able to find any measurable difference.
> > Can you please provide more details on what your scenario looked like
> > when the alloc counters were costly?
> > 
> 
> At the time I used a page allocator microbenchmark from mmtests to call
> the allocator directly without zeroing pages. Triggering allocations from
> userspace generally masks the overhead behind the zeroing costs. It's just a few
> cycles but given the budget for the page allocator in some circumstances
> is tiny, it was noticeable. perf was used to examine the cost.
> 
> > As the new counters replace an old one, and both are per-cpu counters,
> > I believe the difference should be really small.
> > 
> 
> Minimally you add a new branch and a small number of computations. It's
> small but it's there. The cache footprint of the counters is also increased.
> That is hard to take given that it's overhead for everybody on the off-chance
> it can debug something.
> 
> It's not a strong objection and I won't nak it on this basis. But given
> that the same information can easily be obtained using tracepoints
> (optionally with lower overhead via systemtap), that the information is
> rarely going to be useful (no latency information, for example), and
> that there is an increased maintenance cost, it does not seem worth it.
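
For reference, the tracepoint-based monitor suggested above can be
assembled from the existing kmem:mm_page_alloc event. A minimal sketch,
assuming tracefs is mounted at /sys/kernel/debug/tracing:

        # count allocations per order over a 10s window
        cd /sys/kernel/debug/tracing
        echo 1 > events/kmem/mm_page_alloc/enable
        sleep 10
        echo 0 > events/kmem/mm_page_alloc/enable
        # the event prints an order= field, so per-order counts fall out of:
        grep -o 'order=[0-9]*' trace | sort | uniq -c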

I ran the page allocator microbenchmark from mmtests on my hardware,
on vanilla 4.12 and on 4.12 plus the change.

Here are the results (vanilla 4.12 in the left column, patched 4.12 in the right):
order  0 batch      1 alloc 1942 free 1041     order  0 batch      1 alloc 822 free 451
order  0 batch      1 alloc 596 free 306       order  0 batch      1 alloc 493 free 290
order  0 batch      1 alloc 417 free 245       order  0 batch      1 alloc 526 free 286
order  0 batch      1 alloc 419 free 243       order  0 batch      1 alloc 435 free 255
order  0 batch      1 alloc 411 free 243       order  0 batch      1 alloc 423 free 240
order  0 batch      1 alloc 417 free 241       order  0 batch      1 alloc 406 free 239
order  0 batch      1 alloc 376 free 225       order  0 batch      1 alloc 383 free 219
order  0 batch      1 alloc 416 free 222       order  0 batch      1 alloc 355 free 205
order  0 batch      1 alloc 312 free 183       order  0 batch      1 alloc 438 free 216
order  0 batch      1 alloc 315 free 181       order  0 batch      1 alloc 347 free 194
order  0 batch      1 alloc 305 free 181       order  0 batch      1 alloc 317 free 185
order  0 batch      1 alloc 307 free 179       order  0 batch      1 alloc 329 free 191
order  0 batch      1 alloc 308 free 178       order  0 batch      1 alloc 335 free 192
order  0 batch      1 alloc 314 free 180       order  0 batch      1 alloc 350 free 190
order  0 batch      1 alloc 301 free 180       order  0 batch      1 alloc 319 free 184
order  0 batch      1 alloc 1807 free 1002     order  0 batch      1 alloc 813 free 459
order  0 batch      1 alloc 633 free 302       order  0 batch      1 alloc 500 free 287
order  0 batch      1 alloc 331 free 194       order  0 batch      1 alloc 609 free 300
order  0 batch      1 alloc 332 free 194       order  0 batch      1 alloc 443 free 255
order  0 batch      1 alloc 330 free 194       order  0 batch      1 alloc 410 free 239
order  0 batch      1 alloc 331 free 194       order  0 batch      1 alloc 383 free 222
order  0 batch      1 alloc 386 free 214       order  0 batch      1 alloc 372 free 212
order  0 batch      1 alloc 370 free 212       order  0 batch      1 alloc 342 free 203
order  0 batch      1 alloc 360 free 208       order  0 batch      1 alloc 428 free 216
order  0 batch      1 alloc 324 free 186       order  0 batch      1 alloc 350 free 195
order  0 batch      1 alloc 298 free 179       order  0 batch      1 alloc 320 free 186
order  0 batch      1 alloc 293 free 173       order  0 batch      1 alloc 323 free 187
order  0 batch      1 alloc 296 free 173       order  0 batch      1 alloc 320 free 188
order  0 batch      1 alloc 294 free 173       order  0 batch      1 alloc 321 free 186
order  0 batch      1 alloc 312 free 174       order  0 batch      1 alloc 320 free 189
order  0 batch      1 alloc 1927 free 1042     order  0 batch      1 alloc 2016 free 10
order  0 batch      1 alloc 856 free 522       order  0 batch      1 alloc 1805 free 10
order  0 batch      1 alloc 372 free 225       order  0 batch      1 alloc 1485 free 73
order  0 batch      1 alloc 375 free 224       order  0 batch      1 alloc 732 free 419
order  0 batch      1 alloc 419 free 234       order  0 batch      1 alloc 576 free 327
order  0 batch      1 alloc 389 free 233       order  0 batch      1 alloc 488 free 280
order  0 batch      1 alloc 376 free 223       order  0 batch      1 alloc 390 free 233
order  0 batch      1 alloc 331 free 196       order  0 batch      1 alloc 409 free 227
order  0 batch      1 alloc 328 free 191       order  0 batch      1 alloc 338 free 198
order  0 batch      1 alloc 307 free 182       order  0 batch      1 alloc 320 free 186
order  0 batch      1 alloc 307 free 183       order  0 batch      1 alloc 322 free 187
order  0 batch      1 alloc 304 free 183       order  0 batch      1 alloc 296 free 173
order  0 batch      1 alloc 303 free 183       order  0 batch      1 alloc 297 free 173
order  0 batch      1 alloc 303 free 176       order  0 batch      1 alloc 296 free 173
order  0 batch      1 alloc 294 free 176       order  0 batch      1 alloc 296 free 173
order  0 batch      1 alloc 716 free 433       order  0 batch      1 alloc 725 free 421
order  0 batch      1 alloc 487 free 286       order  0 batch      1 alloc 485 free 287
order  0 batch      1 alloc 499 free 296       order  0 batch      1 alloc 627 free 300
order  0 batch      1 alloc 437 free 253       order  0 batch      1 alloc 406 free 238
order  0 batch      1 alloc 412 free 242       order  0 batch      1 alloc 390 free 226
order  0 batch      1 alloc 391 free 228       order  0 batch      1 alloc 364 free 218
order  0 batch      1 alloc 379 free 220       order  0 batch      1 alloc 353 free 213
order  0 batch      1 alloc 349 free 210       order  0 batch      1 alloc 334 free 200
order  0 batch      1 alloc 350 free 207       order  0 batch      1 alloc 328 free 194
order  0 batch      1 alloc 402 free 229       order  0 batch      1 alloc 406 free 202
order  0 batch      1 alloc 329 free 188       order  0 batch      1 alloc 333 free 188
order  0 batch      1 alloc 310 free 182       order  0 batch      1 alloc 318 free 188
order  0 batch      1 alloc 307 free 180       order  0 batch      1 alloc 319 free 186
order  0 batch      1 alloc 304 free 180       order  0 batch      1 alloc 320 free 183
order  0 batch      1 alloc 307 free 180       order  0 batch      1 alloc 317 free 185
order  0 batch      1 alloc 827 free 479       order  0 batch      1 alloc 667 free 375
order  0 batch      1 alloc 389 free 228       order  0 batch      1 alloc 479 free 276
order  0 batch      1 alloc 509 free 256       order  0 batch      1 alloc 599 free 294
order  0 batch      1 alloc 338 free 204       order  0 batch      1 alloc 412 free 243
order  0 batch      1 alloc 331 free 194       order  0 batch      1 alloc 376 free 227
order  0 batch      1 alloc 318 free 189       order  0 batch      1 alloc 363 free 211
order  0 batch      1 alloc 305 free 180       order  0 batch      1 alloc 343 free 204
order  0 batch      1 alloc 307 free 191       order  0 batch      1 alloc 332 free 200
order  0 batch      1 alloc 304 free 180       order  0 batch      1 alloc 415 free 206
order  0 batch      1 alloc 351 free 195       order  0 batch      1 alloc 328 free 199
order  0 batch      1 alloc 351 free 193       order  0 batch      1 alloc 321 free 185
order  0 batch      1 alloc 315 free 184       order  0 batch      1 alloc 318 free 185
order  0 batch      1 alloc 317 free 194       order  0 batch      1 alloc 319 free 193
order  0 batch      1 alloc 298 free 179       order  0 batch      1 alloc 323 free 187
order  0 batch      1 alloc 293 free 175       order  0 batch      1 alloc 324 free 184

TBH, I can't see any meaningful difference here, though it may of
course depend on the hardware. As most allocations are order 0, and
none of the nearby counters are particularly hot, the increase in
cache footprint should not matter much.

Anyway, I've added a config option.
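
The v2 build output below refers to the option as
CONFIG_PER_ORDER_ALLOC_COUNTERS. A minimal sketch of how it can gate
__count_alloc_event() (the exact signature here is an assumption, and
the fallback keeps the old "2^order order-0 events" semantics):

    #ifdef CONFIG_PER_ORDER_ALLOC_COUNTERS
    /* one counter per (zone, order) pair, laid out in order-major stripes */
    static inline void __count_alloc_event(enum zone_type zid, unsigned int order)
    {
            count_vm_event(PGALLOC_FIRST_ZONE + zid + order * MAX_NR_ZONES);
    }
    #else
    /* legacy behaviour: fold a high-order alloc into 2^order order-0 events */
    static inline void __count_alloc_event(enum zone_type zid, unsigned int order)
    {
            count_vm_events(PGALLOC_FIRST_ZONE + zid, 1UL << order);
    }
    #endif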

Thanks!

Roman

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: email@kvack.org

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH] mm: make allocation counters per-order
  2017-07-06 15:47     ` Mel Gorman
  2017-07-06 16:43       ` Roman Gushchin
  2017-07-16 13:27       ` Roman Gushchin
@ 2017-07-16 13:29       ` Roman Gushchin
  2017-07-16 13:55         ` [v2] " kbuild test robot
  2 siblings, 1 reply; 18+ messages in thread
From: Roman Gushchin @ 2017-07-16 13:29 UTC (permalink / raw)
  To: Mel Gorman
  Cc: linux-mm, Andrew Morton, Johannes Weiner, Michal Hocko,
	Vladimir Davydov, Rik van Riel, kernel-team, linux-kernel



^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [v2] mm: make allocation counters per-order
  2017-07-16 13:29       ` Roman Gushchin
@ 2017-07-16 13:55         ` kbuild test robot
  0 siblings, 0 replies; 18+ messages in thread
From: kbuild test robot @ 2017-07-16 13:55 UTC (permalink / raw)
  To: Roman Gushchin
  Cc: kbuild-all, Mel Gorman, linux-mm, Andrew Morton, Johannes Weiner,
	Michal Hocko, Vladimir Davydov, Rik van Riel, kernel-team,
	linux-kernel

[-- Attachment #1: Type: text/plain, Size: 1883 bytes --]

Hi Roman,

[auto build test WARNING on mmotm/master]
[also build test WARNING on v4.13-rc1 next-20170714]
[if your patch is applied to the wrong git tree, please drop us a note to help improve the system]

url:    https://github.com/0day-ci/linux/commits/Roman-Gushchin/mm-make-allocation-counters-per-order/20170716-213429
base:   git://git.cmpxchg.org/linux-mmotm.git master
config: s390-allyesconfig (attached as .config)
compiler: s390x-linux-gnu-gcc (Debian 6.1.1-9) 6.1.1 20160705
reproduce:
        wget https://raw.githubusercontent.com/01org/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross
        chmod +x ~/bin/make.cross
        # save the attached .config to linux build tree
        make.cross ARCH=s390 

All warnings (new ones prefixed by >>):

>> arch/s390//appldata/appldata_mem.c:70:15: warning: return type defaults to 'int' [-Wreturn-type]
    static inline sum_pgalloc_events(u64 *pgalloc, unsigned long *ev)
                  ^~~~~~~~~~~~~~~~~~
   arch/s390//appldata/appldata_mem.c: In function 'sum_pgalloc_events':
>> arch/s390//appldata/appldata_mem.c:78:1: warning: control reaches end of non-void function [-Wreturn-type]
    }
    ^

vim +/int +70 arch/s390//appldata/appldata_mem.c

    68	
    69	#ifdef CONFIG_PER_ORDER_ALLOC_COUNTERS
  > 70	static inline sum_pgalloc_events(u64 *pgalloc, unsigned long *ev)
    71	{
    72		int order;
    73	
    74		for (order = 1; order < MAX_ORDER; ++order) {
    75			pgalloc += ev[PGALLOC_NORMAL + order * MAX_NR_ZONES] << order;
    76			pgalloc += ev[PGALLOC_DMA + order * MAX_NR_ZONES] << order;
    77		}
  > 78	}
    79	#else
    80	static inline sum_pgalloc_events(u64 *pgalloc, unsigned long *ev)
    81	{
    82	}
    83	#endif
    84	
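
For reference, both warnings point at the missing return type, which
should be void. Note also that, as quoted, "pgalloc += ..." advances the
u64 pointer itself rather than accumulating into the value it points to.
A corrected sketch of both variants:

    #ifdef CONFIG_PER_ORDER_ALLOC_COUNTERS
    /* Sketch: fold per-order allocation events back into page counts;
     * note the dereference, so we add to *pgalloc, not to the pointer. */
    static inline void sum_pgalloc_events(u64 *pgalloc, unsigned long *ev)
    {
            int order;

            for (order = 1; order < MAX_ORDER; ++order) {
                    *pgalloc += ev[PGALLOC_NORMAL + order * MAX_NR_ZONES] << order;
                    *pgalloc += ev[PGALLOC_DMA + order * MAX_NR_ZONES] << order;
            }
    }
    #else
    static inline void sum_pgalloc_events(u64 *pgalloc, unsigned long *ev)
    {
    }
    #endif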

---
0-DAY kernel test infrastructure                Open Source Technology Center
https://lists.01.org/pipermail/kbuild-all                   Intel Corporation

[-- Attachment #2: .config.gz --]
[-- Type: application/gzip, Size: 47366 bytes --]

^ permalink raw reply	[flat|nested] 18+ messages in thread

end of thread, other threads:[~2017-07-16 13:56 UTC | newest]

Thread overview: 18+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2017-07-06 13:04 [PATCH] mm: make allocation counters per-order Roman Gushchin
2017-07-06 13:19 ` Mel Gorman
2017-07-06 14:46   ` Roman Gushchin
2017-07-06 15:47     ` Mel Gorman
2017-07-06 16:43       ` Roman Gushchin
2017-07-06 17:16         ` Mel Gorman
2017-07-06 18:00           ` Debabrata Banerjee
2017-07-06 20:02             ` Mel Gorman
2017-07-16 13:27       ` Roman Gushchin
2017-07-16 13:29       ` Roman Gushchin
2017-07-16 13:55         ` [v2] " kbuild test robot
2017-07-06 14:54   ` [PATCH] " Debabrata Banerjee
2017-07-06 15:51     ` Mel Gorman
2017-07-06 16:12       ` Debabrata Banerjee
2017-07-06 16:43         ` Mel Gorman
2017-07-06 15:02 ` Christoph Lameter
2017-07-07  1:54 ` kbuild test robot
2017-07-07  6:06 ` kbuild test robot

This is a public inbox; see mirroring instructions for how to clone
and mirror all data and code used for this inbox, as well as URLs
for NNTP newsgroup(s).