From mboxrd@z Thu Jan 1 00:00:00 1970 From: Leon Romanovsky Subject: Re: [PATCH] mm: page_alloc: Fix setting of ZONE_FAIR_DEPLETED on UP v2 Date: Tue, 9 Sep 2014 11:17:31 +0300 Message-ID: References: <1404893588-21371-1-git-send-email-mgorman@suse.de> <1404893588-21371-7-git-send-email-mgorman@suse.de> <53E4EC53.1050904@suse.cz> <20140811121241.GD7970@suse.de> <53E8B83D.1070004@suse.cz> <20140902140116.GD29501@cmpxchg.org> <20140905101451.GF17501@suse.de> <20140908115718.GL17501@suse.de> Mime-Version: 1.0 Content-Type: multipart/alternative; boundary=bcaec529a0074e481d05029d92b4 Cc: Andrew Morton , Vlastimil Babka , Johannes Weiner , Linux Kernel , Linux-MM , Linux-FSDevel To: Mel Gorman Return-path: In-Reply-To: <20140908115718.GL17501@suse.de> Sender: owner-linux-mm@kvack.org List-Id: linux-fsdevel.vger.kernel.org --bcaec529a0074e481d05029d92b4 Content-Type: text/plain; charset=UTF-8 Hi Mel, On Mon, Sep 8, 2014 at 2:57 PM, Mel Gorman wrote: > Commit 4ffeaf35 (mm: page_alloc: reduce cost of the fair zone allocation > policy) arguably broke the fair zone allocation policy on UP with these > hunks. > > a/mm/page_alloc.c > b/mm/page_alloc.c > @@ -1612,6 +1612,9 @@ again: > } > > __mod_zone_page_state(zone, NR_ALLOC_BATCH, -(1 << order)); > + if (zone_page_state(zone, NR_ALLOC_BATCH) == 0 && > + !zone_is_fair_depleted(zone)) > + zone_set_flag(zone, ZONE_FAIR_DEPLETED); > > __count_zone_vm_events(PGALLOC, zone, 1 << order); > zone_statistics(preferred_zone, zone, gfp_flags); > @@ -1966,8 +1985,10 @@ zonelist_scan: > if (alloc_flags & ALLOC_FAIR) { > if (!zone_local(preferred_zone, zone)) > break; > - if (zone_page_state(zone, NR_ALLOC_BATCH) <= 0) > + if (zone_is_fair_depleted(zone)) { > + nr_fair_skipped++; > continue; > + } > } > > A <= check was replaced with a ==. On SMP it doesn't matter because > negative values are returned as zero due to per-CPU drift which is not > possible in the UP case. 
Vlastimil Babka correctly pointed out that this > can wrap negative due to high-order allocations. > > However, Leon Romanovsky pointed out that a <= check on zone_page_state > was never correct as zone_page_state returns unsigned long so the root > cause of the breakage was the <= check in the first place. > > zone_page_state is an API hazard because of the difference in behaviour > between SMP and UP is very surprising. There is a good reason to allow > NR_ALLOC_BATCH to go negative -- when the counter is reset the negative > value takes recent activity into account. This patch makes zone_page_state > behave the same on SMP and UP as saving one branch on UP is not likely to > make a measurable performance difference. > > Reported-by: Vlastimil Babka > Reported-by: Leon Romanovsky > Signed-off-by: Mel Gorman > --- > include/linux/vmstat.h | 2 -- > 1 file changed, 2 deletions(-) > > diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h > index 82e7db7..cece0f0 100644 > --- a/include/linux/vmstat.h > +++ b/include/linux/vmstat.h > @@ -131,10 +131,8 @@ static inline unsigned long zone_page_state(struct > zone *zone, > enum zone_stat_item item) > { > long x = atomic_long_read(&zone->vm_stat[item]); > -#ifdef CONFIG_SMP > if (x < 0) > x = 0; > -#endif > return x; > } > Since you are changing vmstat.h, what do you think about change in all similar places? 
diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h index 82e7db7..88d3d3e 100644 --- a/include/linux/vmstat.h +++ b/include/linux/vmstat.h @@ -120,10 +120,8 @@ static inline void zone_page_state_add(long x, struct zone *zone, static inline unsigned long global_page_state(enum zone_stat_item item) { long x = atomic_long_read(&vm_stat[item]); -#ifdef CONFIG_SMP if (x < 0) x = 0; -#endif return x; } @@ -131,10 +129,8 @@ static inline unsigned long zone_page_state(struct zone *zone, enum zone_stat_item item) { long x = atomic_long_read(&zone->vm_stat[item]); -#ifdef CONFIG_SMP if (x < 0) x = 0; -#endif return x; } @@ -153,10 +149,9 @@ static inline unsigned long zone_page_state_snapshot(struct zone *zone, int cpu; for_each_online_cpu(cpu) x += per_cpu_ptr(zone->pageset, cpu)->vm_stat_diff[item]; - +#endif if (x < 0) x = 0; -#endif return x; } -- Leon Romanovsky | Independent Linux Consultant www.leon.nu | leon@leon.nu --bcaec529a0074e481d05029d92b4 Content-Type: text/html; charset=UTF-8 Content-Transfer-Encoding: quoted-printable
Hi Mel,

On Mon, Sep 8, 2014 at 2:57 PM, Mel Gorman <mgorman@suse.de= > wrote:
Commit 4ffeaf35 (mm: page_alloc: reduce cost of the fair zone allocation policy) arguably broke the fair zone allocation policy on UP with these
hunks.

a/mm/page_alloc.c
b/mm/page_alloc.c
@@ -1612,6 +1612,9 @@ again:
=C2=A0 =C2=A0 =C2=A0 =C2=A0 }

=C2=A0 =C2=A0 =C2=A0 =C2=A0 __mod_zone_page_state(zone, NR_ALLOC_BATCH, -(1= << order));
+=C2=A0 =C2=A0 =C2=A0 =C2=A0if (zone_page_state(zone, NR_ALLOC_BATCH) =3D= =3D 0 &&
+=C2=A0 =C2=A0 =C2=A0 =C2=A0 =C2=A0 =C2=A0!zone_is_fair_depleted(zone))
+=C2=A0 =C2=A0 =C2=A0 =C2=A0 =C2=A0 =C2=A0 =C2=A0 =C2=A0zone_set_flag(zone,= ZONE_FAIR_DEPLETED);

=C2=A0 =C2=A0 =C2=A0 =C2=A0 __count_zone_vm_events(PGALLOC, zone, 1 <<= ; order);
=C2=A0 =C2=A0 =C2=A0 =C2=A0 zone_statistics(preferred_zone, zone, gfp_flags= );
@@ -1966,8 +1985,10 @@ zonelist_scan:
=C2=A0 =C2=A0 =C2=A0 =C2=A0 =C2=A0 =C2=A0 =C2=A0 =C2=A0 if (alloc_flags &am= p; ALLOC_FAIR) {
=C2=A0 =C2=A0 =C2=A0 =C2=A0 =C2=A0 =C2=A0 =C2=A0 =C2=A0 =C2=A0 =C2=A0 =C2= =A0 =C2=A0 if (!zone_local(preferred_zone, zone))
=C2=A0 =C2=A0 =C2=A0 =C2=A0 =C2=A0 =C2=A0 =C2=A0 =C2=A0 =C2=A0 =C2=A0 =C2= =A0 =C2=A0 =C2=A0 =C2=A0 =C2=A0 =C2=A0 break;
-=C2=A0 =C2=A0 =C2=A0 =C2=A0 =C2=A0 =C2=A0 =C2=A0 =C2=A0 =C2=A0 =C2=A0 =C2= =A0 =C2=A0if (zone_page_state(zone, NR_ALLOC_BATCH) <=3D 0)
+=C2=A0 =C2=A0 =C2=A0 =C2=A0 =C2=A0 =C2=A0 =C2=A0 =C2=A0 =C2=A0 =C2=A0 =C2= =A0 =C2=A0if (zone_is_fair_depleted(zone)) {
+=C2=A0 =C2=A0 =C2=A0 =C2=A0 =C2=A0 =C2=A0 =C2=A0 =C2=A0 =C2=A0 =C2=A0 =C2= =A0 =C2=A0 =C2=A0 =C2=A0 =C2=A0 =C2=A0nr_fair_skipped++;
=C2=A0 =C2=A0 =C2=A0 =C2=A0 =C2=A0 =C2=A0 =C2=A0 =C2=A0 =C2=A0 =C2=A0 =C2= =A0 =C2=A0 =C2=A0 =C2=A0 =C2=A0 =C2=A0 continue;
+=C2=A0 =C2=A0 =C2=A0 =C2=A0 =C2=A0 =C2=A0 =C2=A0 =C2=A0 =C2=A0 =C2=A0 =C2= =A0 =C2=A0}
=C2=A0 =C2=A0 =C2=A0 =C2=A0 =C2=A0 =C2=A0 =C2=A0 =C2=A0 }

A <=3D check was replaced with a =3D=3D. On SMP it doesn't matter be= cause
negative values are returned as zero due to per-CPU drift which is not
possible in the UP case. Vlastimil Babka correctly pointed out that this can wrap negative due to high-order allocations.

However, Leon Romanovsky pointed out that a <=3D check on zone_page_stat= e
was never correct as zone_page_state returns unsigned long so the root
cause of the breakage was the <=3D check in the first place.

zone_page_state is an API hazard because of the difference in behaviour
between SMP and UP is very surprising. There is a good reason to allow
NR_ALLOC_BATCH to go negative -- when the counter is reset the negative
value takes recent activity into account. This patch makes zone_page_state<= br> behave the same on SMP and UP as saving one branch on UP is not likely to make a measurable performance difference.

Reported-by: Vlastimil Babka <vbabka@s= use.cz>
Reported-by: Leon Romanovsky <leon@leon.= nu>
Signed-off-by: Mel Gorman <mgorman@su= se.de>
---
=C2=A0include/linux/vmstat.h | 2 --
=C2=A01 file changed, 2 deletions(-)

diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h
index 82e7db7..cece0f0 100644
--- a/include/linux/vmstat.h
+++ b/include/linux/vmstat.h
@@ -131,10 +131,8 @@ static inline unsigned long zone_page_state(struct zon= e *zone,
=C2=A0 =C2=A0 =C2=A0 =C2=A0 =C2=A0 =C2=A0 =C2=A0 =C2=A0 =C2=A0 =C2=A0 =C2= =A0 =C2=A0 =C2=A0 =C2=A0 =C2=A0 =C2=A0 =C2=A0 =C2=A0 =C2=A0 =C2=A0 enum zon= e_stat_item item)
=C2=A0{
=C2=A0 =C2=A0 =C2=A0 =C2=A0 long x =3D atomic_long_read(&zone->vm_st= at[item]);
-#ifdef CONFIG_SMP
=C2=A0 =C2=A0 =C2=A0 =C2=A0 if (x < 0)
=C2=A0 =C2=A0 =C2=A0 =C2=A0 =C2=A0 =C2=A0 =C2=A0 =C2=A0 x =3D 0;
-#endif
=C2=A0 =C2=A0 =C2=A0 =C2=A0 return x;
=C2=A0}

Since you are changing vmstat.h, what do you think a=
bout change in all similar places?

diff --git a/include/linux/vmst= at.h b/include/linux/vmstat.h
index 82e7db7..88d3d3e 100644
--- a/inc= lude/linux/vmstat.h
+++ b/include/linux/vmstat.h
@@ -120,10 +120,8 @@= static inline void zone_page_state_add(long x, struct zone *zone,
=C2= =A0static inline unsigned long global_page_state(enum zone_stat_item item)<= br>=C2=A0{
=C2=A0=C2=A0=C2=A0=C2=A0=C2=A0=C2=A0=C2=A0 long x =3D atomic_= long_read(&vm_stat[item]);
-#ifdef CONFIG_SMP
=C2=A0=C2=A0=C2=A0= =C2=A0=C2=A0=C2=A0=C2=A0 if (x < 0)
=C2=A0=C2=A0=C2=A0=C2=A0=C2=A0=C2= =A0=C2=A0=C2=A0=C2=A0=C2=A0=C2=A0=C2=A0=C2=A0=C2=A0=C2=A0 x =3D 0;
-#end= if
=C2=A0=C2=A0=C2=A0=C2=A0=C2=A0=C2=A0=C2=A0 return x;
=C2=A0}
= =C2=A0
@@ -131,10 +129,8 @@ static inline unsigned long zone_page_state(= struct zone *zone,
=C2=A0=C2=A0=C2=A0=C2=A0=C2=A0=C2=A0=C2=A0=C2=A0=C2= =A0=C2=A0=C2=A0=C2=A0=C2=A0=C2=A0=C2=A0=C2=A0=C2=A0=C2=A0=C2=A0=C2=A0=C2=A0= =C2=A0=C2=A0=C2=A0=C2=A0=C2=A0=C2=A0=C2=A0=C2=A0=C2=A0=C2=A0=C2=A0=C2=A0=C2= =A0=C2=A0=C2=A0=C2=A0=C2=A0=C2=A0 enum zone_stat_item item)
=C2=A0{
= =C2=A0=C2=A0=C2=A0=C2=A0=C2=A0=C2=A0=C2=A0 long x =3D atomic_long_read(&= ;zone->vm_stat[item]);
-#ifdef CONFIG_SMP
=C2=A0=C2=A0=C2=A0=C2=A0= =C2=A0=C2=A0=C2=A0 if (x < 0)
=C2=A0=C2=A0=C2=A0=C2=A0=C2=A0=C2=A0=C2= =A0=C2=A0=C2=A0=C2=A0=C2=A0=C2=A0=C2=A0=C2=A0=C2=A0 x =3D 0;
-#endif
= =C2=A0=C2=A0=C2=A0=C2=A0=C2=A0=C2=A0=C2=A0 return x;
=C2=A0}
=C2=A0@@ -153,10 +149,9 @@ static inline unsigned long zone_page_state_snapshot= (struct zone *zone,
=C2=A0=C2=A0=C2=A0=C2=A0=C2=A0=C2=A0=C2=A0 int cpu;<= br>=C2=A0=C2=A0=C2=A0=C2=A0=C2=A0=C2=A0=C2=A0 for_each_online_cpu(cpu)
= =C2=A0=C2=A0=C2=A0=C2=A0=C2=A0=C2=A0=C2=A0=C2=A0=C2=A0=C2=A0=C2=A0=C2=A0=C2= =A0=C2=A0=C2=A0 x +=3D per_cpu_ptr(zone->pageset, cpu)->vm_stat_diff[= item];
-
+#endif
=C2=A0=C2=A0=C2=A0=C2=A0=C2=A0=C2=A0=C2=A0 if (x = < 0)
=C2=A0=C2=A0=C2=A0=C2=A0=C2=A0=C2=A0=C2=A0=C2=A0=C2=A0=C2=A0=C2= =A0=C2=A0=C2=A0=C2=A0=C2=A0 x =3D 0;
-#endif
=C2=A0=C2=A0=C2=A0=C2=A0= =C2=A0=C2=A0=C2=A0 return x;
=C2=A0}


--
Leon Romanovsky | Indep= endent Linux Consultant
=C2=A0 =C2=A0 =C2=A0 =C2=A0=C2=A0www.leon.nu=C2=A0| leon@leon.nu
--bcaec529a0074e481d05029d92b4-- -- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@kvack.org. For more info on Linux MM, see: http://www.linux-mm.org/ . Don't email: email@kvack.org