From mboxrd@z Thu Jan 1 00:00:00 1970 From: Andrew Cooper Subject: Re: [PATCH 1/2] make fls() and ffs() consistent across architectures Date: Fri, 23 Jan 2015 14:38:21 +0000 Message-ID: <54C25CDD.4080200@citrix.com> References: <54C108910200007800058233@mail.emea.novell.com> <54C10B4F0200007800058252@mail.emea.novell.com> Mime-Version: 1.0 Content-Type: text/plain; charset="us-ascii" Content-Transfer-Encoding: 7bit Return-path: Received: from mail6.bemta5.messagelabs.com ([195.245.231.135]) by lists.xen.org with esmtp (Exim 4.72) (envelope-from ) id 1YEfNV-0002R3-0m for xen-devel@lists.xenproject.org; Fri, 23 Jan 2015 14:38:29 +0000 In-Reply-To: <54C10B4F0200007800058252@mail.emea.novell.com> List-Unsubscribe: , List-Post: List-Help: List-Subscribe: , Sender: xen-devel-bounces@lists.xen.org Errors-To: xen-devel-bounces@lists.xen.org To: Jan Beulich , xen-devel Cc: Ian Campbell , Keir Fraser , Tim Deegan , Ian Jackson , Stefano Stabellini List-Id: xen-devel@lists.xenproject.org On 22/01/15 13:38, Jan Beulich wrote: > Their parameter types differed between ARM and x86. > > Along with generalizing the functions this fixes > - x86's non-long functions having long parameter types > - ARM's ffs() using a long intermediate variable > - generic_fls64() being broken when the upper half of the input is > non-zero > - common (and in one case also ARM) code using fls() when flsl() was > meant > > Also drop ARM's constant_fls() in favor of the identical generic_fls(). > > Signed-off-by: Jan Beulich The refactoring looks fine, so Reviewed-by: Andrew Cooper As for the x86 asm itself, what about http://lkml.kernel.org/r/5058741E020000780009C014@nat28.tlf.novell.com ? 
It would appear that the same optimisation is relevant for Xen's __ffs() ~Andrew > > --- unstable.orig/xen/common/page_alloc.c 2014-11-07 17:16:38.000000000 +0100 > +++ unstable/xen/common/page_alloc.c 2015-01-21 08:53:14.000000000 +0100 > @@ -278,7 +278,7 @@ unsigned long __init alloc_boot_pages( > > #define bits_to_zone(b) (((b) < (PAGE_SHIFT + 1)) ? 1 : ((b) - PAGE_SHIFT)) > #define page_to_zone(pg) (is_xen_heap_page(pg) ? MEMZONE_XEN : \ > - (fls(page_to_mfn(pg)) ? : 1)) > + (flsl(page_to_mfn(pg)) ? : 1)) > > typedef struct page_list_head heap_by_zone_and_order_t[NR_ZONES][MAX_ORDER+1]; > static heap_by_zone_and_order_t *_heap[MAX_NUMNODES]; > @@ -1259,7 +1259,7 @@ void __init end_boot_allocator(void) > { > #ifdef CONFIG_X86 > dma_bitsize = min_t(unsigned int, > - fls(NODE_DATA(0)->node_spanned_pages) - 1 > + flsl(NODE_DATA(0)->node_spanned_pages) - 1 > + PAGE_SHIFT - 2, > 32); > #else > @@ -1544,7 +1544,7 @@ static unsigned int __read_mostly xenhea > > void __init xenheap_max_mfn(unsigned long mfn) > { > - xenheap_bits = fls(mfn) + PAGE_SHIFT; > + xenheap_bits = flsl(mfn) + PAGE_SHIFT; > } > > void init_xenheap_pages(paddr_t ps, paddr_t pe) > --- unstable.orig/xen/common/xmalloc_tlsf.c 2015-01-22 13:28:41.000000000 +0100 > +++ unstable/xen/common/xmalloc_tlsf.c 2015-01-21 08:53:05.000000000 +0100 > @@ -138,9 +138,9 @@ static inline void MAPPING_SEARCH(unsign > } > else > { > - t = (1 << (fls(*r) - 1 - MAX_LOG2_SLI)) - 1; > + t = (1 << (flsl(*r) - 1 - MAX_LOG2_SLI)) - 1; > *r = *r + t; > - *fl = fls(*r) - 1; > + *fl = flsl(*r) - 1; > *sl = (*r >> (*fl - MAX_LOG2_SLI)) - MAX_SLI; > *fl -= FLI_OFFSET; > /*if ((*fl -= FLI_OFFSET) < 0) // FL will be always >0! 
> @@ -164,7 +164,7 @@ static inline void MAPPING_INSERT(unsign > } > else > { > - *fl = fls(r) - 1; > + *fl = flsl(r) - 1; > *sl = (r >> (*fl - MAX_LOG2_SLI)) - MAX_SLI; > *fl -= FLI_OFFSET; > } > --- unstable.orig/xen/include/asm-arm/arm32/bitops.h 2015-01-22 13:28:41.000000000 +0100 > +++ unstable/xen/include/asm-arm/arm32/bitops.h 2015-01-21 10:06:01.000000000 +0100 > @@ -15,6 +15,8 @@ extern int _test_and_change_bit(int nr, > #define test_and_clear_bit(n,p) _test_and_clear_bit(n,p) > #define test_and_change_bit(n,p) _test_and_change_bit(n,p) > > +#define flsl fls > + > /* > * Little endian assembly bitops. nr = 0 -> byte 0 bit 0. > */ > --- unstable.orig/xen/include/asm-arm/arm64/bitops.h 2015-01-22 13:28:41.000000000 +0100 > +++ unstable/xen/include/asm-arm/arm64/bitops.h 2015-01-21 08:28:43.000000000 +0100 > @@ -32,6 +32,17 @@ static /*__*/always_inline unsigned long > */ > #define ffz(x) __ffs(~(x)) > > +static inline int flsl(unsigned long x) > +{ > + int ret; > + > + if (__builtin_constant_p(x)) > + return generic_flsl(x); > + > + asm("clz\t%0, %1" : "=r" (ret) : "r" (x)); > + return BITS_PER_LONG - ret; > +} > + > /* Based on linux/include/asm-generic/bitops/find.h */ > > #ifndef find_next_bit > --- unstable.orig/xen/include/asm-arm/bitops.h 2015-01-22 13:28:41.000000000 +0100 > +++ unstable/xen/include/asm-arm/bitops.h 2015-01-22 13:30:17.000000000 +0100 > @@ -99,46 +99,17 @@ static inline int test_bit(int nr, const > return 1UL & (p[BIT_WORD(nr)] >> (nr & (BITS_PER_WORD-1))); > } > > -static inline int constant_fls(int x) > -{ > - int r = 32; > - > - if (!x) > - return 0; > - if (!(x & 0xffff0000u)) { > - x <<= 16; > - r -= 16; > - } > - if (!(x & 0xff000000u)) { > - x <<= 8; > - r -= 8; > - } > - if (!(x & 0xf0000000u)) { > - x <<= 4; > - r -= 4; > - } > - if (!(x & 0xc0000000u)) { > - x <<= 2; > - r -= 2; > - } > - if (!(x & 0x80000000u)) { > - x <<= 1; > - r -= 1; > - } > - return r; > -} > - > /* > * On ARMv5 and above those functions can be 
implemented around > * the clz instruction for much better code efficiency. > */ > > -static inline int fls(int x) > +static inline int fls(unsigned int x) > { > int ret; > > if (__builtin_constant_p(x)) > - return constant_fls(x); > + return generic_fls(x); > > asm("clz\t%0, %1" : "=r" (ret) : "r" (x)); > ret = BITS_PER_LONG - ret; > @@ -146,7 +117,8 @@ static inline int fls(int x) > } > > > -#define ffs(x) ({ unsigned long __t = (x); fls(__t & -__t); }) > +#define ffs(x) ({ unsigned int __t = (x); fls(__t & -__t); }) > +#define ffsl(x) ({ unsigned long __t = (x); flsl(__t & -__t); }) > > /** > * find_first_set_bit - find the first set bit in @word > @@ -157,7 +129,7 @@ static inline int fls(int x) > */ > static inline unsigned int find_first_set_bit(unsigned long word) > { > - return ffs(word) - 1; > + return ffsl(word) - 1; > } > > /** > --- unstable.orig/xen/include/asm-x86/bitops.h 2014-09-15 15:42:35.000000000 +0200 > +++ unstable/xen/include/asm-x86/bitops.h 2015-01-22 13:30:02.000000000 +0100 > @@ -401,7 +401,7 @@ static inline unsigned int find_first_se > * > * This is defined the same way as the libc and compiler builtin ffs routines. > */ > -static inline int ffs(unsigned long x) > +static inline int ffsl(unsigned long x) > { > long r; > > @@ -412,13 +412,24 @@ static inline int ffs(unsigned long x) > return (int)r+1; > } > > +static inline int ffs(unsigned int x) > +{ > + int r; > + > + asm ( "bsf %1,%0\n\t" > + "jnz 1f\n\t" > + "mov $-1,%0\n" > + "1:" : "=r" (r) : "rm" (x)); > + return r + 1; > +} > + > /** > * fls - find last bit set > * @x: the word to search > * > * This is defined the same way as ffs. 
> */ > -static inline int fls(unsigned long x) > +static inline int flsl(unsigned long x) > { > long r; > > @@ -429,8 +440,16 @@ static inline int fls(unsigned long x) > return (int)r+1; > } > > -#define ffs64 ffs > -#define fls64 fls > +static inline int fls(unsigned int x) > +{ > + int r; > + > + asm ( "bsr %1,%0\n\t" > + "jnz 1f\n\t" > + "mov $-1,%0\n" > + "1:" : "=r" (r) : "rm" (x)); > + return r + 1; > +} > > /** > * hweightN - returns the hamming weight of a N-bit word > --- unstable.orig/xen/include/xen/bitops.h 2015-01-22 13:28:41.000000000 +0100 > +++ unstable/xen/include/xen/bitops.h 2015-01-22 13:30:46.000000000 +0100 > @@ -70,20 +70,52 @@ static __inline__ int generic_fls(int x) > return r; > } > > +#if BITS_PER_LONG == 64 > + > +static inline int generic_ffsl(unsigned long x) > +{ > + return !x || (u32)x ? generic_ffs(x) : generic_ffs(x >> 32) + 32; > +} > + > +static inline int generic_flsl(unsigned long x) > +{ > + u32 h = x >> 32; > + > + return h ? generic_fls(h) + 32 : generic_fls(x); > +} > + > +#else > +# define generic_ffsl generic_ffs > +# define generic_flsl generic_fls > +#endif > + > /* > * Include this here because some architectures need generic_ffs/fls in > * scope > */ > #include > > - > +#if BITS_PER_LONG == 64 > +# define fls64 flsl > +# define ffs64 ffsl > +#else > +# ifndef ffs64 > +static inline int generic_ffs64(__u64 x) > +{ > + return !x || (__u32)x ? ffs(x) : ffs(x >> 32) + 32; > +} > +# define ffs64 generic_ffs64 > +# endif > +# ifndef fls64 > static inline int generic_fls64(__u64 x) > { > __u32 h = x >> 32; > - if (h) > - return fls(x) + 32; > - return fls(x); > + > + return h ? fls(h) + 32 : fls(x); > } > +# define fls64 generic_fls64 > +# endif > +#endif > > static __inline__ int get_bitmask_order(unsigned int count) > { > >