* [PATCH 0/2] fls() / ffs() adjustments
@ 2015-01-22 13:26 Jan Beulich
  2015-01-22 13:38 ` [PATCH 1/2] make fls() and ffs() consistent across architectures Jan Beulich
                   ` (2 more replies)
  0 siblings, 3 replies; 9+ messages in thread
From: Jan Beulich @ 2015-01-22 13:26 UTC (permalink / raw)
  To: xen-devel
  Cc: Keir Fraser, Stefano Stabellini, Andrew Cooper, Tim Deegan,
	Ian Campbell, Ian Jackson

1: make fls() and ffs() consistent across architectures
2: arm64: fix fls()

ARM maintainers: If you so desire, I'm fine with patch 2 being replaced
by anything you like better (e.g. as Ian C mentioned, compiler builtins);
just don't rely on me doing so.

Signed-off-by: Jan Beulich <jbeulich@suse.com>


* [PATCH 1/2] make fls() and ffs() consistent across architectures
  2015-01-22 13:26 [PATCH 0/2] fls() / ffs() adjustments Jan Beulich
@ 2015-01-22 13:38 ` Jan Beulich
  2015-01-23 14:38   ` Andrew Cooper
  2015-01-22 13:38 ` [PATCH 2/2] arm64: fix fls() Jan Beulich
  2015-01-22 14:59 ` [PATCH 0/2] fls() / ffs() adjustments Ian Campbell
  2 siblings, 1 reply; 9+ messages in thread
From: Jan Beulich @ 2015-01-22 13:38 UTC (permalink / raw)
  To: xen-devel
  Cc: Keir Fraser, Stefano Stabellini, Andrew Cooper, Tim Deegan,
	Ian Campbell, Ian Jackson


Their parameter types differed between ARM and x86.

Along with generalizing the functions, this fixes
- x86's non-long functions having long parameter types
- ARM's ffs() using a long intermediate variable
- generic_fls64() being broken when the upper half of the input is
  non-zero
- common (and in one case also ARM) code using fls() when flsl() was
  meant

Also drop ARM's constant_fls() in favor of the identical generic_fls().

Signed-off-by: Jan Beulich <jbeulich@suse.com>
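
[Illustration added in editing - not part of the patch. A minimal,
self-contained C sketch of the 1-based fls()/ffs() convention these
helpers follow (an input of 0 yields 0) and of the generic_fls64()
breakage being fixed; ref_fls(), fls64_broken() and fls64_fixed() are
simplified reference implementations, not Xen code:]

#include <assert.h>
#include <stdint.h>

static int ref_fls(uint32_t x)          /* reference fls(): last set bit */
{
    int r = 0;

    while (x) {
        r++;
        x >>= 1;
    }
    return r;
}

static int fls64_broken(uint64_t x)     /* pre-patch generic_fls64() */
{
    uint32_t h = x >> 32;

    if (h)
        return ref_fls(x) + 32;         /* scans the LOW half! */
    return ref_fls(x);
}

static int fls64_fixed(uint64_t x)      /* post-patch behaviour */
{
    uint32_t h = x >> 32;

    return h ? ref_fls(h) + 32 : ref_fls(x);
}

int main(void)
{
    uint32_t v = 0x50;

    assert(ref_fls(0) == 0 && ref_fls(1) == 1);
    assert(ref_fls(0x80000000u) == 32);
    /* ffs() is derived from fls(): v & -v isolates the lowest set bit. */
    assert(ref_fls(v & -v) == 5);           /* i.e. ffs(0x50) == 5 */
    assert(fls64_fixed(1ULL << 32) == 33);
    assert(fls64_broken(1ULL << 32) == 32); /* low half is 0 -> 0 + 32 */
    return 0;
}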

--- unstable.orig/xen/common/page_alloc.c	2014-11-07 17:16:38.000000000 +0100
+++ unstable/xen/common/page_alloc.c	2015-01-21 08:53:14.000000000 +0100
@@ -278,7 +278,7 @@ unsigned long __init alloc_boot_pages(
 
 #define bits_to_zone(b) (((b) < (PAGE_SHIFT + 1)) ? 1 : ((b) - PAGE_SHIFT))
 #define page_to_zone(pg) (is_xen_heap_page(pg) ? MEMZONE_XEN :  \
-                          (fls(page_to_mfn(pg)) ? : 1))
+                          (flsl(page_to_mfn(pg)) ? : 1))
 
 typedef struct page_list_head heap_by_zone_and_order_t[NR_ZONES][MAX_ORDER+1];
 static heap_by_zone_and_order_t *_heap[MAX_NUMNODES];
@@ -1259,7 +1259,7 @@ void __init end_boot_allocator(void)
     {
 #ifdef CONFIG_X86
         dma_bitsize = min_t(unsigned int,
-                            fls(NODE_DATA(0)->node_spanned_pages) - 1
+                            flsl(NODE_DATA(0)->node_spanned_pages) - 1
                             + PAGE_SHIFT - 2,
                             32);
 #else
@@ -1544,7 +1544,7 @@ static unsigned int __read_mostly xenhea
 
 void __init xenheap_max_mfn(unsigned long mfn)
 {
-    xenheap_bits = fls(mfn) + PAGE_SHIFT;
+    xenheap_bits = flsl(mfn) + PAGE_SHIFT;
 }
 
 void init_xenheap_pages(paddr_t ps, paddr_t pe)
--- unstable.orig/xen/common/xmalloc_tlsf.c	2015-01-22 13:28:41.000000000 +0100
+++ unstable/xen/common/xmalloc_tlsf.c	2015-01-21 08:53:05.000000000 +0100
@@ -138,9 +138,9 @@ static inline void MAPPING_SEARCH(unsign
     }
     else
     {
-        t = (1 << (fls(*r) - 1 - MAX_LOG2_SLI)) - 1;
+        t = (1 << (flsl(*r) - 1 - MAX_LOG2_SLI)) - 1;
         *r = *r + t;
-        *fl = fls(*r) - 1;
+        *fl = flsl(*r) - 1;
         *sl = (*r >> (*fl - MAX_LOG2_SLI)) - MAX_SLI;
         *fl -= FLI_OFFSET;
         /*if ((*fl -= FLI_OFFSET) < 0) // FL will be always >0!
@@ -164,7 +164,7 @@ static inline void MAPPING_INSERT(unsign
     }
     else
     {
-        *fl = fls(r) - 1;
+        *fl = flsl(r) - 1;
         *sl = (r >> (*fl - MAX_LOG2_SLI)) - MAX_SLI;
         *fl -= FLI_OFFSET;
     }
--- unstable.orig/xen/include/asm-arm/arm32/bitops.h	2015-01-22 13:28:41.000000000 +0100
+++ unstable/xen/include/asm-arm/arm32/bitops.h	2015-01-21 10:06:01.000000000 +0100
@@ -15,6 +15,8 @@ extern int _test_and_change_bit(int nr, 
 #define test_and_clear_bit(n,p)   _test_and_clear_bit(n,p)
 #define test_and_change_bit(n,p)  _test_and_change_bit(n,p)
 
+#define flsl fls
+
 /*
  * Little endian assembly bitops.  nr = 0 -> byte 0 bit 0.
  */
--- unstable.orig/xen/include/asm-arm/arm64/bitops.h	2015-01-22 13:28:41.000000000 +0100
+++ unstable/xen/include/asm-arm/arm64/bitops.h	2015-01-21 08:28:43.000000000 +0100
@@ -32,6 +32,17 @@ static /*__*/always_inline unsigned long
  */
 #define ffz(x)  __ffs(~(x))
 
+static inline int flsl(unsigned long x)
+{
+        int ret;
+
+        if (__builtin_constant_p(x))
+               return generic_flsl(x);
+
+        asm("clz\t%0, %1" : "=r" (ret) : "r" (x));
+        return BITS_PER_LONG - ret;
+}
+
 /* Based on linux/include/asm-generic/bitops/find.h */
 
 #ifndef find_next_bit
--- unstable.orig/xen/include/asm-arm/bitops.h	2015-01-22 13:28:41.000000000 +0100
+++ unstable/xen/include/asm-arm/bitops.h	2015-01-22 13:30:17.000000000 +0100
@@ -99,46 +99,17 @@ static inline int test_bit(int nr, const
         return 1UL & (p[BIT_WORD(nr)] >> (nr & (BITS_PER_WORD-1)));
 }
 
-static inline int constant_fls(int x)
-{
-        int r = 32;
-
-        if (!x)
-                return 0;
-        if (!(x & 0xffff0000u)) {
-                x <<= 16;
-                r -= 16;
-        }
-        if (!(x & 0xff000000u)) {
-                x <<= 8;
-                r -= 8;
-        }
-        if (!(x & 0xf0000000u)) {
-                x <<= 4;
-                r -= 4;
-        }
-        if (!(x & 0xc0000000u)) {
-                x <<= 2;
-                r -= 2;
-        }
-        if (!(x & 0x80000000u)) {
-                x <<= 1;
-                r -= 1;
-        }
-        return r;
-}
-
 /*
  * On ARMv5 and above those functions can be implemented around
  * the clz instruction for much better code efficiency.
  */
 
-static inline int fls(int x)
+static inline int fls(unsigned int x)
 {
         int ret;
 
         if (__builtin_constant_p(x))
-               return constant_fls(x);
+               return generic_fls(x);
 
         asm("clz\t%0, %1" : "=r" (ret) : "r" (x));
         ret = BITS_PER_LONG - ret;
@@ -146,7 +117,8 @@ static inline int fls(int x)
 }
 
 
-#define ffs(x) ({ unsigned long __t = (x); fls(__t & -__t); })
+#define ffs(x) ({ unsigned int __t = (x); fls(__t & -__t); })
+#define ffsl(x) ({ unsigned long __t = (x); flsl(__t & -__t); })
 
 /**
  * find_first_set_bit - find the first set bit in @word
@@ -157,7 +129,7 @@ static inline int fls(int x)
  */
 static inline unsigned int find_first_set_bit(unsigned long word)
 {
-        return ffs(word) - 1;
+        return ffsl(word) - 1;
 }
 
 /**
--- unstable.orig/xen/include/asm-x86/bitops.h	2014-09-15 15:42:35.000000000 +0200
+++ unstable/xen/include/asm-x86/bitops.h	2015-01-22 13:30:02.000000000 +0100
@@ -401,7 +401,7 @@ static inline unsigned int find_first_se
  *
  * This is defined the same way as the libc and compiler builtin ffs routines.
  */
-static inline int ffs(unsigned long x)
+static inline int ffsl(unsigned long x)
 {
     long r;
 
@@ -412,13 +412,24 @@ static inline int ffs(unsigned long x)
     return (int)r+1;
 }
 
+static inline int ffs(unsigned int x)
+{
+    int r;
+
+    asm ( "bsf %1,%0\n\t"
+          "jnz 1f\n\t"
+          "mov $-1,%0\n"
+          "1:" : "=r" (r) : "rm" (x));
+    return r + 1;
+}
+
 /**
  * fls - find last bit set
  * @x: the word to search
  *
  * This is defined the same way as ffs.
  */
-static inline int fls(unsigned long x)
+static inline int flsl(unsigned long x)
 {
     long r;
 
@@ -429,8 +440,16 @@ static inline int fls(unsigned long x)
     return (int)r+1;
 }
 
-#define ffs64 ffs
-#define fls64 fls
+static inline int fls(unsigned int x)
+{
+    int r;
+
+    asm ( "bsr %1,%0\n\t"
+          "jnz 1f\n\t"
+          "mov $-1,%0\n"
+          "1:" : "=r" (r) : "rm" (x));
+    return r + 1;
+}
 
 /**
  * hweightN - returns the hamming weight of a N-bit word
--- unstable.orig/xen/include/xen/bitops.h	2015-01-22 13:28:41.000000000 +0100
+++ unstable/xen/include/xen/bitops.h	2015-01-22 13:30:46.000000000 +0100
@@ -70,20 +70,52 @@ static __inline__ int generic_fls(int x)
     return r;
 }
 
+#if BITS_PER_LONG == 64
+
+static inline int generic_ffsl(unsigned long x)
+{
+    return !x || (u32)x ? generic_ffs(x) : generic_ffs(x >> 32) + 32;
+}
+
+static inline int generic_flsl(unsigned long x)
+{
+    u32 h = x >> 32;
+
+    return h ? generic_fls(h) + 32 : generic_fls(x);
+}
+
+#else
+# define generic_ffsl generic_ffs
+# define generic_flsl generic_fls
+#endif
+
 /*
  * Include this here because some architectures need generic_ffs/fls in
  * scope
  */
 #include <asm/bitops.h>
 
-
+#if BITS_PER_LONG == 64
+# define fls64 flsl
+# define ffs64 ffsl
+#else
+# ifndef ffs64
+static inline int generic_ffs64(__u64 x)
+{
+    return !x || (__u32)x ? ffs(x) : ffs(x >> 32) + 32;
+}
+#  define ffs64 generic_ffs64
+# endif
+# ifndef fls64
 static inline int generic_fls64(__u64 x)
 {
     __u32 h = x >> 32;
-    if (h)
-        return fls(x) + 32;
-    return fls(x);
+
+    return h ? fls(h) + 32 : fls(x);
 }
+#  define fls64 generic_fls64
+# endif
+#endif
 
 static __inline__ int get_bitmask_order(unsigned int count)
 {




* [PATCH 2/2] arm64: fix fls()
  2015-01-22 13:26 [PATCH 0/2] fls() / ffs() adjustments Jan Beulich
  2015-01-22 13:38 ` [PATCH 1/2] make fls() and ffs() consistent across architectures Jan Beulich
@ 2015-01-22 13:38 ` Jan Beulich
  2015-01-22 14:01   ` Tim Deegan
  2015-01-22 14:59 ` [PATCH 0/2] fls() / ffs() adjustments Ian Campbell
  2 siblings, 1 reply; 9+ messages in thread
From: Jan Beulich @ 2015-01-22 13:38 UTC (permalink / raw)
  To: xen-devel; +Cc: Ian Campbell, Tim Deegan, Stefano Stabellini


Using CLZ on a 64-bit register while specifying the input operand as
only 32 bits wide is wrong: an operand intentionally shrunk down to 32
bits at the source level doesn't imply that the corresponding zero
extension also happens at the machine instruction level, and hence the
wrong result could get returned.

Add a suitable inline assembly abstraction so that the function can
remain shared between arm32 and arm64. The need to include asm_defns.h
in bitops.h makes it necessary to adjust processor.h though - it is
generally wrong to include public headers without making sure that
integer types are properly defined. (I didn't investigate or try
whether the possible alternative of moving the public/arch-arm.h
inclusion down in the file would also work.)

Signed-off-by: Jan Beulich <jbeulich@suse.com>
---
Note: Only compile tested due to lack of suitable ARM64 test environment.
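
[Reading aid added in editing - not part of the patch. A sketch of what
the shared asm() below expands to on each architecture, assuming GCC
inline assembly and GCC's AArch64 "%w" operand modifier, which prints
the 32-bit (w-register) name of an operand; the function names are
illustrative:]

static inline int fls_arm32(unsigned int x)   /* __OP32 expands to "" */
{
    int ret;

    /* Plain 32-bit CLZ on arm32, e.g. "clz r0, r1". */
    asm("clz\t%0, %1" : "=r" (ret) : "r" (x));
    return 32 - ret;
}

static inline int fls_arm64(unsigned int x)   /* __OP32 expands to "w" */
{
    int ret;

    /* "%w" forces the 32-bit register names, e.g. "clz w0, w1", so
     * CLZ only ever sees the low, well-defined 32 bits of the input. */
    asm("clz\t%w0, %w1" : "=r" (ret) : "r" (x));
    return 32 - ret;
}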

--- a/xen/include/asm-arm/asm_defns.h
+++ b/xen/include/asm-arm/asm_defns.h
@@ -7,6 +7,15 @@
 #endif
 #include <asm/processor.h>
 
+/* For generic assembly code: use macros to define operand sizes. */
+#if defined(CONFIG_ARM_32)
+# define __OP32
+#elif defined(CONFIG_ARM_64)
+# define __OP32 "w"
+#else
+# error "unknown ARM variant"
+#endif
+
 #endif /* __ARM_ASM_DEFNS_H__ */
 /*
  * Local variables:
--- a/xen/include/asm-arm/bitops.h
+++ b/xen/include/asm-arm/bitops.h
@@ -9,6 +9,8 @@
 #ifndef _ARM_BITOPS_H
 #define _ARM_BITOPS_H
 
+#include <asm/asm_defns.h>
+
 /*
  * Non-atomic bit manipulation.
  *
@@ -111,9 +113,8 @@ static inline int fls(unsigned int x)
         if (__builtin_constant_p(x))
                return generic_fls(x);
 
-        asm("clz\t%0, %1" : "=r" (ret) : "r" (x));
-        ret = BITS_PER_LONG - ret;
-        return ret;
+        asm("clz\t%"__OP32"0, %"__OP32"1" : "=r" (ret) : "r" (x));
+        return 32 - ret;
 }
 
 
--- a/xen/include/asm-arm/processor.h
+++ b/xen/include/asm-arm/processor.h
@@ -3,6 +3,9 @@
 
 #include <asm/cpregs.h>
 #include <asm/sysregs.h>
+#ifndef __ASSEMBLY__
+#include <xen/types.h>
+#endif
 #include <public/arch-arm.h>
 
 /* MIDR Main ID Register */
@@ -220,8 +223,6 @@
 
 #ifndef __ASSEMBLY__
 
-#include <xen/types.h>
-
 struct cpuinfo_arm {
     union {
         uint32_t bits;





* Re: [PATCH 2/2] arm64: fix fls()
  2015-01-22 13:38 ` [PATCH 2/2] arm64: fix fls() Jan Beulich
@ 2015-01-22 14:01   ` Tim Deegan
  2015-01-22 14:05     ` Jan Beulich
  0 siblings, 1 reply; 9+ messages in thread
From: Tim Deegan @ 2015-01-22 14:01 UTC (permalink / raw)
  To: Jan Beulich; +Cc: Ian Campbell, xen-devel, Stefano Stabellini

At 13:38 +0000 on 22 Jan (1421930334), Jan Beulich wrote:
> Using CLZ on a 64-bit register while specifying the input operand as
> only 32 bits wide is wrong: an operand intentionally shrunk down to 32
> bits at the source level doesn't imply that the corresponding zero
> extension also happens at the machine instruction level, and hence the
> wrong result could get returned.
> 
> Add a suitable inline assembly abstraction so that the function can
> remain shared between arm32 and arm64.

Would casting the asm arguments to unsigned long DTRT without
needing #ifdefs?

Cheers,

Tim.


* Re: [PATCH 2/2] arm64: fix fls()
  2015-01-22 14:01   ` Tim Deegan
@ 2015-01-22 14:05     ` Jan Beulich
  2015-01-22 14:29       ` Tim Deegan
  0 siblings, 1 reply; 9+ messages in thread
From: Jan Beulich @ 2015-01-22 14:05 UTC (permalink / raw)
  To: Tim Deegan; +Cc: Ian Campbell, xen-devel, Stefano Stabellini

>>> On 22.01.15 at 15:01, <tim@xen.org> wrote:
> At 13:38 +0000 on 22 Jan (1421930334), Jan Beulich wrote:
>> Using CLZ on a 64-bit register while specifying the input operand as
>> only 32 bits wide is wrong: an operand intentionally shrunk down to 32
>> bits at the source level doesn't imply that the corresponding zero
>> extension also happens at the machine instruction level, and hence the
>> wrong result could get returned.
>> 
>> Add a suitable inline assembly abstraction so that the function can
>> remain shared between arm32 and arm64.
> 
> Would casting the asm arguments to unsigned long DTRT without
> needing #ifdefs?

For the input operand - yes, I think so. But I'd rather not introduce
such (suspicious) casts, and I also think it is better to use the
32-bit instruction variants where possible (they quite certainly won't
perform worse than the 64-bit ones, and may perform better).

Jan
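
[Illustration added in editing - a hypothetical sketch of the cast-based
alternative discussed above, not what was committed; fls_cast() is an
illustrative name:]

static inline int fls_cast(unsigned int x)
{
    unsigned long ret;

    /* Casting the input to unsigned long makes the compiler zero-extend
     * it into a full-width register before the (on arm64, 64-bit) CLZ. */
    asm("clz\t%0, %1" : "=r" (ret) : "r" ((unsigned long)x));
    /* On a zero-extended value, 64-bit CLZ == 32 + 32-bit CLZ, so
     * BITS_PER_LONG - ret still yields the expected 1-based fls(). */
    return BITS_PER_LONG - ret;
}

The committed fix instead forces the 32-bit instruction via the "w"
operand modifier, matching the preference for 32-bit variants above.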


* Re: [PATCH 2/2] arm64: fix fls()
  2015-01-22 14:05     ` Jan Beulich
@ 2015-01-22 14:29       ` Tim Deegan
  0 siblings, 0 replies; 9+ messages in thread
From: Tim Deegan @ 2015-01-22 14:29 UTC (permalink / raw)
  To: Jan Beulich; +Cc: Ian Campbell, xen-devel, Stefano Stabellini

At 14:05 +0000 on 22 Jan (1421931912), Jan Beulich wrote:
> >>> On 22.01.15 at 15:01, <tim@xen.org> wrote:
> > At 13:38 +0000 on 22 Jan (1421930334), Jan Beulich wrote:
> >> Using CLZ on a 64-bit register while specifying the input operand as
> >> only 32 bits wide is wrong: an operand intentionally shrunk down to 32
> >> bits at the source level doesn't imply that the corresponding zero
> >> extension also happens at the machine instruction level, and hence the
> >> wrong result could get returned.
> >> 
> >> Add a suitable inline assembly abstraction so that the function can
> >> remain shared between arm32 and arm64.
> > 
> > Would casting the asm arguments to unsigned long DTRT without
> > needing #ifdefs?
> 
> For the input operand - yes, I think so. But I'd rather not introduce
> such (suspicious) casts, and I also think it is better to use the
> 32-bit instruction variants where possible (they quite certainly won't
> perform worse than the 64-bit ones, and may perform better).

Ah, I had misunderstood - I thought the new runes were just enforcing
correct extension of the arguments.  Using a 32-bit instruction does
indeed sound better.

Tim.


* Re: [PATCH 0/2] fls() / ffs() adjustments
  2015-01-22 13:26 [PATCH 0/2] fls() / ffs() adjustments Jan Beulich
  2015-01-22 13:38 ` [PATCH 1/2] make fls() and ffs() consistent across architectures Jan Beulich
  2015-01-22 13:38 ` [PATCH 2/2] arm64: fix fls() Jan Beulich
@ 2015-01-22 14:59 ` Ian Campbell
  2 siblings, 0 replies; 9+ messages in thread
From: Ian Campbell @ 2015-01-22 14:59 UTC (permalink / raw)
  To: Jan Beulich
  Cc: Keir Fraser, Stefano Stabellini, Tim Deegan, Ian Jackson,
	Andrew Cooper, xen-devel

On Thu, 2015-01-22 at 13:26 +0000, Jan Beulich wrote:
> 1: make fls() and ffs() consistent across architectures
> 2: arm64: fix fls()
> 
> ARM maintainers: If you so desire, I'm fine with patch 2 being replaced
> by anything you like better (e.g. as Ian C mentioned, compiler builtins);
> just don't rely on me doing so.

I'll do this as an incremental change once this is in, rather than
blocking you.

> Signed-off-by: Jan Beulich <jbeulich@suse.com>

Both:

Acked-by: Ian Campbell <ian.campbell@citrix.com>


* Re: [PATCH 1/2] make fls() and ffs() consistent across architectures
  2015-01-22 13:38 ` [PATCH 1/2] make fls() and ffs() consistent across architectures Jan Beulich
@ 2015-01-23 14:38   ` Andrew Cooper
  2015-01-23 14:53     ` Jan Beulich
  0 siblings, 1 reply; 9+ messages in thread
From: Andrew Cooper @ 2015-01-23 14:38 UTC (permalink / raw)
  To: Jan Beulich, xen-devel
  Cc: Ian Campbell, Keir Fraser, Tim Deegan, Ian Jackson, Stefano Stabellini

On 22/01/15 13:38, Jan Beulich wrote:
> Their parameter types differed between ARM and x86.
>
> Along with generalizing the functions, this fixes
> - x86's non-long functions having long parameter types
> - ARM's ffs() using a long intermediate variable
> - generic_fls64() being broken when the upper half of the input is
>   non-zero
> - common (and in one case also ARM) code using fls() when flsl() was
>   meant
>
> Also drop ARM's constant_fls() in favor of the identical generic_fls().
>
> Signed-off-by: Jan Beulich <jbeulich@suse.com>

The refactoring looks fine, so Reviewed-by: Andrew Cooper
<andrew.cooper3@citrix.com>

As for the x86 asm itself, what about
http://lkml.kernel.org/r/5058741E020000780009C014@nat28.tlf.novell.com ?
It would appear that the same optimisation is relevant for Xen's
__ffs().

~Andrew


* Re: [PATCH 1/2] make fls() and ffs() consistent across architectures
  2015-01-23 14:38   ` Andrew Cooper
@ 2015-01-23 14:53     ` Jan Beulich
  0 siblings, 0 replies; 9+ messages in thread
From: Jan Beulich @ 2015-01-23 14:53 UTC (permalink / raw)
  To: Andrew Cooper
  Cc: Keir Fraser, Stefano Stabellini, Ian Jackson, Tim Deegan,
	Ian Campbell, xen-devel

>>> On 23.01.15 at 15:38, <andrew.cooper3@citrix.com> wrote:
> As for the x86 asm itself, what about
> http://lkml.kernel.org/r/5058741E020000780009C014@nat28.tlf.novell.com ?
> It would appear that the same optimisation is relevant for Xen's
> __ffs().

For find_first_set_bit() (and perhaps also __scanbit()) you mean?
Indeed, and I'm sure I meant to pull it over - and then forgot after
it got accepted. Will do that now.

Jan

