arm: Use builtins for ffs/fls

Since ARMv5, the clz instruction allows ffs/fls to be implemented
efficiently with compiler builtins. Until ARMv7 (with Thumb-2), clz is
only available in ARM mode. LTO makes it difficult to force specific
functions to be compiled in ARM mode, as it is effectively a form of
very aggressive inlining. To work around this, ffs/fls are implemented
in assembly for ARMv5 and ARMv6 when compiling U-Boot in Thumb mode.
Overall, this change saves around 75 bytes per call.

This code is synced with v5.15 of the Linux kernel.

Signed-off-by: Sean Anderson <sean.anderson@seco.com>
Reviewed-by: Tom Rini <trini@konsulko.com>
diff --git a/arch/arm/include/asm/bitops.h b/arch/arm/include/asm/bitops.h
index fa85486..8e89783 100644
--- a/arch/arm/include/asm/bitops.h
+++ b/arch/arm/include/asm/bitops.h
@@ -15,9 +15,34 @@
 #ifndef __ASM_ARM_BITOPS_H
 #define __ASM_ARM_BITOPS_H
 
+#if __LINUX_ARM_ARCH__ < 5
+
 #include <asm-generic/bitops/__ffs.h>
 #include <asm-generic/bitops/__fls.h>
 #include <asm-generic/bitops/fls.h>
+
+#else
+
+#define PLATFORM_FFS
+#define PLATFORM_FLS
+
+#if !IS_ENABLED(CONFIG_HAS_THUMB2) && CONFIG_IS_ENABLED(SYS_THUMB_BUILD)
+
+unsigned long __fls(unsigned long word);
+unsigned long __ffs(unsigned long word);
+int fls(unsigned int x);
+int ffs(int x);
+
+#else
+
+#include <asm-generic/bitops/builtin-__fls.h>
+#include <asm-generic/bitops/builtin-__ffs.h>
+#include <asm-generic/bitops/builtin-fls.h>
+#include <asm-generic/bitops/builtin-ffs.h>
+
+#endif
+#endif
+
 #include <asm-generic/bitops/fls64.h>
 
 #ifdef __KERNEL__
@@ -113,7 +138,7 @@
 
 static inline int __ilog2(unsigned int x)
 {
-	return generic_fls(x) - 1;
+	return fls(x) - 1;
 }
 
 #define ffz(x)  __ffs(~(x))