arm: build arch memset/memcpy in Thumb2 mode

Resynchronize memcpy/memset with kernel 3.17 and build them in
Thumb2 mode (unified syntax). Those assembler files can be built
and linked in ARM mode too, however when calling them from Thumb2
built code, the stack got corrupted and the copy did not succeed
(the exact details have not been traced back). However, the Linux
kernel builds those files in Thumb2 mode. Hence U-Boot should
build them in Thumb2 mode too when CONFIG_SYS_THUMB_BUILD is set.

To build the files without warning, some assembler instructions
had to be replaced with their UAL compliant variant (thanks
Jeroen for this input).

To build the file in Thumb2 mode the implicit-it=always option need
to be set to generate Thumb2 compliant IT instructions where needed.
We add this option to the general AFLAGS when building for Thumb2.

Reviewed-by: Simon Glass <sjg@chromium.org>
Tested-by: Simon Glass <sjg@chromium.org>
Signed-off-by: Stefan Agner <stefan@agner.ch>
diff --git a/arch/arm/include/asm/assembler.h b/arch/arm/include/asm/assembler.h
index 5e4789b..11b80fb 100644
--- a/arch/arm/include/asm/assembler.h
+++ b/arch/arm/include/asm/assembler.h
@@ -14,12 +14,14 @@
  *  assembler source.
  */
 
+#include <config.h>
+
 /*
  * Endian independent macros for shifting bytes within registers.
  */
 #ifndef __ARMEB__
-#define pull		lsr
-#define push		lsl
+#define lspull		lsr
+#define lspush		lsl
 #define get_byte_0	lsl #0
 #define get_byte_1	lsr #8
 #define get_byte_2	lsr #16
@@ -29,8 +31,8 @@
 #define put_byte_2	lsl #16
 #define put_byte_3	lsl #24
 #else
-#define pull		lsl
-#define push		lsr
+#define lspull		lsl
+#define lspush		lsr
 #define get_byte_0	lsr #24
 #define get_byte_1	lsr #16
 #define get_byte_2	lsr #8
@@ -54,7 +56,28 @@
 #define PLD(code...)
 #endif
 
+	.irp	c,,eq,ne,cs,cc,mi,pl,vs,vc,hi,ls,ge,lt,gt,le,hs,lo
+	.macro	ret\c, reg
+#if defined(__ARM_ARCH_5E__) || defined(__ARM_ARCH_5TE__)
+	mov\c	pc, \reg
+#else
+	.ifeqs	"\reg", "lr"
+	bx\c	\reg
+	.else
+	mov\c	pc, \reg
+	.endif
+#endif
+	.endm
+	.endr
+
 /*
- * Cache alligned
+ * Cache aligned, used for optimized memcpy/memset
+ * In the kernel this is only enabled for Feroceon CPU's...
+ * We disable it especially for Thumb builds since those instructions
+ * are not made in a Thumb ready way...
  */
+#ifdef CONFIG_SYS_THUMB_BUILD
+#define CALGN(code...)
+#else
 #define CALGN(code...) code
+#endif