arm: move gd handling outside of C code

As of gcc 5.2.1 for Thumb-1, it is no longer possible
to assign gd from C code, as gd is mapped to r9, and
r9 may now be saved in the prologue sequence, and
restored in the epilogue sequence, of any C function.

Therefore arch_setup_gd(), which is supposed to set
r9, may actually have no effect, causing U-Boot to
use a bad address to access GD.

Fix this by never calling arch_setup_gd() for ARM,
and instead setting r9 in arch/arm/lib/crt0.S, to
the value returned by board_init_f_alloc_reserve()
and, when the SPL stack is relocated, to the non-zero
value returned by spl_relocate_stack_gd().

Signed-off-by: Albert ARIBAUD <albert.u.boot@aribaud.net>
Reviewed-by: Simon Glass <sjg@chromium.org>
diff --git a/arch/arm/lib/crt0.S b/arch/arm/lib/crt0.S
index 4f2a712..2f4c14e 100644
--- a/arch/arm/lib/crt0.S
+++ b/arch/arm/lib/crt0.S
@@ -85,6 +85,8 @@
 	mov	r0, sp
 	bl	board_init_f_alloc_reserve
 	mov	sp, r0
+	/* set up gd here, outside any C code */
+	mov	r9, r0
 	bl	board_init_f_init_reserve
 
 	mov	r0, #0
@@ -134,6 +136,7 @@
 	bl	spl_relocate_stack_gd
 	cmp	r0, #0
 	movne	sp, r0
+	movne	r9, r0
 # endif
 	ldr	r0, =__bss_start	/* this is auto-relocated! */
 
diff --git a/common/init/board_init.c b/common/init/board_init.c
index e649e07..d98648e 100644
--- a/common/init/board_init.c
+++ b/common/init/board_init.c
@@ -21,13 +21,13 @@
 #define _USE_MEMCPY
 #endif
 
-/* Unfortunately x86 can't compile this code as gd cannot be assigned */
-#ifndef CONFIG_X86
+/* x86 cannot assign gd in C; on ARM the assignment may have no effect */
+#if !defined(CONFIG_X86) && !defined(CONFIG_ARM)
 __weak void arch_setup_gd(struct global_data *gd_ptr)
 {
 	gd = gd_ptr;
 }
-#endif /* !CONFIG_X86 */
+#endif /* !CONFIG_X86 && !CONFIG_ARM */
 
 /*
  * Allocate reserved space for use as 'globals' from 'top' address and
@@ -128,7 +128,7 @@
 		*ptr++ = 0;
 #endif
 	/* set GD unless architecture did it already */
-#ifndef CONFIG_X86
+#if !defined(CONFIG_X86) && !defined(CONFIG_ARM)
 	arch_setup_gd(gd_ptr);
 #endif
 	/* next alloc will be higher by one GD plus 16-byte alignment */
diff --git a/common/spl/spl.c b/common/spl/spl.c
index 6e6dee7..e5167bf 100644
--- a/common/spl/spl.c
+++ b/common/spl/spl.c
@@ -431,8 +431,13 @@
  * more stack space for things like the MMC sub-system.
  *
  * This function calculates the stack position, copies the global_data into
- * place and returns the new stack position. The caller is responsible for
- * setting up the sp register.
+ * place, sets the new gd (except for ARM, for which setting GD within a C
+ * function may not always work) and returns the new stack position. The
+ * caller is responsible for setting up the sp register and, in the case
+ * of ARM, setting up gd.
+ *
+ * All of this is done using the same layout and alignments as done in
+ * board_init_f_init_reserve() / board_init_f_alloc_reserve().
  *
  * @return new stack location, or 0 to use the same stack
  */
@@ -440,14 +445,7 @@
 {
 #ifdef CONFIG_SPL_STACK_R
 	gd_t *new_gd;
-	ulong ptr;
-
-	/* Get stack position: use 8-byte alignment for ABI compliance */
-	ptr = CONFIG_SPL_STACK_R_ADDR - sizeof(gd_t);
-	ptr &= ~7;
-	new_gd = (gd_t *)ptr;
-	memcpy(new_gd, (void *)gd, sizeof(gd_t));
-	gd = new_gd;
+	ulong ptr = CONFIG_SPL_STACK_R_ADDR;
 
 #ifdef CONFIG_SPL_SYS_MALLOC_SIMPLE
 	if (CONFIG_SPL_STACK_R_MALLOC_SIMPLE_LEN) {
@@ -460,7 +458,13 @@
 		gd->malloc_ptr = 0;
 	}
 #endif
-
+	/* Get stack position: reserve sizeof(gd_t) rounded up to 16 bytes */
+	ptr = CONFIG_SPL_STACK_R_ADDR - roundup(sizeof(gd_t),16);
+	new_gd = (gd_t *)ptr;
+	memcpy(new_gd, (void *)gd, sizeof(gd_t));
+#if !defined(CONFIG_ARM)
+	gd = new_gd;
+#endif
 	return ptr;
 #else
 	return 0;