AVR32: Relocate u-boot to SDRAM

Relocate the u-boot image into SDRAM like everyone else does. This
means that we can handle much larger .data and .bss than we used to.

Signed-off-by: Haavard Skinnemoen <hskinnemoen@atmel.com>
diff --git a/board/atmel/atstk1000/flash.c b/board/atmel/atstk1000/flash.c
index 3aebf66..958f4dc 100644
--- a/board/atmel/atstk1000/flash.c
+++ b/board/atmel/atstk1000/flash.c
@@ -57,7 +57,7 @@
 
 	gd->bd->bi_flashstart = CFG_FLASH_BASE;
 	gd->bd->bi_flashsize = CFG_FLASH_SIZE;
-	gd->bd->bi_flashoffset = __edata_lma - _text;
+	gd->bd->bi_flashoffset = _edata - _text;
 
 	flash_info[0].size = CFG_FLASH_SIZE;
 	flash_info[0].sector_count = 135;
diff --git a/board/atmel/atstk1000/u-boot.lds b/board/atmel/atstk1000/u-boot.lds
index ef89ea4..34e347a 100644
--- a/board/atmel/atstk1000/u-boot.lds
+++ b/board/atmel/atstk1000/u-boot.lds
@@ -40,35 +40,38 @@
 	}
 	. = ALIGN(32);
 	__flashprog_end = .;
+	_etext = .;
 
-	. = ALIGN(8);
 	.rodata : {
 		*(.rodata)
 		*(.rodata.*)
 	}
-	_etext = .;
 
-	__data_lma = ALIGN(8);
-	. = 0x24000000;
+	. = ALIGN(8);
 	_data = .;
-	.data : AT(__data_lma) {
+	.data : {
 		*(.data)
 		*(.data.*)
 	}
 
 	. = ALIGN(4);
 	__u_boot_cmd_start = .;
-	__u_boot_cmd_lma = __data_lma + (__u_boot_cmd_start - _data);
-	.u_boot_cmd : AT(__u_boot_cmd_lma) {
+	.u_boot_cmd : {
 		KEEP(*(.u_boot_cmd))
 	}
 	__u_boot_cmd_end = .;
 
+	. = ALIGN(4);
+	_got = .;
+	.got : {
+		*(.got)
+	}
+	_egot = .;
+
 	. = ALIGN(8);
 	_edata = .;
-	__edata_lma = __u_boot_cmd_lma + (_edata - __u_boot_cmd_start);
 
-	.bss : AT(__edata_lma) {
+	.bss : {
 		*(.bss)
 		*(.bss.*)
 	}
diff --git a/cpu/at32ap/entry.S b/cpu/at32ap/entry.S
index b52d798..a6fc688 100644
--- a/cpu/at32ap/entry.S
+++ b/cpu/at32ap/entry.S
@@ -42,8 +42,7 @@
 	 * We're running at interrupt level 3, so we don't need to save
 	 * r8-r12 or lr to the stack.
 	 */
-	mov	r8, lo(timer_overflow)
-	orh	r8, hi(timer_overflow)
+	lda.w	r8, timer_overflow
 	ld.w	r9, r8[0]
 	mov	r10, -1
 	mtsr	SYSREG_COMPARE, r10
diff --git a/cpu/at32ap/exception.c b/cpu/at32ap/exception.c
index 4123c44..0672685 100644
--- a/cpu/at32ap/exception.c
+++ b/cpu/at32ap/exception.c
@@ -24,6 +24,8 @@
 #include <asm/sysreg.h>
 #include <asm/ptrace.h>
 
+DECLARE_GLOBAL_DATA_PTR;
+
 static const char * const cpu_modes[8] = {
 	"Application", "Supervisor", "Interrupt level 0", "Interrupt level 1",
 	"Interrupt level 2", "Interrupt level 3", "Exception", "NMI"
@@ -109,11 +111,10 @@
 	printf("CPU Mode: %s\n", cpu_modes[mode]);
 
 	/* Avoid exception loops */
-	if (regs->sp >= CFG_INIT_SP_ADDR
-	    || regs->sp < (CFG_INIT_SP_ADDR - CONFIG_STACKSIZE))
+	if (regs->sp < CFG_SDRAM_BASE || regs->sp >= gd->stack_end)
 		printf("\nStack pointer seems bogus, won't do stack dump\n");
 	else
-		dump_mem("\nStack: ", regs->sp, CFG_INIT_SP_ADDR);
+		dump_mem("\nStack: ", regs->sp, gd->stack_end);
 
 	panic("Unhandled exception\n");
 }
diff --git a/cpu/at32ap/interrupts.c b/cpu/at32ap/interrupts.c
index 85420a4..c9e0499 100644
--- a/cpu/at32ap/interrupts.c
+++ b/cpu/at32ap/interrupts.c
@@ -115,9 +115,12 @@
 static int set_interrupt_handler(unsigned int nr, void (*handler)(void),
 				 unsigned int priority)
 {
+	extern void _evba(void);
 	unsigned long intpr;
 	unsigned long handler_addr = (unsigned long)handler;
 
+	handler_addr -= (unsigned long)&_evba;
+
 	if ((handler_addr & HANDLER_MASK) != handler_addr
 	    || (priority & INTLEV_MASK) != priority)
 		return -EINVAL;
diff --git a/cpu/at32ap/start.S b/cpu/at32ap/start.S
index 4ae0b54..ab8c2b7 100644
--- a/cpu/at32ap/start.S
+++ b/cpu/at32ap/start.S
@@ -70,32 +70,12 @@
 
 2:	lddpc	sp, sp_init
 
-	/*
-	 * Relocate the data section and initialize .bss.  Everything
-	 * is guaranteed to be at least doubleword aligned by the
-	 * linker script.
-	 */
-	lddpc	r12, .Ldata_vma
-	lddpc	r11, .Ldata_lma
-	lddpc	r10, .Ldata_end
-	sub	r10, r12
-4:	ld.d	r8, r11++
-	sub	r10, 8
-	st.d	r12++, r8
-	brne	4b
-
-	mov	r8, 0
-	mov	r9, 0
-	lddpc	r10, .Lbss_end
-	sub	r10, r12
-4:	sub	r10, 8
-	st.d	r12++, r8
-	brne	4b
-
 	/* Initialize the GOT pointer */
 	lddpc	r6, got_init
 3:	rsub	r6, pc
-	ld.w	pc, r6[board_init_f@got]
+
+	/* Let's go */
+	rjmp	board_init_f
 
 	.align	2
 	.type	sp_init,@object
@@ -103,11 +83,82 @@
 	.long	CFG_INIT_SP_ADDR
 got_init:
 	.long	3b - _GLOBAL_OFFSET_TABLE_
-.Ldata_lma:
-	.long	__data_lma
-.Ldata_vma:
-	.long	_data
-.Ldata_end:
-	.long	_edata
-.Lbss_end:
-	.long	_end
+
+	/*
+	 * void	relocate_code(new_sp, new_gd, monitor_addr)
+	 * In: r12 = new_sp, r11 = new_gd, r10 = monitor_addr (copy destination).
+	 * Relocate the u-boot image into RAM, fix up the GOT and EVBA, and
+	 * continue in board_init_r from the new copy. Does not return.
+	 */
+	.global	relocate_code
+	.type	relocate_code,@function
+relocate_code:
+	mov	sp, r12		/* use new stack */
+	mov	r12, r11	/* new_gd: stays in r12 for board_init_r */
+	mov	r11, r10	/* destination address: stays in r11 */
+
+	/* copy .text section and flush the cache along the way */
+	lda.w	r8, _text	/* r8 = source cursor */
+	lda.w	r9, _etext	/* r9 = end of the .text source */
+	sub	lr, r10, r8	/* lr = relocation offset (dest - _text) */
+
+1:	ldm	r8++, r0-r3	/* each pass moves 2 x 16 bytes */
+	stm	r10, r0-r3
+	sub	r10, -16	/* r10 += 16 */
+	ldm	r8++, r0-r3
+	stm	r10, r0-r3
+	sub	r10, -16
+	cp.w	r8, r9		/* brlt below depends on these flags */
+	cache	r10[-4], 0x0d	/* dcache clean/invalidate */
+	cache	r10[-4], 0x01	/* icache invalidate */
+	brlt	1b		/* NOTE(review): signed compare of addresses */
+
+	/* flush write buffer */
+	sync	0
+
+	/* copy the rest of the image up to _edata, 8 bytes per pass */
+	lda.w	r9, _edata
+1:	ld.d	r0, r8++
+	st.d	r10++, r0
+	cp.w	r8, r9
+	brlt	1b
+
+	/* zero out .bss */
+	mov	r0, 0
+	mov	r1, 0
+	lda.w	r9, _end
+	sub	r9, r8		/* r9 = bytes still to clear */
+1:	st.d	r10++, r0
+	sub	r9, 8
+	brgt	1b
+
+	/* jump to RAM */
+	sub	r0, pc, . - in_ram	/* r0 = in_ram in the copy we are running */
+	add	pc, r0, lr	/* same label in the relocated copy */
+
+	.align	2
+in_ram:
+	/* find the new GOT and relocate it */
+	lddpc	r6, got_init_reloc
+3:	rsub	r6, pc
+	mov	r8, r6		/* r8 = cursor over GOT entries */
+	lda.w	r9, _egot
+	lda.w	r10, _got
+	sub	r9, r10		/* r9 = _egot - _got (byte count) */
+1:	ld.w	r0, r8[0]
+	add	r0, lr		/* add the relocation offset to the entry */
+	st.w	r8++, r0
+	sub	r9, 4
+	brgt	1b
+
+	/* Move the exception handlers: shift EVBA by the relocation offset */
+	mfsr	r2, SYSREG_EVBA
+	add	r2, lr
+	mtsr	SYSREG_EVBA, r2
+
+	/* Do the rest of the initialization sequence */
+	call	board_init_r
+
+	.align	2
+got_init_reloc:
+	.long	3b - _GLOBAL_OFFSET_TABLE_
diff --git a/include/asm-avr32/global_data.h b/include/asm-avr32/global_data.h
index 3b6769b..7c45b36 100644
--- a/include/asm-avr32/global_data.h
+++ b/include/asm-avr32/global_data.h
@@ -37,6 +37,7 @@
 	unsigned long	flags;
 	unsigned long	baudrate;
 	unsigned long	sdram_size;
+	unsigned long	stack_end;	/* highest stack address */
 	unsigned long	have_console;	/* serial_init() was called */
 	unsigned long	reloc_off;	/* Relocation Offset */
 	unsigned long	env_addr;	/* Address of env struct */
diff --git a/include/configs/atstk1002.h b/include/configs/atstk1002.h
index 717f540..7463633 100644
--- a/include/configs/atstk1002.h
+++ b/include/configs/atstk1002.h
@@ -151,16 +151,8 @@
 #define CFG_INIT_SP_ADDR		(CFG_INTRAM_BASE + CFG_INTRAM_SIZE)
 
 #define CFG_MALLOC_LEN			(256*1024)
-#define CFG_MALLOC_END							\
-	({								\
-		DECLARE_GLOBAL_DATA_PTR;				\
-		CFG_SDRAM_BASE + gd->sdram_size;			\
-	})
-#define CFG_MALLOC_START		(CFG_MALLOC_END - CFG_MALLOC_LEN)
-
 #define CFG_DMA_ALLOC_LEN		(16384)
-#define CFG_DMA_ALLOC_END		(CFG_MALLOC_START)
-#define CFG_DMA_ALLOC_START		(CFG_DMA_ALLOC_END - CFG_DMA_ALLOC_LEN)
+
 /* Allow 2MB for the kernel run-time image */
 #define CFG_LOAD_ADDR			(CFG_SDRAM_BASE + 0x00200000)
 #define CFG_BOOTPARAMS_LEN		(16 * 1024)
diff --git a/lib_avr32/board.c b/lib_avr32/board.c
index 12d0b97..c55ebd5 100644
--- a/lib_avr32/board.c
+++ b/lib_avr32/board.c
@@ -47,11 +47,14 @@
 static unsigned long mem_malloc_end = 0;
 static unsigned long mem_malloc_brk = 0;
 
-/* The malloc area is wherever the board wants it to be */
+/* The malloc area is right below the monitor image in RAM */
 static void mem_malloc_init(void)
 {
-	mem_malloc_start = CFG_MALLOC_START;
-	mem_malloc_end = CFG_MALLOC_END;
+	unsigned long monitor_addr;
+
+	monitor_addr = CFG_MONITOR_BASE + gd->reloc_off;
+	mem_malloc_end = monitor_addr;
+	mem_malloc_start = mem_malloc_end - CFG_MALLOC_LEN;
 	mem_malloc_brk = mem_malloc_start;
 
 	printf("malloc: Using memory from 0x%08lx to 0x%08lx\n",
@@ -125,6 +128,12 @@
 void board_init_f(ulong unused)
 {
 	gd_t gd_data;
+	gd_t *new_gd;
+	bd_t *bd;
+	unsigned long *new_sp;
+	unsigned long monitor_len;
+	unsigned long monitor_addr;
+	unsigned long addr;
 
 	/* Initialize the global data pointer */
 	memset(&gd_data, 0, sizeof(gd_data));
@@ -133,7 +142,6 @@
 	/* Perform initialization sequence */
 	board_early_init_f();
 	cpu_init();
-	timer_init();
 	env_init();
 	init_baudrate();
 	serial_init();
@@ -141,28 +149,120 @@
 	display_banner();
 	board_init_memories();
 
-	board_init_r(gd, CFG_MONITOR_BASE);
+	/* If we have no SDRAM, we can't go on */
+	if (!gd->sdram_size)
+		panic("No working SDRAM available\n");
+
+	/*
+	 * Now that we have DRAM mapped and working, we can
+	 * relocate the code and continue running from DRAM.
+	 *
+	 * Reserve memory at end of RAM for (top down in that order):
+	 *  - u-boot image
+	 *  - heap for malloc()
+	 *  - board info struct
+	 *  - global data struct
+	 *  - stack
+	 */
+	addr = CFG_SDRAM_BASE + gd->sdram_size;
+	monitor_len = _end - _text;
+
+	/*
+	 * Reserve memory for u-boot code, data and bss.
+	 * Round down to next 4 kB limit.
+	 */
+	addr -= monitor_len;
+	addr &= ~(4096UL - 1);
+	monitor_addr = addr;
+
+	/* Reserve memory for malloc() */
+	addr -= CFG_MALLOC_LEN;
+
+	/* Allocate a Board Info struct on a word boundary */
+	addr -= sizeof(bd_t);
+	addr &= ~3UL;
+	gd->bd = bd = (bd_t *)addr;
+
+	/* Allocate a new global data copy on a 8-byte boundary. */
+	addr -= sizeof(gd_t);
+	addr &= ~7UL;
+	new_gd = (gd_t *)addr;
+
+	/* And finally, a new, bigger stack. */
+	new_sp = (unsigned long *)addr;
+	gd->stack_end = addr;
+	*(--new_sp) = 0;
+	*(--new_sp) = 0;
+
+	/*
+	 * Clear the struct (this SDRAM is uninitialized), then fill in what we know.
+	 */
+	memset(bd, 0, sizeof(bd_t));
+	bd->bi_dram[0].start = CFG_SDRAM_BASE;
+	bd->bi_dram[0].size = gd->sdram_size;
+	bd->bi_baudrate = gd->baudrate;
+
+	memcpy(new_gd, gd, sizeof(gd_t));
+
+	relocate_code((unsigned long)new_sp, new_gd, monitor_addr);
 }
 
 void board_init_r(gd_t *new_gd, ulong dest_addr)
 {
+	extern void malloc_bin_reloc (void);
+#ifndef CFG_ENV_IS_NOWHERE
+	extern char * env_name_spec;
+#endif
+	cmd_tbl_t *cmdtp;
+	bd_t *bd;
+
 	gd = new_gd;
+	bd = gd->bd;
+
+	gd->flags |= GD_FLG_RELOC;
+	gd->reloc_off = dest_addr - CFG_MONITOR_BASE;
 
 	monitor_flash_len = _edata - _text;
 
-	mem_malloc_init();
-	gd->bd = malloc(sizeof(bd_t));
-	memset(gd->bd, 0, sizeof(bd_t));
-	gd->bd->bi_baudrate = gd->baudrate;
-	gd->bd->bi_dram[0].start = CFG_SDRAM_BASE;
-	gd->bd->bi_dram[0].size = gd->sdram_size;
+	/*
+	 * We have to relocate the command table manually
+	 */
+	for (cmdtp = &__u_boot_cmd_start;
+	     cmdtp !=  &__u_boot_cmd_end; cmdtp++) {
+		unsigned long addr;
 
+		addr = (unsigned long)cmdtp->cmd + gd->reloc_off;
+		cmdtp->cmd = (typeof(cmdtp->cmd))addr;
+
+		addr = (unsigned long)cmdtp->name + gd->reloc_off;
+		cmdtp->name = (typeof(cmdtp->name))addr;
+
+		if (cmdtp->usage) {
+			addr = (unsigned long)cmdtp->usage + gd->reloc_off;
+			cmdtp->usage = (typeof(cmdtp->usage))addr;
+		}
+#ifdef CFG_LONGHELP
+		if (cmdtp->help) {
+			addr = (unsigned long)cmdtp->help + gd->reloc_off;
+			cmdtp->help = (typeof(cmdtp->help))addr;
+		}
+#endif
+	}
+
+	/* there are some other pointer constants we must deal with */
+#ifndef CFG_ENV_IS_NOWHERE
+	env_name_spec += gd->reloc_off;
+#endif
+
+	timer_init();
+	mem_malloc_init();
+	malloc_bin_reloc();
 	board_init_info();
 	flash_init();
 
-	if (gd->bd->bi_flashsize)
+	if (bd->bi_flashsize)
 		display_flash_config();
-	if (gd->bd->bi_dram[0].size)
+	if (bd->bi_dram[0].size)
 		display_dram_config();
 
 	gd->bd->bi_boot_params = malloc(CFG_BOOTPARAMS_LEN);