ppc4xx: Add NAND booting support for AMCC Bamboo (440EP) eval board

This patch adds NAND booting support for the AMCC Bamboo eval board.
Since the NAND-SPL boot image is limited to 4kbytes, this version
only supports the onboard 64MBytes of DDR. The DIMM modules can't be
supported, since the setup code for I2C DIMM autodetection and
configuration is too big for this NAND bootloader.

Signed-off-by: Stefan Roese <sr@denx.de>
diff --git a/cpu/ppc4xx/start.S b/cpu/ppc4xx/start.S
index 3b1586c..fe14ecd 100644
--- a/cpu/ppc4xx/start.S
+++ b/cpu/ppc4xx/start.S
@@ -110,6 +110,13 @@
 # endif
 #endif /* CFG_INIT_DCACHE_CS */
 
+#define function_prolog(func_name)      .text; \
+					.align 2; \
+					.globl func_name; \
+					func_name:
+#define function_epilog(func_name)      .type func_name,@function; \
+					.size func_name,.-func_name
+
 /* We don't want the  MMU yet.
 */
 #undef	MSR_KERNEL
@@ -388,8 +395,9 @@
 2:
 
 #if defined(CONFIG_NAND_SPL)
+#if defined(CONFIG_440EPX) || defined(CONFIG_440GRX)
 	/*
-	 * Enable internal SRAM
+	 * Enable internal SRAM (only on 440EPx/GRx, 440EP/GR have no OCM)
 	 */
 	lis	r2,0x7fff
 	ori	r2,r2,0xffff
@@ -399,6 +407,45 @@
 	mfdcr	r1,isram0_pmeg
 	and	r1,r1,r2		/* Disable pwr mgmt */
 	mtdcr	isram0_pmeg,r1
+#endif
+#if defined(CONFIG_440EP)
+	/*
+	 * On 440EP with no internal SRAM, we setup SDRAM very early
+	 * and copy the NAND_SPL to SDRAM and jump to it
+	 */
+	/* Clear Dcache to use as RAM */
+	addis	r3,r0,CFG_INIT_RAM_ADDR@h
+	ori	r3,r3,CFG_INIT_RAM_ADDR@l
+	addis	r4,r0,CFG_INIT_RAM_END@h
+	ori	r4,r4,CFG_INIT_RAM_END@l
+	rlwinm. r5,r4,0,27,31
+	rlwinm	r5,r4,27,5,31
+	beq	..d_ran3
+	addi	r5,r5,0x0001
+..d_ran3:
+	mtctr	r5
+..d_ag3:
+	dcbz	r0,r3
+	addi	r3,r3,32
+	bdnz	..d_ag3
+	/*----------------------------------------------------------------*/
+	/* Setup the stack in internal SRAM */
+	/*----------------------------------------------------------------*/
+	lis	r1,CFG_INIT_RAM_ADDR@h
+	ori	r1,r1,CFG_INIT_SP_OFFSET@l
+	li	r0,0
+	stwu	r0,-4(r1)
+	stwu	r0,-4(r1)		/* Terminate call chain */
+
+	stwu	r1,-8(r1)		/* Save back chain and move SP */
+	lis	r0,RESET_VECTOR@h	/* Address of reset vector */
+	ori	r0,r0, RESET_VECTOR@l
+	stwu	r1,-8(r1)		/* Save back chain and move SP */
+	stw	r0,+12(r1)		/* Save return addr (underflow vect) */
+	sync
+	bl	early_sdram_init
+	sync
+#endif /* CONFIG_440EP */
 
 	/*
 	 * Copy SPL from cache into internal SRAM
@@ -429,7 +476,7 @@
 start_ram:
 	sync
 	isync
-#endif
+#endif /* CONFIG_NAND_SPL */
 
 	bl	3f
 	b	_start
@@ -1137,7 +1184,6 @@
 	lwz	r1,GPR1(r1)
 	SYNC
 	rfci
-#endif /* CONFIG_NAND_SPL */
 
 /* Cache functions.
 */
@@ -1255,24 +1301,6 @@
 	blr
 
 /*------------------------------------------------------------------------------- */
-/* Function:	 in8 */
-/* Description:	 Input 8 bits */
-/*------------------------------------------------------------------------------- */
-	.globl	in8
-in8:
-	lbz	r3,0x0000(r3)
-	blr
-
-/*------------------------------------------------------------------------------- */
-/* Function:	 out8 */
-/* Description:	 Output 8 bits */
-/*------------------------------------------------------------------------------- */
-	.globl	out8
-out8:
-	stb	r4,0x0000(r3)
-	blr
-
-/*------------------------------------------------------------------------------- */
 /* Function:	 out16 */
 /* Description:	 Output 16 bits */
 /*------------------------------------------------------------------------------- */
@@ -1291,15 +1319,6 @@
 	blr
 
 /*------------------------------------------------------------------------------- */
-/* Function:	 out32 */
-/* Description:	 Output 32 bits */
-/*------------------------------------------------------------------------------- */
-	.globl	out32
-out32:
-	stw	r4,0x0000(r3)
-	blr
-
-/*------------------------------------------------------------------------------- */
 /* Function:	 out32r */
 /* Description:	 Byte reverse and output 32 bits */
 /*------------------------------------------------------------------------------- */
@@ -1327,15 +1346,6 @@
 	blr
 
 /*------------------------------------------------------------------------------- */
-/* Function:	 in32 */
-/* Description:	 Input 32 bits */
-/*------------------------------------------------------------------------------- */
-	.globl	in32
-in32:
-	lwz	3,0x0000(3)
-	blr
-
-/*------------------------------------------------------------------------------- */
 /* Function:	 in32r */
 /* Description:	 Input 32 bits and byte reverse */
 /*------------------------------------------------------------------------------- */
@@ -1377,9 +1387,6 @@
 	sync
 	blr
 
-/*------------------------------------------------------------------------------*/
-
-#ifndef CONFIG_NAND_SPL
 /*
  * void relocate_code (addr_sp, gd, addr_moni)
  *
@@ -1644,8 +1651,88 @@
 	stw	r0, 4(r7)
 
 	blr
+
+#if defined(CONFIG_440)
+/*----------------------------------------------------------------------------+
+| dcbz_area.
++----------------------------------------------------------------------------*/
+	function_prolog(dcbz_area)
+	rlwinm. r5,r4,0,27,31
+	rlwinm  r5,r4,27,5,31
+	beq     ..d_ra2
+	addi    r5,r5,0x0001
+..d_ra2:mtctr   r5
+..d_ag2:dcbz    r0,r3
+	addi    r3,r3,32
+	bdnz    ..d_ag2
+	sync
+	blr
+	function_epilog(dcbz_area)
+
+/*----------------------------------------------------------------------------+
+| dflush.  Assume 32K at vector address is cachable.
++----------------------------------------------------------------------------*/
+	function_prolog(dflush)
+	mfmsr   r9
+	rlwinm  r8,r9,0,15,13
+	rlwinm  r8,r8,0,17,15
+	mtmsr   r8
+	addi    r3,r0,0x0000
+	mtspr   dvlim,r3
+	mfspr   r3,ivpr
+	addi    r4,r0,1024
+	mtctr   r4
+..dflush_loop:
+	lwz     r6,0x0(r3)
+	addi    r3,r3,32
+	bdnz    ..dflush_loop
+	addi    r3,r3,-32
+	mtctr   r4
+..ag:   dcbf    r0,r3
+	addi    r3,r3,-32
+	bdnz    ..ag
+	sync
+	mtmsr   r9
+	blr
+	function_epilog(dflush)
+#endif /* CONFIG_440 */
 #endif /* CONFIG_NAND_SPL */
 
+/*------------------------------------------------------------------------------- */
+/* Function:	 in8 */
+/* Description:	 Input 8 bits */
+/*------------------------------------------------------------------------------- */
+	.globl	in8
+in8:
+	lbz	r3,0x0000(r3)
+	blr
+
+/*------------------------------------------------------------------------------- */
+/* Function:	 out8 */
+/* Description:	 Output 8 bits */
+/*------------------------------------------------------------------------------- */
+	.globl	out8
+out8:
+	stb	r4,0x0000(r3)
+	blr
+
+/*------------------------------------------------------------------------------- */
+/* Function:	 out32 */
+/* Description:	 Output 32 bits */
+/*------------------------------------------------------------------------------- */
+	.globl	out32
+out32:
+	stw	r4,0x0000(r3)
+	blr
+
+/*------------------------------------------------------------------------------- */
+/* Function:	 in32 */
+/* Description:	 Input 32 bits */
+/*------------------------------------------------------------------------------- */
+	.globl	in32
+in32:
+	lwz	3,0x0000(3)
+	blr
 
 /**************************************************************************/
 /* PPC405EP specific stuff						  */
@@ -1892,13 +1979,6 @@
 #endif /* CONFIG_405EP */
 
 #if defined(CONFIG_440)
-#define function_prolog(func_name)      .text; \
-					.align 2; \
-					.globl func_name; \
-					func_name:
-#define function_epilog(func_name)      .type func_name,@function; \
-					.size func_name,.-func_name
-
 /*----------------------------------------------------------------------------+
 | mttlb3.
 +----------------------------------------------------------------------------*/
@@ -1946,47 +2026,4 @@
 	TLBRE(3,3,0)
 	blr
 	function_epilog(mftlb1)
-
-/*----------------------------------------------------------------------------+
-| dcbz_area.
-+----------------------------------------------------------------------------*/
-	function_prolog(dcbz_area)
-	rlwinm. r5,r4,0,27,31
-	rlwinm  r5,r4,27,5,31
-	beq     ..d_ra2
-	addi    r5,r5,0x0001
-..d_ra2:mtctr   r5
-..d_ag2:dcbz    r0,r3
-	addi    r3,r3,32
-	bdnz    ..d_ag2
-	sync
-	blr
-	function_epilog(dcbz_area)
-
-/*----------------------------------------------------------------------------+
-| dflush.  Assume 32K at vector address is cachable.
-+----------------------------------------------------------------------------*/
-	function_prolog(dflush)
-	mfmsr   r9
-	rlwinm  r8,r9,0,15,13
-	rlwinm  r8,r8,0,17,15
-	mtmsr   r8
-	addi    r3,r0,0x0000
-	mtspr   dvlim,r3
-	mfspr   r3,ivpr
-	addi    r4,r0,1024
-	mtctr   r4
-..dflush_loop:
-	lwz     r6,0x0(r3)
-	addi    r3,r3,32
-	bdnz    ..dflush_loop
-	addi    r3,r3,-32
-	mtctr   r4
-..ag:   dcbf    r0,r3
-	addi    r3,r3,-32
-	bdnz    ..ag
-	sync
-	mtmsr   r9
-	blr
-	function_epilog(dflush)
 #endif /* CONFIG_440 */