ppc4xx: Rework 4xx cache support

New cache handling functions added and all existing functions
moved from start.S into separate cache.S.

Signed-off-by: Stefan Roese <sr@denx.de>
diff --git a/cpu/ppc4xx/Makefile b/cpu/ppc4xx/Makefile
index 1947249..9155e9a 100644
--- a/cpu/ppc4xx/Makefile
+++ b/cpu/ppc4xx/Makefile
@@ -25,14 +25,38 @@
 
 LIB	= $(obj)lib$(CPU).a
 
-START	= start.o resetvec.o kgdb.o
-SOBJS	= dcr.o
-COBJS	= 40x_spd_sdram.o 44x_spd_ddr.o 44x_spd_ddr2.o \
-	  4xx_enet.o 4xx_pci.o 4xx_pcie.o 4xx_uart.o \
-	  bedbug_405.o commproc.o cpu.o cpu_init.o \
-	  fdt.o gpio.o i2c.o interrupts.o iop480_uart.o \
-	  miiphy.o ndfc.o sdram.o speed.o \
-	  tlb.o traps.o usb.o usb_ohci.o usbdev.o
+START	:= resetvec.o
+START	+= start.o
+
+SOBJS	:= cache.o
+SOBJS	+= dcr.o
+SOBJS	+= kgdb.o
+
+COBJS	:= 40x_spd_sdram.o
+COBJS	+= 44x_spd_ddr.o
+COBJS	+= 44x_spd_ddr2.o
+COBJS	+= 4xx_enet.o
+COBJS	+= 4xx_pci.o
+COBJS	+= 4xx_pcie.o
+COBJS	+= 4xx_uart.o
+COBJS	+= bedbug_405.o
+COBJS	+= commproc.o
+COBJS	+= cpu.o
+COBJS	+= cpu_init.o
+COBJS	+= fdt.o
+COBJS	+= gpio.o
+COBJS	+= i2c.o
+COBJS	+= interrupts.o
+COBJS	+= iop480_uart.o
+COBJS	+= miiphy.o
+COBJS	+= ndfc.o
+COBJS	+= sdram.o
+COBJS	+= speed.o
+COBJS	+= tlb.o
+COBJS	+= traps.o
+COBJS	+= usb.o
+COBJS	+= usb_ohci.o
+COBJS	+= usbdev.o
 
 SRCS	:= $(START:.o=.S) $(SOBJS:.o=.S) $(COBJS:.o=.c)
 OBJS	:= $(addprefix $(obj),$(SOBJS) $(COBJS))
diff --git a/cpu/ppc4xx/cache.S b/cpu/ppc4xx/cache.S
new file mode 100644
index 0000000..5124dec
--- /dev/null
+++ b/cpu/ppc4xx/cache.S
@@ -0,0 +1,233 @@
+/*
+ * This file contains miscellaneous low-level functions.
+ *    Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org)
+ *
+ * Largely rewritten by Cort Dougan (cort@cs.nmt.edu)
+ * and Paul Mackerras.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ */
+
+#include <config.h>
+#include <config.h>
+#include <ppc4xx.h>
+#include <ppc_asm.tmpl>
+#include <ppc_defs.h>
+#include <asm/cache.h>
+#include <asm/mmu.h>
+
+/*
+ * invalidate_icache: invalidate the instruction cache.  Takes no arguments.
+ */
+_GLOBAL(invalidate_icache)
+	iccci	r0,r0			/* invalidate the icache (one iccci, no loop) */
+	isync				/* drop instructions that were already fetched */
+	blr
+
+/*
+ * flush_icache_range(unsigned long start, unsigned long stop)
+ *
+ * Store modified dcache lines covering [start, stop) to memory, then invalidate
+ * the same lines in the icache.  In: r3 = start, r4 = stop.  Uses r3-r6, ctr, cr0.
+ */
+_GLOBAL(flush_icache_range)
+	li	r5,L1_CACHE_BYTES-1	/* r5 = offset-within-line mask */
+	andc	r3,r3,r5		/* r3 = start rounded down to a line boundary */
+	subf	r4,r3,r4		/* r4 = stop - aligned start */
+	add	r4,r4,r5		/* round the length up to whole lines ... */
+	srwi.	r4,r4,L1_CACHE_SHIFT	/* ... r4 = line count (assumes BYTES == 1 << SHIFT) */
+	beqlr				/* empty range: return without touching the caches */
+	mtctr	r4
+	mr	r6,r3			/* keep the aligned start for the icbi pass */
+1:	dcbst	0,r3
+	addi	r3,r3,L1_CACHE_BYTES
+	bdnz	1b
+	sync				/* wait for dcbst's to get to ram */
+	mtctr	r4			/* same line count for the second pass */
+2:	icbi	0,r6
+	addi	r6,r6,L1_CACHE_BYTES
+	bdnz	2b
+	sync				/* inherited: "needed on g4" -- NOTE(review): confirm for 4xx */
+	isync
+	blr
+
+/*
+ * clean_dcache_range(unsigned long start, unsigned long stop)
+ *
+ * Write any modified data cache blocks covering [start, stop) out to
+ * memory.  Nothing is invalidated, neither in the data cache nor in the
+ * instruction cache.  In: r3 = start, r4 = stop.  Uses r3-r5, ctr, cr0.
+ */
+_GLOBAL(clean_dcache_range)
+	li	r5,L1_CACHE_BYTES-1	/* r5 = offset-within-line mask */
+	andc	r3,r3,r5		/* r3 = start rounded down to a line boundary */
+	subf	r4,r3,r4		/* r4 = stop - aligned start */
+	add	r4,r4,r5		/* round the length up to whole lines ... */
+	srwi.	r4,r4,L1_CACHE_SHIFT	/* ... r4 = line count (assumes BYTES == 1 << SHIFT) */
+	beqlr				/* empty range: nothing to do */
+	mtctr	r4
+
+1:	dcbst	0,r3
+	addi	r3,r3,L1_CACHE_BYTES
+	bdnz	1b
+	sync				/* wait for dcbst's to get to ram */
+	blr
+
+/*
+ * flush_dcache_range(unsigned long start, unsigned long stop)
+ *
+ * Write modified data cache blocks covering [start, stop) to memory and
+ * invalidate them; the icache is left alone.  Uses r3-r5, ctr, cr0.
+ */
+_GLOBAL(flush_dcache_range)
+	li	r5,L1_CACHE_BYTES-1	/* r5 = offset-within-line mask */
+	andc	r3,r3,r5		/* r3 = start rounded down to a line boundary */
+	subf	r4,r3,r4		/* r4 = stop - aligned start */
+	add	r4,r4,r5		/* round the length up to whole lines ... */
+	srwi.	r4,r4,L1_CACHE_SHIFT	/* ... r4 = line count (assumes BYTES == 1 << SHIFT) */
+	beqlr				/* empty range: nothing to do */
+	mtctr	r4
+
+1:	dcbf	0,r3
+	addi	r3,r3,L1_CACHE_BYTES
+	bdnz	1b
+	sync				/* wait for dcbf's to get to ram */
+	blr
+
+/*
+ * invalidate_dcache_range(unsigned long start, unsigned long stop)
+ *
+ * Invalidate (dcbi) every data cache line touching [start, stop).  The
+ * inherited note says the 8xx uses this to avoid stale data from the CPM
+ * (no cache snooping) -- NOTE(review): confirm the 4xx use case.
+ */
+_GLOBAL(invalidate_dcache_range)
+	li	r5,L1_CACHE_BYTES-1	/* r5 = offset-within-line mask */
+	andc	r3,r3,r5		/* start rounded down: partial lines are invalidated whole */
+	subf	r4,r3,r4		/* r4 = stop - aligned start */
+	add	r4,r4,r5		/* round the length up to whole lines ... */
+	srwi.	r4,r4,L1_CACHE_SHIFT	/* ... r4 = line count (assumes BYTES == 1 << SHIFT) */
+	beqlr				/* empty range: nothing to do */
+	mtctr	r4
+
+1:	dcbi	0,r3
+	addi	r3,r3,L1_CACHE_BYTES
+	bdnz	1b
+	sync				/* wait for the dcbi's to complete */
+	blr
+
+/*
+ * flush_dcache: read one word from each of 2 * CACHE_NWAYS * CACHE_NLINES
+ * consecutive cache lines starting at address 0 (presumably so that every
+ * dirty line gets displaced -- confirm).  40x cores have 8K or 16K dcache,
+ * 44x has 32K, both with 32 byte lines; 8xx has 1, 2, 4, 8K variants; the
+ * loop is sized for the 44x worst case.  Call with external interrupts off.
+ */
+#define CACHE_NWAYS     64
+#define CACHE_NLINES    32
+
+_GLOBAL(flush_dcache)
+	li	r4,(2 * CACHE_NWAYS * CACHE_NLINES)	/* number of lines to read */
+	mtctr	r4
+	lis	r5,0			/* r5 = 0: first address to read */
+1:	lwz	r3,0(r5)		/* Load one word from every line (r3 is scratch) */
+	addi	r5,r5,L1_CACHE_BYTES
+	bdnz	1b
+	sync
+	blr
+
+_GLOBAL(invalidate_dcache)		/* invalidate the whole dcache; uses r6, r7, ctr */
+	addi	r6,0,0x0000		/* r6 = 0: address of the first line */
+	/* Loop once per congruence class; @h (not @ha) because ori does not sign-extend @l. */
+	lis	r7,(CFG_DCACHE_SIZE / L1_CACHE_BYTES / 2)@h	/* TBS for large sized cache */
+	ori	r7,r7,(CFG_DCACHE_SIZE / L1_CACHE_BYTES / 2)@l
+					/* NOTE: dccci invalidates both */
+	mtctr	r7			/* ways in the D cache */
+..dcloop:
+	dccci	0,r6			/* invalidate line */
+	addi	r6,r6,L1_CACHE_BYTES	/* bump to next line */
+	bdnz	..dcloop
+	sync
+	blr
+
+/*
+ * Cache functions.
+ *
+ * NOTE: currently the 440s run with dcache _disabled_ once relocated to DRAM,
+ * although for some cache-related calls stubs have to be provided to satisfy
+ * symbol resolution.
+ * Icache-related functions are used in POST framework.
+ *
+ */
+#ifdef CONFIG_440
+
+       .globl  dcache_disable		/* CONFIG_440 stubs: the three labels below */
+       .globl  icache_disable		/* all resolve to the same single blr, so */
+       .globl  icache_enable		/* calling any of them does nothing */
+dcache_disable:
+icache_disable:
+icache_enable:
+	blr				/* return immediately, no register is changed */
+
+	.globl	dcache_status		/* CONFIG_440 stubs: both entry points */
+	.globl	icache_status		/* share one body and return 0 in r3 */
+dcache_status:
+icache_status:
+	li	r3,0			/* load constant 0; "mr r3,0" would copy r0 instead */
+	blr
+
+#else /* CONFIG_440 */
+
+	.globl	icache_enable		/* invalidate the icache, then write ICCR */
+icache_enable:
+	mflr	r8			/* bl overwrites lr: park the return address in r8 */
+	bl	invalidate_icache
+	mtlr	r8
+	isync
+	addis	r3,r0, 0xc000	      /* r3 = 0xc0000000: sets bits 0 and 1 (bit 0 = MSB) */
+	mticcr	r3			/* NOTE(review): no isync follows, unlike icache_disable */
+	blr
+
+	.globl	icache_disable		/* write 0 to ICCR; uses r3 */
+icache_disable:
+	addis	r3,r0, 0x0000	      /* r3 = 0: clears every ICCR bit, not only bit 0 */
+	mticcr	r3
+	isync
+	blr
+
+	.globl	icache_status		/* returns r3 = bit 0 (MSB) of ICCR, i.e. 0 or 1 */
+icache_status:
+	mficcr	r3
+	srwi	r3, r3, 31	/* >>31 => select bit 0 */
+	blr
+
+	.globl	dcache_enable		/* invalidate the dcache, then write DCCR */
+dcache_enable:
+	mflr	r8			/* bl overwrites lr: park the return address in r8 */
+	bl	invalidate_dcache	/* uses r6, r7 and ctr, so r8 survives */
+	mtlr	r8
+	isync
+	addis	r3,r0, 0x8000	      /* r3 = 0x80000000: set bit 0 (the MSB) only */
+	mtdccr	r3
+	blr
+
+	.globl	dcache_disable		/* run flush_dcache, then write 0 to DCCR */
+dcache_disable:
+	mflr	r8			/* bl overwrites lr: park the return address in r8 */
+	bl	flush_dcache		/* NOTE(review): documented as needing external interrupts off; MSR is not touched here */
+	mtlr	r8
+	addis	r3,r0, 0x0000	      /* r3 = 0: clears every DCCR bit, not only bit 0 */
+	mtdccr	r3
+	blr
+
+	.globl	dcache_status		/* returns r3 = bit 0 (MSB) of DCCR, i.e. 0 or 1 */
+dcache_status:
+	mfdccr	r3
+	srwi	r3, r3, 31	/* >>31 => select bit 0 */
+	blr
+
+#endif /* CONFIG_440 */
diff --git a/cpu/ppc4xx/kgdb.S b/cpu/ppc4xx/kgdb.S
index 8c4bbf2..42b9546 100644
--- a/cpu/ppc4xx/kgdb.S
+++ b/cpu/ppc4xx/kgdb.S
@@ -56,21 +56,21 @@
 
 	.globl	kgdb_flush_cache_range
 kgdb_flush_cache_range:
-	li	r5,CFG_CACHELINE_SIZE-1
+	li	r5,L1_CACHE_BYTES-1
 	andc	r3,r3,r5
 	subf	r4,r3,r4
 	add	r4,r4,r5
-	srwi.	r4,r4,CFG_CACHELINE_SHIFT
+	srwi.	r4,r4,L1_CACHE_SHIFT
 	beqlr
 	mtctr	r4
 	mr	r6,r3
 1:	dcbst	0,r3
-	addi	r3,r3,CFG_CACHELINE_SIZE
+	addi	r3,r3,L1_CACHE_BYTES
 	bdnz	1b
 	sync			/* wait for dcbst's to get to ram */
 	mtctr	r4
 2:	icbi	0,r6
-	addi	r6,r6,CFG_CACHELINE_SIZE
+	addi	r6,r6,L1_CACHE_BYTES
 	bdnz	2b
 	SYNC
 	blr
diff --git a/cpu/ppc4xx/start.S b/cpu/ppc4xx/start.S
index 81a15fe..f5a135f 100644
--- a/cpu/ppc4xx/start.S
+++ b/cpu/ppc4xx/start.S
@@ -1220,111 +1220,6 @@
 #endif /* CONFIG_440 */
 
 
-/*
- * Cache functions.
- *
- * NOTE: currently the 440s run with dcache _disabled_ once relocated to DRAM,
- * although for some cache-ralated calls stubs have to be provided to satisfy
- * symbols resolution.
- * Icache-related functions are used in POST framework.
- *
- */
-#ifdef CONFIG_440
-       .globl  dcache_disable
-       .globl  icache_disable
-       .globl  icache_enable
-dcache_disable:
-icache_disable:
-icache_enable:
-	blr
-
-	.globl	dcache_status
-	.globl	icache_status
-dcache_status:
-icache_status:
-	mr	r3,  0
-	blr
-#else
-flush_dcache:
-	addis	r9,r0,0x0002		/* set mask for EE and CE msr bits */
-	ori	r9,r9,0x8000
-	mfmsr	r12			/* save msr */
-	andc	r9,r12,r9
-	mtmsr	r9			/* disable EE and CE */
-	addi	r10,r0,0x0001		/* enable data cache for unused memory */
-	mfdccr	r9			/* region 0xF8000000-0xFFFFFFFF via */
-	or	r10,r10,r9		/* bit 31 in dccr */
-	mtdccr	r10
-
-	/* do loop for # of congruence classes. */
-	lis	r10,(CFG_DCACHE_SIZE / CFG_CACHELINE_SIZE / 2)@ha	/* TBS: for large cache sizes */
-	ori	r10,r10,(CFG_DCACHE_SIZE / CFG_CACHELINE_SIZE / 2)@l
-	lis	r11,(CFG_DCACHE_SIZE / 2)@ha /* D cache set size - 2 way sets */
-	ori	r11,r11,(CFG_DCACHE_SIZE / 2)@l /* D cache set size - 2 way sets */
-	mtctr	r10
-	addi	r10,r0,(0xE000-0x10000) /* start at 0xFFFFE000 */
-	add	r11,r10,r11		/* add to get to other side of cache line */
-..flush_dcache_loop:
-	lwz	r3,0(r10)		/* least recently used side */
-	lwz	r3,0(r11)		/* the other side */
-	dccci	r0,r11			/* invalidate both sides */
-	addi	r10,r10,CFG_CACHELINE_SIZE /* bump to next line */
-	addi	r11,r11,CFG_CACHELINE_SIZE /* bump to next line */
-	bdnz	..flush_dcache_loop
-	sync				/* allow memory access to complete */
-	mtdccr	r9			/* restore dccr */
-	mtmsr	r12			/* restore msr */
-	blr
-
-	.globl	icache_enable
-icache_enable:
-	mflr	r8
-	bl	invalidate_icache
-	mtlr	r8
-	isync
-	addis	r3,r0, 0xc000	      /* set bit 0 */
-	mticcr	r3
-	blr
-
-	.globl	icache_disable
-icache_disable:
-	addis	r3,r0, 0x0000	      /* clear bit 0 */
-	mticcr	r3
-	isync
-	blr
-
-	.globl	icache_status
-icache_status:
-	mficcr	r3
-	srwi	r3, r3, 31	/* >>31 => select bit 0 */
-	blr
-
-	.globl	dcache_enable
-dcache_enable:
-	mflr	r8
-	bl	invalidate_dcache
-	mtlr	r8
-	isync
-	addis	r3,r0, 0x8000	      /* set bit 0 */
-	mtdccr	r3
-	blr
-
-	.globl	dcache_disable
-dcache_disable:
-	mflr	r8
-	bl	flush_dcache
-	mtlr	r8
-	addis	r3,r0, 0x0000	      /* clear bit 0 */
-	mtdccr	r3
-	blr
-
-	.globl	dcache_status
-dcache_status:
-	mfdccr	r3
-	srwi	r3, r3, 31	/* >>31 => select bit 0 */
-	blr
-#endif
-
 	.globl get_pvr
 get_pvr:
 	mfspr	r3, PVR
@@ -1430,6 +1325,26 @@
  */
 	.globl	relocate_code
 relocate_code:
+#ifdef CONFIG_4xx_DCACHE
+	/*
+	 * We need to flush the Init Data before the dcache will be
+	 * invalidated
+	 */
+
+	/* save regs */
+	mr	r9,r3
+	mr	r10,r4
+	mr	r11,r5
+
+	mr	r3,r4
+	addi	r4,r4,0x200	/* should be enough for init data */
+	bl	flush_dcache_range
+
+	/* restore regs */
+	mr	r3,r9
+	mr	r4,r10
+	mr	r5,r11
+#endif
 #if defined(CONFIG_440EP) || defined(CONFIG_440GR) || \
     defined(CONFIG_440EPX) || defined(CONFIG_440GRX) || \
     defined(CONFIG_440SP) || defined(CONFIG_440SPE)
@@ -1457,7 +1372,7 @@
 	ori	r4, r4, CFG_MONITOR_BASE@l
 	lwz	r5, GOT(__init_end)
 	sub	r5, r5, r4
-	li	r6, CFG_CACHELINE_SIZE		/* Cache Line Size	*/
+	li	r6, L1_CACHE_BYTES		/* Cache Line Size	*/
 
 	/*
 	 * Fix GOT pointer:
@@ -1777,23 +1692,6 @@
 	lwz	3,0x0000(3)
 	blr
 
-invalidate_icache:
-	iccci	r0,r0			/* for 405, iccci invalidates the */
-	blr				/*   entire I cache */
-
-invalidate_dcache:
-	addi	r6,0,0x0000		/* clear GPR 6 */
-	/* Do loop for # of dcache congruence classes. */
-	lis	r7, (CFG_DCACHE_SIZE / CFG_CACHELINE_SIZE / 2)@ha	/* TBS for large sized cache */
-	ori	r7, r7, (CFG_DCACHE_SIZE / CFG_CACHELINE_SIZE / 2)@l
-					/* NOTE: dccci invalidates both */
-	mtctr	r7			/* ways in the D cache */
-..dcloop:
-	dccci	0,r6			/* invalidate line */
-	addi	r6,r6, CFG_CACHELINE_SIZE /* bump to next line */
-	bdnz	..dcloop
-	blr
-
 /**************************************************************************/
 /* PPC405EP specific stuff						  */
 /**************************************************************************/