sh4: fix start.S by calling board_init_f() after first code relocation

As on the ARM platform, keep the first code relocation from the U-Boot
image storage to RAM at CONFIG_SYS_TEXT_BASE, then pass execution to
the generic board_init_f() with empty GD flags. If CONFIG_SYS_TEXT_BASE
is equal to the relocation address calculated by board_init_f(), no
further copy of code and data takes place. It is worth mentioning,
however, that the first copy happens even if $pc at _start is the same
as CONFIG_SYS_TEXT_BASE; in practice this works without a problem.

Also note that _sh_start is renamed back to _start to correct the
gd->mon_len calculation done by setup_mon_len(); the opposite rename
was made in the pre-generic-board commit 2024b968ee9 ("sh: Fix build in start.S").

Signed-off-by: Vladimir Zapolskiy <vz@mleia.com>
Reviewed-by: Simon Glass <sjg@chromium.org>
diff --git a/arch/sh/cpu/sh4/start.S b/arch/sh/cpu/sh4/start.S
index 77fc221..416adcf 100644
--- a/arch/sh/cpu/sh4/start.S
+++ b/arch/sh/cpu/sh4/start.S
@@ -1,6 +1,6 @@
 /*
- * (C) Copyright 2007, 2010
- * Nobuhiro Iwamatsu <iwamatsu@nigauri.org>
+ * Copyright (C) 2016 Vladimir Zapolskiy <vz@mleia.com>
+ * Copyright (C) 2007, 2010 Nobuhiro Iwamatsu <iwamatsu@nigauri.org>
  *
  * SPDX-License-Identifier:	GPL-2.0+
  */
@@ -12,7 +12,7 @@
 	.align	2
 
 	.global	_start
-_sh_start:
+_start:
 	mov.l	._lowlevel_init, r0
 100:	bsrf	r0
 	nop
@@ -21,7 +21,7 @@
 	nop
 1:	sts	pr, r5
 	mov.l	._reloc_dst, r4
-	add	#(_sh_start-1b), r5
+	add	#(_start-1b), r5
 	mov.l	._reloc_dst_end, r6
 
 2:	mov.l	@r5+, r1
@@ -42,10 +42,9 @@
 	mov.l	._gd_init, r13		/* global data */
 	mov.l	._stack_init, r15	/* stack */
 
-	#TODO(sh maintainer): Fix this up to call the correct code
-	#mov.l	._sh_generic_init, r0
-	#jsr	@r0
-	nop
+	mov.l	._sh_generic_init, r0
+	jsr	@r0
+	mov     #0, r4
 
 loop:
 	bra	loop
@@ -53,10 +52,10 @@
 	.align	2
 
 ._lowlevel_init:	.long	(lowlevel_init - (100b + 4))
-._reloc_dst:		.long	reloc_dst
+._reloc_dst:		.long	_start
 ._reloc_dst_end:	.long	reloc_dst_end
 ._bss_start:		.long	bss_start
 ._bss_end:		.long	bss_end
-._gd_init:		.long	(_sh_start - GENERATED_GBL_DATA_SIZE)
-._stack_init:		.long	(_sh_start - GENERATED_GBL_DATA_SIZE - CONFIG_SYS_MALLOC_LEN - 16)
-#._sh_generic_init:	.long	sh_generic_init
+._gd_init:		.long	(_start - GENERATED_GBL_DATA_SIZE)
+._stack_init:		.long	(_start - GENERATED_GBL_DATA_SIZE - CONFIG_SYS_MALLOC_LEN - 16)
+._sh_generic_init:	.long	board_init_f