armv8: add simple sdelay implementation

The sunxi DRAM setup code needs an sdelay() implementation, which
wasn't defined for armv8 so far.
Shamelessly copy the armv7 version and adjust it to work in AArch64.

Signed-off-by: Andre Przywara <andre.przywara@arm.com>
Reviewed-by: Jagan Teki <jagan@openedev.com>
diff --git a/arch/arm/cpu/armv8/cpu.c b/arch/arm/cpu/armv8/cpu.c
index 5dcb5e2..28a27f7 100644
--- a/arch/arm/cpu/armv8/cpu.c
+++ b/arch/arm/cpu/armv8/cpu.c
@@ -17,6 +17,20 @@
 #include <asm/secure.h>
 #include <linux/compiler.h>
 
+/*
+ * sdelay() - simple spin loop.
+ *
+ * Will delay execution by roughly (@loops * 2) cycles.
+ * This is necessary to be used before timers are accessible.
+ *
+ * A value of "0" will results in 2^64 loops.
+ */
+void sdelay(unsigned long loops)
+{
+	__asm__ volatile ("1:\n" "subs %0, %0, #1\n"
+			  "b.ne 1b" : "=r" (loops) : "0"(loops) : "cc");
+}
+
 int cleanup_before_linux(void)
 {
 	/*