rockchip: spi: make optimised receive-handler unaligned-safe

To support unaligned output buffers (i.e. 'in' in the terminology of
the SPI framework), this change splits each 16-bit FIFO element after
reading and writes it to memory in two 8-bit transactions.  With this
change, we can now always use the optimised mode for receive-only
transactions independent of the alignment of the target buffer.

Given that we'll run with caches on, the impact should be negligible:
as expected, this has no adverse impact on throughput if running with
a 960MHz LPLL configuration.

Signed-off-by: Philipp Tomsich <philipp.tomsich@theobroma-systems.com>
diff --git a/drivers/spi/rk_spi.c b/drivers/spi/rk_spi.c
index 7b39d1c..dceced9 100644
--- a/drivers/spi/rk_spi.c
+++ b/drivers/spi/rk_spi.c
@@ -353,23 +353,13 @@
 	u32 statistics_rxlevels[33] = { };
 #endif
 	u32 frames = *len / 2;
-	u16 *in16 = (u16 *)(*din);
+	u8 *in = (u8 *)(*din);
 	u32 max_chunk_size = SPI_FIFO_DEPTH;
 
 	if (!frames)
 		return 0;
 
 	/*
-	 * If the destination buffer is unaligned, we'd run into a problem
-	 * on ARMv8.  Given that this doesn't seem to be a real issue, we
-	 * just chicken out and fall back to the unoptimised implementation.
-	 */
-	if ((uintptr_t)*din & 1) {
-		debug("%s: unaligned buffer, din = %p\n", __func__, *din);
-		return 0;
-	}
-
-	/*
 	 * If we know that the hardware will manage RXFIFO overruns
 	 * (i.e. stop the SPI clock until there's space in the FIFO),
 	 * we the allow largest possible chunk size that can be
@@ -406,8 +396,11 @@
 			statistics_rxlevels[rx_level]++;
 #endif
 			chunk_size -= rx_level;
-			while (rx_level--)
-				*in16++ = readw(regs->rxdr);
+			while (rx_level--) {
+				u16 val = readw(regs->rxdr);
+				*in++ = val & 0xff;
+				*in++ = val >> 8;
+			}
 		} while (chunk_size);
 
 		rkspi_enable_chip(regs, false);