Davinci: SPI performance enhancements

The following restructuring and optimisations increase the SPI
read performance from 1.3MiB/s (on da850) to 2.87MiB/s (on da830):

Remove continual re-evaluation of driver state from the core of the
copy loop. State cannot change during the copy loop, so these
evaluations can be moved to before the copy loop.

The cost is more code space, as a loop variant is required for each
set of possible configurations. The loops are simpler, however, so
the extra is only 128 bytes on da830 with CONFIG_SPI_HALF_DUPLEX
defined.

Unrolling the first copy loop iteration allows the TX buffer to be
pre-loaded, reducing SPI clock starvation.

Unrolling the last copy loop iteration removes testing for the
final loop iteration every time round the loop.

Using the RX buffer empty flag as a transfer throttle allows the
assumption that it is always safe to write to the TX buffer, so
per-iteration polling of the TX buffer full flag can be removed.

Signed-off-by: Nick Thompson <nick.thompson@ge.com>
Signed-off-by: Sandeep Paulraj <s-paulraj@ti.com>
diff --git a/drivers/spi/davinci_spi.c b/drivers/spi/davinci_spi.c
index 08f837b..4518ecb 100644
--- a/drivers/spi/davinci_spi.c
+++ b/drivers/spi/davinci_spi.c
@@ -66,7 +66,7 @@
 int spi_claim_bus(struct spi_slave *slave)
 {
 	struct davinci_spi_slave *ds = to_davinci_spi(slave);
-	unsigned int scalar, data1_reg_val = 0;
+	unsigned int scalar;
 
 	/* Enable the SPI hardware */
 	writel(SPIGCR0_SPIRST_MASK, &ds->regs->gcr0);
@@ -93,11 +93,6 @@
 	writel(8 | (scalar << SPIFMT_PRESCALE_SHIFT) |
 		(1 << SPIFMT_PHASE_SHIFT), &ds->regs->fmt0);
 
-	/* hold cs active at end of transfer until explicitly de-asserted */
-	data1_reg_val = (1 << SPIDAT1_CSHOLD_SHIFT) |
-			(slave->cs << SPIDAT1_CSNR_SHIFT);
-	writel(data1_reg_val, &ds->regs->dat1);
-
 	/*
 	 * Including a minor delay. No science here. Should be good even with
 	 * no delay
@@ -113,8 +108,7 @@
 	writel(0, &ds->regs->lvl);
 
 	/* enable SPI */
-	writel((readl(&ds->regs->gcr1) |
-		SPIGCR1_SPIENA_MASK), &ds->regs->gcr1);
+	writel((readl(&ds->regs->gcr1) | SPIGCR1_SPIENA_MASK), &ds->regs->gcr1);
 
 	return 0;
 }
@@ -127,14 +121,125 @@
 	writel(SPIGCR0_SPIRST_MASK, &ds->regs->gcr0);
 }
 
+/*
+ * This function needs to act like a macro to avoid pipeline reloads in the
+ * loops below. Use always_inline. This gains us about 160KiB/s and the bloat
+ * appears to be zero bytes (da830).
+ */
+__attribute__((always_inline))
+static inline u32 davinci_spi_xfer_data(struct davinci_spi_slave *ds, u32 data)
+{
+	u32	buf_reg_val;
+
+	/* send out data */
+	writel(data, &ds->regs->dat1);
+
+	/* wait for the data to clock in/out */
+	while ((buf_reg_val = readl(&ds->regs->buf)) & SPIBUF_RXEMPTY_MASK)
+		;
+
+	return buf_reg_val;
+}
+
+static int davinci_spi_read(struct spi_slave *slave, unsigned int len,
+			    u8 *rxp, unsigned long flags)
+{
+	struct davinci_spi_slave *ds = to_davinci_spi(slave);
+	unsigned int data1_reg_val;
+
+	/* enable CS hold, CS[n] and clear the data bits */
+	data1_reg_val = ((1 << SPIDAT1_CSHOLD_SHIFT) |
+			 (slave->cs << SPIDAT1_CSNR_SHIFT));
+
+	/* wait till TXFULL is deasserted */
+	while (readl(&ds->regs->buf) & SPIBUF_TXFULL_MASK)
+		;
+
+	/* preload the TX buffer to avoid clock starvation */
+	writel(data1_reg_val, &ds->regs->dat1);
+
+	/* keep reading 1 byte until only 1 byte left */
+	while ((len--) > 1)
+		*rxp++ = davinci_spi_xfer_data(ds, data1_reg_val);
+
+	/* clear CS hold when we reach the end */
+	if (flags & SPI_XFER_END)
+		data1_reg_val &= ~(1 << SPIDAT1_CSHOLD_SHIFT);
+
+	/* read the last byte */
+	*rxp = davinci_spi_xfer_data(ds, data1_reg_val);
+
+	return 0;
+}
+
+static int davinci_spi_write(struct spi_slave *slave, unsigned int len,
+		const u8 *txp, unsigned long flags)
+{
+	struct davinci_spi_slave *ds = to_davinci_spi(slave);
+	unsigned int data1_reg_val;
+
+	/* enable CS hold and clear the data bits */
+	data1_reg_val = ((1 << SPIDAT1_CSHOLD_SHIFT) |
+			 (slave->cs << SPIDAT1_CSNR_SHIFT));
+
+	/* wait till TXFULL is deasserted */
+	while (readl(&ds->regs->buf) & SPIBUF_TXFULL_MASK)
+		;
+
+	/* preload the TX buffer to avoid clock starvation */
+	if (len > 2) {
+		writel(data1_reg_val | *txp++, &ds->regs->dat1);
+		len--;
+	}
+
+	/* keep writing 1 byte until only 1 byte left */
+	while ((len--) > 1)
+		davinci_spi_xfer_data(ds, data1_reg_val | *txp++);
+
+	/* clear CS hold when we reach the end */
+	if (flags & SPI_XFER_END)
+		data1_reg_val &= ~(1 << SPIDAT1_CSHOLD_SHIFT);
+
+	/* write the last byte */
+	davinci_spi_xfer_data(ds, data1_reg_val | *txp);
+
+	return 0;
+}
+
+#ifndef CONFIG_SPI_HALF_DUPLEX
+static int davinci_spi_read_write(struct spi_slave *slave, unsigned int len,
+				  u8 *rxp, const u8 *txp, unsigned long flags)
+{
+	struct davinci_spi_slave *ds = to_davinci_spi(slave);
+	unsigned int data1_reg_val;
+
+	/* enable CS hold and clear the data bits */
+	data1_reg_val = ((1 << SPIDAT1_CSHOLD_SHIFT) |
+			 (slave->cs << SPIDAT1_CSNR_SHIFT));
+
+	/* wait till TXFULL is deasserted */
+	while (readl(&ds->regs->buf) & SPIBUF_TXFULL_MASK)
+		;
+
+	/* keep reading and writing 1 byte until only 1 byte left */
+	while ((len--) > 1)
+		*rxp++ = davinci_spi_xfer_data(ds, data1_reg_val | *txp++);
+
+	/* clear CS hold when we reach the end */
+	if (flags & SPI_XFER_END)
+		data1_reg_val &= ~(1 << SPIDAT1_CSHOLD_SHIFT);
+
+	/* read and write the last byte */
+	*rxp = davinci_spi_xfer_data(ds, data1_reg_val | *txp);
+
+	return 0;
+}
+#endif
+
 int spi_xfer(struct spi_slave *slave, unsigned int bitlen,
 		const void *dout, void *din, unsigned long flags)
 {
-	struct davinci_spi_slave *ds = to_davinci_spi(slave);
-	unsigned int	len, data1_reg_val = readl(&ds->regs->dat1);
-	unsigned int	i_cnt = 0, o_cnt = 0, buf_reg_val;
-	const u8	*txp = dout; /* dout can be NULL for read operation */
-	u8		*rxp = din;  /* din can be NULL for write operation */
+	unsigned int len;
 
 	if (bitlen == 0)
 		/* Finish any previously submitted transfers */
@@ -154,63 +259,19 @@
 
 	len = bitlen / 8;
 
-	/* do an empty read to clear the current contents */
-	readl(&ds->regs->buf);
-
-	/* keep writing and reading 1 byte until done */
-	while ((i_cnt < len) || (o_cnt < len)) {
-		/* read RX buffer and flags */
-		buf_reg_val = readl(&ds->regs->buf);
-
-		/* if data is available */
-		if ((i_cnt < len) &&
-			(buf_reg_val & SPIBUF_RXEMPTY_MASK) == 0) {
-			/*
-			 * If there is no read buffer simply
-			 * ignore the read character
-			 */
-			if (rxp)
-				*rxp++ = buf_reg_val & 0xFF;
-			/* increment read words count */
-			i_cnt++;
-		}
-
-		/*
-		 * if the tx buffer is empty and there
-		 * is still data to transmit
-		 */
-		if ((o_cnt < len) &&
-			((buf_reg_val & SPIBUF_TXFULL_MASK) == 0)) {
-			/* write the data */
-			data1_reg_val &= ~0xFFFF;
-			if (txp)
-				data1_reg_val |= *txp++;
-			/*
-			 * Write to DAT1 is required to keep
-			 * the serial transfer going.
-			 * We just terminate when we reach the end.
-			 */
-			if ((o_cnt == (len - 1)) && (flags & SPI_XFER_END)) {
-				/* clear CS hold */
-				writel(data1_reg_val &
-						~(1 << SPIDAT1_CSHOLD_SHIFT),
-						&ds->regs->dat1);
-			} else {
-				/* enable CS hold and write TX register */
-				data1_reg_val |= ((1 << SPIDAT1_CSHOLD_SHIFT) |
-					(slave->cs << SPIDAT1_CSNR_SHIFT));
-				writel(data1_reg_val, &ds->regs->dat1);
-			}
-			/* increment written words count */
-			o_cnt++;
-		}
-	}
-	return 0;
+	if (!dout)
+		return davinci_spi_read(slave, len, din, flags);
+	else if (!din)
+		return davinci_spi_write(slave, len, dout, flags);
+#ifndef CONFIG_SPI_HALF_DUPLEX
+	else
+		return davinci_spi_read_write(slave, len, din, dout, flags);
+#endif
 
 out:
 	if (flags & SPI_XFER_END) {
-		writel(data1_reg_val &
-			~(1 << SPIDAT1_CSHOLD_SHIFT), &ds->regs->dat1);
+		u8 dummy = 0;
+		davinci_spi_write(slave, 1, &dummy, flags);
 	}
 	return 0;
 }