net: dwc_eth_qos: Add support for bulk RX descriptor cleaning

Add a new desc_per_cacheline property which lets a platform run RX descriptor
cleanup once per power-of-2 received packets (i.e. every desc_per_cacheline
packets) instead of after every packet. This is useful on platforms where the
per-descriptor footprint, i.e. (axi_bus_width EQOS_AXI_WIDTH_n * DMA DSL
inter-descriptor word skip count + DMA descriptor size), is smaller than the
cache line size, which necessitates packing multiple DMA descriptors into a
single cache line.
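
For illustration, here is a minimal standalone sketch of that arithmetic;
the numbers are assumptions for the example only (a 64-byte cache line,
16-byte descriptors, i.e. 4 x u32, and a 32-bit AXI bus, i.e. an
axi_bus_width of 4 bytes). It mirrors the desc_size/desc_per_cacheline
computation added to eqos_probe_resources_core() below:

  #include <stdio.h>

  #define ARCH_DMA_MINALIGN             64   /* assumed cache line size */
  #define EQOS_DMA_CH0_CONTROL_DSL_MASK 0x7  /* maximum DSL skip count */
  #define ALIGN(x, a) (((x) + (a) - 1) & ~((a) - 1))

  /* 1-based index of the most significant set bit, like the kernel fls() */
  static unsigned int fls_sketch(unsigned int x)
  {
      unsigned int r = 0;

      while (x) {
          x >>= 1;
          r++;
      }
      return r;
  }

  int main(void)
  {
      unsigned int desc_sz = 16;        /* sizeof(struct eqos_desc) */
      unsigned int axi_bus_width = 4;   /* e.g. a 32-bit AXI bus */
      unsigned int desc_step, desc_size, desc_per_cacheline;

      /* Maximum distance between neighboring descriptors, in bytes */
      desc_step = desc_sz + EQOS_DMA_CH0_CONTROL_DSL_MASK * axi_bus_width;
      if (desc_step < ARCH_DMA_MINALIGN)
          /* round the stride down to a power of two: BIT(fls(x) - 1) */
          desc_size = 1U << (fls_sketch(desc_step) - 1);
      else
          desc_size = ALIGN(desc_sz, ARCH_DMA_MINALIGN);

      desc_per_cacheline = ARCH_DMA_MINALIGN / desc_size;

      /* prints: desc_step=44 desc_size=32 desc_per_cacheline=2 */
      printf("desc_step=%u desc_size=%u desc_per_cacheline=%u\n",
             desc_step, desc_size, desc_per_cacheline);
      return 0;
  }

With these example values, two descriptors share each 64-byte cache line,
so the driver must re-arm them two at a time.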

In the case of TX descriptors, this is not a problem, since the driver always
does synchronous TX, i.e. each TX descriptor is written, flushed and polled
for completion in eqos_send().

In the case of RX descriptors, their status must be updated in bulk, i.e.
only after an entire cache line worth of RX descriptors has been used up to
receive data.
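
As a sketch of the resulting refill cadence (desc_per_cacheline = 2 is an
assumed example value), the cleanup in eqos_free_pkt() below is gated on an
index mask, so an entire cache line of descriptors is re-armed at once:

  #include <stdio.h>

  int main(void)
  {
      unsigned int desc_per_cacheline = 2;  /* assumed example value */
      unsigned int idx_mask = desc_per_cacheline - 1;
      unsigned int rx_desc_idx;

      for (rx_desc_idx = 0; rx_desc_idx < 8; rx_desc_idx++) {
          /* Skip until the last descriptor of the line is consumed */
          if ((rx_desc_idx & idx_mask) != idx_mask)
              continue;
          /* Re-arm the whole line and bump the tail pointer once */
          printf("packet %u: re-arm descriptors %u..%u\n", rx_desc_idx,
                 rx_desc_idx - idx_mask, rx_desc_idx);
      }
      return 0;
  }

This prints a re-arm line after packets 1, 3, 5 and 7, i.e. once per two
received packets rather than once per packet.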

Signed-off-by: Marek Vasut <marex@denx.de>
Reviewed-by: Patrice Chotard <patrice.chotard@foss.st.com>
Reviewed-by: Ramon Fried <rfried.dev@gmail.com>
diff --git a/drivers/net/dwc_eth_qos.c b/drivers/net/dwc_eth_qos.c
index dde2c18..afc47b5 100644
--- a/drivers/net/dwc_eth_qos.c
+++ b/drivers/net/dwc_eth_qos.c
@@ -75,7 +75,7 @@
  */
 static void *eqos_alloc_descs(struct eqos_priv *eqos, unsigned int num)
 {
-	return memalign(eqos->desc_size, num * eqos->desc_size);
+	return memalign(ARCH_DMA_MINALIGN, num * eqos->desc_size);
 }
 
 static void eqos_free_descs(void *descs)
@@ -92,7 +92,7 @@
 
 void eqos_inval_desc_generic(void *desc)
 {
-	unsigned long start = (unsigned long)desc;
+	unsigned long start = (unsigned long)desc & ~(ARCH_DMA_MINALIGN - 1);
 	unsigned long end = ALIGN(start + sizeof(struct eqos_desc),
 				  ARCH_DMA_MINALIGN);
 
@@ -101,7 +101,7 @@
 
 void eqos_flush_desc_generic(void *desc)
 {
-	unsigned long start = (unsigned long)desc;
+	unsigned long start = (unsigned long)desc & ~(ARCH_DMA_MINALIGN - 1);
 	unsigned long end = ALIGN(start + sizeof(struct eqos_desc),
 				  ARCH_DMA_MINALIGN);
 
@@ -1185,6 +1185,7 @@
 static int eqos_free_pkt(struct udevice *dev, uchar *packet, int length)
 {
 	struct eqos_priv *eqos = dev_get_priv(dev);
+	u32 idx, idx_mask = eqos->desc_per_cacheline - 1;
 	uchar *packet_expected;
 	struct eqos_desc *rx_desc;
 
@@ -1200,24 +1201,30 @@
 
 	eqos->config->ops->eqos_inval_buffer(packet, length);
 
-	rx_desc = eqos_get_desc(eqos, eqos->rx_desc_idx, true);
-
-	rx_desc->des0 = 0;
-	mb();
-	eqos->config->ops->eqos_flush_desc(rx_desc);
-	eqos->config->ops->eqos_inval_buffer(packet, length);
-	rx_desc->des0 = (u32)(ulong)packet;
-	rx_desc->des1 = 0;
-	rx_desc->des2 = 0;
-	/*
-	 * Make sure that if HW sees the _OWN write below, it will see all the
-	 * writes to the rest of the descriptor too.
-	 */
-	mb();
-	rx_desc->des3 = EQOS_DESC3_OWN | EQOS_DESC3_BUF1V;
-	eqos->config->ops->eqos_flush_desc(rx_desc);
-
-	writel((ulong)rx_desc, &eqos->dma_regs->ch0_rxdesc_tail_pointer);
+	if ((eqos->rx_desc_idx & idx_mask) == idx_mask) {
+		for (idx = eqos->rx_desc_idx - idx_mask;
+		     idx <= eqos->rx_desc_idx;
+		     idx++) {
+			rx_desc = eqos_get_desc(eqos, idx, true);
+			rx_desc->des0 = 0;
+			mb();
+			eqos->config->ops->eqos_flush_desc(rx_desc);
+			eqos->config->ops->eqos_inval_buffer(packet, length);
+			rx_desc->des0 = (u32)(ulong)(eqos->rx_dma_buf +
+					     (idx * EQOS_MAX_PACKET_SIZE));
+			rx_desc->des1 = 0;
+			rx_desc->des2 = 0;
+			/*
+			 * Make sure that if HW sees the _OWN write below,
+			 * it will see all the writes to the rest of the
+			 * descriptor too.
+			 */
+			mb();
+			rx_desc->des3 = EQOS_DESC3_OWN | EQOS_DESC3_BUF1V;
+			eqos->config->ops->eqos_flush_desc(rx_desc);
+		}
+		writel((ulong)rx_desc, &eqos->dma_regs->ch0_rxdesc_tail_pointer);
+	}
 
 	eqos->rx_desc_idx++;
 	eqos->rx_desc_idx %= EQOS_DESCRIPTORS_RX;
@@ -1228,12 +1235,26 @@
 static int eqos_probe_resources_core(struct udevice *dev)
 {
 	struct eqos_priv *eqos = dev_get_priv(dev);
+	unsigned int desc_step;
 	int ret;
 
 	debug("%s(dev=%p):\n", __func__, dev);
 
-	eqos->desc_size = ALIGN(sizeof(struct eqos_desc),
-				(unsigned int)ARCH_DMA_MINALIGN);
+	/* Maximum distance between neighboring descriptors, in Bytes. */
+	desc_step = sizeof(struct eqos_desc) +
+		    EQOS_DMA_CH0_CONTROL_DSL_MASK * eqos->config->axi_bus_width;
+	if (desc_step < ARCH_DMA_MINALIGN) {
+		/*
+		 * The EQoS hardware implementation cannot place one descriptor
+		 * per cacheline, it is necessary to place multiple descriptors
+		 * per cacheline in memory and do cache management carefully.
+		 */
+		eqos->desc_size = BIT(fls(desc_step) - 1);
+	} else {
+		eqos->desc_size = ALIGN(sizeof(struct eqos_desc),
+					(unsigned int)ARCH_DMA_MINALIGN);
+	}
+	eqos->desc_per_cacheline = ARCH_DMA_MINALIGN / eqos->desc_size;
 
 	eqos->tx_descs = eqos_alloc_descs(eqos, EQOS_DESCRIPTORS_TX);
 	if (!eqos->tx_descs) {
diff --git a/drivers/net/dwc_eth_qos.h b/drivers/net/dwc_eth_qos.h
index e3e43c8..8fccd6f 100644
--- a/drivers/net/dwc_eth_qos.h
+++ b/drivers/net/dwc_eth_qos.h
@@ -162,6 +162,7 @@
 #define EQOS_DMA_SYSBUS_MODE_BLEN4			BIT(1)
 
 #define EQOS_DMA_CH0_CONTROL_DSL_SHIFT			18
+#define EQOS_DMA_CH0_CONTROL_DSL_MASK			0x7
 #define EQOS_DMA_CH0_CONTROL_PBLX8			BIT(16)
 
 #define EQOS_DMA_CH0_TX_CONTROL_TXPBL_SHIFT		16
@@ -268,6 +269,7 @@
 	void *rx_descs;
 	int tx_desc_idx, rx_desc_idx;
 	unsigned int desc_size;
+	unsigned int desc_per_cacheline;
 	void *tx_dma_buf;
 	void *rx_dma_buf;
 	void *rx_pkt;
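
For reference, a minimal standalone sketch (not part of the patch; names and
values are illustrative) of why eqos_inval_desc_generic() and
eqos_flush_desc_generic() above now round the start address down: once
several descriptors share one cache line, maintenance on any one of them
must cover the whole aligned line:

  #include <stdio.h>

  #define ARCH_DMA_MINALIGN 64  /* assumed cache line size */
  #define ALIGN(x, a) (((x) + (a) - 1) & ~((a) - 1))

  /* Cache-line-aligned range covering one 16-byte descriptor */
  static void desc_range(unsigned long desc, unsigned long *start,
                         unsigned long *end)
  {
      *start = desc & ~(ARCH_DMA_MINALIGN - 1);      /* round down */
      *end = ALIGN(*start + 16, ARCH_DMA_MINALIGN);  /* round up */
  }

  int main(void)
  {
      unsigned long start, end;

      /* A descriptor in the middle of a line maps to the full line */
      desc_range(0x1020, &start, &end);
      printf("inval/flush [0x%lx, 0x%lx)\n", start, end);  /* 0x1000, 0x1040 */
      return 0;
  }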