riscv: Add Zbb support for building U-Boot

This patch appends the Zbb extension to the -march string, allowing
the toolchain to generate Zbb instructions for U-Boot binaries, and
imports the Zbb-optimized strlen, strcmp and strncmp routines from
the Linux kernel.

Signed-off-by: Yu Chien Peter Lin <peterlin@andestech.com>
Reviewed-by: Leo Yu-Chi Liang <ycliang@andestech.com>
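---

Note: the sltu/neg/ori tail shared by strcmp_zbb.S and strncmp_zbb.S
turns an unsigned word comparison into the +/-1 return value without
a branch. A C sketch of the same trick (illustrative only, not part
of the patch):

	#include <stdint.h>

	/* Branchless (t0 >= t1) ? 1 : -1, mirroring
	 * "sltu a0, t0, t1; neg a0, a0; ori a0, a0, 1". */
	static int word_cmp_result(uint64_t t0, uint64_t t1)
	{
		int lt = t0 < t1;	/* sltu: 1 if t0 < t1, else 0 */

		return -lt | 1;		/* neg + ori: -1 if lt, else +1 */
	}
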
diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig
index 49b6e1a..e291456 100644
--- a/arch/riscv/Kconfig
+++ b/arch/riscv/Kconfig
@@ -187,6 +187,97 @@
 	  riscv32 ABI from ilp32 to ilp32d and the riscv64 ABI from lp64 to
 	  lp64d.
 
+config RISCV_ISA_ZBB
+	bool "Zbb extension support for bit manipulation instructions"
+	help
+	  Adds the Zbb extension (basic bit manipulation) to the ISA subsets
+	  that the toolchain is allowed to emit when building U-Boot.
+	  The Zbb extension provides instructions to accelerate a number
+	  of bit-specific operations (population count, sign extension,
+	  bit rotation, etc.) and enables optimized string routines.
+
+menu "Use assembly optimized implementation of string routines"
+
+config USE_ARCH_STRLEN
+	bool "Use an assembly-optimized implementation of strlen"
+	default y
+	depends on RISCV_ISA_ZBB
+	help
+	  Enable the generation of an optimized version of strlen using
+	  the Zbb extension.
+
+config SPL_USE_ARCH_STRLEN
+	bool "Use an assembly-optimized implementation of strlen for SPL"
+	default y if USE_ARCH_STRLEN
+	depends on RISCV_ISA_ZBB
+	depends on SPL
+	help
+	  Enable the generation of an optimized version of strlen using
+	  the Zbb extension.
+
+config TPL_USE_ARCH_STRLEN
+	bool "Use an assembly-optimized implementation of strlen for TPL"
+	default y if USE_ARCH_STRLEN
+	depends on RISCV_ISA_ZBB
+	depends on TPL
+	help
+	  Enable the generation of an optimized version of strlen using
+	  the Zbb extension.
+
+config USE_ARCH_STRCMP
+	bool "Use an assembly-optimized implementation of strcmp"
+	default y
+	depends on RISCV_ISA_ZBB
+	help
+	  Enable the generation of an optimized version of strcmp using
+	  the Zbb extension.
+
+config SPL_USE_ARCH_STRCMP
+	bool "Use an assembly-optimized implementation of strcmp for SPL"
+	default y if USE_ARCH_STRCMP
+	depends on RISCV_ISA_ZBB
+	depends on SPL
+	help
+	  Enable the generation of an optimized version of strcmp using
+	  the Zbb extension.
+
+config TPL_USE_ARCH_STRCMP
+	bool "Use an assembly-optimized implementation of strcmp for TPL"
+	default y if USE_ARCH_STRCMP
+	depends on RISCV_ISA_ZBB
+	depends on TPL
+	help
+	  Enable the generation of an optimized version of strcmp using
+	  the Zbb extension.
+
+config USE_ARCH_STRNCMP
+	bool "Use an assembly-optimized implementation of strncmp"
+	default y
+	depends on RISCV_ISA_ZBB
+	help
+	  Enable the generation of an optimized version of strncmp using
+	  the Zbb extension.
+
+config SPL_USE_ARCH_STRNCMP
+	bool "Use an assembly-optimized implementation of strncmp for SPL"
+	default y if USE_ARCH_STRNCMP
+	depends on RISCV_ISA_ZBB
+	depends on SPL
+	help
+	  Enable the generation of an optimized version of strncmp using
+	  the Zbb extension.
+
+config TPL_USE_ARCH_STRNCMP
+	bool "Use an assembly-optimized implementation of strncmp for TPL"
+	default y if USE_ARCH_STRNCMP
+	depends on RISCV_ISA_ZBB
+	depends on TPL
+	help
+	  Enable the generation of an optimized version of strncmp using
+	  the Zbb extension.
+
+endmenu
+
 config RISCV_ISA_A
 	def_bool y
 
diff --git a/arch/riscv/Makefile b/arch/riscv/Makefile
index 4963b51..b3ef870 100644
--- a/arch/riscv/Makefile
+++ b/arch/riscv/Makefile
@@ -24,6 +24,9 @@
 ifeq ($(CONFIG_RISCV_ISA_C),y)
 	ARCH_C = c
 endif
+ifeq ($(CONFIG_RISCV_ISA_ZBB),y)
+	ARCH_ZBB = _zbb
+endif
 ifeq ($(CONFIG_CMODEL_MEDLOW),y)
 	CMODEL = medlow
 endif
@@ -32,7 +35,7 @@
 endif
 
 
-RISCV_MARCH = $(ARCH_BASE)$(ARCH_A)$(ARCH_F)$(ARCH_D)$(ARCH_C)
+RISCV_MARCH = $(ARCH_BASE)$(ARCH_A)$(ARCH_F)$(ARCH_D)$(ARCH_C)$(ARCH_ZBB)
 ABI = $(ABI_BASE)$(ABI_D)
 
 # Newer binutils versions default to ISA spec version 20191213 which moves some
diff --git a/arch/riscv/include/asm/string.h b/arch/riscv/include/asm/string.h
index 7dee3e4..38ad85f 100644
--- a/arch/riscv/include/asm/string.h
+++ b/arch/riscv/include/asm/string.h
@@ -40,4 +40,22 @@
 #endif
 extern void *memset(void *, int, __kernel_size_t);
 
+#undef __HAVE_ARCH_STRLEN
+#if CONFIG_IS_ENABLED(USE_ARCH_STRLEN)
+#define __HAVE_ARCH_STRLEN
+#endif
+extern __kernel_size_t strlen(const char *);
+
+#undef __HAVE_ARCH_STRCMP
+#if CONFIG_IS_ENABLED(USE_ARCH_STRCMP)
+#define __HAVE_ARCH_STRCMP
+#endif
+extern int strcmp(const char *, const char *);
+
+#undef __HAVE_ARCH_STRNCMP
+#if CONFIG_IS_ENABLED(USE_ARCH_STRNCMP)
+#define __HAVE_ARCH_STRNCMP
+#endif
+extern int strncmp(const char *, const char *, __kernel_size_t);
+
 #endif /* __ASM_RISCV_STRING_H */
diff --git a/arch/riscv/lib/Makefile b/arch/riscv/lib/Makefile
index 02c4d8f..9a05b66 100644
--- a/arch/riscv/lib/Makefile
+++ b/arch/riscv/lib/Makefile
@@ -42,5 +42,8 @@
 obj-$(CONFIG_$(SPL_TPL_)USE_ARCH_MEMSET) += memset.o
 obj-$(CONFIG_$(SPL_TPL_)USE_ARCH_MEMMOVE) += memmove.o
 obj-$(CONFIG_$(SPL_TPL_)USE_ARCH_MEMCPY) += memcpy.o
+obj-$(CONFIG_$(SPL_TPL_)USE_ARCH_STRLEN) += strlen_zbb.o
+obj-$(CONFIG_$(SPL_TPL_)USE_ARCH_STRCMP) += strcmp_zbb.o
+obj-$(CONFIG_$(SPL_TPL_)USE_ARCH_STRNCMP) += strncmp_zbb.o
 
 obj-$(CONFIG_$(SPL_TPL_)SEMIHOSTING) += semihosting.o
diff --git a/arch/riscv/lib/strcmp_zbb.S b/arch/riscv/lib/strcmp_zbb.S
new file mode 100644
index 0000000..e5a367c
--- /dev/null
+++ b/arch/riscv/lib/strcmp_zbb.S
@@ -0,0 +1,81 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Taken from Linux arch/riscv/lib/strcmp.S
+ */
+
+#include <linux/linkage.h>
+#include <asm/asm.h>
+
+ENTRY(__strcmp)
+WEAK(strcmp)
+.option push
+.option arch,+zbb
+	/*
+	 * Returns
+	 *   a0 - comparison result, value like strcmp
+	 *
+	 * Parameters
+	 *   a0 - string1
+	 *   a1 - string2
+	 *
+	 * Clobbers
+	 *   t0, t1, t2, t3, t4
+	 */
+
+	or	t2, a0, a1
+	li	t4, -1
+	and	t2, t2, SZREG-1
+	bnez	t2, 3f
+
+	/* Main loop for aligned string.  */
+	.p2align 3
+1:
+	REG_L	t0, 0(a0)
+	REG_L	t1, 0(a1)
+	orc.b	t3, t0
+	bne	t3, t4, 2f
+	addi	a0, a0, SZREG
+	addi	a1, a1, SZREG
+	beq	t0, t1, 1b
+
+	/*
+	 * Words don't match, and no null byte in the first
+	 * word. Get bytes in big-endian order and compare.
+	 */
+#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
+	rev8	t0, t0
+	rev8	t1, t1
+#endif
+
+	/* Synthesize (t0 >= t1) ? 1 : -1 in a branchless sequence. */
+	sltu	a0, t0, t1
+	neg	a0, a0
+	ori	a0, a0, 1
+	ret
+
+2:
+	/*
+	 * Found a null byte.
+	 * If words don't match, fall back to simple loop.
+	 */
+	bne	t0, t1, 3f
+
+	/* Otherwise, strings are equal. */
+	li	a0, 0
+	ret
+
+	/* Simple loop for misaligned strings. */
+	.p2align 3
+3:
+	lbu	t0, 0(a0)
+	lbu	t1, 0(a1)
+	addi	a0, a0, 1
+	addi	a1, a1, 1
+	bne	t0, t1, 4f
+	bnez	t0, 3b
+
+4:
+	sub	a0, t0, t1
+	ret
+.option pop
+END(__strcmp)
diff --git a/arch/riscv/lib/strlen_zbb.S b/arch/riscv/lib/strlen_zbb.S
new file mode 100644
index 0000000..bd8fa4c
--- /dev/null
+++ b/arch/riscv/lib/strlen_zbb.S
@@ -0,0 +1,106 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Taken from Linux arch/riscv/lib/strlen.S
+ */
+
+#include <linux/linkage.h>
+#include <asm/asm.h>
+
+#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
+# define CZ	ctz
+# define SHIFT	srl
+#else
+# define CZ	clz
+# define SHIFT	sll
+#endif
+
+ENTRY(__strlen)
+WEAK(strlen)
+.option push
+.option arch,+zbb
+	/*
+	 * Returns
+	 *   a0 - string length
+	 *
+	 * Parameters
+	 *   a0 - String to measure
+	 *
+	 * Clobbers
+	 *   t0, t1, t2, t3
+	 */
+
+	/* Number of irrelevant bytes in the first word. */
+	andi	t2, a0, SZREG-1
+
+	/* Align pointer. */
+	andi	t0, a0, -SZREG
+
+	li	t3, SZREG
+	sub	t3, t3, t2
+	slli	t2, t2, 3
+
+	/* Get the first word.  */
+	REG_L	t1, 0(t0)
+
+	/*
+	 * Shift away the partial data we loaded to remove the irrelevant bytes
+	 * preceding the string with the effect of adding NUL bytes at the
+	 * end of the string's first word.
+	 */
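+	/*
+	 * E.g. on RV64 little-endian, a 3-byte misaligned start gives
+	 * t2 = 24, so the srl drops the three bytes preceding the string
+	 * and fills the top of the word with 0x00 bytes.
+	 */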
+	SHIFT	t1, t1, t2
+
+	/* Convert non-NUL into 0xff and NUL into 0x00. */
+	orc.b	t1, t1
+
+	/* Convert non-NUL into 0x00 and NUL into 0xff. */
+	not	t1, t1
+
+	/*
+	 * Search for the first set bit (corresponding to a NUL byte in the
+	 * original chunk).
+	 */
+	CZ	t1, t1
+
+	/*
+	 * The first chunk is special: compare against the number
+	 * of valid bytes in this chunk.
+	 */
+	srli	a0, t1, 3
+	bgtu	t3, a0, 2f
+
+	/* Prepare for the word comparison loop. */
+	addi	t2, t0, SZREG
+	li	t3, -1
+
+	/*
+	 * Our critical loop is 4 instructions and processes data in
+	 * 4 byte or 8 byte chunks.
+	 */
+	.p2align 3
+1:
+	REG_L	t1, SZREG(t0)
+	addi	t0, t0, SZREG
+	orc.b	t1, t1
+	beq	t1, t3, 1b
+
+	not	t1, t1
+	CZ	t1, t1
+	srli	t1, t1, 3
+
+	/* Get number of processed bytes. */
+	sub	t2, t0, t2
+
+	/* Add number of characters in the first word.  */
+	add	a0, a0, t2
+
+	/* Add number of characters in the last word.  */
+	add	a0, a0, t1
+2:
+	ret
+.option pop
+END(__strlen)
diff --git a/arch/riscv/lib/strncmp_zbb.S b/arch/riscv/lib/strncmp_zbb.S
new file mode 100644
index 0000000..00e0fec
--- /dev/null
+++ b/arch/riscv/lib/strncmp_zbb.S
@@ -0,0 +1,94 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Taken from Linux arch/riscv/lib/strncmp.S
+ */
+
+#include <linux/linkage.h>
+#include <asm/asm.h>
+
+ENTRY(__strncmp)
+WEAK(strncmp)
+.option push
+.option arch,+zbb
+	/*
+	 * Returns
+	 *   a0 - comparison result, like strncmp
+	 *
+	 * Parameters
+	 *   a0 - string1
+	 *   a1 - string2
+	 *   a2 - number of characters to compare
+	 *
+	 * Clobbers
+	 *   t0, t1, t2, t3, t4, t5, t6
+	 */
+
+	or	t2, a0, a1
+	li	t5, -1
+	and	t2, t2, SZREG-1
+	add	t4, a0, a2
+	bnez	t2, 3f
+
+	/* Adjust limit for fast-path.  */
+	andi	t6, t4, -SZREG
+
+	/* Main loop for aligned string.  */
+	.p2align 3
+1:
+	bge	a0, t6, 3f
+	REG_L	t0, 0(a0)
+	REG_L	t1, 0(a1)
+	orc.b	t3, t0
+	bne	t3, t5, 2f
+	orc.b	t3, t1
+	bne	t3, t5, 2f
+	addi	a0, a0, SZREG
+	addi	a1, a1, SZREG
+	beq	t0, t1, 1b
+
+	/*
+	 * Words don't match, and no null byte in the first
+	 * word. Get bytes in big-endian order and compare.
+	 */
+#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
+	rev8	t0, t0
+	rev8	t1, t1
+#endif
+
+	/* Synthesize (t0 >= t1) ? 1 : -1 in a branchless sequence.  */
+	sltu	a0, t0, t1
+	neg	a0, a0
+	ori	a0, a0, 1
+	ret
+
+2:
+	/*
+	 * Found a null byte.
+	 * If words don't match, fall back to simple loop.
+	 */
+	bne	t0, t1, 3f
+
+	/* Otherwise, strings are equal.  */
+	li	a0, 0
+	ret
+
+	/* Simple loop for misaligned strings.  */
+	.p2align 3
+3:
+	bge	a0, t4, 5f
+	lbu	t0, 0(a0)
+	lbu	t1, 0(a1)
+	addi	a0, a0, 1
+	addi	a1, a1, 1
+	bne	t0, t1, 4f
+	bnez	t0, 3b
+
+4:
+	sub	a0, t0, t1
+	ret
+
+5:
+	li	a0, 0
+	ret
+.option pop
+END(__strncmp)