/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Copyright (C) 2013 Regents of the University of California
 */

#include <linux/linkage.h>
#include <asm/asm.h>

/* void *memcpy(void *, const void *, size_t) */
ENTRY(__memcpy)
WEAK(memcpy)
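	/* Nothing to do if dst and src are the same pointer */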
	beq	a0, a1, .copy_end
	/* Save for return value */
	mv	t6, a0

	/*
	 * Register allocation for code below:
	 * a0 - start of uncopied dst
	 * a1 - start of uncopied src
	 * t0 - end of uncopied dst
	 */
	add	t0, a0, a2

	/*
	 * Use bytewise copy if too small.
	 *
	 * This threshold must be at least 2*SZREG to ensure at least one
	 * wordwise copy is performed. It is chosen to be 16 because it will
	 * save at least 7 iterations of bytewise copy, which pays off the
	 * fixed overhead.
	 */
	li	a3, 16
	bltu	a2, a3, .Lbyte_copy_tail

	/*
	 * Bytewise copy first to align a0 to word boundary.
	 */
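	/*
	 * a2 (the length) is no longer needed here since t0 already marks the
	 * end of dst, so a2 is reused as dst rounded up to the next SZREG
	 * boundary. If dst is already aligned, skip the byte loop.
	 */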
	addi	a2, a0, SZREG-1
	andi	a2, a2, ~(SZREG-1)
	beq	a0, a2, 2f
1:
	lb	a5, 0(a1)
	addi	a1, a1, 1
	sb	a5, 0(a0)
	addi	a0, a0, 1
	bne	a0, a2, 1b
2:

	/*
	 * Now a0 is word-aligned. If a1 is also word aligned, we could perform
	 * aligned word-wise copy. Otherwise we need to perform misaligned
	 * word-wise copy.
	 */
	andi	a3, a1, SZREG-1
	bnez	a3, .Lmisaligned_word_copy

	/* Unrolled wordwise copy */
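	/*
	 * t0 is biased down by 16*SZREG-1 so the loop below continues only
	 * while a full 16-word block remains. Loads and stores are grouped
	 * as 11 + 5 words because only eleven scratch registers
	 * (a2-a7, t1-t5) are free to hold data.
	 */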
	addi	t0, t0, -(16*SZREG-1)
	bgeu	a0, t0, 2f
1:
	REG_L	a2, 0(a1)
	REG_L	a3, SZREG(a1)
	REG_L	a4, 2*SZREG(a1)
	REG_L	a5, 3*SZREG(a1)
	REG_L	a6, 4*SZREG(a1)
	REG_L	a7, 5*SZREG(a1)
	REG_L	t1, 6*SZREG(a1)
	REG_L	t2, 7*SZREG(a1)
	REG_L	t3, 8*SZREG(a1)
	REG_L	t4, 9*SZREG(a1)
	REG_L	t5, 10*SZREG(a1)
	REG_S	a2, 0(a0)
	REG_S	a3, SZREG(a0)
	REG_S	a4, 2*SZREG(a0)
	REG_S	a5, 3*SZREG(a0)
	REG_S	a6, 4*SZREG(a0)
	REG_S	a7, 5*SZREG(a0)
	REG_S	t1, 6*SZREG(a0)
	REG_S	t2, 7*SZREG(a0)
	REG_S	t3, 8*SZREG(a0)
	REG_S	t4, 9*SZREG(a0)
	REG_S	t5, 10*SZREG(a0)
	REG_L	a2, 11*SZREG(a1)
	REG_L	a3, 12*SZREG(a1)
	REG_L	a4, 13*SZREG(a1)
	REG_L	a5, 14*SZREG(a1)
	REG_L	a6, 15*SZREG(a1)
	addi	a1, a1, 16*SZREG
	REG_S	a2, 11*SZREG(a0)
	REG_S	a3, 12*SZREG(a0)
	REG_S	a4, 13*SZREG(a0)
	REG_S	a5, 14*SZREG(a0)
	REG_S	a6, 15*SZREG(a0)
	addi	a0, a0, 16*SZREG
	bltu	a0, t0, 1b
2:
	/* Post-loop increment by 16*SZREG-1 and pre-loop decrement by SZREG-1 */
	addi	t0, t0, 15*SZREG
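	/*
	 * After this adjustment t0 equals the end of dst minus (SZREG-1), so
	 * the loop below runs while at least one full word is left to copy.
	 */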
	/* Wordwise copy */
	bgeu	a0, t0, 2f
1:
	REG_L	a5, 0(a1)
	addi	a1, a1, SZREG
	REG_S	a5, 0(a0)
	addi	a0, a0, SZREG
	bltu	a0, t0, 1b
2:
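	/* Undo the SZREG-1 bias so t0 points to the true end of dst again */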
	addi	t0, t0, SZREG-1

.Lbyte_copy_tail:
	/*
	 * Bytewise copy anything left.
	 */
	beq	a0, t0, 2f
1:
	lb	a5, 0(a1)
	addi	a1, a1, 1
	sb	a5, 0(a0)
	addi	a0, a0, 1
	bne	a0, t0, 1b
2:

	mv	a0, t6
.copy_end:
	ret

.Lmisaligned_word_copy:
	/*
	 * Misaligned word-wise copy.
	 * For misaligned copy we still perform word-wise copy, but we need to
	 * use the value fetched from the previous iteration and do some shifts.
	 * This is safe because we wouldn't access more words than necessary.
	 */

	/* Calculate shifts */
	slli	t3, a3, 3
	sub	t4, x0, t3 /* negate is okay as shift will only look at LSBs */

	/* Load the initial value and align a1 */
	andi	a1, a1, ~(SZREG-1)
	REG_L	a5, 0(a1)

	addi	t0, t0, -(SZREG-1)
	/* At least one iteration will be executed here, no check */
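	/*
	 * Each iteration merges the upper bytes of the previously loaded word
	 * (shifted right by t3 bits) with the lower bytes of the next word
	 * (shifted left by t4, i.e. XLEN - t3) into one aligned store.
	 */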
1:
	srl	a4, a5, t3
	REG_L	a5, SZREG(a1)
	addi	a1, a1, SZREG
	sll	a2, a5, t4
	or	a2, a2, a4
	REG_S	a2, 0(a0)
	addi	a0, a0, SZREG
	bltu	a0, t0, 1b

	/* Update pointers to correct value */
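	/*
	 * t0 gets its SZREG-1 bias removed, and a1, which was rounded down to
	 * a word boundary above, gets the original misalignment a3 added back
	 * for the byte tail.
	 */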
	addi	t0, t0, SZREG-1
	add	a1, a1, a3

	j	.Lbyte_copy_tail
END(__memcpy)