
Commit 2851330

rmurphy-arm authored and willdeacon committed
arm64: Import latest memcpy()/memmove() implementation
Import the latest implementation of memcpy(), based on the upstream code of string/aarch64/memcpy.S at commit afd6244 from https://github.com/ARM-software/optimized-routines, and subsuming memmove() in the process. Note that for simplicity Arm have chosen to contribute this code to Linux under GPLv2 rather than the original MIT license. Note also that the needs of the usercopy routines vs. regular memcpy() have now diverged so far that we abandon the shared template idea and the damage which that incurred to the tuning of LDP/STP loops. We'll be back to tackle those routines separately in future. Signed-off-by: Robin Murphy <robin.murphy@arm.com> Link: https://lore.kernel.org/r/3c953af43506581b2422f61952261e76949ba711.1622128527.git.robin.murphy@arm.com Signed-off-by: Will Deacon <will@kernel.org>
1 parent b6c4ea4 commit 2851330

3 files changed: 230 additions & 233 deletions


arch/arm64/lib/Makefile

Lines changed: 1 addition & 1 deletion
@@ -1,7 +1,7 @@
 # SPDX-License-Identifier: GPL-2.0
 lib-y	:= clear_user.o delay.o copy_from_user.o	\
 	   copy_to_user.o copy_in_user.o copy_page.o	\
-	   clear_page.o csum.o memchr.o memcpy.o memmove.o \
+	   clear_page.o csum.o memchr.o memcpy.o	\
 	   memset.o memcmp.o strcmp.o strncmp.o strlen.o \
 	   strnlen.o strchr.o strrchr.o tishift.o
 

arch/arm64/lib/memcpy.S

Lines changed: 229 additions & 43 deletions
@@ -1,66 +1,252 @@
 /* SPDX-License-Identifier: GPL-2.0-only */
 /*
- * Copyright (C) 2013 ARM Ltd.
- * Copyright (C) 2013 Linaro.
+ * Copyright (c) 2012-2020, Arm Limited.
  *
- * This code is based on glibc cortex strings work originally authored by Linaro
- * be found @
- *
- * http://bazaar.launchpad.net/~linaro-toolchain-dev/cortex-strings/trunk/
- * files/head:/src/aarch64/
+ * Adapted from the original at:
+ * https://github.com/ARM-software/optimized-routines/blob/master/string/aarch64/memcpy.S
  */
 
 #include <linux/linkage.h>
 #include <asm/assembler.h>
-#include <asm/cache.h>
 
-/*
- * Copy a buffer from src to dest (alignment handled by the hardware)
+/* Assumptions:
+ *
+ * ARMv8-a, AArch64, unaligned accesses.
  *
- * Parameters:
- *	x0 - dest
- *	x1 - src
- *	x2 - n
- * Returns:
- *	x0 - dest
  */
-	.macro ldrb1 reg, ptr, val
-	ldrb \reg, [\ptr], \val
-	.endm
-
-	.macro strb1 reg, ptr, val
-	strb \reg, [\ptr], \val
-	.endm
 
-	.macro ldrh1 reg, ptr, val
-	ldrh \reg, [\ptr], \val
-	.endm
+#define L(label) .L ## label
 
-	.macro strh1 reg, ptr, val
-	strh \reg, [\ptr], \val
-	.endm
+#define dstin	x0
+#define src	x1
+#define count	x2
+#define dst	x3
+#define srcend	x4
+#define dstend	x5
+#define A_l	x6
+#define A_lw	w6
+#define A_h	x7
+#define B_l	x8
+#define B_lw	w8
+#define B_h	x9
+#define C_l	x10
+#define C_lw	w10
+#define C_h	x11
+#define D_l	x12
+#define D_h	x13
+#define E_l	x14
+#define E_h	x15
+#define F_l	x16
+#define F_h	x17
+#define G_l	count
+#define G_h	dst
+#define H_l	src
+#define H_h	srcend
+#define tmp1	x14
 
-	.macro ldr1 reg, ptr, val
-	ldr \reg, [\ptr], \val
-	.endm
+/* This implementation handles overlaps and supports both memcpy and memmove
+   from a single entry point.  It uses unaligned accesses and branchless
+   sequences to keep the code small, simple and improve performance.
 
-	.macro str1 reg, ptr, val
-	str \reg, [\ptr], \val
-	.endm
+   Copies are split into 3 main cases: small copies of up to 32 bytes, medium
+   copies of up to 128 bytes, and large copies.  The overhead of the overlap
+   check is negligible since it is only required for large copies.
 
-	.macro ldp1 reg1, reg2, ptr, val
-	ldp \reg1, \reg2, [\ptr], \val
-	.endm
-
-	.macro stp1 reg1, reg2, ptr, val
-	stp \reg1, \reg2, [\ptr], \val
-	.endm
+   Large copies use a software pipelined loop processing 64 bytes per iteration.
+   The destination pointer is 16-byte aligned to minimize unaligned accesses.
+   The loop tail is handled by always copying 64 bytes from the end.
+*/
 
+SYM_FUNC_START_ALIAS(__memmove)
+SYM_FUNC_START_WEAK_ALIAS_PI(memmove)
 SYM_FUNC_START_ALIAS(__memcpy)
 SYM_FUNC_START_WEAK_PI(memcpy)
-#include "copy_template.S"
+	add	srcend, src, count
+	add	dstend, dstin, count
+	cmp	count, 128
+	b.hi	L(copy_long)
+	cmp	count, 32
+	b.hi	L(copy32_128)
+
+	/* Small copies: 0..32 bytes.  */
+	cmp	count, 16
+	b.lo	L(copy16)
+	ldp	A_l, A_h, [src]
+	ldp	D_l, D_h, [srcend, -16]
+	stp	A_l, A_h, [dstin]
+	stp	D_l, D_h, [dstend, -16]
+	ret
+
+	/* Copy 8-15 bytes.  */
+L(copy16):
+	tbz	count, 3, L(copy8)
+	ldr	A_l, [src]
+	ldr	A_h, [srcend, -8]
+	str	A_l, [dstin]
+	str	A_h, [dstend, -8]
+	ret
+
+	.p2align 3
+	/* Copy 4-7 bytes.  */
+L(copy8):
+	tbz	count, 2, L(copy4)
+	ldr	A_lw, [src]
+	ldr	B_lw, [srcend, -4]
+	str	A_lw, [dstin]
+	str	B_lw, [dstend, -4]
+	ret
+
+	/* Copy 0..3 bytes using a branchless sequence.  */
+L(copy4):
+	cbz	count, L(copy0)
+	lsr	tmp1, count, 1
+	ldrb	A_lw, [src]
+	ldrb	C_lw, [srcend, -1]
+	ldrb	B_lw, [src, tmp1]
+	strb	A_lw, [dstin]
+	strb	B_lw, [dstin, tmp1]
+	strb	C_lw, [dstend, -1]
+L(copy0):
+	ret
+
+	.p2align 4
+	/* Medium copies: 33..128 bytes.  */
+L(copy32_128):
+	ldp	A_l, A_h, [src]
+	ldp	B_l, B_h, [src, 16]
+	ldp	C_l, C_h, [srcend, -32]
+	ldp	D_l, D_h, [srcend, -16]
+	cmp	count, 64
+	b.hi	L(copy128)
+	stp	A_l, A_h, [dstin]
+	stp	B_l, B_h, [dstin, 16]
+	stp	C_l, C_h, [dstend, -32]
+	stp	D_l, D_h, [dstend, -16]
 	ret
+
+	.p2align 4
+	/* Copy 65..128 bytes.  */
+L(copy128):
+	ldp	E_l, E_h, [src, 32]
+	ldp	F_l, F_h, [src, 48]
+	cmp	count, 96
+	b.ls	L(copy96)
+	ldp	G_l, G_h, [srcend, -64]
+	ldp	H_l, H_h, [srcend, -48]
+	stp	G_l, G_h, [dstend, -64]
+	stp	H_l, H_h, [dstend, -48]
+L(copy96):
+	stp	A_l, A_h, [dstin]
+	stp	B_l, B_h, [dstin, 16]
+	stp	E_l, E_h, [dstin, 32]
+	stp	F_l, F_h, [dstin, 48]
+	stp	C_l, C_h, [dstend, -32]
+	stp	D_l, D_h, [dstend, -16]
+	ret
+
+	.p2align 4
+	/* Copy more than 128 bytes.  */
+L(copy_long):
+	/* Use backwards copy if there is an overlap.  */
+	sub	tmp1, dstin, src
+	cbz	tmp1, L(copy0)
+	cmp	tmp1, count
+	b.lo	L(copy_long_backwards)
+
+	/* Copy 16 bytes and then align dst to 16-byte alignment.  */
+
+	ldp	D_l, D_h, [src]
+	and	tmp1, dstin, 15
+	bic	dst, dstin, 15
+	sub	src, src, tmp1
+	add	count, count, tmp1	/* Count is now 16 too large.  */
+	ldp	A_l, A_h, [src, 16]
+	stp	D_l, D_h, [dstin]
+	ldp	B_l, B_h, [src, 32]
+	ldp	C_l, C_h, [src, 48]
+	ldp	D_l, D_h, [src, 64]!
+	subs	count, count, 128 + 16	/* Test and readjust count.  */
+	b.ls	L(copy64_from_end)
+
+L(loop64):
+	stp	A_l, A_h, [dst, 16]
+	ldp	A_l, A_h, [src, 16]
+	stp	B_l, B_h, [dst, 32]
+	ldp	B_l, B_h, [src, 32]
+	stp	C_l, C_h, [dst, 48]
+	ldp	C_l, C_h, [src, 48]
+	stp	D_l, D_h, [dst, 64]!
+	ldp	D_l, D_h, [src, 64]!
+	subs	count, count, 64
+	b.hi	L(loop64)
+
+	/* Write the last iteration and copy 64 bytes from the end.  */
+L(copy64_from_end):
+	ldp	E_l, E_h, [srcend, -64]
+	stp	A_l, A_h, [dst, 16]
+	ldp	A_l, A_h, [srcend, -48]
+	stp	B_l, B_h, [dst, 32]
+	ldp	B_l, B_h, [srcend, -32]
+	stp	C_l, C_h, [dst, 48]
+	ldp	C_l, C_h, [srcend, -16]
+	stp	D_l, D_h, [dst, 64]
+	stp	E_l, E_h, [dstend, -64]
+	stp	A_l, A_h, [dstend, -48]
+	stp	B_l, B_h, [dstend, -32]
+	stp	C_l, C_h, [dstend, -16]
+	ret
+
+	.p2align 4
+
+	/* Large backwards copy for overlapping copies.
+	   Copy 16 bytes and then align dst to 16-byte alignment.  */
+L(copy_long_backwards):
+	ldp	D_l, D_h, [srcend, -16]
+	and	tmp1, dstend, 15
+	sub	srcend, srcend, tmp1
+	sub	count, count, tmp1
+	ldp	A_l, A_h, [srcend, -16]
+	stp	D_l, D_h, [dstend, -16]
+	ldp	B_l, B_h, [srcend, -32]
+	ldp	C_l, C_h, [srcend, -48]
+	ldp	D_l, D_h, [srcend, -64]!
+	sub	dstend, dstend, tmp1
+	subs	count, count, 128
+	b.ls	L(copy64_from_start)
+
+L(loop64_backwards):
+	stp	A_l, A_h, [dstend, -16]
+	ldp	A_l, A_h, [srcend, -16]
+	stp	B_l, B_h, [dstend, -32]
+	ldp	B_l, B_h, [srcend, -32]
+	stp	C_l, C_h, [dstend, -48]
+	ldp	C_l, C_h, [srcend, -48]
+	stp	D_l, D_h, [dstend, -64]!
+	ldp	D_l, D_h, [srcend, -64]!
+	subs	count, count, 64
+	b.hi	L(loop64_backwards)
+
+	/* Write the last iteration and copy 64 bytes from the start.  */
+L(copy64_from_start):
+	ldp	G_l, G_h, [src, 48]
+	stp	A_l, A_h, [dstend, -16]
+	ldp	A_l, A_h, [src, 32]
+	stp	B_l, B_h, [dstend, -32]
+	ldp	B_l, B_h, [src, 16]
+	stp	C_l, C_h, [dstend, -48]
+	ldp	C_l, C_h, [src]
+	stp	D_l, D_h, [dstend, -64]
+	stp	G_l, G_h, [dstin, 48]
+	stp	A_l, A_h, [dstin, 32]
+	stp	B_l, B_h, [dstin, 16]
+	stp	C_l, C_h, [dstin]
+	ret
+
 SYM_FUNC_END_PI(memcpy)
 EXPORT_SYMBOL(memcpy)
 SYM_FUNC_END_ALIAS(__memcpy)
 EXPORT_SYMBOL(__memcpy)
+SYM_FUNC_END_ALIAS_PI(memmove)
+EXPORT_SYMBOL(memmove)
+SYM_FUNC_END_ALIAS(__memmove)
+EXPORT_SYMBOL(__memmove)
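
The comment block in the new file summarizes the structure well: everything up to 128 bytes is copied by loading both ends of the buffer before storing either, so the two halves may overlap and a whole range of lengths is handled by the same straight-line code. Below is a rough C rendering of that idea. It is a sketch only: the name copy_up_to_128 and the memcpy() calls into locals, which stand in for the LDP/STP register pairs, are illustrative and not anything in the kernel.

	#include <stddef.h>
	#include <stdint.h>
	#include <string.h>

	/* Sketch of the small/medium paths: perform all loads before any
	   store, so overlapping halves are safe (memmove semantics). */
	static void *copy_up_to_128(void *dstin, const void *srcin, size_t count)
	{
		unsigned char *dst = dstin;
		const unsigned char *src = srcin;

		if (count > 32) {			/* 33..128 bytes */
			uint8_t head[64], tail[64];
			size_t h = count > 64 ? 64 : 32; /* bytes from the front */
			size_t t = count > 96 ? 64 : 32; /* bytes from the back  */

			memcpy(head, src, h);		 /* all loads first...   */
			memcpy(tail, src + count - t, t);
			memcpy(dst, head, h);		 /* ...then all stores;  */
			memcpy(dst + count - t, tail, t);/* the middle overlaps  */
		} else if (count >= 16) {		/* 16..32: two 16B halves */
			uint8_t a[16], b[16];
			memcpy(a, src, 16);
			memcpy(b, src + count - 16, 16);
			memcpy(dst, a, 16);
			memcpy(dst + count - 16, b, 16);
		} else if (count & 8) {			/* 8..15 bytes */
			uint64_t a, b;
			memcpy(&a, src, 8);
			memcpy(&b, src + count - 8, 8);
			memcpy(dst, &a, 8);
			memcpy(dst + count - 8, &b, 8);
		} else if (count & 4) {			/* 4..7 bytes */
			uint32_t a, b;
			memcpy(&a, src, 4);
			memcpy(&b, src + count - 4, 4);
			memcpy(dst, &a, 4);
			memcpy(dst + count - 4, &b, 4);
		} else if (count) {			/* 1..3 bytes, branchless: */
			uint8_t a = src[0];		/* first, middle and last  */
			uint8_t b = src[count >> 1];	/* byte always cover the   */
			uint8_t c = src[count - 1];	/* whole buffer            */
			dst[0] = a;
			dst[count >> 1] = b;
			dst[count - 1] = c;
		}
		return dstin;
	}

The payoff is that no path up to 128 bytes contains a loop or an alignment check; the 1..3-byte case extends the same end-overlap trick down to single bytes.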

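For count > 128, the comments describe aligning the destination, streaming 64 bytes per iteration, and finishing by copying the last 64 bytes from the end. Here is a sketch of the forward path only, under the assumption that the buffers do not overlap; copy_long_forward is a made-up name. The real routine additionally keeps its loads running ahead of its stores so a forward copy also works when dst sits below src, and it switches to the mirrored backwards loop when dst falls inside the source region.

	#include <stddef.h>
	#include <stdint.h>
	#include <string.h>

	/* Sketch of the large-copy strategy (count > 128, no overlap):
	   each 64-byte memcpy() stands in for four LDP/STP pairs. */
	static void copy_long_forward(unsigned char *dstin,
				      const unsigned char *src, size_t count)
	{
		unsigned char *dstend = dstin + count;
		const unsigned char *srcend = src + count;
		size_t skew = (uintptr_t)dstin & 15;
		unsigned char *dst;

		/* Copy the first 16 bytes, then step dst to the next 16-byte
		   boundary; src inherits whatever misalignment dst had. */
		memcpy(dstin, src, 16);
		dst = dstin + 16 - skew;
		src += 16 - skew;

		/* Main loop: 64 destination-aligned bytes per iteration.
		   It may run into the final 64-byte window, never past dstend. */
		while (dst < dstend - 64) {
			memcpy(dst, src, 64);
			dst += 64;
			src += 64;
		}

		/* Tail: always copy the last 64 bytes from the end, absorbing
		   the remainder without a byte-granular cleanup loop. */
		memcpy(dstend - 64, srcend - 64, 64);
	}

Copying a fixed 64 bytes from the end means the loop never needs a remainder path: any bytes the loop did not reach lie inside the final 64-byte window, and rewriting a few bytes twice is cheaper than branching on the exact remainder.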