Skip to content

Commit 387d828

Browse files
jgouly authored and willdeacon committed
arm64: lib: Import latest version of Arm Optimized Routines' strncmp
Import the latest version of the Arm Optimized Routines strncmp function based on the upstream code of string/aarch64/strncmp.S at commit 189dfefe37d5 from: https://github.com/ARM-software/optimized-routines

This latest version includes MTE support.

Note that for simplicity Arm have chosen to contribute this code to Linux under GPLv2 rather than the original MIT OR Apache-2.0 WITH LLVM-exception license. Arm is the sole copyright holder for this code.

Signed-off-by: Joey Gouly <joey.gouly@arm.com>
Cc: Robin Murphy <robin.murphy@arm.com>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Will Deacon <will@kernel.org>
Acked-by: Mark Rutland <mark.rutland@arm.com>
Acked-by: Catalin Marinas <catalin.marinas@arm.com>
Link: https://lore.kernel.org/r/20220301101435.19327-3-joey.gouly@arm.com
Signed-off-by: Will Deacon <will@kernel.org>
1 parent 507f788 commit 387d828

1 file changed

Lines changed: 141 additions & 93 deletions

File tree

arch/arm64/lib/strncmp.S

Lines changed: 141 additions & 93 deletions
Original file line numberDiff line numberDiff line change
@@ -1,24 +1,24 @@
11
/* SPDX-License-Identifier: GPL-2.0-only */
22
/*
3-
* Copyright (c) 2013-2021, Arm Limited.
3+
* Copyright (c) 2013-2022, Arm Limited.
44
*
55
* Adapted from the original at:
6-
* https://github.com/ARM-software/optimized-routines/blob/e823e3abf5f89ecb/string/aarch64/strncmp.S
6+
* https://github.com/ARM-software/optimized-routines/blob/189dfefe37d54c5b/string/aarch64/strncmp.S
77
*/
88

99
#include <linux/linkage.h>
1010
#include <asm/assembler.h>
1111

1212
/* Assumptions:
1313
*
14-
* ARMv8-a, AArch64
14+
* ARMv8-a, AArch64.
15+
* MTE compatible.
1516
*/
1617

1718
#define L(label) .L ## label
1819

1920
#define REP8_01 0x0101010101010101
2021
#define REP8_7f 0x7f7f7f7f7f7f7f7f
21-
#define REP8_80 0x8080808080808080
2222

2323
/* Parameters and result. */
2424
#define src1 x0
@@ -39,10 +39,24 @@
3939
#define tmp3 x10
4040
#define zeroones x11
4141
#define pos x12
42-
#define limit_wd x13
43-
#define mask x14
44-
#define endloop x15
42+
#define mask x13
43+
#define endloop x14
4544
#define count mask
45+
#define offset pos
46+
#define neg_offset x15
47+
48+
/* Define endian dependent shift operations.
49+
On big-endian early bytes are at MSB and on little-endian LSB.
50+
LS_FW means shifting towards early bytes.
51+
LS_BK means shifting towards later bytes.
52+
*/
53+
#ifdef __AARCH64EB__
54+
#define LS_FW lsl
55+
#define LS_BK lsr
56+
#else
57+
#define LS_FW lsr
58+
#define LS_BK lsl
59+
#endif
4660

4761
SYM_FUNC_START_WEAK_PI(strncmp)
4862
cbz limit, L(ret0)
@@ -52,9 +66,6 @@ SYM_FUNC_START_WEAK_PI(strncmp)
5266
and count, src1, #7
5367
b.ne L(misaligned8)
5468
cbnz count, L(mutual_align)
55-
/* Calculate the number of full and partial words -1. */
56-
sub limit_wd, limit, #1 /* limit != 0, so no underflow. */
57-
lsr limit_wd, limit_wd, #3 /* Convert to Dwords. */
5869

5970
/* NUL detection works on the principle that (X - 1) & (~X) & 0x80
6071
(=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and
@@ -64,56 +75,52 @@ L(loop_aligned):
6475
ldr data1, [src1], #8
6576
ldr data2, [src2], #8
6677
L(start_realigned):
67-
subs limit_wd, limit_wd, #1
78+
subs limit, limit, #8
6879
sub tmp1, data1, zeroones
6980
orr tmp2, data1, #REP8_7f
7081
eor diff, data1, data2 /* Non-zero if differences found. */
71-
csinv endloop, diff, xzr, pl /* Last Dword or differences. */
82+
csinv endloop, diff, xzr, hi /* Last Dword or differences. */
7283
bics has_nul, tmp1, tmp2 /* Non-zero if NUL terminator. */
7384
ccmp endloop, #0, #0, eq
7485
b.eq L(loop_aligned)
7586
/* End of main loop */
7687

77-
/* Not reached the limit, must have found the end or a diff. */
78-
tbz limit_wd, #63, L(not_limit)
79-
80-
/* Limit % 8 == 0 => all bytes significant. */
81-
ands limit, limit, #7
82-
b.eq L(not_limit)
83-
84-
lsl limit, limit, #3 /* Bits -> bytes. */
85-
mov mask, #~0
86-
#ifdef __AARCH64EB__
87-
lsr mask, mask, limit
88-
#else
89-
lsl mask, mask, limit
90-
#endif
91-
bic data1, data1, mask
92-
bic data2, data2, mask
93-
94-
/* Make sure that the NUL byte is marked in the syndrome. */
95-
orr has_nul, has_nul, mask
96-
97-
L(not_limit):
88+
L(full_check):
89+
#ifndef __AARCH64EB__
9890
orr syndrome, diff, has_nul
99-
100-
#ifndef __AARCH64EB__
91+
add limit, limit, 8 /* Rewind limit to before last subs. */
92+
L(syndrome_check):
93+
/* Limit was reached. Check if the NUL byte or the difference
94+
is before the limit. */
10195
rev syndrome, syndrome
10296
rev data1, data1
103-
/* The MS-non-zero bit of the syndrome marks either the first bit
104-
that is different, or the top bit of the first zero byte.
105-
Shifting left now will bring the critical information into the
106-
top bits. */
10797
clz pos, syndrome
10898
rev data2, data2
10999
lsl data1, data1, pos
100+
cmp limit, pos, lsr #3
110101
lsl data2, data2, pos
111102
/* But we need to zero-extend (char is unsigned) the value and then
112103
perform a signed 32-bit subtraction. */
113104
lsr data1, data1, #56
114105
sub result, data1, data2, lsr #56
106+
csel result, result, xzr, hi
115107
ret
116108
#else
109+
/* Not reached the limit, must have found the end or a diff. */
110+
tbz limit, #63, L(not_limit)
111+
add tmp1, limit, 8
112+
cbz limit, L(not_limit)
113+
114+
lsl limit, tmp1, #3 /* Bits -> bytes. */
115+
mov mask, #~0
116+
lsr mask, mask, limit
117+
bic data1, data1, mask
118+
bic data2, data2, mask
119+
120+
/* Make sure that the NUL byte is marked in the syndrome. */
121+
orr has_nul, has_nul, mask
122+
123+
L(not_limit):
117124
/* For big-endian we cannot use the trick with the syndrome value
118125
as carry-propagation can corrupt the upper bits if the trailing
119126
bytes in the string contain 0x01. */
@@ -134,10 +141,11 @@ L(not_limit):
134141
rev has_nul, has_nul
135142
orr syndrome, diff, has_nul
136143
clz pos, syndrome
137-
/* The MS-non-zero bit of the syndrome marks either the first bit
138-
that is different, or the top bit of the first zero byte.
144+
/* The most-significant-non-zero bit of the syndrome marks either the
145+
first bit that is different, or the top bit of the first zero byte.
139146
Shifting left now will bring the critical information into the
140147
top bits. */
148+
L(end_quick):
141149
lsl data1, data1, pos
142150
lsl data2, data2, pos
143151
/* But we need to zero-extend (char is unsigned) the value and then
@@ -159,22 +167,12 @@ L(mutual_align):
159167
neg tmp3, count, lsl #3 /* 64 - bits(bytes beyond align). */
160168
ldr data2, [src2], #8
161169
mov tmp2, #~0
162-
sub limit_wd, limit, #1 /* limit != 0, so no underflow. */
163-
#ifdef __AARCH64EB__
164-
/* Big-endian. Early bytes are at MSB. */
165-
lsl tmp2, tmp2, tmp3 /* Shift (count & 63). */
166-
#else
167-
/* Little-endian. Early bytes are at LSB. */
168-
lsr tmp2, tmp2, tmp3 /* Shift (count & 63). */
169-
#endif
170-
and tmp3, limit_wd, #7
171-
lsr limit_wd, limit_wd, #3
172-
/* Adjust the limit. Only low 3 bits used, so overflow irrelevant. */
173-
add limit, limit, count
174-
add tmp3, tmp3, count
170+
LS_FW tmp2, tmp2, tmp3 /* Shift (count & 63). */
171+
/* Adjust the limit and ensure it doesn't overflow. */
172+
adds limit, limit, count
173+
csinv limit, limit, xzr, lo
175174
orr data1, data1, tmp2
176175
orr data2, data2, tmp2
177-
add limit_wd, limit_wd, tmp3, lsr #3
178176
b L(start_realigned)
179177

180178
.p2align 4
@@ -197,13 +195,11 @@ L(done):
197195
/* Align the SRC1 to a dword by doing a bytewise compare and then do
198196
the dword loop. */
199197
L(try_misaligned_words):
200-
lsr limit_wd, limit, #3
201-
cbz count, L(do_misaligned)
198+
cbz count, L(src1_aligned)
202199

203200
neg count, count
204201
and count, count, #7
205202
sub limit, limit, count
206-
lsr limit_wd, limit, #3
207203

208204
L(page_end_loop):
209205
ldrb data1w, [src1], #1
@@ -214,48 +210,100 @@ L(page_end_loop):
214210
subs count, count, #1
215211
b.hi L(page_end_loop)
216212

217-
L(do_misaligned):
218-
/* Prepare ourselves for the next page crossing. Unlike the aligned
219-
loop, we fetch 1 less dword because we risk crossing bounds on
220-
SRC2. */
221-
mov count, #8
222-
subs limit_wd, limit_wd, #1
223-
b.lo L(done_loop)
224-
L(loop_misaligned):
225-
and tmp2, src2, #0xff8
226-
eor tmp2, tmp2, #0xff8
227-
cbz tmp2, L(page_end_loop)
213+
/* The following diagram explains the comparison of misaligned strings.
214+
The bytes are shown in natural order. For little-endian, it is
215+
reversed in the registers. The "x" bytes are before the string.
216+
The "|" separates data that is loaded at one time.
217+
src1 | a a a a a a a a | b b b c c c c c | . . .
218+
src2 | x x x x x a a a a a a a a b b b | c c c c c . . .
219+
220+
After shifting in each step, the data looks like this:
221+
STEP_A STEP_B STEP_C
222+
data1 a a a a a a a a b b b c c c c c b b b c c c c c
223+
data2 a a a a a a a a b b b 0 0 0 0 0 0 0 0 c c c c c
228224

225+
The bytes with "0" are eliminated from the syndrome via mask.
226+
227+
Align SRC2 down to 16 bytes. This way we can read 16 bytes at a
228+
time from SRC2. The comparison happens in 3 steps. After each step
229+
the loop can exit, or read from SRC1 or SRC2. */
230+
L(src1_aligned):
231+
/* Calculate offset from 8 byte alignment to string start in bits. No
232+
need to mask offset since shifts are ignoring upper bits. */
233+
lsl offset, src2, #3
234+
bic src2, src2, #0xf
235+
mov mask, -1
236+
neg neg_offset, offset
229237
ldr data1, [src1], #8
230-
ldr data2, [src2], #8
231-
sub tmp1, data1, zeroones
232-
orr tmp2, data1, #REP8_7f
233-
eor diff, data1, data2 /* Non-zero if differences found. */
234-
bics has_nul, tmp1, tmp2 /* Non-zero if NUL terminator. */
235-
ccmp diff, #0, #0, eq
236-
b.ne L(not_limit)
237-
subs limit_wd, limit_wd, #1
238-
b.pl L(loop_misaligned)
238+
ldp tmp1, tmp2, [src2], #16
239+
LS_BK mask, mask, neg_offset
240+
and neg_offset, neg_offset, #63 /* Need actual value for cmp later. */
241+
/* Skip the first compare if data in tmp1 is irrelevant. */
242+
tbnz offset, 6, L(misaligned_mid_loop)
239243

240-
L(done_loop):
241-
/* We found a difference or a NULL before the limit was reached. */
242-
and limit, limit, #7
243-
cbz limit, L(not_limit)
244-
/* Read the last word. */
245-
sub src1, src1, 8
246-
sub src2, src2, 8
247-
ldr data1, [src1, limit]
248-
ldr data2, [src2, limit]
249-
sub tmp1, data1, zeroones
250-
orr tmp2, data1, #REP8_7f
244+
L(loop_misaligned):
245+
/* STEP_A: Compare full 8 bytes when there is enough data from SRC2.*/
246+
LS_FW data2, tmp1, offset
247+
LS_BK tmp1, tmp2, neg_offset
248+
subs limit, limit, #8
249+
orr data2, data2, tmp1 /* 8 bytes from SRC2 combined from two regs.*/
250+
sub has_nul, data1, zeroones
251251
eor diff, data1, data2 /* Non-zero if differences found. */
252-
bics has_nul, tmp1, tmp2 /* Non-zero if NUL terminator. */
253-
ccmp diff, #0, #0, eq
254-
b.ne L(not_limit)
252+
orr tmp3, data1, #REP8_7f
253+
csinv endloop, diff, xzr, hi /* If limit, set to all ones. */
254+
bic has_nul, has_nul, tmp3 /* Non-zero if NUL byte found in SRC1. */
255+
orr tmp3, endloop, has_nul
256+
cbnz tmp3, L(full_check)
257+
258+
ldr data1, [src1], #8
259+
L(misaligned_mid_loop):
260+
/* STEP_B: Compare first part of data1 to second part of tmp2. */
261+
LS_FW data2, tmp2, offset
262+
#ifdef __AARCH64EB__
263+
/* For big-endian we do a byte reverse to avoid carry-propagation
264+
problem described above. This way we can reuse the has_nul in the
265+
next step and also use syndrome value trick at the end. */
266+
rev tmp3, data1
267+
#define data1_fixed tmp3
268+
#else
269+
#define data1_fixed data1
270+
#endif
271+
sub has_nul, data1_fixed, zeroones
272+
orr tmp3, data1_fixed, #REP8_7f
273+
eor diff, data2, data1 /* Non-zero if differences found. */
274+
bic has_nul, has_nul, tmp3 /* Non-zero if NUL terminator. */
275+
#ifdef __AARCH64EB__
276+
rev has_nul, has_nul
277+
#endif
278+
cmp limit, neg_offset, lsr #3
279+
orr syndrome, diff, has_nul
280+
bic syndrome, syndrome, mask /* Ignore later bytes. */
281+
csinv tmp3, syndrome, xzr, hi /* If limit, set to all ones. */
282+
cbnz tmp3, L(syndrome_check)
283+
284+
/* STEP_C: Compare second part of data1 to first part of tmp1. */
285+
ldp tmp1, tmp2, [src2], #16
286+
cmp limit, #8
287+
LS_BK data2, tmp1, neg_offset
288+
eor diff, data2, data1 /* Non-zero if differences found. */
289+
orr syndrome, diff, has_nul
290+
and syndrome, syndrome, mask /* Ignore earlier bytes. */
291+
csinv tmp3, syndrome, xzr, hi /* If limit, set to all ones. */
292+
cbnz tmp3, L(syndrome_check)
293+
294+
ldr data1, [src1], #8
295+
sub limit, limit, #8
296+
b L(loop_misaligned)
297+
298+
#ifdef __AARCH64EB__
299+
L(syndrome_check):
300+
clz pos, syndrome
301+
cmp pos, limit, lsl #3
302+
b.lo L(end_quick)
303+
#endif
255304

256305
L(ret0):
257306
mov result, #0
258307
ret
259-
260308
SYM_FUNC_END_PI(strncmp)
261309
EXPORT_SYMBOL_NOHWKASAN(strncmp)

0 commit comments

Comments (0)