Skip to content

Commit 507f788

Browse files
jgouly authored and willdeacon committed
arm64: lib: Import latest version of Arm Optimized Routines' strcmp
Import the latest version of the Arm Optimized Routines strcmp function based on the upstream code of string/aarch64/strcmp.S at commit 189dfefe37d5 from: https://github.com/ARM-software/optimized-routines This latest version includes MTE support. Note that for simplicity Arm have chosen to contribute this code to Linux under GPLv2 rather than the original MIT OR Apache-2.0 WITH LLVM-exception license. Arm is the sole copyright holder for this code. Signed-off-by: Joey Gouly <joey.gouly@arm.com> Cc: Robin Murphy <robin.murphy@arm.com> Cc: Mark Rutland <mark.rutland@arm.com> Cc: Catalin Marinas <catalin.marinas@arm.com> Cc: Will Deacon <will@kernel.org> Acked-by: Mark Rutland <mark.rutland@arm.com> Acked-by: Catalin Marinas <catalin.marinas@arm.com> Link: https://lore.kernel.org/r/20220301101435.19327-2-joey.gouly@arm.com Signed-off-by: Will Deacon <will@kernel.org>
1 parent dfd42fa commit 507f788

1 file changed

Lines changed: 126 additions & 112 deletions

File tree

arch/arm64/lib/strcmp.S

Lines changed: 126 additions & 112 deletions
Original file line number · Diff line number · Diff line change
@@ -1,171 +1,185 @@
11
/* SPDX-License-Identifier: GPL-2.0-only */
22
/*
3-
* Copyright (c) 2012-2021, Arm Limited.
3+
* Copyright (c) 2012-2022, Arm Limited.
44
*
55
* Adapted from the original at:
6-
* https://github.com/ARM-software/optimized-routines/blob/afd6244a1f8d9229/string/aarch64/strcmp.S
6+
* https://github.com/ARM-software/optimized-routines/blob/189dfefe37d54c5b/string/aarch64/strcmp.S
77
*/
88

99
#include <linux/linkage.h>
1010
#include <asm/assembler.h>
1111

1212
/* Assumptions:
1313
*
14-
* ARMv8-a, AArch64
14+
* ARMv8-a, AArch64.
15+
* MTE compatible.
1516
*/
1617

1718
#define L(label) .L ## label
1819

1920
#define REP8_01 0x0101010101010101
2021
#define REP8_7f 0x7f7f7f7f7f7f7f7f
21-
#define REP8_80 0x8080808080808080
2222

23-
/* Parameters and result. */
2423
#define src1 x0
2524
#define src2 x1
2625
#define result x0
2726

28-
/* Internal variables. */
2927
#define data1 x2
3028
#define data1w w2
3129
#define data2 x3
3230
#define data2w w3
3331
#define has_nul x4
3432
#define diff x5
33+
#define off1 x5
3534
#define syndrome x6
36-
#define tmp1 x7
37-
#define tmp2 x8
38-
#define tmp3 x9
39-
#define zeroones x10
40-
#define pos x11
41-
42-
/* Start of performance-critical section -- one 64B cache line. */
43-
.align 6
35+
#define tmp x6
36+
#define data3 x7
37+
#define zeroones x8
38+
#define shift x9
39+
#define off2 x10
40+
41+
/* On big-endian early bytes are at MSB and on little-endian LSB.
42+
LS_FW means shifting towards early bytes. */
43+
#ifdef __AARCH64EB__
44+
# define LS_FW lsl
45+
#else
46+
# define LS_FW lsr
47+
#endif
48+
49+
/* NUL detection works on the principle that (X - 1) & (~X) & 0x80
50+
(=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and
51+
can be done in parallel across the entire word.
52+
Since carry propagation makes 0x1 bytes before a NUL byte appear
53+
NUL too in big-endian, byte-reverse the data before the NUL check. */
54+
55+
4456
SYM_FUNC_START_WEAK_PI(strcmp)
45-
eor tmp1, src1, src2
46-
mov zeroones, #REP8_01
47-
tst tmp1, #7
57+
sub off2, src2, src1
58+
mov zeroones, REP8_01
59+
and tmp, src1, 7
60+
tst off2, 7
4861
b.ne L(misaligned8)
49-
ands tmp1, src1, #7
50-
b.ne L(mutual_align)
51-
/* NUL detection works on the principle that (X - 1) & (~X) & 0x80
52-
(=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and
53-
can be done in parallel across the entire word. */
62+
cbnz tmp, L(mutual_align)
63+
64+
.p2align 4
65+
5466
L(loop_aligned):
55-
ldr data1, [src1], #8
56-
ldr data2, [src2], #8
67+
ldr data2, [src1, off2]
68+
ldr data1, [src1], 8
5769
L(start_realigned):
58-
sub tmp1, data1, zeroones
59-
orr tmp2, data1, #REP8_7f
60-
eor diff, data1, data2 /* Non-zero if differences found. */
61-
bic has_nul, tmp1, tmp2 /* Non-zero if NUL terminator. */
70+
#ifdef __AARCH64EB__
71+
rev tmp, data1
72+
sub has_nul, tmp, zeroones
73+
orr tmp, tmp, REP8_7f
74+
#else
75+
sub has_nul, data1, zeroones
76+
orr tmp, data1, REP8_7f
77+
#endif
78+
bics has_nul, has_nul, tmp /* Non-zero if NUL terminator. */
79+
ccmp data1, data2, 0, eq
80+
b.eq L(loop_aligned)
81+
#ifdef __AARCH64EB__
82+
rev has_nul, has_nul
83+
#endif
84+
eor diff, data1, data2
6285
orr syndrome, diff, has_nul
63-
cbz syndrome, L(loop_aligned)
64-
/* End of performance-critical section -- one 64B cache line. */
65-
6686
L(end):
67-
#ifndef __AARCH64EB__
87+
#ifndef __AARCH64EB__
6888
rev syndrome, syndrome
6989
rev data1, data1
70-
/* The MS-non-zero bit of the syndrome marks either the first bit
71-
that is different, or the top bit of the first zero byte.
72-
Shifting left now will bring the critical information into the
73-
top bits. */
74-
clz pos, syndrome
7590
rev data2, data2
76-
lsl data1, data1, pos
77-
lsl data2, data2, pos
78-
/* But we need to zero-extend (char is unsigned) the value and then
79-
perform a signed 32-bit subtraction. */
80-
lsr data1, data1, #56
81-
sub result, data1, data2, lsr #56
82-
ret
83-
#else
84-
/* For big-endian we cannot use the trick with the syndrome value
85-
as carry-propagation can corrupt the upper bits if the trailing
86-
bytes in the string contain 0x01. */
87-
/* However, if there is no NUL byte in the dword, we can generate
88-
the result directly. We can't just subtract the bytes as the
89-
MSB might be significant. */
90-
cbnz has_nul, 1f
91-
cmp data1, data2
92-
cset result, ne
93-
cneg result, result, lo
94-
ret
95-
1:
96-
/* Re-compute the NUL-byte detection, using a byte-reversed value. */
97-
rev tmp3, data1
98-
sub tmp1, tmp3, zeroones
99-
orr tmp2, tmp3, #REP8_7f
100-
bic has_nul, tmp1, tmp2
101-
rev has_nul, has_nul
102-
orr syndrome, diff, has_nul
103-
clz pos, syndrome
104-
/* The MS-non-zero bit of the syndrome marks either the first bit
105-
that is different, or the top bit of the first zero byte.
91+
#endif
92+
clz shift, syndrome
93+
/* The most-significant-non-zero bit of the syndrome marks either the
94+
first bit that is different, or the top bit of the first zero byte.
10695
Shifting left now will bring the critical information into the
10796
top bits. */
108-
lsl data1, data1, pos
109-
lsl data2, data2, pos
97+
lsl data1, data1, shift
98+
lsl data2, data2, shift
11099
/* But we need to zero-extend (char is unsigned) the value and then
111100
perform a signed 32-bit subtraction. */
112-
lsr data1, data1, #56
113-
sub result, data1, data2, lsr #56
101+
lsr data1, data1, 56
102+
sub result, data1, data2, lsr 56
114103
ret
115-
#endif
104+
105+
.p2align 4
116106

117107
L(mutual_align):
118108
/* Sources are mutually aligned, but are not currently at an
119109
alignment boundary. Round down the addresses and then mask off
120-
the bytes that preceed the start point. */
121-
bic src1, src1, #7
122-
bic src2, src2, #7
123-
lsl tmp1, tmp1, #3 /* Bytes beyond alignment -> bits. */
124-
ldr data1, [src1], #8
125-
neg tmp1, tmp1 /* Bits to alignment -64. */
126-
ldr data2, [src2], #8
127-
mov tmp2, #~0
128-
#ifdef __AARCH64EB__
129-
/* Big-endian. Early bytes are at MSB. */
130-
lsl tmp2, tmp2, tmp1 /* Shift (tmp1 & 63). */
131-
#else
132-
/* Little-endian. Early bytes are at LSB. */
133-
lsr tmp2, tmp2, tmp1 /* Shift (tmp1 & 63). */
134-
#endif
135-
orr data1, data1, tmp2
136-
orr data2, data2, tmp2
110+
the bytes that precede the start point. */
111+
bic src1, src1, 7
112+
ldr data2, [src1, off2]
113+
ldr data1, [src1], 8
114+
neg shift, src2, lsl 3 /* Bits to alignment -64. */
115+
mov tmp, -1
116+
LS_FW tmp, tmp, shift
117+
orr data1, data1, tmp
118+
orr data2, data2, tmp
137119
b L(start_realigned)
138120

139121
L(misaligned8):
140122
/* Align SRC1 to 8 bytes and then compare 8 bytes at a time, always
141-
checking to make sure that we don't access beyond page boundary in
142-
SRC2. */
143-
tst src1, #7
144-
b.eq L(loop_misaligned)
123+
checking to make sure that we don't access beyond the end of SRC2. */
124+
cbz tmp, L(src1_aligned)
145125
L(do_misaligned):
146-
ldrb data1w, [src1], #1
147-
ldrb data2w, [src2], #1
148-
cmp data1w, #1
149-
ccmp data1w, data2w, #0, cs /* NZCV = 0b0000. */
126+
ldrb data1w, [src1], 1
127+
ldrb data2w, [src2], 1
128+
cmp data1w, 0
129+
ccmp data1w, data2w, 0, ne /* NZCV = 0b0000. */
150130
b.ne L(done)
151-
tst src1, #7
131+
tst src1, 7
152132
b.ne L(do_misaligned)
153133

154-
L(loop_misaligned):
155-
/* Test if we are within the last dword of the end of a 4K page. If
156-
yes then jump back to the misaligned loop to copy a byte at a time. */
157-
and tmp1, src2, #0xff8
158-
eor tmp1, tmp1, #0xff8
159-
cbz tmp1, L(do_misaligned)
160-
ldr data1, [src1], #8
161-
ldr data2, [src2], #8
162-
163-
sub tmp1, data1, zeroones
164-
orr tmp2, data1, #REP8_7f
165-
eor diff, data1, data2 /* Non-zero if differences found. */
166-
bic has_nul, tmp1, tmp2 /* Non-zero if NUL terminator. */
134+
L(src1_aligned):
135+
neg shift, src2, lsl 3
136+
bic src2, src2, 7
137+
ldr data3, [src2], 8
138+
#ifdef __AARCH64EB__
139+
rev data3, data3
140+
#endif
141+
lsr tmp, zeroones, shift
142+
orr data3, data3, tmp
143+
sub has_nul, data3, zeroones
144+
orr tmp, data3, REP8_7f
145+
bics has_nul, has_nul, tmp
146+
b.ne L(tail)
147+
148+
sub off1, src2, src1
149+
150+
.p2align 4
151+
152+
L(loop_unaligned):
153+
ldr data3, [src1, off1]
154+
ldr data2, [src1, off2]
155+
#ifdef __AARCH64EB__
156+
rev data3, data3
157+
#endif
158+
sub has_nul, data3, zeroones
159+
orr tmp, data3, REP8_7f
160+
ldr data1, [src1], 8
161+
bics has_nul, has_nul, tmp
162+
ccmp data1, data2, 0, eq
163+
b.eq L(loop_unaligned)
164+
165+
lsl tmp, has_nul, shift
166+
#ifdef __AARCH64EB__
167+
rev tmp, tmp
168+
#endif
169+
eor diff, data1, data2
170+
orr syndrome, diff, tmp
171+
cbnz syndrome, L(end)
172+
L(tail):
173+
ldr data1, [src1]
174+
neg shift, shift
175+
lsr data2, data3, shift
176+
lsr has_nul, has_nul, shift
177+
#ifdef __AARCH64EB__
178+
rev data2, data2
179+
rev has_nul, has_nul
180+
#endif
181+
eor diff, data1, data2
167182
orr syndrome, diff, has_nul
168-
cbz syndrome, L(loop_misaligned)
169183
b L(end)
170184

171185
L(done):

0 commit comments

Comments (0)