|
1 | 1 | /* SPDX-License-Identifier: GPL-2.0-only */ |
2 | 2 | /* |
3 | | - * Copyright (c) 2012-2021, Arm Limited. |
| 3 | + * Copyright (c) 2012-2022, Arm Limited. |
4 | 4 | * |
5 | 5 | * Adapted from the original at: |
6 | | - * https://github.com/ARM-software/optimized-routines/blob/afd6244a1f8d9229/string/aarch64/strcmp.S |
| 6 | + * https://github.com/ARM-software/optimized-routines/blob/189dfefe37d54c5b/string/aarch64/strcmp.S |
7 | 7 | */ |
8 | 8 |
|
9 | 9 | #include <linux/linkage.h> |
10 | 10 | #include <asm/assembler.h> |
11 | 11 |
|
12 | 12 | /* Assumptions: |
13 | 13 | * |
14 | | - * ARMv8-a, AArch64 |
| 14 | + * ARMv8-a, AArch64. |
| 15 | + * MTE compatible. |
15 | 16 | */ |
16 | 17 |
|
17 | 18 | #define L(label) .L ## label |
18 | 19 |
|
19 | 20 | #define REP8_01 0x0101010101010101 |
20 | 21 | #define REP8_7f 0x7f7f7f7f7f7f7f7f |
21 | | -#define REP8_80 0x8080808080808080 |
22 | 22 |
|
23 | | -/* Parameters and result. */ |
24 | 23 | #define src1 x0 |
25 | 24 | #define src2 x1 |
26 | 25 | #define result x0 |
27 | 26 |
|
28 | | -/* Internal variables. */ |
29 | 27 | #define data1 x2 |
30 | 28 | #define data1w w2 |
31 | 29 | #define data2 x3 |
32 | 30 | #define data2w w3 |
33 | 31 | #define has_nul x4 |
34 | 32 | #define diff x5 |
| 33 | +#define off1 x5 |
35 | 34 | #define syndrome x6 |
36 | | -#define tmp1 x7 |
37 | | -#define tmp2 x8 |
38 | | -#define tmp3 x9 |
39 | | -#define zeroones x10 |
40 | | -#define pos x11 |
41 | | - |
42 | | - /* Start of performance-critical section -- one 64B cache line. */ |
43 | | - .align 6 |
| 35 | +#define tmp x6 |
| 36 | +#define data3 x7 |
| 37 | +#define zeroones x8 |
| 38 | +#define shift x9 |
| 39 | +#define off2 x10 |
| 40 | + |
| 41 | +/* On big-endian early bytes are at MSB and on little-endian LSB. |
| 42 | + LS_FW means shifting towards early bytes. */ |
| 43 | +#ifdef __AARCH64EB__ |
| 44 | +# define LS_FW lsl |
| 45 | +#else |
| 46 | +# define LS_FW lsr |
| 47 | +#endif |
| 48 | + |
| 49 | +/* NUL detection works on the principle that (X - 1) & (~X) & 0x80 |
| 50 | + (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and |
| 51 | + can be done in parallel across the entire word. |
| 52 | + Since carry propagation makes 0x1 bytes before a NUL byte appear |
| 53 | + NUL too in big-endian, byte-reverse the data before the NUL check. */ |
| 54 | + |
| 55 | + |
44 | 56 | SYM_FUNC_START_WEAK_PI(strcmp) |
45 | | - eor tmp1, src1, src2 |
46 | | - mov zeroones, #REP8_01 |
47 | | - tst tmp1, #7 |
| 57 | + sub off2, src2, src1 |
| 58 | + mov zeroones, REP8_01 |
| 59 | + and tmp, src1, 7 |
| 60 | + tst off2, 7 |
48 | 61 | b.ne L(misaligned8) |
49 | | - ands tmp1, src1, #7 |
50 | | - b.ne L(mutual_align) |
51 | | - /* NUL detection works on the principle that (X - 1) & (~X) & 0x80 |
52 | | - (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and |
53 | | - can be done in parallel across the entire word. */ |
| 62 | + cbnz tmp, L(mutual_align) |
| 63 | + |
| 64 | + .p2align 4 |
| 65 | + |
54 | 66 | L(loop_aligned): |
55 | | - ldr data1, [src1], #8 |
56 | | - ldr data2, [src2], #8 |
| 67 | + ldr data2, [src1, off2] |
| 68 | + ldr data1, [src1], 8 |
57 | 69 | L(start_realigned): |
58 | | - sub tmp1, data1, zeroones |
59 | | - orr tmp2, data1, #REP8_7f |
60 | | - eor diff, data1, data2 /* Non-zero if differences found. */ |
61 | | - bic has_nul, tmp1, tmp2 /* Non-zero if NUL terminator. */ |
| 70 | +#ifdef __AARCH64EB__ |
| 71 | + rev tmp, data1 |
| 72 | + sub has_nul, tmp, zeroones |
| 73 | + orr tmp, tmp, REP8_7f |
| 74 | +#else |
| 75 | + sub has_nul, data1, zeroones |
| 76 | + orr tmp, data1, REP8_7f |
| 77 | +#endif |
| 78 | + bics has_nul, has_nul, tmp /* Non-zero if NUL terminator. */ |
| 79 | + ccmp data1, data2, 0, eq |
| 80 | + b.eq L(loop_aligned) |
| 81 | +#ifdef __AARCH64EB__ |
| 82 | + rev has_nul, has_nul |
| 83 | +#endif |
| 84 | + eor diff, data1, data2 |
62 | 85 | orr syndrome, diff, has_nul |
63 | | - cbz syndrome, L(loop_aligned) |
64 | | - /* End of performance-critical section -- one 64B cache line. */ |
65 | | - |
66 | 86 | L(end): |
67 | | -#ifndef __AARCH64EB__ |
| 87 | +#ifndef __AARCH64EB__ |
68 | 88 | rev syndrome, syndrome |
69 | 89 | rev data1, data1 |
70 | | - /* The MS-non-zero bit of the syndrome marks either the first bit |
71 | | - that is different, or the top bit of the first zero byte. |
72 | | - Shifting left now will bring the critical information into the |
73 | | - top bits. */ |
74 | | - clz pos, syndrome |
75 | 90 | rev data2, data2 |
76 | | - lsl data1, data1, pos |
77 | | - lsl data2, data2, pos |
78 | | - /* But we need to zero-extend (char is unsigned) the value and then |
79 | | - perform a signed 32-bit subtraction. */ |
80 | | - lsr data1, data1, #56 |
81 | | - sub result, data1, data2, lsr #56 |
82 | | - ret |
83 | | -#else |
84 | | - /* For big-endian we cannot use the trick with the syndrome value |
85 | | - as carry-propagation can corrupt the upper bits if the trailing |
86 | | - bytes in the string contain 0x01. */ |
87 | | - /* However, if there is no NUL byte in the dword, we can generate |
88 | | - the result directly. We can't just subtract the bytes as the |
89 | | - MSB might be significant. */ |
90 | | - cbnz has_nul, 1f |
91 | | - cmp data1, data2 |
92 | | - cset result, ne |
93 | | - cneg result, result, lo |
94 | | - ret |
95 | | -1: |
96 | | - /* Re-compute the NUL-byte detection, using a byte-reversed value. */ |
97 | | - rev tmp3, data1 |
98 | | - sub tmp1, tmp3, zeroones |
99 | | - orr tmp2, tmp3, #REP8_7f |
100 | | - bic has_nul, tmp1, tmp2 |
101 | | - rev has_nul, has_nul |
102 | | - orr syndrome, diff, has_nul |
103 | | - clz pos, syndrome |
104 | | - /* The MS-non-zero bit of the syndrome marks either the first bit |
105 | | - that is different, or the top bit of the first zero byte. |
| 91 | +#endif |
| 92 | + clz shift, syndrome |
| 93 | + /* The most-significant-non-zero bit of the syndrome marks either the |
| 94 | + first bit that is different, or the top bit of the first zero byte. |
106 | 95 | Shifting left now will bring the critical information into the |
107 | 96 | top bits. */ |
108 | | - lsl data1, data1, pos |
109 | | - lsl data2, data2, pos |
| 97 | + lsl data1, data1, shift |
| 98 | + lsl data2, data2, shift |
110 | 99 | /* But we need to zero-extend (char is unsigned) the value and then |
111 | 100 | perform a signed 32-bit subtraction. */ |
112 | | - lsr data1, data1, #56 |
113 | | - sub result, data1, data2, lsr #56 |
| 101 | + lsr data1, data1, 56 |
| 102 | + sub result, data1, data2, lsr 56 |
114 | 103 | ret |
115 | | -#endif |
| 104 | + |
| 105 | + .p2align 4 |
116 | 106 |
|
117 | 107 | L(mutual_align): |
118 | 108 | /* Sources are mutually aligned, but are not currently at an |
119 | 109 | alignment boundary. Round down the addresses and then mask off |
120 | | - the bytes that preceed the start point. */ |
121 | | - bic src1, src1, #7 |
122 | | - bic src2, src2, #7 |
123 | | - lsl tmp1, tmp1, #3 /* Bytes beyond alignment -> bits. */ |
124 | | - ldr data1, [src1], #8 |
125 | | - neg tmp1, tmp1 /* Bits to alignment -64. */ |
126 | | - ldr data2, [src2], #8 |
127 | | - mov tmp2, #~0 |
128 | | -#ifdef __AARCH64EB__ |
129 | | - /* Big-endian. Early bytes are at MSB. */ |
130 | | - lsl tmp2, tmp2, tmp1 /* Shift (tmp1 & 63). */ |
131 | | -#else |
132 | | - /* Little-endian. Early bytes are at LSB. */ |
133 | | - lsr tmp2, tmp2, tmp1 /* Shift (tmp1 & 63). */ |
134 | | -#endif |
135 | | - orr data1, data1, tmp2 |
136 | | - orr data2, data2, tmp2 |
| 110 | + the bytes that precede the start point. */ |
| 111 | + bic src1, src1, 7 |
| 112 | + ldr data2, [src1, off2] |
| 113 | + ldr data1, [src1], 8 |
| 114 | + neg shift, src2, lsl 3 /* Bits to alignment -64. */ |
| 115 | + mov tmp, -1 |
| 116 | + LS_FW tmp, tmp, shift |
| 117 | + orr data1, data1, tmp |
| 118 | + orr data2, data2, tmp |
137 | 119 | b L(start_realigned) |
138 | 120 |
|
139 | 121 | L(misaligned8): |
140 | 122 | /* Align SRC1 to 8 bytes and then compare 8 bytes at a time, always |
141 | | - checking to make sure that we don't access beyond page boundary in |
142 | | - SRC2. */ |
143 | | - tst src1, #7 |
144 | | - b.eq L(loop_misaligned) |
| 123 | + checking to make sure that we don't access beyond the end of SRC2. */ |
| 124 | + cbz tmp, L(src1_aligned) |
145 | 125 | L(do_misaligned): |
146 | | - ldrb data1w, [src1], #1 |
147 | | - ldrb data2w, [src2], #1 |
148 | | - cmp data1w, #1 |
149 | | - ccmp data1w, data2w, #0, cs /* NZCV = 0b0000. */ |
| 126 | + ldrb data1w, [src1], 1 |
| 127 | + ldrb data2w, [src2], 1 |
| 128 | + cmp data1w, 0 |
| 129 | + ccmp data1w, data2w, 0, ne /* NZCV = 0b0000. */ |
150 | 130 | b.ne L(done) |
151 | | - tst src1, #7 |
| 131 | + tst src1, 7 |
152 | 132 | b.ne L(do_misaligned) |
153 | 133 |
|
154 | | -L(loop_misaligned): |
155 | | - /* Test if we are within the last dword of the end of a 4K page. If |
156 | | - yes then jump back to the misaligned loop to copy a byte at a time. */ |
157 | | - and tmp1, src2, #0xff8 |
158 | | - eor tmp1, tmp1, #0xff8 |
159 | | - cbz tmp1, L(do_misaligned) |
160 | | - ldr data1, [src1], #8 |
161 | | - ldr data2, [src2], #8 |
162 | | - |
163 | | - sub tmp1, data1, zeroones |
164 | | - orr tmp2, data1, #REP8_7f |
165 | | - eor diff, data1, data2 /* Non-zero if differences found. */ |
166 | | - bic has_nul, tmp1, tmp2 /* Non-zero if NUL terminator. */ |
| 134 | +L(src1_aligned): |
| 135 | + neg shift, src2, lsl 3 |
| 136 | + bic src2, src2, 7 |
| 137 | + ldr data3, [src2], 8 |
| 138 | +#ifdef __AARCH64EB__ |
| 139 | + rev data3, data3 |
| 140 | +#endif |
| 141 | + lsr tmp, zeroones, shift |
| 142 | + orr data3, data3, tmp |
| 143 | + sub has_nul, data3, zeroones |
| 144 | + orr tmp, data3, REP8_7f |
| 145 | + bics has_nul, has_nul, tmp |
| 146 | + b.ne L(tail) |
| 147 | + |
| 148 | + sub off1, src2, src1 |
| 149 | + |
| 150 | + .p2align 4 |
| 151 | + |
| 152 | +L(loop_unaligned): |
| 153 | + ldr data3, [src1, off1] |
| 154 | + ldr data2, [src1, off2] |
| 155 | +#ifdef __AARCH64EB__ |
| 156 | + rev data3, data3 |
| 157 | +#endif |
| 158 | + sub has_nul, data3, zeroones |
| 159 | + orr tmp, data3, REP8_7f |
| 160 | + ldr data1, [src1], 8 |
| 161 | + bics has_nul, has_nul, tmp |
| 162 | + ccmp data1, data2, 0, eq |
| 163 | + b.eq L(loop_unaligned) |
| 164 | + |
| 165 | + lsl tmp, has_nul, shift |
| 166 | +#ifdef __AARCH64EB__ |
| 167 | + rev tmp, tmp |
| 168 | +#endif |
| 169 | + eor diff, data1, data2 |
| 170 | + orr syndrome, diff, tmp |
| 171 | + cbnz syndrome, L(end) |
| 172 | +L(tail): |
| 173 | + ldr data1, [src1] |
| 174 | + neg shift, shift |
| 175 | + lsr data2, data3, shift |
| 176 | + lsr has_nul, has_nul, shift |
| 177 | +#ifdef __AARCH64EB__ |
| 178 | + rev data2, data2 |
| 179 | + rev has_nul, has_nul |
| 180 | +#endif |
| 181 | + eor diff, data1, data2 |
167 | 182 | orr syndrome, diff, has_nul |
168 | | - cbz syndrome, L(loop_misaligned) |
169 | 183 | b L(end) |
170 | 184 |
|
171 | 185 | L(done): |
|
0 commit comments