11/* SPDX-License-Identifier: GPL-2.0-only */
22/*
3- * Copyright (c) 2013-2021, Arm Limited.
3+ * Copyright (c) 2013-2022, Arm Limited.
44 *
55 * Adapted from the original at:
6- * https://github.com/ARM-software/optimized-routines/blob/e823e3abf5f89ecb/string/aarch64/strncmp.S
6+ * https://github.com/ARM-software/optimized-routines/blob/189dfefe37d54c5b/string/aarch64/strncmp.S
77 */
88
99#include <linux/linkage.h>
1010#include <asm/assembler.h>
1111
1212/* Assumptions:
1313 *
14- * ARMv8-a, AArch64
14+ * ARMv8-a, AArch64.
15+ * MTE compatible.
1516 */
1617
1718#define L(label) .L ## label
1819
1920#define REP8_01 0x0101010101010101
2021#define REP8_7f 0x7f7f7f7f7f7f7f7f
21- #define REP8_80 0x8080808080808080
2222
2323/* Parameters and result. */
2424#define src1 x0
3939#define tmp3 x10
4040#define zeroones x11
4141#define pos x12
42- #define limit_wd x13
43- #define mask x14
44- #define endloop x15
42+ #define mask x13
43+ #define endloop x14
4544#define count mask
45+ #define offset pos
46+ #define neg_offset x15
47+
48+ /* Define endian dependent shift operations.
49+    On big-endian early bytes are at MSB and on little-endian LSB.
50+    LS_FW means shifting towards early bytes.
51+    LS_BK means shifting towards later bytes.
52+ */
53+ #ifdef __AARCH64EB__
54+ #define LS_FW lsl
55+ #define LS_BK lsr
56+ #else
57+ #define LS_FW lsr
58+ #define LS_BK lsl
59+ #endif
4660
4761SYM_FUNC_START_WEAK_PI(strncmp)
4862 cbz limit , L(ret0)
@@ -52,9 +66,6 @@ SYM_FUNC_START_WEAK_PI(strncmp)
5266 and count , src1 , # 7
5367 b.ne L(misaligned8)
5468 cbnz count , L(mutual_align)
55- / * Calculate the number of full and partial words - 1 . * /
56- sub limit_wd , limit , # 1 / * limit != 0 , so no underflow. * /
57- lsr limit_wd , limit_wd , # 3 / * Convert to Dwords. * /
5869
5970 / * NUL detection works on the principle th at (X - 1 ) & (~X) & 0x80
6071 (=> (X - 1 ) & ~(X | 0x7f )) is non - zero iff a byte is zero , and
@@ -64,56 +75,52 @@ L(loop_aligned):
6475 ldr data1 , [ src1 ], # 8
6576 ldr data2 , [ src2 ], # 8
6677L(start_realigned):
67- subs limit_wd , limit_wd , # 1
78+ subs limit , limit , # 8
6879 sub tmp1 , data1 , zeroones
6980 orr tmp2 , data1 , #REP8_7f
7081 eor diff , data1 , data2 / * Non - zero if differences found. * /
71- csinv endloop , diff , xzr , pl / * Last Dword or differences. * /
82+ csinv endloop , diff , xzr , hi / * Last Dword or differences. * /
7283 bics has_nul , tmp1 , tmp2 / * Non - zero if NUL terminator. * /
7384 ccmp endloop , # 0 , # 0 , eq
7485 b.eq L(loop_aligned)
7586 / * End of main loop * /
7687
77- / * Not reached the limit , must have found the end or a diff. * /
78- tbz limit_wd , # 63 , L(not_limit)
79-
80- / * Limit % 8 == 0 => all bytes significant. * /
81- ands limit , limit , # 7
82- b.eq L(not_limit)
83-
84- lsl limit , limit , # 3 / * Bits - > bytes. * /
85- mov mask , #~ 0
86- #ifdef __AARCH64EB__
87- lsr mask , mask , limit
88- #else
89- lsl mask , mask , limit
90- #endif
91- bic data1 , data1 , mask
92- bic data2 , data2 , mask
93-
94- / * Make sure th at the NUL byte is marked in the syndrome. * /
95- orr has_nul , has_nul , mask
96-
97- L(not_limit):
88+ L(full_check):
89+ #ifndef __AARCH64EB__
9890 orr syndrome , diff , has_nul
99-
100- #ifndef __AARCH64EB__
91+ add limit , limit , 8 / * Rewind limit to before last subs. * /
92+ L(syndrome_check):
93+ / * Limit was reached. Check if the NUL byte or the difference
94+ is before the limit. * /
10195 rev syndrome , syndrome
10296 rev data1 , data1
103- / * The MS - non - zero bit of the syndrome marks either the first bit
104- th at is different , or the top bit of the first zero byte.
105- Shifting left now will bring the critical information into the
106- top bits. * /
10797 clz pos , syndrome
10898 rev data2 , data2
10999 lsl data1 , data1 , pos
100+ cmp limit , pos , lsr # 3
110101 lsl data2 , data2 , pos
111102 / * But we need to zero - extend (char is unsigned) the value and then
112103 perform a signed 32 - bit subtraction. * /
113104 lsr data1 , data1 , # 56
114105 sub result , data1 , data2 , lsr # 56
106+ csel result , result , xzr , hi
115107 ret
116108#else
109+ / * Not reached the limit , must have found the end or a diff. * /
110+ tbz limit , # 63 , L(not_limit)
111+ add tmp1 , limit , 8
112+ cbz limit , L(not_limit)
113+
114+ lsl limit , tmp1 , # 3 / * Bits - > bytes. * /
115+ mov mask , #~ 0
116+ lsr mask , mask , limit
117+ bic data1 , data1 , mask
118+ bic data2 , data2 , mask
119+
120+ / * Make sure th at the NUL byte is marked in the syndrome. * /
121+ orr has_nul , has_nul , mask
122+
123+ L(not_limit):
117124 / * For big - endian we cannot use the trick with the syndrome value
118125 as carry - propagation can corrupt the upper bits if the trailing
119126 bytes in the string contain 0x01 . * /
@@ -134,10 +141,11 @@ L(not_limit):
134141 rev has_nul , has_nul
135142 orr syndrome , diff , has_nul
136143 clz pos , syndrome
137- / * The MS - non - zero bit of the syndrome marks either the first bit
138- th at is different , or the top bit of the first zero byte.
144+ / * The most - significant - non - zero bit of the syndrome marks either the
145+ first bit th at is different , or the top bit of the first zero byte.
139146 Shifting left now will bring the critical information into the
140147 top bits. * /
148+ L(end_quick):
141149 lsl data1 , data1 , pos
142150 lsl data2 , data2 , pos
143151 / * But we need to zero - extend (char is unsigned) the value and then
@@ -159,22 +167,12 @@ L(mutual_align):
159167 neg tmp3 , count , lsl # 3 / * 64 - bits(bytes beyond align). * /
160168 ldr data2 , [ src2 ], # 8
161169 mov tmp2 , #~ 0
162- sub limit_wd , limit , # 1 / * limit != 0 , so no underflow. * /
163- #ifdef __AARCH64EB__
164- / * Big - endian. Early bytes are at MSB. * /
165- lsl tmp2 , tmp2 , tmp3 / * Shift (count & 63 ). * /
166- #else
167- / * Little - endian. Early bytes are at LSB. * /
168- lsr tmp2 , tmp2 , tmp3 / * Shift (count & 63 ). * /
169- #endif
170- and tmp3 , limit_wd , # 7
171- lsr limit_wd , limit_wd , # 3
172- / * Adjust the limit. Only low 3 bits used , so overflow irrelevant. * /
173- add limit , limit , count
174- add tmp3 , tmp3 , count
170+ LS_FW tmp2 , tmp2 , tmp3 / * Shift (count & 63 ). * /
171+ / * Adjust the limit and ensure it doesn't overflow. * /
172+ adds limit , limit , count
173+ csinv limit , limit , xzr , lo
175174 orr data1 , data1 , tmp2
176175 orr data2 , data2 , tmp2
177- add limit_wd , limit_wd , tmp3 , lsr # 3
178176 b L(start_realigned)
179177
180178 .p2align 4
@@ -197,13 +195,11 @@ L(done):
197195 / * Align the SRC1 to a dword by doing a bytewise compare and then do
198196 the dword loop . * /
199197L(try_misaligned_words):
200- lsr limit_wd , limit , # 3
201- cbz count , L(do_misaligned)
198+ cbz count , L(src1_aligned)
202199
203200 neg count , count
204201 and count , count , # 7
205202 sub limit , limit , count
206- lsr limit_wd , limit , # 3
207203
208204L(page_end_loop):
209205 ldrb data1w , [ src1 ], # 1
@@ -214,48 +210,100 @@ L(page_end_loop):
214210 subs count , count , # 1
215211 b.hi L(page_end_loop)
216212
217- L(do_misaligned):
218- / * Prepare ourselves for the next page crossing. Unlike the aligned
219- loop , we fetch 1 less dword because we risk crossing bounds on
220- SRC2. * /
221- mov count , # 8
222- subs limit_wd , limit_wd , # 1
223- b.lo L(done_loop)
224- L(loop_misaligned) :
225- and tmp2 , src2 , # 0xff8
226- eor tmp2 , tmp2 , # 0xff8
227- cbz tmp2 , L(page_end_loop)
213+ / * The following diagram explains the comparison of misaligned strings.
214+ The bytes are shown in natural order. For little - endian , it is
215+ reversed in the registers. The "x" bytes are before the string.
216+ The "|" separates data th at is loaded at one time.
217+ src1 | a a a a a a a a | b b b c c c c c | . . .
218+ src2 | x x x x x a a a a a a a a b b b | c c c c c . . .
219+
220+ After shifting in each step , the data looks like this :
221+ STEP_A STEP_B STEP_C
222+ data1 a a a a a a a a b b b c c c c c b b b c c c c c
223+ data2 a a a a a a a a b b b 0 0 0 0 0 0 0 0 c c c c c
228224
225+ The bytes with "0" are eliminated from the syndrome via mask.
226+
227+ Align SRC2 down to 16 bytes. This way we can read 16 bytes at a
228+ time from SRC2. The comparison happens in 3 steps. After each step
229+ the loop can exit , or read from SRC1 or SRC2. * /
230+ L(src1_aligned):
231+ / * Calculate offset from 8 byte alignment to string start in bits. No
232+ need to mask offset since shifts are ignoring upper bits. * /
233+ lsl offset , src2 , # 3
234+ bic src2 , src2 , # 0xf
235+ mov mask , - 1
236+ neg neg_offset , offset
229237 ldr data1 , [ src1 ], # 8
230- ldr data2 , [ src2 ], # 8
231- sub tmp1 , data1 , zeroones
232- orr tmp2 , data1 , #REP8_7f
233- eor diff , data1 , data2 / * Non - zero if differences found. * /
234- bics has_nul , tmp1 , tmp2 / * Non - zero if NUL terminator. * /
235- ccmp diff , # 0 , # 0 , eq
236- b.ne L(not_limit)
237- subs limit_wd , limit_wd , # 1
238- b.pl L(loop_misaligned)
238+ ldp tmp1 , tmp2 , [ src2 ], # 16
239+ LS_BK mask , mask , neg_offset
240+ and neg_offset , neg_offset , # 63 / * Need actual value for cmp later. * /
241+ / * Skip the first compare if data in tmp1 is irrelevant. * /
242+ tbnz offset , 6 , L(misaligned_mid_loop)
239243
240- L(done_loop):
241- / * We found a difference or a NULL before the limit was reached. * /
242- and limit , limit , # 7
243- cbz limit , L(not_limit)
244- / * Read the last word. * /
245- sub src1 , src1 , 8
246- sub src2 , src2 , 8
247- ldr data1 , [ src1 , limit ]
248- ldr data2 , [ src2 , limit ]
249- sub tmp1 , data1 , zeroones
250- orr tmp2 , data1 , #REP8_7f
244+ L(loop_misaligned):
245+ / * STEP_A: Compare full 8 bytes when there is enough data from SRC2. * /
246+ LS_FW data2 , tmp1 , offset
247+ LS_BK tmp1 , tmp2 , neg_offset
248+ subs limit , limit , # 8
249+ orr data2 , data2 , tmp1 / * 8 bytes from SRC2 combined from two regs. * /
250+ sub has_nul , data1 , zeroones
251251 eor diff , data1 , data2 / * Non - zero if differences found. * /
252- bics has_nul , tmp1 , tmp2 / * Non - zero if NUL terminator. * /
253- ccmp diff , # 0 , # 0 , eq
254- b.ne L(not_limit)
252+ orr tmp3 , data1 , #REP8_7f
253+ csinv endloop , diff , xzr , hi / * If limit , set to all ones. * /
254+ bic has_nul , has_nul , tmp3 / * Non - zero if NUL byte found in SRC1. * /
255+ orr tmp3 , endloop , has_nul
256+ cbnz tmp3 , L(full_check)
257+
258+ ldr data1 , [ src1 ], # 8
259+ L(misaligned_mid_loop):
260+ / * STEP_B: Compare first part of data1 to second part of tmp2. * /
261+ LS_FW data2 , tmp2 , offset
262+ #ifdef __AARCH64EB__
263+ / * For big - endian we do a byte reverse to avoid carry - propagation
264+ problem described above. This way we can reuse the has_nul in the
265+ next step and also use syndrome value trick at the end. * /
266+ rev tmp3 , data1
267+ #define data1_fixed tmp3
268+ #else
269+ #define data1_fixed data1
270+ #endif
271+ sub has_nul , data1_fixed , zeroones
272+ orr tmp3 , data1_fixed , #REP8_7f
273+ eor diff , data2 , data1 / * Non - zero if differences found. * /
274+ bic has_nul , has_nul , tmp3 / * Non - zero if NUL terminator. * /
275+ #ifdef __AARCH64EB__
276+ rev has_nul , has_nul
277+ #endif
278+ cmp limit , neg_offset , lsr # 3
279+ orr syndrome , diff , has_nul
280+ bic syndrome , syndrome , mask / * Ignore later bytes. * /
281+ csinv tmp3 , syndrome , xzr , hi / * If limit , set to all ones. * /
282+ cbnz tmp3 , L(syndrome_check)
283+
284+ / * STEP_C: Compare second part of data1 to first part of tmp1. * /
285+ ldp tmp1 , tmp2 , [ src2 ], # 16
286+ cmp limit , # 8
287+ LS_BK data2 , tmp1 , neg_offset
288+ eor diff , data2 , data1 / * Non - zero if differences found. * /
289+ orr syndrome , diff , has_nul
290+ and syndrome , syndrome , mask / * Ignore earlier bytes. * /
291+ csinv tmp3 , syndrome , xzr , hi / * If limit , set to all ones. * /
292+ cbnz tmp3 , L(syndrome_check)
293+
294+ ldr data1 , [ src1 ], # 8
295+ sub limit , limit , # 8
296+ b L(loop_misaligned)
297+
298+ #ifdef __AARCH64EB__
299+ L(syndrome_check):
300+ clz pos , syndrome
301+ cmp pos , limit , lsl # 3
302+ b.lo L(end_quick)
303+ #endif
255304
256305L(ret0):
257306 mov result , # 0
258307 ret
259-
260308SYM_FUNC_END_PI(strncmp)
261309EXPORT_SYMBOL_NOHWKASAN(strncmp)
0 commit comments