Skip to content

Commit 9e51caf

Browse files
rmurphy-arm authored and willdeacon committed
arm64: Better optimised memchr()
Although we implement our own assembly version of memchr(), it turns out to be barely any better than what GCC can generate for the generic C version (and would go wrong if the size_t argument were ever large enough to be interpreted as negative). Unfortunately we can't import the tuned implementation from the Arm optimized-routines library, since that has some Advanced SIMD parts which are not really viable for general kernel library code. What we can do, however, is pep things up with some relatively straightforward word-at-a-time logic for larger calls. Adding some timing to optimized-routines' memchr() test for a simple benchmark, overall this version comes in around half as fast as the SIMD code, but still nearly 4x faster than our existing implementation. Signed-off-by: Robin Murphy <robin.murphy@arm.com> Link: https://lore.kernel.org/r/58471b42f9287e039dafa9e5e7035077152438fd.1622128527.git.robin.murphy@arm.com Signed-off-by: Will Deacon <will@kernel.org>
1 parent 2851330 commit 9e51caf

1 file changed

Lines changed: 53 additions & 12 deletions

File tree

arch/arm64/lib/memchr.S

Lines changed: 53 additions & 12 deletions
Original file line numberDiff line numberDiff line change
/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Copyright (C) 2021 Arm Ltd.
 */

#include <linux/linkage.h>
@@ -19,16 +16,60 @@
1916
* Returns:
2017
* x0 - address of first occurrence of 'c' or 0
2118
*/
#define L(label) .L ## label

/*
 * SWAR (SIMD-within-a-register) constants: REP8_01 broadcasts 0x01 to
 * every byte lane; REP8_7f is used by the classic has-zero-byte trick
 * (x - 0x01..01) & ~x & 0x80..80 to flag lanes that became zero.
 */
#define REP8_01 0x0101010101010101
#define REP8_7f 0x7f7f7f7f7f7f7f7f

/* Arguments (AAPCS64): x0 = buf, w1 = c, x2 = n; result returned in x0. */
#define srcin		x0
#define chrin		w1
#define cntin		x2

#define result		x0

/* Scratch registers (all caller-saved). */
#define wordcnt		x3
#define rep01		x4
#define repchr		x5
#define cur_word	x6
#define cur_byte	w6
#define tmp		x7
#define tmp2		x8

	.p2align 4
	nop
SYM_FUNC_START_WEAK_PI(memchr)
	and	chrin, chrin, #0xff	// memchr() compares as unsigned char
	lsr	wordcnt, cntin, #3	// number of whole 8-byte words
	cbz	wordcnt, L(byte_loop)	// short buffers: bytewise only
	mov	rep01, #REP8_01
	mul	repchr, x1, rep01	// broadcast c into every byte lane
	and	cntin, cntin, #7	// bytes left over after the word loop
L(word_loop):
	ldr	cur_word, [srcin], #8
	sub	wordcnt, wordcnt, #1
	eor	cur_word, cur_word, repchr	// lanes equal to c become 0x00
	sub	tmp, cur_word, rep01
	orr	tmp2, cur_word, #REP8_7f
	bics	tmp, tmp, tmp2		// 0x80 set in each lane that held c
	b.ne	L(found_word)
	cbnz	wordcnt, L(word_loop)
L(byte_loop):				// handle the 0-7 byte tail
	cbz	cntin, L(not_found)
	ldrb	cur_byte, [srcin], #1
	sub	cntin, cntin, #1
	cmp	cur_byte, chrin
	b.ne	L(byte_loop)
	sub	srcin, srcin, #1	// undo the post-increment past the match
	ret
L(found_word):
CPU_LE(	rev	tmp, tmp)		// on LE, move first match to the MSBs
	clz	tmp, tmp		// bit offset of first matching lane
	sub	tmp, tmp, #64		// negative offset from (end of word + 1)
	add	result, srcin, tmp, asr #3	// back up to the matching byte
	ret
L(not_found):
	mov	result, #0		// kernel convention: return 0, not NULL ptr arith
	ret
SYM_FUNC_END_PI(memchr)
EXPORT_SYMBOL_NOKASAN(memchr)

0 commit comments

Comments
 (0)