Skip to content

Commit 8941e93

Browse files
heiher authored and chenhuacai committed
LoongArch: Optimize memory ops (memset/memcpy/memmove)
To optimize memset()/memcpy()/memmove() and so on, we use a jump table to dispatch cases for short data lengths; for long data lengths, we split the destination into a head part (first 8 bytes), a tail part (last 8 bytes) and a middle part. The head part and tail part may be at unaligned addresses, while the middle part is always aligned (the middle part is allowed to overlap the head/tail part). In this way, the first and last 8 bytes may be unaligned accesses, but we can make sure the data in the middle is processed at an aligned destination address.

We have tested micro-bench[1] on a Loongson-3C5000 16-core machine (2.2GHz):

1. memset

| length | src offset | dst offset | speed before | speed after | % |
|--------|------------|------------|--------------|-------------|---------|
| 8 | 0 | 0 | 696.191 | 1518.785 | 118.16% |
| 8 | 0 | 1 | 696.325 | 1518.937 | 118.14% |
| 50 | 0 | 0 | 969.976 | 8053.902 | 730.32% |
| 50 | 0 | 1 | 970.034 | 8058.475 | 730.74% |
| 300 | 0 | 0 | 5876.612 | 16544.703 | 181.53% |
| 300 | 0 | 1 | 5030.849 | 16549.011 | 228.95% |
| 1200 | 0 | 0 | 11797.077 | 16752.137 | 42.00% |
| 1200 | 0 | 1 | 5687.141 | 16645.233 | 192.68% |
| 4000 | 0 | 0 | 15723.27 | 16761.557 | 6.60% |
| 4000 | 0 | 1 | 5906.114 | 16732.316 | 183.30% |
| 8000 | 0 | 0 | 16751.403 | 16770.002 | 0.11% |
| 8000 | 0 | 1 | 5995.449 | 16754.07 | 179.45% |

2. memcpy

| length | src offset | dst offset | speed before | speed after | % |
|--------|------------|------------|--------------|-------------|---------|
| 8 | 0 | 0 | 696.2 | 1670.605 | 139.96% |
| 8 | 0 | 1 | 696.325 | 1671.138 | 139.99% |
| 50 | 0 | 0 | 969.974 | 8724.999 | 799.51% |
| 50 | 0 | 1 | 970.032 | 8730.138 | 799.98% |
| 300 | 0 | 0 | 5564.662 | 16272.652 | 192.43% |
| 300 | 0 | 1 | 4670.436 | 14972.842 | 220.59% |
| 1200 | 0 | 0 | 10740.23 | 16751.728 | 55.97% |
| 1200 | 0 | 1 | 5027.741 | 14874.564 | 195.85% |
| 4000 | 0 | 0 | 15122.367 | 16737.642 | 10.68% |
| 4000 | 0 | 1 | 5536.918 | 14890.397 | 168.93% |
| 8000 | 0 | 0 | 16505.453 | 16553.543 | 0.29% |
| 8000 | 0 | 1 | 5821.619 | 14841.804 | 154.94% |

3. memmove

| length | src offset | dst offset | speed before | speed after | % |
|--------|------------|------------|--------------|-------------|---------|
| 8 | 0 | 0 | 982.693 | 1670.568 | 70.00% |
| 8 | 0 | 1 | 983.023 | 1671.174 | 70.00% |
| 50 | 0 | 0 | 1230.87 | 8727.625 | 609.06% |
| 50 | 0 | 1 | 1232.515 | 8730.138 | 608.32% |
| 300 | 0 | 0 | 6490.375 | 16296.993 | 151.09% |
| 300 | 0 | 1 | 4282.687 | 14972.842 | 249.61% |
| 1200 | 0 | 0 | 11742.755 | 16752.546 | 42.66% |
| 1200 | 0 | 1 | 5039.338 | 14872.951 | 195.14% |
| 4000 | 0 | 0 | 15467.786 | 16737.09 | 8.21% |
| 4000 | 0 | 1 | 5009.905 | 14890.542 | 197.22% |
| 8000 | 0 | 0 | 16489.664 | 16553.273 | 0.39% |
| 8000 | 0 | 1 | 5823.786 | 14858.646 | 155.14% |

* speed: MB/s
* length: byte

[1] https://github.com/heiher/mem-bench

Signed-off-by: WANG Rui <wangrui@loongson.cn>
Signed-off-by: Huacai Chen <chenhuacai@loongson.cn>
1 parent 2b3bd32 commit 8941e93

5 files changed

Lines changed: 603 additions & 167 deletions

File tree

arch/loongarch/lib/clear_user.S

Lines changed: 121 additions & 15 deletions
Original file line number | Diff line number | Diff line change
@@ -13,7 +13,14 @@
1313

1414
.irp to, 0, 1, 2, 3, 4, 5, 6, 7
1515
.L_fixup_handle_\to\():
16-
addi.d a0, a1, (\to) * (-8)
16+
sub.d a0, a2, a0
17+
addi.d a0, a0, (\to) * (-8)
18+
jr ra
19+
.endr
20+
21+
.irp to, 0, 2, 4
22+
.L_fixup_handle_s\to\():
23+
addi.d a0, a1, -\to
1724
jr ra
1825
.endr
1926

@@ -44,7 +51,7 @@ SYM_FUNC_START(__clear_user_generic)
4451
2: move a0, a1
4552
jr ra
4653

47-
_asm_extable 1b, .L_fixup_handle_0
54+
_asm_extable 1b, .L_fixup_handle_s0
4855
SYM_FUNC_END(__clear_user_generic)
4956

5057
/*
@@ -54,12 +61,21 @@ SYM_FUNC_END(__clear_user_generic)
5461
* a1: size
5562
*/
5663
SYM_FUNC_START(__clear_user_fast)
57-
beqz a1, 10f
64+
sltui t0, a1, 9
65+
bnez t0, .Lsmall
5866

59-
ori a2, zero, 64
60-
blt a1, a2, 9f
67+
add.d a2, a0, a1
68+
0: st.d zero, a0, 0
69+
70+
/* align up address */
71+
addi.d a0, a0, 8
72+
bstrins.d a0, zero, 2, 0
73+
74+
addi.d a3, a2, -64
75+
bgeu a0, a3, .Llt64
6176

6277
/* set 64 bytes at a time */
78+
.Lloop64:
6379
1: st.d zero, a0, 0
6480
2: st.d zero, a0, 8
6581
3: st.d zero, a0, 16
@@ -68,24 +84,95 @@ SYM_FUNC_START(__clear_user_fast)
6884
6: st.d zero, a0, 40
6985
7: st.d zero, a0, 48
7086
8: st.d zero, a0, 56
71-
7287
addi.d a0, a0, 64
73-
addi.d a1, a1, -64
74-
bge a1, a2, 1b
75-
76-
beqz a1, 10f
88+
bltu a0, a3, .Lloop64
7789

7890
/* set the remaining bytes */
79-
9: st.b zero, a0, 0
80-
addi.d a0, a0, 1
81-
addi.d a1, a1, -1
82-
bgt a1, zero, 9b
91+
.Llt64:
92+
addi.d a3, a2, -32
93+
bgeu a0, a3, .Llt32
94+
9: st.d zero, a0, 0
95+
10: st.d zero, a0, 8
96+
11: st.d zero, a0, 16
97+
12: st.d zero, a0, 24
98+
addi.d a0, a0, 32
99+
100+
.Llt32:
101+
addi.d a3, a2, -16
102+
bgeu a0, a3, .Llt16
103+
13: st.d zero, a0, 0
104+
14: st.d zero, a0, 8
105+
addi.d a0, a0, 16
106+
107+
.Llt16:
108+
addi.d a3, a2, -8
109+
bgeu a0, a3, .Llt8
110+
15: st.d zero, a0, 0
111+
112+
.Llt8:
113+
16: st.d zero, a2, -8
83114

84115
/* return */
85-
10: move a0, a1
116+
move a0, zero
117+
jr ra
118+
119+
.align 4
120+
.Lsmall:
121+
pcaddi t0, 4
122+
slli.d a2, a1, 4
123+
add.d t0, t0, a2
124+
jr t0
125+
126+
.align 4
127+
move a0, zero
128+
jr ra
129+
130+
.align 4
131+
17: st.b zero, a0, 0
132+
move a0, zero
133+
jr ra
134+
135+
.align 4
136+
18: st.h zero, a0, 0
137+
move a0, zero
138+
jr ra
139+
140+
.align 4
141+
19: st.h zero, a0, 0
142+
20: st.b zero, a0, 2
143+
move a0, zero
144+
jr ra
145+
146+
.align 4
147+
21: st.w zero, a0, 0
148+
move a0, zero
149+
jr ra
150+
151+
.align 4
152+
22: st.w zero, a0, 0
153+
23: st.b zero, a0, 4
154+
move a0, zero
155+
jr ra
156+
157+
.align 4
158+
24: st.w zero, a0, 0
159+
25: st.h zero, a0, 4
160+
move a0, zero
161+
jr ra
162+
163+
.align 4
164+
26: st.w zero, a0, 0
165+
27: st.w zero, a0, 3
166+
move a0, zero
167+
jr ra
168+
169+
.align 4
170+
28: st.d zero, a0, 0
171+
move a0, zero
86172
jr ra
87173

88174
/* fixup and ex_table */
175+
_asm_extable 0b, .L_fixup_handle_0
89176
_asm_extable 1b, .L_fixup_handle_0
90177
_asm_extable 2b, .L_fixup_handle_1
91178
_asm_extable 3b, .L_fixup_handle_2
@@ -95,4 +182,23 @@ SYM_FUNC_START(__clear_user_fast)
95182
_asm_extable 7b, .L_fixup_handle_6
96183
_asm_extable 8b, .L_fixup_handle_7
97184
_asm_extable 9b, .L_fixup_handle_0
185+
_asm_extable 10b, .L_fixup_handle_1
186+
_asm_extable 11b, .L_fixup_handle_2
187+
_asm_extable 12b, .L_fixup_handle_3
188+
_asm_extable 13b, .L_fixup_handle_0
189+
_asm_extable 14b, .L_fixup_handle_1
190+
_asm_extable 15b, .L_fixup_handle_0
191+
_asm_extable 16b, .L_fixup_handle_0 /* .Llt8 can be entered with a2 - a0 <= 8, so handle_1 (a2 - a0 - 8) would return 0 or underflow; handle_0 returns a2 - a0, an upper bound on the bytes left uncleared */
192+
_asm_extable 17b, .L_fixup_handle_s0
193+
_asm_extable 18b, .L_fixup_handle_s0
194+
_asm_extable 19b, .L_fixup_handle_s0
195+
_asm_extable 20b, .L_fixup_handle_s2
196+
_asm_extable 21b, .L_fixup_handle_s0
197+
_asm_extable 22b, .L_fixup_handle_s0
198+
_asm_extable 23b, .L_fixup_handle_s4
199+
_asm_extable 24b, .L_fixup_handle_s0
200+
_asm_extable 25b, .L_fixup_handle_s4
201+
_asm_extable 26b, .L_fixup_handle_s0
202+
_asm_extable 27b, .L_fixup_handle_s4
203+
_asm_extable 28b, .L_fixup_handle_s0
98204
SYM_FUNC_END(__clear_user_fast)

0 commit comments

Comments
 (0)