5252# regular CRC code that does not interleave the CRC instructions.
5353#define SMALL_SIZE 200
5454
55- # unsigned int crc_pcl( const u8 *buffer, unsigned int len, unsigned int crc_init );
55+ # u32 crc32c_x86_3way(u32 crc, const u8 *buffer, size_t len);
5656
5757.text
58- SYM_FUNC_START(crc_pcl)
59- #define bufp %rdi
60- #define bufp_d %edi
61- #define len %esi
62- #define crc_init %edx
63- #define crc_init_q %rdx
58+ SYM_FUNC_START(crc32c_x86_3way)
59+ #define crc0 %edi
60+ #define crc0_q %rdi
61+ #define bufp %rsi
62+ #define bufp_d %esi
63+ #define len %rdx
64+ #define len_dw %edx
6465#define n_misaligned %ecx /* overlaps chunk_bytes! */
6566#define n_misaligned_q %rcx
6667#define chunk_bytes %ecx /* overlaps n_misaligned! */
@@ -85,9 +86,9 @@ SYM_FUNC_START(crc_pcl)
8586.Ldo_align:
8687 movq (bufp), %rax
8788 add n_misaligned_q, bufp
88- sub n_misaligned , len
89+ sub n_misaligned_q , len
8990.Lalign_loop:
90- crc32b %al , crc_init # compute crc32 of 1-byte
91+ crc32b %al , crc0 # compute crc32 of 1-byte
9192 shr $8 , %rax # get next byte
9293 dec n_misaligned
9394 jne .Lalign_loop
@@ -102,7 +103,7 @@ SYM_FUNC_START(crc_pcl)
102103
103104.Lpartial_block:
104105 # Compute floor(len / 24) to get num qwords to process from each lane.
105- imul $2731 , len , %eax # 2731 = ceil(2^16 / 24)
106+ imul $2731 , len_dw , %eax # 2731 = ceil(2^16 / 24)
106107 shr $16 , %eax
107108 jmp .Lcrc_3lanes
108109
@@ -125,16 +126,16 @@ SYM_FUNC_START(crc_pcl)
125126 # Unroll the loop by a factor of 4 to reduce the overhead of the loop
126127 # bookkeeping instructions, which can compete with crc32q for the ALUs.
127128.Lcrc_3lanes_4x_loop:
128- crc32q (bufp), crc_init_q
129+ crc32q (bufp), crc0_q
129130 crc32q (bufp,chunk_bytes_q), crc1
130131 crc32q (bufp,chunk_bytes_q,2 ), crc2
131- crc32q 8 (bufp), crc_init_q
132+ crc32q 8 (bufp), crc0_q
132133 crc32q 8 (bufp,chunk_bytes_q), crc1
133134 crc32q 8 (bufp,chunk_bytes_q,2 ), crc2
134- crc32q 16 (bufp), crc_init_q
135+ crc32q 16 (bufp), crc0_q
135136 crc32q 16 (bufp,chunk_bytes_q), crc1
136137 crc32q 16 (bufp,chunk_bytes_q,2 ), crc2
137- crc32q 24 (bufp), crc_init_q
138+ crc32q 24 (bufp), crc0_q
138139 crc32q 24 (bufp,chunk_bytes_q), crc1
139140 crc32q 24 (bufp,chunk_bytes_q,2 ), crc2
140141 add $32 , bufp
@@ -146,15 +147,15 @@ SYM_FUNC_START(crc_pcl)
146147 jz .Lcrc_3lanes_last_qword
147148
148149.Lcrc_3lanes_1x_loop:
149- crc32q (bufp), crc_init_q
150+ crc32q (bufp), crc0_q
150151 crc32q (bufp,chunk_bytes_q), crc1
151152 crc32q (bufp,chunk_bytes_q,2 ), crc2
152153 add $8 , bufp
153154 dec %eax
154155 jnz .Lcrc_3lanes_1x_loop
155156
156157.Lcrc_3lanes_last_qword:
157- crc32q (bufp), crc_init_q
158+ crc32q (bufp), crc0_q
158159 crc32q (bufp,chunk_bytes_q), crc1
159160# SKIP crc32q (bufp,chunk_bytes_q,2), crc2 ; Don't do this one yet: this qword is XORed into the combined value and consumed by the final crc32 below
160161
@@ -165,9 +166,9 @@ SYM_FUNC_START(crc_pcl)
165166 lea (K_table-8 )(%rip ), %rax # first entry is for idx 1
166167 pmovzxdq (%rax ,chunk_bytes_q), %xmm0 # 2 consts: K1:K2
167168 lea (chunk_bytes,chunk_bytes,2 ), %eax # chunk_bytes * 3
168- sub %eax , len # len -= chunk_bytes * 3
169+ sub %rax , len # len -= chunk_bytes * 3
169170
170- movq crc_init_q , %xmm1 # CRC for block 1
171+ movq crc0_q , %xmm1 # CRC for block 1
171172 pclmulqdq $0x00 , %xmm0 , %xmm1 # Multiply by K2
172173
173174 movq crc1, %xmm2 # CRC for block 2
@@ -176,8 +177,8 @@ SYM_FUNC_START(crc_pcl)
176177 pxor %xmm2 ,%xmm1
177178 movq %xmm1 , %rax
178179 xor (bufp,chunk_bytes_q,2 ), %rax
179- mov crc2, crc_init_q
180- crc32 %rax , crc_init_q
180+ mov crc2, crc0_q
181+ crc32 %rax , crc0_q
181182 lea 8 (bufp,chunk_bytes_q,2 ), bufp
182183
183184 ################################################################
@@ -193,34 +194,34 @@ SYM_FUNC_START(crc_pcl)
193194 ## 6) Process any remainder without interleaving:
194195 #######################################################################
195196.Lsmall:
196- test len, len
197+ test len_dw, len_dw
197198 jz .Ldone
198- mov len , %eax
199+ mov len_dw , %eax
199200 shr $3 , %eax
200201 jz .Ldo_dword
201202.Ldo_qwords:
202- crc32q (bufp), crc_init_q
203+ crc32q (bufp), crc0_q
203204 add $8 , bufp
204205 dec %eax
205206 jnz .Ldo_qwords
206207.Ldo_dword:
207- test $4 , len
208+ test $4 , len_dw
208209 jz .Ldo_word
209- crc32l (bufp), crc_init
210+ crc32l (bufp), crc0
210211 add $4 , bufp
211212.Ldo_word:
212- test $2 , len
213+ test $2 , len_dw
213214 jz .Ldo_byte
214- crc32w (bufp), crc_init
215+ crc32w (bufp), crc0
215216 add $2 , bufp
216217.Ldo_byte:
217- test $1 , len
218+ test $1 , len_dw
218219 jz .Ldone
219- crc32b (bufp), crc_init
220+ crc32b (bufp), crc0
220221.Ldone:
221- mov crc_init , %eax
222+ mov crc0 , %eax
222223 RET
223- SYM_FUNC_END(crc_pcl )
224+ SYM_FUNC_END(crc32c_x86_3way )
224225
225226.section .rodata, "a" , @progbits
226227 ################################################################
0 commit comments