@@ -167,7 +167,7 @@ void xor_arm64_neon_5(unsigned long bytes, unsigned long *p1,
167167 } while (-- lines > 0 );
168168}
169169
170- struct xor_block_template const xor_block_inner_neon = {
170+ struct xor_block_template xor_block_inner_neon __ro_after_init = {
171171 .name = "__inner_neon__" ,
172172 .do_2 = xor_arm64_neon_2 ,
173173 .do_3 = xor_arm64_neon_3 ,
@@ -176,6 +176,151 @@ struct xor_block_template const xor_block_inner_neon = {
176176};
177177EXPORT_SYMBOL (xor_block_inner_neon );
178178
/*
 * Three-way XOR: returns p ^ q ^ r in a single EOR3 instruction
 * (ARMv8.2 SHA3 extension). Emitted via inline asm rather than the
 * compiler intrinsic; ".arch_extension sha3" makes the assembler
 * accept the opcode without requiring a global -march bump.
 * NOTE(review): callers must gate on the SHA3 CPU feature — this
 * helper does no runtime check itself.
 */
static inline uint64x2_t eor3(uint64x2_t p, uint64x2_t q, uint64x2_t r)
{
	uint64x2_t res;

	asm(ARM64_ASM_PREAMBLE ".arch_extension sha3\n"
	    "eor3 %0.16b, %1.16b, %2.16b, %3.16b"
	    : "=w"(res) : "w"(p), "w"(q), "w"(r));
	return res;
}
188+
/*
 * XOR three buffers into the first: p1[i] ^= p2[i] ^ p3[i].
 * Processes one 64-byte line (four 128-bit vectors) per iteration
 * using the EOR3 three-way-XOR helper. bytes is assumed to be a
 * non-zero multiple of 64 (do/while runs at least once).
 */
static void xor_arm64_eor3_3(unsigned long bytes, unsigned long *p1,
			     unsigned long *p2, unsigned long *p3)
{
	uint64_t *d1 = (uint64_t *)p1;
	uint64_t *d2 = (uint64_t *)p2;
	uint64_t *d3 = (uint64_t *)p3;
	long remaining = bytes / (sizeof(uint64x2_t) * 4);

	do {
		uint64x2_t r0, r1, r2, r3;

		/* d1 ^ d2 ^ d3, four vectors at a time */
		r0 = eor3(vld1q_u64(d1 + 0), vld1q_u64(d2 + 0),
			  vld1q_u64(d3 + 0));
		r1 = eor3(vld1q_u64(d1 + 2), vld1q_u64(d2 + 2),
			  vld1q_u64(d3 + 2));
		r2 = eor3(vld1q_u64(d1 + 4), vld1q_u64(d2 + 4),
			  vld1q_u64(d3 + 4));
		r3 = eor3(vld1q_u64(d1 + 6), vld1q_u64(d2 + 6),
			  vld1q_u64(d3 + 6));

		/* write the line back to the destination buffer */
		vst1q_u64(d1 + 0, r0);
		vst1q_u64(d1 + 2, r1);
		vst1q_u64(d1 + 4, r2);
		vst1q_u64(d1 + 6, r3);

		d1 += 8;
		d2 += 8;
		d3 += 8;
	} while (--remaining > 0);
}
221+
/*
 * XOR four buffers into the first: p1[i] ^= p2[i] ^ p3[i] ^ p4[i].
 * EOR3 folds the first three sources; a plain vector XOR folds the
 * fourth. One 64-byte line per iteration; bytes is assumed to be a
 * non-zero multiple of 64 (do/while runs at least once).
 */
static void xor_arm64_eor3_4(unsigned long bytes, unsigned long *p1,
			     unsigned long *p2, unsigned long *p3,
			     unsigned long *p4)
{
	uint64_t *d1 = (uint64_t *)p1;
	uint64_t *d2 = (uint64_t *)p2;
	uint64_t *d3 = (uint64_t *)p3;
	uint64_t *d4 = (uint64_t *)p4;
	long remaining = bytes / (sizeof(uint64x2_t) * 4);

	do {
		uint64x2_t r0, r1, r2, r3;

		/* d1 ^ d2 ^ d3 */
		r0 = eor3(vld1q_u64(d1 + 0), vld1q_u64(d2 + 0),
			  vld1q_u64(d3 + 0));
		r1 = eor3(vld1q_u64(d1 + 2), vld1q_u64(d2 + 2),
			  vld1q_u64(d3 + 2));
		r2 = eor3(vld1q_u64(d1 + 4), vld1q_u64(d2 + 4),
			  vld1q_u64(d3 + 4));
		r3 = eor3(vld1q_u64(d1 + 6), vld1q_u64(d2 + 6),
			  vld1q_u64(d3 + 6));

		/* ... ^ d4 */
		r0 = veorq_u64(r0, vld1q_u64(d4 + 0));
		r1 = veorq_u64(r1, vld1q_u64(d4 + 2));
		r2 = veorq_u64(r2, vld1q_u64(d4 + 4));
		r3 = veorq_u64(r3, vld1q_u64(d4 + 6));

		/* write the line back to the destination buffer */
		vst1q_u64(d1 + 0, r0);
		vst1q_u64(d1 + 2, r1);
		vst1q_u64(d1 + 4, r2);
		vst1q_u64(d1 + 6, r3);

		d1 += 8;
		d2 += 8;
		d3 += 8;
		d4 += 8;
	} while (--remaining > 0);
}
263+
/*
 * XOR five buffers into the first:
 * p1[i] ^= p2[i] ^ p3[i] ^ p4[i] ^ p5[i].
 * Two EOR3 passes fold all four source buffers. One 64-byte line per
 * iteration; bytes is assumed to be a non-zero multiple of 64
 * (do/while runs at least once).
 */
static void xor_arm64_eor3_5(unsigned long bytes, unsigned long *p1,
			     unsigned long *p2, unsigned long *p3,
			     unsigned long *p4, unsigned long *p5)
{
	uint64_t *d1 = (uint64_t *)p1;
	uint64_t *d2 = (uint64_t *)p2;
	uint64_t *d3 = (uint64_t *)p3;
	uint64_t *d4 = (uint64_t *)p4;
	uint64_t *d5 = (uint64_t *)p5;
	long remaining = bytes / (sizeof(uint64x2_t) * 4);

	do {
		uint64x2_t r0, r1, r2, r3;

		/* d1 ^ d2 ^ d3 */
		r0 = eor3(vld1q_u64(d1 + 0), vld1q_u64(d2 + 0),
			  vld1q_u64(d3 + 0));
		r1 = eor3(vld1q_u64(d1 + 2), vld1q_u64(d2 + 2),
			  vld1q_u64(d3 + 2));
		r2 = eor3(vld1q_u64(d1 + 4), vld1q_u64(d2 + 4),
			  vld1q_u64(d3 + 4));
		r3 = eor3(vld1q_u64(d1 + 6), vld1q_u64(d2 + 6),
			  vld1q_u64(d3 + 6));

		/* ... ^ d4 ^ d5 */
		r0 = eor3(r0, vld1q_u64(d4 + 0), vld1q_u64(d5 + 0));
		r1 = eor3(r1, vld1q_u64(d4 + 2), vld1q_u64(d5 + 2));
		r2 = eor3(r2, vld1q_u64(d4 + 4), vld1q_u64(d5 + 4));
		r3 = eor3(r3, vld1q_u64(d4 + 6), vld1q_u64(d5 + 6));

		/* write the line back to the destination buffer */
		vst1q_u64(d1 + 0, r0);
		vst1q_u64(d1 + 2, r1);
		vst1q_u64(d1 + 4, r2);
		vst1q_u64(d1 + 6, r3);

		d1 += 8;
		d2 += 8;
		d3 += 8;
		d4 += 8;
		d5 += 8;
	} while (--remaining > 0);
}
307+
308+ static int __init xor_neon_init (void )
309+ {
310+ if (IS_ENABLED (CONFIG_AS_HAS_SHA3 ) && cpu_have_named_feature (SHA3 )) {
311+ xor_block_inner_neon .do_3 = xor_arm64_eor3_3 ;
312+ xor_block_inner_neon .do_4 = xor_arm64_eor3_4 ;
313+ xor_block_inner_neon .do_5 = xor_arm64_eor3_5 ;
314+ }
315+ return 0 ;
316+ }
317+ module_init (xor_neon_init );
318+
/*
 * Empty exit handler: there is nothing to tear down (init only patched
 * function pointers in a template this module itself owns), but
 * registering an exit function is what makes the module unloadable.
 */
static void __exit xor_neon_exit(void)
{
}
module_exit(xor_neon_exit);
323+
179324MODULE_AUTHOR ("Jackie Liu <liuyun01@kylinos.cn>" );
180325MODULE_DESCRIPTION ("ARMv8 XOR Extensions" );
181326MODULE_LICENSE ("GPL" );