Commit 34c3f1e

Eric Biggers committed
lib/crypto: arm64/sha256: Add support for 2-way interleaved hashing

Add an implementation of sha256_finup_2x_arch() for arm64.  It
interleaves the computation of two SHA-256 hashes using the ARMv8
SHA-256 instructions.  dm-verity and fs-verity will take advantage of
this for greatly improved performance on capable CPUs.

This increases the throughput of SHA-256 hashing 4096-byte messages by
the following amounts on the following CPUs:

    ARM Cortex-X1:    70%
    ARM Cortex-X3:    68%
    ARM Cortex-A76:   65%
    ARM Cortex-A715:  43%
    ARM Cortex-A510:  25%
    ARM Cortex-A55:    8%

Reviewed-by: Ard Biesheuvel <ardb@kernel.org>
Link: https://lore.kernel.org/r/20250915160819.140019-3-ebiggers@kernel.org
Signed-off-by: Eric Biggers <ebiggers@kernel.org>
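For context, a minimal sketch of how a dm-verity- or fs-verity-style caller
could use the 2-way interleaved path.  This assumes the generic
sha256_finup_2x() wrapper and struct sha256_ctx API from the companion
lib/crypto patch (see the Link above); the helper hash_two_blocks() itself is
hypothetical:

	#include <crypto/sha2.h>

	/*
	 * Hypothetical caller sketch: finish the SHA-256 hashes of two
	 * equal-length data blocks from a shared starting context (e.g. one
	 * that has already absorbed a salt), using one interleaved pass.
	 */
	static void hash_two_blocks(const struct sha256_ctx *salted_ctx,
				    const u8 *blk1, const u8 *blk2, size_t len,
				    u8 digest1[SHA256_DIGEST_SIZE],
				    u8 digest2[SHA256_DIGEST_SIZE])
	{
		/*
		 * On CPUs without the SHA-256 instructions, the wrapper is
		 * expected to fall back to hashing the two messages
		 * sequentially.
		 */
		sha256_finup_2x(salted_ctx, blk1, blk2, len, digest1, digest2);
	}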
1 parent 4ca24d6 commit 34c3f1e

2 files changed: 315 additions & 6 deletions


lib/crypto/arm64/sha256-ce.S

Lines changed: 278 additions & 6 deletions
@@ -70,18 +70,22 @@
 	.word		0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208
 	.word		0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2
 
+	.macro	load_round_constants	tmp
+	adr_l		\tmp, .Lsha2_rcon
+	ld1		{ v0.4s- v3.4s}, [\tmp], #64
+	ld1		{ v4.4s- v7.4s}, [\tmp], #64
+	ld1		{ v8.4s-v11.4s}, [\tmp], #64
+	ld1		{v12.4s-v15.4s}, [\tmp]
+	.endm
+
 /*
  * size_t __sha256_ce_transform(struct sha256_block_state *state,
  *				const u8 *data, size_t nblocks);
  */
 	.text
 SYM_FUNC_START(__sha256_ce_transform)
-	/* load round constants */
-	adr_l		x8, .Lsha2_rcon
-	ld1		{ v0.4s- v3.4s}, [x8], #64
-	ld1		{ v4.4s- v7.4s}, [x8], #64
-	ld1		{ v8.4s-v11.4s}, [x8], #64
-	ld1		{v12.4s-v15.4s}, [x8]
+
+	load_round_constants	x8
 
 	/* load state */
 	ld1		{dgav.4s, dgbv.4s}, [x0]
@@ -134,3 +138,271 @@ CPU_LE(	rev32		v19.16b, v19.16b	)
 	mov		x0, x2
 	ret
 SYM_FUNC_END(__sha256_ce_transform)
+
+	.unreq		dga
+	.unreq		dgav
+	.unreq		dgb
+	.unreq		dgbv
+	.unreq		t0
+	.unreq		t1
+	.unreq		dg0q
+	.unreq		dg0v
+	.unreq		dg1q
+	.unreq		dg1v
+	.unreq		dg2q
+	.unreq		dg2v
+
+	// parameters for sha256_ce_finup2x()
+	ctx		.req	x0
+	data1		.req	x1
+	data2		.req	x2
+	len		.req	w3
+	out1		.req	x4
+	out2		.req	x5
+
+	// other scalar variables
+	count		.req	x6
+	final_step	.req	w7
+
+	// x8-x9 are used as temporaries.
+
+	// v0-v15 are used to cache the SHA-256 round constants.
+	// v16-v19 are used for the message schedule for the first message.
+	// v20-v23 are used for the message schedule for the second message.
+	// v24-v31 are used for the state and temporaries as given below.
+	// *_a are for the first message and *_b for the second.
+	state0_a_q	.req	q24
+	state0_a	.req	v24
+	state1_a_q	.req	q25
+	state1_a	.req	v25
+	state0_b_q	.req	q26
+	state0_b	.req	v26
+	state1_b_q	.req	q27
+	state1_b	.req	v27
+	t0_a		.req	v28
+	t0_b		.req	v29
+	t1_a_q		.req	q30
+	t1_a		.req	v30
+	t1_b_q		.req	q31
+	t1_b		.req	v31
+
+#define OFFSETOF_BYTECOUNT	32	// offsetof(struct __sha256_ctx, bytecount)
+#define OFFSETOF_BUF		40	// offsetof(struct __sha256_ctx, buf)
+// offsetof(struct __sha256_ctx, state) is assumed to be 0.
+
+	// Do 4 rounds of SHA-256 for each of two messages (interleaved).  m0_a
+	// and m0_b contain the current 4 message schedule words for the first
+	// and second message respectively.
+	//
+	// If not all the message schedule words have been computed yet, then
+	// this also computes 4 more message schedule words for each message.
+	// m1_a-m3_a contain the next 3 groups of 4 message schedule words for
+	// the first message, and likewise m1_b-m3_b for the second.  After
+	// consuming the current value of m0_a, this macro computes the group
+	// after m3_a and writes it to m0_a, and likewise for *_b.  This means
+	// that the next (m0_a, m1_a, m2_a, m3_a) is the current (m1_a, m2_a,
+	// m3_a, m0_a), and likewise for *_b, so the caller must cycle through
+	// the registers accordingly.
+	.macro	do_4rounds_2x	i, k,  m0_a, m1_a, m2_a, m3_a,  \
+				       m0_b, m1_b, m2_b, m3_b
+	add		t0_a\().4s, \m0_a\().4s, \k\().4s
+	add		t0_b\().4s, \m0_b\().4s, \k\().4s
+	.if \i < 48
+	sha256su0	\m0_a\().4s, \m1_a\().4s
+	sha256su0	\m0_b\().4s, \m1_b\().4s
+	sha256su1	\m0_a\().4s, \m2_a\().4s, \m3_a\().4s
+	sha256su1	\m0_b\().4s, \m2_b\().4s, \m3_b\().4s
+	.endif
+	mov		t1_a.16b, state0_a.16b
+	mov		t1_b.16b, state0_b.16b
+	sha256h		state0_a_q, state1_a_q, t0_a\().4s
+	sha256h		state0_b_q, state1_b_q, t0_b\().4s
+	sha256h2	state1_a_q, t1_a_q, t0_a\().4s
+	sha256h2	state1_b_q, t1_b_q, t0_b\().4s
+	.endm
+
+	.macro	do_16rounds_2x	i, k0, k1, k2, k3
+	do_4rounds_2x	\i + 0,  \k0,  v16, v17, v18, v19,  v20, v21, v22, v23
+	do_4rounds_2x	\i + 4,  \k1,  v17, v18, v19, v16,  v21, v22, v23, v20
+	do_4rounds_2x	\i + 8,  \k2,  v18, v19, v16, v17,  v22, v23, v20, v21
+	do_4rounds_2x	\i + 12, \k3,  v19, v16, v17, v18,  v23, v20, v21, v22
+	.endm
+
+//
+// void sha256_ce_finup2x(const struct __sha256_ctx *ctx,
+//			  const u8 *data1, const u8 *data2, int len,
+//			  u8 out1[SHA256_DIGEST_SIZE],
+//			  u8 out2[SHA256_DIGEST_SIZE]);
+//
+// This function computes the SHA-256 digests of two messages |data1| and
+// |data2| that are both |len| bytes long, starting from the initial context
+// |ctx|.  |len| must be at least SHA256_BLOCK_SIZE.
+//
+// The instructions for the two SHA-256 operations are interleaved.  On many
+// CPUs, this is almost twice as fast as hashing each message individually due
+// to taking better advantage of the CPU's SHA-256 and SIMD throughput.
+//
+SYM_FUNC_START(sha256_ce_finup2x)
+	sub		sp, sp, #128
+	mov		final_step, #0
+	load_round_constants	x8
+
+	// Load the initial state from ctx->state.
+	ld1		{state0_a.4s-state1_a.4s}, [ctx]
+
+	// Load ctx->bytecount.  Take the mod 64 of it to get the number of
+	// bytes that are buffered in ctx->buf.  Also save it in a register with
+	// len added to it.
+	ldr		x8, [ctx, #OFFSETOF_BYTECOUNT]
+	add		count, x8, len, sxtw
+	and		x8, x8, #63
+	cbz		x8, .Lfinup2x_enter_loop	// No bytes buffered?
+
+	// x8 bytes (1 to 63) are currently buffered in ctx->buf.  Load them
+	// followed by the first 64 - x8 bytes of data.  Since len >= 64, we
+	// just load 64 bytes from each of ctx->buf, data1, and data2
+	// unconditionally and rearrange the data as needed.
+	add		x9, ctx, #OFFSETOF_BUF
+	ld1		{v16.16b-v19.16b}, [x9]
+	st1		{v16.16b-v19.16b}, [sp]
+
+	ld1		{v16.16b-v19.16b}, [data1], #64
+	add		x9, sp, x8
+	st1		{v16.16b-v19.16b}, [x9]
+	ld1		{v16.4s-v19.4s}, [sp]
+
+	ld1		{v20.16b-v23.16b}, [data2], #64
+	st1		{v20.16b-v23.16b}, [x9]
+	ld1		{v20.4s-v23.4s}, [sp]
+
+	sub		len, len, #64
+	sub		data1, data1, x8
+	sub		data2, data2, x8
+	add		len, len, w8
+	mov		state0_b.16b, state0_a.16b
+	mov		state1_b.16b, state1_a.16b
+	b		.Lfinup2x_loop_have_data
+
+.Lfinup2x_enter_loop:
+	sub		len, len, #64
+	mov		state0_b.16b, state0_a.16b
+	mov		state1_b.16b, state1_a.16b
+.Lfinup2x_loop:
+	// Load the next two data blocks.
+	ld1		{v16.4s-v19.4s}, [data1], #64
+	ld1		{v20.4s-v23.4s}, [data2], #64
+.Lfinup2x_loop_have_data:
+	// Convert the words of the data blocks from big endian.
+CPU_LE(	rev32		v16.16b, v16.16b	)
+CPU_LE(	rev32		v17.16b, v17.16b	)
+CPU_LE(	rev32		v18.16b, v18.16b	)
+CPU_LE(	rev32		v19.16b, v19.16b	)
+CPU_LE(	rev32		v20.16b, v20.16b	)
+CPU_LE(	rev32		v21.16b, v21.16b	)
+CPU_LE(	rev32		v22.16b, v22.16b	)
+CPU_LE(	rev32		v23.16b, v23.16b	)
+.Lfinup2x_loop_have_bswapped_data:
+
+	// Save the original state for each block.
+	st1		{state0_a.4s-state1_b.4s}, [sp]
+
+	// Do the SHA-256 rounds on each block.
+	do_16rounds_2x	0,  v0,  v1,  v2,  v3
+	do_16rounds_2x	16, v4,  v5,  v6,  v7
+	do_16rounds_2x	32, v8,  v9,  v10, v11
+	do_16rounds_2x	48, v12, v13, v14, v15
+
+	// Add the original state for each block.
+	ld1		{v16.4s-v19.4s}, [sp]
+	add		state0_a.4s, state0_a.4s, v16.4s
+	add		state1_a.4s, state1_a.4s, v17.4s
+	add		state0_b.4s, state0_b.4s, v18.4s
+	add		state1_b.4s, state1_b.4s, v19.4s
+
+	// Update len and loop back if more blocks remain.
+	sub		len, len, #64
+	tbz		len, #31, .Lfinup2x_loop	// len >= 0?
+
+	// Check if any final blocks need to be handled.
+	// final_step = 2: all done
+	// final_step = 1: need to do count-only padding block
+	// final_step = 0: need to do the block with 0x80 padding byte
+	tbnz		final_step, #1, .Lfinup2x_done
+	tbnz		final_step, #0, .Lfinup2x_finalize_countonly
+	add		len, len, #64
+	cbz		len, .Lfinup2x_finalize_blockaligned
+
+	// Not block-aligned; 1 <= len <= 63 data bytes remain.  Pad the block.
+	// To do this, write the padding starting with the 0x80 byte to
+	// &sp[64].  Then for each message, copy the last 64 data bytes to sp
+	// and load from &sp[64 - len] to get the needed padding block.  This
+	// code relies on the data buffers being >= 64 bytes in length.
+	sub		w8, len, #64		// w8 = len - 64
+	add		data1, data1, w8, sxtw	// data1 += len - 64
+	add		data2, data2, w8, sxtw	// data2 += len - 64
+CPU_LE(	mov		x9, #0x80		)
+CPU_LE(	fmov		d16, x9			)
+CPU_BE(	movi		v16.16b, #0		)
+CPU_BE(	mov		x9, #0x8000000000000000	)
+CPU_BE(	mov		v16.d[1], x9		)
+	movi		v17.16b, #0
+	stp		q16, q17, [sp, #64]
+	stp		q17, q17, [sp, #96]
+	sub		x9, sp, w8, sxtw	// x9 = &sp[64 - len]
+	cmp		len, #56
+	b.ge		1f	// will count spill into its own block?
+	lsl		count, count, #3
+CPU_LE(	rev		count, count	)
+	str		count, [x9, #56]
+	mov		final_step, #2	// won't need count-only block
+	b		2f
+1:
+	mov		final_step, #1	// will need count-only block
+2:
+	ld1		{v16.16b-v19.16b}, [data1]
+	st1		{v16.16b-v19.16b}, [sp]
+	ld1		{v16.4s-v19.4s}, [x9]
+	ld1		{v20.16b-v23.16b}, [data2]
+	st1		{v20.16b-v23.16b}, [sp]
+	ld1		{v20.4s-v23.4s}, [x9]
+	b		.Lfinup2x_loop_have_data
+
+	// Prepare a padding block, either:
+	//
+	//	{0x80, 0, 0, 0, ..., count (as __be64)}
+	//	This is for a block aligned message.
+	//
+	//	{   0, 0, 0, 0, ..., count (as __be64)}
+	//	This is for a message whose length mod 64 is >= 56.
+	//
+	// Pre-swap the endianness of the words.
+.Lfinup2x_finalize_countonly:
+	movi		v16.2d, #0
+	b		1f
+.Lfinup2x_finalize_blockaligned:
+	mov		x8, #0x80000000
+	fmov		d16, x8
+1:
+	movi		v17.2d, #0
+	movi		v18.2d, #0
+	ror		count, count, #29	// ror(lsl(count, 3), 32)
+	mov		v19.d[0], xzr
+	mov		v19.d[1], count
+	mov		v20.16b, v16.16b
+	movi		v21.2d, #0
+	movi		v22.2d, #0
+	mov		v23.16b, v19.16b
+	mov		final_step, #2
+	b		.Lfinup2x_loop_have_bswapped_data
+
+.Lfinup2x_done:
+	// Write the two digests with all bytes in the correct order.
+CPU_LE(	rev32		state0_a.16b, state0_a.16b	)
+CPU_LE(	rev32		state1_a.16b, state1_a.16b	)
+CPU_LE(	rev32		state0_b.16b, state0_b.16b	)
+CPU_LE(	rev32		state1_b.16b, state1_b.16b	)
+	st1		{state0_a.4s-state1_a.4s}, [out1]
+	st1		{state0_b.4s-state1_b.4s}, [out2]
+	add		sp, sp, #128
+	ret
+SYM_FUNC_END(sha256_ce_finup2x)
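As an aside, the two padding cases tracked by final_step above can be
summarized in a short userspace C sketch (hypothetical helper, not kernel
code): SHA-256 appends a 0x80 byte after the data and stores the message bit
length as a big-endian 64-bit value in the last 8 bytes of the final block;
when fewer than 8 bytes are free after the 0x80 byte (len mod 64 >= 56, as in
the "b.ge 1f" check), the count spills into an extra count-only block.

	#include <stdint.h>
	#include <string.h>

	/*
	 * Sketch of SHA-256 final-block padding.  'rem' (0..63) bytes of
	 * leftover data are already in block[0]; returns how many 64-byte
	 * blocks (1 or 2) remain to be compressed.
	 */
	static size_t sha256_pad(uint8_t block[2][64], size_t rem,
				 uint64_t total_len)
	{
		uint64_t bitcount = total_len * 8;
		size_t nblocks = (rem >= 56) ? 2 : 1;	/* count spills over? */
		int i;

		memset(&block[0][rem], 0, 64 - rem);
		memset(block[1], 0, 64);
		block[0][rem] = 0x80;
		for (i = 0; i < 8; i++)		/* store bit count as __be64 */
			block[nblocks - 1][56 + i] = bitcount >> (56 - 8 * i);
		return nblocks;
	}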

lib/crypto/arm64/sha256.h

Lines changed: 37 additions & 0 deletions
@@ -44,6 +44,43 @@ static void sha256_blocks(struct sha256_block_state *state,
 	}
 }
 
+static_assert(offsetof(struct __sha256_ctx, state) == 0);
+static_assert(offsetof(struct __sha256_ctx, bytecount) == 32);
+static_assert(offsetof(struct __sha256_ctx, buf) == 40);
+asmlinkage void sha256_ce_finup2x(const struct __sha256_ctx *ctx,
+				  const u8 *data1, const u8 *data2, int len,
+				  u8 out1[SHA256_DIGEST_SIZE],
+				  u8 out2[SHA256_DIGEST_SIZE]);
+
+#define sha256_finup_2x_arch sha256_finup_2x_arch
+static bool sha256_finup_2x_arch(const struct __sha256_ctx *ctx,
+				 const u8 *data1, const u8 *data2, size_t len,
+				 u8 out1[SHA256_DIGEST_SIZE],
+				 u8 out2[SHA256_DIGEST_SIZE])
+{
+	/*
+	 * The assembly requires len >= SHA256_BLOCK_SIZE && len <= INT_MAX.
+	 * Further limit len to 65536 to avoid spending too long with preemption
+	 * disabled.  (Of course, in practice len is nearly always 4096 anyway.)
+	 */
+	if (IS_ENABLED(CONFIG_KERNEL_MODE_NEON) &&
+	    static_branch_likely(&have_ce) && len >= SHA256_BLOCK_SIZE &&
+	    len <= 65536 && likely(may_use_simd())) {
+		kernel_neon_begin();
+		sha256_ce_finup2x(ctx, data1, data2, len, out1, out2);
+		kernel_neon_end();
+		kmsan_unpoison_memory(out1, SHA256_DIGEST_SIZE);
+		kmsan_unpoison_memory(out2, SHA256_DIGEST_SIZE);
+		return true;
+	}
+	return false;
+}
+
+static bool sha256_finup_2x_is_optimized_arch(void)
+{
+	return static_key_enabled(&have_ce);
+}
+
 #ifdef CONFIG_KERNEL_MODE_NEON
 #define sha256_mod_init_arch sha256_mod_init_arch
 static inline void sha256_mod_init_arch(void)
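A natural sanity check for sha256_finup_2x_arch() is comparing the
interleaved output against two independently computed digests.  A hedged
sketch, assuming the generic sha256_finup_2x() wrapper (which dispatches to
the function above) and the one-shot sha256() helper from <crypto/sha2.h>;
check_finup_2x() itself is hypothetical test code:

	#include <crypto/sha2.h>
	#include <linux/string.h>

	/* Returns 0 iff the interleaved path matches two independent hashes. */
	static int check_finup_2x(const u8 *m1, const u8 *m2, size_t len)
	{
		u8 d1[SHA256_DIGEST_SIZE], d2[SHA256_DIGEST_SIZE];
		u8 ref1[SHA256_DIGEST_SIZE], ref2[SHA256_DIGEST_SIZE];
		struct sha256_ctx ctx;

		sha256_init(&ctx);
		sha256_finup_2x(&ctx, m1, m2, len, d1, d2);

		sha256(m1, len, ref1);		/* reference digests */
		sha256(m2, len, ref2);
		return memcmp(d1, ref1, SHA256_DIGEST_SIZE) ?:
		       memcmp(d2, ref2, SHA256_DIGEST_SIZE);
	}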
