Skip to content

Commit f88ed14

Browse files
author
Eric Biggers
committed
lib/crypto: x86/sha1-ni: Minor optimizations and cleanup
- Store the previous state in %xmm8-%xmm9 instead of spilling it to the stack. There are plenty of unused XMM registers here, so there is no reason to spill to the stack. (While 32-bit code is limited to %xmm0-%xmm7, this is 64-bit code, so it's free to use %xmm8-%xmm15.)
- Remove the unnecessary check for nblocks == 0. sha1_ni_transform() is always passed a positive nblocks.
- To get an XMM register with 'e' in the high dword and the rest zeroes, just zeroize the register using pxor, then load 'e'. Previously the code loaded 'e', then zeroized the lower dwords by AND-ing with a constant, which was slightly less efficient.
- Instead of computing &DATA_PTR[NUM_BLKS << 6] and stopping when DATA_PTR reaches that value, just decrement NUM_BLKS on each iteration and stop when it reaches 0. This is fewer instructions.
- Rename DIGEST_PTR to STATE_PTR. It points to the SHA-1 internal state, not a SHA-1 digest value.

This commit shrinks the code size of sha1_ni_transform() from 624 bytes to 589 bytes and also shrinks rodata by 16 bytes.

Reviewed-by: Ard Biesheuvel <ardb@kernel.org>
Link: https://lore.kernel.org/r/20250718191900.42877-2-ebiggers@kernel.org
Signed-off-by: Eric Biggers <ebiggers@kernel.org>
1 parent c76ed87 commit f88ed14

1 file changed

Lines changed: 25 additions & 43 deletions

File tree

lib/crypto/x86/sha1-ni-asm.S

Lines changed: 25 additions & 43 deletions
Original file line numberDiff line numberDiff line change
@@ -55,13 +55,10 @@
5555

5656
#include <linux/linkage.h>
5757

58-
#define DIGEST_PTR %rdi /* 1st arg */
58+
#define STATE_PTR %rdi /* 1st arg */
5959
#define DATA_PTR %rsi /* 2nd arg */
6060
#define NUM_BLKS %rdx /* 3rd arg */
6161

62-
/* gcc conversion */
63-
#define FRAME_SIZE 32 /* space for 2x16 bytes */
64-
6562
#define ABCD %xmm0
6663
#define E0 %xmm1 /* Need two E's b/c they ping pong */
6764
#define E1 %xmm2
@@ -70,15 +67,17 @@
7067
#define MSG2 %xmm5
7168
#define MSG3 %xmm6
7269
#define SHUF_MASK %xmm7
73-
70+
#define ABCD_SAVED %xmm8
71+
#define E0_SAVED %xmm9
7472

7573
/*
7674
* Intel SHA Extensions optimized implementation of a SHA-1 block function
7775
*
7876
* This function takes a pointer to the current SHA-1 state, a pointer to the
79-
* input data, and the number of 64-byte blocks to process. Once all blocks
80-
* have been processed, the state is updated with the new state. This function
81-
* only processes complete blocks. State initialization, buffering of partial
77+
* input data, and the number of 64-byte blocks to process. The number of
78+
* blocks to process is assumed to be nonzero. Once all blocks have been
79+
* processed, the state is updated with the new state. This function only
80+
* processes complete blocks. State initialization, buffering of partial
8281
* blocks, and digest finalization are expected to be handled elsewhere.
8382
*
8483
* The indented lines in the loop are instructions related to rounds processing.
@@ -89,27 +88,19 @@
8988
*/
9089
.text
9190
SYM_FUNC_START(sha1_ni_transform)
92-
push %rbp
93-
mov %rsp, %rbp
94-
sub $FRAME_SIZE, %rsp
95-
and $~0xF, %rsp
96-
97-
shl $6, NUM_BLKS /* convert to bytes */
98-
jz .Ldone_hash
99-
add DATA_PTR, NUM_BLKS /* pointer to end of data */
100-
101-
/* load initial hash values */
102-
pinsrd $3, 1*16(DIGEST_PTR), E0
103-
movdqu 0*16(DIGEST_PTR), ABCD
104-
pand UPPER_WORD_MASK(%rip), E0
91+
92+
/* Load the initial state from STATE_PTR. */
93+
pxor E0, E0
94+
pinsrd $3, 16(STATE_PTR), E0
95+
movdqu (STATE_PTR), ABCD
10596
pshufd $0x1B, ABCD, ABCD
10697

10798
movdqa PSHUFFLE_BYTE_FLIP_MASK(%rip), SHUF_MASK
10899

109-
.Lloop0:
110-
/* Save hash values for addition after rounds */
111-
movdqa E0, (0*16)(%rsp)
112-
movdqa ABCD, (1*16)(%rsp)
100+
.Lnext_block:
101+
/* Save the state for addition after the rounds. */
102+
movdqa E0, E0_SAVED
103+
movdqa ABCD, ABCD_SAVED
113104

114105
/* Rounds 0-3 */
115106
movdqu 0*16(DATA_PTR), MSG0
@@ -267,23 +258,19 @@ SYM_FUNC_START(sha1_ni_transform)
267258
movdqa ABCD, E0
268259
sha1rnds4 $3, E1, ABCD
269260

270-
/* Add current hash values with previously saved */
271-
sha1nexte (0*16)(%rsp), E0
272-
paddd (1*16)(%rsp), ABCD
261+
/* Add the previous state (before the rounds) to the current state. */
262+
sha1nexte E0_SAVED, E0
263+
paddd ABCD_SAVED, ABCD
273264

274-
/* Increment data pointer and loop if more to process */
265+
/* Advance to the next block, or break if there are no more blocks. */
275266
add $64, DATA_PTR
276-
cmp NUM_BLKS, DATA_PTR
277-
jne .Lloop0
267+
dec NUM_BLKS
268+
jnz .Lnext_block
278269

279-
/* Write hash values back in the correct order */
270+
/* Store the new state to STATE_PTR. */
271+
pextrd $3, E0, 16(STATE_PTR)
280272
pshufd $0x1B, ABCD, ABCD
281-
movdqu ABCD, 0*16(DIGEST_PTR)
282-
pextrd $3, E0, 1*16(DIGEST_PTR)
283-
284-
.Ldone_hash:
285-
mov %rbp, %rsp
286-
pop %rbp
273+
movdqu ABCD, (STATE_PTR)
287274

288275
RET
289276
SYM_FUNC_END(sha1_ni_transform)
@@ -292,8 +279,3 @@ SYM_FUNC_END(sha1_ni_transform)
292279
.align 16
293280
PSHUFFLE_BYTE_FLIP_MASK:
294281
.octa 0x000102030405060708090a0b0c0d0e0f
295-
296-
.section .rodata.cst16.UPPER_WORD_MASK, "aM", @progbits, 16
297-
.align 16
298-
UPPER_WORD_MASK:
299-
.octa 0xFFFFFFFF000000000000000000000000

0 commit comments

Comments
 (0)