 	.word		0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208
 	.word		0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2
 
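+	// Load all 64 SHA-256 round constant words into v0-v15.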
+	.macro		load_round_constants	tmp
+	adr_l		\tmp, .Lsha2_rcon
+	ld1		{ v0.4s- v3.4s}, [\tmp], #64
+	ld1		{ v4.4s- v7.4s}, [\tmp], #64
+	ld1		{ v8.4s-v11.4s}, [\tmp], #64
+	ld1		{v12.4s-v15.4s}, [\tmp]
+	.endm
+
 	/*
 	 * size_t __sha256_ce_transform(struct sha256_block_state *state,
 	 *				 const u8 *data, size_t nblocks);
 	 */
 	.text
 SYM_FUNC_START(__sha256_ce_transform)
-	/* load round constants */
-	adr_l		x8, .Lsha2_rcon
-	ld1		{ v0.4s- v3.4s}, [x8], #64
-	ld1		{ v4.4s- v7.4s}, [x8], #64
-	ld1		{ v8.4s-v11.4s}, [x8], #64
-	ld1		{v12.4s-v15.4s}, [x8]
+
+	load_round_constants	x8
 
 	/* load state */
 	ld1		{dgav.4s, dgbv.4s}, [x0]
@@ -134,3 +138,271 @@ CPU_LE( rev32		v19.16b, v19.16b	)
 	mov		x0, x2
 	ret
 SYM_FUNC_END(__sha256_ce_transform)
+
+	.unreq		dga
+	.unreq		dgav
+	.unreq		dgb
+	.unreq		dgbv
+	.unreq		t0
+	.unreq		t1
+	.unreq		dg0q
+	.unreq		dg0v
+	.unreq		dg1q
+	.unreq		dg1v
+	.unreq		dg2q
+	.unreq		dg2v
+
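+	// The aliases above were only needed by __sha256_ce_transform;
+	// releasing them lets the same registers be renamed below.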
+	// parameters for sha256_ce_finup2x()
+	ctx		.req	x0
+	data1		.req	x1
+	data2		.req	x2
+	len		.req	w3
+	out1		.req	x4
+	out2		.req	x5
+
+	// other scalar variables
+	count		.req	x6
+	final_step	.req	w7
+
+	// x8-x9 are used as temporaries.
+
+	// v0-v15 are used to cache the SHA-256 round constants.
+	// v16-v19 are used for the message schedule for the first message.
+	// v20-v23 are used for the message schedule for the second message.
+	// v24-v31 are used for the state and temporaries as given below.
+	// *_a are for the first message and *_b for the second.
+	state0_a_q	.req	q24
+	state0_a	.req	v24
+	state1_a_q	.req	q25
+	state1_a	.req	v25
+	state0_b_q	.req	q26
+	state0_b	.req	v26
+	state1_b_q	.req	q27
+	state1_b	.req	v27
+	t0_a		.req	v28
+	t0_b		.req	v29
+	t1_a_q		.req	q30
+	t1_a		.req	v30
+	t1_b_q		.req	q31
+	t1_b		.req	v31
+
+#define OFFSETOF_BYTECOUNT	32 // offsetof(struct __sha256_ctx, bytecount)
+#define OFFSETOF_BUF		40 // offsetof(struct __sha256_ctx, buf)
+// offsetof(struct __sha256_ctx, state) is assumed to be 0.
+
+	// Do 4 rounds of SHA-256 for each of two messages (interleaved). m0_a
+	// and m0_b contain the current 4 message schedule words for the first
+	// and second message respectively.
+	//
+	// If not all the message schedule words have been computed yet, then
+	// this also computes 4 more message schedule words for each message.
+	// m1_a-m3_a contain the next 3 groups of 4 message schedule words for
+	// the first message, and likewise m1_b-m3_b for the second. After
+	// consuming the current value of m0_a, this macro computes the group
+	// after m3_a and writes it to m0_a, and likewise for *_b. This means
+	// that the next (m0_a, m1_a, m2_a, m3_a) is the current (m1_a, m2_a,
+	// m3_a, m0_a), and likewise for *_b, so the caller must cycle through
+	// the registers accordingly.
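+	//
+	// For example, a caller whose current groups are (v16, v17, v18, v19)
+	// passes those, then passes (v17, v18, v19, v16) on the next call,
+	// exactly as do_16rounds_2x below does.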
+	.macro		do_4rounds_2x	i, k, m0_a, m1_a, m2_a, m3_a, \
+					      m0_b, m1_b, m2_b, m3_b
+	add		t0_a\().4s, \m0_a\().4s, \k\().4s
+	add		t0_b\().4s, \m0_b\().4s, \k\().4s
+	.if \i < 48
+	sha256su0	\m0_a\().4s, \m1_a\().4s
+	sha256su0	\m0_b\().4s, \m1_b\().4s
+	sha256su1	\m0_a\().4s, \m2_a\().4s, \m3_a\().4s
+	sha256su1	\m0_b\().4s, \m2_b\().4s, \m3_b\().4s
+	.endif
+	mov		t1_a.16b, state0_a.16b
+	mov		t1_b.16b, state0_b.16b
+	sha256h		state0_a_q, state1_a_q, t0_a\().4s
+	sha256h		state0_b_q, state1_b_q, t0_b\().4s
+	sha256h2	state1_a_q, t1_a_q, t0_a\().4s
+	sha256h2	state1_b_q, t1_b_q, t0_b\().4s
+	.endm
+
+	.macro		do_16rounds_2x	i, k0, k1, k2, k3
+	do_4rounds_2x	\i + 0,  \k0, v16, v17, v18, v19, v20, v21, v22, v23
+	do_4rounds_2x	\i + 4,  \k1, v17, v18, v19, v16, v21, v22, v23, v20
+	do_4rounds_2x	\i + 8,  \k2, v18, v19, v16, v17, v22, v23, v20, v21
+	do_4rounds_2x	\i + 12, \k3, v19, v16, v17, v18, v23, v20, v21, v22
+	.endm
+
+//
+// void sha256_ce_finup2x(const struct __sha256_ctx *ctx,
+//			  const u8 *data1, const u8 *data2, int len,
+//			  u8 out1[SHA256_DIGEST_SIZE],
+//			  u8 out2[SHA256_DIGEST_SIZE]);
+//
+// This function computes the SHA-256 digests of two messages |data1| and
+// |data2| that are both |len| bytes long, starting from the initial context
+// |ctx|. |len| must be at least SHA256_BLOCK_SIZE.
+//
+// The instructions for the two SHA-256 operations are interleaved. On many
+// CPUs, this is almost twice as fast as hashing each message individually,
+// since it takes better advantage of the CPU's SHA-256 and SIMD throughput.
+//
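+// A caller might use it roughly like this (a sketch only; it assumes, as
+// with the other CE routines, that the caller holds the NEON context, and
+// the C-side wrapper is not part of this change):
+//
+//	kernel_neon_begin();
+//	sha256_ce_finup2x(ctx, data1, data2, len, out1, out2);
+//	kernel_neon_end();
+//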
+SYM_FUNC_START(sha256_ce_finup2x)
+	sub		sp, sp, #128
+	mov		final_step, #0
+	load_round_constants	x8
+
+	// Load the initial state from ctx->state.
+	ld1		{state0_a.4s-state1_a.4s}, [ctx]
+
+	// Load ctx->bytecount. Take it mod 64 to get the number of bytes that
+	// are buffered in ctx->buf. Also save the total message length,
+	// bytecount + len, in 'count'; the padding blocks built below need it.
+	ldr		x8, [ctx, #OFFSETOF_BYTECOUNT]
+	add		count, x8, len, sxtw
+	and		x8, x8, #63
+	cbz		x8, .Lfinup2x_enter_loop	// No bytes buffered?
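+	// (E.g., bytecount = 100 and len = 200 give count = 300 and
+	// x8 = 100 % 64 = 36 buffered bytes.)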
+
+	// x8 bytes (1 to 63) are currently buffered in ctx->buf. Load them
+	// followed by the first 64 - x8 bytes of data. Since len >= 64, we
+	// just load 64 bytes from each of ctx->buf, data1, and data2
+	// unconditionally and rearrange the data as needed.
+	add		x9, ctx, #OFFSETOF_BUF
+	ld1		{v16.16b-v19.16b}, [x9]
+	st1		{v16.16b-v19.16b}, [sp]
+
+	ld1		{v16.16b-v19.16b}, [data1], #64
+	add		x9, sp, x8
+	st1		{v16.16b-v19.16b}, [x9]
+	ld1		{v16.4s-v19.4s}, [sp]
+
+	ld1		{v20.16b-v23.16b}, [data2], #64
+	st1		{v20.16b-v23.16b}, [x9]
+	ld1		{v20.4s-v23.4s}, [sp]
+
+	sub		len, len, #64
+	sub		data1, data1, x8
+	sub		data2, data2, x8
+	add		len, len, w8
+	mov		state0_b.16b, state0_a.16b
+	mov		state1_b.16b, state1_a.16b
+	b		.Lfinup2x_loop_have_data
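+	// (E.g., with x8 = 36: ctx->buf fills sp[0..63], the first 64 bytes
+	// of each message land at sp[36..99], and each first block is read
+	// back from sp[0..63], i.e. 36 buffered bytes plus 28 new ones. The
+	// data pointers and len then net-advance by 64 - 36 = 28 bytes.)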
+
+.Lfinup2x_enter_loop:
+	sub		len, len, #64
+	mov		state0_b.16b, state0_a.16b
+	mov		state1_b.16b, state1_a.16b
+.Lfinup2x_loop:
+	// Load the next two data blocks.
+	ld1		{v16.4s-v19.4s}, [data1], #64
+	ld1		{v20.4s-v23.4s}, [data2], #64
+.Lfinup2x_loop_have_data:
+	// Convert the words of the data blocks from big endian.
+CPU_LE(	rev32		v16.16b, v16.16b	)
+CPU_LE(	rev32		v17.16b, v17.16b	)
+CPU_LE(	rev32		v18.16b, v18.16b	)
+CPU_LE(	rev32		v19.16b, v19.16b	)
+CPU_LE(	rev32		v20.16b, v20.16b	)
+CPU_LE(	rev32		v21.16b, v21.16b	)
+CPU_LE(	rev32		v22.16b, v22.16b	)
+CPU_LE(	rev32		v23.16b, v23.16b	)
+.Lfinup2x_loop_have_bswapped_data:
+
+	// Save the original state for each block.
+	st1		{state0_a.4s-state1_b.4s}, [sp]
+
+	// Do the SHA-256 rounds on each block.
+	do_16rounds_2x	0,  v0,  v1,  v2,  v3
+	do_16rounds_2x	16, v4,  v5,  v6,  v7
+	do_16rounds_2x	32, v8,  v9, v10, v11
+	do_16rounds_2x	48, v12, v13, v14, v15
+
+	// Add the original state for each block.
+	ld1		{v16.4s-v19.4s}, [sp]
+	add		state0_a.4s, state0_a.4s, v16.4s
+	add		state1_a.4s, state1_a.4s, v17.4s
+	add		state0_b.4s, state0_b.4s, v18.4s
+	add		state1_b.4s, state1_b.4s, v19.4s
+
+	// Update len and loop back if more blocks remain.
+	sub		len, len, #64
+	tbz		len, #31, .Lfinup2x_loop	// len >= 0?
+
+	// Check if any final blocks need to be handled.
+	// final_step = 2: all done
+	// final_step = 1: need to do count-only padding block
+	// final_step = 0: need to do the block with 0x80 padding byte
+	tbnz		final_step, #1, .Lfinup2x_done
+	tbnz		final_step, #0, .Lfinup2x_finalize_countonly
+	add		len, len, #64
+	cbz		len, .Lfinup2x_finalize_blockaligned
+
+	// Not block-aligned; 1 <= len <= 63 data bytes remain. Pad the block.
+	// To do this, write the padding starting with the 0x80 byte to
+	// &sp[64]. Then for each message, copy the last 64 data bytes to sp
+	// and load from &sp[64 - len] to get the needed padding block. This
+	// code relies on the data buffers being >= 64 bytes in length.
+	sub		w8, len, #64		// w8 = len - 64
+	add		data1, data1, w8, sxtw	// data1 += len - 64
+	add		data2, data2, w8, sxtw	// data2 += len - 64
+CPU_LE(	mov		x9, #0x80		)
+CPU_LE(	fmov		d16, x9			)
+CPU_BE(	movi		v16.16b, #0		)
+CPU_BE(	mov		x9, #0x8000000000000000	)
+CPU_BE(	mov		v16.d[1], x9		)
+	movi		v17.16b, #0
+	stp		q16, q17, [sp, #64]
+	stp		q17, q17, [sp, #96]
+	sub		x9, sp, w8, sxtw	// x9 = &sp[64 - len]
+	cmp		len, #56
+	b.ge		1f	// will count spill into its own block?
+	lsl		count, count, #3
+CPU_LE(	rev		count, count		)
+	str		count, [x9, #56]
+	mov		final_step, #2	// won't need count-only block
+	b		2f
+1:
+	mov		final_step, #1	// will need count-only block
+2:
+	ld1		{v16.16b-v19.16b}, [data1]
+	st1		{v16.16b-v19.16b}, [sp]
+	ld1		{v16.4s-v19.4s}, [x9]
+	ld1		{v20.16b-v23.16b}, [data2]
+	st1		{v20.16b-v23.16b}, [sp]
+	ld1		{v20.4s-v23.4s}, [x9]
+	b		.Lfinup2x_loop_have_data
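+	// (E.g., with len = 30: data1 now points 64 bytes before the end of
+	// the message, those 64 bytes are copied to sp[0..63], and the final
+	// block is loaded from x9 = &sp[34]: the last 30 data bytes, then
+	// 0x80 and zeroes, plus the count since len < 56.)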
+
+	// Prepare a padding block, either:
+	//
+	//	{0x80, 0, 0, 0, ..., count (as __be64)}
+	//	This is for a block-aligned message.
+	//
+	//	{   0, 0, 0, 0, ..., count (as __be64)}
+	//	This is for a message whose length mod 64 is >= 56.
+	//
+	// Pre-swap the endianness of the words.
+.Lfinup2x_finalize_countonly:
+	movi		v16.2d, #0
+	b		1f
+.Lfinup2x_finalize_blockaligned:
+	mov		x8, #0x80000000
+	fmov		d16, x8
+1:
+	movi		v17.2d, #0
+	movi		v18.2d, #0
+	ror		count, count, #29	// ror(lsl(count, 3), 32)
+	mov		v19.d[0], xzr
+	mov		v19.d[1], count
+	mov		v20.16b, v16.16b
+	movi		v21.2d, #0
+	movi		v22.2d, #0
+	mov		v23.16b, v19.16b
+	mov		final_step, #2
+	b		.Lfinup2x_loop_have_bswapped_data
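+	// (The ror by 29 both multiplies the byte count by 8 and swaps the
+	// 32-bit halves of the result, which is the layout the no-byteswap
+	// loop entry expects for the __be64 length field. E.g., count = 0x90
+	// bytes gives a bit count of 0x480, and ror(0x90, 29) =
+	// 0x0000048000000000.)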
+
+.Lfinup2x_done:
+	// Write the two digests with all bytes in the correct order.
+CPU_LE(	rev32		state0_a.16b, state0_a.16b	)
+CPU_LE(	rev32		state1_a.16b, state1_a.16b	)
+CPU_LE(	rev32		state0_b.16b, state0_b.16b	)
+CPU_LE(	rev32		state1_b.16b, state1_b.16b	)
+	st1		{state0_a.4s-state1_a.4s}, [out1]
+	st1		{state0_b.4s-state1_b.4s}, [out2]
+	add		sp, sp, #128
+	ret
+SYM_FUNC_END(sha256_ce_finup2x)