|
84 | 84 |
|
/*
 * lookup_32bit(src, dst, op1, op2, op3, interleave_op, il_reg)
 *
 * Combines four table lookups, one per byte of the low 32 bits of src,
 * into dst ## d:
 *   - src ## bh and src ## bl index tables s1 and s2,
 *   - src is then shifted right by 16 (src is modified),
 *   - src ## bh and src ## bl index tables s3 and s4.
 * Every lookup uses a scale of 4, i.e. 4-byte table entries.  The first
 * value is loaded with movl; the following three are merged into dst ## d
 * with op1, op2 and op3 respectively.  interleave_op(il_reg) is expanded
 * between the lookups so the caller can slot unrelated work in.
 *
 * Clobbers RID1 and RID2 (and src, see above).
 *
 * Reading this diff: rows marked '-' are the old form, which addressed the
 * tables as absolute symbols with an index register (sN(, RIDx, 4)).  Rows
 * marked '+' are the new form: the table base is first loaded
 * RIP-relatively with leaq sN(%rip) and then indexed as (base,index,4),
 * because RIP-relative addressing cannot be combined with an index
 * register.  In the new form RID1 and RID2 alternate between the "index"
 * and "base" roles; a register is only overwritten with the next byte
 * index after the lookup that used it as a base has been issued.
 *
 * NOTE(review): s1..s4 are presumably the cipher S-boxes - their
 * definitions are not visible in this hunk.
 */
85 | 85 | #define lookup_32bit(src, dst, op1, op2, op3, interleave_op, il_reg) \ |
86 | 86 | movzbl src ## bh, RID1d; \ |
| 87 | + leaq s1(%rip), RID2; \ |
| 88 | + movl (RID2,RID1,4), dst ## d; \ |
87 | 89 | movzbl src ## bl, RID2d; \ |
| 90 | + leaq s2(%rip), RID1; \ |
| 91 | + op1 (RID1,RID2,4), dst ## d; \ |
88 | 92 | shrq $16, src; \ |
89 | | - movl s1(, RID1, 4), dst ## d; \ |
90 | | - op1 s2(, RID2, 4), dst ## d; \ |
91 | 93 | movzbl src ## bh, RID1d; \ |
| 94 | + leaq s3(%rip), RID2; \ |
| 95 | + op2 (RID2,RID1,4), dst ## d; \ |
92 | 96 | movzbl src ## bl, RID2d; \ |
93 | 97 | interleave_op(il_reg); \ |
94 | | - op2 s3(, RID1, 4), dst ## d; \ |
95 | | - op3 s4(, RID2, 4), dst ## d; |
| 98 | + leaq s4(%rip), RID1; \ |
| 99 | + op3 (RID1,RID2,4), dst ## d; |
96 | 100 |
|
/*
 * dummy(d): one-argument macro that expands to nothing.  It has the same
 * call shape as the interleave_op(il_reg) hook expanded inside
 * lookup_32bit - presumably passed there when no interleaved work is
 * wanted; the call sites are not visible in this hunk.
 */
97 | 101 | #define dummy(d) /* do nothing */ |
98 | 102 |
|
|
151 | 155 | subround(l ## 3, r ## 3, l ## 4, r ## 4, f); |
152 | 156 |
|
/*
 * enc_preload_rkr(): initialise RKR.
 *
 * Broadcasts the 32-bit value stored at .L16_mask into RKR, then XORs RKR
 * with the data at kr(CTX).  Takes no arguments; reads CTX and writes RKR.
 *
 * Reading this diff: the '-' row loaded .L16_mask through an absolute
 * address, the '+' row loads it RIP-relatively (.L16_mask(%rip)).  The
 * kr(CTX) operand is register-relative and is unchanged.
 */
153 | 157 | #define enc_preload_rkr() \ |
154 | | - vbroadcastss .L16_mask, RKR; \ |
| 158 | + vbroadcastss .L16_mask(%rip), RKR; \ |
155 | 159 | /* add 16-bit rotation to key rotations (mod 32) */ \ |
156 | 160 | vpxor kr(CTX), RKR, RKR; |
157 | 161 |
|
/*
 * dec_preload_rkr(): initialise RKR for the decryption path.
 *
 * Same two steps as enc_preload_rkr() - broadcast the 32-bit value at
 * .L16_mask into RKR and XOR it with the data at kr(CTX) - followed by a
 * byte shuffle of RKR using the control mask stored at .Lbswap128_mask.
 * Takes no arguments; reads CTX and writes RKR.
 *
 * NOTE(review): going by its name, .Lbswap128_mask presumably reverses the
 * 16 bytes of RKR so the values are consumed in the opposite order when
 * decrypting - the mask contents are not visible in this hunk; confirm.
 *
 * Reading this diff: the '-' rows referenced .L16_mask and
 * .Lbswap128_mask through absolute addresses, the '+' rows reference them
 * RIP-relatively.
 */
158 | 162 | #define dec_preload_rkr() \ |
159 | | - vbroadcastss .L16_mask, RKR; \ |
| 163 | + vbroadcastss .L16_mask(%rip), RKR; \ |
160 | 164 | /* add 16-bit rotation to key rotations (mod 32) */ \ |
161 | 165 | vpxor kr(CTX), RKR, RKR; \ |
162 | | - vpshufb .Lbswap128_mask, RKR, RKR; |
| 166 | + vpshufb .Lbswap128_mask(%rip), RKR, RKR; |
163 | 167 |
|
164 | 168 | #define transpose_2x4(x0, x1, t0, t1) \ |
165 | 169 | vpunpckldq x1, x0, t0; \ |
@@ -235,9 +239,9 @@ SYM_FUNC_START_LOCAL(__cast5_enc_blk16) |
235 | 239 |
|
236 | 240 | movq %rdi, CTX; |
237 | 241 |
|
238 | | - vmovdqa .Lbswap_mask, RKM; |
239 | | - vmovd .Lfirst_mask, R1ST; |
240 | | - vmovd .L32_mask, R32; |
| 242 | + vmovdqa .Lbswap_mask(%rip), RKM; |
| 243 | + vmovd .Lfirst_mask(%rip), R1ST; |
| 244 | + vmovd .L32_mask(%rip), R32; |
241 | 245 | enc_preload_rkr(); |
242 | 246 |
|
243 | 247 | inpack_blocks(RL1, RR1, RTMP, RX, RKM); |
@@ -271,7 +275,7 @@ SYM_FUNC_START_LOCAL(__cast5_enc_blk16) |
271 | 275 | popq %rbx; |
272 | 276 | popq %r15; |
273 | 277 |
|
274 | | - vmovdqa .Lbswap_mask, RKM; |
| 278 | + vmovdqa .Lbswap_mask(%rip), RKM; |
275 | 279 |
|
276 | 280 | outunpack_blocks(RR1, RL1, RTMP, RX, RKM); |
277 | 281 | outunpack_blocks(RR2, RL2, RTMP, RX, RKM); |
@@ -308,9 +312,9 @@ SYM_FUNC_START_LOCAL(__cast5_dec_blk16) |
308 | 312 |
|
309 | 313 | movq %rdi, CTX; |
310 | 314 |
|
311 | | - vmovdqa .Lbswap_mask, RKM; |
312 | | - vmovd .Lfirst_mask, R1ST; |
313 | | - vmovd .L32_mask, R32; |
| 315 | + vmovdqa .Lbswap_mask(%rip), RKM; |
| 316 | + vmovd .Lfirst_mask(%rip), R1ST; |
| 317 | + vmovd .L32_mask(%rip), R32; |
314 | 318 | dec_preload_rkr(); |
315 | 319 |
|
316 | 320 | inpack_blocks(RL1, RR1, RTMP, RX, RKM); |
@@ -341,7 +345,7 @@ SYM_FUNC_START_LOCAL(__cast5_dec_blk16) |
341 | 345 | round(RL, RR, 1, 2); |
342 | 346 | round(RR, RL, 0, 1); |
343 | 347 |
|
344 | | - vmovdqa .Lbswap_mask, RKM; |
| 348 | + vmovdqa .Lbswap_mask(%rip), RKM; |
345 | 349 | popq %rbx; |
346 | 350 | popq %r15; |
347 | 351 |
|
@@ -504,8 +508,8 @@ SYM_FUNC_START(cast5_ctr_16way) |
504 | 508 |
|
505 | 509 | vpcmpeqd RKR, RKR, RKR; |
506 | 510 | vpaddq RKR, RKR, RKR; /* low: -2, high: -2 */ |
507 | | - vmovdqa .Lbswap_iv_mask, R1ST; |
508 | | - vmovdqa .Lbswap128_mask, RKM; |
| 511 | + vmovdqa .Lbswap_iv_mask(%rip), R1ST; |
| 512 | + vmovdqa .Lbswap128_mask(%rip), RKM; |
509 | 513 |
|
510 | 514 | /* load IV and byteswap */ |
511 | 515 | vmovq (%rcx), RX; |
|
0 commit comments