Lines matching "+full:1 +full:- +full:v0"

1 /* Do not modify. This file is auto-generated from vpaes-armv8.pl. */
94 .size _vpaes_consts,.-_vpaes_consts
99 // Fills register %r10 -> .aes_consts (so you can -fPIC)
100 // and %xmm9-%xmm15 as specified below.
111 .size _vpaes_encrypt_preheat,.-_vpaes_encrypt_preheat
116 // AES-encrypt %xmm0.
120 // %xmm9-%xmm15 as in _vpaes_preheat
124 // Clobbers %xmm1-%xmm5, %r9, %r10, %r11, %rax
125 // Preserves %xmm6 - %xmm8 so you get some local vectors
137 ushr v0.16b, v7.16b, #4 // vpsrlb $4, %xmm0, %xmm0
140 tbl v2.16b, {v21.16b}, v0.16b // vpshufb %xmm0, %xmm3, %xmm2
141 eor v0.16b, v1.16b, v16.16b // vpxor %xmm5, %xmm1, %xmm0
142 eor v0.16b, v0.16b, v2.16b // vpxor %xmm2, %xmm0, %xmm0
150 ld1 {v1.2d}, [x11], #16 // vmovdqa -0x40(%r11,%r10), %xmm1 # .Lk_mc_forward[]
151 tbl v0.16b, {v24.16b}, v3.16b // vpshufb %xmm3, %xmm12, %xmm0 # 0 = sb1t
154 eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 0 = A
157 tbl v3.16b, {v0.16b}, v1.16b // vpshufb %xmm1, %xmm0, %xmm3 # 0 = B
159 tbl v0.16b, {v0.16b}, v4.16b // vpshufb %xmm4, %xmm0, %xmm0 # 3 = D
162 eor v0.16b, v0.16b, v3.16b // vpxor %xmm3, %xmm0, %xmm0 # 3 = 2A+B+D
163 and x11, x11, #~(1<<6) // and $0x30, %r11 # ... mod 4
164 eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 0 = 2A+3B+C+D
165 sub w8, w8, #1 // nr--
169 and v1.16b, v0.16b, v17.16b // vpand %xmm0, %xmm9, %xmm1 # 0 = k
170 ushr v0.16b, v0.16b, #4 // vpsrlb $4, %xmm0, %xmm0 # 1 = i
172 eor v1.16b, v1.16b, v0.16b // vpxor %xmm0, %xmm1, %xmm1 # 0 = j
173 tbl v3.16b, {v18.16b}, v0.16b // vpshufb %xmm0, %xmm10, %xmm3 # 3 = 1/i
174 tbl v4.16b, {v18.16b}, v1.16b // vpshufb %xmm1, %xmm10, %xmm4 # 4 = 1/j
175 eor v3.16b, v3.16b, v5.16b // vpxor %xmm5, %xmm3, %xmm3 # 3 = iak = 1/i + a/k
176 eor v4.16b, v4.16b, v5.16b // vpxor %xmm5, %xmm4, %xmm4 # 4 = jak = 1/j + a/k
177 tbl v2.16b, {v18.16b}, v3.16b // vpshufb %xmm3, %xmm10, %xmm2 # 2 = 1/iak
178 tbl v3.16b, {v18.16b}, v4.16b // vpshufb %xmm4, %xmm10, %xmm3 # 3 = 1/jak
180 eor v3.16b, v3.16b, v0.16b // vpxor %xmm0, %xmm3, %xmm3 # 3 = jo
186 // vmovdqa -0x60(%r10), %xmm4 # 3 : sbou .Lk_sbo
187 // vmovdqa -0x50(%r10), %xmm0 # 0 : sbot .Lk_sbo+16
190 tbl v0.16b, {v23.16b}, v3.16b // vpshufb %xmm3, %xmm0, %xmm0 # 0 = sb1t
192 eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 0 = A
193 tbl v0.16b, {v0.16b}, v1.16b // vpshufb %xmm1, %xmm0, %xmm0
195 .size _vpaes_encrypt_core,.-_vpaes_encrypt_core
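The ".Lenc_entry" comments above (k, i, j, 1/i, 1/j, iak, jak, jo) describe the nibble pipeline that vector-permute AES uses instead of a byte-wide S-box table: each byte is split into nibbles and pushed through 16-entry lookups (tbl here, pshufb in the x86 comments). A scalar sketch of that pipeline follows, with zero-filled placeholder tables standing in for the preheated inversion constants (the a/k lookup and the final io XOR are not among the matched lines, so they are reconstructed from the surrounding comments):

    #include <stdint.h>

    /* placeholders: the real 4-bit table values live in the .Lk_inv constants */
    static const uint8_t tab_inv[16]  = {0};  /* used for the 1/x-style lookups (v18 above) */
    static const uint8_t tab_inva[16] = {0};  /* used for the a/k lookup (not in this listing) */

    static void enc_entry_sketch(uint8_t x, uint8_t *io, uint8_t *jo)
    {
        uint8_t k   = x & 0x0F;            /* and  v1, v0, v17   # 0 = k */
        uint8_t i   = x >> 4;              /* ushr v0, v0, #4    # 1 = i */
        uint8_t a_k = tab_inva[k];         /*                    # 2 = a/k */
        uint8_t j   = k ^ i;               /* eor  v1, v1, v0    # 0 = j */
        uint8_t iak = tab_inv[i] ^ a_k;    /* tbl + eor          # 3 = iak = 1/i + a/k */
        uint8_t jak = tab_inv[j] ^ a_k;    /* tbl + eor          # 4 = jak = 1/j + a/k */
        /* the real table entries are 4-bit, so the XORed indices stay below 16;
           masked here anyway to keep the scalar sketch obviously in range */
        *io = tab_inv[iak & 0x0F] ^ j;     /*                    # 2 = io */
        *jo = tab_inv[jak & 0x0F] ^ i;     /* tbl + eor          # 3 = jo */
    }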
202 stp x29,x30,[sp,#-16]!
208 st1 {v0.16b}, [x1]
213 .size vpaes_encrypt,.-vpaes_encrypt
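For context, the exported vpaes_encrypt above is the one-block entry point. The prototypes below mirror the standard AES_* API and the argument order implied by the assembly (input pointer, output pointer, key pointer; key setup takes user key, bit count, AES_KEY); treat the exact declarations as an assumption inferred from the x86_64 counterpart rather than something shown in this listing.

    #include <openssl/aes.h>

    /* assumed prototypes; see note above */
    int  vpaes_set_encrypt_key(const unsigned char *userKey, int bits, AES_KEY *key);
    void vpaes_encrypt(const unsigned char *in, unsigned char *out, const AES_KEY *key);

    static void encrypt_one_block(const unsigned char user_key[16],
                                  const unsigned char in[16], unsigned char out[16])
    {
        AES_KEY key;
        vpaes_set_encrypt_key(user_key, 128, &key);  /* fills the schedule, stores rounds at offset 240 */
        vpaes_encrypt(in, out, &key);                /* encrypts one 16-byte block */
    }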
224 ushr v0.16b, v14.16b, #4 // vpsrlb $4, %xmm0, %xmm0
230 tbl v2.16b, {v21.16b}, v0.16b // vpshufb %xmm0, %xmm3, %xmm2
232 eor v0.16b, v1.16b, v16.16b // vpxor %xmm5, %xmm1, %xmm0
234 eor v0.16b, v0.16b, v2.16b // vpxor %xmm2, %xmm0, %xmm0
244 ld1 {v1.2d}, [x11], #16 // vmovdqa -0x40(%r11,%r10), %xmm1 # .Lk_mc_forward[]
245 tbl v0.16b, {v24.16b}, v3.16b // vpshufb %xmm3, %xmm12, %xmm0 # 0 = sb1t
251 eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 0 = A
256 tbl v3.16b, {v0.16b}, v1.16b // vpshufb %xmm1, %xmm0, %xmm3 # 0 = B
260 tbl v0.16b, {v0.16b}, v4.16b // vpshufb %xmm4, %xmm0, %xmm0 # 3 = D
266 eor v0.16b, v0.16b, v3.16b // vpxor %xmm3, %xmm0, %xmm0 # 3 = 2A+B+D
268 and x11, x11, #~(1<<6) // and $0x30, %r11 # ... mod 4
269 eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 0 = 2A+3B+C+D
271 sub w8, w8, #1 // nr--
275 and v1.16b, v0.16b, v17.16b // vpand %xmm0, %xmm9, %xmm1 # 0 = k
276 ushr v0.16b, v0.16b, #4 // vpsrlb $4, %xmm0, %xmm0 # 1 = i
281 eor v1.16b, v1.16b, v0.16b // vpxor %xmm0, %xmm1, %xmm1 # 0 = j
283 tbl v3.16b, {v18.16b},v0.16b // vpshufb %xmm0, %xmm10, %xmm3 # 3 = 1/i
285 tbl v4.16b, {v18.16b},v1.16b // vpshufb %xmm1, %xmm10, %xmm4 # 4 = 1/j
287 eor v3.16b, v3.16b, v5.16b // vpxor %xmm5, %xmm3, %xmm3 # 3 = iak = 1/i + a/k
289 eor v4.16b, v4.16b, v5.16b // vpxor %xmm5, %xmm4, %xmm4 # 4 = jak = 1/j + a/k
291 tbl v2.16b, {v18.16b},v3.16b // vpshufb %xmm3, %xmm10, %xmm2 # 2 = 1/iak
293 tbl v3.16b, {v18.16b},v4.16b // vpshufb %xmm4, %xmm10, %xmm3 # 3 = 1/jak
297 eor v3.16b, v3.16b, v0.16b // vpxor %xmm0, %xmm3, %xmm3 # 3 = jo
304 // vmovdqa -0x60(%r10), %xmm4 # 3 : sbou .Lk_sbo
305 // vmovdqa -0x50(%r10), %xmm0 # 0 : sbot .Lk_sbo+16
309 tbl v0.16b, {v23.16b}, v3.16b // vpshufb %xmm3, %xmm0, %xmm0 # 0 = sb1t
313 eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 0 = A
315 tbl v0.16b, {v0.16b},v1.16b // vpshufb %xmm1, %xmm0, %xmm0
318 .size _vpaes_encrypt_2x,.-_vpaes_encrypt_2x
331 .size _vpaes_decrypt_preheat,.-_vpaes_decrypt_preheat
354 ushr v0.16b, v7.16b, #4 // vpsrlb $4, %xmm0, %xmm0
358 tbl v0.16b, {v21.16b}, v0.16b // vpshufb %xmm0, %xmm1, %xmm0
360 eor v0.16b, v0.16b, v2.16b // vpxor %xmm2, %xmm0, %xmm0
368 // vmovdqa -0x20(%r10),%xmm4 # 4 : sb9u
369 // vmovdqa -0x10(%r10),%xmm1 # 0 : sb9t
372 eor v0.16b, v4.16b, v16.16b // vpxor %xmm4, %xmm0, %xmm0
374 eor v0.16b, v0.16b, v1.16b // vpxor %xmm1, %xmm0, %xmm0 # 0 = ch
378 tbl v0.16b, {v0.16b}, v5.16b // vpshufb %xmm5, %xmm0, %xmm0 # MC ch
380 eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 4 = ch
382 eor v0.16b, v0.16b, v1.16b // vpxor %xmm1, %xmm0, %xmm0 # 0 = ch
386 tbl v0.16b, {v0.16b}, v5.16b // vpshufb %xmm5, %xmm0, %xmm0 # MC ch
388 eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 4 = ch
390 eor v0.16b, v0.16b, v1.16b // vpxor %xmm1, %xmm0, %xmm0 # 0 = ch
394 tbl v0.16b, {v0.16b}, v5.16b // vpshufb %xmm5, %xmm0, %xmm0 # MC ch
396 eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 4 = ch
398 eor v0.16b, v0.16b, v1.16b // vpxor %xmm1, %xmm0, %xmm0 # 0 = ch
399 sub w8, w8, #1 // sub $1,%rax # nr--
403 and v1.16b, v0.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1 # 0 = k
404 ushr v0.16b, v0.16b, #4 // vpsrlb $4, %xmm0, %xmm0 # 1 = i
406 eor v1.16b, v1.16b, v0.16b // vpxor %xmm0, %xmm1, %xmm1 # 0 = j
407 tbl v3.16b, {v18.16b}, v0.16b // vpshufb %xmm0, %xmm10, %xmm3 # 3 = 1/i
408 tbl v4.16b, {v18.16b}, v1.16b // vpshufb %xmm1, %xmm10, %xmm4 # 4 = 1/j
409 eor v3.16b, v3.16b, v2.16b // vpxor %xmm2, %xmm3, %xmm3 # 3 = iak = 1/i + a/k
410 eor v4.16b, v4.16b, v2.16b // vpxor %xmm2, %xmm4, %xmm4 # 4 = jak = 1/j + a/k
411 tbl v2.16b, {v18.16b}, v3.16b // vpshufb %xmm3, %xmm10, %xmm2 # 2 = 1/iak
412 tbl v3.16b, {v18.16b}, v4.16b // vpshufb %xmm4, %xmm10, %xmm3 # 3 = 1/jak
414 eor v3.16b, v3.16b, v0.16b // vpxor %xmm0, %xmm3, %xmm3 # 3 = jo
422 ld1 {v2.2d}, [x11] // vmovdqa -0x160(%r11), %xmm2 # .Lk_sr-.Lk_dsbd=-0x160
425 eor v0.16b, v1.16b, v4.16b // vpxor %xmm4, %xmm1, %xmm0 # 0 = A
426 tbl v0.16b, {v0.16b}, v2.16b // vpshufb %xmm2, %xmm0, %xmm0
428 .size _vpaes_decrypt_core,.-_vpaes_decrypt_core
435 stp x29,x30,[sp,#-16]!
441 st1 {v0.16b}, [x1]
446 .size vpaes_decrypt,.-vpaes_decrypt
448 // v14-v15 input, v0-v1 output
465 ushr v0.16b, v14.16b, #4 // vpsrlb $4, %xmm0, %xmm0
472 tbl v0.16b, {v21.16b},v0.16b // vpshufb %xmm0, %xmm1, %xmm0
476 eor v0.16b, v0.16b, v2.16b // vpxor %xmm2, %xmm0, %xmm0
485 // vmovdqa -0x20(%r10),%xmm4 # 4 : sb9u
486 // vmovdqa -0x10(%r10),%xmm1 # 0 : sb9t
491 eor v0.16b, v4.16b, v16.16b // vpxor %xmm4, %xmm0, %xmm0
494 eor v0.16b, v0.16b, v1.16b // vpxor %xmm1, %xmm0, %xmm0 # 0 = ch
500 tbl v0.16b, {v0.16b},v5.16b // vpshufb %xmm5, %xmm0, %xmm0 # MC ch
504 eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 4 = ch
507 eor v0.16b, v0.16b, v1.16b // vpxor %xmm1, %xmm0, %xmm0 # 0 = ch
513 tbl v0.16b, {v0.16b},v5.16b // vpshufb %xmm5, %xmm0, %xmm0 # MC ch
517 eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 4 = ch
520 eor v0.16b, v0.16b, v1.16b // vpxor %xmm1, %xmm0, %xmm0 # 0 = ch
526 tbl v0.16b, {v0.16b},v5.16b // vpshufb %xmm5, %xmm0, %xmm0 # MC ch
530 eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 4 = ch
533 eor v0.16b, v0.16b, v1.16b // vpxor %xmm1, %xmm0, %xmm0 # 0 = ch
535 sub w8, w8, #1 // sub $1,%rax # nr--
539 and v1.16b, v0.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1 # 0 = k
540 ushr v0.16b, v0.16b, #4 // vpsrlb $4, %xmm0, %xmm0 # 1 = i
545 eor v1.16b, v1.16b, v0.16b // vpxor %xmm0, %xmm1, %xmm1 # 0 = j
547 tbl v3.16b, {v18.16b},v0.16b // vpshufb %xmm0, %xmm10, %xmm3 # 3 = 1/i
549 tbl v4.16b, {v18.16b},v1.16b // vpshufb %xmm1, %xmm10, %xmm4 # 4 = 1/j
551 eor v3.16b, v3.16b, v2.16b // vpxor %xmm2, %xmm3, %xmm3 # 3 = iak = 1/i + a/k
553 eor v4.16b, v4.16b, v2.16b // vpxor %xmm2, %xmm4, %xmm4 # 4 = jak = 1/j + a/k
555 tbl v2.16b, {v18.16b},v3.16b // vpshufb %xmm3, %xmm10, %xmm2 # 2 = 1/iak
557 tbl v3.16b, {v18.16b},v4.16b // vpshufb %xmm4, %xmm10, %xmm3 # 3 = 1/jak
561 eor v3.16b, v3.16b, v0.16b // vpxor %xmm0, %xmm3, %xmm3 # 3 = jo
573 ld1 {v2.2d}, [x11] // vmovdqa -0x160(%r11), %xmm2 # .Lk_sr-.Lk_dsbd=-0x160
576 eor v0.16b, v1.16b, v4.16b // vpxor %xmm4, %xmm1, %xmm0 # 0 = A
578 tbl v0.16b, {v0.16b},v2.16b // vpshufb %xmm2, %xmm0, %xmm0
581 .size _vpaes_decrypt_2x,.-_vpaes_decrypt_2x
603 .size _vpaes_key_preheat,.-_vpaes_key_preheat
609 stp x29, x30, [sp,#-16]!
614 ld1 {v0.16b}, [x0],#16 // vmovdqu (%rdi), %xmm0 # load key (unaligned)
617 mov v3.16b, v0.16b // vmovdqa %xmm0, %xmm3
619 mov v7.16b, v0.16b // vmovdqa %xmm0, %xmm7
626 st1 {v0.2d}, [x2] // vmovdqu %xmm0, (%rdx)
645 // 128-bit specific part of key schedule.
654 sub x0, x0, #1 // dec %esi
663 // 192-bit specific part of key schedule.
665 // The main body of this schedule is the same as the 128-bit
678 ld1 {v0.16b}, [x0] // vmovdqu 8(%rdi),%xmm0 # load key part 2 (very unaligned)
680 mov v6.16b, v0.16b // vmovdqa %xmm0, %xmm6 # save short part
686 sub x0, x0, #1 // dec %esi
688 ext v0.16b, v6.16b, v0.16b, #8 // vpalignr $8,%xmm6,%xmm0,%xmm0
691 bl _vpaes_schedule_mangle // save key n+1
701 // 256-bit specific part of key schedule.
703 // The structure here is very similar to the 128-bit
710 ld1 {v0.16b}, [x0] // vmovdqu 16(%rdi),%xmm0 # load key part 2 (unaligned)
715 sub x0, x0, #1 // dec %esi
717 mov v6.16b, v0.16b // vmovdqa %xmm0, %xmm6 # save cur_lo in xmm6
725 dup v0.4s, v0.s[3] // vpshufd $0xFF, %xmm0, %xmm0
754 tbl v0.16b, {v0.16b}, v1.16b // vpshufb %xmm1, %xmm0, %xmm0 # output permute
758 sub x2, x2, #16 // add $-16, %rdx
759 eor v0.16b, v0.16b, v16.16b // vpxor .Lk_s63(%rip), %xmm0, %xmm0
761 st1 {v0.2d}, [x2] // vmovdqu %xmm0, (%rdx) # save last key
764 eor v0.16b, v0.16b, v0.16b // vpxor %xmm0, %xmm0, %xmm0
775 .size _vpaes_schedule_core,.-_vpaes_schedule_core
780 // Smear the short, low side in the 192-bit key schedule.
795 dup v0.4s, v7.s[3]
796 ins v1.s[3], v6.s[2] // vpshufd $0x80, %xmm6, %xmm1 # d c 0 0 -> c 0 0 0
797 ins v0.s[0], v7.s[2] // vpshufd $0xFE, %xmm7, %xmm0 # b a _ _ -> b b b a
798 eor v6.16b, v6.16b, v1.16b // vpxor %xmm1, %xmm6, %xmm6 # -> c+d c 0 0
800 eor v6.16b, v6.16b, v0.16b // vpxor %xmm0, %xmm6, %xmm6 # -> b+c+d b+c b a
801 mov v0.16b, v6.16b // vmovdqa %xmm6, %xmm0
804 .size _vpaes_schedule_192_smear,.-_vpaes_schedule_192_smear
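The smear comments above track four 32-bit words through two XORs ("d c 0 0" becomes "c+d c 0 0" and then "b+c+d b+c b a"). A scalar restatement of that data flow, using the comments' own labels and left-to-right word order (this follows the x86 comments' notation, not a claim about the exact NEON lane indices used by the dup/ins forms above):

    #include <stdint.h>

    /* lo = {d, c, 0, 0}: the fresh low side; hi = {b, a, _, _}: words carried
     * over from the previous key. Word order follows the comments above. */
    static void smear192_sketch(uint32_t lo[4], const uint32_t hi[4])
    {
        uint32_t d = lo[0], c = lo[1];
        uint32_t b = hi[0], a = hi[1];

        lo[0] = d ^ c;          /* xor with {c, 0, 0, 0}  -> "c+d c 0 0"     */

        lo[0] ^= b;             /* xor with {b, b, b, a}  -> "b+c+d b+c b a" */
        lo[1]  = c ^ b;
        lo[2]  = b;
        lo[3]  = a;
    }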
822 // Clobbers %xmm1-%xmm4, %r11.
834 dup v0.4s, v0.s[3] // vpshufd $0xFF, %xmm0, %xmm0
835 ext v0.16b, v0.16b, v0.16b, #1 // vpalignr $1, %xmm0, %xmm0, %xmm0
847 and v1.16b, v0.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1 # 0 = k
848 ushr v0.16b, v0.16b, #4 // vpsrlb $4, %xmm0, %xmm0 # 1 = i
851 eor v1.16b, v1.16b, v0.16b // vpxor %xmm0, %xmm1, %xmm1 # 0 = j
852 tbl v3.16b, {v18.16b}, v0.16b // vpshufb %xmm0, %xmm10, %xmm3 # 3 = 1/i
853 eor v3.16b, v3.16b, v2.16b // vpxor %xmm2, %xmm3, %xmm3 # 3 = iak = 1/i + a/k
854 tbl v4.16b, {v18.16b}, v1.16b // vpshufb %xmm1, %xmm10, %xmm4 # 4 = 1/j
856 tbl v3.16b, {v18.16b}, v3.16b // vpshufb %xmm3, %xmm10, %xmm3 # 2 = 1/iak
857 eor v4.16b, v4.16b, v2.16b // vpxor %xmm2, %xmm4, %xmm4 # 4 = jak = 1/j + a/k
858 tbl v2.16b, {v18.16b}, v4.16b // vpshufb %xmm4, %xmm10, %xmm2 # 3 = 1/jak
860 eor v2.16b, v2.16b, v0.16b // vpxor %xmm0, %xmm2, %xmm2 # 3 = jo
866 eor v0.16b, v1.16b, v7.16b // vpxor %xmm7, %xmm1, %xmm0
869 .size _vpaes_schedule_round,.-_vpaes_schedule_round
874 // Linear-transform %xmm0 according to tables at (%r11)
883 and v1.16b, v0.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1
884 ushr v0.16b, v0.16b, #4 // vpsrlb $4, %xmm0, %xmm0
888 tbl v0.16b, {v21.16b}, v0.16b // vpshufb %xmm0, %xmm1, %xmm0
889 eor v0.16b, v0.16b, v2.16b // vpxor %xmm2, %xmm0, %xmm0
891 .size _vpaes_schedule_transform,.-_vpaes_schedule_transform
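The matched lines above show the whole shape of _vpaes_schedule_transform: mask off the low nibble, shift down the high nibble, look each one up in a 16-entry table, and XOR the results (the low-nibble tbl itself is not among the matched lines). A byte-at-a-time sketch, with placeholder tables standing in for the constant pair the tbl instructions index:

    #include <stdint.h>

    static void schedule_transform_sketch(uint8_t blk[16],
                                          const uint8_t lo_tab[16],  /* placeholder low-nibble table  */
                                          const uint8_t hi_tab[16])  /* placeholder high-nibble table */
    {
        for (int i = 0; i < 16; i++) {
            uint8_t lo = blk[i] & 0x0F;          /* and  v1, v0, v17 */
            uint8_t hi = blk[i] >> 4;            /* ushr v0, v0, #4  */
            blk[i] = lo_tab[lo] ^ hi_tab[hi];    /* two tbl lookups, then eor */
        }
    }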
896 // Mangle xmm0 from (basis-transformed) standard version
901 // multiply by circulant 0,1,1,1
914 // Clobbers xmm1-xmm5
919 mov v4.16b, v0.16b // vmovdqa %xmm0, %xmm4 # save xmm0 for later
924 eor v4.16b, v0.16b, v16.16b // vpxor .Lk_s63(%rip), %xmm0, %xmm4
938 ushr v1.16b, v4.16b, #4 // vpsrlb $4, %xmm4, %xmm1 # 1 = hi
972 sub x2, x2, #16 // add $-16, %rdx
976 add x8, x8, #64-16 // add $-16, %r8
977 and x8, x8, #~(1<<6) // and $0x30, %r8
980 .size _vpaes_schedule_mangle,.-_vpaes_schedule_mangle
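"Multiply by circulant 0,1,1,1" in the mangle comments above is a GF(2) matrix whose effect on a 4-byte column is simply the XOR of its three non-trivial byte rotations (equivalently, each output byte is the XOR of the other three input bytes). A scalar restatement on one 32-bit column; the byte order within the word is illustrative:

    #include <stdint.h>

    static uint32_t mul_circulant_0111(uint32_t col)
    {
        uint32_t r1 = (col >> 8)  | (col << 24);  /* rotate by one byte    */
        uint32_t r2 = (col >> 16) | (col << 16);  /* rotate by two bytes   */
        uint32_t r3 = (col >> 24) | (col << 8);   /* rotate by three bytes */
        return r1 ^ r2 ^ r3;
    }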
987 stp x29,x30,[sp,#-16]!
989 stp d8,d9,[sp,#-16]! // ABI spec says so
993 str w9, [x2,#240] // mov %eax,240(%rdx) # AES_KEY->rounds = nbits/32+5;
1004 .size vpaes_set_encrypt_key,.-vpaes_set_encrypt_key
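The "AES_KEY->rounds = nbits/32+5" comment above stores 9, 11 or 13 for 128/192/256-bit keys. Read together with the "nr--" loop counters visible earlier in the listing, this value evidently counts the middle rounds, with the round-0 key add and the final round handled outside the loop (that reading follows the loop structure of the x86 original and is not fully visible in the matched lines). A one-line restatement of the arithmetic:

    static int vpaes_stored_rounds(int nbits)
    {
        return nbits / 32 + 5;   /* 128 -> 9, 192 -> 11, 256 -> 13 */
    }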
1011 stp x29,x30,[sp,#-16]!
1013 stp d8,d9,[sp,#-16]! // ABI spec says so
1017 str w9, [x2,#240] // mov %eax,240(%rdx) # AES_KEY->rounds = nbits/32+5;
1022 mov w3, #1 // mov $1,%ecx
1023 lsr w8, w1, #1 // shr $1,%r8d
1032 .size vpaes_set_decrypt_key,.-vpaes_set_decrypt_key
1042 stp x29,x30,[sp,#-16]!
1048 ld1 {v0.16b}, [x4] // load ivec
1055 eor v7.16b, v7.16b, v0.16b // xor with ivec
1057 st1 {v0.16b}, [x1],#16 // save output
1061 st1 {v0.16b}, [x4] // write ivec
1067 .size vpaes_cbc_encrypt,.-vpaes_cbc_encrypt
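The vpaes_cbc_encrypt fragments above ("load ivec", "xor with ivec", "save output", "write ivec") are the standard CBC chain: XOR each plaintext block with the running IV, encrypt, let the ciphertext become the next IV, and write the final IV back for the caller. A scalar sketch, with a function pointer standing in for the single-block core:

    #include <stddef.h>
    #include <stdint.h>

    typedef void (*enc_block_fn)(const uint8_t in[16], uint8_t out[16]);

    static void cbc_encrypt_sketch(const uint8_t *in, uint8_t *out, size_t len,
                                   uint8_t ivec[16], enc_block_fn encrypt_block)
    {
        uint8_t buf[16];
        for (; len >= 16; len -= 16, in += 16, out += 16) {
            for (int i = 0; i < 16; i++)
                buf[i] = in[i] ^ ivec[i];        /* xor with ivec                */
            encrypt_block(buf, out);             /* save output                  */
            for (int i = 0; i < 16; i++)
                ivec[i] = out[i];                /* ciphertext becomes next ivec */
        }
        /* on return ivec holds the last ciphertext block ("write ivec") */
    }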
1074 stp x29,x30,[sp,#-16]!
1076 stp d8,d9,[sp,#-16]! // ABI spec says so
1077 stp d10,d11,[sp,#-16]!
1078 stp d12,d13,[sp,#-16]!
1079 stp d14,d15,[sp,#-16]!
1090 eor v0.16b, v0.16b, v6.16b // xor with ivec
1092 st1 {v0.16b}, [x1], #16
1100 eor v0.16b, v0.16b, v6.16b // xor with ivec
1103 st1 {v0.16b,v1.16b}, [x1], #32
1117 .size vpaes_cbc_decrypt,.-vpaes_cbc_decrypt
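Decryption is where the two-block core pays off: since P_i = D(C_i) XOR C_{i-1} and every ciphertext block is already in hand, pairs of blocks can be decrypted in parallel and each result XORed with its predecessor (or with the IV for the first block), which is what the paired st1 {v0.16b,v1.16b} store above reflects. A single-block scalar sketch of the chaining, ignoring the pairing:

    #include <stddef.h>
    #include <stdint.h>
    #include <string.h>

    typedef void (*dec_block_fn)(const uint8_t in[16], uint8_t out[16]);

    static void cbc_decrypt_sketch(const uint8_t *in, uint8_t *out, size_t len,
                                   uint8_t ivec[16], dec_block_fn decrypt_block)
    {
        uint8_t prev[16], cur[16];
        memcpy(prev, ivec, 16);
        for (; len >= 16; len -= 16, in += 16, out += 16) {
            memcpy(cur, in, 16);                 /* keep C_i: in/out may alias */
            decrypt_block(cur, out);
            for (int i = 0; i < 16; i++)
                out[i] ^= prev[i];               /* P_i = D(C_i) ^ C_{i-1}     */
            memcpy(prev, cur, 16);
        }
        memcpy(ivec, prev, 16);                  /* write ivec                 */
    }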
1123 stp x29,x30,[sp,#-16]!
1125 stp d8,d9,[sp,#-16]! // ABI spec says so
1126 stp d10,d11,[sp,#-16]!
1127 stp d12,d13,[sp,#-16]!
1128 stp d14,d15,[sp,#-16]!
1138 st1 {v0.16b}, [x1],#16
1146 st1 {v0.16b,v1.16b}, [x1], #32
1158 .size vpaes_ecb_encrypt,.-vpaes_ecb_encrypt
1165 stp x29,x30,[sp,#-16]!
1167 stp d8,d9,[sp,#-16]! // ABI spec says so
1168 stp d10,d11,[sp,#-16]!
1169 stp d12,d13,[sp,#-16]!
1170 stp d14,d15,[sp,#-16]!
1180 st1 {v0.16b}, [x1],#16
1188 st1 {v0.16b,v1.16b}, [x1], #32
1200 .size vpaes_ecb_decrypt,.-vpaes_ecb_decrypt
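Both ECB wrappers above store either a single register or a register pair per iteration, i.e. they push two independent blocks at a time through the _2x core and fall back to the single-block core for a lone trailing block. A sketch of that dispatch (not the exact control flow of the assembly), with function pointers standing in for the cores:

    #include <stddef.h>
    #include <stdint.h>

    static void ecb_sketch(const uint8_t *in, uint8_t *out, size_t len,
                           void (*core_1x)(const uint8_t *in, uint8_t *out),
                           void (*core_2x)(const uint8_t *in, uint8_t *out))
    {
        while (len >= 32) {                      /* two blocks per pass */
            core_2x(in, out);
            in += 32; out += 32; len -= 32;
        }
        if (len >= 16)                           /* odd trailing block  */
            core_1x(in, out);
    }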