1#! /usr/bin/env perl
2# Copyright (C) 2023 Intel Corporation
3#
4# Licensed under the OpenSSL license (the "License").  You may not use
5# this file except in compliance with the License.  You can obtain a copy
6# in the file LICENSE in the source distribution or at
7# https://www.openssl.org/source/license.html
8
9# This implementation is based on the AES-XTS code (AVX512VAES + VPCLMULQDQ)
10# from Intel(R) Intelligent Storage Acceleration Library Crypto Version
11# (https://github.com/intel/isa-l_crypto).
12#
######################################################################
# The main building block of the loop is code that encrypts/decrypts
# 8/16 blocks of data, stitched with generation of the tweaks for the
# next 8/16 blocks, utilizing VAES and VPCLMULQDQ instructions at the
# full width of the ZMM registers. The main loop is selected based on
# the input length:
# main_loop_run_16 encrypts/decrypts 16 blocks in parallel and is
# selected when input length >= 256 bytes (16 blocks or more);
# main_loop_run_8 encrypts/decrypts 8 blocks in parallel and is
# selected when 128 bytes <= input length < 256 bytes (8-15 blocks);
# input lengths below 128 bytes (fewer than 8 full blocks) are handled
# by do_n_blocks.
#
# This implementation mainly uses vpshrdq from the AVX-512 VBMI2
# extension, plus vaesenc/vaesdec (VAES) and vpclmulqdq (VPCLMULQDQ)
# operating on full-width ZMM registers.
26$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
27$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;
28
29$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
30$avx512vaes=0;
31
32$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
33( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
34( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
35die "can't locate x86_64-xlate.pl";
36
37if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
38        =~ /GNU assembler version ([2-9]\.[0-9]+)/) {
39    $avx512vaes = ($1>=2.30);
40}
41
42if (!$avx512vaes && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
43       `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)(?:\.([0-9]+))?/) {
44    $avx512vaes = ($1==2.11 && $2>=8) + ($1>=2.12);
45}
46
47if (!$avx512vaes && `$ENV{CC} -v 2>&1`
48    =~ /(Apple)?\s*((?:clang|LLVM) version|.*based on LLVM) ([0-9]+)\.([0-9]+)\.([0-9]+)?/) {
49    my $ver = $3 + $4/100.0 + $5/10000.0; # 3.1.0->3.01, 3.10.1->3.1001
50    if ($1) {
51        # Apple conditions, they use a different version series, see
52        # https://en.wikipedia.org/wiki/Xcode#Xcode_7.0_-_10.x_(since_Free_On-Device_Development)_2
53        # clang 7.0.0 is Apple clang 10.0.1
54        $avx512vaes = ($ver>=10.0001)
55    } else {
56        $avx512vaes = ($ver>=7.0);
57    }
58}
59
60open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""
61    or die "can't call $xlate: $!";
62*STDOUT=*OUT;
63
64#======================================================================
65
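# The AVX-512/VAES code below is emitted only when the assembler is new
# enough to encode these instructions; otherwise $avx512vaes stays 0 and
# the whole block is skipped.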
66if ($avx512vaes) {
67
  my $GP_STORAGE  = $win64 ? (16 * 18)  : (16 * 8);    # frame offset of saved GP regs (rbx, plus rdi/rsi on Win64)
  my $XMM_STORAGE = $win64 ? (16 * 8) : 0;             # frame offset of saved xmm6:xmm15 (Win64 only)
  my $VARIABLE_OFFSET = $win64 ? (16*8 + 16*10 + 8*3) :
                                 (16*8 + 8*1);         # total stack frame size

  # The first 0x80 (128) bytes at $TW hold the eight pre-computed tweak
  # values; offsets >= 0x80 hold the saved registers above.  All uses of
  # rsp must go through $TW; it must not be shadowed by any other name or
  # used directly.
76  my $TW = "%rsp";
77  my $TEMPHIGH = "%rbx";
78  my $TEMPLOW = "%rax";
79  my $ZPOLY = "%zmm25";
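  # $ZPOLY holds the XTS reduction constant 0x87 (for the polynomial
  # x^128 + x^7 + x^2 + x + 1) broadcast to every 64-bit lane of a ZMM
  # register; it is the vector counterpart of the scalar $gf_poly_8b below.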
80
81  # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
82  # ;;; Function arguments abstraction
83  # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
84  my ($key2, $key1, $tweak, $length, $input, $output);
85
86
  $input    = "%rdi";
  $output   = "%rsi";
  $length   = "%rdx";
  $key1     = "%rcx";
  $key2     = "%r8";
  $tweak    = "%r9";
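  # These are the System V AMD64 argument registers.  For Win64 builds the
  # perlasm translator (x86_64-xlate.pl) emits a prologue that copies the
  # Microsoft-ABI arguments into them, so the body is written only once.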
93
  # temporary registers ($tmp1 reuses $key2/%r8, which is only needed
  # while the initial tweak is being encrypted)
  my ($tmp1, $gf_poly_8b, $gf_poly_8b_temp);
  $tmp1             = "%r8";
  $gf_poly_8b       = "%r10";
  $gf_poly_8b_temp  = "%r11";
99
100  # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
101  # ;;; Helper functions
102  # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
103
  # Generates "random" local labels
  sub random_string() {
    my @chars  = ('a' .. 'z', 'A' .. 'Z', '0' .. '9', '_');
    my $length = 15;
    my $str;
    map { $str .= $chars[rand(@chars)] } 1 .. $length;
    return $str;
  }
112
113  # ; Seed the RNG so the labels are generated deterministically
114  srand(12345);
115
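  # encrypt_tweak: encrypt the initial tweak (the IV) with key2 (10 rounds
  # for AES-128, 14 for AES-256) and store the result at ($TW).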
116  sub encrypt_tweak {
117    my $state_tweak = $_[0];
118    my $is_128 = $_[1];
119
120    $code.=<<___;
121    vpxor	($key2), $state_tweak, $state_tweak
122    vaesenc	0x10($key2), $state_tweak, $state_tweak
123    vaesenc	0x20($key2), $state_tweak, $state_tweak
124    vaesenc	0x30($key2), $state_tweak, $state_tweak
125    vaesenc	0x40($key2), $state_tweak, $state_tweak
126    vaesenc	0x50($key2), $state_tweak, $state_tweak
127    vaesenc	0x60($key2), $state_tweak, $state_tweak
128    vaesenc	0x70($key2), $state_tweak, $state_tweak
129    vaesenc	0x80($key2), $state_tweak, $state_tweak
130    vaesenc	0x90($key2), $state_tweak, $state_tweak
131___
132
133    if ($is_128) {
134      $code .= "vaesenclast	0xa0($key2), $state_tweak, $state_tweak\n";
135    } else {
136      $code .= "vaesenc	0xa0($key2), $state_tweak, $state_tweak\n";
137      $code .= "vaesenc	0xb0($key2), $state_tweak, $state_tweak\n";
138      $code .= "vaesenc	0xc0($key2), $state_tweak, $state_tweak\n";
139      $code .= "vaesenc	0xd0($key2), $state_tweak, $state_tweak\n";
140      $code .= "vaesenclast	0xe0($key2), $state_tweak, $state_tweak\n";
141    }
142    $code .= "vmovdqa	$state_tweak, ($TW)\n";
143  }
144
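  # encrypt_final: encrypt a single block with key1, applying the XTS
  # tweak before the first round and again after the last one.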
145  sub encrypt_final {
146    my $st = $_[0];
147    my $tw = $_[1];
148    my $is_128 = $_[2];
149
150    # xor Tweak value
151	$code .= "vpxor	$tw, $st, $st\n";
152    $code .= "vpxor	($key1), $st, $st\n";
153
154    my $rounds = $is_128 ? 10 : 14;
155    for (my $i = 1; $i < $rounds; $i++) {
156      $code .= "vaesenc	16*$i($key1), $st, $st\n";
157    }
158
159    $code .=<<___;
160    vaesenclast 16*$rounds($key1), $st, $st
161    vpxor	$tw, $st, $st
162___
163  }
164
  # Decrypt initial blocks of AES.
  # 1, 2, 3, 4, 5, 6 or 7 blocks are decrypted.
  # The next 8 tweak values are generated when $lt128 == 0.
168  sub decrypt_initial {
169    my @st;
170    $st[0] = $_[0];
171    $st[1] = $_[1];
172    $st[2] = $_[2];
173    $st[3] = $_[3];
174    $st[4] = $_[4];
175    $st[5] = $_[5];
176    $st[6] = $_[6];
177    $st[7] = $_[7];
178
179    my @tw;
180    $tw[0] = $_[8];
181    $tw[1] = $_[9];
182    $tw[2] = $_[10];
183    $tw[3] = $_[11];
184    $tw[4] = $_[12];
185    $tw[5] = $_[13];
186    $tw[6] = $_[14];
187    my $t0 = $_[15];
188    my $num_blocks = $_[16];
189    my $lt128 = $_[17];
190    my $is_128 = $_[18];
191
192    # num_blocks blocks encrypted
193    # num_blocks can be 1, 2, 3, 4, 5, 6, 7
194
195    #  xor Tweak value
196    for (my $i = 0; $i < $num_blocks; $i++) {
197      $code .= "vpxor $tw[$i], $st[$i], $st[$i]\n";
198    }
199
200    $code .= "vmovdqu  ($key1), $t0\n";
201
202    for (my $i = 0; $i < $num_blocks; $i++) {
203      $code .= "vpxor $t0, $st[$i], $st[$i]\n";
204    }
205
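    # The scalar shl/adc/cmovc/xor sequences interleaved with the AES rounds
    # below multiply the current tweak ($TEMPHIGH:$TEMPLOW) by x in GF(2^128):
    # shift the 128-bit value left by one bit and, if a bit falls out of the
    # top, xor the polynomial 0x87 ($gf_poly_8b) into the low 64 bits.  Each
    # result is spilled to the stack at ($TW) as one of the next eight tweaks.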
206    if (0 == $lt128) {
207      $code .= <<___;
208      xor     $gf_poly_8b_temp, $gf_poly_8b_temp
209      shl     \$1, $TEMPLOW
210      adc     $TEMPHIGH, $TEMPHIGH
211___
212    }
213    # round 1
214    $code .= "vmovdqu 0x10($key1), $t0\n";
215
216    for (my $i = 0; $i < $num_blocks; $i++) {
217      $code .= "vaesdec $t0, $st[$i], $st[$i]\n";
218    }
219
220    if (0 == $lt128) {
221    $code .= <<___;
222      cmovc   $gf_poly_8b, $gf_poly_8b_temp
223      xor     $gf_poly_8b_temp, $TEMPLOW
224      mov     $TEMPLOW, ($TW)     # next Tweak1 generated
      mov     $TEMPHIGH, 0x08($TW)
226      xor     $gf_poly_8b_temp, $gf_poly_8b_temp
227___
228    }
229
230    # round 2
231    $code .= "vmovdqu 0x20($key1), $t0\n";
232
233    for (my $i = 0; $i < $num_blocks; $i++) {
234      $code .= "vaesdec $t0, $st[$i], $st[$i]\n";
235    }
236
237    if (0 == $lt128) {
238      $code .= <<___;
239      shl     \$1, $TEMPLOW
240      adc     $TEMPHIGH, $TEMPHIGH
241      cmovc   $gf_poly_8b, $gf_poly_8b_temp
242      xor     $gf_poly_8b_temp, $TEMPLOW
243      mov     $TEMPLOW, 0x10($TW) # next Tweak2 generated
244___
245    }
246
247    # round 3
248    $code .= "vmovdqu 0x30($key1), $t0\n";
249
250    for (my $i = 0; $i < $num_blocks; $i++) {
251      $code .= "vaesdec $t0, $st[$i], $st[$i]\n";
252    }
253
254    if (0 == $lt128) {
255      $code .= <<___;
256      mov     $TEMPHIGH, 0x18($TW)
257      xor     $gf_poly_8b_temp, $gf_poly_8b_temp
258      shl     \$1, $TEMPLOW
259      adc     $TEMPHIGH, $TEMPHIGH
260      cmovc   $gf_poly_8b, $gf_poly_8b_temp
261___
262    }
263
264    # round 4
265    $code .= "vmovdqu 0x40($key1), $t0\n";
266
267    for (my $i = 0; $i < $num_blocks; $i++) {
268      $code .= "vaesdec $t0, $st[$i], $st[$i]\n";
269    }
270
271    if (0 == $lt128) {
272    $code .= <<___;
273    xor     $gf_poly_8b_temp, $TEMPLOW
274    mov     $TEMPLOW, 0x20($TW) # next Tweak3 generated
275    mov     $TEMPHIGH, 0x28($TW)
276    xor     $gf_poly_8b_temp, $gf_poly_8b_temp
277    shl     \$1, $TEMPLOW
278___
279    }
280
281    # round 5
282    $code .= "vmovdqu 0x50($key1), $t0\n";
283
284    for (my $i = 0; $i < $num_blocks; $i++) {
285      $code .= "vaesdec $t0, $st[$i], $st[$i]\n";
286    }
287
288    if (0 == $lt128) {
289    $code .= <<___;
290      adc     $TEMPHIGH, $TEMPHIGH
291      cmovc   $gf_poly_8b, $gf_poly_8b_temp
292      xor     $gf_poly_8b_temp, $TEMPLOW
293      mov     $TEMPLOW, 0x30($TW) # next Tweak4 generated
294      mov     $TEMPHIGH, 0x38($TW)
295___
296    }
297
298    # round 6
299    $code .= "vmovdqu 0x60($key1), $t0\n";
300
301    for (my $i = 0; $i < $num_blocks; $i++) {
302      $code .= "vaesdec $t0, $st[$i], $st[$i]\n";
303    }
304
305    if (0 == $lt128) {
306      $code .= <<___;
307      xor     $gf_poly_8b_temp, $gf_poly_8b_temp
308      shl     \$1, $TEMPLOW
309      adc     $TEMPHIGH, $TEMPHIGH
310      cmovc   $gf_poly_8b, $gf_poly_8b_temp
311      xor     $gf_poly_8b_temp, $TEMPLOW
312      mov     $TEMPLOW, 0x40($TW) # next Tweak5 generated
313      mov     $TEMPHIGH, 0x48($TW)
314___
315    }
316
317    # round 7
318    $code .= "vmovdqu 0x70($key1), $t0\n";
319
320    for (my $i = 0; $i < $num_blocks; $i++) {
321      $code .= "vaesdec $t0, $st[$i], $st[$i]\n";
322    }
323
324    if (0 == $lt128) {
325      $code .= <<___;
326      xor     $gf_poly_8b_temp, $gf_poly_8b_temp
327      shl     \$1, $TEMPLOW
328      adc     $TEMPHIGH, $TEMPHIGH
329      cmovc   $gf_poly_8b, $gf_poly_8b_temp
330      xor     $gf_poly_8b_temp, $TEMPLOW
331      mov     $TEMPLOW, 0x50($TW) # next Tweak6 generated
332      mov     $TEMPHIGH, 0x58($TW)
333___
334    }
335
336    # round 8
337    $code .= "vmovdqu 0x80($key1), $t0\n";
338
339    for (my $i = 0; $i < $num_blocks; $i++) {
340      $code .= "vaesdec $t0, $st[$i], $st[$i]\n";
341    }
342
343    if (0 == $lt128) {
344      $code .= <<___;
345      xor     $gf_poly_8b_temp, $gf_poly_8b_temp
346      shl     \$1, $TEMPLOW
347      adc     $TEMPHIGH, $TEMPHIGH
348      cmovc   $gf_poly_8b, $gf_poly_8b_temp
349      xor     $gf_poly_8b_temp, $TEMPLOW
350      mov     $TEMPLOW, 0x60($TW) # next Tweak7 generated
351      mov     $TEMPHIGH, 0x68($TW)
352___
353    }
354
355    # round 9
356    $code .= "vmovdqu 0x90($key1), $t0\n";
357
358    for (my $i = 0; $i < $num_blocks; $i++) {
359      $code .= "vaesdec $t0, $st[$i], $st[$i]\n";
360    }
361
362    if (0 == $lt128) {
363      $code .= <<___;
364      xor     $gf_poly_8b_temp, $gf_poly_8b_temp
365      shl     \$1, $TEMPLOW
366      adc     $TEMPHIGH, $TEMPHIGH
367      cmovc   $gf_poly_8b, $gf_poly_8b_temp
368      xor     $gf_poly_8b_temp, $TEMPLOW
369      mov     $TEMPLOW, 0x70($TW) # next Tweak8 generated
370      mov     $TEMPHIGH, 0x78($TW)
371___
372    }
373
374    if ($is_128) {
375      # round 10
376      $code .= "vmovdqu 0xa0($key1), $t0\n";
377      for (my $i = 0; $i < $num_blocks; $i++) {
378        $code .= "vaesdeclast $t0, $st[$i], $st[$i]\n";
379      }
380    } else {
381      # round 10
382      $code .= "vmovdqu 0xa0($key1), $t0\n";
383      for (my $i = 0; $i < $num_blocks; $i++) {
384        $code .= "vaesdec $t0, $st[$i], $st[$i]\n";
385      }
386
387      # round 11
388      $code .= "vmovdqu 0xb0($key1), $t0\n";
389      for (my $i = 0; $i < $num_blocks; $i++) {
390        $code .= "vaesdec $t0, $st[$i], $st[$i]\n";
391      }
392
393      # round 12
394      $code .= "vmovdqu 0xc0($key1), $t0\n";
395      for (my $i = 0; $i < $num_blocks; $i++) {
396        $code .= "vaesdec $t0, $st[$i], $st[$i]\n";
397      }
398
399      # round 13
400      $code .= "vmovdqu 0xd0($key1), $t0\n";
401      for (my $i = 0; $i < $num_blocks; $i++) {
402        $code .= "vaesdec $t0, $st[$i], $st[$i]\n";
403      }
404
405      # round 14
406      $code .= "vmovdqu 0xe0($key1), $t0\n";
407      for (my $i = 0; $i < $num_blocks; $i++) {
408        $code .= "vaesdeclast $t0, $st[$i], $st[$i]\n";
409      }
410    }
411
412    # xor Tweak values
413    for (my $i = 0; $i < $num_blocks; $i++) {
414      $code .= "vpxor $tw[$i], $st[$i], $st[$i]\n";
415    }
416
417    if (0 == $lt128) {
418      # load next Tweak values
419      $code .= <<___;
      vmovdqa  ($TW), $tw[0]
      vmovdqa  0x10($TW), $tw[1]
      vmovdqa  0x20($TW), $tw[2]
      vmovdqa  0x30($TW), $tw[3]
      vmovdqa  0x40($TW), $tw[4]
      vmovdqa  0x50($TW), $tw[5]
      vmovdqa  0x60($TW), $tw[6]
427___
428    }
429  }
430
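  # initialize: load the first num_initial_blocks input blocks and derive
  # their tweaks from the one stored at ($TW), multiplying by x in GF(2^128)
  # for each successive block and spilling every tweak back to the stack.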
431  sub initialize {
432    my @st;
433    $st[0] = $_[0];
434    $st[1] = $_[1];
435    $st[2] = $_[2];
436    $st[3] = $_[3];
437    $st[4] = $_[4];
438    $st[5] = $_[5];
439    $st[6] = $_[6];
440    $st[7] = $_[7];
441
442    my @tw;
443    $tw[0] = $_[8];
444    $tw[1] = $_[9];
445    $tw[2] = $_[10];
446    $tw[3] = $_[11];
447    $tw[4] = $_[12];
448    $tw[5] = $_[13];
449    $tw[6] = $_[14];
450    my $num_initial_blocks = $_[15];
451
452    $code .= <<___;
453    vmovdqa  0x0($TW), $tw[0]
454    mov      0x0($TW), $TEMPLOW
455    mov      0x08($TW), $TEMPHIGH
456    vmovdqu  0x0($input), $st[0]
457___
458
459    if ($num_initial_blocks >= 2) {
460      for (my $i = 1; $i < $num_initial_blocks; $i++) {
461        $code .= "xor      $gf_poly_8b_temp, $gf_poly_8b_temp\n";
462        $code .= "shl      \$1, $TEMPLOW\n";
463        $code .= "adc      $TEMPHIGH, $TEMPHIGH\n";
464        $code .= "cmovc    $gf_poly_8b, $gf_poly_8b_temp\n";
465        $code .= "xor      $gf_poly_8b_temp, $TEMPLOW\n";
466        my $offset = $i * 16;
467        $code .= "mov      $TEMPLOW, $offset($TW)\n";
468        $code .= "mov      $TEMPHIGH, $offset + 8($TW)\n";
469        $code .= "vmovdqa  $offset($TW), $tw[$i]\n";
470        $code .= "vmovdqu  $offset($input), $st[$i]\n";
471      }
472    }
473  }
474
  # Encrypt up to 4 blocks in parallel (one xmm/ymm/zmm register wide)
476  sub encrypt_by_four {
477    my $st1 = $_[0]; # state 1
478    my $tw1 = $_[1]; # tweak 1
479    my $tmp = $_[2];
480    my $is_128 = $_[3];
481
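    # vpternlogq with immediate 0x96 is a three-way XOR, so this single
    # instruction performs both the tweak XOR and the round-0 AddRoundKey.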
482    $code .= "vbroadcasti32x4 ($key1), $tmp\n";
483    $code .= "vpternlogq      \$0x96, $tmp, $tw1, $st1\n";
484
485    my $rounds = $is_128 ? 10 : 14;
486    for (my $i = 1; $i < $rounds; $i++) {
487      $code .= "vbroadcasti32x4 16*$i($key1), $tmp\n";
488      $code .= "vaesenc  $tmp, $st1, $st1\n";
489    }
490
491    $code .= "vbroadcasti32x4 16*$rounds($key1), $tmp\n";
492    $code .= "vaesenclast  $tmp, $st1, $st1\n";
493
494    $code .= "vpxorq $tw1, $st1, $st1\n";
495  }
496
497  # Encrypt 8 blocks in parallel
498  # generate next 8 tweak values
499  sub encrypt_by_eight_zmm {
500    my $st1 = $_[0];
501    my $st2 = $_[1];
502    my $tw1 = $_[2];
503    my $tw2 = $_[3];
504    my $t0 = $_[4];
505    my $last_eight = $_[5];
506    my $is_128 = $_[6];
507
508    $code .= <<___;
509	vbroadcasti32x4 ($key1), $t0
510	vpternlogq    \$0x96, $t0, $tw1, $st1
511	vpternlogq    \$0x96, $t0, $tw2, $st2
512___
513
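    # When more data follows, compute the tweaks for the next iteration in
    # parallel with the AES rounds: each 128-bit ZMM lane holds one tweak,
    # vpslldq shifts it left by one byte (i.e. multiplies by x^8), and the
    # byte shifted out (isolated by vpsrldq 0xf) is reduced back in via a
    # carry-less multiply with the polynomial in $ZPOLY.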
514    if (0 == $last_eight) {
515      $code .= <<___;
516      vpsrldq		\$0xf, $tw1, %zmm13
517      vpclmulqdq	\$0x0, $ZPOLY, %zmm13, %zmm14
518      vpslldq		\$0x1, $tw1, %zmm15
519      vpxord		%zmm14, %zmm15, %zmm15
520___
521    }
522    # round 1
523    $code .= <<___;
524    vbroadcasti32x4 0x10($key1), $t0
525    vaesenc  $t0, $st1, $st1
526    vaesenc  $t0, $st2, $st2
527
528    # round 2
529    vbroadcasti32x4 0x20($key1), $t0
530    vaesenc  $t0, $st1, $st1
531    vaesenc  $t0, $st2, $st2
532
533    # round 3
534    vbroadcasti32x4 0x30($key1), $t0
535    vaesenc  $t0, $st1, $st1
536    vaesenc  $t0, $st2, $st2
537___
538
539    if (0 == $last_eight) {
540      $code .= <<___;
541      vpsrldq		\$0xf, $tw2, %zmm13
542      vpclmulqdq	\$0x0, $ZPOLY, %zmm13, %zmm14
543      vpslldq		\$0x1, $tw2, %zmm16
544      vpxord		%zmm14, %zmm16, %zmm16
545___
546    }
547
548    $code .= <<___;
549    # round 4
550    vbroadcasti32x4 0x40($key1), $t0
551    vaesenc  $t0, $st1, $st1
552    vaesenc  $t0, $st2, $st2
553
554    # round 5
555    vbroadcasti32x4 0x50($key1), $t0
556    vaesenc  $t0, $st1, $st1
557    vaesenc  $t0, $st2, $st2
558
559    # round 6
560    vbroadcasti32x4 0x60($key1), $t0
561    vaesenc  $t0, $st1, $st1
562    vaesenc  $t0, $st2, $st2
563
564    # round 7
565    vbroadcasti32x4 0x70($key1), $t0
566    vaesenc  $t0, $st1, $st1
567    vaesenc  $t0, $st2, $st2
568
569    # round 8
570    vbroadcasti32x4 0x80($key1), $t0
571    vaesenc  $t0, $st1, $st1
572    vaesenc  $t0, $st2, $st2
573
574    # round 9
575    vbroadcasti32x4 0x90($key1), $t0
576    vaesenc  $t0, $st1, $st1
577    vaesenc  $t0, $st2, $st2
578___
579
580    if ($is_128) {
581      $code .= <<___;
582      # round 10
583      vbroadcasti32x4 0xa0($key1), $t0
584      vaesenclast  $t0, $st1, $st1
585      vaesenclast  $t0, $st2, $st2
586___
587    } else {
588      $code .= <<___;
589      # round 10
590      vbroadcasti32x4 0xa0($key1), $t0
591      vaesenc  $t0, $st1, $st1
592      vaesenc  $t0, $st2, $st2
593
594      # round 11
595      vbroadcasti32x4 0xb0($key1), $t0
596      vaesenc  $t0, $st1, $st1
597      vaesenc  $t0, $st2, $st2
598
599      # round 12
600      vbroadcasti32x4 0xc0($key1), $t0
601      vaesenc  $t0, $st1, $st1
602      vaesenc  $t0, $st2, $st2
603
604      # round 13
605      vbroadcasti32x4 0xd0($key1), $t0
606      vaesenc  $t0, $st1, $st1
607      vaesenc  $t0, $st2, $st2
608
609      # round 14
610      vbroadcasti32x4 0xe0($key1), $t0
611      vaesenclast  $t0, $st1, $st1
612      vaesenclast  $t0, $st2, $st2
613___
614    }
615
616    # xor Tweak values
617    $code .= "vpxorq    $tw1, $st1, $st1\n";
618    $code .= "vpxorq    $tw2, $st2, $st2\n";
619
620    if (0 == $last_eight) {
621      # load next Tweak values
622      $code .= <<___;
623      vmovdqa32  %zmm15, $tw1
624      vmovdqa32  %zmm16, $tw2
625___
626    }
627  }
628
629  # Decrypt 8 blocks in parallel
630  # generate next 8 tweak values
631  sub decrypt_by_eight_zmm {
632    my $st1 = $_[0];
633    my $st2 = $_[1];
634    my $tw1 = $_[2];
635    my $tw2 = $_[3];
636    my $t0 = $_[4];
637    my $last_eight = $_[5];
638    my $is_128 = $_[6];
639
640    $code .= <<___;
641    # xor Tweak values
642    vpxorq    $tw1, $st1, $st1
643    vpxorq    $tw2, $st2, $st2
644
645    # ARK
646    vbroadcasti32x4 ($key1), $t0
647    vpxorq    $t0, $st1, $st1
648    vpxorq    $t0, $st2, $st2
649___
650
651    if (0 == $last_eight) {
652      $code .= <<___;
653      vpsrldq		\$0xf, $tw1, %zmm13
654      vpclmulqdq	\$0x0,$ZPOLY, %zmm13, %zmm14
655      vpslldq		\$0x1, $tw1, %zmm15
656      vpxord		%zmm14, %zmm15, %zmm15
657___
658    }
659    # round 1
660    $code .= <<___;
661    vbroadcasti32x4 0x10($key1), $t0
662    vaesdec  $t0, $st1, $st1
663    vaesdec  $t0, $st2, $st2
664
665    # round 2
666    vbroadcasti32x4 0x20($key1), $t0
667    vaesdec  $t0, $st1, $st1
668    vaesdec  $t0, $st2, $st2
669
670    # round 3
671    vbroadcasti32x4 0x30($key1), $t0
672    vaesdec  $t0, $st1, $st1
673    vaesdec  $t0, $st2, $st2
674___
675
676    if (0 == $last_eight) {
677      $code .= <<___;
678      vpsrldq		\$0xf, $tw2, %zmm13
679      vpclmulqdq	\$0x0,$ZPOLY, %zmm13, %zmm14
680      vpslldq		\$0x1, $tw2, %zmm16
681      vpxord		%zmm14, %zmm16, %zmm16
682___
683    }
684
685    $code .= <<___;
686    # round 4
687    vbroadcasti32x4 0x40($key1), $t0
688    vaesdec  $t0, $st1, $st1
689    vaesdec  $t0, $st2, $st2
690
691    # round 5
692    vbroadcasti32x4 0x50($key1), $t0
693    vaesdec  $t0, $st1, $st1
694    vaesdec  $t0, $st2, $st2
695
696    # round 6
697    vbroadcasti32x4 0x60($key1), $t0
698    vaesdec  $t0, $st1, $st1
699    vaesdec  $t0, $st2, $st2
700
701    # round 7
702    vbroadcasti32x4 0x70($key1), $t0
703    vaesdec  $t0, $st1, $st1
704    vaesdec  $t0, $st2, $st2
705
706    # round 8
707    vbroadcasti32x4 0x80($key1), $t0
708    vaesdec  $t0, $st1, $st1
709    vaesdec  $t0, $st2, $st2
710
711    # round 9
712    vbroadcasti32x4 0x90($key1), $t0
713    vaesdec  $t0, $st1, $st1
714    vaesdec  $t0, $st2, $st2
715
716___
717    if ($is_128) {
718      $code .= <<___;
719      # round 10
720      vbroadcasti32x4 0xa0($key1), $t0
721      vaesdeclast  $t0, $st1, $st1
722      vaesdeclast  $t0, $st2, $st2
723___
724    } else {
725      $code .= <<___;
726      # round 10
727      vbroadcasti32x4 0xa0($key1), $t0
728      vaesdec  $t0, $st1, $st1
729      vaesdec  $t0, $st2, $st2
730
731      # round 11
732      vbroadcasti32x4 0xb0($key1), $t0
733      vaesdec  $t0, $st1, $st1
734      vaesdec  $t0, $st2, $st2
735
736      # round 12
737      vbroadcasti32x4 0xc0($key1), $t0
738      vaesdec  $t0, $st1, $st1
739      vaesdec  $t0, $st2, $st2
740
741      # round 13
742      vbroadcasti32x4 0xd0($key1), $t0
743      vaesdec  $t0, $st1, $st1
744      vaesdec  $t0, $st2, $st2
745
746      # round 14
747      vbroadcasti32x4 0xe0($key1), $t0
748      vaesdeclast  $t0, $st1, $st1
749      vaesdeclast  $t0, $st2, $st2
750___
751    }
752
753    $code .= <<___;
754    # xor Tweak values
755    vpxorq    $tw1, $st1, $st1
756    vpxorq    $tw2, $st2, $st2
757
758    # load next Tweak values
759    vmovdqa32  %zmm15, $tw1
760    vmovdqa32  %zmm16, $tw2
761___
762  }
763
764  # Encrypt 16 blocks in parallel
765  # generate next 16 tweak values
766  sub encrypt_by_16_zmm {
767    my @st;
768    $st[0] = $_[0];
769    $st[1] = $_[1];
770    $st[2] = $_[2];
771    $st[3] = $_[3];
772
773    my @tw;
774    $tw[0] = $_[4];
775    $tw[1] = $_[5];
776    $tw[2] = $_[6];
777    $tw[3] = $_[7];
778
779    my $t0 = $_[8];
780    my $last_eight = $_[9];
781    my $is_128 = $_[10];
782
783    # xor Tweak values
784    for (my $i = 0; $i < 4; $i++) {
785      $code .= "vpxorq    $tw[$i], $st[$i], $st[$i]\n";
786    }
787
788    # ARK
789    $code .= "vbroadcasti32x4 ($key1), $t0\n";
790    for (my $i = 0; $i < 4; $i++) {
791      $code .= "vpxorq $t0, $st[$i], $st[$i]\n";
792    }
793
794    if (0 == $last_eight) {
795      $code .= <<___;
796      vpsrldq		\$0xf, $tw[2], %zmm13
797      vpclmulqdq	\$0x0,$ZPOLY, %zmm13, %zmm14
798      vpslldq		\$0x1, $tw[2], %zmm15
799      vpxord		%zmm14, %zmm15, %zmm15
800___
801    }
802
803    # round 1
804    $code .= "vbroadcasti32x4 0x10($key1), $t0\n";
805    for (my $i = 0; $i < 4; $i++) {
806      $code .= "vaesenc $t0, $st[$i], $st[$i]\n";
807    }
808
809    # round 2
810    $code .= "vbroadcasti32x4 0x20($key1), $t0\n";
811    for (my $i = 0; $i < 4; $i++) {
812      $code .= "vaesenc $t0, $st[$i], $st[$i]\n";
813    }
814
815    # round 3
816    $code .= "vbroadcasti32x4 0x30($key1), $t0\n";
817    for (my $i = 0; $i < 4; $i++) {
818      $code .= "vaesenc $t0, $st[$i], $st[$i]\n";
819    }
820
821    if (0 == $last_eight) {
822      $code .= <<___;
823      vpsrldq		\$0xf, $tw[3], %zmm13
824      vpclmulqdq	\$0x0,$ZPOLY, %zmm13, %zmm14
825      vpslldq		\$0x1, $tw[3], %zmm16
826      vpxord		%zmm14, %zmm16, %zmm16
827___
828    }
829    # round 4
830    $code .= "vbroadcasti32x4 0x40($key1), $t0\n";
831    for (my $i = 0; $i < 4; $i++) {
832      $code .= "vaesenc $t0, $st[$i], $st[$i]\n";
833    }
834
835    # round 5
836    $code .= "vbroadcasti32x4 0x50($key1), $t0\n";
837    for (my $i = 0; $i < 4; $i++) {
838      $code .= "vaesenc $t0, $st[$i], $st[$i]\n";
839    }
840
841    # round 6
842    $code .= "vbroadcasti32x4 0x60($key1), $t0\n";
843    for (my $i = 0; $i < 4; $i++) {
844      $code .= "vaesenc $t0, $st[$i], $st[$i]\n";
845    }
846
847    if (0 == $last_eight) {
848      $code .= <<___;
849      vpsrldq		\$0xf, %zmm15, %zmm13
850      vpclmulqdq	\$0x0,$ZPOLY, %zmm13, %zmm14
851      vpslldq		\$0x1, %zmm15, %zmm17
852      vpxord		%zmm14, %zmm17, %zmm17
853___
854    }
855    # round 7
856    $code .= "vbroadcasti32x4 0x70($key1), $t0\n";
857    for (my $i = 0; $i < 4; $i++) {
858      $code .= "vaesenc $t0, $st[$i], $st[$i]\n";
859    }
860
861    # round 8
862    $code .= "vbroadcasti32x4 0x80($key1), $t0\n";
863    for (my $i = 0; $i < 4; $i++) {
864      $code .= "vaesenc $t0, $st[$i], $st[$i]\n";
865    }
866
867    # round 9
868    $code .= "vbroadcasti32x4 0x90($key1), $t0\n";
869    for (my $i = 0; $i < 4; $i++) {
870      $code .= "vaesenc $t0, $st[$i], $st[$i]\n";
871    }
872
873    if (0 == $last_eight) {
874      $code .= <<___;
875      vpsrldq		\$0xf, %zmm16, %zmm13
876      vpclmulqdq	\$0x0,$ZPOLY, %zmm13, %zmm14
877      vpslldq		\$0x1, %zmm16, %zmm18
878      vpxord		%zmm14, %zmm18, %zmm18
879___
880    }
881    if ($is_128) {
882      # round 10
883      $code .= "vbroadcasti32x4 0xa0($key1), $t0\n";
884      for (my $i = 0; $i < 4; $i++) {
885        $code .= "vaesenclast $t0, $st[$i], $st[$i]\n";
886      }
887    } else {
888      # round 10
889      $code .= "vbroadcasti32x4 0xa0($key1), $t0\n";
890      for (my $i = 0; $i < 4; $i++) {
891        $code .= "vaesenc $t0, $st[$i], $st[$i]\n";
892      }
893      # round 11
894      $code .= "vbroadcasti32x4 0xb0($key1), $t0\n";
895      for (my $i = 0; $i < 4; $i++) {
896        $code .= "vaesenc $t0, $st[$i], $st[$i]\n";
897      }
898      # round 12
899      $code .= "vbroadcasti32x4 0xc0($key1), $t0\n";
900      for (my $i = 0; $i < 4; $i++) {
901        $code .= "vaesenc $t0, $st[$i], $st[$i]\n";
902      }
903      # round 13
904      $code .= "vbroadcasti32x4 0xd0($key1), $t0\n";
905      for (my $i = 0; $i < 4; $i++) {
906        $code .= "vaesenc $t0, $st[$i], $st[$i]\n";
907      }
908      # round 14
909      $code .= "vbroadcasti32x4 0xe0($key1), $t0\n";
910      for (my $i = 0; $i < 4; $i++) {
911        $code .= "vaesenclast $t0, $st[$i], $st[$i]\n";
912      }
913    }
914
915    # xor Tweak values
916    for (my $i = 0; $i < 4; $i++) {
917      $code .= "vpxorq    $tw[$i], $st[$i], $st[$i]\n";
918    }
919
920    $code .= <<___;
921    # load next Tweak values
922    vmovdqa32  %zmm15, $tw[0]
923    vmovdqa32  %zmm16, $tw[1]
924    vmovdqa32  %zmm17, $tw[2]
925    vmovdqa32  %zmm18, $tw[3]
926___
927  }
928
929  # Decrypt 16 blocks in parallel
  # generate next 16 tweak values
931  sub decrypt_by_16_zmm {
932    my @st;
933    $st[0] = $_[0];
934    $st[1] = $_[1];
935    $st[2] = $_[2];
936    $st[3] = $_[3];
937
938    my @tw;
939    $tw[0] = $_[4];
940    $tw[1] = $_[5];
941    $tw[2] = $_[6];
942    $tw[3] = $_[7];
943
944    my $t0 = $_[8];
945    my $last_eight = $_[9];
946    my $is_128 = $_[10];
947
948    # xor Tweak values
949    for (my $i = 0; $i < 4; $i++) {
950      $code .= "vpxorq    $tw[$i], $st[$i], $st[$i]\n";
951    }
952
953    # ARK
954    $code .= "vbroadcasti32x4 ($key1), $t0\n";
955    for (my $i = 0; $i < 4; $i++) {
956      $code .= "vpxorq $t0, $st[$i], $st[$i]\n";
957    }
958
959    if (0 == $last_eight) {
960      $code .= <<___;
961      vpsrldq		\$0xf, $tw[2], %zmm13
962      vpclmulqdq	\$0x0,$ZPOLY, %zmm13, %zmm14
963      vpslldq		\$0x1, $tw[2], %zmm15
964      vpxord		%zmm14, %zmm15, %zmm15
965___
966    }
967
968    # round 1
969    $code .= "vbroadcasti32x4 0x10($key1), $t0\n";
970    for (my $i = 0; $i < 4; $i++) {
971      $code .= "vaesdec $t0, $st[$i], $st[$i]\n";
972    }
973
974    # round 2
975    $code .= "vbroadcasti32x4 0x20($key1), $t0\n";
976    for (my $i = 0; $i < 4; $i++) {
977      $code .= "vaesdec $t0, $st[$i], $st[$i]\n";
978    }
979
980    # round 3
981    $code .= "vbroadcasti32x4 0x30($key1), $t0\n";
982    for (my $i = 0; $i < 4; $i++) {
983      $code .= "vaesdec $t0, $st[$i], $st[$i]\n";
984    }
985
986    if (0 == $last_eight) {
987      $code .= <<___;
988      vpsrldq		\$0xf, $tw[3], %zmm13
989      vpclmulqdq	\$0x0,$ZPOLY, %zmm13, %zmm14
990      vpslldq		\$0x1, $tw[3], %zmm16
991      vpxord		%zmm14, %zmm16, %zmm16
992___
993    }
994    # round 4
995    $code .= "vbroadcasti32x4 0x40($key1), $t0\n";
996    for (my $i = 0; $i < 4; $i++) {
997      $code .= "vaesdec $t0, $st[$i], $st[$i]\n";
998    }
999
1000    # round 5
1001    $code .= "vbroadcasti32x4 0x50($key1), $t0\n";
1002    for (my $i = 0; $i < 4; $i++) {
1003      $code .= "vaesdec $t0, $st[$i], $st[$i]\n";
1004    }
1005
1006    # round 6
1007    $code .= "vbroadcasti32x4 0x60($key1), $t0\n";
1008    for (my $i = 0; $i < 4; $i++) {
1009      $code .= "vaesdec $t0, $st[$i], $st[$i]\n";
1010    }
1011
1012    if (0 == $last_eight) {
1013      $code .= <<___;
1014      vpsrldq		\$0xf, %zmm15, %zmm13
1015      vpclmulqdq	\$0x0,$ZPOLY, %zmm13, %zmm14
1016      vpslldq		\$0x1, %zmm15, %zmm17
1017      vpxord		%zmm14, %zmm17, %zmm17
1018___
1019    }
1020    # round 7
1021    $code .= "vbroadcasti32x4 0x70($key1), $t0\n";
1022    for (my $i = 0; $i < 4; $i++) {
1023      $code .= "vaesdec $t0, $st[$i], $st[$i]\n";
1024    }
1025
1026    # round 8
1027    $code .= "vbroadcasti32x4 0x80($key1), $t0\n";
1028    for (my $i = 0; $i < 4; $i++) {
1029      $code .= "vaesdec $t0, $st[$i], $st[$i]\n";
1030    }
1031
1032    # round 9
1033    $code .= "vbroadcasti32x4 0x90($key1), $t0\n";
1034    for (my $i = 0; $i < 4; $i++) {
1035      $code .= "vaesdec $t0, $st[$i], $st[$i]\n";
1036    }
1037
1038    if (0 == $last_eight) {
1039      $code .= <<___;
1040      vpsrldq		\$0xf, %zmm16, %zmm13
1041      vpclmulqdq	\$0x0,$ZPOLY, %zmm13, %zmm14
1042      vpslldq		\$0x1, %zmm16, %zmm18
1043      vpxord		%zmm14, %zmm18, %zmm18
1044___
1045    }
1046    if ($is_128) {
1047      # round 10
1048      $code .= "vbroadcasti32x4 0xa0($key1), $t0\n";
1049      for (my $i = 0; $i < 4; $i++) {
1050        $code .= "vaesdeclast $t0, $st[$i], $st[$i]\n";
1051      }
1052    } else {
1053      # round 10
1054      $code .= "vbroadcasti32x4 0xa0($key1), $t0\n";
1055      for (my $i = 0; $i < 4; $i++) {
1056        $code .= "vaesdec $t0, $st[$i], $st[$i]\n";
1057      }
1058
1059      # round 11
1060      $code .= "vbroadcasti32x4 0xb0($key1), $t0\n";
1061      for (my $i = 0; $i < 4; $i++) {
1062        $code .= "vaesdec $t0, $st[$i], $st[$i]\n";
1063      }
1064
1065      # round 12
1066      $code .= "vbroadcasti32x4 0xc0($key1), $t0\n";
1067      for (my $i = 0; $i < 4; $i++) {
1068        $code .= "vaesdec $t0, $st[$i], $st[$i]\n";
1069      }
1070
1071      # round 13
1072      $code .= "vbroadcasti32x4 0xd0($key1), $t0\n";
1073      for (my $i = 0; $i < 4; $i++) {
1074        $code .= "vaesdec $t0, $st[$i], $st[$i]\n";
1075      }
1076
1077      # round 14
1078      $code .= "vbroadcasti32x4 0xe0($key1), $t0\n";
1079      for (my $i = 0; $i < 4; $i++) {
1080        $code .= "vaesdeclast $t0, $st[$i], $st[$i]\n";
1081      }
1082    }
1083
1084    # xor Tweak values
1085    for (my $i = 0; $i < 4; $i++) {
1086      $code .= "vpxorq    $tw[$i], $st[$i], $st[$i]\n";
1087    }
1088
1089    $code .= <<___;
1090    # load next Tweak values
1091    vmovdqa32  %zmm15, $tw[0]
1092    vmovdqa32  %zmm16, $tw[1]
1093    vmovdqa32  %zmm17, $tw[2]
1094    vmovdqa32  %zmm18, $tw[3]
1095___
1096  }
1097
1098  $code .= ".text\n";
1099
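  # aesni_xts_avx512_eligible() returns non-zero only when the CPU reports
  # AVX512F/DQ/BW/VL together with VAES, VPCLMULQDQ and AVX512_VBMI2, i.e.
  # every feature the routines below rely on.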
1100  {
1101    $code.=<<"___";
1102    .extern	OPENSSL_ia32cap_P
1103    .globl	aesni_xts_avx512_eligible
1104    .type	aesni_xts_avx512_eligible,\@abi-omnipotent
1105    .align	32
1106    aesni_xts_avx512_eligible:
1107        mov	OPENSSL_ia32cap_P+8(%rip), %ecx
1108        xor	%eax,%eax
1109    	# 1<<31|1<<30|1<<17|1<<16 avx512vl + avx512bw + avx512dq + avx512f
1110        and	\$0xc0030000, %ecx
1111        cmp	\$0xc0030000, %ecx
1112        jne	.L_done
1113        mov	OPENSSL_ia32cap_P+12(%rip), %ecx
1114    	# 1<<10|1<<9|1<<6 vaes + vpclmulqdq + vbmi2
1115        and	\$0x640, %ecx
1116        cmp	\$0x640, %ecx
1117        cmove	%ecx,%eax
1118        .L_done:
1119        ret
1120    .size   aesni_xts_avx512_eligible, .-aesni_xts_avx512_eligible
1121___
1122  }
1123
1124
1125  # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
1126  # ;void aesni_xts_[128|256]_encrypt_avx512(
1127  # ;               const uint8_t *in,        // input data
1128  # ;               uint8_t *out,             // output data
1129  # ;               size_t length,            // sector size, in bytes
1130  # ;               const AES_KEY *key1,      // key used for "ECB" encryption
1131  # ;               const AES_KEY *key2,      // key used for tweaking
1132  # ;               const uint8_t iv[16])     // initial tweak value, 16 bytes
1133  # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
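  # ; Illustrative caller sketch (hypothetical variable names, written from
  # ; the prototype above; the real call sites live elsewhere in OpenSSL):
  # ;
  # ;   AES_KEY k1, k2;
  # ;   AES_set_encrypt_key(data_key, 128, &k1);
  # ;   AES_set_encrypt_key(tweak_key, 128, &k2);
  # ;   if (aesni_xts_avx512_eligible())
  # ;       aesni_xts_128_encrypt_avx512(in, out, len, &k1, &k2, iv);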
1134  sub enc {
1135    my $is_128 = $_[0];
1136    my $rndsuffix = &random_string();
1137
1138    if ($is_128) {
1139      $code.=<<___;
1140      .globl	aesni_xts_128_encrypt_avx512
1141      .hidden	aesni_xts_128_encrypt_avx512
1142      .type	aesni_xts_128_encrypt_avx512,\@function,6
1143      .align	32
1144      aesni_xts_128_encrypt_avx512:
1145      .cfi_startproc
1146      endbranch
1147___
1148    } else {
1149      $code.=<<___;
1150      .globl	aesni_xts_256_encrypt_avx512
1151      .hidden	aesni_xts_256_encrypt_avx512
1152      .type	aesni_xts_256_encrypt_avx512,\@function,6
1153      .align	32
1154      aesni_xts_256_encrypt_avx512:
1155      .cfi_startproc
1156      endbranch
1157___
1158    }
1159    $code .= "push 	 %rbp\n";
1160    $code .= "mov 	 $TW,%rbp\n";
1161    $code .= "sub 	 \$$VARIABLE_OFFSET,$TW\n";
1162    $code .= "and 	 \$0xffffffffffffffc0,$TW\n";
1163    $code .= "mov 	 %rbx,$GP_STORAGE($TW)\n";
1164
1165    if ($win64) {
1166      $code .= "mov 	 %rdi,$GP_STORAGE + 8*1($TW)\n";
1167      $code .= "mov 	 %rsi,$GP_STORAGE + 8*2($TW)\n";
1168      $code .= "vmovdqa      %xmm6, $XMM_STORAGE + 16*0($TW)\n";
1169      $code .= "vmovdqa      %xmm7, $XMM_STORAGE + 16*1($TW)\n";
1170      $code .= "vmovdqa      %xmm8, $XMM_STORAGE + 16*2($TW)\n";
1171      $code .= "vmovdqa      %xmm9, $XMM_STORAGE + 16*3($TW)\n";
1172      $code .= "vmovdqa      %xmm10, $XMM_STORAGE + 16*4($TW)\n";
1173      $code .= "vmovdqa      %xmm11, $XMM_STORAGE + 16*5($TW)\n";
1174      $code .= "vmovdqa      %xmm12, $XMM_STORAGE + 16*6($TW)\n";
1175      $code .= "vmovdqa      %xmm13, $XMM_STORAGE + 16*7($TW)\n";
1176      $code .= "vmovdqa      %xmm14, $XMM_STORAGE + 16*8($TW)\n";
1177      $code .= "vmovdqa      %xmm15, $XMM_STORAGE + 16*9($TW)\n";
1178    }
1179
1180    $code .= "mov 	 \$0x87, $gf_poly_8b\n";
1181    $code .= "vmovdqu 	 ($tweak),%xmm1\n";      # read initial tweak values
1182
1183    encrypt_tweak("%xmm1", $is_128);
1184
1185    if ($win64) {
      $code .= "mov	 $input, 8 + 8*5(%rbp)\n";  # plaintext pointer
      $code .= "mov        $output, 8 + 8*6(%rbp)\n"; # ciphertext pointer
1188    }
1189
1190    {
1191    $code.=<<___;
1192
1193    cmp 	 \$0x80,$length
1194    jl 	 .L_less_than_128_bytes_${rndsuffix}
1195    vpbroadcastq 	 $gf_poly_8b,$ZPOLY
1196    cmp 	 \$0x100,$length
1197    jge 	 .L_start_by16_${rndsuffix}
1198    cmp 	 \$0x80,$length
1199    jge 	 .L_start_by8_${rndsuffix}
1200
1201    .L_do_n_blocks_${rndsuffix}:
1202    cmp 	 \$0x0,$length
1203    je 	 .L_ret_${rndsuffix}
1204    cmp 	 \$0x70,$length
1205    jge 	 .L_remaining_num_blocks_is_7_${rndsuffix}
1206    cmp 	 \$0x60,$length
1207    jge 	 .L_remaining_num_blocks_is_6_${rndsuffix}
1208    cmp 	 \$0x50,$length
1209    jge 	 .L_remaining_num_blocks_is_5_${rndsuffix}
1210    cmp 	 \$0x40,$length
1211    jge 	 .L_remaining_num_blocks_is_4_${rndsuffix}
1212    cmp 	 \$0x30,$length
1213    jge 	 .L_remaining_num_blocks_is_3_${rndsuffix}
1214    cmp 	 \$0x20,$length
1215    jge 	 .L_remaining_num_blocks_is_2_${rndsuffix}
1216    cmp 	 \$0x10,$length
1217    jge 	 .L_remaining_num_blocks_is_1_${rndsuffix}
1218    vmovdqa 	 %xmm0,%xmm8
1219    vmovdqa 	 %xmm9,%xmm0
1220    jmp 	 .L_steal_cipher_${rndsuffix}
1221
1222    .L_remaining_num_blocks_is_7_${rndsuffix}:
1223    mov 	 \$0x0000ffffffffffff,$tmp1
1224    kmovq 	 $tmp1,%k1
1225    vmovdqu8 	 ($input),%zmm1
1226    vmovdqu8 	 0x40($input),%zmm2{%k1}
1227    add 	 \$0x70,$input
1228___
1229    }
1230
1231    encrypt_by_eight_zmm("%zmm1", "%zmm2", "%zmm9", "%zmm10", "%zmm0", 1, $is_128);
1232
1233    {
1234    $code .= <<___;
1235    vmovdqu8 	 %zmm1,($output)
1236    vmovdqu8 	 %zmm2,0x40($output){%k1}
1237    add 	 \$0x70,$output
1238    vextracti32x4 	 \$0x2,%zmm2,%xmm8
1239    vextracti32x4 	 \$0x3,%zmm10,%xmm0
1240    and 	 \$0xf,$length
1241    je 	 .L_ret_${rndsuffix}
1242    jmp 	 .L_steal_cipher_${rndsuffix}
1243
1244    .L_remaining_num_blocks_is_6_${rndsuffix}:
1245    vmovdqu8 	 ($input),%zmm1
1246    vmovdqu8 	 0x40($input),%ymm2
1247    add 	 \$0x60,$input
1248___
1249    }
1250
1251    encrypt_by_eight_zmm("%zmm1", "%zmm2", "%zmm9", "%zmm10", "%zmm0", 1, $is_128);
1252
1253    {
1254    $code .= <<___;
1255    vmovdqu8 	 %zmm1,($output)
1256    vmovdqu8 	 %ymm2,0x40($output)
1257    add 	 \$0x60,$output
1258    vextracti32x4 	 \$0x1,%zmm2,%xmm8
1259    vextracti32x4 	 \$0x2,%zmm10,%xmm0
1260    and 	 \$0xf,$length
1261    je 	 .L_ret_${rndsuffix}
1262    jmp 	 .L_steal_cipher_${rndsuffix}
1263
1264    .L_remaining_num_blocks_is_5_${rndsuffix}:
1265    vmovdqu8 	 ($input),%zmm1
1266    vmovdqu 	 0x40($input),%xmm2
1267    add 	 \$0x50,$input
1268___
1269    }
1270
1271    encrypt_by_eight_zmm("%zmm1", "%zmm2", "%zmm9", "%zmm10", "%zmm0", 1, $is_128);
1272
1273    {
1274    $code .= <<___;
1275    vmovdqu8 	 %zmm1,($output)
1276    vmovdqu 	 %xmm2,0x40($output)
1277    add 	 \$0x50,$output
1278    vmovdqa 	 %xmm2,%xmm8
1279    vextracti32x4 	 \$0x1,%zmm10,%xmm0
1280    and 	 \$0xf,$length
1281    je 	 .L_ret_${rndsuffix}
1282    jmp 	 .L_steal_cipher_${rndsuffix}
1283
1284    .L_remaining_num_blocks_is_4_${rndsuffix}:
1285    vmovdqu8 	 ($input),%zmm1
1286    add 	 \$0x40,$input
1287___
1288    }
1289
1290    encrypt_by_four("%zmm1", "%zmm9", "%zmm0", $is_128);
1291
1292    {
1293    $code .= <<___;
1294    vmovdqu8	%zmm1,($output)
1295    add	\$0x40,$output
1296    vextracti32x4	\$0x3,%zmm1,%xmm8
1297    vmovdqa64	%xmm10, %xmm0
1298    and	\$0xf,$length
1299    je	.L_ret_${rndsuffix}
1300    jmp	.L_steal_cipher_${rndsuffix}
1301___
1302    }
1303
1304    {
1305    $code .= <<___;
1306    .L_remaining_num_blocks_is_3_${rndsuffix}:
1307    mov	\$-1, $tmp1
1308    shr	\$0x10, $tmp1
1309    kmovq	$tmp1, %k1
1310    vmovdqu8	($input), %zmm1{%k1}
1311    add	\$0x30, $input
1312___
1313    }
1314
1315    encrypt_by_four("%zmm1", "%zmm9", "%zmm0", $is_128);
1316
1317    {
1318    $code .= <<___;
1319    vmovdqu8	%zmm1, ($output){%k1}
1320    add	\$0x30, $output
1321    vextracti32x4	\$0x2, %zmm1, %xmm8
1322    vextracti32x4	\$0x3, %zmm9, %xmm0
1323    and	\$0xf, $length
1324    je	.L_ret_${rndsuffix}
1325    jmp	.L_steal_cipher_${rndsuffix}
1326___
1327    }
1328
1329    {
1330    $code .= <<___;
1331    .L_remaining_num_blocks_is_2_${rndsuffix}:
1332    vmovdqu8	($input), %ymm1
1333    add	\$0x20, $input
1334___
1335    }
1336
1337    encrypt_by_four("%ymm1", "%ymm9", "%ymm0", $is_128);
1338
1339    {
1340    $code .= <<___;
1341    vmovdqu 	 %ymm1,($output)
1342    add 	 \$0x20,$output
1343    vextracti32x4	\$0x1, %zmm1, %xmm8
1344    vextracti32x4	\$0x2,%zmm9,%xmm0
1345    and 	 \$0xf,$length
1346    je 	 .L_ret_${rndsuffix}
1347    jmp 	 .L_steal_cipher_${rndsuffix}
1348___
1349    }
1350
1351    {
1352    $code .= <<___;
1353    .L_remaining_num_blocks_is_1_${rndsuffix}:
1354    vmovdqu 	 ($input),%xmm1
1355    add 	 \$0x10,$input
1356___
1357    }
1358
1359    encrypt_final("%xmm1", "%xmm9", $is_128);
1360
1361    {
1362    $code .= <<___;
1363    vmovdqu 	 %xmm1,($output)
1364    add 	 \$0x10,$output
1365    vmovdqa 	 %xmm1,%xmm8
1366    vextracti32x4 	 \$0x1,%zmm9,%xmm0
1367    and 	 \$0xf,$length
1368    je 	 .L_ret_${rndsuffix}
1369    jmp 	 .L_steal_cipher_${rndsuffix}
1370
1371
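    # Pre-compute the first 16 tweaks for the by-16 loop: zmm0 holds four
    # copies of the initial tweak, and the shufb/vpsllvq/vpsrlvq/vpclmulqdq
    # sequence turns them into the tweaks for blocks 0-3 (zmm9) and 4-7
    # (zmm10), i.e. T*x^0..T*x^7.  The byte-shift/reduce step below then
    # derives zmm11/zmm12 as those values multiplied by x^8, covering
    # blocks 8-15.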
1372    .L_start_by16_${rndsuffix}:
1373    vbroadcasti32x4 	 ($TW),%zmm0
1374    vbroadcasti32x4 shufb_15_7(%rip),%zmm8
1375    mov 	 \$0xaa,$tmp1
1376    kmovq 	 $tmp1,%k2
1377    vpshufb 	 %zmm8,%zmm0,%zmm1
1378    vpsllvq const_dq3210(%rip),%zmm0,%zmm4
1379    vpsrlvq const_dq5678(%rip),%zmm1,%zmm2
1380    vpclmulqdq 	 \$0x0,%zmm25,%zmm2,%zmm3
1381    vpxorq 	 %zmm2,%zmm4,%zmm4{%k2}
1382    vpxord 	 %zmm4,%zmm3,%zmm9
1383    vpsllvq const_dq7654(%rip),%zmm0,%zmm5
1384    vpsrlvq const_dq1234(%rip),%zmm1,%zmm6
1385    vpclmulqdq 	 \$0x0,%zmm25,%zmm6,%zmm7
1386    vpxorq 	 %zmm6,%zmm5,%zmm5{%k2}
1387    vpxord 	 %zmm5,%zmm7,%zmm10
1388    vpsrldq 	 \$0xf,%zmm9,%zmm13
1389    vpclmulqdq 	 \$0x0,%zmm25,%zmm13,%zmm14
1390    vpslldq 	 \$0x1,%zmm9,%zmm11
1391    vpxord 	 %zmm14,%zmm11,%zmm11
1392    vpsrldq 	 \$0xf,%zmm10,%zmm15
1393    vpclmulqdq 	 \$0x0,%zmm25,%zmm15,%zmm16
1394    vpslldq 	 \$0x1,%zmm10,%zmm12
1395    vpxord 	 %zmm16,%zmm12,%zmm12
1396
1397    .L_main_loop_run_16_${rndsuffix}:
1398    vmovdqu8 	 ($input),%zmm1
1399    vmovdqu8 	 0x40($input),%zmm2
1400    vmovdqu8 	 0x80($input),%zmm3
1401    vmovdqu8 	 0xc0($input),%zmm4
1402    add 	 \$0x100,$input
1403___
1404    }
1405
1406    encrypt_by_16_zmm("%zmm1", "%zmm2", "%zmm3", "%zmm4", "%zmm9",
1407                      "%zmm10", "%zmm11", "%zmm12", "%zmm0", 0, $is_128);
1408
1409    {
1410    $code .= <<___;
1411    vmovdqu8 	 %zmm1,($output)
1412    vmovdqu8 	 %zmm2,0x40($output)
1413    vmovdqu8 	 %zmm3,0x80($output)
1414    vmovdqu8 	 %zmm4,0xc0($output)
1415    add 	 \$0x100,$output
1416    sub 	 \$0x100,$length
1417    cmp 	 \$0x100,$length
1418    jae 	 .L_main_loop_run_16_${rndsuffix}
1419    cmp 	 \$0x80,$length
1420    jae 	 .L_main_loop_run_8_${rndsuffix}
1421    vextracti32x4 	 \$0x3,%zmm4,%xmm0
1422    jmp 	 .L_do_n_blocks_${rndsuffix}
1423
1424    .L_start_by8_${rndsuffix}:
1425    vbroadcasti32x4 	 ($TW),%zmm0
1426    vbroadcasti32x4 shufb_15_7(%rip),%zmm8
1427    mov 	 \$0xaa,$tmp1
1428    kmovq 	 $tmp1,%k2
1429    vpshufb 	 %zmm8,%zmm0,%zmm1
1430    vpsllvq const_dq3210(%rip),%zmm0,%zmm4
1431    vpsrlvq const_dq5678(%rip),%zmm1,%zmm2
1432    vpclmulqdq 	 \$0x0,%zmm25,%zmm2,%zmm3
1433    vpxorq 	 %zmm2,%zmm4,%zmm4{%k2}
1434    vpxord 	 %zmm4,%zmm3,%zmm9
1435    vpsllvq const_dq7654(%rip),%zmm0,%zmm5
1436    vpsrlvq const_dq1234(%rip),%zmm1,%zmm6
1437    vpclmulqdq 	 \$0x0,%zmm25,%zmm6,%zmm7
1438    vpxorq 	 %zmm6,%zmm5,%zmm5{%k2}
1439    vpxord 	 %zmm5,%zmm7,%zmm10
1440
1441    .L_main_loop_run_8_${rndsuffix}:
1442    vmovdqu8 	 ($input),%zmm1
1443    vmovdqu8 	 0x40($input),%zmm2
1444    add 	 \$0x80,$input
1445___
1446    }
1447
1448    encrypt_by_eight_zmm("%zmm1", "%zmm2", "%zmm9", "%zmm10", "%zmm0", 0, $is_128);
1449
1450    {
1451    $code .= <<___;
1452    vmovdqu8 	 %zmm1,($output)
1453    vmovdqu8 	 %zmm2,0x40($output)
1454    add 	 \$0x80,$output
1455    sub 	 \$0x80,$length
1456    cmp 	 \$0x80,$length
1457    jae 	 .L_main_loop_run_8_${rndsuffix}
1458    vextracti32x4 	 \$0x3,%zmm2,%xmm0
1459    jmp 	 .L_do_n_blocks_${rndsuffix}
1460
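    # Ciphertext stealing for a partial final block: xmm8 holds the last
    # full ciphertext block and xmm0 the tweak for the final encryption.
    # A vpshufb_shf_table lookup rotates that ciphertext so its leading
    # bytes become the short output block, the remaining input tail bytes
    # are blended in to rebuild a full 16-byte block, and that block is
    # encrypted with xmm0 and stored as the last full output block.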
1461    .L_steal_cipher_${rndsuffix}:
1462    vmovdqa	%xmm8,%xmm2
1463    lea	vpshufb_shf_table(%rip),$TEMPLOW
1464    vmovdqu	($TEMPLOW,$length,1),%xmm10
1465    vpshufb	%xmm10,%xmm8,%xmm8
1466    vmovdqu	-0x10($input,$length,1),%xmm3
1467    vmovdqu	%xmm8,-0x10($output,$length,1)
1468    lea	vpshufb_shf_table(%rip),$TEMPLOW
1469    add	\$16, $TEMPLOW
1470    sub	$length,$TEMPLOW
1471    vmovdqu	($TEMPLOW),%xmm10
1472    vpxor	mask1(%rip),%xmm10,%xmm10
1473    vpshufb	%xmm10,%xmm3,%xmm3
1474    vpblendvb	%xmm10,%xmm2,%xmm3,%xmm3
1475    vpxor	%xmm0,%xmm3,%xmm8
1476    vpxor	($key1),%xmm8,%xmm8
1477    vaesenc	0x10($key1),%xmm8,%xmm8
1478    vaesenc	0x20($key1),%xmm8,%xmm8
1479    vaesenc	0x30($key1),%xmm8,%xmm8
1480    vaesenc	0x40($key1),%xmm8,%xmm8
1481    vaesenc	0x50($key1),%xmm8,%xmm8
1482    vaesenc	0x60($key1),%xmm8,%xmm8
1483    vaesenc	0x70($key1),%xmm8,%xmm8
1484    vaesenc	0x80($key1),%xmm8,%xmm8
1485    vaesenc	0x90($key1),%xmm8,%xmm8
1486___
1487    if ($is_128) {
1488      $code .= "vaesenclast	0xa0($key1),%xmm8,%xmm8\n";
1489    } else {
1490      $code .= <<___
1491      vaesenc	0xa0($key1),%xmm8,%xmm8
1492      vaesenc	0xb0($key1),%xmm8,%xmm8
1493      vaesenc	0xc0($key1),%xmm8,%xmm8
1494      vaesenc	0xd0($key1),%xmm8,%xmm8
1495      vaesenclast	0xe0($key1),%xmm8,%xmm8
1496___
1497    }
1498    $code .= "vpxor	%xmm0,%xmm8,%xmm8\n";
1499    $code .= "vmovdqu	%xmm8,-0x10($output)\n";
1500    }
1501
1502    {
1503    $code .= <<___;
1504    .L_ret_${rndsuffix}:
1505    mov 	 $GP_STORAGE($TW),%rbx
1506    xor    $tmp1,$tmp1
1507    mov    $tmp1,$GP_STORAGE($TW)
1508    # Zero-out the whole of `%zmm0`.
1509    vpxorq %zmm0,%zmm0,%zmm0
1510___
1511    }
1512
1513    if ($win64) {
1514      $code .= <<___;
1515      mov $GP_STORAGE + 8*1($TW),%rdi
1516      mov $tmp1,$GP_STORAGE + 8*1($TW)
1517      mov $GP_STORAGE + 8*2($TW),%rsi
1518      mov $tmp1,$GP_STORAGE + 8*2($TW)
1519
1520      vmovdqa $XMM_STORAGE + 16 * 0($TW), %xmm6
1521      vmovdqa $XMM_STORAGE + 16 * 1($TW), %xmm7
1522      vmovdqa $XMM_STORAGE + 16 * 2($TW), %xmm8
1523      vmovdqa $XMM_STORAGE + 16 * 3($TW), %xmm9
1524
1525      # Zero the 64 bytes we just restored to the xmm registers.
1526      vmovdqa64 %zmm0,$XMM_STORAGE($TW)
1527
1528      vmovdqa $XMM_STORAGE + 16 * 4($TW), %xmm10
1529      vmovdqa $XMM_STORAGE + 16 * 5($TW), %xmm11
1530      vmovdqa $XMM_STORAGE + 16 * 6($TW), %xmm12
1531      vmovdqa $XMM_STORAGE + 16 * 7($TW), %xmm13
1532
1533      # And again.
1534      vmovdqa64 %zmm0,$XMM_STORAGE + 16 * 4($TW)
1535
1536      vmovdqa $XMM_STORAGE + 16 * 8($TW), %xmm14
1537      vmovdqa $XMM_STORAGE + 16 * 9($TW), %xmm15
1538
1539      # Last round is only 32 bytes (256-bits), so we use `%ymm` as the
1540      # source operand.
1541      vmovdqa %ymm0,$XMM_STORAGE + 16 * 8($TW)
1542___
1543    }
1544
1545    {
1546    $code .= <<___;
1547    mov %rbp,$TW
1548    pop %rbp
1549    vzeroupper
1550    ret
1551
1552    .L_less_than_128_bytes_${rndsuffix}:
1553    vpbroadcastq	$gf_poly_8b, $ZPOLY
1554    cmp 	 \$0x10,$length
1555    jb 	 .L_ret_${rndsuffix}
1556    vbroadcasti32x4	($TW), %zmm0
1557    vbroadcasti32x4	shufb_15_7(%rip), %zmm8
1558    movl    \$0xaa, %r8d
1559    kmovq	%r8, %k2
1560    mov	$length,$tmp1
1561    and	\$0x70,$tmp1
1562    cmp	\$0x60,$tmp1
1563    je	.L_num_blocks_is_6_${rndsuffix}
1564    cmp	\$0x50,$tmp1
1565    je	.L_num_blocks_is_5_${rndsuffix}
1566    cmp	\$0x40,$tmp1
1567    je	.L_num_blocks_is_4_${rndsuffix}
1568    cmp	\$0x30,$tmp1
1569    je	.L_num_blocks_is_3_${rndsuffix}
1570    cmp	\$0x20,$tmp1
1571    je	.L_num_blocks_is_2_${rndsuffix}
1572    cmp	\$0x10,$tmp1
1573    je	.L_num_blocks_is_1_${rndsuffix}
1574
1575    .L_num_blocks_is_7_${rndsuffix}:
1576    vpshufb	%zmm8, %zmm0, %zmm1
1577    vpsllvq	const_dq3210(%rip), %zmm0, %zmm4
1578    vpsrlvq	const_dq5678(%rip), %zmm1, %zmm2
1579    vpclmulqdq	\$0x00, $ZPOLY, %zmm2, %zmm3
1580    vpxorq	%zmm2, %zmm4, %zmm4{%k2}
1581    vpxord	%zmm4, %zmm3, %zmm9
1582    vpsllvq	const_dq7654(%rip), %zmm0, %zmm5
1583    vpsrlvq	const_dq1234(%rip), %zmm1, %zmm6
1584    vpclmulqdq	\$0x00, $ZPOLY, %zmm6, %zmm7
1585    vpxorq	%zmm6, %zmm5, %zmm5{%k2}
1586    vpxord	%zmm5, %zmm7, %zmm10
1587    mov	\$0x0000ffffffffffff, $tmp1
1588    kmovq	$tmp1, %k1
1589    vmovdqu8	16*0($input), %zmm1
1590    vmovdqu8	16*4($input), %zmm2{%k1}
1591
1592    add	\$0x70,$input
1593___
1594    }
1595
1596    encrypt_by_eight_zmm("%zmm1", "%zmm2", "%zmm9", "%zmm10", "%zmm0", 1, $is_128);
1597
1598    {
1599    $code .= <<___;
1600    vmovdqu8	%zmm1, 16*0($output)
1601    vmovdqu8	%zmm2, 16*4($output){%k1}
1602    add	\$0x70,$output
1603    vextracti32x4	\$0x2, %zmm2, %xmm8
1604    vextracti32x4	\$0x3, %zmm10, %xmm0
1605    and	\$0xf,$length
1606    je	.L_ret_${rndsuffix}
1607    jmp	.L_steal_cipher_${rndsuffix}
1608___
1609    }
1610
1611    {
1612    $code .= <<___;
1613    .L_num_blocks_is_6_${rndsuffix}:
1614    vpshufb	%zmm8, %zmm0, %zmm1
1615    vpsllvq	const_dq3210(%rip), %zmm0, %zmm4
1616    vpsrlvq	const_dq5678(%rip), %zmm1, %zmm2
1617    vpclmulqdq	\$0x00, $ZPOLY, %zmm2, %zmm3
1618    vpxorq	%zmm2, %zmm4, %zmm4{%k2}
1619    vpxord	%zmm4, %zmm3, %zmm9
1620    vpsllvq	const_dq7654(%rip), %zmm0, %zmm5
1621    vpsrlvq	const_dq1234(%rip), %zmm1, %zmm6
1622    vpclmulqdq	\$0x00, $ZPOLY, %zmm6, %zmm7
1623    vpxorq	%zmm6, %zmm5, %zmm5{%k2}
1624    vpxord	%zmm5, %zmm7, %zmm10
1625    vmovdqu8	16*0($input), %zmm1
1626    vmovdqu8	16*4($input), %ymm2
1627    add	\$96, $input
1628___
1629    }
1630
1631    encrypt_by_eight_zmm("%zmm1", "%zmm2", "%zmm9", "%zmm10", "%zmm0", 1, $is_128);
1632
1633    {
1634    $code .= <<___;
1635    vmovdqu8	%zmm1, 16*0($output)
1636    vmovdqu8	%ymm2, 16*4($output)
1637    add	\$96, $output
1638
1639    vextracti32x4	\$0x1, %ymm2, %xmm8
1640    vextracti32x4	\$0x2, %zmm10, %xmm0
1641    and	\$0xf,$length
1642    je	.L_ret_${rndsuffix}
1643    jmp	.L_steal_cipher_${rndsuffix}
1644___
1645    }
1646
1647    {
1648    $code .= <<___;
1649    .L_num_blocks_is_5_${rndsuffix}:
1650    vpshufb	%zmm8, %zmm0, %zmm1
1651    vpsllvq	const_dq3210(%rip), %zmm0, %zmm4
1652    vpsrlvq	const_dq5678(%rip), %zmm1, %zmm2
1653    vpclmulqdq	\$0x00, $ZPOLY, %zmm2, %zmm3
1654    vpxorq	%zmm2, %zmm4, %zmm4{%k2}
1655    vpxord	%zmm4, %zmm3, %zmm9
1656    vpsllvq	const_dq7654(%rip), %zmm0, %zmm5
1657    vpsrlvq	const_dq1234(%rip), %zmm1, %zmm6
1658    vpclmulqdq	\$0x00, $ZPOLY, %zmm6, %zmm7
1659    vpxorq	%zmm6, %zmm5, %zmm5{%k2}
1660    vpxord	%zmm5, %zmm7, %zmm10
1661    vmovdqu8	16*0($input), %zmm1
1662    vmovdqu8	16*4($input), %xmm2
1663    add	\$80, $input
1664___
1665    }
1666
1667    encrypt_by_eight_zmm("%zmm1", "%zmm2", "%zmm9", "%zmm10", "%zmm0", 1, $is_128);
1668
1669    {
1670    $code .= <<___;
1671    vmovdqu8	%zmm1, 16*0($output)
1672    vmovdqu8	%xmm2, 16*4($output)
1673    add	\$80, $output
1674
1675    vmovdqa	%xmm2, %xmm8
1676    vextracti32x4	\$0x1, %zmm10, %xmm0
1677    and	\$0xf,$length
1678    je	.L_ret_${rndsuffix}
1679    jmp	.L_steal_cipher_${rndsuffix}
1680___
1681    }
1682
1683    {
1684    $code .= <<___;
1685    .L_num_blocks_is_4_${rndsuffix}:
1686    vpshufb	%zmm8, %zmm0, %zmm1
1687    vpsllvq	const_dq3210(%rip), %zmm0, %zmm4
1688    vpsrlvq	const_dq5678(%rip), %zmm1, %zmm2
1689    vpclmulqdq	\$0x00, $ZPOLY, %zmm2, %zmm3
1690    vpxorq	%zmm2, %zmm4, %zmm4{%k2}
1691    vpxord	%zmm4, %zmm3, %zmm9
1692    vpsllvq	const_dq7654(%rip), %zmm0, %zmm5
1693    vpsrlvq	const_dq1234(%rip), %zmm1, %zmm6
1694    vpclmulqdq	\$0x00, $ZPOLY, %zmm6, %zmm7
1695    vpxorq	%zmm6, %zmm5, %zmm5{%k2}
1696    vpxord	%zmm5, %zmm7, %zmm10
1697    vmovdqu8	16*0($input), %zmm1
1698    add	\$64, $input
1699___
1700    }
1701
1702    encrypt_by_four("%zmm1", "%zmm9", "%zmm0", $is_128);
1703
1704    {
1705    $code .= <<___;
1706    vmovdqu8	%zmm1, 16*0($output)
1707    add	\$64, $output
1708    vextracti32x4	\$0x3, %zmm1, %xmm8
1709    vmovdqa	%xmm10, %xmm0
1710    and	\$0xf,$length
1711    je	.L_ret_${rndsuffix}
1712    jmp	.L_steal_cipher_${rndsuffix}
1713___
1714    }
1715
1716    {
1717    $code .= <<___;
1718    .L_num_blocks_is_3_${rndsuffix}:
1719    vpshufb	%zmm8, %zmm0, %zmm1
1720    vpsllvq	const_dq3210(%rip), %zmm0, %zmm4
1721    vpsrlvq	const_dq5678(%rip), %zmm1, %zmm2
1722    vpclmulqdq	\$0x00, $ZPOLY, %zmm2, %zmm3
1723    vpxorq	%zmm2, %zmm4, %zmm4{%k2}
1724    vpxord	%zmm4, %zmm3, %zmm9
1725    mov	\$0x0000ffffffffffff, $tmp1
1726    kmovq	$tmp1, %k1
1727    vmovdqu8	16*0($input), %zmm1{%k1}
1728    add	\$48, $input
1729___
1730    }
1731
1732    encrypt_by_four("%zmm1", "%zmm9", "%zmm0", $is_128);
1733
1734    {
1735    $code .= <<___;
1736    vmovdqu8	%zmm1, 16*0($output){%k1}
1737    add	\$48, $output
1738    vextracti32x4	\$2, %zmm1, %xmm8
1739    vextracti32x4	\$3, %zmm9, %xmm0
1740    and	\$0xf,$length
1741    je	.L_ret_${rndsuffix}
1742    jmp	.L_steal_cipher_${rndsuffix}
1743___
1744    }
1745
1746    {
1747    $code .= <<___;
1748    .L_num_blocks_is_2_${rndsuffix}:
1749    vpshufb	%zmm8, %zmm0, %zmm1
1750    vpsllvq	const_dq3210(%rip), %zmm0, %zmm4
1751    vpsrlvq	const_dq5678(%rip), %zmm1, %zmm2
1752    vpclmulqdq	\$0x00, $ZPOLY, %zmm2, %zmm3
1753    vpxorq	%zmm2, %zmm4, %zmm4{%k2}
1754    vpxord	%zmm4, %zmm3, %zmm9
1755
1756    vmovdqu8	16*0($input), %ymm1
1757    add	\$32, $input
1758___
1759    }
1760
1761    encrypt_by_four("%ymm1", "%ymm9", "%ymm0", $is_128);
1762
1763    {
1764    $code .= <<___;
1765    vmovdqu8	%ymm1, 16*0($output)
1766    add	\$32, $output
1767
1768    vextracti32x4	\$1, %ymm1, %xmm8
1769    vextracti32x4	\$2, %zmm9, %xmm0
1770    and	\$0xf,$length
1771    je	.L_ret_${rndsuffix}
1772    jmp	.L_steal_cipher_${rndsuffix}
1773___
1774    }
1775
1776    {
1777    $code .= <<___;
1778    .L_num_blocks_is_1_${rndsuffix}:
1779    vpshufb	%zmm8, %zmm0, %zmm1
1780    vpsllvq	const_dq3210(%rip), %zmm0, %zmm4
1781    vpsrlvq	const_dq5678(%rip), %zmm1, %zmm2
1782    vpclmulqdq	\$0x00, $ZPOLY, %zmm2, %zmm3
1783    vpxorq	%zmm2, %zmm4, %zmm4{%k2}
1784    vpxord	%zmm4, %zmm3, %zmm9
1785
1786    vmovdqu8	16*0($input), %xmm1
1787    add	\$16, $input
1788___
1789    }
1790
1791    encrypt_by_four("%ymm1", "%ymm9", "%ymm0", $is_128);
1792
1793    {
1794    $code .= <<___;
1795    vmovdqu8	%xmm1, 16*0($output)
1796    add	\$16, $output
1797
1798    vmovdqa	%xmm1, %xmm8
1799    vextracti32x4	\$1, %zmm9, %xmm0
1800    and	\$0xf,$length
1801    je	.L_ret_${rndsuffix}
1802    jmp	.L_steal_cipher_${rndsuffix}
1803    .cfi_endproc
1804___
1805    }
1806  }
1807
1808  # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
1809  # ;void aesni_xts_[128|256]_decrypt_avx512(
1810  # ;               const uint8_t *in,        // input data
1811  # ;               uint8_t *out,             // output data
1812  # ;               size_t length,            // sector size, in bytes
  # ;               const AES_KEY *key1,      // key used for "ECB" decryption
  # ;               const AES_KEY *key2,      // key used for tweaking
1815  # ;               const uint8_t iv[16])      // initial tweak value, 16 bytes
1816  # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
1817  sub dec {
1818    my $is_128 = $_[0];
1819    my $rndsuffix = &random_string();
1820
1821    if ($is_128) {
1822      $code.=<<___;
1823      .globl	aesni_xts_128_decrypt_avx512
1824      .hidden	aesni_xts_128_decrypt_avx512
1825      .type	aesni_xts_128_decrypt_avx512,\@function,6
1826      .align	32
1827      aesni_xts_128_decrypt_avx512:
1828      .cfi_startproc
1829      endbranch
1830___
1831    } else {
1832      $code.=<<___;
1833      .globl	aesni_xts_256_decrypt_avx512
1834      .hidden	aesni_xts_256_decrypt_avx512
1835      .type	aesni_xts_256_decrypt_avx512,\@function,6
1836      .align	32
1837      aesni_xts_256_decrypt_avx512:
1838      .cfi_startproc
1839      endbranch
1840___
1841    }
1842    $code .= "push 	 %rbp\n";
1843    $code .= "mov 	 $TW,%rbp\n";
1844    $code .= "sub 	 \$$VARIABLE_OFFSET,$TW\n";
1845    $code .= "and 	 \$0xffffffffffffffc0,$TW\n";
1846    $code .= "mov 	 %rbx,$GP_STORAGE($TW)\n";
1847
1848    if ($win64) {
1849      $code .= "mov 	 %rdi,$GP_STORAGE + 8*1($TW)\n";
1850      $code .= "mov 	 %rsi,$GP_STORAGE + 8*2($TW)\n";
1851      $code .= "vmovdqa      %xmm6, $XMM_STORAGE + 16*0($TW)\n";
1852      $code .= "vmovdqa      %xmm7, $XMM_STORAGE + 16*1($TW)\n";
1853      $code .= "vmovdqa      %xmm8, $XMM_STORAGE + 16*2($TW)\n";
1854      $code .= "vmovdqa      %xmm9, $XMM_STORAGE + 16*3($TW)\n";
1855      $code .= "vmovdqa      %xmm10, $XMM_STORAGE + 16*4($TW)\n";
1856      $code .= "vmovdqa      %xmm11, $XMM_STORAGE + 16*5($TW)\n";
1857      $code .= "vmovdqa      %xmm12, $XMM_STORAGE + 16*6($TW)\n";
1858      $code .= "vmovdqa      %xmm13, $XMM_STORAGE + 16*7($TW)\n";
1859      $code .= "vmovdqa      %xmm14, $XMM_STORAGE + 16*8($TW)\n";
1860      $code .= "vmovdqa      %xmm15, $XMM_STORAGE + 16*9($TW)\n";
1861    }
1862
1863    $code .= "mov 	 \$0x87, $gf_poly_8b\n";
1864    $code .= "vmovdqu 	 ($tweak),%xmm1\n";      # read the initial tweak value
1865
1866    encrypt_tweak("%xmm1", $is_128);
1867
1868    if ($win64) {
1869      $code .= "mov	 $input, 8 + 8*5(%rbp)\n"; # ciphertext pointer
1870      $code .= "mov        $output, 8 + 8*6(%rbp)\n"; # plaintext pointer
1871    }
1872
1873    {
1874    $code.=<<___;
1875
1876    cmp 	 \$0x80,$length
1877    jb 	 .L_less_than_128_bytes_${rndsuffix}
1878    vpbroadcastq 	 $gf_poly_8b,$ZPOLY
1879    cmp 	 \$0x100,$length
1880    jge 	 .L_start_by16_${rndsuffix}
1881    jmp 	 .L_start_by8_${rndsuffix}
1882
1883    .L_do_n_blocks_${rndsuffix}:
1884    cmp 	 \$0x0,$length
1885    je 	 .L_ret_${rndsuffix}
1886    cmp 	 \$0x70,$length
1887    jge 	 .L_remaining_num_blocks_is_7_${rndsuffix}
1888    cmp 	 \$0x60,$length
1889    jge 	 .L_remaining_num_blocks_is_6_${rndsuffix}
1890    cmp 	 \$0x50,$length
1891    jge 	 .L_remaining_num_blocks_is_5_${rndsuffix}
1892    cmp 	 \$0x40,$length
1893    jge 	 .L_remaining_num_blocks_is_4_${rndsuffix}
1894    cmp 	 \$0x30,$length
1895    jge 	 .L_remaining_num_blocks_is_3_${rndsuffix}
1896    cmp 	 \$0x20,$length
1897    jge 	 .L_remaining_num_blocks_is_2_${rndsuffix}
1898    cmp 	 \$0x10,$length
1899    jge 	 .L_remaining_num_blocks_is_1_${rndsuffix}
1900
1901    # _remaining_num_blocks_is_0:
1902    vmovdqu		%xmm5, %xmm1
1903    # xmm5 contains the last full ciphertext block, to be decrypted with the next tweak
1904___
1905    }
1906    decrypt_initial("%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6",
1907                    "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12",
1908                    "%xmm13", "%xmm14", "%xmm15", "%xmm0", 1, 1, $is_128);
1909
1910    {
1911    $code .= <<___;
1912    vmovdqu %xmm1, -0x10($output)
1913    vmovdqa %xmm1, %xmm8
1914
1915    # Compute the previous tweak (divide the current tweak by alpha) for the cipher-stealing step
1916    mov		\$0x1,$tmp1
1917    kmovq		$tmp1, %k1
1918    vpsllq	\$0x3f,%xmm9,%xmm13
1919    vpsraq	\$0x3f,%xmm13,%xmm14
1920    vpandq	%xmm25,%xmm14,%xmm5
1921    vpxorq        %xmm5,%xmm9,%xmm9{%k1}
1922    vpsrldq       \$0x8,%xmm9,%xmm10
1923    .byte 98, 211, 181, 8, 115, 194, 1 #vpshrdq \$0x1,%xmm10,%xmm9,%xmm0
1924    vpslldq       \$0x8,%xmm13,%xmm13
1925    vpxorq        %xmm13,%xmm0,%xmm0
1926    jmp           .L_steal_cipher_${rndsuffix}
1927
1928    .L_remaining_num_blocks_is_7_${rndsuffix}:
1929    mov 	 \$0xffffffffffffffff,$tmp1
1930    shr 	 \$0x10,$tmp1
1931    kmovq 	 $tmp1,%k1
1932    vmovdqu8 	 ($input),%zmm1
1933    vmovdqu8 	 0x40($input),%zmm2{%k1}
1934    add 	         \$0x70,$input
1935    and            \$0xf,$length
1936    je             .L_done_7_remain_${rndsuffix}
1937    vextracti32x4   \$0x2,%zmm10,%xmm12
1938    vextracti32x4   \$0x3,%zmm10,%xmm13
1939    vinserti32x4    \$0x2,%xmm13,%zmm10,%zmm10
1940___
1941    }
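    # The "previous tweak" sequence above steps the XTS tweak chain one block
    # backwards (a divide by alpha in GF(2^128) with the 0x87 reduction
    # polynomial), because the cipher-stealing tail needs the tweak that
    # precedes the one just used.  The helper below is only an illustrative
    # reference model of that arithmetic, assuming a 64-bit perl; it is never
    # called by the generator and does not mirror the exact register sequence.
    sub ref_xts_prev_tweak {
      my ($lo, $hi) = @_;                 # little-endian 64-bit halves of the tweak
      my $mask = 0xffffffffffffffff;
      my $reduced = $lo & 1;              # was 0x87 folded in when stepping forward?
      $lo ^= 0x87 if $reduced;            # undo the reduction first
      $lo = (($lo >> 1) | (($hi & 1) << 63)) & $mask;   # 128-bit shift right by one
      $hi = (($hi >> 1) | ($reduced << 63)) & $mask;    # restore the bit that carried out
      return ($lo, $hi);
    }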
1942
1943    decrypt_by_eight_zmm("%zmm1", "%zmm2", "%zmm9", "%zmm10", "%zmm0", 1, $is_128);
1944
1945    {
1946    $code .= <<___;
1947    vmovdqu8 	 %zmm1, ($output)
1948    vmovdqu8 	 %zmm2, 0x40($output){%k1}
1949    add 	         \$0x70, $output
1950    vextracti32x4  \$0x2,%zmm2,%xmm8
1951    vmovdqa        %xmm12,%xmm0
1952    jmp            .L_steal_cipher_${rndsuffix}
1953___
1954    }
1955
1956    $code .= "\n.L_done_7_remain_${rndsuffix}:\n";
1957    decrypt_by_eight_zmm("%zmm1", "%zmm2", "%zmm9", "%zmm10", "%zmm0", 1, $is_128);
1958
1959    {
1960    $code .= <<___;
1961    vmovdqu8        %zmm1, ($output)
1962    vmovdqu8        %zmm2, 0x40($output){%k1}
1963    jmp     .L_ret_${rndsuffix}
1964
1965    .L_remaining_num_blocks_is_6_${rndsuffix}:
1966    vmovdqu8 	 ($input),%zmm1
1967    vmovdqu8 	 0x40($input),%ymm2
1968    add 	         \$0x60,$input
1969    and            \$0xf, $length
1970    je             .L_done_6_remain_${rndsuffix}
1971    vextracti32x4   \$0x1,%zmm10,%xmm12
1972    vextracti32x4   \$0x2,%zmm10,%xmm13
1973    vinserti32x4    \$0x1,%xmm13,%zmm10,%zmm10
1974___
1975    }
1976
1977    decrypt_by_eight_zmm("%zmm1", "%zmm2", "%zmm9", "%zmm10", "%zmm0", 1, $is_128);
1978
1979    {
1980    $code .= <<___;
1981    vmovdqu8 	 %zmm1, ($output)
1982    vmovdqu8 	 %ymm2, 0x40($output)
1983    add 	         \$0x60,$output
1984    vextracti32x4  \$0x1,%zmm2,%xmm8
1985    vmovdqa        %xmm12,%xmm0
1986    jmp            .L_steal_cipher_${rndsuffix}
1987___
1988    }
1989
1990    $code .= "\n.L_done_6_remain_${rndsuffix}:\n";
1991    decrypt_by_eight_zmm("%zmm1", "%zmm2", "%zmm9", "%zmm10", "%zmm0", 1, $is_128);
1992
1993    {
1994    $code .= <<___;
1995    vmovdqu8        %zmm1, ($output)
1996    vmovdqu8        %ymm2,0x40($output)
1997    jmp             .L_ret_${rndsuffix}
1998
1999    .L_remaining_num_blocks_is_5_${rndsuffix}:
2000    vmovdqu8 	 ($input),%zmm1
2001    vmovdqu 	 0x40($input),%xmm2
2002    add 	         \$0x50,$input
2003    and            \$0xf,$length
2004    je             .L_done_5_remain_${rndsuffix}
2005    vmovdqa        %xmm10,%xmm12
2006    vextracti32x4  \$0x1,%zmm10,%xmm10
2007___
2008    }
2009
2010    decrypt_by_eight_zmm("%zmm1", "%zmm2", "%zmm9", "%zmm10", "%zmm0", 1, $is_128);
2011
2012    {
2013    $code .= <<___;
2014    vmovdqu8         %zmm1, ($output)
2015    vmovdqu          %xmm2, 0x40($output)
2016    add              \$0x50, $output
2017    vmovdqa          %xmm2,%xmm8
2018    vmovdqa          %xmm12,%xmm0
2019    jmp              .L_steal_cipher_${rndsuffix}
2020___
2021    }
2022
2023    $code .= "\n.L_done_5_remain_${rndsuffix}:\n";
2024    decrypt_by_eight_zmm("%zmm1", "%zmm2", "%zmm9", "%zmm10", "%zmm0", 1, $is_128);
2025
2026    {
2027    $code .= <<___;
2028    vmovdqu8        %zmm1, ($output)
2029    vmovdqu8        %xmm2, 0x40($output)
2030    jmp             .L_ret_${rndsuffix}
2031
2032    .L_remaining_num_blocks_is_4_${rndsuffix}:
2033    vmovdqu8 	 ($input),%zmm1
2034    add 	         \$0x40,$input
2035    and            \$0xf, $length
2036    je             .L_done_4_remain_${rndsuffix}
2037    vextracti32x4   \$0x3,%zmm9,%xmm12
2038    vinserti32x4    \$0x3,%xmm10,%zmm9,%zmm9
2039___
2040    }
2041
2042    decrypt_by_eight_zmm("%zmm1", "%zmm2", "%zmm9", "%zmm10", "%zmm0", 1, $is_128);
2043
2044    {
2045    $code .= <<___;
2046    vmovdqu8        %zmm1,($output)
2047    add             \$0x40,$output
2048    vextracti32x4   \$0x3,%zmm1,%xmm8
2049    vmovdqa         %xmm12,%xmm0
2050    jmp             .L_steal_cipher_${rndsuffix}
2051___
2052    }
2053
2054    $code .= "\n.L_done_4_remain_${rndsuffix}:\n";
2055    decrypt_by_eight_zmm("%zmm1", "%zmm2", "%zmm9", "%zmm10", "%zmm0", 1, $is_128);
2056
2057    {
2058    $code .= <<___;
2059    vmovdqu8        %zmm1, ($output)
2060    jmp             .L_ret_${rndsuffix}
2061
2062    .L_remaining_num_blocks_is_3_${rndsuffix}:
2063    vmovdqu         ($input),%xmm1
2064    vmovdqu         0x10($input),%xmm2
2065    vmovdqu         0x20($input),%xmm3
2066    add             \$0x30,$input
2067    and             \$0xf,$length
2068    je              .L_done_3_remain_${rndsuffix}
2069    vextracti32x4   \$0x2,%zmm9,%xmm13
2070    vextracti32x4   \$0x1,%zmm9,%xmm10
2071    vextracti32x4   \$0x3,%zmm9,%xmm11
2072___
2073    }
2074
2075    decrypt_initial("%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6",
2076                    "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12",
2077                    "%xmm13", "%xmm14", "%xmm15", "%xmm0", 3, 1, $is_128);
2078
2079    {
2080    $code .= <<___;
2081    vmovdqu 	 %xmm1,($output)
2082    vmovdqu 	 %xmm2,0x10($output)
2083    vmovdqu 	 %xmm3,0x20($output)
2084    add 	         \$0x30,$output
2085    vmovdqa 	 %xmm3,%xmm8
2086    vmovdqa        %xmm13,%xmm0
2087    jmp 	         .L_steal_cipher_${rndsuffix}
2088___
2089    }
2090    $code .= "\n.L_done_3_remain_${rndsuffix}:\n";
2091    $code .= "vextracti32x4   \$0x1,%zmm9,%xmm10\n";
2092    $code .= "vextracti32x4   \$0x2,%zmm9,%xmm11\n";
2093
2094    decrypt_initial("%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6",
2095                    "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12",
2096                    "%xmm13", "%xmm14", "%xmm15", "%xmm0", 3, 1, $is_128);
2097
2098    {
2099    $code .= <<___;
2100    vmovdqu %xmm1,($output)
2101    vmovdqu %xmm2,0x10($output)
2102    vmovdqu %xmm3,0x20($output)
2103    jmp     .L_ret_${rndsuffix}
2104
2105    .L_remaining_num_blocks_is_2_${rndsuffix}:
2106    vmovdqu         ($input),%xmm1
2107    vmovdqu         0x10($input),%xmm2
2108    add             \$0x20,$input
2109    and             \$0xf,$length
2110    je              .L_done_2_remain_${rndsuffix}
2111    vextracti32x4   \$0x2,%zmm9,%xmm10
2112    vextracti32x4   \$0x1,%zmm9,%xmm12
2113___
2114    }
2115
2116    decrypt_initial("%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6",
2117                    "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12",
2118                    "%xmm13", "%xmm14", "%xmm15", "%xmm0", 2, 1, $is_128);
2119
2120    {
2121    $code .= <<___;
2122    vmovdqu 	 %xmm1,($output)
2123    vmovdqu 	 %xmm2,0x10($output)
2124    add 	         \$0x20,$output
2125    vmovdqa 	 %xmm2,%xmm8
2126    vmovdqa 	 %xmm12,%xmm0
2127    jmp 	         .L_steal_cipher_${rndsuffix}
2128___
2129    }
2130    $code .= "\n.L_done_2_remain_${rndsuffix}:\n";
2131    $code .= "vextracti32x4   \$0x1,%zmm9,%xmm10\n";
2132
2133    decrypt_initial("%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6",
2134                    "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12",
2135                    "%xmm13", "%xmm14", "%xmm15", "%xmm0", 2, 1, $is_128);
2136
2137    {
2138    $code .= <<___;
2139    vmovdqu   %xmm1,($output)
2140    vmovdqu   %xmm2,0x10($output)
2141    jmp       .L_ret_${rndsuffix}
2142
2143    .L_remaining_num_blocks_is_1_${rndsuffix}:
2144    vmovdqu 	 ($input),%xmm1
2145    add 	         \$0x10,$input
2146    and            \$0xf,$length
2147    je             .L_done_1_remain_${rndsuffix}
2148    vextracti32x4  \$0x1,%zmm9,%xmm11
2149___
2150    }
2151
2152    decrypt_initial("%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6",
2153                    "%xmm7", "%xmm8", "%xmm11", "%xmm10", "%xmm9", "%xmm12",
2154                    "%xmm13", "%xmm14", "%xmm15", "%xmm0", 1, 1, $is_128);
2155    {
2156    $code .= <<___;
2157    vmovdqu 	 %xmm1,($output)
2158    add 	         \$0x10,$output
2159    vmovdqa 	 %xmm1,%xmm8
2160    vmovdqa 	 %xmm9,%xmm0
2161    jmp 	         .L_steal_cipher_${rndsuffix}
2162___
2163    }
2164
2165    $code .= "\n.L_done_1_remain_${rndsuffix}:\n";
2166
2167    decrypt_initial("%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6",
2168                    "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12",
2169                    "%xmm13", "%xmm14", "%xmm15", "%xmm0", 1, 1, $is_128);
2170
2171    {
2172    $code .= <<___;
2173    vmovdqu   %xmm1, ($output)
2174    jmp       .L_ret_${rndsuffix}
2175
2176    .L_start_by16_${rndsuffix}:
2177    vbroadcasti32x4 	 ($TW),%zmm0
2178    vbroadcasti32x4 shufb_15_7(%rip),%zmm8
2179    mov 	 \$0xaa,$tmp1
2180    kmovq 	 $tmp1,%k2
2181
2182    # Mult tweak by 2^{3, 2, 1, 0}
2183    vpshufb 	 %zmm8,%zmm0,%zmm1
2184    vpsllvq const_dq3210(%rip),%zmm0,%zmm4
2185    vpsrlvq const_dq5678(%rip),%zmm1,%zmm2
2186    vpclmulqdq 	 \$0x0,$ZPOLY,%zmm2,%zmm3
2187    vpxorq 	 %zmm2,%zmm4,%zmm4{%k2}
2188    vpxord 	 %zmm4,%zmm3,%zmm9
2189
2190    # Mult tweak by 2^{7, 6, 5, 4}
2191    vpsllvq const_dq7654(%rip),%zmm0,%zmm5
2192    vpsrlvq const_dq1234(%rip),%zmm1,%zmm6
2193    vpclmulqdq 	 \$0x0,%zmm25,%zmm6,%zmm7
2194    vpxorq 	 %zmm6,%zmm5,%zmm5{%k2}
2195    vpxord 	 %zmm5,%zmm7,%zmm10
2196
2197    # Make the next 8 tweak values by multiplying each by 2^8
2198    vpsrldq 	 \$0xf,%zmm9,%zmm13
2199    vpclmulqdq 	 \$0x0,%zmm25,%zmm13,%zmm14
2200    vpslldq 	 \$0x1,%zmm9,%zmm11
2201    vpxord 	 %zmm14,%zmm11,%zmm11
2202
2203    vpsrldq 	 \$0xf,%zmm10,%zmm15
2204    vpclmulqdq 	 \$0x0,%zmm25,%zmm15,%zmm16
2205    vpslldq 	 \$0x1,%zmm10,%zmm12
2206    vpxord 	 %zmm16,%zmm12,%zmm12
2207
2208    .L_main_loop_run_16_${rndsuffix}:
2209    vmovdqu8 	 ($input),%zmm1
2210    vmovdqu8 	 0x40($input),%zmm2
2211    vmovdqu8 	 0x80($input),%zmm3
2212    vmovdqu8 	 0xc0($input),%zmm4
2213    vmovdqu8 	 0xf0($input),%xmm5
2214    add 	 \$0x100,$input
2215___
2216    }
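    # The .L_start_by16 prologue above expands the single input tweak T into
    # the eight values T*2^0 .. T*2^7 (and then the next eight) using per-lane
    # variable shifts plus a carry-less multiply against the 0x87 polynomial.
    # A scalar sketch of the same expansion, assuming a 64-bit perl; it is
    # illustrative only and never called by the generator:
    sub ref_xts_tweak_powers {
      my ($lo, $hi, $n) = @_;             # T as little-endian halves, $n values wanted
      my $mask = 0xffffffffffffffff;
      my @powers = ([$lo, $hi]);          # T * 2^0
      for (1 .. $n - 1) {
        my $carry = ($hi >> 63) & 1;                        # bit shifted out of the top
        $hi = (($hi << 1) | ($lo >> 63)) & $mask;
        $lo = (($lo << 1) & $mask) ^ ($carry ? 0x87 : 0);   # reduce x^128 mod the polynomial
        push @powers, [$lo, $hi];
      }
      return @powers;                     # [T*2^0] .. [T*2^(n-1)]
    }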
2217
2218    decrypt_by_16_zmm("%zmm1", "%zmm2", "%zmm3", "%zmm4", "%zmm9",
2219                      "%zmm10", "%zmm11", "%zmm12", "%zmm0", 0, $is_128);
2220
2221    {
2222    $code .= <<___;
2223    vmovdqu8 	 %zmm1,($output)
2224    vmovdqu8 	 %zmm2,0x40($output)
2225    vmovdqu8 	 %zmm3,0x80($output)
2226    vmovdqu8 	 %zmm4,0xc0($output)
2227    add 	 \$0x100,$output
2228    sub 	 \$0x100,$length
2229    cmp 	 \$0x100,$length
2230    jge 	 .L_main_loop_run_16_${rndsuffix}
2231
2232    cmp 	 \$0x80,$length
2233    jge 	 .L_main_loop_run_8_${rndsuffix}
2234    jmp 	 .L_do_n_blocks_${rndsuffix}
2235
2236    .L_start_by8_${rndsuffix}:
2237    # Make the first 7 tweak values
2238    vbroadcasti32x4 	 ($TW),%zmm0
2239    vbroadcasti32x4 shufb_15_7(%rip),%zmm8
2240    mov 	 \$0xaa,$tmp1
2241    kmovq 	 $tmp1,%k2
2242
2243    # Mult tweak by 2^{3, 2, 1, 0}
2244    vpshufb 	 %zmm8,%zmm0,%zmm1
2245    vpsllvq const_dq3210(%rip),%zmm0,%zmm4
2246    vpsrlvq const_dq5678(%rip),%zmm1,%zmm2
2247    vpclmulqdq 	 \$0x0,%zmm25,%zmm2,%zmm3
2248    vpxorq 	 %zmm2,%zmm4,%zmm4{%k2}
2249    vpxord 	 %zmm4,%zmm3,%zmm9
2250
2251    # Mult tweak by 2^{7, 6, 5, 4}
2252    vpsllvq const_dq7654(%rip),%zmm0,%zmm5
2253    vpsrlvq const_dq1234(%rip),%zmm1,%zmm6
2254    vpclmulqdq 	 \$0x0,%zmm25,%zmm6,%zmm7
2255    vpxorq 	 %zmm6,%zmm5,%zmm5{%k2}
2256    vpxord 	 %zmm5,%zmm7,%zmm10
2257
2258    .L_main_loop_run_8_${rndsuffix}:
2259    vmovdqu8 	 ($input),%zmm1
2260    vmovdqu8 	 0x40($input),%zmm2
2261    vmovdqu8 	 0x70($input),%xmm5
2262    add 	         \$0x80,$input
2263___
2264    }
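    # For a given request the code above peels 256-byte chunks in
    # .L_main_loop_run_16, then at most one 128-byte chunk in
    # .L_main_loop_run_8, and leaves the rest to .L_do_n_blocks (inputs below
    # 128 bytes take the .L_less_than_128_bytes path instead, but split the
    # same way).  A reference breakdown of that schedule, illustrative only
    # and never called:
    sub ref_xts_schedule {
      my ($len) = @_;
      my $by16 = 0;
      while ($len >= 0x100) { $len -= 0x100; $by16++; }     # 16-block iterations
      my $by8 = $len >= 0x80 ? 1 : 0;                       # at most one 8-block pass
      $len -= 0x80 if $by8;
      my $tail_blocks = $len >> 4;                          # 0..7 leftover full blocks
      my $steal_bytes = $len & 0xf;                         # partial block -> cipher stealing
      return ($by16, $by8, $tail_blocks, $steal_bytes);
    }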
2265
2266
2267    decrypt_by_eight_zmm("%zmm1", "%zmm2", "%zmm9", "%zmm10", "%zmm0", 0, $is_128);
2268
2269    {
2270    $code .= <<___;
2271    vmovdqu8 	 %zmm1,($output)
2272    vmovdqu8 	 %zmm2,0x40($output)
2273    add 	 \$0x80,$output
2274    sub 	 \$0x80,$length
2275    cmp 	 \$0x80,$length
2276    jge 	 .L_main_loop_run_8_${rndsuffix}
2277    jmp 	 .L_do_n_blocks_${rndsuffix}
2278
2279    .L_steal_cipher_${rndsuffix}:
2280    # start cipher stealing: xmm8 = last decrypted block, xmm0 = tweak for the stolen block
2281    vmovdqa 	 %xmm8,%xmm2
2282
2283    # shift xmm8 to the left by 16-N_val bytes
2284    lea vpshufb_shf_table(%rip),$TEMPLOW
2285    vmovdqu 	 ($TEMPLOW,$length,1),%xmm10
2286    vpshufb 	 %xmm10,%xmm8,%xmm8
2287
2288
2289    vmovdqu 	 -0x10($input,$length,1),%xmm3
2290    vmovdqu 	 %xmm8,-0x10($output,$length,1)
2291
2292    # shift xmm3 to the right by 16-N_val bytes
2293    lea vpshufb_shf_table(%rip), $TEMPLOW
2294    add \$16, $TEMPLOW
2295    sub 	 $length,$TEMPLOW
2296    vmovdqu 	 ($TEMPLOW),%xmm10
2297    vpxor mask1(%rip),%xmm10,%xmm10
2298    vpshufb 	 %xmm10,%xmm3,%xmm3
2299
2300    vpblendvb 	 %xmm10,%xmm2,%xmm3,%xmm3
2301
2302    # xor Tweak value
2303    vpxor 	 %xmm0,%xmm3,%xmm8
2304
2305    # decrypt last block with cipher stealing
2306    vpxor	($key1),%xmm8,%xmm8
2307    vaesdec	0x10($key1),%xmm8,%xmm8
2308    vaesdec	0x20($key1),%xmm8,%xmm8
2309    vaesdec	0x30($key1),%xmm8,%xmm8
2310    vaesdec	0x40($key1),%xmm8,%xmm8
2311    vaesdec	0x50($key1),%xmm8,%xmm8
2312    vaesdec	0x60($key1),%xmm8,%xmm8
2313    vaesdec	0x70($key1),%xmm8,%xmm8
2314    vaesdec	0x80($key1),%xmm8,%xmm8
2315    vaesdec	0x90($key1),%xmm8,%xmm8
2316___
2317    if ($is_128) {
2318      $code .= "vaesdeclast	0xa0($key1),%xmm8,%xmm8\n";
2319    } else {
2320      $code .= <<___;
2321      vaesdec	0xa0($key1),%xmm8,%xmm8
2322      vaesdec	0xb0($key1),%xmm8,%xmm8
2323      vaesdec	0xc0($key1),%xmm8,%xmm8
2324      vaesdec	0xd0($key1),%xmm8,%xmm8
2325      vaesdeclast	0xe0($key1),%xmm8,%xmm8
2326___
2327    }
2328    $code .= <<___;
2329    # xor Tweak value
2330    vpxor 	 %xmm0,%xmm8,%xmm8
2331
2332    .L_done_${rndsuffix}:
2333    # store the last output block
2334    vmovdqu 	 %xmm8,-0x10($output)
2335___
2336    }
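    # Byte-level view of the .L_steal_cipher path above: the first N bytes of
    # the block already decrypted with the later tweak become the short output
    # tail, and the N trailing ciphertext bytes are spliced onto its remaining
    # 16-N bytes before that block is decrypted once more with the tweak held
    # in xmm0.  A minimal sketch using substr instead of the shuffle-table
    # masks; illustrative only and never called:
    sub ref_xts_steal_splice {
      my ($p_last, $c_tail) = @_;          # $p_last: 16-byte decrypted block
      my $n = length $c_tail;              # $c_tail: 1..15 trailing ciphertext bytes
      my $out_tail = substr($p_last, 0, $n);           # short plaintext tail
      my $stolen   = $c_tail . substr($p_last, $n);    # block sent through AES again
      return ($out_tail, $stolen);
    }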
2337
2338    {
2339    $code .= <<___;
2340    .L_ret_${rndsuffix}:
2341    mov 	 $GP_STORAGE($TW),%rbx
2342    xor    $tmp1,$tmp1
2343    mov    $tmp1,$GP_STORAGE($TW)
2344    # Zero out all of `%zmm0`.
2345    vpxorq %zmm0,%zmm0,%zmm0
2346___
2347    }
2348
2349    if ($win64) {
2350      $code .= <<___;
2351      mov $GP_STORAGE + 8*1($TW),%rdi
2352      mov $tmp1,$GP_STORAGE + 8*1($TW)
2353      mov $GP_STORAGE + 8*2($TW),%rsi
2354      mov $tmp1,$GP_STORAGE + 8*2($TW)
2355
2356      vmovdqa $XMM_STORAGE + 16 * 0($TW), %xmm6
2357      vmovdqa $XMM_STORAGE + 16 * 1($TW), %xmm7
2358      vmovdqa $XMM_STORAGE + 16 * 2($TW), %xmm8
2359      vmovdqa $XMM_STORAGE + 16 * 3($TW), %xmm9
2360
2361      # Zero the 64 bytes we just restored to the xmm registers.
2362      vmovdqa64 %zmm0,$XMM_STORAGE($TW)
2363
2364      vmovdqa $XMM_STORAGE + 16 * 4($TW), %xmm10
2365      vmovdqa $XMM_STORAGE + 16 * 5($TW), %xmm11
2366      vmovdqa $XMM_STORAGE + 16 * 6($TW), %xmm12
2367      vmovdqa $XMM_STORAGE + 16 * 7($TW), %xmm13
2368
2369      # And again.
2370      vmovdqa64 %zmm0,$XMM_STORAGE + 16 * 4($TW)
2371
2372      vmovdqa $XMM_STORAGE + 16 * 8($TW), %xmm14
2373      vmovdqa $XMM_STORAGE + 16 * 9($TW), %xmm15
2374
2375      # Last round is only 32 bytes (256-bits), so we use `%ymm` as the
2376      # source operand.
2377      vmovdqa %ymm0,$XMM_STORAGE + 16 * 8($TW)
2378___
2379    }
2380
2381    {
2382    $code .= <<___;
2383    mov %rbp,$TW
2384    pop %rbp
2385    vzeroupper
2386    ret
2387
2388    .L_less_than_128_bytes_${rndsuffix}:
2389    cmp 	 \$0x10,$length
2390    jb 	 .L_ret_${rndsuffix}
2391
2392    mov 	 $length,$tmp1
2393    and 	 \$0x70,$tmp1
2394    cmp 	 \$0x60,$tmp1
2395    je 	 .L_num_blocks_is_6_${rndsuffix}
2396    cmp 	 \$0x50,$tmp1
2397    je 	 .L_num_blocks_is_5_${rndsuffix}
2398    cmp 	 \$0x40,$tmp1
2399    je 	 .L_num_blocks_is_4_${rndsuffix}
2400    cmp 	 \$0x30,$tmp1
2401    je 	 .L_num_blocks_is_3_${rndsuffix}
2402    cmp 	 \$0x20,$tmp1
2403    je 	 .L_num_blocks_is_2_${rndsuffix}
2404    cmp 	 \$0x10,$tmp1
2405    je 	 .L_num_blocks_is_1_${rndsuffix}
2406___
2407    }
2408
2409    $code .= "\n.L_num_blocks_is_7_${rndsuffix}:\n";
2410    initialize("%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6",
2411               "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12",
2412               "%xmm13", "%xmm14", "%xmm15", 7);
2413
2414    {
2415    $code .= <<___;
2416    add    \$0x70,$input
2417    and    \$0xf,$length
2418    je      .L_done_7_${rndsuffix}
2419
2420    .L_steal_cipher_7_${rndsuffix}:
2421     xor         $gf_poly_8b_temp, $gf_poly_8b_temp
2422     shl         \$1, $TEMPLOW
2423     adc         $TEMPHIGH, $TEMPHIGH
2424     cmovc       $gf_poly_8b, $gf_poly_8b_temp
2425     xor         $gf_poly_8b_temp, $TEMPLOW
2426     mov         $TEMPLOW,0x10($TW)
2427     mov         $TEMPHIGH,0x18($TW)
2428     vmovdqa64   %xmm15,%xmm16
2429     vmovdqa     0x10($TW),%xmm15
2430___
2431    }
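    # The shl/adc/cmovc/xor run above is the scalar (GPR) form of one tweak
    # step: double the 128-bit value held in $TEMPLOW:$TEMPHIGH and, when a
    # bit carries out of the top, fold in the 0x87 polynomial.  A reference
    # form of that single step, assuming a 64-bit perl; illustrative only and
    # never called:
    sub ref_xts_next_tweak {
      my ($lo, $hi) = @_;
      my $mask  = 0xffffffffffffffff;
      my $carry = ($hi >> 63) & 1;                       # carry produced by "adc hi,hi"
      $hi = (($hi << 1) | ($lo >> 63)) & $mask;          # adc also folds in the carry of "shl lo"
      $lo = (($lo << 1) & $mask) ^ ($carry ? 0x87 : 0);  # cmovc + xor apply the reduction
      return ($lo, $hi);
    }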
2432
2433    decrypt_initial("%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6",
2434                    "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12",
2435                    "%xmm13", "%xmm14", "%xmm15", "%xmm0", 7, 1, $is_128);
2436
2437    {
2438    $code .= <<___;
2439    vmovdqu 	 %xmm1,($output)
2440    vmovdqu 	 %xmm2,0x10($output)
2441    vmovdqu 	 %xmm3,0x20($output)
2442    vmovdqu 	 %xmm4,0x30($output)
2443    vmovdqu 	 %xmm5,0x40($output)
2444    vmovdqu 	 %xmm6,0x50($output)
2445    add 	         \$0x70,$output
2446    vmovdqa64 	 %xmm16,%xmm0
2447    vmovdqa 	 %xmm7,%xmm8
2448    jmp 	         .L_steal_cipher_${rndsuffix}
2449___
2450    }
2451
2452    $code .= "\n.L_done_7_${rndsuffix}:\n";
2453    decrypt_initial("%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6",
2454                    "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12",
2455                    "%xmm13", "%xmm14", "%xmm15", "%xmm0", 7, 1, $is_128);
2456
2457    {
2458    $code .= <<___;
2459    vmovdqu 	 %xmm1,($output)
2460    vmovdqu 	 %xmm2,0x10($output)
2461    vmovdqu 	 %xmm3,0x20($output)
2462    vmovdqu 	 %xmm4,0x30($output)
2463    vmovdqu 	 %xmm5,0x40($output)
2464    vmovdqu 	 %xmm6,0x50($output)
2465    add 	         \$0x70,$output
2466    vmovdqa 	 %xmm7,%xmm8
2467    jmp 	         .L_done_${rndsuffix}
2468___
2469    }
2470
2471    $code .= "\n.L_num_blocks_is_6_${rndsuffix}:\n";
2472    initialize("%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6",
2473               "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12",
2474               "%xmm13", "%xmm14", "%xmm15", 6);
2475
2476    {
2477    $code .= <<___;
2478    add    \$0x60,$input
2479    and    \$0xf,$length
2480    je      .L_done_6_${rndsuffix}
2481
2482    .L_steal_cipher_6_${rndsuffix}:
2483     xor         $gf_poly_8b_temp, $gf_poly_8b_temp
2484     shl         \$1, $TEMPLOW
2485     adc         $TEMPHIGH, $TEMPHIGH
2486     cmovc       $gf_poly_8b, $gf_poly_8b_temp
2487     xor         $gf_poly_8b_temp, $TEMPLOW
2488     mov         $TEMPLOW,0x10($TW)
2489     mov         $TEMPHIGH,0x18($TW)
2490     vmovdqa64   %xmm14,%xmm15
2491     vmovdqa     0x10($TW),%xmm14
2492___
2493    }
2494
2495    decrypt_initial("%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6",
2496                    "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12",
2497                    "%xmm13", "%xmm14", "%xmm15", "%xmm0", 6, 1, $is_128);
2498
2499    {
2500    $code .= <<___;
2501    vmovdqu 	 %xmm1,($output)
2502    vmovdqu 	 %xmm2,0x10($output)
2503    vmovdqu 	 %xmm3,0x20($output)
2504    vmovdqu 	 %xmm4,0x30($output)
2505    vmovdqu 	 %xmm5,0x40($output)
2506    add 	         \$0x60,$output
2507    vmovdqa 	 %xmm15,%xmm0
2508    vmovdqa 	 %xmm6,%xmm8
2509    jmp 	         .L_steal_cipher_${rndsuffix}
2510___
2511    }
2512    $code .= "\n.L_done_6_${rndsuffix}:\n";
2513    decrypt_initial("%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6",
2514                    "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12",
2515                    "%xmm13", "%xmm14", "%xmm15", "%xmm0", 6, 1, $is_128);
2516
2517    {
2518    $code .= <<___;
2519    vmovdqu 	 %xmm1,($output)
2520    vmovdqu 	 %xmm2,0x10($output)
2521    vmovdqu 	 %xmm3,0x20($output)
2522    vmovdqu 	 %xmm4,0x30($output)
2523    vmovdqu 	 %xmm5,0x40($output)
2524    add 	         \$0x60,$output
2525    vmovdqa 	 %xmm6,%xmm8
2526    jmp 	         .L_done_${rndsuffix}
2527___
2528    }
2529
2530    $code .= "\n.L_num_blocks_is_5_${rndsuffix}:\n";
2531    initialize("%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6",
2532               "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12",
2533               "%xmm13", "%xmm14", "%xmm15", 5);
2534
2535    {
2536    $code .= <<___;
2537    add    \$0x50,$input
2538    and    \$0xf,$length
2539    je      .L_done_5_${rndsuffix}
2540
2541    .L_steal_cipher_5_${rndsuffix}:
2542     xor         $gf_poly_8b_temp, $gf_poly_8b_temp
2543     shl         \$1, $TEMPLOW
2544     adc         $TEMPHIGH, $TEMPHIGH
2545     cmovc       $gf_poly_8b, $gf_poly_8b_temp
2546     xor         $gf_poly_8b_temp, $TEMPLOW
2547     mov         $TEMPLOW,0x10($TW)
2548     mov         $TEMPHIGH,0x18($TW)
2549     vmovdqa64   %xmm13,%xmm14
2550     vmovdqa     0x10($TW),%xmm13
2551___
2552    }
2553
2554    decrypt_initial("%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6",
2555                    "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12",
2556                    "%xmm13", "%xmm14", "%xmm15", "%xmm0", 5, 1, $is_128);
2557
2558    {
2559    $code .= <<___;
2560    vmovdqu 	 %xmm1,($output)
2561    vmovdqu 	 %xmm2,0x10($output)
2562    vmovdqu 	 %xmm3,0x20($output)
2563    vmovdqu 	 %xmm4,0x30($output)
2564    add 	         \$0x50,$output
2565    vmovdqa 	 %xmm14,%xmm0
2566    vmovdqa 	 %xmm5,%xmm8
2567    jmp 	         .L_steal_cipher_${rndsuffix}
2568___
2569    }
2570
2571    $code .= "\n.L_done_5_${rndsuffix}:\n";
2572    decrypt_initial("%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6",
2573                    "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12",
2574                    "%xmm13", "%xmm14", "%xmm15", "%xmm0", 5, 1, $is_128);
2575
2576    {
2577    $code .= <<___;
2578    vmovdqu 	 %xmm1,($output)
2579    vmovdqu 	 %xmm2,0x10($output)
2580    vmovdqu 	 %xmm3,0x20($output)
2581    vmovdqu 	 %xmm4,0x30($output)
2582    add 	         \$0x50,$output
2583    vmovdqa 	 %xmm5,%xmm8
2584    jmp 	         .L_done_${rndsuffix}
2585___
2586    }
2587
2588    $code .= "\n.L_num_blocks_is_4_${rndsuffix}:\n";
2589
2590    initialize("%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6",
2591               "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12",
2592               "%xmm13", "%xmm14", "%xmm15", 4);
2593
2594    {
2595    $code .= <<___;
2596    add    \$0x40,$input
2597    and    \$0xf,$length
2598    je      .L_done_4_${rndsuffix}
2599
2600    .L_steal_cipher_4_${rndsuffix}:
2601     xor         $gf_poly_8b_temp, $gf_poly_8b_temp
2602     shl         \$1, $TEMPLOW
2603     adc         $TEMPHIGH, $TEMPHIGH
2604     cmovc       $gf_poly_8b, $gf_poly_8b_temp
2605     xor         $gf_poly_8b_temp, $TEMPLOW
2606     mov         $TEMPLOW,0x10($TW)
2607     mov         $TEMPHIGH,0x18($TW)
2608     vmovdqa64   %xmm12,%xmm13
2609     vmovdqa     0x10($TW),%xmm12
2610___
2611    }
2612
2613    decrypt_initial("%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6",
2614                    "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12",
2615                    "%xmm13", "%xmm14", "%xmm15", "%xmm0", 4, 1, $is_128);
2616
2617    {
2618    $code .= <<___;
2619    vmovdqu 	 %xmm1,($output)
2620    vmovdqu 	 %xmm2,0x10($output)
2621    vmovdqu 	 %xmm3,0x20($output)
2622    add 	         \$0x40,$output
2623    vmovdqa 	 %xmm13,%xmm0
2624    vmovdqa 	 %xmm4,%xmm8
2625    jmp 	         .L_steal_cipher_${rndsuffix}
2626___
2627    }
2628
2629    $code .= "\n.L_done_4_${rndsuffix}:\n";
2630    decrypt_initial("%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6",
2631                    "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12",
2632                    "%xmm13", "%xmm14", "%xmm15", "%xmm0", 4, 1, $is_128);
2633
2634    {
2635    $code .= <<___;
2636    vmovdqu 	 %xmm1,($output)
2637    vmovdqu 	 %xmm2,0x10($output)
2638    vmovdqu 	 %xmm3,0x20($output)
2639    add 	         \$0x40,$output
2640    vmovdqa 	 %xmm4,%xmm8
2641    jmp 	         .L_done_${rndsuffix}
2642___
2643    }
2644
2645    $code .= "\n.L_num_blocks_is_3_${rndsuffix}:\n";
2646
2647    initialize("%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6",
2648               "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12",
2649               "%xmm13", "%xmm14", "%xmm15", 3);
2650
2651    {
2652    $code .= <<___;
2653    add    \$0x30,$input
2654    and    \$0xf,$length
2655    je      .L_done_3_${rndsuffix}
2656
2657    .L_steal_cipher_3_${rndsuffix}:
2658     xor         $gf_poly_8b_temp, $gf_poly_8b_temp
2659     shl         \$1, $TEMPLOW
2660     adc         $TEMPHIGH, $TEMPHIGH
2661     cmovc       $gf_poly_8b, $gf_poly_8b_temp
2662     xor         $gf_poly_8b_temp, $TEMPLOW
2663     mov         $TEMPLOW,0x10($TW)
2664     mov         $TEMPHIGH,0x18($TW)
2665     vmovdqa64   %xmm11,%xmm12
2666     vmovdqa     0x10($TW),%xmm11
2667___
2668    }
2669
2670    decrypt_initial("%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6",
2671                    "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12",
2672                    "%xmm13", "%xmm14", "%xmm15", "%xmm0", 3, 1, $is_128);
2673
2674    {
2675    $code .= <<___;
2676    vmovdqu 	 %xmm1,($output)
2677    vmovdqu 	 %xmm2,0x10($output)
2678    add 	         \$0x30,$output
2679    vmovdqa 	 %xmm12,%xmm0
2680    vmovdqa 	 %xmm3,%xmm8
2681    jmp 	         .L_steal_cipher_${rndsuffix}
2682___
2683    }
2684    $code .= "\n.L_done_3_${rndsuffix}:\n";
2685    decrypt_initial("%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6",
2686                    "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12",
2687                    "%xmm13", "%xmm14", "%xmm15", "%xmm0", 3, 1, $is_128);
2688
2689    {
2690    $code .= <<___;
2691    vmovdqu 	 %xmm1,($output)
2692    vmovdqu 	 %xmm2,0x10($output)
2693    add 	         \$0x30,$output
2694    vmovdqa 	 %xmm3,%xmm8
2695    jmp 	         .L_done_${rndsuffix}
2696___
2697    }
2698
2699    $code .= "\n.L_num_blocks_is_2_${rndsuffix}:\n";
2700
2701    initialize("%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6",
2702               "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12",
2703               "%xmm13", "%xmm14", "%xmm15", 2);
2704
2705    {
2706    $code .= <<___;
2707    add    \$0x20,$input
2708    and    \$0xf,$length
2709    je      .L_done_2_${rndsuffix}
2710
2711    .L_steal_cipher_2_${rndsuffix}:
2712     xor         $gf_poly_8b_temp, $gf_poly_8b_temp
2713     shl         \$1, $TEMPLOW
2714     adc         $TEMPHIGH, $TEMPHIGH
2715     cmovc       $gf_poly_8b, $gf_poly_8b_temp
2716     xor         $gf_poly_8b_temp, $TEMPLOW
2717     mov         $TEMPLOW,0x10($TW)
2718     mov         $TEMPHIGH,0x18($TW)
2719     vmovdqa64   %xmm10,%xmm11
2720     vmovdqa     0x10($TW),%xmm10
2721___
2722    }
2723
2724    decrypt_initial("%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6",
2725                    "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12",
2726                    "%xmm13", "%xmm14", "%xmm15", "%xmm0", 2, 1, $is_128);
2727
2728    {
2729    $code .= <<___;
2730    vmovdqu 	 %xmm1,($output)
2731    add 	         \$0x20,$output
2732    vmovdqa 	 %xmm11,%xmm0
2733    vmovdqa 	 %xmm2,%xmm8
2734    jmp 	         .L_steal_cipher_${rndsuffix}
2735___
2736    }
2737
2738    $code .= "\n.L_done_2_${rndsuffix}:\n";
2739    decrypt_initial("%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6",
2740                    "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12",
2741                    "%xmm13", "%xmm14", "%xmm15", "%xmm0", 2, 1, $is_128);
2742
2743    {
2744    $code .= <<___;
2745    vmovdqu 	 %xmm1,($output)
2746    add 	         \$0x20,$output
2747    vmovdqa 	 %xmm2,%xmm8
2748    jmp 	         .L_done_${rndsuffix}
2749___
2750    }
2751
2752    $code .= "\n.L_num_blocks_is_1_${rndsuffix}:\n";
2753
2754    initialize("%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6",
2755               "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12",
2756               "%xmm13", "%xmm14", "%xmm15", 1);
2757
2758    {
2759    $code .= <<___;
2760    add    \$0x10,$input
2761    and    \$0xf,$length
2762    je      .L_done_1_${rndsuffix}
2763
2764    .L_steal_cipher_1_${rndsuffix}:
2765     xor         $gf_poly_8b_temp, $gf_poly_8b_temp
2766     shl         \$1, $TEMPLOW
2767     adc         $TEMPHIGH, $TEMPHIGH
2768     cmovc       $gf_poly_8b, $gf_poly_8b_temp
2769     xor         $gf_poly_8b_temp, $TEMPLOW
2770     mov         $TEMPLOW,0x10($TW)
2771     mov         $TEMPHIGH,0x18($TW)
2772     vmovdqa64   %xmm9,%xmm10
2773     vmovdqa     0x10($TW),%xmm9
2774___
2775    }
2776    decrypt_initial("%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6",
2777                    "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12",
2778                    "%xmm13", "%xmm14", "%xmm15", "%xmm0", 1, 1, $is_128);
2779
2780    {
2781    $code .= <<___;
2782    add 	         \$0x10,$output
2783    vmovdqa 	 %xmm10,%xmm0
2784    vmovdqa 	 %xmm1,%xmm8
2785    jmp 	         .L_steal_cipher_${rndsuffix}
2786___
2787    }
2788    $code .= "\n.L_done_1_${rndsuffix}:\n";
2789    decrypt_initial("%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6",
2790                    "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12",
2791                    "%xmm13", "%xmm14", "%xmm15", "%xmm0", 1, 1, $is_128);
2792
2793    {
2794    $code .= <<___;
2795    add 	         \$0x10,$output
2796    vmovdqa 	 %xmm1,%xmm8
2797    jmp 	         .L_done_${rndsuffix}
2798    .cfi_endproc
2799___
2800    }
2801
2802  }
2803
2804  # The only difference between AES-XTS-128 and -256 is the number of rounds,
2805  # so we generate from the same perlasm base, extending to 14 rounds when
2806  # `$is_128' is 0.
2807
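  # Sketch of how the round count drives the key-schedule offsets seen in the
  # cipher-stealing tails above: 10 rounds (AES-128) end with vaesdeclast at
  # offset 0xa0, 14 rounds (AES-256) at 0xe0.  Hypothetical helper for
  # illustration only; the real code writes the rounds out inline.
  sub ref_aesdec_tail {
    my ($is_128, $key, $reg) = @_;
    my $rounds = $is_128 ? 10 : 14;
    my $asm = "vpxor\t($key),$reg,$reg\n";            # round 0: whitening xor
    for my $r (1 .. $rounds - 1) {
      $asm .= sprintf("vaesdec\t0x%x(%s),%s,%s\n", 16 * $r, $key, $reg, $reg);
    }
    $asm .= sprintf("vaesdeclast\t0x%x(%s),%s,%s\n", 16 * $rounds, $key, $reg, $reg);
    return $asm;
  }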
2808  enc(1);
2809  dec(1);
2810
2811  enc(0);
2812  dec(0);
2813
2814  $code .= <<___;
2815  .section .rodata
2816  .align 16
2817
2818  vpshufb_shf_table:
2819    .quad 0x8786858483828100, 0x8f8e8d8c8b8a8988
2820    .quad 0x0706050403020100, 0x000e0d0c0b0a0908
2821
2822  mask1:
2823    .quad 0x8080808080808080, 0x8080808080808080
2824
2825  const_dq3210:
2826    .quad 0, 0, 1, 1, 2, 2, 3, 3
2827  const_dq5678:
2828    .quad 8, 8, 7, 7, 6, 6, 5, 5
2829  const_dq7654:
2830    .quad 4, 4, 5, 5, 6, 6, 7, 7
2831  const_dq1234:
2832    .quad 4, 4, 3, 3, 2, 2, 1, 1
2833
2834  shufb_15_7:
2835    .byte  15, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 7, 0xff, 0xff
2836    .byte  0xff, 0xff, 0xff, 0xff, 0xff
2837
2838.text
2839___
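  # How the cipher-stealing code indexes vpshufb_shf_table above: the 16 bytes
  # at offset N form a pshufb control that shifts a block left by 16-N bytes,
  # and the bytes at offset 16-N, once xored with mask1 (0x80), give the
  # matching right shift.  A reference model of the pshufb step itself,
  # illustrative only and never called:
  sub ref_pshufb {
    my ($src, $ctrl) = @_;                # two 16-byte strings
    my $dst = '';
    for my $i (0 .. 15) {
      my $sel = ord(substr($ctrl, $i, 1));
      # A set top bit zeroes the lane; otherwise the low 4 bits select a source byte.
      $dst .= ($sel & 0x80) ? "\0" : substr($src, $sel & 0x0f, 1);
    }
    return $dst;
  }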
2840
2841} else {
2842    $code .= <<___;
2843    .text
2844    .globl  aesni_xts_128_encrypt_avx512
2845    .globl  aesni_xts_128_decrypt_avx512
2846
2847    aesni_xts_128_encrypt_avx512:
2848    aesni_xts_128_decrypt_avx512:
2849    .byte   0x0f,0x0b    # ud2
2850    ret
2851
2852    .globl  aesni_xts_256_encrypt_avx512
2853    .globl  aesni_xts_256_decrypt_avx512
2854
2855    aesni_xts_256_encrypt_avx512:
2856    aesni_xts_256_decrypt_avx512:
2857    .byte   0x0f,0x0b    # ud2
2858    ret
2859
2860    .globl  aesni_xts_avx512_eligible
2861    .type   aesni_xts_avx512_eligible,\@abi-omnipotent
2862    aesni_xts_avx512_eligible:
2863    xor	%eax,%eax
2864    ret
2865    .size   aesni_xts_avx512_eligible, .-aesni_xts_avx512_eligible
2866
2867___
2868}
2869
2870print $code;
2871
2872close STDOUT or die "error closing STDOUT: $!";
2873