1#! /usr/bin/env perl 2# Copyright 2011-2016 The OpenSSL Project Authors. All Rights Reserved. 3# 4# Licensed under the OpenSSL license (the "License"). You may not use 5# this file except in compliance with the License. You can obtain a copy 6# in the file LICENSE in the source distribution or at 7# https://www.openssl.org/source/license.html 8 9 10################################################################### 11### AES-128 [originally in CTR mode] ### 12### bitsliced implementation for Intel Core 2 processors ### 13### requires support of SSE extensions up to SSSE3 ### 14### Author: Emilia Käsper and Peter Schwabe ### 15### Date: 2009-03-19 ### 16### Public domain ### 17### ### 18### See http://homes.esat.kuleuven.be/~ekasper/#software for ### 19### further information. ### 20################################################################### 21# 22# September 2011. 23# 24# Started as transliteration to "perlasm" the original code has 25# undergone following changes: 26# 27# - code was made position-independent; 28# - rounds were folded into a loop resulting in >5x size reduction 29# from 12.5KB to 2.2KB; 30# - above was possibile thanks to mixcolumns() modification that 31# allowed to feed its output back to aesenc[last], this was 32# achieved at cost of two additional inter-registers moves; 33# - some instruction reordering and interleaving; 34# - this module doesn't implement key setup subroutine, instead it 35# relies on conversion of "conventional" key schedule as returned 36# by AES_set_encrypt_key (see discussion below); 37# - first and last round keys are treated differently, which allowed 38# to skip one shiftrows(), reduce bit-sliced key schedule and 39# speed-up conversion by 22%; 40# - support for 192- and 256-bit keys was added; 41# 42# Resulting performance in CPU cycles spent to encrypt one byte out 43# of 4096-byte buffer with 128-bit key is: 44# 45# Emilia's this(*) difference 46# 47# Core 2 9.30 8.69 +7% 48# Nehalem(**) 7.63 6.88 +11% 49# Atom 17.1 16.4 +4% 50# Silvermont - 12.9 51# Goldmont - 8.85 52# 53# (*) Comparison is not completely fair, because "this" is ECB, 54# i.e. no extra processing such as counter values calculation 55# and xor-ing input as in Emilia's CTR implementation is 56# performed. However, the CTR calculations stand for not more 57# than 1% of total time, so comparison is *rather* fair. 58# 59# (**) Results were collected on Westmere, which is considered to 60# be equivalent to Nehalem for this code. 61# 62# As for key schedule conversion subroutine. Interface to OpenSSL 63# relies on per-invocation on-the-fly conversion. This naturally 64# has impact on performance, especially for short inputs. Conversion 65# time in CPU cycles and its ratio to CPU cycles spent in 8x block 66# function is: 67# 68# conversion conversion/8x block 69# Core 2 240 0.22 70# Nehalem 180 0.20 71# Atom 430 0.20 72# 73# The ratio values mean that 128-byte blocks will be processed 74# 16-18% slower, 256-byte blocks - 9-10%, 384-byte blocks - 6-7%, 75# etc. Then keep in mind that input sizes not divisible by 128 are 76# *effectively* slower, especially shortest ones, e.g. consecutive 77# 144-byte blocks are processed 44% slower than one would expect, 78# 272 - 29%, 400 - 22%, etc. Yet, despite all these "shortcomings" 79# it's still faster than ["hyper-threading-safe" code path in] 80# aes-x86_64.pl on all lengths above 64 bytes... 81# 82# October 2011. 83# 84# Add decryption procedure. Performance in CPU cycles spent to decrypt 85# one byte out of 4096-byte buffer with 128-bit key is: 86# 87# Core 2 9.98 88# Nehalem 7.80 89# Atom 17.9 90# Silvermont 14.0 91# Goldmont 10.2 92# 93# November 2011. 94# 95# Add bsaes_xts_[en|de]crypt. Less-than-80-bytes-block performance is 96# suboptimal, but XTS is meant to be used with larger blocks... 97# 98# <appro@openssl.org> 99 100$flavour = shift; 101$output = shift; 102if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } 103 104$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/); 105 106$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; 107( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or 108( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or 109die "can't locate x86_64-xlate.pl"; 110 111open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""; 112*STDOUT=*OUT; 113 114my ($inp,$out,$len,$key,$ivp)=("%rdi","%rsi","%rdx","%rcx"); 115my @XMM=map("%xmm$_",(15,0..14)); # best on Atom, +10% over (0..15) 116my $ecb=0; # suppress unreferenced ECB subroutines, spare some space... 117 118{ 119my ($key,$rounds,$const)=("%rax","%r10d","%r11"); 120 121sub Sbox { 122# input in lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb 123# output in lsb > [b0, b1, b4, b6, b3, b7, b2, b5] < msb 124my @b=@_[0..7]; 125my @t=@_[8..11]; 126my @s=@_[12..15]; 127 &InBasisChange (@b); 128 &Inv_GF256 (@b[6,5,0,3,7,1,4,2],@t,@s); 129 &OutBasisChange (@b[7,1,4,2,6,5,0,3]); 130} 131 132sub InBasisChange { 133# input in lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb 134# output in lsb > [b6, b5, b0, b3, b7, b1, b4, b2] < msb 135my @b=@_[0..7]; 136$code.=<<___; 137 pxor @b[6], @b[5] 138 pxor @b[1], @b[2] 139 pxor @b[0], @b[3] 140 pxor @b[2], @b[6] 141 pxor @b[0], @b[5] 142 143 pxor @b[3], @b[6] 144 pxor @b[7], @b[3] 145 pxor @b[5], @b[7] 146 pxor @b[4], @b[3] 147 pxor @b[5], @b[4] 148 pxor @b[1], @b[3] 149 150 pxor @b[7], @b[2] 151 pxor @b[5], @b[1] 152___ 153} 154 155sub OutBasisChange { 156# input in lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb 157# output in lsb > [b6, b1, b2, b4, b7, b0, b3, b5] < msb 158my @b=@_[0..7]; 159$code.=<<___; 160 pxor @b[6], @b[0] 161 pxor @b[4], @b[1] 162 pxor @b[0], @b[2] 163 pxor @b[6], @b[4] 164 pxor @b[1], @b[6] 165 166 pxor @b[5], @b[1] 167 pxor @b[3], @b[5] 168 pxor @b[7], @b[3] 169 pxor @b[5], @b[7] 170 pxor @b[5], @b[2] 171 172 pxor @b[7], @b[4] 173___ 174} 175 176sub InvSbox { 177# input in lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb 178# output in lsb > [b0, b1, b6, b4, b2, b7, b3, b5] < msb 179my @b=@_[0..7]; 180my @t=@_[8..11]; 181my @s=@_[12..15]; 182 &InvInBasisChange (@b); 183 &Inv_GF256 (@b[5,1,2,6,3,7,0,4],@t,@s); 184 &InvOutBasisChange (@b[3,7,0,4,5,1,2,6]); 185} 186 187sub InvInBasisChange { # OutBasisChange in reverse 188my @b=@_[5,1,2,6,3,7,0,4]; 189$code.=<<___ 190 pxor @b[7], @b[4] 191 192 pxor @b[5], @b[7] 193 pxor @b[5], @b[2] 194 pxor @b[7], @b[3] 195 pxor @b[3], @b[5] 196 pxor @b[5], @b[1] 197 198 pxor @b[1], @b[6] 199 pxor @b[0], @b[2] 200 pxor @b[6], @b[4] 201 pxor @b[6], @b[0] 202 pxor @b[4], @b[1] 203___ 204} 205 206sub InvOutBasisChange { # InBasisChange in reverse 207my @b=@_[2,5,7,3,6,1,0,4]; 208$code.=<<___; 209 pxor @b[5], @b[1] 210 pxor @b[7], @b[2] 211 212 pxor @b[1], @b[3] 213 pxor @b[5], @b[4] 214 pxor @b[5], @b[7] 215 pxor @b[4], @b[3] 216 pxor @b[0], @b[5] 217 pxor @b[7], @b[3] 218 pxor @b[2], @b[6] 219 pxor @b[1], @b[2] 220 pxor @b[3], @b[6] 221 222 pxor @b[0], @b[3] 223 pxor @b[6], @b[5] 224___ 225} 226 227sub Mul_GF4 { 228#;************************************************************* 229#;* Mul_GF4: Input x0-x1,y0-y1 Output x0-x1 Temp t0 (8) * 230#;************************************************************* 231my ($x0,$x1,$y0,$y1,$t0)=@_; 232$code.=<<___; 233 movdqa $y0, $t0 234 pxor $y1, $t0 235 pand $x0, $t0 236 pxor $x1, $x0 237 pand $y0, $x1 238 pand $y1, $x0 239 pxor $x1, $x0 240 pxor $t0, $x1 241___ 242} 243 244sub Mul_GF4_N { # not used, see next subroutine 245# multiply and scale by N 246my ($x0,$x1,$y0,$y1,$t0)=@_; 247$code.=<<___; 248 movdqa $y0, $t0 249 pxor $y1, $t0 250 pand $x0, $t0 251 pxor $x1, $x0 252 pand $y0, $x1 253 pand $y1, $x0 254 pxor $x0, $x1 255 pxor $t0, $x0 256___ 257} 258 259sub Mul_GF4_N_GF4 { 260# interleaved Mul_GF4_N and Mul_GF4 261my ($x0,$x1,$y0,$y1,$t0, 262 $x2,$x3,$y2,$y3,$t1)=@_; 263$code.=<<___; 264 movdqa $y0, $t0 265 movdqa $y2, $t1 266 pxor $y1, $t0 267 pxor $y3, $t1 268 pand $x0, $t0 269 pand $x2, $t1 270 pxor $x1, $x0 271 pxor $x3, $x2 272 pand $y0, $x1 273 pand $y2, $x3 274 pand $y1, $x0 275 pand $y3, $x2 276 pxor $x0, $x1 277 pxor $x3, $x2 278 pxor $t0, $x0 279 pxor $t1, $x3 280___ 281} 282sub Mul_GF16_2 { 283my @x=@_[0..7]; 284my @y=@_[8..11]; 285my @t=@_[12..15]; 286$code.=<<___; 287 movdqa @x[0], @t[0] 288 movdqa @x[1], @t[1] 289___ 290 &Mul_GF4 (@x[0], @x[1], @y[0], @y[1], @t[2]); 291$code.=<<___; 292 pxor @x[2], @t[0] 293 pxor @x[3], @t[1] 294 pxor @y[2], @y[0] 295 pxor @y[3], @y[1] 296___ 297 Mul_GF4_N_GF4 (@t[0], @t[1], @y[0], @y[1], @t[3], 298 @x[2], @x[3], @y[2], @y[3], @t[2]); 299$code.=<<___; 300 pxor @t[0], @x[0] 301 pxor @t[0], @x[2] 302 pxor @t[1], @x[1] 303 pxor @t[1], @x[3] 304 305 movdqa @x[4], @t[0] 306 movdqa @x[5], @t[1] 307 pxor @x[6], @t[0] 308 pxor @x[7], @t[1] 309___ 310 &Mul_GF4_N_GF4 (@t[0], @t[1], @y[0], @y[1], @t[3], 311 @x[6], @x[7], @y[2], @y[3], @t[2]); 312$code.=<<___; 313 pxor @y[2], @y[0] 314 pxor @y[3], @y[1] 315___ 316 &Mul_GF4 (@x[4], @x[5], @y[0], @y[1], @t[3]); 317$code.=<<___; 318 pxor @t[0], @x[4] 319 pxor @t[0], @x[6] 320 pxor @t[1], @x[5] 321 pxor @t[1], @x[7] 322___ 323} 324sub Inv_GF256 { 325#;******************************************************************** 326#;* Inv_GF256: Input x0-x7 Output x0-x7 Temp t0-t3,s0-s3 (144) * 327#;******************************************************************** 328my @x=@_[0..7]; 329my @t=@_[8..11]; 330my @s=@_[12..15]; 331# direct optimizations from hardware 332$code.=<<___; 333 movdqa @x[4], @t[3] 334 movdqa @x[5], @t[2] 335 movdqa @x[1], @t[1] 336 movdqa @x[7], @s[1] 337 movdqa @x[0], @s[0] 338 339 pxor @x[6], @t[3] 340 pxor @x[7], @t[2] 341 pxor @x[3], @t[1] 342 movdqa @t[3], @s[2] 343 pxor @x[6], @s[1] 344 movdqa @t[2], @t[0] 345 pxor @x[2], @s[0] 346 movdqa @t[3], @s[3] 347 348 por @t[1], @t[2] 349 por @s[0], @t[3] 350 pxor @t[0], @s[3] 351 pand @s[0], @s[2] 352 pxor @t[1], @s[0] 353 pand @t[1], @t[0] 354 pand @s[0], @s[3] 355 movdqa @x[3], @s[0] 356 pxor @x[2], @s[0] 357 pand @s[0], @s[1] 358 pxor @s[1], @t[3] 359 pxor @s[1], @t[2] 360 movdqa @x[4], @s[1] 361 movdqa @x[1], @s[0] 362 pxor @x[5], @s[1] 363 pxor @x[0], @s[0] 364 movdqa @s[1], @t[1] 365 pand @s[0], @s[1] 366 por @s[0], @t[1] 367 pxor @s[1], @t[0] 368 pxor @s[3], @t[3] 369 pxor @s[2], @t[2] 370 pxor @s[3], @t[1] 371 movdqa @x[7], @s[0] 372 pxor @s[2], @t[0] 373 movdqa @x[6], @s[1] 374 pxor @s[2], @t[1] 375 movdqa @x[5], @s[2] 376 pand @x[3], @s[0] 377 movdqa @x[4], @s[3] 378 pand @x[2], @s[1] 379 pand @x[1], @s[2] 380 por @x[0], @s[3] 381 pxor @s[0], @t[3] 382 pxor @s[1], @t[2] 383 pxor @s[2], @t[1] 384 pxor @s[3], @t[0] 385 386 #Inv_GF16 \t0, \t1, \t2, \t3, \s0, \s1, \s2, \s3 387 388 # new smaller inversion 389 390 movdqa @t[3], @s[0] 391 pand @t[1], @t[3] 392 pxor @t[2], @s[0] 393 394 movdqa @t[0], @s[2] 395 movdqa @s[0], @s[3] 396 pxor @t[3], @s[2] 397 pand @s[2], @s[3] 398 399 movdqa @t[1], @s[1] 400 pxor @t[2], @s[3] 401 pxor @t[0], @s[1] 402 403 pxor @t[2], @t[3] 404 405 pand @t[3], @s[1] 406 407 movdqa @s[2], @t[2] 408 pxor @t[0], @s[1] 409 410 pxor @s[1], @t[2] 411 pxor @s[1], @t[1] 412 413 pand @t[0], @t[2] 414 415 pxor @t[2], @s[2] 416 pxor @t[2], @t[1] 417 418 pand @s[3], @s[2] 419 420 pxor @s[0], @s[2] 421___ 422# output in s3, s2, s1, t1 423 424# Mul_GF16_2 \x0, \x1, \x2, \x3, \x4, \x5, \x6, \x7, \t2, \t3, \t0, \t1, \s0, \s1, \s2, \s3 425 426# Mul_GF16_2 \x0, \x1, \x2, \x3, \x4, \x5, \x6, \x7, \s3, \s2, \s1, \t1, \s0, \t0, \t2, \t3 427 &Mul_GF16_2(@x,@s[3,2,1],@t[1],@s[0],@t[0,2,3]); 428 429### output msb > [x3,x2,x1,x0,x7,x6,x5,x4] < lsb 430} 431 432# AES linear components 433 434sub ShiftRows { 435my @x=@_[0..7]; 436my $mask=pop; 437$code.=<<___; 438 pxor 0x00($key),@x[0] 439 pxor 0x10($key),@x[1] 440 pxor 0x20($key),@x[2] 441 pxor 0x30($key),@x[3] 442 pshufb $mask,@x[0] 443 pshufb $mask,@x[1] 444 pxor 0x40($key),@x[4] 445 pxor 0x50($key),@x[5] 446 pshufb $mask,@x[2] 447 pshufb $mask,@x[3] 448 pxor 0x60($key),@x[6] 449 pxor 0x70($key),@x[7] 450 pshufb $mask,@x[4] 451 pshufb $mask,@x[5] 452 pshufb $mask,@x[6] 453 pshufb $mask,@x[7] 454 lea 0x80($key),$key 455___ 456} 457 458sub MixColumns { 459# modified to emit output in order suitable for feeding back to aesenc[last] 460my @x=@_[0..7]; 461my @t=@_[8..15]; 462my $inv=@_[16]; # optional 463$code.=<<___; 464 pshufd \$0x93, @x[0], @t[0] # x0 <<< 32 465 pshufd \$0x93, @x[1], @t[1] 466 pxor @t[0], @x[0] # x0 ^ (x0 <<< 32) 467 pshufd \$0x93, @x[2], @t[2] 468 pxor @t[1], @x[1] 469 pshufd \$0x93, @x[3], @t[3] 470 pxor @t[2], @x[2] 471 pshufd \$0x93, @x[4], @t[4] 472 pxor @t[3], @x[3] 473 pshufd \$0x93, @x[5], @t[5] 474 pxor @t[4], @x[4] 475 pshufd \$0x93, @x[6], @t[6] 476 pxor @t[5], @x[5] 477 pshufd \$0x93, @x[7], @t[7] 478 pxor @t[6], @x[6] 479 pxor @t[7], @x[7] 480 481 pxor @x[0], @t[1] 482 pxor @x[7], @t[0] 483 pxor @x[7], @t[1] 484 pshufd \$0x4E, @x[0], @x[0] # (x0 ^ (x0 <<< 32)) <<< 64) 485 pxor @x[1], @t[2] 486 pshufd \$0x4E, @x[1], @x[1] 487 pxor @x[4], @t[5] 488 pxor @t[0], @x[0] 489 pxor @x[5], @t[6] 490 pxor @t[1], @x[1] 491 pxor @x[3], @t[4] 492 pshufd \$0x4E, @x[4], @t[0] 493 pxor @x[6], @t[7] 494 pshufd \$0x4E, @x[5], @t[1] 495 pxor @x[2], @t[3] 496 pshufd \$0x4E, @x[3], @x[4] 497 pxor @x[7], @t[3] 498 pshufd \$0x4E, @x[7], @x[5] 499 pxor @x[7], @t[4] 500 pshufd \$0x4E, @x[6], @x[3] 501 pxor @t[4], @t[0] 502 pshufd \$0x4E, @x[2], @x[6] 503 pxor @t[5], @t[1] 504___ 505$code.=<<___ if (!$inv); 506 pxor @t[3], @x[4] 507 pxor @t[7], @x[5] 508 pxor @t[6], @x[3] 509 movdqa @t[0], @x[2] 510 pxor @t[2], @x[6] 511 movdqa @t[1], @x[7] 512___ 513$code.=<<___ if ($inv); 514 pxor @x[4], @t[3] 515 pxor @t[7], @x[5] 516 pxor @x[3], @t[6] 517 movdqa @t[0], @x[3] 518 pxor @t[2], @x[6] 519 movdqa @t[6], @x[2] 520 movdqa @t[1], @x[7] 521 movdqa @x[6], @x[4] 522 movdqa @t[3], @x[6] 523___ 524} 525 526sub InvMixColumns_orig { 527my @x=@_[0..7]; 528my @t=@_[8..15]; 529 530$code.=<<___; 531 # multiplication by 0x0e 532 pshufd \$0x93, @x[7], @t[7] 533 movdqa @x[2], @t[2] 534 pxor @x[5], @x[7] # 7 5 535 pxor @x[5], @x[2] # 2 5 536 pshufd \$0x93, @x[0], @t[0] 537 movdqa @x[5], @t[5] 538 pxor @x[0], @x[5] # 5 0 [1] 539 pxor @x[1], @x[0] # 0 1 540 pshufd \$0x93, @x[1], @t[1] 541 pxor @x[2], @x[1] # 1 25 542 pxor @x[6], @x[0] # 01 6 [2] 543 pxor @x[3], @x[1] # 125 3 [4] 544 pshufd \$0x93, @x[3], @t[3] 545 pxor @x[0], @x[2] # 25 016 [3] 546 pxor @x[7], @x[3] # 3 75 547 pxor @x[6], @x[7] # 75 6 [0] 548 pshufd \$0x93, @x[6], @t[6] 549 movdqa @x[4], @t[4] 550 pxor @x[4], @x[6] # 6 4 551 pxor @x[3], @x[4] # 4 375 [6] 552 pxor @x[7], @x[3] # 375 756=36 553 pxor @t[5], @x[6] # 64 5 [7] 554 pxor @t[2], @x[3] # 36 2 555 pxor @t[4], @x[3] # 362 4 [5] 556 pshufd \$0x93, @t[5], @t[5] 557___ 558 my @y = @x[7,5,0,2,1,3,4,6]; 559$code.=<<___; 560 # multiplication by 0x0b 561 pxor @y[0], @y[1] 562 pxor @t[0], @y[0] 563 pxor @t[1], @y[1] 564 pshufd \$0x93, @t[2], @t[2] 565 pxor @t[5], @y[0] 566 pxor @t[6], @y[1] 567 pxor @t[7], @y[0] 568 pshufd \$0x93, @t[4], @t[4] 569 pxor @t[6], @t[7] # clobber t[7] 570 pxor @y[0], @y[1] 571 572 pxor @t[0], @y[3] 573 pshufd \$0x93, @t[0], @t[0] 574 pxor @t[1], @y[2] 575 pxor @t[1], @y[4] 576 pxor @t[2], @y[2] 577 pshufd \$0x93, @t[1], @t[1] 578 pxor @t[2], @y[3] 579 pxor @t[2], @y[5] 580 pxor @t[7], @y[2] 581 pshufd \$0x93, @t[2], @t[2] 582 pxor @t[3], @y[3] 583 pxor @t[3], @y[6] 584 pxor @t[3], @y[4] 585 pshufd \$0x93, @t[3], @t[3] 586 pxor @t[4], @y[7] 587 pxor @t[4], @y[5] 588 pxor @t[7], @y[7] 589 pxor @t[5], @y[3] 590 pxor @t[4], @y[4] 591 pxor @t[5], @t[7] # clobber t[7] even more 592 593 pxor @t[7], @y[5] 594 pshufd \$0x93, @t[4], @t[4] 595 pxor @t[7], @y[6] 596 pxor @t[7], @y[4] 597 598 pxor @t[5], @t[7] 599 pshufd \$0x93, @t[5], @t[5] 600 pxor @t[6], @t[7] # restore t[7] 601 602 # multiplication by 0x0d 603 pxor @y[7], @y[4] 604 pxor @t[4], @y[7] 605 pshufd \$0x93, @t[6], @t[6] 606 pxor @t[0], @y[2] 607 pxor @t[5], @y[7] 608 pxor @t[2], @y[2] 609 pshufd \$0x93, @t[7], @t[7] 610 611 pxor @y[1], @y[3] 612 pxor @t[1], @y[1] 613 pxor @t[0], @y[0] 614 pxor @t[0], @y[3] 615 pxor @t[5], @y[1] 616 pxor @t[5], @y[0] 617 pxor @t[7], @y[1] 618 pshufd \$0x93, @t[0], @t[0] 619 pxor @t[6], @y[0] 620 pxor @y[1], @y[3] 621 pxor @t[1], @y[4] 622 pshufd \$0x93, @t[1], @t[1] 623 624 pxor @t[7], @y[7] 625 pxor @t[2], @y[4] 626 pxor @t[2], @y[5] 627 pshufd \$0x93, @t[2], @t[2] 628 pxor @t[6], @y[2] 629 pxor @t[3], @t[6] # clobber t[6] 630 pxor @y[7], @y[4] 631 pxor @t[6], @y[3] 632 633 pxor @t[6], @y[6] 634 pxor @t[5], @y[5] 635 pxor @t[4], @y[6] 636 pshufd \$0x93, @t[4], @t[4] 637 pxor @t[6], @y[5] 638 pxor @t[7], @y[6] 639 pxor @t[3], @t[6] # restore t[6] 640 641 pshufd \$0x93, @t[5], @t[5] 642 pshufd \$0x93, @t[6], @t[6] 643 pshufd \$0x93, @t[7], @t[7] 644 pshufd \$0x93, @t[3], @t[3] 645 646 # multiplication by 0x09 647 pxor @y[1], @y[4] 648 pxor @y[1], @t[1] # t[1]=y[1] 649 pxor @t[5], @t[0] # clobber t[0] 650 pxor @t[5], @t[1] 651 pxor @t[0], @y[3] 652 pxor @y[0], @t[0] # t[0]=y[0] 653 pxor @t[6], @t[1] 654 pxor @t[7], @t[6] # clobber t[6] 655 pxor @t[1], @y[4] 656 pxor @t[4], @y[7] 657 pxor @y[4], @t[4] # t[4]=y[4] 658 pxor @t[3], @y[6] 659 pxor @y[3], @t[3] # t[3]=y[3] 660 pxor @t[2], @y[5] 661 pxor @y[2], @t[2] # t[2]=y[2] 662 pxor @t[7], @t[3] 663 pxor @y[5], @t[5] # t[5]=y[5] 664 pxor @t[6], @t[2] 665 pxor @t[6], @t[5] 666 pxor @y[6], @t[6] # t[6]=y[6] 667 pxor @y[7], @t[7] # t[7]=y[7] 668 669 movdqa @t[0],@XMM[0] 670 movdqa @t[1],@XMM[1] 671 movdqa @t[2],@XMM[2] 672 movdqa @t[3],@XMM[3] 673 movdqa @t[4],@XMM[4] 674 movdqa @t[5],@XMM[5] 675 movdqa @t[6],@XMM[6] 676 movdqa @t[7],@XMM[7] 677___ 678} 679 680sub InvMixColumns { 681my @x=@_[0..7]; 682my @t=@_[8..15]; 683 684# Thanks to Jussi Kivilinna for providing pointer to 685# 686# | 0e 0b 0d 09 | | 02 03 01 01 | | 05 00 04 00 | 687# | 09 0e 0b 0d | = | 01 02 03 01 | x | 00 05 00 04 | 688# | 0d 09 0e 0b | | 01 01 02 03 | | 04 00 05 00 | 689# | 0b 0d 09 0e | | 03 01 01 02 | | 00 04 00 05 | 690 691$code.=<<___; 692 # multiplication by 0x05-0x00-0x04-0x00 693 pshufd \$0x4E, @x[0], @t[0] 694 pshufd \$0x4E, @x[6], @t[6] 695 pxor @x[0], @t[0] 696 pshufd \$0x4E, @x[7], @t[7] 697 pxor @x[6], @t[6] 698 pshufd \$0x4E, @x[1], @t[1] 699 pxor @x[7], @t[7] 700 pshufd \$0x4E, @x[2], @t[2] 701 pxor @x[1], @t[1] 702 pshufd \$0x4E, @x[3], @t[3] 703 pxor @x[2], @t[2] 704 pxor @t[6], @x[0] 705 pxor @t[6], @x[1] 706 pshufd \$0x4E, @x[4], @t[4] 707 pxor @x[3], @t[3] 708 pxor @t[0], @x[2] 709 pxor @t[1], @x[3] 710 pshufd \$0x4E, @x[5], @t[5] 711 pxor @x[4], @t[4] 712 pxor @t[7], @x[1] 713 pxor @t[2], @x[4] 714 pxor @x[5], @t[5] 715 716 pxor @t[7], @x[2] 717 pxor @t[6], @x[3] 718 pxor @t[6], @x[4] 719 pxor @t[3], @x[5] 720 pxor @t[4], @x[6] 721 pxor @t[7], @x[4] 722 pxor @t[7], @x[5] 723 pxor @t[5], @x[7] 724___ 725 &MixColumns (@x,@t,1); # flipped 2<->3 and 4<->6 726} 727 728sub aesenc { # not used 729my @b=@_[0..7]; 730my @t=@_[8..15]; 731$code.=<<___; 732 movdqa 0x30($const),@t[0] # .LSR 733___ 734 &ShiftRows (@b,@t[0]); 735 &Sbox (@b,@t); 736 &MixColumns (@b[0,1,4,6,3,7,2,5],@t); 737} 738 739sub aesenclast { # not used 740my @b=@_[0..7]; 741my @t=@_[8..15]; 742$code.=<<___; 743 movdqa 0x40($const),@t[0] # .LSRM0 744___ 745 &ShiftRows (@b,@t[0]); 746 &Sbox (@b,@t); 747$code.=<<___ 748 pxor 0x00($key),@b[0] 749 pxor 0x10($key),@b[1] 750 pxor 0x20($key),@b[4] 751 pxor 0x30($key),@b[6] 752 pxor 0x40($key),@b[3] 753 pxor 0x50($key),@b[7] 754 pxor 0x60($key),@b[2] 755 pxor 0x70($key),@b[5] 756___ 757} 758 759sub swapmove { 760my ($a,$b,$n,$mask,$t)=@_; 761$code.=<<___; 762 movdqa $b,$t 763 psrlq \$$n,$b 764 pxor $a,$b 765 pand $mask,$b 766 pxor $b,$a 767 psllq \$$n,$b 768 pxor $t,$b 769___ 770} 771sub swapmove2x { 772my ($a0,$b0,$a1,$b1,$n,$mask,$t0,$t1)=@_; 773$code.=<<___; 774 movdqa $b0,$t0 775 psrlq \$$n,$b0 776 movdqa $b1,$t1 777 psrlq \$$n,$b1 778 pxor $a0,$b0 779 pxor $a1,$b1 780 pand $mask,$b0 781 pand $mask,$b1 782 pxor $b0,$a0 783 psllq \$$n,$b0 784 pxor $b1,$a1 785 psllq \$$n,$b1 786 pxor $t0,$b0 787 pxor $t1,$b1 788___ 789} 790 791sub bitslice { 792my @x=reverse(@_[0..7]); 793my ($t0,$t1,$t2,$t3)=@_[8..11]; 794$code.=<<___; 795 movdqa 0x00($const),$t0 # .LBS0 796 movdqa 0x10($const),$t1 # .LBS1 797___ 798 &swapmove2x(@x[0,1,2,3],1,$t0,$t2,$t3); 799 &swapmove2x(@x[4,5,6,7],1,$t0,$t2,$t3); 800$code.=<<___; 801 movdqa 0x20($const),$t0 # .LBS2 802___ 803 &swapmove2x(@x[0,2,1,3],2,$t1,$t2,$t3); 804 &swapmove2x(@x[4,6,5,7],2,$t1,$t2,$t3); 805 806 &swapmove2x(@x[0,4,1,5],4,$t0,$t2,$t3); 807 &swapmove2x(@x[2,6,3,7],4,$t0,$t2,$t3); 808} 809 810$code.=<<___; 811.text 812 813.extern asm_AES_encrypt 814.extern asm_AES_decrypt 815 816.type _bsaes_encrypt8,\@abi-omnipotent 817.align 64 818_bsaes_encrypt8: 819 lea .LBS0(%rip), $const # constants table 820 821 movdqa ($key), @XMM[9] # round 0 key 822 lea 0x10($key), $key 823 movdqa 0x50($const), @XMM[8] # .LM0SR 824 pxor @XMM[9], @XMM[0] # xor with round0 key 825 pxor @XMM[9], @XMM[1] 826 pxor @XMM[9], @XMM[2] 827 pxor @XMM[9], @XMM[3] 828 pshufb @XMM[8], @XMM[0] 829 pshufb @XMM[8], @XMM[1] 830 pxor @XMM[9], @XMM[4] 831 pxor @XMM[9], @XMM[5] 832 pshufb @XMM[8], @XMM[2] 833 pshufb @XMM[8], @XMM[3] 834 pxor @XMM[9], @XMM[6] 835 pxor @XMM[9], @XMM[7] 836 pshufb @XMM[8], @XMM[4] 837 pshufb @XMM[8], @XMM[5] 838 pshufb @XMM[8], @XMM[6] 839 pshufb @XMM[8], @XMM[7] 840_bsaes_encrypt8_bitslice: 841___ 842 &bitslice (@XMM[0..7, 8..11]); 843$code.=<<___; 844 dec $rounds 845 jmp .Lenc_sbox 846.align 16 847.Lenc_loop: 848___ 849 &ShiftRows (@XMM[0..7, 8]); 850$code.=".Lenc_sbox:\n"; 851 &Sbox (@XMM[0..7, 8..15]); 852$code.=<<___; 853 dec $rounds 854 jl .Lenc_done 855___ 856 &MixColumns (@XMM[0,1,4,6,3,7,2,5, 8..15]); 857$code.=<<___; 858 movdqa 0x30($const), @XMM[8] # .LSR 859 jnz .Lenc_loop 860 movdqa 0x40($const), @XMM[8] # .LSRM0 861 jmp .Lenc_loop 862.align 16 863.Lenc_done: 864___ 865 # output in lsb > [t0, t1, t4, t6, t3, t7, t2, t5] < msb 866 &bitslice (@XMM[0,1,4,6,3,7,2,5, 8..11]); 867$code.=<<___; 868 movdqa ($key), @XMM[8] # last round key 869 pxor @XMM[8], @XMM[4] 870 pxor @XMM[8], @XMM[6] 871 pxor @XMM[8], @XMM[3] 872 pxor @XMM[8], @XMM[7] 873 pxor @XMM[8], @XMM[2] 874 pxor @XMM[8], @XMM[5] 875 pxor @XMM[8], @XMM[0] 876 pxor @XMM[8], @XMM[1] 877 ret 878.size _bsaes_encrypt8,.-_bsaes_encrypt8 879 880.type _bsaes_decrypt8,\@abi-omnipotent 881.align 64 882_bsaes_decrypt8: 883 lea .LBS0(%rip), $const # constants table 884 885 movdqa ($key), @XMM[9] # round 0 key 886 lea 0x10($key), $key 887 movdqa -0x30($const), @XMM[8] # .LM0ISR 888 pxor @XMM[9], @XMM[0] # xor with round0 key 889 pxor @XMM[9], @XMM[1] 890 pxor @XMM[9], @XMM[2] 891 pxor @XMM[9], @XMM[3] 892 pshufb @XMM[8], @XMM[0] 893 pshufb @XMM[8], @XMM[1] 894 pxor @XMM[9], @XMM[4] 895 pxor @XMM[9], @XMM[5] 896 pshufb @XMM[8], @XMM[2] 897 pshufb @XMM[8], @XMM[3] 898 pxor @XMM[9], @XMM[6] 899 pxor @XMM[9], @XMM[7] 900 pshufb @XMM[8], @XMM[4] 901 pshufb @XMM[8], @XMM[5] 902 pshufb @XMM[8], @XMM[6] 903 pshufb @XMM[8], @XMM[7] 904___ 905 &bitslice (@XMM[0..7, 8..11]); 906$code.=<<___; 907 dec $rounds 908 jmp .Ldec_sbox 909.align 16 910.Ldec_loop: 911___ 912 &ShiftRows (@XMM[0..7, 8]); 913$code.=".Ldec_sbox:\n"; 914 &InvSbox (@XMM[0..7, 8..15]); 915$code.=<<___; 916 dec $rounds 917 jl .Ldec_done 918___ 919 &InvMixColumns (@XMM[0,1,6,4,2,7,3,5, 8..15]); 920$code.=<<___; 921 movdqa -0x10($const), @XMM[8] # .LISR 922 jnz .Ldec_loop 923 movdqa -0x20($const), @XMM[8] # .LISRM0 924 jmp .Ldec_loop 925.align 16 926.Ldec_done: 927___ 928 &bitslice (@XMM[0,1,6,4,2,7,3,5, 8..11]); 929$code.=<<___; 930 movdqa ($key), @XMM[8] # last round key 931 pxor @XMM[8], @XMM[6] 932 pxor @XMM[8], @XMM[4] 933 pxor @XMM[8], @XMM[2] 934 pxor @XMM[8], @XMM[7] 935 pxor @XMM[8], @XMM[3] 936 pxor @XMM[8], @XMM[5] 937 pxor @XMM[8], @XMM[0] 938 pxor @XMM[8], @XMM[1] 939 ret 940.size _bsaes_decrypt8,.-_bsaes_decrypt8 941___ 942} 943{ 944my ($out,$inp,$rounds,$const)=("%rax","%rcx","%r10d","%r11"); 945 946sub bitslice_key { 947my @x=reverse(@_[0..7]); 948my ($bs0,$bs1,$bs2,$t2,$t3)=@_[8..12]; 949 950 &swapmove (@x[0,1],1,$bs0,$t2,$t3); 951$code.=<<___; 952 #&swapmove(@x[2,3],1,$t0,$t2,$t3); 953 movdqa @x[0], @x[2] 954 movdqa @x[1], @x[3] 955___ 956 #&swapmove2x(@x[4,5,6,7],1,$t0,$t2,$t3); 957 958 &swapmove2x (@x[0,2,1,3],2,$bs1,$t2,$t3); 959$code.=<<___; 960 #&swapmove2x(@x[4,6,5,7],2,$t1,$t2,$t3); 961 movdqa @x[0], @x[4] 962 movdqa @x[2], @x[6] 963 movdqa @x[1], @x[5] 964 movdqa @x[3], @x[7] 965___ 966 &swapmove2x (@x[0,4,1,5],4,$bs2,$t2,$t3); 967 &swapmove2x (@x[2,6,3,7],4,$bs2,$t2,$t3); 968} 969 970$code.=<<___; 971.type _bsaes_key_convert,\@abi-omnipotent 972.align 16 973_bsaes_key_convert: 974 lea .Lmasks(%rip), $const 975 movdqu ($inp), %xmm7 # load round 0 key 976 lea 0x10($inp), $inp 977 movdqa 0x00($const), %xmm0 # 0x01... 978 movdqa 0x10($const), %xmm1 # 0x02... 979 movdqa 0x20($const), %xmm2 # 0x04... 980 movdqa 0x30($const), %xmm3 # 0x08... 981 movdqa 0x40($const), %xmm4 # .LM0 982 pcmpeqd %xmm5, %xmm5 # .LNOT 983 984 movdqu ($inp), %xmm6 # load round 1 key 985 movdqa %xmm7, ($out) # save round 0 key 986 lea 0x10($out), $out 987 dec $rounds 988 jmp .Lkey_loop 989.align 16 990.Lkey_loop: 991 pshufb %xmm4, %xmm6 # .LM0 992 993 movdqa %xmm0, %xmm8 994 movdqa %xmm1, %xmm9 995 996 pand %xmm6, %xmm8 997 pand %xmm6, %xmm9 998 movdqa %xmm2, %xmm10 999 pcmpeqb %xmm0, %xmm8 1000 psllq \$4, %xmm0 # 0x10... 1001 movdqa %xmm3, %xmm11 1002 pcmpeqb %xmm1, %xmm9 1003 psllq \$4, %xmm1 # 0x20... 1004 1005 pand %xmm6, %xmm10 1006 pand %xmm6, %xmm11 1007 movdqa %xmm0, %xmm12 1008 pcmpeqb %xmm2, %xmm10 1009 psllq \$4, %xmm2 # 0x40... 1010 movdqa %xmm1, %xmm13 1011 pcmpeqb %xmm3, %xmm11 1012 psllq \$4, %xmm3 # 0x80... 1013 1014 movdqa %xmm2, %xmm14 1015 movdqa %xmm3, %xmm15 1016 pxor %xmm5, %xmm8 # "pnot" 1017 pxor %xmm5, %xmm9 1018 1019 pand %xmm6, %xmm12 1020 pand %xmm6, %xmm13 1021 movdqa %xmm8, 0x00($out) # write bit-sliced round key 1022 pcmpeqb %xmm0, %xmm12 1023 psrlq \$4, %xmm0 # 0x01... 1024 movdqa %xmm9, 0x10($out) 1025 pcmpeqb %xmm1, %xmm13 1026 psrlq \$4, %xmm1 # 0x02... 1027 lea 0x10($inp), $inp 1028 1029 pand %xmm6, %xmm14 1030 pand %xmm6, %xmm15 1031 movdqa %xmm10, 0x20($out) 1032 pcmpeqb %xmm2, %xmm14 1033 psrlq \$4, %xmm2 # 0x04... 1034 movdqa %xmm11, 0x30($out) 1035 pcmpeqb %xmm3, %xmm15 1036 psrlq \$4, %xmm3 # 0x08... 1037 movdqu ($inp), %xmm6 # load next round key 1038 1039 pxor %xmm5, %xmm13 # "pnot" 1040 pxor %xmm5, %xmm14 1041 movdqa %xmm12, 0x40($out) 1042 movdqa %xmm13, 0x50($out) 1043 movdqa %xmm14, 0x60($out) 1044 movdqa %xmm15, 0x70($out) 1045 lea 0x80($out),$out 1046 dec $rounds 1047 jnz .Lkey_loop 1048 1049 movdqa 0x50($const), %xmm7 # .L63 1050 #movdqa %xmm6, ($out) # don't save last round key 1051 ret 1052.size _bsaes_key_convert,.-_bsaes_key_convert 1053___ 1054} 1055 1056if (0 && !$win64) { # following four functions are unsupported interface 1057 # used for benchmarking... 1058$code.=<<___; 1059.globl bsaes_enc_key_convert 1060.type bsaes_enc_key_convert,\@function,2 1061.align 16 1062bsaes_enc_key_convert: 1063 mov 240($inp),%r10d # pass rounds 1064 mov $inp,%rcx # pass key 1065 mov $out,%rax # pass key schedule 1066 call _bsaes_key_convert 1067 pxor %xmm6,%xmm7 # fix up last round key 1068 movdqa %xmm7,(%rax) # save last round key 1069 ret 1070.size bsaes_enc_key_convert,.-bsaes_enc_key_convert 1071 1072.globl bsaes_encrypt_128 1073.type bsaes_encrypt_128,\@function,4 1074.align 16 1075bsaes_encrypt_128: 1076.Lenc128_loop: 1077 movdqu 0x00($inp), @XMM[0] # load input 1078 movdqu 0x10($inp), @XMM[1] 1079 movdqu 0x20($inp), @XMM[2] 1080 movdqu 0x30($inp), @XMM[3] 1081 movdqu 0x40($inp), @XMM[4] 1082 movdqu 0x50($inp), @XMM[5] 1083 movdqu 0x60($inp), @XMM[6] 1084 movdqu 0x70($inp), @XMM[7] 1085 mov $key, %rax # pass the $key 1086 lea 0x80($inp), $inp 1087 mov \$10,%r10d 1088 1089 call _bsaes_encrypt8 1090 1091 movdqu @XMM[0], 0x00($out) # write output 1092 movdqu @XMM[1], 0x10($out) 1093 movdqu @XMM[4], 0x20($out) 1094 movdqu @XMM[6], 0x30($out) 1095 movdqu @XMM[3], 0x40($out) 1096 movdqu @XMM[7], 0x50($out) 1097 movdqu @XMM[2], 0x60($out) 1098 movdqu @XMM[5], 0x70($out) 1099 lea 0x80($out), $out 1100 sub \$0x80,$len 1101 ja .Lenc128_loop 1102 ret 1103.size bsaes_encrypt_128,.-bsaes_encrypt_128 1104 1105.globl bsaes_dec_key_convert 1106.type bsaes_dec_key_convert,\@function,2 1107.align 16 1108bsaes_dec_key_convert: 1109 mov 240($inp),%r10d # pass rounds 1110 mov $inp,%rcx # pass key 1111 mov $out,%rax # pass key schedule 1112 call _bsaes_key_convert 1113 pxor ($out),%xmm7 # fix up round 0 key 1114 movdqa %xmm6,(%rax) # save last round key 1115 movdqa %xmm7,($out) 1116 ret 1117.size bsaes_dec_key_convert,.-bsaes_dec_key_convert 1118 1119.globl bsaes_decrypt_128 1120.type bsaes_decrypt_128,\@function,4 1121.align 16 1122bsaes_decrypt_128: 1123.Ldec128_loop: 1124 movdqu 0x00($inp), @XMM[0] # load input 1125 movdqu 0x10($inp), @XMM[1] 1126 movdqu 0x20($inp), @XMM[2] 1127 movdqu 0x30($inp), @XMM[3] 1128 movdqu 0x40($inp), @XMM[4] 1129 movdqu 0x50($inp), @XMM[5] 1130 movdqu 0x60($inp), @XMM[6] 1131 movdqu 0x70($inp), @XMM[7] 1132 mov $key, %rax # pass the $key 1133 lea 0x80($inp), $inp 1134 mov \$10,%r10d 1135 1136 call _bsaes_decrypt8 1137 1138 movdqu @XMM[0], 0x00($out) # write output 1139 movdqu @XMM[1], 0x10($out) 1140 movdqu @XMM[6], 0x20($out) 1141 movdqu @XMM[4], 0x30($out) 1142 movdqu @XMM[2], 0x40($out) 1143 movdqu @XMM[7], 0x50($out) 1144 movdqu @XMM[3], 0x60($out) 1145 movdqu @XMM[5], 0x70($out) 1146 lea 0x80($out), $out 1147 sub \$0x80,$len 1148 ja .Ldec128_loop 1149 ret 1150.size bsaes_decrypt_128,.-bsaes_decrypt_128 1151___ 1152} 1153{ 1154###################################################################### 1155# 1156# OpenSSL interface 1157# 1158my ($arg1,$arg2,$arg3,$arg4,$arg5,$arg6)=$win64 ? ("%rcx","%rdx","%r8","%r9","%r10","%r11d") 1159 : ("%rdi","%rsi","%rdx","%rcx","%r8","%r9d"); 1160my ($inp,$out,$len,$key)=("%r12","%r13","%r14","%r15"); 1161 1162if ($ecb) { 1163$code.=<<___; 1164.globl bsaes_ecb_encrypt_blocks 1165.type bsaes_ecb_encrypt_blocks,\@abi-omnipotent 1166.align 16 1167bsaes_ecb_encrypt_blocks: 1168.cfi_startproc 1169 mov %rsp, %rax 1170.Lecb_enc_prologue: 1171 push %rbp 1172.cfi_push %rbp 1173 push %rbx 1174.cfi_push %rbx 1175 push %r12 1176.cfi_push %r12 1177 push %r13 1178.cfi_push %r13 1179 push %r14 1180.cfi_push %r14 1181 push %r15 1182.cfi_push %r15 1183 lea -0x48(%rsp),%rsp 1184.cfi_adjust_cfa_offset 0x48 1185___ 1186$code.=<<___ if ($win64); 1187 lea -0xa0(%rsp), %rsp 1188 movaps %xmm6, 0x40(%rsp) 1189 movaps %xmm7, 0x50(%rsp) 1190 movaps %xmm8, 0x60(%rsp) 1191 movaps %xmm9, 0x70(%rsp) 1192 movaps %xmm10, 0x80(%rsp) 1193 movaps %xmm11, 0x90(%rsp) 1194 movaps %xmm12, 0xa0(%rsp) 1195 movaps %xmm13, 0xb0(%rsp) 1196 movaps %xmm14, 0xc0(%rsp) 1197 movaps %xmm15, 0xd0(%rsp) 1198.Lecb_enc_body: 1199___ 1200$code.=<<___; 1201 mov %rsp,%rbp # backup %rsp 1202.cfi_def_cfa_register %rbp 1203 mov 240($arg4),%eax # rounds 1204 mov $arg1,$inp # backup arguments 1205 mov $arg2,$out 1206 mov $arg3,$len 1207 mov $arg4,$key 1208 cmp \$8,$arg3 1209 jb .Lecb_enc_short 1210 1211 mov %eax,%ebx # backup rounds 1212 shl \$7,%rax # 128 bytes per inner round key 1213 sub \$`128-32`,%rax # size of bit-sliced key schedule 1214 sub %rax,%rsp 1215 mov %rsp,%rax # pass key schedule 1216 mov $key,%rcx # pass key 1217 mov %ebx,%r10d # pass rounds 1218 call _bsaes_key_convert 1219 pxor %xmm6,%xmm7 # fix up last round key 1220 movdqa %xmm7,(%rax) # save last round key 1221 1222 sub \$8,$len 1223.Lecb_enc_loop: 1224 movdqu 0x00($inp), @XMM[0] # load input 1225 movdqu 0x10($inp), @XMM[1] 1226 movdqu 0x20($inp), @XMM[2] 1227 movdqu 0x30($inp), @XMM[3] 1228 movdqu 0x40($inp), @XMM[4] 1229 movdqu 0x50($inp), @XMM[5] 1230 mov %rsp, %rax # pass key schedule 1231 movdqu 0x60($inp), @XMM[6] 1232 mov %ebx,%r10d # pass rounds 1233 movdqu 0x70($inp), @XMM[7] 1234 lea 0x80($inp), $inp 1235 1236 call _bsaes_encrypt8 1237 1238 movdqu @XMM[0], 0x00($out) # write output 1239 movdqu @XMM[1], 0x10($out) 1240 movdqu @XMM[4], 0x20($out) 1241 movdqu @XMM[6], 0x30($out) 1242 movdqu @XMM[3], 0x40($out) 1243 movdqu @XMM[7], 0x50($out) 1244 movdqu @XMM[2], 0x60($out) 1245 movdqu @XMM[5], 0x70($out) 1246 lea 0x80($out), $out 1247 sub \$8,$len 1248 jnc .Lecb_enc_loop 1249 1250 add \$8,$len 1251 jz .Lecb_enc_done 1252 1253 movdqu 0x00($inp), @XMM[0] # load input 1254 mov %rsp, %rax # pass key schedule 1255 mov %ebx,%r10d # pass rounds 1256 cmp \$2,$len 1257 jb .Lecb_enc_one 1258 movdqu 0x10($inp), @XMM[1] 1259 je .Lecb_enc_two 1260 movdqu 0x20($inp), @XMM[2] 1261 cmp \$4,$len 1262 jb .Lecb_enc_three 1263 movdqu 0x30($inp), @XMM[3] 1264 je .Lecb_enc_four 1265 movdqu 0x40($inp), @XMM[4] 1266 cmp \$6,$len 1267 jb .Lecb_enc_five 1268 movdqu 0x50($inp), @XMM[5] 1269 je .Lecb_enc_six 1270 movdqu 0x60($inp), @XMM[6] 1271 call _bsaes_encrypt8 1272 movdqu @XMM[0], 0x00($out) # write output 1273 movdqu @XMM[1], 0x10($out) 1274 movdqu @XMM[4], 0x20($out) 1275 movdqu @XMM[6], 0x30($out) 1276 movdqu @XMM[3], 0x40($out) 1277 movdqu @XMM[7], 0x50($out) 1278 movdqu @XMM[2], 0x60($out) 1279 jmp .Lecb_enc_done 1280.align 16 1281.Lecb_enc_six: 1282 call _bsaes_encrypt8 1283 movdqu @XMM[0], 0x00($out) # write output 1284 movdqu @XMM[1], 0x10($out) 1285 movdqu @XMM[4], 0x20($out) 1286 movdqu @XMM[6], 0x30($out) 1287 movdqu @XMM[3], 0x40($out) 1288 movdqu @XMM[7], 0x50($out) 1289 jmp .Lecb_enc_done 1290.align 16 1291.Lecb_enc_five: 1292 call _bsaes_encrypt8 1293 movdqu @XMM[0], 0x00($out) # write output 1294 movdqu @XMM[1], 0x10($out) 1295 movdqu @XMM[4], 0x20($out) 1296 movdqu @XMM[6], 0x30($out) 1297 movdqu @XMM[3], 0x40($out) 1298 jmp .Lecb_enc_done 1299.align 16 1300.Lecb_enc_four: 1301 call _bsaes_encrypt8 1302 movdqu @XMM[0], 0x00($out) # write output 1303 movdqu @XMM[1], 0x10($out) 1304 movdqu @XMM[4], 0x20($out) 1305 movdqu @XMM[6], 0x30($out) 1306 jmp .Lecb_enc_done 1307.align 16 1308.Lecb_enc_three: 1309 call _bsaes_encrypt8 1310 movdqu @XMM[0], 0x00($out) # write output 1311 movdqu @XMM[1], 0x10($out) 1312 movdqu @XMM[4], 0x20($out) 1313 jmp .Lecb_enc_done 1314.align 16 1315.Lecb_enc_two: 1316 call _bsaes_encrypt8 1317 movdqu @XMM[0], 0x00($out) # write output 1318 movdqu @XMM[1], 0x10($out) 1319 jmp .Lecb_enc_done 1320.align 16 1321.Lecb_enc_one: 1322 call _bsaes_encrypt8 1323 movdqu @XMM[0], 0x00($out) # write output 1324 jmp .Lecb_enc_done 1325.align 16 1326.Lecb_enc_short: 1327 lea ($inp), $arg1 1328 lea ($out), $arg2 1329 lea ($key), $arg3 1330 call asm_AES_encrypt 1331 lea 16($inp), $inp 1332 lea 16($out), $out 1333 dec $len 1334 jnz .Lecb_enc_short 1335 1336.Lecb_enc_done: 1337 lea (%rsp),%rax 1338 pxor %xmm0, %xmm0 1339.Lecb_enc_bzero: # wipe key schedule [if any] 1340 movdqa %xmm0, 0x00(%rax) 1341 movdqa %xmm0, 0x10(%rax) 1342 lea 0x20(%rax), %rax 1343 cmp %rax, %rbp 1344 jb .Lecb_enc_bzero 1345 1346 lea 0x78(%rbp),%rax 1347.cfi_def_cfa %rax,8 1348___ 1349$code.=<<___ if ($win64); 1350 movaps 0x40(%rbp), %xmm6 1351 movaps 0x50(%rbp), %xmm7 1352 movaps 0x60(%rbp), %xmm8 1353 movaps 0x70(%rbp), %xmm9 1354 movaps 0x80(%rbp), %xmm10 1355 movaps 0x90(%rbp), %xmm11 1356 movaps 0xa0(%rbp), %xmm12 1357 movaps 0xb0(%rbp), %xmm13 1358 movaps 0xc0(%rbp), %xmm14 1359 movaps 0xd0(%rbp), %xmm15 1360 lea 0xa0(%rax), %rax 1361.Lecb_enc_tail: 1362___ 1363$code.=<<___; 1364 mov -48(%rax), %r15 1365.cfi_restore %r15 1366 mov -40(%rax), %r14 1367.cfi_restore %r14 1368 mov -32(%rax), %r13 1369.cfi_restore %r13 1370 mov -24(%rax), %r12 1371.cfi_restore %r12 1372 mov -16(%rax), %rbx 1373.cfi_restore %rbx 1374 mov -8(%rax), %rbp 1375.cfi_restore %rbp 1376 lea (%rax), %rsp # restore %rsp 1377.cfi_def_cfa_register %rsp 1378.Lecb_enc_epilogue: 1379 ret 1380.cfi_endproc 1381.size bsaes_ecb_encrypt_blocks,.-bsaes_ecb_encrypt_blocks 1382 1383.globl bsaes_ecb_decrypt_blocks 1384.type bsaes_ecb_decrypt_blocks,\@abi-omnipotent 1385.align 16 1386bsaes_ecb_decrypt_blocks: 1387.cfi_startproc 1388 mov %rsp, %rax 1389.Lecb_dec_prologue: 1390 push %rbp 1391.cfi_push %rbp 1392 push %rbx 1393.cfi_push %rbx 1394 push %r12 1395.cfi_push %r12 1396 push %r13 1397.cfi_push %r13 1398 push %r14 1399.cfi_push %r14 1400 push %r15 1401.cfi_push %r15 1402 lea -0x48(%rsp),%rsp 1403.cfi_adjust_cfa_offset 0x48 1404___ 1405$code.=<<___ if ($win64); 1406 lea -0xa0(%rsp), %rsp 1407 movaps %xmm6, 0x40(%rsp) 1408 movaps %xmm7, 0x50(%rsp) 1409 movaps %xmm8, 0x60(%rsp) 1410 movaps %xmm9, 0x70(%rsp) 1411 movaps %xmm10, 0x80(%rsp) 1412 movaps %xmm11, 0x90(%rsp) 1413 movaps %xmm12, 0xa0(%rsp) 1414 movaps %xmm13, 0xb0(%rsp) 1415 movaps %xmm14, 0xc0(%rsp) 1416 movaps %xmm15, 0xd0(%rsp) 1417.Lecb_dec_body: 1418___ 1419$code.=<<___; 1420 mov %rsp,%rbp # backup %rsp 1421.cfi_def_cfa_register %rbp 1422 mov 240($arg4),%eax # rounds 1423 mov $arg1,$inp # backup arguments 1424 mov $arg2,$out 1425 mov $arg3,$len 1426 mov $arg4,$key 1427 cmp \$8,$arg3 1428 jb .Lecb_dec_short 1429 1430 mov %eax,%ebx # backup rounds 1431 shl \$7,%rax # 128 bytes per inner round key 1432 sub \$`128-32`,%rax # size of bit-sliced key schedule 1433 sub %rax,%rsp 1434 mov %rsp,%rax # pass key schedule 1435 mov $key,%rcx # pass key 1436 mov %ebx,%r10d # pass rounds 1437 call _bsaes_key_convert 1438 pxor (%rsp),%xmm7 # fix up 0 round key 1439 movdqa %xmm6,(%rax) # save last round key 1440 movdqa %xmm7,(%rsp) 1441 1442 sub \$8,$len 1443.Lecb_dec_loop: 1444 movdqu 0x00($inp), @XMM[0] # load input 1445 movdqu 0x10($inp), @XMM[1] 1446 movdqu 0x20($inp), @XMM[2] 1447 movdqu 0x30($inp), @XMM[3] 1448 movdqu 0x40($inp), @XMM[4] 1449 movdqu 0x50($inp), @XMM[5] 1450 mov %rsp, %rax # pass key schedule 1451 movdqu 0x60($inp), @XMM[6] 1452 mov %ebx,%r10d # pass rounds 1453 movdqu 0x70($inp), @XMM[7] 1454 lea 0x80($inp), $inp 1455 1456 call _bsaes_decrypt8 1457 1458 movdqu @XMM[0], 0x00($out) # write output 1459 movdqu @XMM[1], 0x10($out) 1460 movdqu @XMM[6], 0x20($out) 1461 movdqu @XMM[4], 0x30($out) 1462 movdqu @XMM[2], 0x40($out) 1463 movdqu @XMM[7], 0x50($out) 1464 movdqu @XMM[3], 0x60($out) 1465 movdqu @XMM[5], 0x70($out) 1466 lea 0x80($out), $out 1467 sub \$8,$len 1468 jnc .Lecb_dec_loop 1469 1470 add \$8,$len 1471 jz .Lecb_dec_done 1472 1473 movdqu 0x00($inp), @XMM[0] # load input 1474 mov %rsp, %rax # pass key schedule 1475 mov %ebx,%r10d # pass rounds 1476 cmp \$2,$len 1477 jb .Lecb_dec_one 1478 movdqu 0x10($inp), @XMM[1] 1479 je .Lecb_dec_two 1480 movdqu 0x20($inp), @XMM[2] 1481 cmp \$4,$len 1482 jb .Lecb_dec_three 1483 movdqu 0x30($inp), @XMM[3] 1484 je .Lecb_dec_four 1485 movdqu 0x40($inp), @XMM[4] 1486 cmp \$6,$len 1487 jb .Lecb_dec_five 1488 movdqu 0x50($inp), @XMM[5] 1489 je .Lecb_dec_six 1490 movdqu 0x60($inp), @XMM[6] 1491 call _bsaes_decrypt8 1492 movdqu @XMM[0], 0x00($out) # write output 1493 movdqu @XMM[1], 0x10($out) 1494 movdqu @XMM[6], 0x20($out) 1495 movdqu @XMM[4], 0x30($out) 1496 movdqu @XMM[2], 0x40($out) 1497 movdqu @XMM[7], 0x50($out) 1498 movdqu @XMM[3], 0x60($out) 1499 jmp .Lecb_dec_done 1500.align 16 1501.Lecb_dec_six: 1502 call _bsaes_decrypt8 1503 movdqu @XMM[0], 0x00($out) # write output 1504 movdqu @XMM[1], 0x10($out) 1505 movdqu @XMM[6], 0x20($out) 1506 movdqu @XMM[4], 0x30($out) 1507 movdqu @XMM[2], 0x40($out) 1508 movdqu @XMM[7], 0x50($out) 1509 jmp .Lecb_dec_done 1510.align 16 1511.Lecb_dec_five: 1512 call _bsaes_decrypt8 1513 movdqu @XMM[0], 0x00($out) # write output 1514 movdqu @XMM[1], 0x10($out) 1515 movdqu @XMM[6], 0x20($out) 1516 movdqu @XMM[4], 0x30($out) 1517 movdqu @XMM[2], 0x40($out) 1518 jmp .Lecb_dec_done 1519.align 16 1520.Lecb_dec_four: 1521 call _bsaes_decrypt8 1522 movdqu @XMM[0], 0x00($out) # write output 1523 movdqu @XMM[1], 0x10($out) 1524 movdqu @XMM[6], 0x20($out) 1525 movdqu @XMM[4], 0x30($out) 1526 jmp .Lecb_dec_done 1527.align 16 1528.Lecb_dec_three: 1529 call _bsaes_decrypt8 1530 movdqu @XMM[0], 0x00($out) # write output 1531 movdqu @XMM[1], 0x10($out) 1532 movdqu @XMM[6], 0x20($out) 1533 jmp .Lecb_dec_done 1534.align 16 1535.Lecb_dec_two: 1536 call _bsaes_decrypt8 1537 movdqu @XMM[0], 0x00($out) # write output 1538 movdqu @XMM[1], 0x10($out) 1539 jmp .Lecb_dec_done 1540.align 16 1541.Lecb_dec_one: 1542 call _bsaes_decrypt8 1543 movdqu @XMM[0], 0x00($out) # write output 1544 jmp .Lecb_dec_done 1545.align 16 1546.Lecb_dec_short: 1547 lea ($inp), $arg1 1548 lea ($out), $arg2 1549 lea ($key), $arg3 1550 call asm_AES_decrypt 1551 lea 16($inp), $inp 1552 lea 16($out), $out 1553 dec $len 1554 jnz .Lecb_dec_short 1555 1556.Lecb_dec_done: 1557 lea (%rsp),%rax 1558 pxor %xmm0, %xmm0 1559.Lecb_dec_bzero: # wipe key schedule [if any] 1560 movdqa %xmm0, 0x00(%rax) 1561 movdqa %xmm0, 0x10(%rax) 1562 lea 0x20(%rax), %rax 1563 cmp %rax, %rbp 1564 jb .Lecb_dec_bzero 1565 1566 lea 0x78(%rbp),%rax 1567.cfi_def_cfa %rax,8 1568___ 1569$code.=<<___ if ($win64); 1570 movaps 0x40(%rbp), %xmm6 1571 movaps 0x50(%rbp), %xmm7 1572 movaps 0x60(%rbp), %xmm8 1573 movaps 0x70(%rbp), %xmm9 1574 movaps 0x80(%rbp), %xmm10 1575 movaps 0x90(%rbp), %xmm11 1576 movaps 0xa0(%rbp), %xmm12 1577 movaps 0xb0(%rbp), %xmm13 1578 movaps 0xc0(%rbp), %xmm14 1579 movaps 0xd0(%rbp), %xmm15 1580 lea 0xa0(%rax), %rax 1581.Lecb_dec_tail: 1582___ 1583$code.=<<___; 1584 mov -48(%rax), %r15 1585.cfi_restore %r15 1586 mov -40(%rax), %r14 1587.cfi_restore %r14 1588 mov -32(%rax), %r13 1589.cfi_restore %r13 1590 mov -24(%rax), %r12 1591.cfi_restore %r12 1592 mov -16(%rax), %rbx 1593.cfi_restore %rbx 1594 mov -8(%rax), %rbp 1595.cfi_restore %rbp 1596 lea (%rax), %rsp # restore %rsp 1597.cfi_def_cfa_register %rsp 1598.Lecb_dec_epilogue: 1599 ret 1600.cfi_endproc 1601.size bsaes_ecb_decrypt_blocks,.-bsaes_ecb_decrypt_blocks 1602___ 1603} 1604$code.=<<___; 1605.extern asm_AES_cbc_encrypt 1606.globl bsaes_cbc_encrypt 1607.type bsaes_cbc_encrypt,\@abi-omnipotent 1608.align 16 1609bsaes_cbc_encrypt: 1610.cfi_startproc 1611___ 1612$code.=<<___ if ($win64); 1613 mov 48(%rsp),$arg6 # pull direction flag 1614___ 1615$code.=<<___; 1616 cmp \$0,$arg6 1617 jne asm_AES_cbc_encrypt 1618 cmp \$128,$arg3 1619 jb asm_AES_cbc_encrypt 1620 1621 mov %rsp, %rax 1622.Lcbc_dec_prologue: 1623 push %rbp 1624.cfi_push %rbp 1625 push %rbx 1626.cfi_push %rbx 1627 push %r12 1628.cfi_push %r12 1629 push %r13 1630.cfi_push %r13 1631 push %r14 1632.cfi_push %r14 1633 push %r15 1634.cfi_push %r15 1635 lea -0x48(%rsp), %rsp 1636.cfi_adjust_cfa_offset 0x48 1637___ 1638$code.=<<___ if ($win64); 1639 mov 0xa0(%rsp),$arg5 # pull ivp 1640 lea -0xa0(%rsp), %rsp 1641 movaps %xmm6, 0x40(%rsp) 1642 movaps %xmm7, 0x50(%rsp) 1643 movaps %xmm8, 0x60(%rsp) 1644 movaps %xmm9, 0x70(%rsp) 1645 movaps %xmm10, 0x80(%rsp) 1646 movaps %xmm11, 0x90(%rsp) 1647 movaps %xmm12, 0xa0(%rsp) 1648 movaps %xmm13, 0xb0(%rsp) 1649 movaps %xmm14, 0xc0(%rsp) 1650 movaps %xmm15, 0xd0(%rsp) 1651.Lcbc_dec_body: 1652___ 1653$code.=<<___; 1654 mov %rsp, %rbp # backup %rsp 1655.cfi_def_cfa_register %rbp 1656 mov 240($arg4), %eax # rounds 1657 mov $arg1, $inp # backup arguments 1658 mov $arg2, $out 1659 mov $arg3, $len 1660 mov $arg4, $key 1661 mov $arg5, %rbx 1662 shr \$4, $len # bytes to blocks 1663 1664 mov %eax, %edx # rounds 1665 shl \$7, %rax # 128 bytes per inner round key 1666 sub \$`128-32`, %rax # size of bit-sliced key schedule 1667 sub %rax, %rsp 1668 1669 mov %rsp, %rax # pass key schedule 1670 mov $key, %rcx # pass key 1671 mov %edx, %r10d # pass rounds 1672 call _bsaes_key_convert 1673 pxor (%rsp),%xmm7 # fix up 0 round key 1674 movdqa %xmm6,(%rax) # save last round key 1675 movdqa %xmm7,(%rsp) 1676 1677 movdqu (%rbx), @XMM[15] # load IV 1678 sub \$8,$len 1679.Lcbc_dec_loop: 1680 movdqu 0x00($inp), @XMM[0] # load input 1681 movdqu 0x10($inp), @XMM[1] 1682 movdqu 0x20($inp), @XMM[2] 1683 movdqu 0x30($inp), @XMM[3] 1684 movdqu 0x40($inp), @XMM[4] 1685 movdqu 0x50($inp), @XMM[5] 1686 mov %rsp, %rax # pass key schedule 1687 movdqu 0x60($inp), @XMM[6] 1688 mov %edx,%r10d # pass rounds 1689 movdqu 0x70($inp), @XMM[7] 1690 movdqa @XMM[15], 0x20(%rbp) # put aside IV 1691 1692 call _bsaes_decrypt8 1693 1694 pxor 0x20(%rbp), @XMM[0] # ^= IV 1695 movdqu 0x00($inp), @XMM[8] # re-load input 1696 movdqu 0x10($inp), @XMM[9] 1697 pxor @XMM[8], @XMM[1] 1698 movdqu 0x20($inp), @XMM[10] 1699 pxor @XMM[9], @XMM[6] 1700 movdqu 0x30($inp), @XMM[11] 1701 pxor @XMM[10], @XMM[4] 1702 movdqu 0x40($inp), @XMM[12] 1703 pxor @XMM[11], @XMM[2] 1704 movdqu 0x50($inp), @XMM[13] 1705 pxor @XMM[12], @XMM[7] 1706 movdqu 0x60($inp), @XMM[14] 1707 pxor @XMM[13], @XMM[3] 1708 movdqu 0x70($inp), @XMM[15] # IV 1709 pxor @XMM[14], @XMM[5] 1710 movdqu @XMM[0], 0x00($out) # write output 1711 lea 0x80($inp), $inp 1712 movdqu @XMM[1], 0x10($out) 1713 movdqu @XMM[6], 0x20($out) 1714 movdqu @XMM[4], 0x30($out) 1715 movdqu @XMM[2], 0x40($out) 1716 movdqu @XMM[7], 0x50($out) 1717 movdqu @XMM[3], 0x60($out) 1718 movdqu @XMM[5], 0x70($out) 1719 lea 0x80($out), $out 1720 sub \$8,$len 1721 jnc .Lcbc_dec_loop 1722 1723 add \$8,$len 1724 jz .Lcbc_dec_done 1725 1726 movdqu 0x00($inp), @XMM[0] # load input 1727 mov %rsp, %rax # pass key schedule 1728 mov %edx, %r10d # pass rounds 1729 cmp \$2,$len 1730 jb .Lcbc_dec_one 1731 movdqu 0x10($inp), @XMM[1] 1732 je .Lcbc_dec_two 1733 movdqu 0x20($inp), @XMM[2] 1734 cmp \$4,$len 1735 jb .Lcbc_dec_three 1736 movdqu 0x30($inp), @XMM[3] 1737 je .Lcbc_dec_four 1738 movdqu 0x40($inp), @XMM[4] 1739 cmp \$6,$len 1740 jb .Lcbc_dec_five 1741 movdqu 0x50($inp), @XMM[5] 1742 je .Lcbc_dec_six 1743 movdqu 0x60($inp), @XMM[6] 1744 movdqa @XMM[15], 0x20(%rbp) # put aside IV 1745 call _bsaes_decrypt8 1746 pxor 0x20(%rbp), @XMM[0] # ^= IV 1747 movdqu 0x00($inp), @XMM[8] # re-load input 1748 movdqu 0x10($inp), @XMM[9] 1749 pxor @XMM[8], @XMM[1] 1750 movdqu 0x20($inp), @XMM[10] 1751 pxor @XMM[9], @XMM[6] 1752 movdqu 0x30($inp), @XMM[11] 1753 pxor @XMM[10], @XMM[4] 1754 movdqu 0x40($inp), @XMM[12] 1755 pxor @XMM[11], @XMM[2] 1756 movdqu 0x50($inp), @XMM[13] 1757 pxor @XMM[12], @XMM[7] 1758 movdqu 0x60($inp), @XMM[15] # IV 1759 pxor @XMM[13], @XMM[3] 1760 movdqu @XMM[0], 0x00($out) # write output 1761 movdqu @XMM[1], 0x10($out) 1762 movdqu @XMM[6], 0x20($out) 1763 movdqu @XMM[4], 0x30($out) 1764 movdqu @XMM[2], 0x40($out) 1765 movdqu @XMM[7], 0x50($out) 1766 movdqu @XMM[3], 0x60($out) 1767 jmp .Lcbc_dec_done 1768.align 16 1769.Lcbc_dec_six: 1770 movdqa @XMM[15], 0x20(%rbp) # put aside IV 1771 call _bsaes_decrypt8 1772 pxor 0x20(%rbp), @XMM[0] # ^= IV 1773 movdqu 0x00($inp), @XMM[8] # re-load input 1774 movdqu 0x10($inp), @XMM[9] 1775 pxor @XMM[8], @XMM[1] 1776 movdqu 0x20($inp), @XMM[10] 1777 pxor @XMM[9], @XMM[6] 1778 movdqu 0x30($inp), @XMM[11] 1779 pxor @XMM[10], @XMM[4] 1780 movdqu 0x40($inp), @XMM[12] 1781 pxor @XMM[11], @XMM[2] 1782 movdqu 0x50($inp), @XMM[15] # IV 1783 pxor @XMM[12], @XMM[7] 1784 movdqu @XMM[0], 0x00($out) # write output 1785 movdqu @XMM[1], 0x10($out) 1786 movdqu @XMM[6], 0x20($out) 1787 movdqu @XMM[4], 0x30($out) 1788 movdqu @XMM[2], 0x40($out) 1789 movdqu @XMM[7], 0x50($out) 1790 jmp .Lcbc_dec_done 1791.align 16 1792.Lcbc_dec_five: 1793 movdqa @XMM[15], 0x20(%rbp) # put aside IV 1794 call _bsaes_decrypt8 1795 pxor 0x20(%rbp), @XMM[0] # ^= IV 1796 movdqu 0x00($inp), @XMM[8] # re-load input 1797 movdqu 0x10($inp), @XMM[9] 1798 pxor @XMM[8], @XMM[1] 1799 movdqu 0x20($inp), @XMM[10] 1800 pxor @XMM[9], @XMM[6] 1801 movdqu 0x30($inp), @XMM[11] 1802 pxor @XMM[10], @XMM[4] 1803 movdqu 0x40($inp), @XMM[15] # IV 1804 pxor @XMM[11], @XMM[2] 1805 movdqu @XMM[0], 0x00($out) # write output 1806 movdqu @XMM[1], 0x10($out) 1807 movdqu @XMM[6], 0x20($out) 1808 movdqu @XMM[4], 0x30($out) 1809 movdqu @XMM[2], 0x40($out) 1810 jmp .Lcbc_dec_done 1811.align 16 1812.Lcbc_dec_four: 1813 movdqa @XMM[15], 0x20(%rbp) # put aside IV 1814 call _bsaes_decrypt8 1815 pxor 0x20(%rbp), @XMM[0] # ^= IV 1816 movdqu 0x00($inp), @XMM[8] # re-load input 1817 movdqu 0x10($inp), @XMM[9] 1818 pxor @XMM[8], @XMM[1] 1819 movdqu 0x20($inp), @XMM[10] 1820 pxor @XMM[9], @XMM[6] 1821 movdqu 0x30($inp), @XMM[15] # IV 1822 pxor @XMM[10], @XMM[4] 1823 movdqu @XMM[0], 0x00($out) # write output 1824 movdqu @XMM[1], 0x10($out) 1825 movdqu @XMM[6], 0x20($out) 1826 movdqu @XMM[4], 0x30($out) 1827 jmp .Lcbc_dec_done 1828.align 16 1829.Lcbc_dec_three: 1830 movdqa @XMM[15], 0x20(%rbp) # put aside IV 1831 call _bsaes_decrypt8 1832 pxor 0x20(%rbp), @XMM[0] # ^= IV 1833 movdqu 0x00($inp), @XMM[8] # re-load input 1834 movdqu 0x10($inp), @XMM[9] 1835 pxor @XMM[8], @XMM[1] 1836 movdqu 0x20($inp), @XMM[15] # IV 1837 pxor @XMM[9], @XMM[6] 1838 movdqu @XMM[0], 0x00($out) # write output 1839 movdqu @XMM[1], 0x10($out) 1840 movdqu @XMM[6], 0x20($out) 1841 jmp .Lcbc_dec_done 1842.align 16 1843.Lcbc_dec_two: 1844 movdqa @XMM[15], 0x20(%rbp) # put aside IV 1845 call _bsaes_decrypt8 1846 pxor 0x20(%rbp), @XMM[0] # ^= IV 1847 movdqu 0x00($inp), @XMM[8] # re-load input 1848 movdqu 0x10($inp), @XMM[15] # IV 1849 pxor @XMM[8], @XMM[1] 1850 movdqu @XMM[0], 0x00($out) # write output 1851 movdqu @XMM[1], 0x10($out) 1852 jmp .Lcbc_dec_done 1853.align 16 1854.Lcbc_dec_one: 1855 lea ($inp), $arg1 1856 lea 0x20(%rbp), $arg2 # buffer output 1857 lea ($key), $arg3 1858 call asm_AES_decrypt # doesn't touch %xmm 1859 pxor 0x20(%rbp), @XMM[15] # ^= IV 1860 movdqu @XMM[15], ($out) # write output 1861 movdqa @XMM[0], @XMM[15] # IV 1862 1863.Lcbc_dec_done: 1864 movdqu @XMM[15], (%rbx) # return IV 1865 lea (%rsp), %rax 1866 pxor %xmm0, %xmm0 1867.Lcbc_dec_bzero: # wipe key schedule [if any] 1868 movdqa %xmm0, 0x00(%rax) 1869 movdqa %xmm0, 0x10(%rax) 1870 lea 0x20(%rax), %rax 1871 cmp %rax, %rbp 1872 ja .Lcbc_dec_bzero 1873 1874 lea 0x78(%rbp),%rax 1875.cfi_def_cfa %rax,8 1876___ 1877$code.=<<___ if ($win64); 1878 movaps 0x40(%rbp), %xmm6 1879 movaps 0x50(%rbp), %xmm7 1880 movaps 0x60(%rbp), %xmm8 1881 movaps 0x70(%rbp), %xmm9 1882 movaps 0x80(%rbp), %xmm10 1883 movaps 0x90(%rbp), %xmm11 1884 movaps 0xa0(%rbp), %xmm12 1885 movaps 0xb0(%rbp), %xmm13 1886 movaps 0xc0(%rbp), %xmm14 1887 movaps 0xd0(%rbp), %xmm15 1888 lea 0xa0(%rax), %rax 1889.Lcbc_dec_tail: 1890___ 1891$code.=<<___; 1892 mov -48(%rax), %r15 1893.cfi_restore %r15 1894 mov -40(%rax), %r14 1895.cfi_restore %r14 1896 mov -32(%rax), %r13 1897.cfi_restore %r13 1898 mov -24(%rax), %r12 1899.cfi_restore %r12 1900 mov -16(%rax), %rbx 1901.cfi_restore %rbx 1902 mov -8(%rax), %rbp 1903.cfi_restore %rbp 1904 lea (%rax), %rsp # restore %rsp 1905.cfi_def_cfa_register %rsp 1906.Lcbc_dec_epilogue: 1907 ret 1908.cfi_endproc 1909.size bsaes_cbc_encrypt,.-bsaes_cbc_encrypt 1910 1911.globl bsaes_ctr32_encrypt_blocks 1912.type bsaes_ctr32_encrypt_blocks,\@abi-omnipotent 1913.align 16 1914bsaes_ctr32_encrypt_blocks: 1915.cfi_startproc 1916 mov %rsp, %rax 1917.Lctr_enc_prologue: 1918 push %rbp 1919.cfi_push %rbp 1920 push %rbx 1921.cfi_push %rbx 1922 push %r12 1923.cfi_push %r12 1924 push %r13 1925.cfi_push %r13 1926 push %r14 1927.cfi_push %r14 1928 push %r15 1929.cfi_push %r15 1930 lea -0x48(%rsp), %rsp 1931.cfi_adjust_cfa_offset 0x48 1932___ 1933$code.=<<___ if ($win64); 1934 mov 0xa0(%rsp),$arg5 # pull ivp 1935 lea -0xa0(%rsp), %rsp 1936 movaps %xmm6, 0x40(%rsp) 1937 movaps %xmm7, 0x50(%rsp) 1938 movaps %xmm8, 0x60(%rsp) 1939 movaps %xmm9, 0x70(%rsp) 1940 movaps %xmm10, 0x80(%rsp) 1941 movaps %xmm11, 0x90(%rsp) 1942 movaps %xmm12, 0xa0(%rsp) 1943 movaps %xmm13, 0xb0(%rsp) 1944 movaps %xmm14, 0xc0(%rsp) 1945 movaps %xmm15, 0xd0(%rsp) 1946.Lctr_enc_body: 1947___ 1948$code.=<<___; 1949 mov %rsp, %rbp # backup %rsp 1950.cfi_def_cfa_register %rbp 1951 movdqu ($arg5), %xmm0 # load counter 1952 mov 240($arg4), %eax # rounds 1953 mov $arg1, $inp # backup arguments 1954 mov $arg2, $out 1955 mov $arg3, $len 1956 mov $arg4, $key 1957 movdqa %xmm0, 0x20(%rbp) # copy counter 1958 cmp \$8, $arg3 1959 jb .Lctr_enc_short 1960 1961 mov %eax, %ebx # rounds 1962 shl \$7, %rax # 128 bytes per inner round key 1963 sub \$`128-32`, %rax # size of bit-sliced key schedule 1964 sub %rax, %rsp 1965 1966 mov %rsp, %rax # pass key schedule 1967 mov $key, %rcx # pass key 1968 mov %ebx, %r10d # pass rounds 1969 call _bsaes_key_convert 1970 pxor %xmm6,%xmm7 # fix up last round key 1971 movdqa %xmm7,(%rax) # save last round key 1972 1973 movdqa (%rsp), @XMM[9] # load round0 key 1974 lea .LADD1(%rip), %r11 1975 movdqa 0x20(%rbp), @XMM[0] # counter copy 1976 movdqa -0x20(%r11), @XMM[8] # .LSWPUP 1977 pshufb @XMM[8], @XMM[9] # byte swap upper part 1978 pshufb @XMM[8], @XMM[0] 1979 movdqa @XMM[9], (%rsp) # save adjusted round0 key 1980 jmp .Lctr_enc_loop 1981.align 16 1982.Lctr_enc_loop: 1983 movdqa @XMM[0], 0x20(%rbp) # save counter 1984 movdqa @XMM[0], @XMM[1] # prepare 8 counter values 1985 movdqa @XMM[0], @XMM[2] 1986 paddd 0x00(%r11), @XMM[1] # .LADD1 1987 movdqa @XMM[0], @XMM[3] 1988 paddd 0x10(%r11), @XMM[2] # .LADD2 1989 movdqa @XMM[0], @XMM[4] 1990 paddd 0x20(%r11), @XMM[3] # .LADD3 1991 movdqa @XMM[0], @XMM[5] 1992 paddd 0x30(%r11), @XMM[4] # .LADD4 1993 movdqa @XMM[0], @XMM[6] 1994 paddd 0x40(%r11), @XMM[5] # .LADD5 1995 movdqa @XMM[0], @XMM[7] 1996 paddd 0x50(%r11), @XMM[6] # .LADD6 1997 paddd 0x60(%r11), @XMM[7] # .LADD7 1998 1999 # Borrow prologue from _bsaes_encrypt8 to use the opportunity 2000 # to flip byte order in 32-bit counter 2001 movdqa (%rsp), @XMM[9] # round 0 key 2002 lea 0x10(%rsp), %rax # pass key schedule 2003 movdqa -0x10(%r11), @XMM[8] # .LSWPUPM0SR 2004 pxor @XMM[9], @XMM[0] # xor with round0 key 2005 pxor @XMM[9], @XMM[1] 2006 pxor @XMM[9], @XMM[2] 2007 pxor @XMM[9], @XMM[3] 2008 pshufb @XMM[8], @XMM[0] 2009 pshufb @XMM[8], @XMM[1] 2010 pxor @XMM[9], @XMM[4] 2011 pxor @XMM[9], @XMM[5] 2012 pshufb @XMM[8], @XMM[2] 2013 pshufb @XMM[8], @XMM[3] 2014 pxor @XMM[9], @XMM[6] 2015 pxor @XMM[9], @XMM[7] 2016 pshufb @XMM[8], @XMM[4] 2017 pshufb @XMM[8], @XMM[5] 2018 pshufb @XMM[8], @XMM[6] 2019 pshufb @XMM[8], @XMM[7] 2020 lea .LBS0(%rip), %r11 # constants table 2021 mov %ebx,%r10d # pass rounds 2022 2023 call _bsaes_encrypt8_bitslice 2024 2025 sub \$8,$len 2026 jc .Lctr_enc_loop_done 2027 2028 movdqu 0x00($inp), @XMM[8] # load input 2029 movdqu 0x10($inp), @XMM[9] 2030 movdqu 0x20($inp), @XMM[10] 2031 movdqu 0x30($inp), @XMM[11] 2032 movdqu 0x40($inp), @XMM[12] 2033 movdqu 0x50($inp), @XMM[13] 2034 movdqu 0x60($inp), @XMM[14] 2035 movdqu 0x70($inp), @XMM[15] 2036 lea 0x80($inp),$inp 2037 pxor @XMM[0], @XMM[8] 2038 movdqa 0x20(%rbp), @XMM[0] # load counter 2039 pxor @XMM[9], @XMM[1] 2040 movdqu @XMM[8], 0x00($out) # write output 2041 pxor @XMM[10], @XMM[4] 2042 movdqu @XMM[1], 0x10($out) 2043 pxor @XMM[11], @XMM[6] 2044 movdqu @XMM[4], 0x20($out) 2045 pxor @XMM[12], @XMM[3] 2046 movdqu @XMM[6], 0x30($out) 2047 pxor @XMM[13], @XMM[7] 2048 movdqu @XMM[3], 0x40($out) 2049 pxor @XMM[14], @XMM[2] 2050 movdqu @XMM[7], 0x50($out) 2051 pxor @XMM[15], @XMM[5] 2052 movdqu @XMM[2], 0x60($out) 2053 lea .LADD1(%rip), %r11 2054 movdqu @XMM[5], 0x70($out) 2055 lea 0x80($out), $out 2056 paddd 0x70(%r11), @XMM[0] # .LADD8 2057 jnz .Lctr_enc_loop 2058 2059 jmp .Lctr_enc_done 2060.align 16 2061.Lctr_enc_loop_done: 2062 add \$8, $len 2063 movdqu 0x00($inp), @XMM[8] # load input 2064 pxor @XMM[8], @XMM[0] 2065 movdqu @XMM[0], 0x00($out) # write output 2066 cmp \$2,$len 2067 jb .Lctr_enc_done 2068 movdqu 0x10($inp), @XMM[9] 2069 pxor @XMM[9], @XMM[1] 2070 movdqu @XMM[1], 0x10($out) 2071 je .Lctr_enc_done 2072 movdqu 0x20($inp), @XMM[10] 2073 pxor @XMM[10], @XMM[4] 2074 movdqu @XMM[4], 0x20($out) 2075 cmp \$4,$len 2076 jb .Lctr_enc_done 2077 movdqu 0x30($inp), @XMM[11] 2078 pxor @XMM[11], @XMM[6] 2079 movdqu @XMM[6], 0x30($out) 2080 je .Lctr_enc_done 2081 movdqu 0x40($inp), @XMM[12] 2082 pxor @XMM[12], @XMM[3] 2083 movdqu @XMM[3], 0x40($out) 2084 cmp \$6,$len 2085 jb .Lctr_enc_done 2086 movdqu 0x50($inp), @XMM[13] 2087 pxor @XMM[13], @XMM[7] 2088 movdqu @XMM[7], 0x50($out) 2089 je .Lctr_enc_done 2090 movdqu 0x60($inp), @XMM[14] 2091 pxor @XMM[14], @XMM[2] 2092 movdqu @XMM[2], 0x60($out) 2093 jmp .Lctr_enc_done 2094 2095.align 16 2096.Lctr_enc_short: 2097 lea 0x20(%rbp), $arg1 2098 lea 0x30(%rbp), $arg2 2099 lea ($key), $arg3 2100 call asm_AES_encrypt 2101 movdqu ($inp), @XMM[1] 2102 lea 16($inp), $inp 2103 mov 0x2c(%rbp), %eax # load 32-bit counter 2104 bswap %eax 2105 pxor 0x30(%rbp), @XMM[1] 2106 inc %eax # increment 2107 movdqu @XMM[1], ($out) 2108 bswap %eax 2109 lea 16($out), $out 2110 mov %eax, 0x2c(%rsp) # save 32-bit counter 2111 dec $len 2112 jnz .Lctr_enc_short 2113 2114.Lctr_enc_done: 2115 lea (%rsp), %rax 2116 pxor %xmm0, %xmm0 2117.Lctr_enc_bzero: # wipe key schedule [if any] 2118 movdqa %xmm0, 0x00(%rax) 2119 movdqa %xmm0, 0x10(%rax) 2120 lea 0x20(%rax), %rax 2121 cmp %rax, %rbp 2122 ja .Lctr_enc_bzero 2123 2124 lea 0x78(%rbp),%rax 2125.cfi_def_cfa %rax,8 2126___ 2127$code.=<<___ if ($win64); 2128 movaps 0x40(%rbp), %xmm6 2129 movaps 0x50(%rbp), %xmm7 2130 movaps 0x60(%rbp), %xmm8 2131 movaps 0x70(%rbp), %xmm9 2132 movaps 0x80(%rbp), %xmm10 2133 movaps 0x90(%rbp), %xmm11 2134 movaps 0xa0(%rbp), %xmm12 2135 movaps 0xb0(%rbp), %xmm13 2136 movaps 0xc0(%rbp), %xmm14 2137 movaps 0xd0(%rbp), %xmm15 2138 lea 0xa0(%rax), %rax 2139.Lctr_enc_tail: 2140___ 2141$code.=<<___; 2142 mov -48(%rax), %r15 2143.cfi_restore %r15 2144 mov -40(%rax), %r14 2145.cfi_restore %r14 2146 mov -32(%rax), %r13 2147.cfi_restore %r13 2148 mov -24(%rax), %r12 2149.cfi_restore %r12 2150 mov -16(%rax), %rbx 2151.cfi_restore %rbx 2152 mov -8(%rax), %rbp 2153.cfi_restore %rbp 2154 lea (%rax), %rsp # restore %rsp 2155.cfi_def_cfa_register %rsp 2156.Lctr_enc_epilogue: 2157 ret 2158.cfi_endproc 2159.size bsaes_ctr32_encrypt_blocks,.-bsaes_ctr32_encrypt_blocks 2160___ 2161###################################################################### 2162# void bsaes_xts_[en|de]crypt(const char *inp,char *out,size_t len, 2163# const AES_KEY *key1, const AES_KEY *key2, 2164# const unsigned char iv[16]); 2165# 2166my ($twmask,$twres,$twtmp)=@XMM[13..15]; 2167$arg6=~s/d$//; 2168 2169$code.=<<___; 2170.globl bsaes_xts_encrypt 2171.type bsaes_xts_encrypt,\@abi-omnipotent 2172.align 16 2173bsaes_xts_encrypt: 2174.cfi_startproc 2175 mov %rsp, %rax 2176.Lxts_enc_prologue: 2177 push %rbp 2178.cfi_push %rbp 2179 push %rbx 2180.cfi_push %rbx 2181 push %r12 2182.cfi_push %r12 2183 push %r13 2184.cfi_push %r13 2185 push %r14 2186.cfi_push %r14 2187 push %r15 2188.cfi_push %r15 2189 lea -0x48(%rsp), %rsp 2190.cfi_adjust_cfa_offset 0x48 2191___ 2192$code.=<<___ if ($win64); 2193 mov 0xa0(%rsp),$arg5 # pull key2 2194 mov 0xa8(%rsp),$arg6 # pull ivp 2195 lea -0xa0(%rsp), %rsp 2196 movaps %xmm6, 0x40(%rsp) 2197 movaps %xmm7, 0x50(%rsp) 2198 movaps %xmm8, 0x60(%rsp) 2199 movaps %xmm9, 0x70(%rsp) 2200 movaps %xmm10, 0x80(%rsp) 2201 movaps %xmm11, 0x90(%rsp) 2202 movaps %xmm12, 0xa0(%rsp) 2203 movaps %xmm13, 0xb0(%rsp) 2204 movaps %xmm14, 0xc0(%rsp) 2205 movaps %xmm15, 0xd0(%rsp) 2206.Lxts_enc_body: 2207___ 2208$code.=<<___; 2209 mov %rsp, %rbp # backup %rsp 2210.cfi_def_cfa_register %rbp 2211 mov $arg1, $inp # backup arguments 2212 mov $arg2, $out 2213 mov $arg3, $len 2214 mov $arg4, $key 2215 2216 lea ($arg6), $arg1 2217 lea 0x20(%rbp), $arg2 2218 lea ($arg5), $arg3 2219 call asm_AES_encrypt # generate initial tweak 2220 2221 mov 240($key), %eax # rounds 2222 mov $len, %rbx # backup $len 2223 2224 mov %eax, %edx # rounds 2225 shl \$7, %rax # 128 bytes per inner round key 2226 sub \$`128-32`, %rax # size of bit-sliced key schedule 2227 sub %rax, %rsp 2228 2229 mov %rsp, %rax # pass key schedule 2230 mov $key, %rcx # pass key 2231 mov %edx, %r10d # pass rounds 2232 call _bsaes_key_convert 2233 pxor %xmm6, %xmm7 # fix up last round key 2234 movdqa %xmm7, (%rax) # save last round key 2235 2236 and \$-16, $len 2237 sub \$0x80, %rsp # place for tweak[8] 2238 movdqa 0x20(%rbp), @XMM[7] # initial tweak 2239 2240 pxor $twtmp, $twtmp 2241 movdqa .Lxts_magic(%rip), $twmask 2242 pcmpgtd @XMM[7], $twtmp # broadcast upper bits 2243 2244 sub \$0x80, $len 2245 jc .Lxts_enc_short 2246 jmp .Lxts_enc_loop 2247 2248.align 16 2249.Lxts_enc_loop: 2250___ 2251 for ($i=0;$i<7;$i++) { 2252 $code.=<<___; 2253 pshufd \$0x13, $twtmp, $twres 2254 pxor $twtmp, $twtmp 2255 movdqa @XMM[7], @XMM[$i] 2256 movdqa @XMM[7], `0x10*$i`(%rsp)# save tweak[$i] 2257 paddq @XMM[7], @XMM[7] # psllq 1,$tweak 2258 pand $twmask, $twres # isolate carry and residue 2259 pcmpgtd @XMM[7], $twtmp # broadcast upper bits 2260 pxor $twres, @XMM[7] 2261___ 2262 $code.=<<___ if ($i>=1); 2263 movdqu `0x10*($i-1)`($inp), @XMM[8+$i-1] 2264___ 2265 $code.=<<___ if ($i>=2); 2266 pxor @XMM[8+$i-2], @XMM[$i-2]# input[] ^ tweak[] 2267___ 2268 } 2269$code.=<<___; 2270 movdqu 0x60($inp), @XMM[8+6] 2271 pxor @XMM[8+5], @XMM[5] 2272 movdqu 0x70($inp), @XMM[8+7] 2273 lea 0x80($inp), $inp 2274 movdqa @XMM[7], 0x70(%rsp) 2275 pxor @XMM[8+6], @XMM[6] 2276 lea 0x80(%rsp), %rax # pass key schedule 2277 pxor @XMM[8+7], @XMM[7] 2278 mov %edx, %r10d # pass rounds 2279 2280 call _bsaes_encrypt8 2281 2282 pxor 0x00(%rsp), @XMM[0] # ^= tweak[] 2283 pxor 0x10(%rsp), @XMM[1] 2284 movdqu @XMM[0], 0x00($out) # write output 2285 pxor 0x20(%rsp), @XMM[4] 2286 movdqu @XMM[1], 0x10($out) 2287 pxor 0x30(%rsp), @XMM[6] 2288 movdqu @XMM[4], 0x20($out) 2289 pxor 0x40(%rsp), @XMM[3] 2290 movdqu @XMM[6], 0x30($out) 2291 pxor 0x50(%rsp), @XMM[7] 2292 movdqu @XMM[3], 0x40($out) 2293 pxor 0x60(%rsp), @XMM[2] 2294 movdqu @XMM[7], 0x50($out) 2295 pxor 0x70(%rsp), @XMM[5] 2296 movdqu @XMM[2], 0x60($out) 2297 movdqu @XMM[5], 0x70($out) 2298 lea 0x80($out), $out 2299 2300 movdqa 0x70(%rsp), @XMM[7] # prepare next iteration tweak 2301 pxor $twtmp, $twtmp 2302 movdqa .Lxts_magic(%rip), $twmask 2303 pcmpgtd @XMM[7], $twtmp 2304 pshufd \$0x13, $twtmp, $twres 2305 pxor $twtmp, $twtmp 2306 paddq @XMM[7], @XMM[7] # psllq 1,$tweak 2307 pand $twmask, $twres # isolate carry and residue 2308 pcmpgtd @XMM[7], $twtmp # broadcast upper bits 2309 pxor $twres, @XMM[7] 2310 2311 sub \$0x80,$len 2312 jnc .Lxts_enc_loop 2313 2314.Lxts_enc_short: 2315 add \$0x80, $len 2316 jz .Lxts_enc_done 2317___ 2318 for ($i=0;$i<7;$i++) { 2319 $code.=<<___; 2320 pshufd \$0x13, $twtmp, $twres 2321 pxor $twtmp, $twtmp 2322 movdqa @XMM[7], @XMM[$i] 2323 movdqa @XMM[7], `0x10*$i`(%rsp)# save tweak[$i] 2324 paddq @XMM[7], @XMM[7] # psllq 1,$tweak 2325 pand $twmask, $twres # isolate carry and residue 2326 pcmpgtd @XMM[7], $twtmp # broadcast upper bits 2327 pxor $twres, @XMM[7] 2328___ 2329 $code.=<<___ if ($i>=1); 2330 movdqu `0x10*($i-1)`($inp), @XMM[8+$i-1] 2331 cmp \$`0x10*$i`,$len 2332 je .Lxts_enc_$i 2333___ 2334 $code.=<<___ if ($i>=2); 2335 pxor @XMM[8+$i-2], @XMM[$i-2]# input[] ^ tweak[] 2336___ 2337 } 2338$code.=<<___; 2339 movdqu 0x60($inp), @XMM[8+6] 2340 pxor @XMM[8+5], @XMM[5] 2341 movdqa @XMM[7], 0x70(%rsp) 2342 lea 0x70($inp), $inp 2343 pxor @XMM[8+6], @XMM[6] 2344 lea 0x80(%rsp), %rax # pass key schedule 2345 mov %edx, %r10d # pass rounds 2346 2347 call _bsaes_encrypt8 2348 2349 pxor 0x00(%rsp), @XMM[0] # ^= tweak[] 2350 pxor 0x10(%rsp), @XMM[1] 2351 movdqu @XMM[0], 0x00($out) # write output 2352 pxor 0x20(%rsp), @XMM[4] 2353 movdqu @XMM[1], 0x10($out) 2354 pxor 0x30(%rsp), @XMM[6] 2355 movdqu @XMM[4], 0x20($out) 2356 pxor 0x40(%rsp), @XMM[3] 2357 movdqu @XMM[6], 0x30($out) 2358 pxor 0x50(%rsp), @XMM[7] 2359 movdqu @XMM[3], 0x40($out) 2360 pxor 0x60(%rsp), @XMM[2] 2361 movdqu @XMM[7], 0x50($out) 2362 movdqu @XMM[2], 0x60($out) 2363 lea 0x70($out), $out 2364 2365 movdqa 0x70(%rsp), @XMM[7] # next iteration tweak 2366 jmp .Lxts_enc_done 2367.align 16 2368.Lxts_enc_6: 2369 pxor @XMM[8+4], @XMM[4] 2370 lea 0x60($inp), $inp 2371 pxor @XMM[8+5], @XMM[5] 2372 lea 0x80(%rsp), %rax # pass key schedule 2373 mov %edx, %r10d # pass rounds 2374 2375 call _bsaes_encrypt8 2376 2377 pxor 0x00(%rsp), @XMM[0] # ^= tweak[] 2378 pxor 0x10(%rsp), @XMM[1] 2379 movdqu @XMM[0], 0x00($out) # write output 2380 pxor 0x20(%rsp), @XMM[4] 2381 movdqu @XMM[1], 0x10($out) 2382 pxor 0x30(%rsp), @XMM[6] 2383 movdqu @XMM[4], 0x20($out) 2384 pxor 0x40(%rsp), @XMM[3] 2385 movdqu @XMM[6], 0x30($out) 2386 pxor 0x50(%rsp), @XMM[7] 2387 movdqu @XMM[3], 0x40($out) 2388 movdqu @XMM[7], 0x50($out) 2389 lea 0x60($out), $out 2390 2391 movdqa 0x60(%rsp), @XMM[7] # next iteration tweak 2392 jmp .Lxts_enc_done 2393.align 16 2394.Lxts_enc_5: 2395 pxor @XMM[8+3], @XMM[3] 2396 lea 0x50($inp), $inp 2397 pxor @XMM[8+4], @XMM[4] 2398 lea 0x80(%rsp), %rax # pass key schedule 2399 mov %edx, %r10d # pass rounds 2400 2401 call _bsaes_encrypt8 2402 2403 pxor 0x00(%rsp), @XMM[0] # ^= tweak[] 2404 pxor 0x10(%rsp), @XMM[1] 2405 movdqu @XMM[0], 0x00($out) # write output 2406 pxor 0x20(%rsp), @XMM[4] 2407 movdqu @XMM[1], 0x10($out) 2408 pxor 0x30(%rsp), @XMM[6] 2409 movdqu @XMM[4], 0x20($out) 2410 pxor 0x40(%rsp), @XMM[3] 2411 movdqu @XMM[6], 0x30($out) 2412 movdqu @XMM[3], 0x40($out) 2413 lea 0x50($out), $out 2414 2415 movdqa 0x50(%rsp), @XMM[7] # next iteration tweak 2416 jmp .Lxts_enc_done 2417.align 16 2418.Lxts_enc_4: 2419 pxor @XMM[8+2], @XMM[2] 2420 lea 0x40($inp), $inp 2421 pxor @XMM[8+3], @XMM[3] 2422 lea 0x80(%rsp), %rax # pass key schedule 2423 mov %edx, %r10d # pass rounds 2424 2425 call _bsaes_encrypt8 2426 2427 pxor 0x00(%rsp), @XMM[0] # ^= tweak[] 2428 pxor 0x10(%rsp), @XMM[1] 2429 movdqu @XMM[0], 0x00($out) # write output 2430 pxor 0x20(%rsp), @XMM[4] 2431 movdqu @XMM[1], 0x10($out) 2432 pxor 0x30(%rsp), @XMM[6] 2433 movdqu @XMM[4], 0x20($out) 2434 movdqu @XMM[6], 0x30($out) 2435 lea 0x40($out), $out 2436 2437 movdqa 0x40(%rsp), @XMM[7] # next iteration tweak 2438 jmp .Lxts_enc_done 2439.align 16 2440.Lxts_enc_3: 2441 pxor @XMM[8+1], @XMM[1] 2442 lea 0x30($inp), $inp 2443 pxor @XMM[8+2], @XMM[2] 2444 lea 0x80(%rsp), %rax # pass key schedule 2445 mov %edx, %r10d # pass rounds 2446 2447 call _bsaes_encrypt8 2448 2449 pxor 0x00(%rsp), @XMM[0] # ^= tweak[] 2450 pxor 0x10(%rsp), @XMM[1] 2451 movdqu @XMM[0], 0x00($out) # write output 2452 pxor 0x20(%rsp), @XMM[4] 2453 movdqu @XMM[1], 0x10($out) 2454 movdqu @XMM[4], 0x20($out) 2455 lea 0x30($out), $out 2456 2457 movdqa 0x30(%rsp), @XMM[7] # next iteration tweak 2458 jmp .Lxts_enc_done 2459.align 16 2460.Lxts_enc_2: 2461 pxor @XMM[8+0], @XMM[0] 2462 lea 0x20($inp), $inp 2463 pxor @XMM[8+1], @XMM[1] 2464 lea 0x80(%rsp), %rax # pass key schedule 2465 mov %edx, %r10d # pass rounds 2466 2467 call _bsaes_encrypt8 2468 2469 pxor 0x00(%rsp), @XMM[0] # ^= tweak[] 2470 pxor 0x10(%rsp), @XMM[1] 2471 movdqu @XMM[0], 0x00($out) # write output 2472 movdqu @XMM[1], 0x10($out) 2473 lea 0x20($out), $out 2474 2475 movdqa 0x20(%rsp), @XMM[7] # next iteration tweak 2476 jmp .Lxts_enc_done 2477.align 16 2478.Lxts_enc_1: 2479 pxor @XMM[0], @XMM[8] 2480 lea 0x10($inp), $inp 2481 movdqa @XMM[8], 0x20(%rbp) 2482 lea 0x20(%rbp), $arg1 2483 lea 0x20(%rbp), $arg2 2484 lea ($key), $arg3 2485 call asm_AES_encrypt # doesn't touch %xmm 2486 pxor 0x20(%rbp), @XMM[0] # ^= tweak[] 2487 #pxor @XMM[8], @XMM[0] 2488 #lea 0x80(%rsp), %rax # pass key schedule 2489 #mov %edx, %r10d # pass rounds 2490 #call _bsaes_encrypt8 2491 #pxor 0x00(%rsp), @XMM[0] # ^= tweak[] 2492 movdqu @XMM[0], 0x00($out) # write output 2493 lea 0x10($out), $out 2494 2495 movdqa 0x10(%rsp), @XMM[7] # next iteration tweak 2496 2497.Lxts_enc_done: 2498 and \$15, %ebx 2499 jz .Lxts_enc_ret 2500 mov $out, %rdx 2501 2502.Lxts_enc_steal: 2503 movzb ($inp), %eax 2504 movzb -16(%rdx), %ecx 2505 lea 1($inp), $inp 2506 mov %al, -16(%rdx) 2507 mov %cl, 0(%rdx) 2508 lea 1(%rdx), %rdx 2509 sub \$1,%ebx 2510 jnz .Lxts_enc_steal 2511 2512 movdqu -16($out), @XMM[0] 2513 lea 0x20(%rbp), $arg1 2514 pxor @XMM[7], @XMM[0] 2515 lea 0x20(%rbp), $arg2 2516 movdqa @XMM[0], 0x20(%rbp) 2517 lea ($key), $arg3 2518 call asm_AES_encrypt # doesn't touch %xmm 2519 pxor 0x20(%rbp), @XMM[7] 2520 movdqu @XMM[7], -16($out) 2521 2522.Lxts_enc_ret: 2523 lea (%rsp), %rax 2524 pxor %xmm0, %xmm0 2525.Lxts_enc_bzero: # wipe key schedule [if any] 2526 movdqa %xmm0, 0x00(%rax) 2527 movdqa %xmm0, 0x10(%rax) 2528 lea 0x20(%rax), %rax 2529 cmp %rax, %rbp 2530 ja .Lxts_enc_bzero 2531 2532 lea 0x78(%rbp),%rax 2533.cfi_def_cfa %rax,8 2534___ 2535$code.=<<___ if ($win64); 2536 movaps 0x40(%rbp), %xmm6 2537 movaps 0x50(%rbp), %xmm7 2538 movaps 0x60(%rbp), %xmm8 2539 movaps 0x70(%rbp), %xmm9 2540 movaps 0x80(%rbp), %xmm10 2541 movaps 0x90(%rbp), %xmm11 2542 movaps 0xa0(%rbp), %xmm12 2543 movaps 0xb0(%rbp), %xmm13 2544 movaps 0xc0(%rbp), %xmm14 2545 movaps 0xd0(%rbp), %xmm15 2546 lea 0xa0(%rax), %rax 2547.Lxts_enc_tail: 2548___ 2549$code.=<<___; 2550 mov -48(%rax), %r15 2551.cfi_restore %r15 2552 mov -40(%rax), %r14 2553.cfi_restore %r14 2554 mov -32(%rax), %r13 2555.cfi_restore %r13 2556 mov -24(%rax), %r12 2557.cfi_restore %r12 2558 mov -16(%rax), %rbx 2559.cfi_restore %rbx 2560 mov -8(%rax), %rbp 2561.cfi_restore %rbp 2562 lea (%rax), %rsp # restore %rsp 2563.cfi_def_cfa_register %rsp 2564.Lxts_enc_epilogue: 2565 ret 2566.cfi_endproc 2567.size bsaes_xts_encrypt,.-bsaes_xts_encrypt 2568 2569.globl bsaes_xts_decrypt 2570.type bsaes_xts_decrypt,\@abi-omnipotent 2571.align 16 2572bsaes_xts_decrypt: 2573.cfi_startproc 2574 mov %rsp, %rax 2575.Lxts_dec_prologue: 2576 push %rbp 2577.cfi_push %rbp 2578 push %rbx 2579.cfi_push %rbx 2580 push %r12 2581.cfi_push %r12 2582 push %r13 2583.cfi_push %r13 2584 push %r14 2585.cfi_push %r14 2586 push %r15 2587.cfi_push %r15 2588 lea -0x48(%rsp), %rsp 2589.cfi_adjust_cfa_offset 0x48 2590___ 2591$code.=<<___ if ($win64); 2592 mov 0xa0(%rsp),$arg5 # pull key2 2593 mov 0xa8(%rsp),$arg6 # pull ivp 2594 lea -0xa0(%rsp), %rsp 2595 movaps %xmm6, 0x40(%rsp) 2596 movaps %xmm7, 0x50(%rsp) 2597 movaps %xmm8, 0x60(%rsp) 2598 movaps %xmm9, 0x70(%rsp) 2599 movaps %xmm10, 0x80(%rsp) 2600 movaps %xmm11, 0x90(%rsp) 2601 movaps %xmm12, 0xa0(%rsp) 2602 movaps %xmm13, 0xb0(%rsp) 2603 movaps %xmm14, 0xc0(%rsp) 2604 movaps %xmm15, 0xd0(%rsp) 2605.Lxts_dec_body: 2606___ 2607$code.=<<___; 2608 mov %rsp, %rbp # backup %rsp 2609 mov $arg1, $inp # backup arguments 2610 mov $arg2, $out 2611 mov $arg3, $len 2612 mov $arg4, $key 2613 2614 lea ($arg6), $arg1 2615 lea 0x20(%rbp), $arg2 2616 lea ($arg5), $arg3 2617 call asm_AES_encrypt # generate initial tweak 2618 2619 mov 240($key), %eax # rounds 2620 mov $len, %rbx # backup $len 2621 2622 mov %eax, %edx # rounds 2623 shl \$7, %rax # 128 bytes per inner round key 2624 sub \$`128-32`, %rax # size of bit-sliced key schedule 2625 sub %rax, %rsp 2626 2627 mov %rsp, %rax # pass key schedule 2628 mov $key, %rcx # pass key 2629 mov %edx, %r10d # pass rounds 2630 call _bsaes_key_convert 2631 pxor (%rsp), %xmm7 # fix up round 0 key 2632 movdqa %xmm6, (%rax) # save last round key 2633 movdqa %xmm7, (%rsp) 2634 2635 xor %eax, %eax # if ($len%16) len-=16; 2636 and \$-16, $len 2637 test \$15, %ebx 2638 setnz %al 2639 shl \$4, %rax 2640 sub %rax, $len 2641 2642 sub \$0x80, %rsp # place for tweak[8] 2643 movdqa 0x20(%rbp), @XMM[7] # initial tweak 2644 2645 pxor $twtmp, $twtmp 2646 movdqa .Lxts_magic(%rip), $twmask 2647 pcmpgtd @XMM[7], $twtmp # broadcast upper bits 2648 2649 sub \$0x80, $len 2650 jc .Lxts_dec_short 2651 jmp .Lxts_dec_loop 2652 2653.align 16 2654.Lxts_dec_loop: 2655___ 2656 for ($i=0;$i<7;$i++) { 2657 $code.=<<___; 2658 pshufd \$0x13, $twtmp, $twres 2659 pxor $twtmp, $twtmp 2660 movdqa @XMM[7], @XMM[$i] 2661 movdqa @XMM[7], `0x10*$i`(%rsp)# save tweak[$i] 2662 paddq @XMM[7], @XMM[7] # psllq 1,$tweak 2663 pand $twmask, $twres # isolate carry and residue 2664 pcmpgtd @XMM[7], $twtmp # broadcast upper bits 2665 pxor $twres, @XMM[7] 2666___ 2667 $code.=<<___ if ($i>=1); 2668 movdqu `0x10*($i-1)`($inp), @XMM[8+$i-1] 2669___ 2670 $code.=<<___ if ($i>=2); 2671 pxor @XMM[8+$i-2], @XMM[$i-2]# input[] ^ tweak[] 2672___ 2673 } 2674$code.=<<___; 2675 movdqu 0x60($inp), @XMM[8+6] 2676 pxor @XMM[8+5], @XMM[5] 2677 movdqu 0x70($inp), @XMM[8+7] 2678 lea 0x80($inp), $inp 2679 movdqa @XMM[7], 0x70(%rsp) 2680 pxor @XMM[8+6], @XMM[6] 2681 lea 0x80(%rsp), %rax # pass key schedule 2682 pxor @XMM[8+7], @XMM[7] 2683 mov %edx, %r10d # pass rounds 2684 2685 call _bsaes_decrypt8 2686 2687 pxor 0x00(%rsp), @XMM[0] # ^= tweak[] 2688 pxor 0x10(%rsp), @XMM[1] 2689 movdqu @XMM[0], 0x00($out) # write output 2690 pxor 0x20(%rsp), @XMM[6] 2691 movdqu @XMM[1], 0x10($out) 2692 pxor 0x30(%rsp), @XMM[4] 2693 movdqu @XMM[6], 0x20($out) 2694 pxor 0x40(%rsp), @XMM[2] 2695 movdqu @XMM[4], 0x30($out) 2696 pxor 0x50(%rsp), @XMM[7] 2697 movdqu @XMM[2], 0x40($out) 2698 pxor 0x60(%rsp), @XMM[3] 2699 movdqu @XMM[7], 0x50($out) 2700 pxor 0x70(%rsp), @XMM[5] 2701 movdqu @XMM[3], 0x60($out) 2702 movdqu @XMM[5], 0x70($out) 2703 lea 0x80($out), $out 2704 2705 movdqa 0x70(%rsp), @XMM[7] # prepare next iteration tweak 2706 pxor $twtmp, $twtmp 2707 movdqa .Lxts_magic(%rip), $twmask 2708 pcmpgtd @XMM[7], $twtmp 2709 pshufd \$0x13, $twtmp, $twres 2710 pxor $twtmp, $twtmp 2711 paddq @XMM[7], @XMM[7] # psllq 1,$tweak 2712 pand $twmask, $twres # isolate carry and residue 2713 pcmpgtd @XMM[7], $twtmp # broadcast upper bits 2714 pxor $twres, @XMM[7] 2715 2716 sub \$0x80,$len 2717 jnc .Lxts_dec_loop 2718 2719.Lxts_dec_short: 2720 add \$0x80, $len 2721 jz .Lxts_dec_done 2722___ 2723 for ($i=0;$i<7;$i++) { 2724 $code.=<<___; 2725 pshufd \$0x13, $twtmp, $twres 2726 pxor $twtmp, $twtmp 2727 movdqa @XMM[7], @XMM[$i] 2728 movdqa @XMM[7], `0x10*$i`(%rsp)# save tweak[$i] 2729 paddq @XMM[7], @XMM[7] # psllq 1,$tweak 2730 pand $twmask, $twres # isolate carry and residue 2731 pcmpgtd @XMM[7], $twtmp # broadcast upper bits 2732 pxor $twres, @XMM[7] 2733___ 2734 $code.=<<___ if ($i>=1); 2735 movdqu `0x10*($i-1)`($inp), @XMM[8+$i-1] 2736 cmp \$`0x10*$i`,$len 2737 je .Lxts_dec_$i 2738___ 2739 $code.=<<___ if ($i>=2); 2740 pxor @XMM[8+$i-2], @XMM[$i-2]# input[] ^ tweak[] 2741___ 2742 } 2743$code.=<<___; 2744 movdqu 0x60($inp), @XMM[8+6] 2745 pxor @XMM[8+5], @XMM[5] 2746 movdqa @XMM[7], 0x70(%rsp) 2747 lea 0x70($inp), $inp 2748 pxor @XMM[8+6], @XMM[6] 2749 lea 0x80(%rsp), %rax # pass key schedule 2750 mov %edx, %r10d # pass rounds 2751 2752 call _bsaes_decrypt8 2753 2754 pxor 0x00(%rsp), @XMM[0] # ^= tweak[] 2755 pxor 0x10(%rsp), @XMM[1] 2756 movdqu @XMM[0], 0x00($out) # write output 2757 pxor 0x20(%rsp), @XMM[6] 2758 movdqu @XMM[1], 0x10($out) 2759 pxor 0x30(%rsp), @XMM[4] 2760 movdqu @XMM[6], 0x20($out) 2761 pxor 0x40(%rsp), @XMM[2] 2762 movdqu @XMM[4], 0x30($out) 2763 pxor 0x50(%rsp), @XMM[7] 2764 movdqu @XMM[2], 0x40($out) 2765 pxor 0x60(%rsp), @XMM[3] 2766 movdqu @XMM[7], 0x50($out) 2767 movdqu @XMM[3], 0x60($out) 2768 lea 0x70($out), $out 2769 2770 movdqa 0x70(%rsp), @XMM[7] # next iteration tweak 2771 jmp .Lxts_dec_done 2772.align 16 2773.Lxts_dec_6: 2774 pxor @XMM[8+4], @XMM[4] 2775 lea 0x60($inp), $inp 2776 pxor @XMM[8+5], @XMM[5] 2777 lea 0x80(%rsp), %rax # pass key schedule 2778 mov %edx, %r10d # pass rounds 2779 2780 call _bsaes_decrypt8 2781 2782 pxor 0x00(%rsp), @XMM[0] # ^= tweak[] 2783 pxor 0x10(%rsp), @XMM[1] 2784 movdqu @XMM[0], 0x00($out) # write output 2785 pxor 0x20(%rsp), @XMM[6] 2786 movdqu @XMM[1], 0x10($out) 2787 pxor 0x30(%rsp), @XMM[4] 2788 movdqu @XMM[6], 0x20($out) 2789 pxor 0x40(%rsp), @XMM[2] 2790 movdqu @XMM[4], 0x30($out) 2791 pxor 0x50(%rsp), @XMM[7] 2792 movdqu @XMM[2], 0x40($out) 2793 movdqu @XMM[7], 0x50($out) 2794 lea 0x60($out), $out 2795 2796 movdqa 0x60(%rsp), @XMM[7] # next iteration tweak 2797 jmp .Lxts_dec_done 2798.align 16 2799.Lxts_dec_5: 2800 pxor @XMM[8+3], @XMM[3] 2801 lea 0x50($inp), $inp 2802 pxor @XMM[8+4], @XMM[4] 2803 lea 0x80(%rsp), %rax # pass key schedule 2804 mov %edx, %r10d # pass rounds 2805 2806 call _bsaes_decrypt8 2807 2808 pxor 0x00(%rsp), @XMM[0] # ^= tweak[] 2809 pxor 0x10(%rsp), @XMM[1] 2810 movdqu @XMM[0], 0x00($out) # write output 2811 pxor 0x20(%rsp), @XMM[6] 2812 movdqu @XMM[1], 0x10($out) 2813 pxor 0x30(%rsp), @XMM[4] 2814 movdqu @XMM[6], 0x20($out) 2815 pxor 0x40(%rsp), @XMM[2] 2816 movdqu @XMM[4], 0x30($out) 2817 movdqu @XMM[2], 0x40($out) 2818 lea 0x50($out), $out 2819 2820 movdqa 0x50(%rsp), @XMM[7] # next iteration tweak 2821 jmp .Lxts_dec_done 2822.align 16 2823.Lxts_dec_4: 2824 pxor @XMM[8+2], @XMM[2] 2825 lea 0x40($inp), $inp 2826 pxor @XMM[8+3], @XMM[3] 2827 lea 0x80(%rsp), %rax # pass key schedule 2828 mov %edx, %r10d # pass rounds 2829 2830 call _bsaes_decrypt8 2831 2832 pxor 0x00(%rsp), @XMM[0] # ^= tweak[] 2833 pxor 0x10(%rsp), @XMM[1] 2834 movdqu @XMM[0], 0x00($out) # write output 2835 pxor 0x20(%rsp), @XMM[6] 2836 movdqu @XMM[1], 0x10($out) 2837 pxor 0x30(%rsp), @XMM[4] 2838 movdqu @XMM[6], 0x20($out) 2839 movdqu @XMM[4], 0x30($out) 2840 lea 0x40($out), $out 2841 2842 movdqa 0x40(%rsp), @XMM[7] # next iteration tweak 2843 jmp .Lxts_dec_done 2844.align 16 2845.Lxts_dec_3: 2846 pxor @XMM[8+1], @XMM[1] 2847 lea 0x30($inp), $inp 2848 pxor @XMM[8+2], @XMM[2] 2849 lea 0x80(%rsp), %rax # pass key schedule 2850 mov %edx, %r10d # pass rounds 2851 2852 call _bsaes_decrypt8 2853 2854 pxor 0x00(%rsp), @XMM[0] # ^= tweak[] 2855 pxor 0x10(%rsp), @XMM[1] 2856 movdqu @XMM[0], 0x00($out) # write output 2857 pxor 0x20(%rsp), @XMM[6] 2858 movdqu @XMM[1], 0x10($out) 2859 movdqu @XMM[6], 0x20($out) 2860 lea 0x30($out), $out 2861 2862 movdqa 0x30(%rsp), @XMM[7] # next iteration tweak 2863 jmp .Lxts_dec_done 2864.align 16 2865.Lxts_dec_2: 2866 pxor @XMM[8+0], @XMM[0] 2867 lea 0x20($inp), $inp 2868 pxor @XMM[8+1], @XMM[1] 2869 lea 0x80(%rsp), %rax # pass key schedule 2870 mov %edx, %r10d # pass rounds 2871 2872 call _bsaes_decrypt8 2873 2874 pxor 0x00(%rsp), @XMM[0] # ^= tweak[] 2875 pxor 0x10(%rsp), @XMM[1] 2876 movdqu @XMM[0], 0x00($out) # write output 2877 movdqu @XMM[1], 0x10($out) 2878 lea 0x20($out), $out 2879 2880 movdqa 0x20(%rsp), @XMM[7] # next iteration tweak 2881 jmp .Lxts_dec_done 2882.align 16 2883.Lxts_dec_1: 2884 pxor @XMM[0], @XMM[8] 2885 lea 0x10($inp), $inp 2886 movdqa @XMM[8], 0x20(%rbp) 2887 lea 0x20(%rbp), $arg1 2888 lea 0x20(%rbp), $arg2 2889 lea ($key), $arg3 2890 call asm_AES_decrypt # doesn't touch %xmm 2891 pxor 0x20(%rbp), @XMM[0] # ^= tweak[] 2892 #pxor @XMM[8], @XMM[0] 2893 #lea 0x80(%rsp), %rax # pass key schedule 2894 #mov %edx, %r10d # pass rounds 2895 #call _bsaes_decrypt8 2896 #pxor 0x00(%rsp), @XMM[0] # ^= tweak[] 2897 movdqu @XMM[0], 0x00($out) # write output 2898 lea 0x10($out), $out 2899 2900 movdqa 0x10(%rsp), @XMM[7] # next iteration tweak 2901 2902.Lxts_dec_done: 2903 and \$15, %ebx 2904 jz .Lxts_dec_ret 2905 2906 pxor $twtmp, $twtmp 2907 movdqa .Lxts_magic(%rip), $twmask 2908 pcmpgtd @XMM[7], $twtmp 2909 pshufd \$0x13, $twtmp, $twres 2910 movdqa @XMM[7], @XMM[6] 2911 paddq @XMM[7], @XMM[7] # psllq 1,$tweak 2912 pand $twmask, $twres # isolate carry and residue 2913 movdqu ($inp), @XMM[0] 2914 pxor $twres, @XMM[7] 2915 2916 lea 0x20(%rbp), $arg1 2917 pxor @XMM[7], @XMM[0] 2918 lea 0x20(%rbp), $arg2 2919 movdqa @XMM[0], 0x20(%rbp) 2920 lea ($key), $arg3 2921 call asm_AES_decrypt # doesn't touch %xmm 2922 pxor 0x20(%rbp), @XMM[7] 2923 mov $out, %rdx 2924 movdqu @XMM[7], ($out) 2925 2926.Lxts_dec_steal: 2927 movzb 16($inp), %eax 2928 movzb (%rdx), %ecx 2929 lea 1($inp), $inp 2930 mov %al, (%rdx) 2931 mov %cl, 16(%rdx) 2932 lea 1(%rdx), %rdx 2933 sub \$1,%ebx 2934 jnz .Lxts_dec_steal 2935 2936 movdqu ($out), @XMM[0] 2937 lea 0x20(%rbp), $arg1 2938 pxor @XMM[6], @XMM[0] 2939 lea 0x20(%rbp), $arg2 2940 movdqa @XMM[0], 0x20(%rbp) 2941 lea ($key), $arg3 2942 call asm_AES_decrypt # doesn't touch %xmm 2943 pxor 0x20(%rbp), @XMM[6] 2944 movdqu @XMM[6], ($out) 2945 2946.Lxts_dec_ret: 2947 lea (%rsp), %rax 2948 pxor %xmm0, %xmm0 2949.Lxts_dec_bzero: # wipe key schedule [if any] 2950 movdqa %xmm0, 0x00(%rax) 2951 movdqa %xmm0, 0x10(%rax) 2952 lea 0x20(%rax), %rax 2953 cmp %rax, %rbp 2954 ja .Lxts_dec_bzero 2955 2956 lea 0x78(%rbp),%rax 2957.cfi_def_cfa %rax,8 2958___ 2959$code.=<<___ if ($win64); 2960 movaps 0x40(%rbp), %xmm6 2961 movaps 0x50(%rbp), %xmm7 2962 movaps 0x60(%rbp), %xmm8 2963 movaps 0x70(%rbp), %xmm9 2964 movaps 0x80(%rbp), %xmm10 2965 movaps 0x90(%rbp), %xmm11 2966 movaps 0xa0(%rbp), %xmm12 2967 movaps 0xb0(%rbp), %xmm13 2968 movaps 0xc0(%rbp), %xmm14 2969 movaps 0xd0(%rbp), %xmm15 2970 lea 0xa0(%rax), %rax 2971.Lxts_dec_tail: 2972___ 2973$code.=<<___; 2974 mov -48(%rax), %r15 2975.cfi_restore %r15 2976 mov -40(%rax), %r14 2977.cfi_restore %r14 2978 mov -32(%rax), %r13 2979.cfi_restore %r13 2980 mov -24(%rax), %r12 2981.cfi_restore %r12 2982 mov -16(%rax), %rbx 2983.cfi_restore %rbx 2984 mov -8(%rax), %rbp 2985.cfi_restore %rbp 2986 lea (%rax), %rsp # restore %rsp 2987.cfi_def_cfa_register %rsp 2988.Lxts_dec_epilogue: 2989 ret 2990.cfi_endproc 2991.size bsaes_xts_decrypt,.-bsaes_xts_decrypt 2992___ 2993} 2994$code.=<<___; 2995.type _bsaes_const,\@object 2996.align 64 2997_bsaes_const: 2998.LM0ISR: # InvShiftRows constants 2999 .quad 0x0a0e0206070b0f03, 0x0004080c0d010509 3000.LISRM0: 3001 .quad 0x01040b0e0205080f, 0x0306090c00070a0d 3002.LISR: 3003 .quad 0x0504070602010003, 0x0f0e0d0c080b0a09 3004.LBS0: # bit-slice constants 3005 .quad 0x5555555555555555, 0x5555555555555555 3006.LBS1: 3007 .quad 0x3333333333333333, 0x3333333333333333 3008.LBS2: 3009 .quad 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f 3010.LSR: # shiftrows constants 3011 .quad 0x0504070600030201, 0x0f0e0d0c0a09080b 3012.LSRM0: 3013 .quad 0x0304090e00050a0f, 0x01060b0c0207080d 3014.LM0SR: 3015 .quad 0x0a0e02060f03070b, 0x0004080c05090d01 3016.LSWPUP: # byte-swap upper dword 3017 .quad 0x0706050403020100, 0x0c0d0e0f0b0a0908 3018.LSWPUPM0SR: 3019 .quad 0x0a0d02060c03070b, 0x0004080f05090e01 3020.LADD1: # counter increment constants 3021 .quad 0x0000000000000000, 0x0000000100000000 3022.LADD2: 3023 .quad 0x0000000000000000, 0x0000000200000000 3024.LADD3: 3025 .quad 0x0000000000000000, 0x0000000300000000 3026.LADD4: 3027 .quad 0x0000000000000000, 0x0000000400000000 3028.LADD5: 3029 .quad 0x0000000000000000, 0x0000000500000000 3030.LADD6: 3031 .quad 0x0000000000000000, 0x0000000600000000 3032.LADD7: 3033 .quad 0x0000000000000000, 0x0000000700000000 3034.LADD8: 3035 .quad 0x0000000000000000, 0x0000000800000000 3036.Lxts_magic: 3037 .long 0x87,0,1,0 3038.Lmasks: 3039 .quad 0x0101010101010101, 0x0101010101010101 3040 .quad 0x0202020202020202, 0x0202020202020202 3041 .quad 0x0404040404040404, 0x0404040404040404 3042 .quad 0x0808080808080808, 0x0808080808080808 3043.LM0: 3044 .quad 0x02060a0e03070b0f, 0x0004080c0105090d 3045.L63: 3046 .quad 0x6363636363636363, 0x6363636363636363 3047.asciz "Bit-sliced AES for x86_64/SSSE3, Emilia Käsper, Peter Schwabe, Andy Polyakov" 3048.align 64 3049.size _bsaes_const,.-_bsaes_const 3050___ 3051 3052# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame, 3053# CONTEXT *context,DISPATCHER_CONTEXT *disp) 3054if ($win64) { 3055$rec="%rcx"; 3056$frame="%rdx"; 3057$context="%r8"; 3058$disp="%r9"; 3059 3060$code.=<<___; 3061.extern __imp_RtlVirtualUnwind 3062.type se_handler,\@abi-omnipotent 3063.align 16 3064se_handler: 3065 push %rsi 3066 push %rdi 3067 push %rbx 3068 push %rbp 3069 push %r12 3070 push %r13 3071 push %r14 3072 push %r15 3073 pushfq 3074 sub \$64,%rsp 3075 3076 mov 120($context),%rax # pull context->Rax 3077 mov 248($context),%rbx # pull context->Rip 3078 3079 mov 8($disp),%rsi # disp->ImageBase 3080 mov 56($disp),%r11 # disp->HandlerData 3081 3082 mov 0(%r11),%r10d # HandlerData[0] 3083 lea (%rsi,%r10),%r10 # prologue label 3084 cmp %r10,%rbx # context->Rip<=prologue label 3085 jbe .Lin_prologue 3086 3087 mov 4(%r11),%r10d # HandlerData[1] 3088 lea (%rsi,%r10),%r10 # epilogue label 3089 cmp %r10,%rbx # context->Rip>=epilogue label 3090 jae .Lin_prologue 3091 3092 mov 8(%r11),%r10d # HandlerData[2] 3093 lea (%rsi,%r10),%r10 # epilogue label 3094 cmp %r10,%rbx # context->Rip>=tail label 3095 jae .Lin_tail 3096 3097 mov 160($context),%rax # pull context->Rbp 3098 3099 lea 0x40(%rax),%rsi # %xmm save area 3100 lea 512($context),%rdi # &context.Xmm6 3101 mov \$20,%ecx # 10*sizeof(%xmm0)/sizeof(%rax) 3102 .long 0xa548f3fc # cld; rep movsq 3103 lea 0xa0+0x78(%rax),%rax # adjust stack pointer 3104 3105.Lin_tail: 3106 mov -48(%rax),%rbp 3107 mov -40(%rax),%rbx 3108 mov -32(%rax),%r12 3109 mov -24(%rax),%r13 3110 mov -16(%rax),%r14 3111 mov -8(%rax),%r15 3112 mov %rbx,144($context) # restore context->Rbx 3113 mov %rbp,160($context) # restore context->Rbp 3114 mov %r12,216($context) # restore context->R12 3115 mov %r13,224($context) # restore context->R13 3116 mov %r14,232($context) # restore context->R14 3117 mov %r15,240($context) # restore context->R15 3118 3119.Lin_prologue: 3120 mov %rax,152($context) # restore context->Rsp 3121 3122 mov 40($disp),%rdi # disp->ContextRecord 3123 mov $context,%rsi # context 3124 mov \$`1232/8`,%ecx # sizeof(CONTEXT) 3125 .long 0xa548f3fc # cld; rep movsq 3126 3127 mov $disp,%rsi 3128 xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER 3129 mov 8(%rsi),%rdx # arg2, disp->ImageBase 3130 mov 0(%rsi),%r8 # arg3, disp->ControlPc 3131 mov 16(%rsi),%r9 # arg4, disp->FunctionEntry 3132 mov 40(%rsi),%r10 # disp->ContextRecord 3133 lea 56(%rsi),%r11 # &disp->HandlerData 3134 lea 24(%rsi),%r12 # &disp->EstablisherFrame 3135 mov %r10,32(%rsp) # arg5 3136 mov %r11,40(%rsp) # arg6 3137 mov %r12,48(%rsp) # arg7 3138 mov %rcx,56(%rsp) # arg8, (NULL) 3139 call *__imp_RtlVirtualUnwind(%rip) 3140 3141 mov \$1,%eax # ExceptionContinueSearch 3142 add \$64,%rsp 3143 popfq 3144 pop %r15 3145 pop %r14 3146 pop %r13 3147 pop %r12 3148 pop %rbp 3149 pop %rbx 3150 pop %rdi 3151 pop %rsi 3152 ret 3153.size se_handler,.-se_handler 3154 3155.section .pdata 3156.align 4 3157___ 3158$code.=<<___ if ($ecb); 3159 .rva .Lecb_enc_prologue 3160 .rva .Lecb_enc_epilogue 3161 .rva .Lecb_enc_info 3162 3163 .rva .Lecb_dec_prologue 3164 .rva .Lecb_dec_epilogue 3165 .rva .Lecb_dec_info 3166___ 3167$code.=<<___; 3168 .rva .Lcbc_dec_prologue 3169 .rva .Lcbc_dec_epilogue 3170 .rva .Lcbc_dec_info 3171 3172 .rva .Lctr_enc_prologue 3173 .rva .Lctr_enc_epilogue 3174 .rva .Lctr_enc_info 3175 3176 .rva .Lxts_enc_prologue 3177 .rva .Lxts_enc_epilogue 3178 .rva .Lxts_enc_info 3179 3180 .rva .Lxts_dec_prologue 3181 .rva .Lxts_dec_epilogue 3182 .rva .Lxts_dec_info 3183 3184.section .xdata 3185.align 8 3186___ 3187$code.=<<___ if ($ecb); 3188.Lecb_enc_info: 3189 .byte 9,0,0,0 3190 .rva se_handler 3191 .rva .Lecb_enc_body,.Lecb_enc_epilogue # HandlerData[] 3192 .rva .Lecb_enc_tail 3193 .long 0 3194.Lecb_dec_info: 3195 .byte 9,0,0,0 3196 .rva se_handler 3197 .rva .Lecb_dec_body,.Lecb_dec_epilogue # HandlerData[] 3198 .rva .Lecb_dec_tail 3199 .long 0 3200___ 3201$code.=<<___; 3202.Lcbc_dec_info: 3203 .byte 9,0,0,0 3204 .rva se_handler 3205 .rva .Lcbc_dec_body,.Lcbc_dec_epilogue # HandlerData[] 3206 .rva .Lcbc_dec_tail 3207 .long 0 3208.Lctr_enc_info: 3209 .byte 9,0,0,0 3210 .rva se_handler 3211 .rva .Lctr_enc_body,.Lctr_enc_epilogue # HandlerData[] 3212 .rva .Lctr_enc_tail 3213 .long 0 3214.Lxts_enc_info: 3215 .byte 9,0,0,0 3216 .rva se_handler 3217 .rva .Lxts_enc_body,.Lxts_enc_epilogue # HandlerData[] 3218 .rva .Lxts_enc_tail 3219 .long 0 3220.Lxts_dec_info: 3221 .byte 9,0,0,0 3222 .rva se_handler 3223 .rva .Lxts_dec_body,.Lxts_dec_epilogue # HandlerData[] 3224 .rva .Lxts_dec_tail 3225 .long 0 3226___ 3227} 3228 3229$code =~ s/\`([^\`]*)\`/eval($1)/gem; 3230 3231print $code; 3232 3233close STDOUT; 3234