1#! /usr/bin/env perl 2# Copyright 2011-2019 The OpenSSL Project Authors. All Rights Reserved. 3# 4# Licensed under the OpenSSL license (the "License"). You may not use 5# this file except in compliance with the License. You can obtain a copy 6# in the file LICENSE in the source distribution or at 7# https://www.openssl.org/source/license.html 8 9 10################################################################### 11### AES-128 [originally in CTR mode] ### 12### bitsliced implementation for Intel Core 2 processors ### 13### requires support of SSE extensions up to SSSE3 ### 14### Author: Emilia Käsper and Peter Schwabe ### 15### Date: 2009-03-19 ### 16### Public domain ### 17### ### 18### See http://homes.esat.kuleuven.be/~ekasper/#software for ### 19### further information. ### 20################################################################### 21# 22# September 2011. 23# 24# Started as transliteration to "perlasm" the original code has 25# undergone following changes: 26# 27# - code was made position-independent; 28# - rounds were folded into a loop resulting in >5x size reduction 29# from 12.5KB to 2.2KB; 30# - above was possibile thanks to mixcolumns() modification that 31# allowed to feed its output back to aesenc[last], this was 32# achieved at cost of two additional inter-registers moves; 33# - some instruction reordering and interleaving; 34# - this module doesn't implement key setup subroutine, instead it 35# relies on conversion of "conventional" key schedule as returned 36# by AES_set_encrypt_key (see discussion below); 37# - first and last round keys are treated differently, which allowed 38# to skip one shiftrows(), reduce bit-sliced key schedule and 39# speed-up conversion by 22%; 40# - support for 192- and 256-bit keys was added; 41# 42# Resulting performance in CPU cycles spent to encrypt one byte out 43# of 4096-byte buffer with 128-bit key is: 44# 45# Emilia's this(*) difference 46# 47# Core 2 9.30 8.69 +7% 48# Nehalem(**) 7.63 6.88 +11% 49# Atom 17.1 16.4 +4% 50# Silvermont - 12.9 51# Goldmont - 8.85 52# 53# (*) Comparison is not completely fair, because "this" is ECB, 54# i.e. no extra processing such as counter values calculation 55# and xor-ing input as in Emilia's CTR implementation is 56# performed. However, the CTR calculations stand for not more 57# than 1% of total time, so comparison is *rather* fair. 58# 59# (**) Results were collected on Westmere, which is considered to 60# be equivalent to Nehalem for this code. 61# 62# As for key schedule conversion subroutine. Interface to OpenSSL 63# relies on per-invocation on-the-fly conversion. This naturally 64# has impact on performance, especially for short inputs. Conversion 65# time in CPU cycles and its ratio to CPU cycles spent in 8x block 66# function is: 67# 68# conversion conversion/8x block 69# Core 2 240 0.22 70# Nehalem 180 0.20 71# Atom 430 0.20 72# 73# The ratio values mean that 128-byte blocks will be processed 74# 16-18% slower, 256-byte blocks - 9-10%, 384-byte blocks - 6-7%, 75# etc. Then keep in mind that input sizes not divisible by 128 are 76# *effectively* slower, especially shortest ones, e.g. consecutive 77# 144-byte blocks are processed 44% slower than one would expect, 78# 272 - 29%, 400 - 22%, etc. Yet, despite all these "shortcomings" 79# it's still faster than ["hyper-threading-safe" code path in] 80# aes-x86_64.pl on all lengths above 64 bytes... 81# 82# October 2011. 83# 84# Add decryption procedure. Performance in CPU cycles spent to decrypt 85# one byte out of 4096-byte buffer with 128-bit key is: 86# 87# Core 2 9.98 88# Nehalem 7.80 89# Atom 17.9 90# Silvermont 14.0 91# Goldmont 10.2 92# 93# November 2011. 94# 95# Add bsaes_xts_[en|de]crypt. Less-than-80-bytes-block performance is 96# suboptimal, but XTS is meant to be used with larger blocks... 97# 98# <appro@openssl.org> 99 100$flavour = shift; 101$output = shift; 102if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } 103 104$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/); 105 106$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; 107( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or 108( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or 109die "can't locate x86_64-xlate.pl"; 110 111open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""; 112*STDOUT=*OUT; 113 114my ($inp,$out,$len,$key,$ivp)=("%rdi","%rsi","%rdx","%rcx"); 115my @XMM=map("%xmm$_",(15,0..14)); # best on Atom, +10% over (0..15) 116my $ecb=0; # suppress unreferenced ECB subroutines, spare some space... 117 118{ 119my ($key,$rounds,$const)=("%rax","%r10d","%r11"); 120 121sub Sbox { 122# input in lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb 123# output in lsb > [b0, b1, b4, b6, b3, b7, b2, b5] < msb 124my @b=@_[0..7]; 125my @t=@_[8..11]; 126my @s=@_[12..15]; 127 &InBasisChange (@b); 128 &Inv_GF256 (@b[6,5,0,3,7,1,4,2],@t,@s); 129 &OutBasisChange (@b[7,1,4,2,6,5,0,3]); 130} 131 132sub InBasisChange { 133# input in lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb 134# output in lsb > [b6, b5, b0, b3, b7, b1, b4, b2] < msb 135my @b=@_[0..7]; 136$code.=<<___; 137 pxor @b[6], @b[5] 138 pxor @b[1], @b[2] 139 pxor @b[0], @b[3] 140 pxor @b[2], @b[6] 141 pxor @b[0], @b[5] 142 143 pxor @b[3], @b[6] 144 pxor @b[7], @b[3] 145 pxor @b[5], @b[7] 146 pxor @b[4], @b[3] 147 pxor @b[5], @b[4] 148 pxor @b[1], @b[3] 149 150 pxor @b[7], @b[2] 151 pxor @b[5], @b[1] 152___ 153} 154 155sub OutBasisChange { 156# input in lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb 157# output in lsb > [b6, b1, b2, b4, b7, b0, b3, b5] < msb 158my @b=@_[0..7]; 159$code.=<<___; 160 pxor @b[6], @b[0] 161 pxor @b[4], @b[1] 162 pxor @b[0], @b[2] 163 pxor @b[6], @b[4] 164 pxor @b[1], @b[6] 165 166 pxor @b[5], @b[1] 167 pxor @b[3], @b[5] 168 pxor @b[7], @b[3] 169 pxor @b[5], @b[7] 170 pxor @b[5], @b[2] 171 172 pxor @b[7], @b[4] 173___ 174} 175 176sub InvSbox { 177# input in lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb 178# output in lsb > [b0, b1, b6, b4, b2, b7, b3, b5] < msb 179my @b=@_[0..7]; 180my @t=@_[8..11]; 181my @s=@_[12..15]; 182 &InvInBasisChange (@b); 183 &Inv_GF256 (@b[5,1,2,6,3,7,0,4],@t,@s); 184 &InvOutBasisChange (@b[3,7,0,4,5,1,2,6]); 185} 186 187sub InvInBasisChange { # OutBasisChange in reverse 188my @b=@_[5,1,2,6,3,7,0,4]; 189$code.=<<___ 190 pxor @b[7], @b[4] 191 192 pxor @b[5], @b[7] 193 pxor @b[5], @b[2] 194 pxor @b[7], @b[3] 195 pxor @b[3], @b[5] 196 pxor @b[5], @b[1] 197 198 pxor @b[1], @b[6] 199 pxor @b[0], @b[2] 200 pxor @b[6], @b[4] 201 pxor @b[6], @b[0] 202 pxor @b[4], @b[1] 203___ 204} 205 206sub InvOutBasisChange { # InBasisChange in reverse 207my @b=@_[2,5,7,3,6,1,0,4]; 208$code.=<<___; 209 pxor @b[5], @b[1] 210 pxor @b[7], @b[2] 211 212 pxor @b[1], @b[3] 213 pxor @b[5], @b[4] 214 pxor @b[5], @b[7] 215 pxor @b[4], @b[3] 216 pxor @b[0], @b[5] 217 pxor @b[7], @b[3] 218 pxor @b[2], @b[6] 219 pxor @b[1], @b[2] 220 pxor @b[3], @b[6] 221 222 pxor @b[0], @b[3] 223 pxor @b[6], @b[5] 224___ 225} 226 227sub Mul_GF4 { 228#;************************************************************* 229#;* Mul_GF4: Input x0-x1,y0-y1 Output x0-x1 Temp t0 (8) * 230#;************************************************************* 231my ($x0,$x1,$y0,$y1,$t0)=@_; 232$code.=<<___; 233 movdqa $y0, $t0 234 pxor $y1, $t0 235 pand $x0, $t0 236 pxor $x1, $x0 237 pand $y0, $x1 238 pand $y1, $x0 239 pxor $x1, $x0 240 pxor $t0, $x1 241___ 242} 243 244sub Mul_GF4_N { # not used, see next subroutine 245# multiply and scale by N 246my ($x0,$x1,$y0,$y1,$t0)=@_; 247$code.=<<___; 248 movdqa $y0, $t0 249 pxor $y1, $t0 250 pand $x0, $t0 251 pxor $x1, $x0 252 pand $y0, $x1 253 pand $y1, $x0 254 pxor $x0, $x1 255 pxor $t0, $x0 256___ 257} 258 259sub Mul_GF4_N_GF4 { 260# interleaved Mul_GF4_N and Mul_GF4 261my ($x0,$x1,$y0,$y1,$t0, 262 $x2,$x3,$y2,$y3,$t1)=@_; 263$code.=<<___; 264 movdqa $y0, $t0 265 movdqa $y2, $t1 266 pxor $y1, $t0 267 pxor $y3, $t1 268 pand $x0, $t0 269 pand $x2, $t1 270 pxor $x1, $x0 271 pxor $x3, $x2 272 pand $y0, $x1 273 pand $y2, $x3 274 pand $y1, $x0 275 pand $y3, $x2 276 pxor $x0, $x1 277 pxor $x3, $x2 278 pxor $t0, $x0 279 pxor $t1, $x3 280___ 281} 282sub Mul_GF16_2 { 283my @x=@_[0..7]; 284my @y=@_[8..11]; 285my @t=@_[12..15]; 286$code.=<<___; 287 movdqa @x[0], @t[0] 288 movdqa @x[1], @t[1] 289___ 290 &Mul_GF4 (@x[0], @x[1], @y[0], @y[1], @t[2]); 291$code.=<<___; 292 pxor @x[2], @t[0] 293 pxor @x[3], @t[1] 294 pxor @y[2], @y[0] 295 pxor @y[3], @y[1] 296___ 297 Mul_GF4_N_GF4 (@t[0], @t[1], @y[0], @y[1], @t[3], 298 @x[2], @x[3], @y[2], @y[3], @t[2]); 299$code.=<<___; 300 pxor @t[0], @x[0] 301 pxor @t[0], @x[2] 302 pxor @t[1], @x[1] 303 pxor @t[1], @x[3] 304 305 movdqa @x[4], @t[0] 306 movdqa @x[5], @t[1] 307 pxor @x[6], @t[0] 308 pxor @x[7], @t[1] 309___ 310 &Mul_GF4_N_GF4 (@t[0], @t[1], @y[0], @y[1], @t[3], 311 @x[6], @x[7], @y[2], @y[3], @t[2]); 312$code.=<<___; 313 pxor @y[2], @y[0] 314 pxor @y[3], @y[1] 315___ 316 &Mul_GF4 (@x[4], @x[5], @y[0], @y[1], @t[3]); 317$code.=<<___; 318 pxor @t[0], @x[4] 319 pxor @t[0], @x[6] 320 pxor @t[1], @x[5] 321 pxor @t[1], @x[7] 322___ 323} 324sub Inv_GF256 { 325#;******************************************************************** 326#;* Inv_GF256: Input x0-x7 Output x0-x7 Temp t0-t3,s0-s3 (144) * 327#;******************************************************************** 328my @x=@_[0..7]; 329my @t=@_[8..11]; 330my @s=@_[12..15]; 331# direct optimizations from hardware 332$code.=<<___; 333 movdqa @x[4], @t[3] 334 movdqa @x[5], @t[2] 335 movdqa @x[1], @t[1] 336 movdqa @x[7], @s[1] 337 movdqa @x[0], @s[0] 338 339 pxor @x[6], @t[3] 340 pxor @x[7], @t[2] 341 pxor @x[3], @t[1] 342 movdqa @t[3], @s[2] 343 pxor @x[6], @s[1] 344 movdqa @t[2], @t[0] 345 pxor @x[2], @s[0] 346 movdqa @t[3], @s[3] 347 348 por @t[1], @t[2] 349 por @s[0], @t[3] 350 pxor @t[0], @s[3] 351 pand @s[0], @s[2] 352 pxor @t[1], @s[0] 353 pand @t[1], @t[0] 354 pand @s[0], @s[3] 355 movdqa @x[3], @s[0] 356 pxor @x[2], @s[0] 357 pand @s[0], @s[1] 358 pxor @s[1], @t[3] 359 pxor @s[1], @t[2] 360 movdqa @x[4], @s[1] 361 movdqa @x[1], @s[0] 362 pxor @x[5], @s[1] 363 pxor @x[0], @s[0] 364 movdqa @s[1], @t[1] 365 pand @s[0], @s[1] 366 por @s[0], @t[1] 367 pxor @s[1], @t[0] 368 pxor @s[3], @t[3] 369 pxor @s[2], @t[2] 370 pxor @s[3], @t[1] 371 movdqa @x[7], @s[0] 372 pxor @s[2], @t[0] 373 movdqa @x[6], @s[1] 374 pxor @s[2], @t[1] 375 movdqa @x[5], @s[2] 376 pand @x[3], @s[0] 377 movdqa @x[4], @s[3] 378 pand @x[2], @s[1] 379 pand @x[1], @s[2] 380 por @x[0], @s[3] 381 pxor @s[0], @t[3] 382 pxor @s[1], @t[2] 383 pxor @s[2], @t[1] 384 pxor @s[3], @t[0] 385 386 #Inv_GF16 \t0, \t1, \t2, \t3, \s0, \s1, \s2, \s3 387 388 # new smaller inversion 389 390 movdqa @t[3], @s[0] 391 pand @t[1], @t[3] 392 pxor @t[2], @s[0] 393 394 movdqa @t[0], @s[2] 395 movdqa @s[0], @s[3] 396 pxor @t[3], @s[2] 397 pand @s[2], @s[3] 398 399 movdqa @t[1], @s[1] 400 pxor @t[2], @s[3] 401 pxor @t[0], @s[1] 402 403 pxor @t[2], @t[3] 404 405 pand @t[3], @s[1] 406 407 movdqa @s[2], @t[2] 408 pxor @t[0], @s[1] 409 410 pxor @s[1], @t[2] 411 pxor @s[1], @t[1] 412 413 pand @t[0], @t[2] 414 415 pxor @t[2], @s[2] 416 pxor @t[2], @t[1] 417 418 pand @s[3], @s[2] 419 420 pxor @s[0], @s[2] 421___ 422# output in s3, s2, s1, t1 423 424# Mul_GF16_2 \x0, \x1, \x2, \x3, \x4, \x5, \x6, \x7, \t2, \t3, \t0, \t1, \s0, \s1, \s2, \s3 425 426# Mul_GF16_2 \x0, \x1, \x2, \x3, \x4, \x5, \x6, \x7, \s3, \s2, \s1, \t1, \s0, \t0, \t2, \t3 427 &Mul_GF16_2(@x,@s[3,2,1],@t[1],@s[0],@t[0,2,3]); 428 429### output msb > [x3,x2,x1,x0,x7,x6,x5,x4] < lsb 430} 431 432# AES linear components 433 434sub ShiftRows { 435my @x=@_[0..7]; 436my $mask=pop; 437$code.=<<___; 438 pxor 0x00($key),@x[0] 439 pxor 0x10($key),@x[1] 440 pxor 0x20($key),@x[2] 441 pxor 0x30($key),@x[3] 442 pshufb $mask,@x[0] 443 pshufb $mask,@x[1] 444 pxor 0x40($key),@x[4] 445 pxor 0x50($key),@x[5] 446 pshufb $mask,@x[2] 447 pshufb $mask,@x[3] 448 pxor 0x60($key),@x[6] 449 pxor 0x70($key),@x[7] 450 pshufb $mask,@x[4] 451 pshufb $mask,@x[5] 452 pshufb $mask,@x[6] 453 pshufb $mask,@x[7] 454 lea 0x80($key),$key 455___ 456} 457 458sub MixColumns { 459# modified to emit output in order suitable for feeding back to aesenc[last] 460my @x=@_[0..7]; 461my @t=@_[8..15]; 462my $inv=@_[16]; # optional 463$code.=<<___; 464 pshufd \$0x93, @x[0], @t[0] # x0 <<< 32 465 pshufd \$0x93, @x[1], @t[1] 466 pxor @t[0], @x[0] # x0 ^ (x0 <<< 32) 467 pshufd \$0x93, @x[2], @t[2] 468 pxor @t[1], @x[1] 469 pshufd \$0x93, @x[3], @t[3] 470 pxor @t[2], @x[2] 471 pshufd \$0x93, @x[4], @t[4] 472 pxor @t[3], @x[3] 473 pshufd \$0x93, @x[5], @t[5] 474 pxor @t[4], @x[4] 475 pshufd \$0x93, @x[6], @t[6] 476 pxor @t[5], @x[5] 477 pshufd \$0x93, @x[7], @t[7] 478 pxor @t[6], @x[6] 479 pxor @t[7], @x[7] 480 481 pxor @x[0], @t[1] 482 pxor @x[7], @t[0] 483 pxor @x[7], @t[1] 484 pshufd \$0x4E, @x[0], @x[0] # (x0 ^ (x0 <<< 32)) <<< 64) 485 pxor @x[1], @t[2] 486 pshufd \$0x4E, @x[1], @x[1] 487 pxor @x[4], @t[5] 488 pxor @t[0], @x[0] 489 pxor @x[5], @t[6] 490 pxor @t[1], @x[1] 491 pxor @x[3], @t[4] 492 pshufd \$0x4E, @x[4], @t[0] 493 pxor @x[6], @t[7] 494 pshufd \$0x4E, @x[5], @t[1] 495 pxor @x[2], @t[3] 496 pshufd \$0x4E, @x[3], @x[4] 497 pxor @x[7], @t[3] 498 pshufd \$0x4E, @x[7], @x[5] 499 pxor @x[7], @t[4] 500 pshufd \$0x4E, @x[6], @x[3] 501 pxor @t[4], @t[0] 502 pshufd \$0x4E, @x[2], @x[6] 503 pxor @t[5], @t[1] 504___ 505$code.=<<___ if (!$inv); 506 pxor @t[3], @x[4] 507 pxor @t[7], @x[5] 508 pxor @t[6], @x[3] 509 movdqa @t[0], @x[2] 510 pxor @t[2], @x[6] 511 movdqa @t[1], @x[7] 512___ 513$code.=<<___ if ($inv); 514 pxor @x[4], @t[3] 515 pxor @t[7], @x[5] 516 pxor @x[3], @t[6] 517 movdqa @t[0], @x[3] 518 pxor @t[2], @x[6] 519 movdqa @t[6], @x[2] 520 movdqa @t[1], @x[7] 521 movdqa @x[6], @x[4] 522 movdqa @t[3], @x[6] 523___ 524} 525 526sub InvMixColumns_orig { 527my @x=@_[0..7]; 528my @t=@_[8..15]; 529 530$code.=<<___; 531 # multiplication by 0x0e 532 pshufd \$0x93, @x[7], @t[7] 533 movdqa @x[2], @t[2] 534 pxor @x[5], @x[7] # 7 5 535 pxor @x[5], @x[2] # 2 5 536 pshufd \$0x93, @x[0], @t[0] 537 movdqa @x[5], @t[5] 538 pxor @x[0], @x[5] # 5 0 [1] 539 pxor @x[1], @x[0] # 0 1 540 pshufd \$0x93, @x[1], @t[1] 541 pxor @x[2], @x[1] # 1 25 542 pxor @x[6], @x[0] # 01 6 [2] 543 pxor @x[3], @x[1] # 125 3 [4] 544 pshufd \$0x93, @x[3], @t[3] 545 pxor @x[0], @x[2] # 25 016 [3] 546 pxor @x[7], @x[3] # 3 75 547 pxor @x[6], @x[7] # 75 6 [0] 548 pshufd \$0x93, @x[6], @t[6] 549 movdqa @x[4], @t[4] 550 pxor @x[4], @x[6] # 6 4 551 pxor @x[3], @x[4] # 4 375 [6] 552 pxor @x[7], @x[3] # 375 756=36 553 pxor @t[5], @x[6] # 64 5 [7] 554 pxor @t[2], @x[3] # 36 2 555 pxor @t[4], @x[3] # 362 4 [5] 556 pshufd \$0x93, @t[5], @t[5] 557___ 558 my @y = @x[7,5,0,2,1,3,4,6]; 559$code.=<<___; 560 # multiplication by 0x0b 561 pxor @y[0], @y[1] 562 pxor @t[0], @y[0] 563 pxor @t[1], @y[1] 564 pshufd \$0x93, @t[2], @t[2] 565 pxor @t[5], @y[0] 566 pxor @t[6], @y[1] 567 pxor @t[7], @y[0] 568 pshufd \$0x93, @t[4], @t[4] 569 pxor @t[6], @t[7] # clobber t[7] 570 pxor @y[0], @y[1] 571 572 pxor @t[0], @y[3] 573 pshufd \$0x93, @t[0], @t[0] 574 pxor @t[1], @y[2] 575 pxor @t[1], @y[4] 576 pxor @t[2], @y[2] 577 pshufd \$0x93, @t[1], @t[1] 578 pxor @t[2], @y[3] 579 pxor @t[2], @y[5] 580 pxor @t[7], @y[2] 581 pshufd \$0x93, @t[2], @t[2] 582 pxor @t[3], @y[3] 583 pxor @t[3], @y[6] 584 pxor @t[3], @y[4] 585 pshufd \$0x93, @t[3], @t[3] 586 pxor @t[4], @y[7] 587 pxor @t[4], @y[5] 588 pxor @t[7], @y[7] 589 pxor @t[5], @y[3] 590 pxor @t[4], @y[4] 591 pxor @t[5], @t[7] # clobber t[7] even more 592 593 pxor @t[7], @y[5] 594 pshufd \$0x93, @t[4], @t[4] 595 pxor @t[7], @y[6] 596 pxor @t[7], @y[4] 597 598 pxor @t[5], @t[7] 599 pshufd \$0x93, @t[5], @t[5] 600 pxor @t[6], @t[7] # restore t[7] 601 602 # multiplication by 0x0d 603 pxor @y[7], @y[4] 604 pxor @t[4], @y[7] 605 pshufd \$0x93, @t[6], @t[6] 606 pxor @t[0], @y[2] 607 pxor @t[5], @y[7] 608 pxor @t[2], @y[2] 609 pshufd \$0x93, @t[7], @t[7] 610 611 pxor @y[1], @y[3] 612 pxor @t[1], @y[1] 613 pxor @t[0], @y[0] 614 pxor @t[0], @y[3] 615 pxor @t[5], @y[1] 616 pxor @t[5], @y[0] 617 pxor @t[7], @y[1] 618 pshufd \$0x93, @t[0], @t[0] 619 pxor @t[6], @y[0] 620 pxor @y[1], @y[3] 621 pxor @t[1], @y[4] 622 pshufd \$0x93, @t[1], @t[1] 623 624 pxor @t[7], @y[7] 625 pxor @t[2], @y[4] 626 pxor @t[2], @y[5] 627 pshufd \$0x93, @t[2], @t[2] 628 pxor @t[6], @y[2] 629 pxor @t[3], @t[6] # clobber t[6] 630 pxor @y[7], @y[4] 631 pxor @t[6], @y[3] 632 633 pxor @t[6], @y[6] 634 pxor @t[5], @y[5] 635 pxor @t[4], @y[6] 636 pshufd \$0x93, @t[4], @t[4] 637 pxor @t[6], @y[5] 638 pxor @t[7], @y[6] 639 pxor @t[3], @t[6] # restore t[6] 640 641 pshufd \$0x93, @t[5], @t[5] 642 pshufd \$0x93, @t[6], @t[6] 643 pshufd \$0x93, @t[7], @t[7] 644 pshufd \$0x93, @t[3], @t[3] 645 646 # multiplication by 0x09 647 pxor @y[1], @y[4] 648 pxor @y[1], @t[1] # t[1]=y[1] 649 pxor @t[5], @t[0] # clobber t[0] 650 pxor @t[5], @t[1] 651 pxor @t[0], @y[3] 652 pxor @y[0], @t[0] # t[0]=y[0] 653 pxor @t[6], @t[1] 654 pxor @t[7], @t[6] # clobber t[6] 655 pxor @t[1], @y[4] 656 pxor @t[4], @y[7] 657 pxor @y[4], @t[4] # t[4]=y[4] 658 pxor @t[3], @y[6] 659 pxor @y[3], @t[3] # t[3]=y[3] 660 pxor @t[2], @y[5] 661 pxor @y[2], @t[2] # t[2]=y[2] 662 pxor @t[7], @t[3] 663 pxor @y[5], @t[5] # t[5]=y[5] 664 pxor @t[6], @t[2] 665 pxor @t[6], @t[5] 666 pxor @y[6], @t[6] # t[6]=y[6] 667 pxor @y[7], @t[7] # t[7]=y[7] 668 669 movdqa @t[0],@XMM[0] 670 movdqa @t[1],@XMM[1] 671 movdqa @t[2],@XMM[2] 672 movdqa @t[3],@XMM[3] 673 movdqa @t[4],@XMM[4] 674 movdqa @t[5],@XMM[5] 675 movdqa @t[6],@XMM[6] 676 movdqa @t[7],@XMM[7] 677___ 678} 679 680sub InvMixColumns { 681my @x=@_[0..7]; 682my @t=@_[8..15]; 683 684# Thanks to Jussi Kivilinna for providing pointer to 685# 686# | 0e 0b 0d 09 | | 02 03 01 01 | | 05 00 04 00 | 687# | 09 0e 0b 0d | = | 01 02 03 01 | x | 00 05 00 04 | 688# | 0d 09 0e 0b | | 01 01 02 03 | | 04 00 05 00 | 689# | 0b 0d 09 0e | | 03 01 01 02 | | 00 04 00 05 | 690 691$code.=<<___; 692 # multiplication by 0x05-0x00-0x04-0x00 693 pshufd \$0x4E, @x[0], @t[0] 694 pshufd \$0x4E, @x[6], @t[6] 695 pxor @x[0], @t[0] 696 pshufd \$0x4E, @x[7], @t[7] 697 pxor @x[6], @t[6] 698 pshufd \$0x4E, @x[1], @t[1] 699 pxor @x[7], @t[7] 700 pshufd \$0x4E, @x[2], @t[2] 701 pxor @x[1], @t[1] 702 pshufd \$0x4E, @x[3], @t[3] 703 pxor @x[2], @t[2] 704 pxor @t[6], @x[0] 705 pxor @t[6], @x[1] 706 pshufd \$0x4E, @x[4], @t[4] 707 pxor @x[3], @t[3] 708 pxor @t[0], @x[2] 709 pxor @t[1], @x[3] 710 pshufd \$0x4E, @x[5], @t[5] 711 pxor @x[4], @t[4] 712 pxor @t[7], @x[1] 713 pxor @t[2], @x[4] 714 pxor @x[5], @t[5] 715 716 pxor @t[7], @x[2] 717 pxor @t[6], @x[3] 718 pxor @t[6], @x[4] 719 pxor @t[3], @x[5] 720 pxor @t[4], @x[6] 721 pxor @t[7], @x[4] 722 pxor @t[7], @x[5] 723 pxor @t[5], @x[7] 724___ 725 &MixColumns (@x,@t,1); # flipped 2<->3 and 4<->6 726} 727 728sub aesenc { # not used 729my @b=@_[0..7]; 730my @t=@_[8..15]; 731$code.=<<___; 732 movdqa 0x30($const),@t[0] # .LSR 733___ 734 &ShiftRows (@b,@t[0]); 735 &Sbox (@b,@t); 736 &MixColumns (@b[0,1,4,6,3,7,2,5],@t); 737} 738 739sub aesenclast { # not used 740my @b=@_[0..7]; 741my @t=@_[8..15]; 742$code.=<<___; 743 movdqa 0x40($const),@t[0] # .LSRM0 744___ 745 &ShiftRows (@b,@t[0]); 746 &Sbox (@b,@t); 747$code.=<<___ 748 pxor 0x00($key),@b[0] 749 pxor 0x10($key),@b[1] 750 pxor 0x20($key),@b[4] 751 pxor 0x30($key),@b[6] 752 pxor 0x40($key),@b[3] 753 pxor 0x50($key),@b[7] 754 pxor 0x60($key),@b[2] 755 pxor 0x70($key),@b[5] 756___ 757} 758 759sub swapmove { 760my ($a,$b,$n,$mask,$t)=@_; 761$code.=<<___; 762 movdqa $b,$t 763 psrlq \$$n,$b 764 pxor $a,$b 765 pand $mask,$b 766 pxor $b,$a 767 psllq \$$n,$b 768 pxor $t,$b 769___ 770} 771sub swapmove2x { 772my ($a0,$b0,$a1,$b1,$n,$mask,$t0,$t1)=@_; 773$code.=<<___; 774 movdqa $b0,$t0 775 psrlq \$$n,$b0 776 movdqa $b1,$t1 777 psrlq \$$n,$b1 778 pxor $a0,$b0 779 pxor $a1,$b1 780 pand $mask,$b0 781 pand $mask,$b1 782 pxor $b0,$a0 783 psllq \$$n,$b0 784 pxor $b1,$a1 785 psllq \$$n,$b1 786 pxor $t0,$b0 787 pxor $t1,$b1 788___ 789} 790 791sub bitslice { 792my @x=reverse(@_[0..7]); 793my ($t0,$t1,$t2,$t3)=@_[8..11]; 794$code.=<<___; 795 movdqa 0x00($const),$t0 # .LBS0 796 movdqa 0x10($const),$t1 # .LBS1 797___ 798 &swapmove2x(@x[0,1,2,3],1,$t0,$t2,$t3); 799 &swapmove2x(@x[4,5,6,7],1,$t0,$t2,$t3); 800$code.=<<___; 801 movdqa 0x20($const),$t0 # .LBS2 802___ 803 &swapmove2x(@x[0,2,1,3],2,$t1,$t2,$t3); 804 &swapmove2x(@x[4,6,5,7],2,$t1,$t2,$t3); 805 806 &swapmove2x(@x[0,4,1,5],4,$t0,$t2,$t3); 807 &swapmove2x(@x[2,6,3,7],4,$t0,$t2,$t3); 808} 809 810$code.=<<___; 811.text 812 813.extern asm_AES_encrypt 814.extern asm_AES_decrypt 815 816.type _bsaes_encrypt8,\@abi-omnipotent 817.align 64 818_bsaes_encrypt8: 819.cfi_startproc 820 lea .LBS0(%rip), $const # constants table 821 822 movdqa ($key), @XMM[9] # round 0 key 823 lea 0x10($key), $key 824 movdqa 0x50($const), @XMM[8] # .LM0SR 825 pxor @XMM[9], @XMM[0] # xor with round0 key 826 pxor @XMM[9], @XMM[1] 827 pxor @XMM[9], @XMM[2] 828 pxor @XMM[9], @XMM[3] 829 pshufb @XMM[8], @XMM[0] 830 pshufb @XMM[8], @XMM[1] 831 pxor @XMM[9], @XMM[4] 832 pxor @XMM[9], @XMM[5] 833 pshufb @XMM[8], @XMM[2] 834 pshufb @XMM[8], @XMM[3] 835 pxor @XMM[9], @XMM[6] 836 pxor @XMM[9], @XMM[7] 837 pshufb @XMM[8], @XMM[4] 838 pshufb @XMM[8], @XMM[5] 839 pshufb @XMM[8], @XMM[6] 840 pshufb @XMM[8], @XMM[7] 841_bsaes_encrypt8_bitslice: 842___ 843 &bitslice (@XMM[0..7, 8..11]); 844$code.=<<___; 845 dec $rounds 846 jmp .Lenc_sbox 847.align 16 848.Lenc_loop: 849___ 850 &ShiftRows (@XMM[0..7, 8]); 851$code.=".Lenc_sbox:\n"; 852 &Sbox (@XMM[0..7, 8..15]); 853$code.=<<___; 854 dec $rounds 855 jl .Lenc_done 856___ 857 &MixColumns (@XMM[0,1,4,6,3,7,2,5, 8..15]); 858$code.=<<___; 859 movdqa 0x30($const), @XMM[8] # .LSR 860 jnz .Lenc_loop 861 movdqa 0x40($const), @XMM[8] # .LSRM0 862 jmp .Lenc_loop 863.align 16 864.Lenc_done: 865___ 866 # output in lsb > [t0, t1, t4, t6, t3, t7, t2, t5] < msb 867 &bitslice (@XMM[0,1,4,6,3,7,2,5, 8..11]); 868$code.=<<___; 869 movdqa ($key), @XMM[8] # last round key 870 pxor @XMM[8], @XMM[4] 871 pxor @XMM[8], @XMM[6] 872 pxor @XMM[8], @XMM[3] 873 pxor @XMM[8], @XMM[7] 874 pxor @XMM[8], @XMM[2] 875 pxor @XMM[8], @XMM[5] 876 pxor @XMM[8], @XMM[0] 877 pxor @XMM[8], @XMM[1] 878 ret 879.cfi_endproc 880.size _bsaes_encrypt8,.-_bsaes_encrypt8 881 882.type _bsaes_decrypt8,\@abi-omnipotent 883.align 64 884_bsaes_decrypt8: 885.cfi_startproc 886 lea .LBS0(%rip), $const # constants table 887 888 movdqa ($key), @XMM[9] # round 0 key 889 lea 0x10($key), $key 890 movdqa -0x30($const), @XMM[8] # .LM0ISR 891 pxor @XMM[9], @XMM[0] # xor with round0 key 892 pxor @XMM[9], @XMM[1] 893 pxor @XMM[9], @XMM[2] 894 pxor @XMM[9], @XMM[3] 895 pshufb @XMM[8], @XMM[0] 896 pshufb @XMM[8], @XMM[1] 897 pxor @XMM[9], @XMM[4] 898 pxor @XMM[9], @XMM[5] 899 pshufb @XMM[8], @XMM[2] 900 pshufb @XMM[8], @XMM[3] 901 pxor @XMM[9], @XMM[6] 902 pxor @XMM[9], @XMM[7] 903 pshufb @XMM[8], @XMM[4] 904 pshufb @XMM[8], @XMM[5] 905 pshufb @XMM[8], @XMM[6] 906 pshufb @XMM[8], @XMM[7] 907___ 908 &bitslice (@XMM[0..7, 8..11]); 909$code.=<<___; 910 dec $rounds 911 jmp .Ldec_sbox 912.align 16 913.Ldec_loop: 914___ 915 &ShiftRows (@XMM[0..7, 8]); 916$code.=".Ldec_sbox:\n"; 917 &InvSbox (@XMM[0..7, 8..15]); 918$code.=<<___; 919 dec $rounds 920 jl .Ldec_done 921___ 922 &InvMixColumns (@XMM[0,1,6,4,2,7,3,5, 8..15]); 923$code.=<<___; 924 movdqa -0x10($const), @XMM[8] # .LISR 925 jnz .Ldec_loop 926 movdqa -0x20($const), @XMM[8] # .LISRM0 927 jmp .Ldec_loop 928.align 16 929.Ldec_done: 930___ 931 &bitslice (@XMM[0,1,6,4,2,7,3,5, 8..11]); 932$code.=<<___; 933 movdqa ($key), @XMM[8] # last round key 934 pxor @XMM[8], @XMM[6] 935 pxor @XMM[8], @XMM[4] 936 pxor @XMM[8], @XMM[2] 937 pxor @XMM[8], @XMM[7] 938 pxor @XMM[8], @XMM[3] 939 pxor @XMM[8], @XMM[5] 940 pxor @XMM[8], @XMM[0] 941 pxor @XMM[8], @XMM[1] 942 ret 943.cfi_endproc 944.size _bsaes_decrypt8,.-_bsaes_decrypt8 945___ 946} 947{ 948my ($out,$inp,$rounds,$const)=("%rax","%rcx","%r10d","%r11"); 949 950sub bitslice_key { 951my @x=reverse(@_[0..7]); 952my ($bs0,$bs1,$bs2,$t2,$t3)=@_[8..12]; 953 954 &swapmove (@x[0,1],1,$bs0,$t2,$t3); 955$code.=<<___; 956 #&swapmove(@x[2,3],1,$t0,$t2,$t3); 957 movdqa @x[0], @x[2] 958 movdqa @x[1], @x[3] 959___ 960 #&swapmove2x(@x[4,5,6,7],1,$t0,$t2,$t3); 961 962 &swapmove2x (@x[0,2,1,3],2,$bs1,$t2,$t3); 963$code.=<<___; 964 #&swapmove2x(@x[4,6,5,7],2,$t1,$t2,$t3); 965 movdqa @x[0], @x[4] 966 movdqa @x[2], @x[6] 967 movdqa @x[1], @x[5] 968 movdqa @x[3], @x[7] 969___ 970 &swapmove2x (@x[0,4,1,5],4,$bs2,$t2,$t3); 971 &swapmove2x (@x[2,6,3,7],4,$bs2,$t2,$t3); 972} 973 974$code.=<<___; 975.type _bsaes_key_convert,\@abi-omnipotent 976.align 16 977_bsaes_key_convert: 978.cfi_startproc 979 lea .Lmasks(%rip), $const 980 movdqu ($inp), %xmm7 # load round 0 key 981 lea 0x10($inp), $inp 982 movdqa 0x00($const), %xmm0 # 0x01... 983 movdqa 0x10($const), %xmm1 # 0x02... 984 movdqa 0x20($const), %xmm2 # 0x04... 985 movdqa 0x30($const), %xmm3 # 0x08... 986 movdqa 0x40($const), %xmm4 # .LM0 987 pcmpeqd %xmm5, %xmm5 # .LNOT 988 989 movdqu ($inp), %xmm6 # load round 1 key 990 movdqa %xmm7, ($out) # save round 0 key 991 lea 0x10($out), $out 992 dec $rounds 993 jmp .Lkey_loop 994.align 16 995.Lkey_loop: 996 pshufb %xmm4, %xmm6 # .LM0 997 998 movdqa %xmm0, %xmm8 999 movdqa %xmm1, %xmm9 1000 1001 pand %xmm6, %xmm8 1002 pand %xmm6, %xmm9 1003 movdqa %xmm2, %xmm10 1004 pcmpeqb %xmm0, %xmm8 1005 psllq \$4, %xmm0 # 0x10... 1006 movdqa %xmm3, %xmm11 1007 pcmpeqb %xmm1, %xmm9 1008 psllq \$4, %xmm1 # 0x20... 1009 1010 pand %xmm6, %xmm10 1011 pand %xmm6, %xmm11 1012 movdqa %xmm0, %xmm12 1013 pcmpeqb %xmm2, %xmm10 1014 psllq \$4, %xmm2 # 0x40... 1015 movdqa %xmm1, %xmm13 1016 pcmpeqb %xmm3, %xmm11 1017 psllq \$4, %xmm3 # 0x80... 1018 1019 movdqa %xmm2, %xmm14 1020 movdqa %xmm3, %xmm15 1021 pxor %xmm5, %xmm8 # "pnot" 1022 pxor %xmm5, %xmm9 1023 1024 pand %xmm6, %xmm12 1025 pand %xmm6, %xmm13 1026 movdqa %xmm8, 0x00($out) # write bit-sliced round key 1027 pcmpeqb %xmm0, %xmm12 1028 psrlq \$4, %xmm0 # 0x01... 1029 movdqa %xmm9, 0x10($out) 1030 pcmpeqb %xmm1, %xmm13 1031 psrlq \$4, %xmm1 # 0x02... 1032 lea 0x10($inp), $inp 1033 1034 pand %xmm6, %xmm14 1035 pand %xmm6, %xmm15 1036 movdqa %xmm10, 0x20($out) 1037 pcmpeqb %xmm2, %xmm14 1038 psrlq \$4, %xmm2 # 0x04... 1039 movdqa %xmm11, 0x30($out) 1040 pcmpeqb %xmm3, %xmm15 1041 psrlq \$4, %xmm3 # 0x08... 1042 movdqu ($inp), %xmm6 # load next round key 1043 1044 pxor %xmm5, %xmm13 # "pnot" 1045 pxor %xmm5, %xmm14 1046 movdqa %xmm12, 0x40($out) 1047 movdqa %xmm13, 0x50($out) 1048 movdqa %xmm14, 0x60($out) 1049 movdqa %xmm15, 0x70($out) 1050 lea 0x80($out),$out 1051 dec $rounds 1052 jnz .Lkey_loop 1053 1054 movdqa 0x50($const), %xmm7 # .L63 1055 #movdqa %xmm6, ($out) # don't save last round key 1056 ret 1057.cfi_endproc 1058.size _bsaes_key_convert,.-_bsaes_key_convert 1059___ 1060} 1061 1062if (0 && !$win64) { # following four functions are unsupported interface 1063 # used for benchmarking... 1064$code.=<<___; 1065.globl bsaes_enc_key_convert 1066.type bsaes_enc_key_convert,\@function,2 1067.align 16 1068bsaes_enc_key_convert: 1069 mov 240($inp),%r10d # pass rounds 1070 mov $inp,%rcx # pass key 1071 mov $out,%rax # pass key schedule 1072 call _bsaes_key_convert 1073 pxor %xmm6,%xmm7 # fix up last round key 1074 movdqa %xmm7,(%rax) # save last round key 1075 ret 1076.size bsaes_enc_key_convert,.-bsaes_enc_key_convert 1077 1078.globl bsaes_encrypt_128 1079.type bsaes_encrypt_128,\@function,4 1080.align 16 1081bsaes_encrypt_128: 1082.Lenc128_loop: 1083 movdqu 0x00($inp), @XMM[0] # load input 1084 movdqu 0x10($inp), @XMM[1] 1085 movdqu 0x20($inp), @XMM[2] 1086 movdqu 0x30($inp), @XMM[3] 1087 movdqu 0x40($inp), @XMM[4] 1088 movdqu 0x50($inp), @XMM[5] 1089 movdqu 0x60($inp), @XMM[6] 1090 movdqu 0x70($inp), @XMM[7] 1091 mov $key, %rax # pass the $key 1092 lea 0x80($inp), $inp 1093 mov \$10,%r10d 1094 1095 call _bsaes_encrypt8 1096 1097 movdqu @XMM[0], 0x00($out) # write output 1098 movdqu @XMM[1], 0x10($out) 1099 movdqu @XMM[4], 0x20($out) 1100 movdqu @XMM[6], 0x30($out) 1101 movdqu @XMM[3], 0x40($out) 1102 movdqu @XMM[7], 0x50($out) 1103 movdqu @XMM[2], 0x60($out) 1104 movdqu @XMM[5], 0x70($out) 1105 lea 0x80($out), $out 1106 sub \$0x80,$len 1107 ja .Lenc128_loop 1108 ret 1109.size bsaes_encrypt_128,.-bsaes_encrypt_128 1110 1111.globl bsaes_dec_key_convert 1112.type bsaes_dec_key_convert,\@function,2 1113.align 16 1114bsaes_dec_key_convert: 1115 mov 240($inp),%r10d # pass rounds 1116 mov $inp,%rcx # pass key 1117 mov $out,%rax # pass key schedule 1118 call _bsaes_key_convert 1119 pxor ($out),%xmm7 # fix up round 0 key 1120 movdqa %xmm6,(%rax) # save last round key 1121 movdqa %xmm7,($out) 1122 ret 1123.size bsaes_dec_key_convert,.-bsaes_dec_key_convert 1124 1125.globl bsaes_decrypt_128 1126.type bsaes_decrypt_128,\@function,4 1127.align 16 1128bsaes_decrypt_128: 1129.Ldec128_loop: 1130 movdqu 0x00($inp), @XMM[0] # load input 1131 movdqu 0x10($inp), @XMM[1] 1132 movdqu 0x20($inp), @XMM[2] 1133 movdqu 0x30($inp), @XMM[3] 1134 movdqu 0x40($inp), @XMM[4] 1135 movdqu 0x50($inp), @XMM[5] 1136 movdqu 0x60($inp), @XMM[6] 1137 movdqu 0x70($inp), @XMM[7] 1138 mov $key, %rax # pass the $key 1139 lea 0x80($inp), $inp 1140 mov \$10,%r10d 1141 1142 call _bsaes_decrypt8 1143 1144 movdqu @XMM[0], 0x00($out) # write output 1145 movdqu @XMM[1], 0x10($out) 1146 movdqu @XMM[6], 0x20($out) 1147 movdqu @XMM[4], 0x30($out) 1148 movdqu @XMM[2], 0x40($out) 1149 movdqu @XMM[7], 0x50($out) 1150 movdqu @XMM[3], 0x60($out) 1151 movdqu @XMM[5], 0x70($out) 1152 lea 0x80($out), $out 1153 sub \$0x80,$len 1154 ja .Ldec128_loop 1155 ret 1156.size bsaes_decrypt_128,.-bsaes_decrypt_128 1157___ 1158} 1159{ 1160###################################################################### 1161# 1162# OpenSSL interface 1163# 1164my ($arg1,$arg2,$arg3,$arg4,$arg5,$arg6)=$win64 ? ("%rcx","%rdx","%r8","%r9","%r10","%r11d") 1165 : ("%rdi","%rsi","%rdx","%rcx","%r8","%r9d"); 1166my ($inp,$out,$len,$key)=("%r12","%r13","%r14","%r15"); 1167 1168if ($ecb) { 1169$code.=<<___; 1170.globl bsaes_ecb_encrypt_blocks 1171.type bsaes_ecb_encrypt_blocks,\@abi-omnipotent 1172.align 16 1173bsaes_ecb_encrypt_blocks: 1174.cfi_startproc 1175 mov %rsp, %rax 1176.Lecb_enc_prologue: 1177 push %rbp 1178.cfi_push %rbp 1179 push %rbx 1180.cfi_push %rbx 1181 push %r12 1182.cfi_push %r12 1183 push %r13 1184.cfi_push %r13 1185 push %r14 1186.cfi_push %r14 1187 push %r15 1188.cfi_push %r15 1189 lea -0x48(%rsp),%rsp 1190.cfi_adjust_cfa_offset 0x48 1191___ 1192$code.=<<___ if ($win64); 1193 lea -0xa0(%rsp), %rsp 1194 movaps %xmm6, 0x40(%rsp) 1195 movaps %xmm7, 0x50(%rsp) 1196 movaps %xmm8, 0x60(%rsp) 1197 movaps %xmm9, 0x70(%rsp) 1198 movaps %xmm10, 0x80(%rsp) 1199 movaps %xmm11, 0x90(%rsp) 1200 movaps %xmm12, 0xa0(%rsp) 1201 movaps %xmm13, 0xb0(%rsp) 1202 movaps %xmm14, 0xc0(%rsp) 1203 movaps %xmm15, 0xd0(%rsp) 1204.Lecb_enc_body: 1205___ 1206$code.=<<___; 1207 mov %rsp,%rbp # backup %rsp 1208.cfi_def_cfa_register %rbp 1209 mov 240($arg4),%eax # rounds 1210 mov $arg1,$inp # backup arguments 1211 mov $arg2,$out 1212 mov $arg3,$len 1213 mov $arg4,$key 1214 cmp \$8,$arg3 1215 jb .Lecb_enc_short 1216 1217 mov %eax,%ebx # backup rounds 1218 shl \$7,%rax # 128 bytes per inner round key 1219 sub \$`128-32`,%rax # size of bit-sliced key schedule 1220 sub %rax,%rsp 1221 mov %rsp,%rax # pass key schedule 1222 mov $key,%rcx # pass key 1223 mov %ebx,%r10d # pass rounds 1224 call _bsaes_key_convert 1225 pxor %xmm6,%xmm7 # fix up last round key 1226 movdqa %xmm7,(%rax) # save last round key 1227 1228 sub \$8,$len 1229.Lecb_enc_loop: 1230 movdqu 0x00($inp), @XMM[0] # load input 1231 movdqu 0x10($inp), @XMM[1] 1232 movdqu 0x20($inp), @XMM[2] 1233 movdqu 0x30($inp), @XMM[3] 1234 movdqu 0x40($inp), @XMM[4] 1235 movdqu 0x50($inp), @XMM[5] 1236 mov %rsp, %rax # pass key schedule 1237 movdqu 0x60($inp), @XMM[6] 1238 mov %ebx,%r10d # pass rounds 1239 movdqu 0x70($inp), @XMM[7] 1240 lea 0x80($inp), $inp 1241 1242 call _bsaes_encrypt8 1243 1244 movdqu @XMM[0], 0x00($out) # write output 1245 movdqu @XMM[1], 0x10($out) 1246 movdqu @XMM[4], 0x20($out) 1247 movdqu @XMM[6], 0x30($out) 1248 movdqu @XMM[3], 0x40($out) 1249 movdqu @XMM[7], 0x50($out) 1250 movdqu @XMM[2], 0x60($out) 1251 movdqu @XMM[5], 0x70($out) 1252 lea 0x80($out), $out 1253 sub \$8,$len 1254 jnc .Lecb_enc_loop 1255 1256 add \$8,$len 1257 jz .Lecb_enc_done 1258 1259 movdqu 0x00($inp), @XMM[0] # load input 1260 mov %rsp, %rax # pass key schedule 1261 mov %ebx,%r10d # pass rounds 1262 cmp \$2,$len 1263 jb .Lecb_enc_one 1264 movdqu 0x10($inp), @XMM[1] 1265 je .Lecb_enc_two 1266 movdqu 0x20($inp), @XMM[2] 1267 cmp \$4,$len 1268 jb .Lecb_enc_three 1269 movdqu 0x30($inp), @XMM[3] 1270 je .Lecb_enc_four 1271 movdqu 0x40($inp), @XMM[4] 1272 cmp \$6,$len 1273 jb .Lecb_enc_five 1274 movdqu 0x50($inp), @XMM[5] 1275 je .Lecb_enc_six 1276 movdqu 0x60($inp), @XMM[6] 1277 call _bsaes_encrypt8 1278 movdqu @XMM[0], 0x00($out) # write output 1279 movdqu @XMM[1], 0x10($out) 1280 movdqu @XMM[4], 0x20($out) 1281 movdqu @XMM[6], 0x30($out) 1282 movdqu @XMM[3], 0x40($out) 1283 movdqu @XMM[7], 0x50($out) 1284 movdqu @XMM[2], 0x60($out) 1285 jmp .Lecb_enc_done 1286.align 16 1287.Lecb_enc_six: 1288 call _bsaes_encrypt8 1289 movdqu @XMM[0], 0x00($out) # write output 1290 movdqu @XMM[1], 0x10($out) 1291 movdqu @XMM[4], 0x20($out) 1292 movdqu @XMM[6], 0x30($out) 1293 movdqu @XMM[3], 0x40($out) 1294 movdqu @XMM[7], 0x50($out) 1295 jmp .Lecb_enc_done 1296.align 16 1297.Lecb_enc_five: 1298 call _bsaes_encrypt8 1299 movdqu @XMM[0], 0x00($out) # write output 1300 movdqu @XMM[1], 0x10($out) 1301 movdqu @XMM[4], 0x20($out) 1302 movdqu @XMM[6], 0x30($out) 1303 movdqu @XMM[3], 0x40($out) 1304 jmp .Lecb_enc_done 1305.align 16 1306.Lecb_enc_four: 1307 call _bsaes_encrypt8 1308 movdqu @XMM[0], 0x00($out) # write output 1309 movdqu @XMM[1], 0x10($out) 1310 movdqu @XMM[4], 0x20($out) 1311 movdqu @XMM[6], 0x30($out) 1312 jmp .Lecb_enc_done 1313.align 16 1314.Lecb_enc_three: 1315 call _bsaes_encrypt8 1316 movdqu @XMM[0], 0x00($out) # write output 1317 movdqu @XMM[1], 0x10($out) 1318 movdqu @XMM[4], 0x20($out) 1319 jmp .Lecb_enc_done 1320.align 16 1321.Lecb_enc_two: 1322 call _bsaes_encrypt8 1323 movdqu @XMM[0], 0x00($out) # write output 1324 movdqu @XMM[1], 0x10($out) 1325 jmp .Lecb_enc_done 1326.align 16 1327.Lecb_enc_one: 1328 call _bsaes_encrypt8 1329 movdqu @XMM[0], 0x00($out) # write output 1330 jmp .Lecb_enc_done 1331.align 16 1332.Lecb_enc_short: 1333 lea ($inp), $arg1 1334 lea ($out), $arg2 1335 lea ($key), $arg3 1336 call asm_AES_encrypt 1337 lea 16($inp), $inp 1338 lea 16($out), $out 1339 dec $len 1340 jnz .Lecb_enc_short 1341 1342.Lecb_enc_done: 1343 lea (%rsp),%rax 1344 pxor %xmm0, %xmm0 1345.Lecb_enc_bzero: # wipe key schedule [if any] 1346 movdqa %xmm0, 0x00(%rax) 1347 movdqa %xmm0, 0x10(%rax) 1348 lea 0x20(%rax), %rax 1349 cmp %rax, %rbp 1350 jb .Lecb_enc_bzero 1351 1352 lea 0x78(%rbp),%rax 1353.cfi_def_cfa %rax,8 1354___ 1355$code.=<<___ if ($win64); 1356 movaps 0x40(%rbp), %xmm6 1357 movaps 0x50(%rbp), %xmm7 1358 movaps 0x60(%rbp), %xmm8 1359 movaps 0x70(%rbp), %xmm9 1360 movaps 0x80(%rbp), %xmm10 1361 movaps 0x90(%rbp), %xmm11 1362 movaps 0xa0(%rbp), %xmm12 1363 movaps 0xb0(%rbp), %xmm13 1364 movaps 0xc0(%rbp), %xmm14 1365 movaps 0xd0(%rbp), %xmm15 1366 lea 0xa0(%rax), %rax 1367.Lecb_enc_tail: 1368___ 1369$code.=<<___; 1370 mov -48(%rax), %r15 1371.cfi_restore %r15 1372 mov -40(%rax), %r14 1373.cfi_restore %r14 1374 mov -32(%rax), %r13 1375.cfi_restore %r13 1376 mov -24(%rax), %r12 1377.cfi_restore %r12 1378 mov -16(%rax), %rbx 1379.cfi_restore %rbx 1380 mov -8(%rax), %rbp 1381.cfi_restore %rbp 1382 lea (%rax), %rsp # restore %rsp 1383.cfi_def_cfa_register %rsp 1384.Lecb_enc_epilogue: 1385 ret 1386.cfi_endproc 1387.size bsaes_ecb_encrypt_blocks,.-bsaes_ecb_encrypt_blocks 1388 1389.globl bsaes_ecb_decrypt_blocks 1390.type bsaes_ecb_decrypt_blocks,\@abi-omnipotent 1391.align 16 1392bsaes_ecb_decrypt_blocks: 1393.cfi_startproc 1394 mov %rsp, %rax 1395.Lecb_dec_prologue: 1396 push %rbp 1397.cfi_push %rbp 1398 push %rbx 1399.cfi_push %rbx 1400 push %r12 1401.cfi_push %r12 1402 push %r13 1403.cfi_push %r13 1404 push %r14 1405.cfi_push %r14 1406 push %r15 1407.cfi_push %r15 1408 lea -0x48(%rsp),%rsp 1409.cfi_adjust_cfa_offset 0x48 1410___ 1411$code.=<<___ if ($win64); 1412 lea -0xa0(%rsp), %rsp 1413 movaps %xmm6, 0x40(%rsp) 1414 movaps %xmm7, 0x50(%rsp) 1415 movaps %xmm8, 0x60(%rsp) 1416 movaps %xmm9, 0x70(%rsp) 1417 movaps %xmm10, 0x80(%rsp) 1418 movaps %xmm11, 0x90(%rsp) 1419 movaps %xmm12, 0xa0(%rsp) 1420 movaps %xmm13, 0xb0(%rsp) 1421 movaps %xmm14, 0xc0(%rsp) 1422 movaps %xmm15, 0xd0(%rsp) 1423.Lecb_dec_body: 1424___ 1425$code.=<<___; 1426 mov %rsp,%rbp # backup %rsp 1427.cfi_def_cfa_register %rbp 1428 mov 240($arg4),%eax # rounds 1429 mov $arg1,$inp # backup arguments 1430 mov $arg2,$out 1431 mov $arg3,$len 1432 mov $arg4,$key 1433 cmp \$8,$arg3 1434 jb .Lecb_dec_short 1435 1436 mov %eax,%ebx # backup rounds 1437 shl \$7,%rax # 128 bytes per inner round key 1438 sub \$`128-32`,%rax # size of bit-sliced key schedule 1439 sub %rax,%rsp 1440 mov %rsp,%rax # pass key schedule 1441 mov $key,%rcx # pass key 1442 mov %ebx,%r10d # pass rounds 1443 call _bsaes_key_convert 1444 pxor (%rsp),%xmm7 # fix up 0 round key 1445 movdqa %xmm6,(%rax) # save last round key 1446 movdqa %xmm7,(%rsp) 1447 1448 sub \$8,$len 1449.Lecb_dec_loop: 1450 movdqu 0x00($inp), @XMM[0] # load input 1451 movdqu 0x10($inp), @XMM[1] 1452 movdqu 0x20($inp), @XMM[2] 1453 movdqu 0x30($inp), @XMM[3] 1454 movdqu 0x40($inp), @XMM[4] 1455 movdqu 0x50($inp), @XMM[5] 1456 mov %rsp, %rax # pass key schedule 1457 movdqu 0x60($inp), @XMM[6] 1458 mov %ebx,%r10d # pass rounds 1459 movdqu 0x70($inp), @XMM[7] 1460 lea 0x80($inp), $inp 1461 1462 call _bsaes_decrypt8 1463 1464 movdqu @XMM[0], 0x00($out) # write output 1465 movdqu @XMM[1], 0x10($out) 1466 movdqu @XMM[6], 0x20($out) 1467 movdqu @XMM[4], 0x30($out) 1468 movdqu @XMM[2], 0x40($out) 1469 movdqu @XMM[7], 0x50($out) 1470 movdqu @XMM[3], 0x60($out) 1471 movdqu @XMM[5], 0x70($out) 1472 lea 0x80($out), $out 1473 sub \$8,$len 1474 jnc .Lecb_dec_loop 1475 1476 add \$8,$len 1477 jz .Lecb_dec_done 1478 1479 movdqu 0x00($inp), @XMM[0] # load input 1480 mov %rsp, %rax # pass key schedule 1481 mov %ebx,%r10d # pass rounds 1482 cmp \$2,$len 1483 jb .Lecb_dec_one 1484 movdqu 0x10($inp), @XMM[1] 1485 je .Lecb_dec_two 1486 movdqu 0x20($inp), @XMM[2] 1487 cmp \$4,$len 1488 jb .Lecb_dec_three 1489 movdqu 0x30($inp), @XMM[3] 1490 je .Lecb_dec_four 1491 movdqu 0x40($inp), @XMM[4] 1492 cmp \$6,$len 1493 jb .Lecb_dec_five 1494 movdqu 0x50($inp), @XMM[5] 1495 je .Lecb_dec_six 1496 movdqu 0x60($inp), @XMM[6] 1497 call _bsaes_decrypt8 1498 movdqu @XMM[0], 0x00($out) # write output 1499 movdqu @XMM[1], 0x10($out) 1500 movdqu @XMM[6], 0x20($out) 1501 movdqu @XMM[4], 0x30($out) 1502 movdqu @XMM[2], 0x40($out) 1503 movdqu @XMM[7], 0x50($out) 1504 movdqu @XMM[3], 0x60($out) 1505 jmp .Lecb_dec_done 1506.align 16 1507.Lecb_dec_six: 1508 call _bsaes_decrypt8 1509 movdqu @XMM[0], 0x00($out) # write output 1510 movdqu @XMM[1], 0x10($out) 1511 movdqu @XMM[6], 0x20($out) 1512 movdqu @XMM[4], 0x30($out) 1513 movdqu @XMM[2], 0x40($out) 1514 movdqu @XMM[7], 0x50($out) 1515 jmp .Lecb_dec_done 1516.align 16 1517.Lecb_dec_five: 1518 call _bsaes_decrypt8 1519 movdqu @XMM[0], 0x00($out) # write output 1520 movdqu @XMM[1], 0x10($out) 1521 movdqu @XMM[6], 0x20($out) 1522 movdqu @XMM[4], 0x30($out) 1523 movdqu @XMM[2], 0x40($out) 1524 jmp .Lecb_dec_done 1525.align 16 1526.Lecb_dec_four: 1527 call _bsaes_decrypt8 1528 movdqu @XMM[0], 0x00($out) # write output 1529 movdqu @XMM[1], 0x10($out) 1530 movdqu @XMM[6], 0x20($out) 1531 movdqu @XMM[4], 0x30($out) 1532 jmp .Lecb_dec_done 1533.align 16 1534.Lecb_dec_three: 1535 call _bsaes_decrypt8 1536 movdqu @XMM[0], 0x00($out) # write output 1537 movdqu @XMM[1], 0x10($out) 1538 movdqu @XMM[6], 0x20($out) 1539 jmp .Lecb_dec_done 1540.align 16 1541.Lecb_dec_two: 1542 call _bsaes_decrypt8 1543 movdqu @XMM[0], 0x00($out) # write output 1544 movdqu @XMM[1], 0x10($out) 1545 jmp .Lecb_dec_done 1546.align 16 1547.Lecb_dec_one: 1548 call _bsaes_decrypt8 1549 movdqu @XMM[0], 0x00($out) # write output 1550 jmp .Lecb_dec_done 1551.align 16 1552.Lecb_dec_short: 1553 lea ($inp), $arg1 1554 lea ($out), $arg2 1555 lea ($key), $arg3 1556 call asm_AES_decrypt 1557 lea 16($inp), $inp 1558 lea 16($out), $out 1559 dec $len 1560 jnz .Lecb_dec_short 1561 1562.Lecb_dec_done: 1563 lea (%rsp),%rax 1564 pxor %xmm0, %xmm0 1565.Lecb_dec_bzero: # wipe key schedule [if any] 1566 movdqa %xmm0, 0x00(%rax) 1567 movdqa %xmm0, 0x10(%rax) 1568 lea 0x20(%rax), %rax 1569 cmp %rax, %rbp 1570 jb .Lecb_dec_bzero 1571 1572 lea 0x78(%rbp),%rax 1573.cfi_def_cfa %rax,8 1574___ 1575$code.=<<___ if ($win64); 1576 movaps 0x40(%rbp), %xmm6 1577 movaps 0x50(%rbp), %xmm7 1578 movaps 0x60(%rbp), %xmm8 1579 movaps 0x70(%rbp), %xmm9 1580 movaps 0x80(%rbp), %xmm10 1581 movaps 0x90(%rbp), %xmm11 1582 movaps 0xa0(%rbp), %xmm12 1583 movaps 0xb0(%rbp), %xmm13 1584 movaps 0xc0(%rbp), %xmm14 1585 movaps 0xd0(%rbp), %xmm15 1586 lea 0xa0(%rax), %rax 1587.Lecb_dec_tail: 1588___ 1589$code.=<<___; 1590 mov -48(%rax), %r15 1591.cfi_restore %r15 1592 mov -40(%rax), %r14 1593.cfi_restore %r14 1594 mov -32(%rax), %r13 1595.cfi_restore %r13 1596 mov -24(%rax), %r12 1597.cfi_restore %r12 1598 mov -16(%rax), %rbx 1599.cfi_restore %rbx 1600 mov -8(%rax), %rbp 1601.cfi_restore %rbp 1602 lea (%rax), %rsp # restore %rsp 1603.cfi_def_cfa_register %rsp 1604.Lecb_dec_epilogue: 1605 ret 1606.cfi_endproc 1607.size bsaes_ecb_decrypt_blocks,.-bsaes_ecb_decrypt_blocks 1608___ 1609} 1610$code.=<<___; 1611.extern asm_AES_cbc_encrypt 1612.globl bsaes_cbc_encrypt 1613.type bsaes_cbc_encrypt,\@abi-omnipotent 1614.align 16 1615bsaes_cbc_encrypt: 1616.cfi_startproc 1617___ 1618$code.=<<___ if ($win64); 1619 mov 48(%rsp),$arg6 # pull direction flag 1620___ 1621$code.=<<___; 1622 cmp \$0,$arg6 1623 jne asm_AES_cbc_encrypt 1624 cmp \$128,$arg3 1625 jb asm_AES_cbc_encrypt 1626 1627 mov %rsp, %rax 1628.Lcbc_dec_prologue: 1629 push %rbp 1630.cfi_push %rbp 1631 push %rbx 1632.cfi_push %rbx 1633 push %r12 1634.cfi_push %r12 1635 push %r13 1636.cfi_push %r13 1637 push %r14 1638.cfi_push %r14 1639 push %r15 1640.cfi_push %r15 1641 lea -0x48(%rsp), %rsp 1642.cfi_adjust_cfa_offset 0x48 1643___ 1644$code.=<<___ if ($win64); 1645 mov 0xa0(%rsp),$arg5 # pull ivp 1646 lea -0xa0(%rsp), %rsp 1647 movaps %xmm6, 0x40(%rsp) 1648 movaps %xmm7, 0x50(%rsp) 1649 movaps %xmm8, 0x60(%rsp) 1650 movaps %xmm9, 0x70(%rsp) 1651 movaps %xmm10, 0x80(%rsp) 1652 movaps %xmm11, 0x90(%rsp) 1653 movaps %xmm12, 0xa0(%rsp) 1654 movaps %xmm13, 0xb0(%rsp) 1655 movaps %xmm14, 0xc0(%rsp) 1656 movaps %xmm15, 0xd0(%rsp) 1657.Lcbc_dec_body: 1658___ 1659$code.=<<___; 1660 mov %rsp, %rbp # backup %rsp 1661.cfi_def_cfa_register %rbp 1662 mov 240($arg4), %eax # rounds 1663 mov $arg1, $inp # backup arguments 1664 mov $arg2, $out 1665 mov $arg3, $len 1666 mov $arg4, $key 1667 mov $arg5, %rbx 1668 shr \$4, $len # bytes to blocks 1669 1670 mov %eax, %edx # rounds 1671 shl \$7, %rax # 128 bytes per inner round key 1672 sub \$`128-32`, %rax # size of bit-sliced key schedule 1673 sub %rax, %rsp 1674 1675 mov %rsp, %rax # pass key schedule 1676 mov $key, %rcx # pass key 1677 mov %edx, %r10d # pass rounds 1678 call _bsaes_key_convert 1679 pxor (%rsp),%xmm7 # fix up 0 round key 1680 movdqa %xmm6,(%rax) # save last round key 1681 movdqa %xmm7,(%rsp) 1682 1683 movdqu (%rbx), @XMM[15] # load IV 1684 sub \$8,$len 1685.Lcbc_dec_loop: 1686 movdqu 0x00($inp), @XMM[0] # load input 1687 movdqu 0x10($inp), @XMM[1] 1688 movdqu 0x20($inp), @XMM[2] 1689 movdqu 0x30($inp), @XMM[3] 1690 movdqu 0x40($inp), @XMM[4] 1691 movdqu 0x50($inp), @XMM[5] 1692 mov %rsp, %rax # pass key schedule 1693 movdqu 0x60($inp), @XMM[6] 1694 mov %edx,%r10d # pass rounds 1695 movdqu 0x70($inp), @XMM[7] 1696 movdqa @XMM[15], 0x20(%rbp) # put aside IV 1697 1698 call _bsaes_decrypt8 1699 1700 pxor 0x20(%rbp), @XMM[0] # ^= IV 1701 movdqu 0x00($inp), @XMM[8] # re-load input 1702 movdqu 0x10($inp), @XMM[9] 1703 pxor @XMM[8], @XMM[1] 1704 movdqu 0x20($inp), @XMM[10] 1705 pxor @XMM[9], @XMM[6] 1706 movdqu 0x30($inp), @XMM[11] 1707 pxor @XMM[10], @XMM[4] 1708 movdqu 0x40($inp), @XMM[12] 1709 pxor @XMM[11], @XMM[2] 1710 movdqu 0x50($inp), @XMM[13] 1711 pxor @XMM[12], @XMM[7] 1712 movdqu 0x60($inp), @XMM[14] 1713 pxor @XMM[13], @XMM[3] 1714 movdqu 0x70($inp), @XMM[15] # IV 1715 pxor @XMM[14], @XMM[5] 1716 movdqu @XMM[0], 0x00($out) # write output 1717 lea 0x80($inp), $inp 1718 movdqu @XMM[1], 0x10($out) 1719 movdqu @XMM[6], 0x20($out) 1720 movdqu @XMM[4], 0x30($out) 1721 movdqu @XMM[2], 0x40($out) 1722 movdqu @XMM[7], 0x50($out) 1723 movdqu @XMM[3], 0x60($out) 1724 movdqu @XMM[5], 0x70($out) 1725 lea 0x80($out), $out 1726 sub \$8,$len 1727 jnc .Lcbc_dec_loop 1728 1729 add \$8,$len 1730 jz .Lcbc_dec_done 1731 1732 movdqu 0x00($inp), @XMM[0] # load input 1733 mov %rsp, %rax # pass key schedule 1734 mov %edx, %r10d # pass rounds 1735 cmp \$2,$len 1736 jb .Lcbc_dec_one 1737 movdqu 0x10($inp), @XMM[1] 1738 je .Lcbc_dec_two 1739 movdqu 0x20($inp), @XMM[2] 1740 cmp \$4,$len 1741 jb .Lcbc_dec_three 1742 movdqu 0x30($inp), @XMM[3] 1743 je .Lcbc_dec_four 1744 movdqu 0x40($inp), @XMM[4] 1745 cmp \$6,$len 1746 jb .Lcbc_dec_five 1747 movdqu 0x50($inp), @XMM[5] 1748 je .Lcbc_dec_six 1749 movdqu 0x60($inp), @XMM[6] 1750 movdqa @XMM[15], 0x20(%rbp) # put aside IV 1751 call _bsaes_decrypt8 1752 pxor 0x20(%rbp), @XMM[0] # ^= IV 1753 movdqu 0x00($inp), @XMM[8] # re-load input 1754 movdqu 0x10($inp), @XMM[9] 1755 pxor @XMM[8], @XMM[1] 1756 movdqu 0x20($inp), @XMM[10] 1757 pxor @XMM[9], @XMM[6] 1758 movdqu 0x30($inp), @XMM[11] 1759 pxor @XMM[10], @XMM[4] 1760 movdqu 0x40($inp), @XMM[12] 1761 pxor @XMM[11], @XMM[2] 1762 movdqu 0x50($inp), @XMM[13] 1763 pxor @XMM[12], @XMM[7] 1764 movdqu 0x60($inp), @XMM[15] # IV 1765 pxor @XMM[13], @XMM[3] 1766 movdqu @XMM[0], 0x00($out) # write output 1767 movdqu @XMM[1], 0x10($out) 1768 movdqu @XMM[6], 0x20($out) 1769 movdqu @XMM[4], 0x30($out) 1770 movdqu @XMM[2], 0x40($out) 1771 movdqu @XMM[7], 0x50($out) 1772 movdqu @XMM[3], 0x60($out) 1773 jmp .Lcbc_dec_done 1774.align 16 1775.Lcbc_dec_six: 1776 movdqa @XMM[15], 0x20(%rbp) # put aside IV 1777 call _bsaes_decrypt8 1778 pxor 0x20(%rbp), @XMM[0] # ^= IV 1779 movdqu 0x00($inp), @XMM[8] # re-load input 1780 movdqu 0x10($inp), @XMM[9] 1781 pxor @XMM[8], @XMM[1] 1782 movdqu 0x20($inp), @XMM[10] 1783 pxor @XMM[9], @XMM[6] 1784 movdqu 0x30($inp), @XMM[11] 1785 pxor @XMM[10], @XMM[4] 1786 movdqu 0x40($inp), @XMM[12] 1787 pxor @XMM[11], @XMM[2] 1788 movdqu 0x50($inp), @XMM[15] # IV 1789 pxor @XMM[12], @XMM[7] 1790 movdqu @XMM[0], 0x00($out) # write output 1791 movdqu @XMM[1], 0x10($out) 1792 movdqu @XMM[6], 0x20($out) 1793 movdqu @XMM[4], 0x30($out) 1794 movdqu @XMM[2], 0x40($out) 1795 movdqu @XMM[7], 0x50($out) 1796 jmp .Lcbc_dec_done 1797.align 16 1798.Lcbc_dec_five: 1799 movdqa @XMM[15], 0x20(%rbp) # put aside IV 1800 call _bsaes_decrypt8 1801 pxor 0x20(%rbp), @XMM[0] # ^= IV 1802 movdqu 0x00($inp), @XMM[8] # re-load input 1803 movdqu 0x10($inp), @XMM[9] 1804 pxor @XMM[8], @XMM[1] 1805 movdqu 0x20($inp), @XMM[10] 1806 pxor @XMM[9], @XMM[6] 1807 movdqu 0x30($inp), @XMM[11] 1808 pxor @XMM[10], @XMM[4] 1809 movdqu 0x40($inp), @XMM[15] # IV 1810 pxor @XMM[11], @XMM[2] 1811 movdqu @XMM[0], 0x00($out) # write output 1812 movdqu @XMM[1], 0x10($out) 1813 movdqu @XMM[6], 0x20($out) 1814 movdqu @XMM[4], 0x30($out) 1815 movdqu @XMM[2], 0x40($out) 1816 jmp .Lcbc_dec_done 1817.align 16 1818.Lcbc_dec_four: 1819 movdqa @XMM[15], 0x20(%rbp) # put aside IV 1820 call _bsaes_decrypt8 1821 pxor 0x20(%rbp), @XMM[0] # ^= IV 1822 movdqu 0x00($inp), @XMM[8] # re-load input 1823 movdqu 0x10($inp), @XMM[9] 1824 pxor @XMM[8], @XMM[1] 1825 movdqu 0x20($inp), @XMM[10] 1826 pxor @XMM[9], @XMM[6] 1827 movdqu 0x30($inp), @XMM[15] # IV 1828 pxor @XMM[10], @XMM[4] 1829 movdqu @XMM[0], 0x00($out) # write output 1830 movdqu @XMM[1], 0x10($out) 1831 movdqu @XMM[6], 0x20($out) 1832 movdqu @XMM[4], 0x30($out) 1833 jmp .Lcbc_dec_done 1834.align 16 1835.Lcbc_dec_three: 1836 movdqa @XMM[15], 0x20(%rbp) # put aside IV 1837 call _bsaes_decrypt8 1838 pxor 0x20(%rbp), @XMM[0] # ^= IV 1839 movdqu 0x00($inp), @XMM[8] # re-load input 1840 movdqu 0x10($inp), @XMM[9] 1841 pxor @XMM[8], @XMM[1] 1842 movdqu 0x20($inp), @XMM[15] # IV 1843 pxor @XMM[9], @XMM[6] 1844 movdqu @XMM[0], 0x00($out) # write output 1845 movdqu @XMM[1], 0x10($out) 1846 movdqu @XMM[6], 0x20($out) 1847 jmp .Lcbc_dec_done 1848.align 16 1849.Lcbc_dec_two: 1850 movdqa @XMM[15], 0x20(%rbp) # put aside IV 1851 call _bsaes_decrypt8 1852 pxor 0x20(%rbp), @XMM[0] # ^= IV 1853 movdqu 0x00($inp), @XMM[8] # re-load input 1854 movdqu 0x10($inp), @XMM[15] # IV 1855 pxor @XMM[8], @XMM[1] 1856 movdqu @XMM[0], 0x00($out) # write output 1857 movdqu @XMM[1], 0x10($out) 1858 jmp .Lcbc_dec_done 1859.align 16 1860.Lcbc_dec_one: 1861 lea ($inp), $arg1 1862 lea 0x20(%rbp), $arg2 # buffer output 1863 lea ($key), $arg3 1864 call asm_AES_decrypt # doesn't touch %xmm 1865 pxor 0x20(%rbp), @XMM[15] # ^= IV 1866 movdqu @XMM[15], ($out) # write output 1867 movdqa @XMM[0], @XMM[15] # IV 1868 1869.Lcbc_dec_done: 1870 movdqu @XMM[15], (%rbx) # return IV 1871 lea (%rsp), %rax 1872 pxor %xmm0, %xmm0 1873.Lcbc_dec_bzero: # wipe key schedule [if any] 1874 movdqa %xmm0, 0x00(%rax) 1875 movdqa %xmm0, 0x10(%rax) 1876 lea 0x20(%rax), %rax 1877 cmp %rax, %rbp 1878 ja .Lcbc_dec_bzero 1879 1880 lea 0x78(%rbp),%rax 1881.cfi_def_cfa %rax,8 1882___ 1883$code.=<<___ if ($win64); 1884 movaps 0x40(%rbp), %xmm6 1885 movaps 0x50(%rbp), %xmm7 1886 movaps 0x60(%rbp), %xmm8 1887 movaps 0x70(%rbp), %xmm9 1888 movaps 0x80(%rbp), %xmm10 1889 movaps 0x90(%rbp), %xmm11 1890 movaps 0xa0(%rbp), %xmm12 1891 movaps 0xb0(%rbp), %xmm13 1892 movaps 0xc0(%rbp), %xmm14 1893 movaps 0xd0(%rbp), %xmm15 1894 lea 0xa0(%rax), %rax 1895.Lcbc_dec_tail: 1896___ 1897$code.=<<___; 1898 mov -48(%rax), %r15 1899.cfi_restore %r15 1900 mov -40(%rax), %r14 1901.cfi_restore %r14 1902 mov -32(%rax), %r13 1903.cfi_restore %r13 1904 mov -24(%rax), %r12 1905.cfi_restore %r12 1906 mov -16(%rax), %rbx 1907.cfi_restore %rbx 1908 mov -8(%rax), %rbp 1909.cfi_restore %rbp 1910 lea (%rax), %rsp # restore %rsp 1911.cfi_def_cfa_register %rsp 1912.Lcbc_dec_epilogue: 1913 ret 1914.cfi_endproc 1915.size bsaes_cbc_encrypt,.-bsaes_cbc_encrypt 1916 1917.globl bsaes_ctr32_encrypt_blocks 1918.type bsaes_ctr32_encrypt_blocks,\@abi-omnipotent 1919.align 16 1920bsaes_ctr32_encrypt_blocks: 1921.cfi_startproc 1922 mov %rsp, %rax 1923.Lctr_enc_prologue: 1924 push %rbp 1925.cfi_push %rbp 1926 push %rbx 1927.cfi_push %rbx 1928 push %r12 1929.cfi_push %r12 1930 push %r13 1931.cfi_push %r13 1932 push %r14 1933.cfi_push %r14 1934 push %r15 1935.cfi_push %r15 1936 lea -0x48(%rsp), %rsp 1937.cfi_adjust_cfa_offset 0x48 1938___ 1939$code.=<<___ if ($win64); 1940 mov 0xa0(%rsp),$arg5 # pull ivp 1941 lea -0xa0(%rsp), %rsp 1942 movaps %xmm6, 0x40(%rsp) 1943 movaps %xmm7, 0x50(%rsp) 1944 movaps %xmm8, 0x60(%rsp) 1945 movaps %xmm9, 0x70(%rsp) 1946 movaps %xmm10, 0x80(%rsp) 1947 movaps %xmm11, 0x90(%rsp) 1948 movaps %xmm12, 0xa0(%rsp) 1949 movaps %xmm13, 0xb0(%rsp) 1950 movaps %xmm14, 0xc0(%rsp) 1951 movaps %xmm15, 0xd0(%rsp) 1952.Lctr_enc_body: 1953___ 1954$code.=<<___; 1955 mov %rsp, %rbp # backup %rsp 1956.cfi_def_cfa_register %rbp 1957 movdqu ($arg5), %xmm0 # load counter 1958 mov 240($arg4), %eax # rounds 1959 mov $arg1, $inp # backup arguments 1960 mov $arg2, $out 1961 mov $arg3, $len 1962 mov $arg4, $key 1963 movdqa %xmm0, 0x20(%rbp) # copy counter 1964 cmp \$8, $arg3 1965 jb .Lctr_enc_short 1966 1967 mov %eax, %ebx # rounds 1968 shl \$7, %rax # 128 bytes per inner round key 1969 sub \$`128-32`, %rax # size of bit-sliced key schedule 1970 sub %rax, %rsp 1971 1972 mov %rsp, %rax # pass key schedule 1973 mov $key, %rcx # pass key 1974 mov %ebx, %r10d # pass rounds 1975 call _bsaes_key_convert 1976 pxor %xmm6,%xmm7 # fix up last round key 1977 movdqa %xmm7,(%rax) # save last round key 1978 1979 movdqa (%rsp), @XMM[9] # load round0 key 1980 lea .LADD1(%rip), %r11 1981 movdqa 0x20(%rbp), @XMM[0] # counter copy 1982 movdqa -0x20(%r11), @XMM[8] # .LSWPUP 1983 pshufb @XMM[8], @XMM[9] # byte swap upper part 1984 pshufb @XMM[8], @XMM[0] 1985 movdqa @XMM[9], (%rsp) # save adjusted round0 key 1986 jmp .Lctr_enc_loop 1987.align 16 1988.Lctr_enc_loop: 1989 movdqa @XMM[0], 0x20(%rbp) # save counter 1990 movdqa @XMM[0], @XMM[1] # prepare 8 counter values 1991 movdqa @XMM[0], @XMM[2] 1992 paddd 0x00(%r11), @XMM[1] # .LADD1 1993 movdqa @XMM[0], @XMM[3] 1994 paddd 0x10(%r11), @XMM[2] # .LADD2 1995 movdqa @XMM[0], @XMM[4] 1996 paddd 0x20(%r11), @XMM[3] # .LADD3 1997 movdqa @XMM[0], @XMM[5] 1998 paddd 0x30(%r11), @XMM[4] # .LADD4 1999 movdqa @XMM[0], @XMM[6] 2000 paddd 0x40(%r11), @XMM[5] # .LADD5 2001 movdqa @XMM[0], @XMM[7] 2002 paddd 0x50(%r11), @XMM[6] # .LADD6 2003 paddd 0x60(%r11), @XMM[7] # .LADD7 2004 2005 # Borrow prologue from _bsaes_encrypt8 to use the opportunity 2006 # to flip byte order in 32-bit counter 2007 movdqa (%rsp), @XMM[9] # round 0 key 2008 lea 0x10(%rsp), %rax # pass key schedule 2009 movdqa -0x10(%r11), @XMM[8] # .LSWPUPM0SR 2010 pxor @XMM[9], @XMM[0] # xor with round0 key 2011 pxor @XMM[9], @XMM[1] 2012 pxor @XMM[9], @XMM[2] 2013 pxor @XMM[9], @XMM[3] 2014 pshufb @XMM[8], @XMM[0] 2015 pshufb @XMM[8], @XMM[1] 2016 pxor @XMM[9], @XMM[4] 2017 pxor @XMM[9], @XMM[5] 2018 pshufb @XMM[8], @XMM[2] 2019 pshufb @XMM[8], @XMM[3] 2020 pxor @XMM[9], @XMM[6] 2021 pxor @XMM[9], @XMM[7] 2022 pshufb @XMM[8], @XMM[4] 2023 pshufb @XMM[8], @XMM[5] 2024 pshufb @XMM[8], @XMM[6] 2025 pshufb @XMM[8], @XMM[7] 2026 lea .LBS0(%rip), %r11 # constants table 2027 mov %ebx,%r10d # pass rounds 2028 2029 call _bsaes_encrypt8_bitslice 2030 2031 sub \$8,$len 2032 jc .Lctr_enc_loop_done 2033 2034 movdqu 0x00($inp), @XMM[8] # load input 2035 movdqu 0x10($inp), @XMM[9] 2036 movdqu 0x20($inp), @XMM[10] 2037 movdqu 0x30($inp), @XMM[11] 2038 movdqu 0x40($inp), @XMM[12] 2039 movdqu 0x50($inp), @XMM[13] 2040 movdqu 0x60($inp), @XMM[14] 2041 movdqu 0x70($inp), @XMM[15] 2042 lea 0x80($inp),$inp 2043 pxor @XMM[0], @XMM[8] 2044 movdqa 0x20(%rbp), @XMM[0] # load counter 2045 pxor @XMM[9], @XMM[1] 2046 movdqu @XMM[8], 0x00($out) # write output 2047 pxor @XMM[10], @XMM[4] 2048 movdqu @XMM[1], 0x10($out) 2049 pxor @XMM[11], @XMM[6] 2050 movdqu @XMM[4], 0x20($out) 2051 pxor @XMM[12], @XMM[3] 2052 movdqu @XMM[6], 0x30($out) 2053 pxor @XMM[13], @XMM[7] 2054 movdqu @XMM[3], 0x40($out) 2055 pxor @XMM[14], @XMM[2] 2056 movdqu @XMM[7], 0x50($out) 2057 pxor @XMM[15], @XMM[5] 2058 movdqu @XMM[2], 0x60($out) 2059 lea .LADD1(%rip), %r11 2060 movdqu @XMM[5], 0x70($out) 2061 lea 0x80($out), $out 2062 paddd 0x70(%r11), @XMM[0] # .LADD8 2063 jnz .Lctr_enc_loop 2064 2065 jmp .Lctr_enc_done 2066.align 16 2067.Lctr_enc_loop_done: 2068 add \$8, $len 2069 movdqu 0x00($inp), @XMM[8] # load input 2070 pxor @XMM[8], @XMM[0] 2071 movdqu @XMM[0], 0x00($out) # write output 2072 cmp \$2,$len 2073 jb .Lctr_enc_done 2074 movdqu 0x10($inp), @XMM[9] 2075 pxor @XMM[9], @XMM[1] 2076 movdqu @XMM[1], 0x10($out) 2077 je .Lctr_enc_done 2078 movdqu 0x20($inp), @XMM[10] 2079 pxor @XMM[10], @XMM[4] 2080 movdqu @XMM[4], 0x20($out) 2081 cmp \$4,$len 2082 jb .Lctr_enc_done 2083 movdqu 0x30($inp), @XMM[11] 2084 pxor @XMM[11], @XMM[6] 2085 movdqu @XMM[6], 0x30($out) 2086 je .Lctr_enc_done 2087 movdqu 0x40($inp), @XMM[12] 2088 pxor @XMM[12], @XMM[3] 2089 movdqu @XMM[3], 0x40($out) 2090 cmp \$6,$len 2091 jb .Lctr_enc_done 2092 movdqu 0x50($inp), @XMM[13] 2093 pxor @XMM[13], @XMM[7] 2094 movdqu @XMM[7], 0x50($out) 2095 je .Lctr_enc_done 2096 movdqu 0x60($inp), @XMM[14] 2097 pxor @XMM[14], @XMM[2] 2098 movdqu @XMM[2], 0x60($out) 2099 jmp .Lctr_enc_done 2100 2101.align 16 2102.Lctr_enc_short: 2103 lea 0x20(%rbp), $arg1 2104 lea 0x30(%rbp), $arg2 2105 lea ($key), $arg3 2106 call asm_AES_encrypt 2107 movdqu ($inp), @XMM[1] 2108 lea 16($inp), $inp 2109 mov 0x2c(%rbp), %eax # load 32-bit counter 2110 bswap %eax 2111 pxor 0x30(%rbp), @XMM[1] 2112 inc %eax # increment 2113 movdqu @XMM[1], ($out) 2114 bswap %eax 2115 lea 16($out), $out 2116 mov %eax, 0x2c(%rsp) # save 32-bit counter 2117 dec $len 2118 jnz .Lctr_enc_short 2119 2120.Lctr_enc_done: 2121 lea (%rsp), %rax 2122 pxor %xmm0, %xmm0 2123.Lctr_enc_bzero: # wipe key schedule [if any] 2124 movdqa %xmm0, 0x00(%rax) 2125 movdqa %xmm0, 0x10(%rax) 2126 lea 0x20(%rax), %rax 2127 cmp %rax, %rbp 2128 ja .Lctr_enc_bzero 2129 2130 lea 0x78(%rbp),%rax 2131.cfi_def_cfa %rax,8 2132___ 2133$code.=<<___ if ($win64); 2134 movaps 0x40(%rbp), %xmm6 2135 movaps 0x50(%rbp), %xmm7 2136 movaps 0x60(%rbp), %xmm8 2137 movaps 0x70(%rbp), %xmm9 2138 movaps 0x80(%rbp), %xmm10 2139 movaps 0x90(%rbp), %xmm11 2140 movaps 0xa0(%rbp), %xmm12 2141 movaps 0xb0(%rbp), %xmm13 2142 movaps 0xc0(%rbp), %xmm14 2143 movaps 0xd0(%rbp), %xmm15 2144 lea 0xa0(%rax), %rax 2145.Lctr_enc_tail: 2146___ 2147$code.=<<___; 2148 mov -48(%rax), %r15 2149.cfi_restore %r15 2150 mov -40(%rax), %r14 2151.cfi_restore %r14 2152 mov -32(%rax), %r13 2153.cfi_restore %r13 2154 mov -24(%rax), %r12 2155.cfi_restore %r12 2156 mov -16(%rax), %rbx 2157.cfi_restore %rbx 2158 mov -8(%rax), %rbp 2159.cfi_restore %rbp 2160 lea (%rax), %rsp # restore %rsp 2161.cfi_def_cfa_register %rsp 2162.Lctr_enc_epilogue: 2163 ret 2164.cfi_endproc 2165.size bsaes_ctr32_encrypt_blocks,.-bsaes_ctr32_encrypt_blocks 2166___ 2167###################################################################### 2168# void bsaes_xts_[en|de]crypt(const char *inp,char *out,size_t len, 2169# const AES_KEY *key1, const AES_KEY *key2, 2170# const unsigned char iv[16]); 2171# 2172my ($twmask,$twres,$twtmp)=@XMM[13..15]; 2173$arg6=~s/d$//; 2174 2175$code.=<<___; 2176.globl bsaes_xts_encrypt 2177.type bsaes_xts_encrypt,\@abi-omnipotent 2178.align 16 2179bsaes_xts_encrypt: 2180.cfi_startproc 2181 mov %rsp, %rax 2182.Lxts_enc_prologue: 2183 push %rbp 2184.cfi_push %rbp 2185 push %rbx 2186.cfi_push %rbx 2187 push %r12 2188.cfi_push %r12 2189 push %r13 2190.cfi_push %r13 2191 push %r14 2192.cfi_push %r14 2193 push %r15 2194.cfi_push %r15 2195 lea -0x48(%rsp), %rsp 2196.cfi_adjust_cfa_offset 0x48 2197___ 2198$code.=<<___ if ($win64); 2199 mov 0xa0(%rsp),$arg5 # pull key2 2200 mov 0xa8(%rsp),$arg6 # pull ivp 2201 lea -0xa0(%rsp), %rsp 2202 movaps %xmm6, 0x40(%rsp) 2203 movaps %xmm7, 0x50(%rsp) 2204 movaps %xmm8, 0x60(%rsp) 2205 movaps %xmm9, 0x70(%rsp) 2206 movaps %xmm10, 0x80(%rsp) 2207 movaps %xmm11, 0x90(%rsp) 2208 movaps %xmm12, 0xa0(%rsp) 2209 movaps %xmm13, 0xb0(%rsp) 2210 movaps %xmm14, 0xc0(%rsp) 2211 movaps %xmm15, 0xd0(%rsp) 2212.Lxts_enc_body: 2213___ 2214$code.=<<___; 2215 mov %rsp, %rbp # backup %rsp 2216.cfi_def_cfa_register %rbp 2217 mov $arg1, $inp # backup arguments 2218 mov $arg2, $out 2219 mov $arg3, $len 2220 mov $arg4, $key 2221 2222 lea ($arg6), $arg1 2223 lea 0x20(%rbp), $arg2 2224 lea ($arg5), $arg3 2225 call asm_AES_encrypt # generate initial tweak 2226 2227 mov 240($key), %eax # rounds 2228 mov $len, %rbx # backup $len 2229 2230 mov %eax, %edx # rounds 2231 shl \$7, %rax # 128 bytes per inner round key 2232 sub \$`128-32`, %rax # size of bit-sliced key schedule 2233 sub %rax, %rsp 2234 2235 mov %rsp, %rax # pass key schedule 2236 mov $key, %rcx # pass key 2237 mov %edx, %r10d # pass rounds 2238 call _bsaes_key_convert 2239 pxor %xmm6, %xmm7 # fix up last round key 2240 movdqa %xmm7, (%rax) # save last round key 2241 2242 and \$-16, $len 2243 sub \$0x80, %rsp # place for tweak[8] 2244 movdqa 0x20(%rbp), @XMM[7] # initial tweak 2245 2246 pxor $twtmp, $twtmp 2247 movdqa .Lxts_magic(%rip), $twmask 2248 pcmpgtd @XMM[7], $twtmp # broadcast upper bits 2249 2250 sub \$0x80, $len 2251 jc .Lxts_enc_short 2252 jmp .Lxts_enc_loop 2253 2254.align 16 2255.Lxts_enc_loop: 2256___ 2257 for ($i=0;$i<7;$i++) { 2258 $code.=<<___; 2259 pshufd \$0x13, $twtmp, $twres 2260 pxor $twtmp, $twtmp 2261 movdqa @XMM[7], @XMM[$i] 2262 movdqa @XMM[7], `0x10*$i`(%rsp)# save tweak[$i] 2263 paddq @XMM[7], @XMM[7] # psllq 1,$tweak 2264 pand $twmask, $twres # isolate carry and residue 2265 pcmpgtd @XMM[7], $twtmp # broadcast upper bits 2266 pxor $twres, @XMM[7] 2267___ 2268 $code.=<<___ if ($i>=1); 2269 movdqu `0x10*($i-1)`($inp), @XMM[8+$i-1] 2270___ 2271 $code.=<<___ if ($i>=2); 2272 pxor @XMM[8+$i-2], @XMM[$i-2]# input[] ^ tweak[] 2273___ 2274 } 2275$code.=<<___; 2276 movdqu 0x60($inp), @XMM[8+6] 2277 pxor @XMM[8+5], @XMM[5] 2278 movdqu 0x70($inp), @XMM[8+7] 2279 lea 0x80($inp), $inp 2280 movdqa @XMM[7], 0x70(%rsp) 2281 pxor @XMM[8+6], @XMM[6] 2282 lea 0x80(%rsp), %rax # pass key schedule 2283 pxor @XMM[8+7], @XMM[7] 2284 mov %edx, %r10d # pass rounds 2285 2286 call _bsaes_encrypt8 2287 2288 pxor 0x00(%rsp), @XMM[0] # ^= tweak[] 2289 pxor 0x10(%rsp), @XMM[1] 2290 movdqu @XMM[0], 0x00($out) # write output 2291 pxor 0x20(%rsp), @XMM[4] 2292 movdqu @XMM[1], 0x10($out) 2293 pxor 0x30(%rsp), @XMM[6] 2294 movdqu @XMM[4], 0x20($out) 2295 pxor 0x40(%rsp), @XMM[3] 2296 movdqu @XMM[6], 0x30($out) 2297 pxor 0x50(%rsp), @XMM[7] 2298 movdqu @XMM[3], 0x40($out) 2299 pxor 0x60(%rsp), @XMM[2] 2300 movdqu @XMM[7], 0x50($out) 2301 pxor 0x70(%rsp), @XMM[5] 2302 movdqu @XMM[2], 0x60($out) 2303 movdqu @XMM[5], 0x70($out) 2304 lea 0x80($out), $out 2305 2306 movdqa 0x70(%rsp), @XMM[7] # prepare next iteration tweak 2307 pxor $twtmp, $twtmp 2308 movdqa .Lxts_magic(%rip), $twmask 2309 pcmpgtd @XMM[7], $twtmp 2310 pshufd \$0x13, $twtmp, $twres 2311 pxor $twtmp, $twtmp 2312 paddq @XMM[7], @XMM[7] # psllq 1,$tweak 2313 pand $twmask, $twres # isolate carry and residue 2314 pcmpgtd @XMM[7], $twtmp # broadcast upper bits 2315 pxor $twres, @XMM[7] 2316 2317 sub \$0x80,$len 2318 jnc .Lxts_enc_loop 2319 2320.Lxts_enc_short: 2321 add \$0x80, $len 2322 jz .Lxts_enc_done 2323___ 2324 for ($i=0;$i<7;$i++) { 2325 $code.=<<___; 2326 pshufd \$0x13, $twtmp, $twres 2327 pxor $twtmp, $twtmp 2328 movdqa @XMM[7], @XMM[$i] 2329 movdqa @XMM[7], `0x10*$i`(%rsp)# save tweak[$i] 2330 paddq @XMM[7], @XMM[7] # psllq 1,$tweak 2331 pand $twmask, $twres # isolate carry and residue 2332 pcmpgtd @XMM[7], $twtmp # broadcast upper bits 2333 pxor $twres, @XMM[7] 2334___ 2335 $code.=<<___ if ($i>=1); 2336 movdqu `0x10*($i-1)`($inp), @XMM[8+$i-1] 2337 cmp \$`0x10*$i`,$len 2338 je .Lxts_enc_$i 2339___ 2340 $code.=<<___ if ($i>=2); 2341 pxor @XMM[8+$i-2], @XMM[$i-2]# input[] ^ tweak[] 2342___ 2343 } 2344$code.=<<___; 2345 movdqu 0x60($inp), @XMM[8+6] 2346 pxor @XMM[8+5], @XMM[5] 2347 movdqa @XMM[7], 0x70(%rsp) 2348 lea 0x70($inp), $inp 2349 pxor @XMM[8+6], @XMM[6] 2350 lea 0x80(%rsp), %rax # pass key schedule 2351 mov %edx, %r10d # pass rounds 2352 2353 call _bsaes_encrypt8 2354 2355 pxor 0x00(%rsp), @XMM[0] # ^= tweak[] 2356 pxor 0x10(%rsp), @XMM[1] 2357 movdqu @XMM[0], 0x00($out) # write output 2358 pxor 0x20(%rsp), @XMM[4] 2359 movdqu @XMM[1], 0x10($out) 2360 pxor 0x30(%rsp), @XMM[6] 2361 movdqu @XMM[4], 0x20($out) 2362 pxor 0x40(%rsp), @XMM[3] 2363 movdqu @XMM[6], 0x30($out) 2364 pxor 0x50(%rsp), @XMM[7] 2365 movdqu @XMM[3], 0x40($out) 2366 pxor 0x60(%rsp), @XMM[2] 2367 movdqu @XMM[7], 0x50($out) 2368 movdqu @XMM[2], 0x60($out) 2369 lea 0x70($out), $out 2370 2371 movdqa 0x70(%rsp), @XMM[7] # next iteration tweak 2372 jmp .Lxts_enc_done 2373.align 16 2374.Lxts_enc_6: 2375 pxor @XMM[8+4], @XMM[4] 2376 lea 0x60($inp), $inp 2377 pxor @XMM[8+5], @XMM[5] 2378 lea 0x80(%rsp), %rax # pass key schedule 2379 mov %edx, %r10d # pass rounds 2380 2381 call _bsaes_encrypt8 2382 2383 pxor 0x00(%rsp), @XMM[0] # ^= tweak[] 2384 pxor 0x10(%rsp), @XMM[1] 2385 movdqu @XMM[0], 0x00($out) # write output 2386 pxor 0x20(%rsp), @XMM[4] 2387 movdqu @XMM[1], 0x10($out) 2388 pxor 0x30(%rsp), @XMM[6] 2389 movdqu @XMM[4], 0x20($out) 2390 pxor 0x40(%rsp), @XMM[3] 2391 movdqu @XMM[6], 0x30($out) 2392 pxor 0x50(%rsp), @XMM[7] 2393 movdqu @XMM[3], 0x40($out) 2394 movdqu @XMM[7], 0x50($out) 2395 lea 0x60($out), $out 2396 2397 movdqa 0x60(%rsp), @XMM[7] # next iteration tweak 2398 jmp .Lxts_enc_done 2399.align 16 2400.Lxts_enc_5: 2401 pxor @XMM[8+3], @XMM[3] 2402 lea 0x50($inp), $inp 2403 pxor @XMM[8+4], @XMM[4] 2404 lea 0x80(%rsp), %rax # pass key schedule 2405 mov %edx, %r10d # pass rounds 2406 2407 call _bsaes_encrypt8 2408 2409 pxor 0x00(%rsp), @XMM[0] # ^= tweak[] 2410 pxor 0x10(%rsp), @XMM[1] 2411 movdqu @XMM[0], 0x00($out) # write output 2412 pxor 0x20(%rsp), @XMM[4] 2413 movdqu @XMM[1], 0x10($out) 2414 pxor 0x30(%rsp), @XMM[6] 2415 movdqu @XMM[4], 0x20($out) 2416 pxor 0x40(%rsp), @XMM[3] 2417 movdqu @XMM[6], 0x30($out) 2418 movdqu @XMM[3], 0x40($out) 2419 lea 0x50($out), $out 2420 2421 movdqa 0x50(%rsp), @XMM[7] # next iteration tweak 2422 jmp .Lxts_enc_done 2423.align 16 2424.Lxts_enc_4: 2425 pxor @XMM[8+2], @XMM[2] 2426 lea 0x40($inp), $inp 2427 pxor @XMM[8+3], @XMM[3] 2428 lea 0x80(%rsp), %rax # pass key schedule 2429 mov %edx, %r10d # pass rounds 2430 2431 call _bsaes_encrypt8 2432 2433 pxor 0x00(%rsp), @XMM[0] # ^= tweak[] 2434 pxor 0x10(%rsp), @XMM[1] 2435 movdqu @XMM[0], 0x00($out) # write output 2436 pxor 0x20(%rsp), @XMM[4] 2437 movdqu @XMM[1], 0x10($out) 2438 pxor 0x30(%rsp), @XMM[6] 2439 movdqu @XMM[4], 0x20($out) 2440 movdqu @XMM[6], 0x30($out) 2441 lea 0x40($out), $out 2442 2443 movdqa 0x40(%rsp), @XMM[7] # next iteration tweak 2444 jmp .Lxts_enc_done 2445.align 16 2446.Lxts_enc_3: 2447 pxor @XMM[8+1], @XMM[1] 2448 lea 0x30($inp), $inp 2449 pxor @XMM[8+2], @XMM[2] 2450 lea 0x80(%rsp), %rax # pass key schedule 2451 mov %edx, %r10d # pass rounds 2452 2453 call _bsaes_encrypt8 2454 2455 pxor 0x00(%rsp), @XMM[0] # ^= tweak[] 2456 pxor 0x10(%rsp), @XMM[1] 2457 movdqu @XMM[0], 0x00($out) # write output 2458 pxor 0x20(%rsp), @XMM[4] 2459 movdqu @XMM[1], 0x10($out) 2460 movdqu @XMM[4], 0x20($out) 2461 lea 0x30($out), $out 2462 2463 movdqa 0x30(%rsp), @XMM[7] # next iteration tweak 2464 jmp .Lxts_enc_done 2465.align 16 2466.Lxts_enc_2: 2467 pxor @XMM[8+0], @XMM[0] 2468 lea 0x20($inp), $inp 2469 pxor @XMM[8+1], @XMM[1] 2470 lea 0x80(%rsp), %rax # pass key schedule 2471 mov %edx, %r10d # pass rounds 2472 2473 call _bsaes_encrypt8 2474 2475 pxor 0x00(%rsp), @XMM[0] # ^= tweak[] 2476 pxor 0x10(%rsp), @XMM[1] 2477 movdqu @XMM[0], 0x00($out) # write output 2478 movdqu @XMM[1], 0x10($out) 2479 lea 0x20($out), $out 2480 2481 movdqa 0x20(%rsp), @XMM[7] # next iteration tweak 2482 jmp .Lxts_enc_done 2483.align 16 2484.Lxts_enc_1: 2485 pxor @XMM[0], @XMM[8] 2486 lea 0x10($inp), $inp 2487 movdqa @XMM[8], 0x20(%rbp) 2488 lea 0x20(%rbp), $arg1 2489 lea 0x20(%rbp), $arg2 2490 lea ($key), $arg3 2491 call asm_AES_encrypt # doesn't touch %xmm 2492 pxor 0x20(%rbp), @XMM[0] # ^= tweak[] 2493 #pxor @XMM[8], @XMM[0] 2494 #lea 0x80(%rsp), %rax # pass key schedule 2495 #mov %edx, %r10d # pass rounds 2496 #call _bsaes_encrypt8 2497 #pxor 0x00(%rsp), @XMM[0] # ^= tweak[] 2498 movdqu @XMM[0], 0x00($out) # write output 2499 lea 0x10($out), $out 2500 2501 movdqa 0x10(%rsp), @XMM[7] # next iteration tweak 2502 2503.Lxts_enc_done: 2504 and \$15, %ebx 2505 jz .Lxts_enc_ret 2506 mov $out, %rdx 2507 2508.Lxts_enc_steal: 2509 movzb ($inp), %eax 2510 movzb -16(%rdx), %ecx 2511 lea 1($inp), $inp 2512 mov %al, -16(%rdx) 2513 mov %cl, 0(%rdx) 2514 lea 1(%rdx), %rdx 2515 sub \$1,%ebx 2516 jnz .Lxts_enc_steal 2517 2518 movdqu -16($out), @XMM[0] 2519 lea 0x20(%rbp), $arg1 2520 pxor @XMM[7], @XMM[0] 2521 lea 0x20(%rbp), $arg2 2522 movdqa @XMM[0], 0x20(%rbp) 2523 lea ($key), $arg3 2524 call asm_AES_encrypt # doesn't touch %xmm 2525 pxor 0x20(%rbp), @XMM[7] 2526 movdqu @XMM[7], -16($out) 2527 2528.Lxts_enc_ret: 2529 lea (%rsp), %rax 2530 pxor %xmm0, %xmm0 2531.Lxts_enc_bzero: # wipe key schedule [if any] 2532 movdqa %xmm0, 0x00(%rax) 2533 movdqa %xmm0, 0x10(%rax) 2534 lea 0x20(%rax), %rax 2535 cmp %rax, %rbp 2536 ja .Lxts_enc_bzero 2537 2538 lea 0x78(%rbp),%rax 2539.cfi_def_cfa %rax,8 2540___ 2541$code.=<<___ if ($win64); 2542 movaps 0x40(%rbp), %xmm6 2543 movaps 0x50(%rbp), %xmm7 2544 movaps 0x60(%rbp), %xmm8 2545 movaps 0x70(%rbp), %xmm9 2546 movaps 0x80(%rbp), %xmm10 2547 movaps 0x90(%rbp), %xmm11 2548 movaps 0xa0(%rbp), %xmm12 2549 movaps 0xb0(%rbp), %xmm13 2550 movaps 0xc0(%rbp), %xmm14 2551 movaps 0xd0(%rbp), %xmm15 2552 lea 0xa0(%rax), %rax 2553.Lxts_enc_tail: 2554___ 2555$code.=<<___; 2556 mov -48(%rax), %r15 2557.cfi_restore %r15 2558 mov -40(%rax), %r14 2559.cfi_restore %r14 2560 mov -32(%rax), %r13 2561.cfi_restore %r13 2562 mov -24(%rax), %r12 2563.cfi_restore %r12 2564 mov -16(%rax), %rbx 2565.cfi_restore %rbx 2566 mov -8(%rax), %rbp 2567.cfi_restore %rbp 2568 lea (%rax), %rsp # restore %rsp 2569.cfi_def_cfa_register %rsp 2570.Lxts_enc_epilogue: 2571 ret 2572.cfi_endproc 2573.size bsaes_xts_encrypt,.-bsaes_xts_encrypt 2574 2575.globl bsaes_xts_decrypt 2576.type bsaes_xts_decrypt,\@abi-omnipotent 2577.align 16 2578bsaes_xts_decrypt: 2579.cfi_startproc 2580 mov %rsp, %rax 2581.Lxts_dec_prologue: 2582 push %rbp 2583.cfi_push %rbp 2584 push %rbx 2585.cfi_push %rbx 2586 push %r12 2587.cfi_push %r12 2588 push %r13 2589.cfi_push %r13 2590 push %r14 2591.cfi_push %r14 2592 push %r15 2593.cfi_push %r15 2594 lea -0x48(%rsp), %rsp 2595.cfi_adjust_cfa_offset 0x48 2596___ 2597$code.=<<___ if ($win64); 2598 mov 0xa0(%rsp),$arg5 # pull key2 2599 mov 0xa8(%rsp),$arg6 # pull ivp 2600 lea -0xa0(%rsp), %rsp 2601 movaps %xmm6, 0x40(%rsp) 2602 movaps %xmm7, 0x50(%rsp) 2603 movaps %xmm8, 0x60(%rsp) 2604 movaps %xmm9, 0x70(%rsp) 2605 movaps %xmm10, 0x80(%rsp) 2606 movaps %xmm11, 0x90(%rsp) 2607 movaps %xmm12, 0xa0(%rsp) 2608 movaps %xmm13, 0xb0(%rsp) 2609 movaps %xmm14, 0xc0(%rsp) 2610 movaps %xmm15, 0xd0(%rsp) 2611.Lxts_dec_body: 2612___ 2613$code.=<<___; 2614 mov %rsp, %rbp # backup %rsp 2615 mov $arg1, $inp # backup arguments 2616 mov $arg2, $out 2617 mov $arg3, $len 2618 mov $arg4, $key 2619 2620 lea ($arg6), $arg1 2621 lea 0x20(%rbp), $arg2 2622 lea ($arg5), $arg3 2623 call asm_AES_encrypt # generate initial tweak 2624 2625 mov 240($key), %eax # rounds 2626 mov $len, %rbx # backup $len 2627 2628 mov %eax, %edx # rounds 2629 shl \$7, %rax # 128 bytes per inner round key 2630 sub \$`128-32`, %rax # size of bit-sliced key schedule 2631 sub %rax, %rsp 2632 2633 mov %rsp, %rax # pass key schedule 2634 mov $key, %rcx # pass key 2635 mov %edx, %r10d # pass rounds 2636 call _bsaes_key_convert 2637 pxor (%rsp), %xmm7 # fix up round 0 key 2638 movdqa %xmm6, (%rax) # save last round key 2639 movdqa %xmm7, (%rsp) 2640 2641 xor %eax, %eax # if ($len%16) len-=16; 2642 and \$-16, $len 2643 test \$15, %ebx 2644 setnz %al 2645 shl \$4, %rax 2646 sub %rax, $len 2647 2648 sub \$0x80, %rsp # place for tweak[8] 2649 movdqa 0x20(%rbp), @XMM[7] # initial tweak 2650 2651 pxor $twtmp, $twtmp 2652 movdqa .Lxts_magic(%rip), $twmask 2653 pcmpgtd @XMM[7], $twtmp # broadcast upper bits 2654 2655 sub \$0x80, $len 2656 jc .Lxts_dec_short 2657 jmp .Lxts_dec_loop 2658 2659.align 16 2660.Lxts_dec_loop: 2661___ 2662 for ($i=0;$i<7;$i++) { 2663 $code.=<<___; 2664 pshufd \$0x13, $twtmp, $twres 2665 pxor $twtmp, $twtmp 2666 movdqa @XMM[7], @XMM[$i] 2667 movdqa @XMM[7], `0x10*$i`(%rsp)# save tweak[$i] 2668 paddq @XMM[7], @XMM[7] # psllq 1,$tweak 2669 pand $twmask, $twres # isolate carry and residue 2670 pcmpgtd @XMM[7], $twtmp # broadcast upper bits 2671 pxor $twres, @XMM[7] 2672___ 2673 $code.=<<___ if ($i>=1); 2674 movdqu `0x10*($i-1)`($inp), @XMM[8+$i-1] 2675___ 2676 $code.=<<___ if ($i>=2); 2677 pxor @XMM[8+$i-2], @XMM[$i-2]# input[] ^ tweak[] 2678___ 2679 } 2680$code.=<<___; 2681 movdqu 0x60($inp), @XMM[8+6] 2682 pxor @XMM[8+5], @XMM[5] 2683 movdqu 0x70($inp), @XMM[8+7] 2684 lea 0x80($inp), $inp 2685 movdqa @XMM[7], 0x70(%rsp) 2686 pxor @XMM[8+6], @XMM[6] 2687 lea 0x80(%rsp), %rax # pass key schedule 2688 pxor @XMM[8+7], @XMM[7] 2689 mov %edx, %r10d # pass rounds 2690 2691 call _bsaes_decrypt8 2692 2693 pxor 0x00(%rsp), @XMM[0] # ^= tweak[] 2694 pxor 0x10(%rsp), @XMM[1] 2695 movdqu @XMM[0], 0x00($out) # write output 2696 pxor 0x20(%rsp), @XMM[6] 2697 movdqu @XMM[1], 0x10($out) 2698 pxor 0x30(%rsp), @XMM[4] 2699 movdqu @XMM[6], 0x20($out) 2700 pxor 0x40(%rsp), @XMM[2] 2701 movdqu @XMM[4], 0x30($out) 2702 pxor 0x50(%rsp), @XMM[7] 2703 movdqu @XMM[2], 0x40($out) 2704 pxor 0x60(%rsp), @XMM[3] 2705 movdqu @XMM[7], 0x50($out) 2706 pxor 0x70(%rsp), @XMM[5] 2707 movdqu @XMM[3], 0x60($out) 2708 movdqu @XMM[5], 0x70($out) 2709 lea 0x80($out), $out 2710 2711 movdqa 0x70(%rsp), @XMM[7] # prepare next iteration tweak 2712 pxor $twtmp, $twtmp 2713 movdqa .Lxts_magic(%rip), $twmask 2714 pcmpgtd @XMM[7], $twtmp 2715 pshufd \$0x13, $twtmp, $twres 2716 pxor $twtmp, $twtmp 2717 paddq @XMM[7], @XMM[7] # psllq 1,$tweak 2718 pand $twmask, $twres # isolate carry and residue 2719 pcmpgtd @XMM[7], $twtmp # broadcast upper bits 2720 pxor $twres, @XMM[7] 2721 2722 sub \$0x80,$len 2723 jnc .Lxts_dec_loop 2724 2725.Lxts_dec_short: 2726 add \$0x80, $len 2727 jz .Lxts_dec_done 2728___ 2729 for ($i=0;$i<7;$i++) { 2730 $code.=<<___; 2731 pshufd \$0x13, $twtmp, $twres 2732 pxor $twtmp, $twtmp 2733 movdqa @XMM[7], @XMM[$i] 2734 movdqa @XMM[7], `0x10*$i`(%rsp)# save tweak[$i] 2735 paddq @XMM[7], @XMM[7] # psllq 1,$tweak 2736 pand $twmask, $twres # isolate carry and residue 2737 pcmpgtd @XMM[7], $twtmp # broadcast upper bits 2738 pxor $twres, @XMM[7] 2739___ 2740 $code.=<<___ if ($i>=1); 2741 movdqu `0x10*($i-1)`($inp), @XMM[8+$i-1] 2742 cmp \$`0x10*$i`,$len 2743 je .Lxts_dec_$i 2744___ 2745 $code.=<<___ if ($i>=2); 2746 pxor @XMM[8+$i-2], @XMM[$i-2]# input[] ^ tweak[] 2747___ 2748 } 2749$code.=<<___; 2750 movdqu 0x60($inp), @XMM[8+6] 2751 pxor @XMM[8+5], @XMM[5] 2752 movdqa @XMM[7], 0x70(%rsp) 2753 lea 0x70($inp), $inp 2754 pxor @XMM[8+6], @XMM[6] 2755 lea 0x80(%rsp), %rax # pass key schedule 2756 mov %edx, %r10d # pass rounds 2757 2758 call _bsaes_decrypt8 2759 2760 pxor 0x00(%rsp), @XMM[0] # ^= tweak[] 2761 pxor 0x10(%rsp), @XMM[1] 2762 movdqu @XMM[0], 0x00($out) # write output 2763 pxor 0x20(%rsp), @XMM[6] 2764 movdqu @XMM[1], 0x10($out) 2765 pxor 0x30(%rsp), @XMM[4] 2766 movdqu @XMM[6], 0x20($out) 2767 pxor 0x40(%rsp), @XMM[2] 2768 movdqu @XMM[4], 0x30($out) 2769 pxor 0x50(%rsp), @XMM[7] 2770 movdqu @XMM[2], 0x40($out) 2771 pxor 0x60(%rsp), @XMM[3] 2772 movdqu @XMM[7], 0x50($out) 2773 movdqu @XMM[3], 0x60($out) 2774 lea 0x70($out), $out 2775 2776 movdqa 0x70(%rsp), @XMM[7] # next iteration tweak 2777 jmp .Lxts_dec_done 2778.align 16 2779.Lxts_dec_6: 2780 pxor @XMM[8+4], @XMM[4] 2781 lea 0x60($inp), $inp 2782 pxor @XMM[8+5], @XMM[5] 2783 lea 0x80(%rsp), %rax # pass key schedule 2784 mov %edx, %r10d # pass rounds 2785 2786 call _bsaes_decrypt8 2787 2788 pxor 0x00(%rsp), @XMM[0] # ^= tweak[] 2789 pxor 0x10(%rsp), @XMM[1] 2790 movdqu @XMM[0], 0x00($out) # write output 2791 pxor 0x20(%rsp), @XMM[6] 2792 movdqu @XMM[1], 0x10($out) 2793 pxor 0x30(%rsp), @XMM[4] 2794 movdqu @XMM[6], 0x20($out) 2795 pxor 0x40(%rsp), @XMM[2] 2796 movdqu @XMM[4], 0x30($out) 2797 pxor 0x50(%rsp), @XMM[7] 2798 movdqu @XMM[2], 0x40($out) 2799 movdqu @XMM[7], 0x50($out) 2800 lea 0x60($out), $out 2801 2802 movdqa 0x60(%rsp), @XMM[7] # next iteration tweak 2803 jmp .Lxts_dec_done 2804.align 16 2805.Lxts_dec_5: 2806 pxor @XMM[8+3], @XMM[3] 2807 lea 0x50($inp), $inp 2808 pxor @XMM[8+4], @XMM[4] 2809 lea 0x80(%rsp), %rax # pass key schedule 2810 mov %edx, %r10d # pass rounds 2811 2812 call _bsaes_decrypt8 2813 2814 pxor 0x00(%rsp), @XMM[0] # ^= tweak[] 2815 pxor 0x10(%rsp), @XMM[1] 2816 movdqu @XMM[0], 0x00($out) # write output 2817 pxor 0x20(%rsp), @XMM[6] 2818 movdqu @XMM[1], 0x10($out) 2819 pxor 0x30(%rsp), @XMM[4] 2820 movdqu @XMM[6], 0x20($out) 2821 pxor 0x40(%rsp), @XMM[2] 2822 movdqu @XMM[4], 0x30($out) 2823 movdqu @XMM[2], 0x40($out) 2824 lea 0x50($out), $out 2825 2826 movdqa 0x50(%rsp), @XMM[7] # next iteration tweak 2827 jmp .Lxts_dec_done 2828.align 16 2829.Lxts_dec_4: 2830 pxor @XMM[8+2], @XMM[2] 2831 lea 0x40($inp), $inp 2832 pxor @XMM[8+3], @XMM[3] 2833 lea 0x80(%rsp), %rax # pass key schedule 2834 mov %edx, %r10d # pass rounds 2835 2836 call _bsaes_decrypt8 2837 2838 pxor 0x00(%rsp), @XMM[0] # ^= tweak[] 2839 pxor 0x10(%rsp), @XMM[1] 2840 movdqu @XMM[0], 0x00($out) # write output 2841 pxor 0x20(%rsp), @XMM[6] 2842 movdqu @XMM[1], 0x10($out) 2843 pxor 0x30(%rsp), @XMM[4] 2844 movdqu @XMM[6], 0x20($out) 2845 movdqu @XMM[4], 0x30($out) 2846 lea 0x40($out), $out 2847 2848 movdqa 0x40(%rsp), @XMM[7] # next iteration tweak 2849 jmp .Lxts_dec_done 2850.align 16 2851.Lxts_dec_3: 2852 pxor @XMM[8+1], @XMM[1] 2853 lea 0x30($inp), $inp 2854 pxor @XMM[8+2], @XMM[2] 2855 lea 0x80(%rsp), %rax # pass key schedule 2856 mov %edx, %r10d # pass rounds 2857 2858 call _bsaes_decrypt8 2859 2860 pxor 0x00(%rsp), @XMM[0] # ^= tweak[] 2861 pxor 0x10(%rsp), @XMM[1] 2862 movdqu @XMM[0], 0x00($out) # write output 2863 pxor 0x20(%rsp), @XMM[6] 2864 movdqu @XMM[1], 0x10($out) 2865 movdqu @XMM[6], 0x20($out) 2866 lea 0x30($out), $out 2867 2868 movdqa 0x30(%rsp), @XMM[7] # next iteration tweak 2869 jmp .Lxts_dec_done 2870.align 16 2871.Lxts_dec_2: 2872 pxor @XMM[8+0], @XMM[0] 2873 lea 0x20($inp), $inp 2874 pxor @XMM[8+1], @XMM[1] 2875 lea 0x80(%rsp), %rax # pass key schedule 2876 mov %edx, %r10d # pass rounds 2877 2878 call _bsaes_decrypt8 2879 2880 pxor 0x00(%rsp), @XMM[0] # ^= tweak[] 2881 pxor 0x10(%rsp), @XMM[1] 2882 movdqu @XMM[0], 0x00($out) # write output 2883 movdqu @XMM[1], 0x10($out) 2884 lea 0x20($out), $out 2885 2886 movdqa 0x20(%rsp), @XMM[7] # next iteration tweak 2887 jmp .Lxts_dec_done 2888.align 16 2889.Lxts_dec_1: 2890 pxor @XMM[0], @XMM[8] 2891 lea 0x10($inp), $inp 2892 movdqa @XMM[8], 0x20(%rbp) 2893 lea 0x20(%rbp), $arg1 2894 lea 0x20(%rbp), $arg2 2895 lea ($key), $arg3 2896 call asm_AES_decrypt # doesn't touch %xmm 2897 pxor 0x20(%rbp), @XMM[0] # ^= tweak[] 2898 #pxor @XMM[8], @XMM[0] 2899 #lea 0x80(%rsp), %rax # pass key schedule 2900 #mov %edx, %r10d # pass rounds 2901 #call _bsaes_decrypt8 2902 #pxor 0x00(%rsp), @XMM[0] # ^= tweak[] 2903 movdqu @XMM[0], 0x00($out) # write output 2904 lea 0x10($out), $out 2905 2906 movdqa 0x10(%rsp), @XMM[7] # next iteration tweak 2907 2908.Lxts_dec_done: 2909 and \$15, %ebx 2910 jz .Lxts_dec_ret 2911 2912 pxor $twtmp, $twtmp 2913 movdqa .Lxts_magic(%rip), $twmask 2914 pcmpgtd @XMM[7], $twtmp 2915 pshufd \$0x13, $twtmp, $twres 2916 movdqa @XMM[7], @XMM[6] 2917 paddq @XMM[7], @XMM[7] # psllq 1,$tweak 2918 pand $twmask, $twres # isolate carry and residue 2919 movdqu ($inp), @XMM[0] 2920 pxor $twres, @XMM[7] 2921 2922 lea 0x20(%rbp), $arg1 2923 pxor @XMM[7], @XMM[0] 2924 lea 0x20(%rbp), $arg2 2925 movdqa @XMM[0], 0x20(%rbp) 2926 lea ($key), $arg3 2927 call asm_AES_decrypt # doesn't touch %xmm 2928 pxor 0x20(%rbp), @XMM[7] 2929 mov $out, %rdx 2930 movdqu @XMM[7], ($out) 2931 2932.Lxts_dec_steal: 2933 movzb 16($inp), %eax 2934 movzb (%rdx), %ecx 2935 lea 1($inp), $inp 2936 mov %al, (%rdx) 2937 mov %cl, 16(%rdx) 2938 lea 1(%rdx), %rdx 2939 sub \$1,%ebx 2940 jnz .Lxts_dec_steal 2941 2942 movdqu ($out), @XMM[0] 2943 lea 0x20(%rbp), $arg1 2944 pxor @XMM[6], @XMM[0] 2945 lea 0x20(%rbp), $arg2 2946 movdqa @XMM[0], 0x20(%rbp) 2947 lea ($key), $arg3 2948 call asm_AES_decrypt # doesn't touch %xmm 2949 pxor 0x20(%rbp), @XMM[6] 2950 movdqu @XMM[6], ($out) 2951 2952.Lxts_dec_ret: 2953 lea (%rsp), %rax 2954 pxor %xmm0, %xmm0 2955.Lxts_dec_bzero: # wipe key schedule [if any] 2956 movdqa %xmm0, 0x00(%rax) 2957 movdqa %xmm0, 0x10(%rax) 2958 lea 0x20(%rax), %rax 2959 cmp %rax, %rbp 2960 ja .Lxts_dec_bzero 2961 2962 lea 0x78(%rbp),%rax 2963.cfi_def_cfa %rax,8 2964___ 2965$code.=<<___ if ($win64); 2966 movaps 0x40(%rbp), %xmm6 2967 movaps 0x50(%rbp), %xmm7 2968 movaps 0x60(%rbp), %xmm8 2969 movaps 0x70(%rbp), %xmm9 2970 movaps 0x80(%rbp), %xmm10 2971 movaps 0x90(%rbp), %xmm11 2972 movaps 0xa0(%rbp), %xmm12 2973 movaps 0xb0(%rbp), %xmm13 2974 movaps 0xc0(%rbp), %xmm14 2975 movaps 0xd0(%rbp), %xmm15 2976 lea 0xa0(%rax), %rax 2977.Lxts_dec_tail: 2978___ 2979$code.=<<___; 2980 mov -48(%rax), %r15 2981.cfi_restore %r15 2982 mov -40(%rax), %r14 2983.cfi_restore %r14 2984 mov -32(%rax), %r13 2985.cfi_restore %r13 2986 mov -24(%rax), %r12 2987.cfi_restore %r12 2988 mov -16(%rax), %rbx 2989.cfi_restore %rbx 2990 mov -8(%rax), %rbp 2991.cfi_restore %rbp 2992 lea (%rax), %rsp # restore %rsp 2993.cfi_def_cfa_register %rsp 2994.Lxts_dec_epilogue: 2995 ret 2996.cfi_endproc 2997.size bsaes_xts_decrypt,.-bsaes_xts_decrypt 2998___ 2999} 3000$code.=<<___; 3001.type _bsaes_const,\@object 3002.align 64 3003_bsaes_const: 3004.LM0ISR: # InvShiftRows constants 3005 .quad 0x0a0e0206070b0f03, 0x0004080c0d010509 3006.LISRM0: 3007 .quad 0x01040b0e0205080f, 0x0306090c00070a0d 3008.LISR: 3009 .quad 0x0504070602010003, 0x0f0e0d0c080b0a09 3010.LBS0: # bit-slice constants 3011 .quad 0x5555555555555555, 0x5555555555555555 3012.LBS1: 3013 .quad 0x3333333333333333, 0x3333333333333333 3014.LBS2: 3015 .quad 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f 3016.LSR: # shiftrows constants 3017 .quad 0x0504070600030201, 0x0f0e0d0c0a09080b 3018.LSRM0: 3019 .quad 0x0304090e00050a0f, 0x01060b0c0207080d 3020.LM0SR: 3021 .quad 0x0a0e02060f03070b, 0x0004080c05090d01 3022.LSWPUP: # byte-swap upper dword 3023 .quad 0x0706050403020100, 0x0c0d0e0f0b0a0908 3024.LSWPUPM0SR: 3025 .quad 0x0a0d02060c03070b, 0x0004080f05090e01 3026.LADD1: # counter increment constants 3027 .quad 0x0000000000000000, 0x0000000100000000 3028.LADD2: 3029 .quad 0x0000000000000000, 0x0000000200000000 3030.LADD3: 3031 .quad 0x0000000000000000, 0x0000000300000000 3032.LADD4: 3033 .quad 0x0000000000000000, 0x0000000400000000 3034.LADD5: 3035 .quad 0x0000000000000000, 0x0000000500000000 3036.LADD6: 3037 .quad 0x0000000000000000, 0x0000000600000000 3038.LADD7: 3039 .quad 0x0000000000000000, 0x0000000700000000 3040.LADD8: 3041 .quad 0x0000000000000000, 0x0000000800000000 3042.Lxts_magic: 3043 .long 0x87,0,1,0 3044.Lmasks: 3045 .quad 0x0101010101010101, 0x0101010101010101 3046 .quad 0x0202020202020202, 0x0202020202020202 3047 .quad 0x0404040404040404, 0x0404040404040404 3048 .quad 0x0808080808080808, 0x0808080808080808 3049.LM0: 3050 .quad 0x02060a0e03070b0f, 0x0004080c0105090d 3051.L63: 3052 .quad 0x6363636363636363, 0x6363636363636363 3053.asciz "Bit-sliced AES for x86_64/SSSE3, Emilia Käsper, Peter Schwabe, Andy Polyakov" 3054.align 64 3055.size _bsaes_const,.-_bsaes_const 3056___ 3057 3058# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame, 3059# CONTEXT *context,DISPATCHER_CONTEXT *disp) 3060if ($win64) { 3061$rec="%rcx"; 3062$frame="%rdx"; 3063$context="%r8"; 3064$disp="%r9"; 3065 3066$code.=<<___; 3067.extern __imp_RtlVirtualUnwind 3068.type se_handler,\@abi-omnipotent 3069.align 16 3070se_handler: 3071 push %rsi 3072 push %rdi 3073 push %rbx 3074 push %rbp 3075 push %r12 3076 push %r13 3077 push %r14 3078 push %r15 3079 pushfq 3080 sub \$64,%rsp 3081 3082 mov 120($context),%rax # pull context->Rax 3083 mov 248($context),%rbx # pull context->Rip 3084 3085 mov 8($disp),%rsi # disp->ImageBase 3086 mov 56($disp),%r11 # disp->HandlerData 3087 3088 mov 0(%r11),%r10d # HandlerData[0] 3089 lea (%rsi,%r10),%r10 # prologue label 3090 cmp %r10,%rbx # context->Rip<=prologue label 3091 jbe .Lin_prologue 3092 3093 mov 4(%r11),%r10d # HandlerData[1] 3094 lea (%rsi,%r10),%r10 # epilogue label 3095 cmp %r10,%rbx # context->Rip>=epilogue label 3096 jae .Lin_prologue 3097 3098 mov 8(%r11),%r10d # HandlerData[2] 3099 lea (%rsi,%r10),%r10 # epilogue label 3100 cmp %r10,%rbx # context->Rip>=tail label 3101 jae .Lin_tail 3102 3103 mov 160($context),%rax # pull context->Rbp 3104 3105 lea 0x40(%rax),%rsi # %xmm save area 3106 lea 512($context),%rdi # &context.Xmm6 3107 mov \$20,%ecx # 10*sizeof(%xmm0)/sizeof(%rax) 3108 .long 0xa548f3fc # cld; rep movsq 3109 lea 0xa0+0x78(%rax),%rax # adjust stack pointer 3110 3111.Lin_tail: 3112 mov -48(%rax),%rbp 3113 mov -40(%rax),%rbx 3114 mov -32(%rax),%r12 3115 mov -24(%rax),%r13 3116 mov -16(%rax),%r14 3117 mov -8(%rax),%r15 3118 mov %rbx,144($context) # restore context->Rbx 3119 mov %rbp,160($context) # restore context->Rbp 3120 mov %r12,216($context) # restore context->R12 3121 mov %r13,224($context) # restore context->R13 3122 mov %r14,232($context) # restore context->R14 3123 mov %r15,240($context) # restore context->R15 3124 3125.Lin_prologue: 3126 mov %rax,152($context) # restore context->Rsp 3127 3128 mov 40($disp),%rdi # disp->ContextRecord 3129 mov $context,%rsi # context 3130 mov \$`1232/8`,%ecx # sizeof(CONTEXT) 3131 .long 0xa548f3fc # cld; rep movsq 3132 3133 mov $disp,%rsi 3134 xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER 3135 mov 8(%rsi),%rdx # arg2, disp->ImageBase 3136 mov 0(%rsi),%r8 # arg3, disp->ControlPc 3137 mov 16(%rsi),%r9 # arg4, disp->FunctionEntry 3138 mov 40(%rsi),%r10 # disp->ContextRecord 3139 lea 56(%rsi),%r11 # &disp->HandlerData 3140 lea 24(%rsi),%r12 # &disp->EstablisherFrame 3141 mov %r10,32(%rsp) # arg5 3142 mov %r11,40(%rsp) # arg6 3143 mov %r12,48(%rsp) # arg7 3144 mov %rcx,56(%rsp) # arg8, (NULL) 3145 call *__imp_RtlVirtualUnwind(%rip) 3146 3147 mov \$1,%eax # ExceptionContinueSearch 3148 add \$64,%rsp 3149 popfq 3150 pop %r15 3151 pop %r14 3152 pop %r13 3153 pop %r12 3154 pop %rbp 3155 pop %rbx 3156 pop %rdi 3157 pop %rsi 3158 ret 3159.size se_handler,.-se_handler 3160 3161.section .pdata 3162.align 4 3163___ 3164$code.=<<___ if ($ecb); 3165 .rva .Lecb_enc_prologue 3166 .rva .Lecb_enc_epilogue 3167 .rva .Lecb_enc_info 3168 3169 .rva .Lecb_dec_prologue 3170 .rva .Lecb_dec_epilogue 3171 .rva .Lecb_dec_info 3172___ 3173$code.=<<___; 3174 .rva .Lcbc_dec_prologue 3175 .rva .Lcbc_dec_epilogue 3176 .rva .Lcbc_dec_info 3177 3178 .rva .Lctr_enc_prologue 3179 .rva .Lctr_enc_epilogue 3180 .rva .Lctr_enc_info 3181 3182 .rva .Lxts_enc_prologue 3183 .rva .Lxts_enc_epilogue 3184 .rva .Lxts_enc_info 3185 3186 .rva .Lxts_dec_prologue 3187 .rva .Lxts_dec_epilogue 3188 .rva .Lxts_dec_info 3189 3190.section .xdata 3191.align 8 3192___ 3193$code.=<<___ if ($ecb); 3194.Lecb_enc_info: 3195 .byte 9,0,0,0 3196 .rva se_handler 3197 .rva .Lecb_enc_body,.Lecb_enc_epilogue # HandlerData[] 3198 .rva .Lecb_enc_tail 3199 .long 0 3200.Lecb_dec_info: 3201 .byte 9,0,0,0 3202 .rva se_handler 3203 .rva .Lecb_dec_body,.Lecb_dec_epilogue # HandlerData[] 3204 .rva .Lecb_dec_tail 3205 .long 0 3206___ 3207$code.=<<___; 3208.Lcbc_dec_info: 3209 .byte 9,0,0,0 3210 .rva se_handler 3211 .rva .Lcbc_dec_body,.Lcbc_dec_epilogue # HandlerData[] 3212 .rva .Lcbc_dec_tail 3213 .long 0 3214.Lctr_enc_info: 3215 .byte 9,0,0,0 3216 .rva se_handler 3217 .rva .Lctr_enc_body,.Lctr_enc_epilogue # HandlerData[] 3218 .rva .Lctr_enc_tail 3219 .long 0 3220.Lxts_enc_info: 3221 .byte 9,0,0,0 3222 .rva se_handler 3223 .rva .Lxts_enc_body,.Lxts_enc_epilogue # HandlerData[] 3224 .rva .Lxts_enc_tail 3225 .long 0 3226.Lxts_dec_info: 3227 .byte 9,0,0,0 3228 .rva se_handler 3229 .rva .Lxts_dec_body,.Lxts_dec_epilogue # HandlerData[] 3230 .rva .Lxts_dec_tail 3231 .long 0 3232___ 3233} 3234 3235$code =~ s/\`([^\`]*)\`/eval($1)/gem; 3236 3237print $code; 3238 3239close STDOUT; 3240