#! /usr/bin/env perl
# Copyright 2011-2023 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License").  You may not use
# this file except in compliance with the License.  You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html


# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================

# September 2011
#
# Assembler helpers for Padlock engine. See even e_padlock-x86.pl for
# details.

# $output is the last argument if it looks like a file (it has an extension)
# $flavour is the first argument if it doesn't look like a file
$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;

$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

# Locate the perlasm translator relative to this script and pipe all of
# our output through it; from here on, print/$code goes to the xlate pipe.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../crypto/perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""
    or die "can't call $xlate: $!";
*STDOUT=*OUT;

$code=".text\n";

%PADLOCK_PREFETCH=(ecb=>128, cbc=>64, ctr32=>32);	# prefetch errata
$PADLOCK_CHUNK=512;	# Must be a power of 2 between 32 and 2^20

# Register aliases used throughout the generated code.
$ctx="%rdx";
$out="%rdi";
$inp="%rsi";
$len="%rcx";
$chunk="%rbx";

($arg1,$arg2,$arg3,$arg4)=$win64?("%rcx","%rdx","%r8", "%r9") :	# Win64 order
                                 ("%rdi","%rsi","%rdx","%rcx");	# Unix order

# Fixed helper routines: capability probe (CPUID vendor check for Centaur
# and Zhaoxin, then the 0xC000000x extended leaves), key byte-swapping,
# context verification/reload, single-block XCRYPT, XSTORE RNG access and
# the XSHA1/XSHA256/XSHA512 hash primitives.
$code.=<<___;
.globl	padlock_capability
.type	padlock_capability,\@abi-omnipotent
.align	16
padlock_capability:
	mov	%rbx,%r8
	xor	%eax,%eax
	cpuid
	xor	%eax,%eax
	cmp	\$`"0x".unpack("H*",'tneC')`,%ebx
	jne	.Lzhaoxin
	cmp	\$`"0x".unpack("H*",'Hrua')`,%edx
	jne	.Lnoluck
	cmp	\$`"0x".unpack("H*",'slua')`,%ecx
	jne	.Lnoluck
	jmp	.LzhaoxinEnd
.Lzhaoxin:
	cmp	\$`"0x".unpack("H*",'hS  ')`,%ebx
	jne	.Lnoluck
	cmp	\$`"0x".unpack("H*",'hgna')`,%edx
	jne	.Lnoluck
	cmp	\$`"0x".unpack("H*",'  ia')`,%ecx
	jne	.Lnoluck
.LzhaoxinEnd:
	mov	\$0xC0000000,%eax
	cpuid
	mov	%eax,%edx
	xor	%eax,%eax
	cmp	\$0xC0000001,%edx
	jb	.Lnoluck
	mov	\$0xC0000001,%eax
	cpuid
	mov	%edx,%eax
	and	\$0xffffffef,%eax
	or	\$0x10,%eax		# set Nano bit#4
.Lnoluck:
	mov	%r8,%rbx
	ret
.size	padlock_capability,.-padlock_capability

.globl	padlock_key_bswap
.type	padlock_key_bswap,\@abi-omnipotent,0
.align	16
padlock_key_bswap:
	mov	240($arg1),%edx
	inc	%edx
	shl	\$2,%edx
.Lbswap_loop:
	mov	($arg1),%eax
	bswap	%eax
	mov	%eax,($arg1)
	lea	4($arg1),$arg1
	sub	\$1,%edx
	jnz	.Lbswap_loop
	ret
.size	padlock_key_bswap,.-padlock_key_bswap

.globl	padlock_verify_context
.type	padlock_verify_context,\@abi-omnipotent
.align	16
padlock_verify_context:
	mov	$arg1,$ctx
	pushf
	lea	.Lpadlock_saved_context(%rip),%rax
	call	_padlock_verify_ctx
	lea	8(%rsp),%rsp
	ret
.size	padlock_verify_context,.-padlock_verify_context

.type	_padlock_verify_ctx,\@abi-omnipotent
.align	16
_padlock_verify_ctx:
	mov	8(%rsp),%r8
	bt	\$30,%r8
	jnc	.Lverified
	cmp	(%rax),$ctx
	je	.Lverified
	pushf
	popf
.Lverified:
	mov	$ctx,(%rax)
	ret
.size	_padlock_verify_ctx,.-_padlock_verify_ctx

.globl	padlock_reload_key
.type	padlock_reload_key,\@abi-omnipotent
.align	16
padlock_reload_key:
	pushf
	popf
	ret
.size	padlock_reload_key,.-padlock_reload_key

.globl	padlock_aes_block
.type	padlock_aes_block,\@function,3
.align	16
padlock_aes_block:
	mov	%rbx,%r8
	mov	\$1,$len
	lea	32($ctx),%rbx		# key
	lea	16($ctx),$ctx		# control word
	.byte	0xf3,0x0f,0xa7,0xc8	# rep xcryptecb
	mov	%r8,%rbx
	ret
.size	padlock_aes_block,.-padlock_aes_block

.globl	padlock_xstore
.type	padlock_xstore,\@function,2
.align	16
padlock_xstore:
	mov	%esi,%edx
	.byte	0x0f,0xa7,0xc0		# xstore
	ret
.size	padlock_xstore,.-padlock_xstore

.globl	padlock_sha1_oneshot
.type	padlock_sha1_oneshot,\@function,3
.align	16
padlock_sha1_oneshot:
	mov	%rdx,%rcx
	mov	%rdi,%rdx		# put aside %rdi
	movups	(%rdi),%xmm0		# copy-in context
	sub	\$128+8,%rsp
	mov	16(%rdi),%eax
	movaps	%xmm0,(%rsp)
	mov	%rsp,%rdi
	mov	%eax,16(%rsp)
	xor	%rax,%rax
	.byte	0xf3,0x0f,0xa6,0xc8	# rep xsha1
	movaps	(%rsp),%xmm0
	mov	16(%rsp),%eax
	add	\$128+8,%rsp
	movups	%xmm0,(%rdx)		# copy-out context
	mov	%eax,16(%rdx)
	ret
.size	padlock_sha1_oneshot,.-padlock_sha1_oneshot

.globl	padlock_sha1_blocks
.type	padlock_sha1_blocks,\@function,3
.align	16
padlock_sha1_blocks:
	mov	%rdx,%rcx
	mov	%rdi,%rdx		# put aside %rdi
	movups	(%rdi),%xmm0		# copy-in context
	sub	\$128+8,%rsp
	mov	16(%rdi),%eax
	movaps	%xmm0,(%rsp)
	mov	%rsp,%rdi
	mov	%eax,16(%rsp)
	mov	\$-1,%rax
	.byte	0xf3,0x0f,0xa6,0xc8	# rep xsha1
	movaps	(%rsp),%xmm0
	mov	16(%rsp),%eax
	add	\$128+8,%rsp
	movups	%xmm0,(%rdx)		# copy-out context
	mov	%eax,16(%rdx)
	ret
.size	padlock_sha1_blocks,.-padlock_sha1_blocks

.globl	padlock_sha256_oneshot
.type	padlock_sha256_oneshot,\@function,3
.align	16
padlock_sha256_oneshot:
	mov	%rdx,%rcx
	mov	%rdi,%rdx		# put aside %rdi
	movups	(%rdi),%xmm0		# copy-in context
	sub	\$128+8,%rsp
	movups	16(%rdi),%xmm1
	movaps	%xmm0,(%rsp)
	mov	%rsp,%rdi
	movaps	%xmm1,16(%rsp)
	xor	%rax,%rax
	.byte	0xf3,0x0f,0xa6,0xd0	# rep xsha256
	movaps	(%rsp),%xmm0
	movaps	16(%rsp),%xmm1
	add	\$128+8,%rsp
	movups	%xmm0,(%rdx)		# copy-out context
	movups	%xmm1,16(%rdx)
	ret
.size	padlock_sha256_oneshot,.-padlock_sha256_oneshot

.globl	padlock_sha256_blocks
.type	padlock_sha256_blocks,\@function,3
.align	16
padlock_sha256_blocks:
	mov	%rdx,%rcx
	mov	%rdi,%rdx		# put aside %rdi
	movups	(%rdi),%xmm0		# copy-in context
	sub	\$128+8,%rsp
	movups	16(%rdi),%xmm1
	movaps	%xmm0,(%rsp)
	mov	%rsp,%rdi
	movaps	%xmm1,16(%rsp)
	mov	\$-1,%rax
	.byte	0xf3,0x0f,0xa6,0xd0	# rep xsha256
	movaps	(%rsp),%xmm0
	movaps	16(%rsp),%xmm1
	add	\$128+8,%rsp
	movups	%xmm0,(%rdx)		# copy-out context
	movups	%xmm1,16(%rdx)
	ret
.size	padlock_sha256_blocks,.-padlock_sha256_blocks

.globl	padlock_sha512_blocks
.type	padlock_sha512_blocks,\@function,3
.align	16
padlock_sha512_blocks:
	mov	%rdx,%rcx
	mov	%rdi,%rdx		# put aside %rdi
	movups	(%rdi),%xmm0		# copy-in context
	sub	\$128+8,%rsp
	movups	16(%rdi),%xmm1
	movups	32(%rdi),%xmm2
	movups	48(%rdi),%xmm3
	movaps	%xmm0,(%rsp)
	mov	%rsp,%rdi
	movaps	%xmm1,16(%rsp)
	movaps	%xmm2,32(%rsp)
	movaps	%xmm3,48(%rsp)
	.byte	0xf3,0x0f,0xa6,0xe0	# rep xsha512
	movaps	(%rsp),%xmm0
	movaps	16(%rsp),%xmm1
	movaps	32(%rsp),%xmm2
	movaps	48(%rsp),%xmm3
	add	\$128+8,%rsp
	movups	%xmm0,(%rdx)		# copy-out context
	movups	%xmm1,16(%rdx)
	movups	%xmm2,32(%rdx)
	movups	%xmm3,48(%rdx)
	ret
.size	padlock_sha512_blocks,.-padlock_sha512_blocks
___

# Emit one padlock_${mode}_encrypt routine for a given cipher mode and
# XCRYPT opcode byte.  The generated routine handles misaligned in/out
# buffers (bouncing through a stack chunk of at most $PADLOCK_CHUNK
# bytes), the prefetch errata listed in %PADLOCK_PREFETCH, and the
# 32-bit counter wrap for ctr32.  Stack scratch is wiped before return.
sub generate_mode {
my ($mode,$opcode) = @_;
# int padlock_$mode_encrypt(void *out, const void *inp,
#		struct padlock_cipher_data *ctx, size_t len);
$code.=<<___;
.globl	padlock_${mode}_encrypt
.type	padlock_${mode}_encrypt,\@function,4
.align	16
padlock_${mode}_encrypt:
	push	%rbp
	push	%rbx

	xor	%eax,%eax
	test	\$15,$ctx
	jnz	.L${mode}_abort
	test	\$15,$len
	jnz	.L${mode}_abort
	lea	.Lpadlock_saved_context(%rip),%rax
	pushf
	cld
	call	_padlock_verify_ctx
	lea	16($ctx),$ctx		# control word
	xor	%eax,%eax
	xor	%ebx,%ebx
	testl	\$`1<<5`,($ctx)		# align bit in control word
	jnz	.L${mode}_aligned
	test	\$0x0f,$out
	setz	%al			# !out_misaligned
	test	\$0x0f,$inp
	setz	%bl			# !inp_misaligned
	test	%ebx,%eax
	jnz	.L${mode}_aligned
	neg	%rax
	mov	\$$PADLOCK_CHUNK,$chunk
	not	%rax			# out_misaligned?-1:0
	lea	(%rsp),%rbp
	cmp	$chunk,$len
	cmovc	$len,$chunk		# chunk=len>PADLOCK_CHUNK?PADLOCK_CHUNK:len
	and	$chunk,%rax		# out_misaligned?chunk:0
	mov	$len,$chunk
	neg	%rax
	and	\$$PADLOCK_CHUNK-1,$chunk	# chunk%=PADLOCK_CHUNK
	lea	(%rax,%rbp),%rsp
	mov	\$$PADLOCK_CHUNK,%rax
	cmovz	%rax,$chunk		# chunk=chunk?:PADLOCK_CHUNK
___
$code.=<<___ if ($mode eq "ctr32");
.L${mode}_reenter:
	mov	-4($ctx),%eax		# pull 32-bit counter
	bswap	%eax
	neg	%eax
	and	\$`$PADLOCK_CHUNK/16-1`,%eax
	mov	\$$PADLOCK_CHUNK,$chunk
	shl	\$4,%eax
	cmovz	$chunk,%rax
	cmp	%rax,$len
	cmova	%rax,$chunk		# don't let counter cross PADLOCK_CHUNK
	cmovbe	$len,$chunk
___
$code.=<<___ if ($PADLOCK_PREFETCH{$mode});
	cmp	$chunk,$len
	ja	.L${mode}_loop
	mov	$inp,%rax		# check if prefetch crosses page
	cmp	%rsp,%rbp
	cmove	$out,%rax
	add	$len,%rax
	neg	%rax
	and	\$0xfff,%rax		# distance to page boundary
	cmp	\$$PADLOCK_PREFETCH{$mode},%rax
	mov	\$-$PADLOCK_PREFETCH{$mode},%rax
	cmovae	$chunk,%rax		# mask=distance<prefetch?-prefetch:-1
	and	%rax,$chunk
	jz	.L${mode}_unaligned_tail
___
$code.=<<___;
	jmp	.L${mode}_loop
.align	16
.L${mode}_loop:
	cmp	$len,$chunk		# ctr32 artefact
	cmova	$len,$chunk		# ctr32 artefact
	mov	$out,%r8		# save parameters
	mov	$inp,%r9
	mov	$len,%r10
	mov	$chunk,$len
	mov	$chunk,%r11
	test	\$0x0f,$out		# out_misaligned
	cmovnz	%rsp,$out
	test	\$0x0f,$inp		# inp_misaligned
	jz	.L${mode}_inp_aligned
	shr	\$3,$len
	.byte	0xf3,0x48,0xa5		# rep movsq
	sub	$chunk,$out
	mov	$chunk,$len
	mov	$out,$inp
.L${mode}_inp_aligned:
	lea	-16($ctx),%rax		# ivp
	lea	16($ctx),%rbx		# key
	shr	\$4,$len
	.byte	0xf3,0x0f,0xa7,$opcode	# rep xcrypt*
___
$code.=<<___ if ($mode !~ /ecb|ctr/);
	movdqa	(%rax),%xmm0
	movdqa	%xmm0,-16($ctx)		# copy [or refresh] iv
___
$code.=<<___ if ($mode eq "ctr32");
	mov	-4($ctx),%eax		# pull 32-bit counter
	test	\$0xffff0000,%eax
	jnz	.L${mode}_no_carry
	bswap	%eax
	add	\$0x10000,%eax
	bswap	%eax
	mov	%eax,-4($ctx)
.L${mode}_no_carry:
___
$code.=<<___;
	mov	%r8,$out		# restore parameters
	mov	%r11,$chunk
	test	\$0x0f,$out
	jz	.L${mode}_out_aligned
	mov	$chunk,$len
	lea	(%rsp),$inp
	shr	\$3,$len
	.byte	0xf3,0x48,0xa5		# rep movsq
	sub	$chunk,$out
.L${mode}_out_aligned:
	mov	%r9,$inp
	mov	%r10,$len
	add	$chunk,$out
	add	$chunk,$inp
	sub	$chunk,$len
	mov	\$$PADLOCK_CHUNK,$chunk
___
				if (!$PADLOCK_PREFETCH{$mode}) {
$code.=<<___;
	jnz	.L${mode}_loop
___
				} else {
$code.=<<___;
	jz	.L${mode}_break
	cmp	$chunk,$len
	jae	.L${mode}_loop
___
$code.=<<___ if ($mode eq "ctr32");
	mov	$len,$chunk
	mov	$inp,%rax		# check if prefetch crosses page
	cmp	%rsp,%rbp
	cmove	$out,%rax
	add	$len,%rax
	neg	%rax
	and	\$0xfff,%rax		# distance to page boundary
	cmp	\$$PADLOCK_PREFETCH{$mode},%rax
	mov	\$-$PADLOCK_PREFETCH{$mode},%rax
	cmovae	$chunk,%rax
	and	%rax,$chunk
	jnz	.L${mode}_loop
___
$code.=<<___;
.L${mode}_unaligned_tail:
	xor	%eax,%eax
	cmp	%rsp,%rbp
	cmove	$len,%rax
	mov	$out,%r8		# save parameters
	mov	$len,$chunk
	sub	%rax,%rsp		# alloca
	shr	\$3,$len
	lea	(%rsp),$out
	.byte	0xf3,0x48,0xa5		# rep movsq
	mov	%rsp,$inp
	mov	%r8, $out		# restore parameters
	mov	$chunk,$len
	jmp	.L${mode}_loop
.align	16
.L${mode}_break:
___
				}
$code.=<<___;
	cmp	%rbp,%rsp
	je	.L${mode}_done

	pxor	%xmm0,%xmm0
	lea	(%rsp),%rax
.L${mode}_bzero:
	movaps	%xmm0,(%rax)
	lea	16(%rax),%rax
	cmp	%rax,%rbp
	ja	.L${mode}_bzero

.L${mode}_done:
	lea	(%rbp),%rsp
	jmp	.L${mode}_exit

.align	16
.L${mode}_aligned:
___
$code.=<<___ if ($mode eq "ctr32");
	mov	-4($ctx),%eax		# pull 32-bit counter
	bswap	%eax
	neg	%eax
	and	\$0xffff,%eax
	mov	\$`16*0x10000`,$chunk
	shl	\$4,%eax
	cmovz	$chunk,%rax
	cmp	%rax,$len
	cmova	%rax,$chunk		# don't let counter cross 2^16
	cmovbe	$len,$chunk
	jbe	.L${mode}_aligned_skip

.L${mode}_aligned_loop:
	mov	$len,%r10		# save parameters
	mov	$chunk,$len
	mov	$chunk,%r11

	lea	-16($ctx),%rax		# ivp
	lea	16($ctx),%rbx		# key
	shr	\$4,$len		# len/=AES_BLOCK_SIZE
	.byte	0xf3,0x0f,0xa7,$opcode	# rep xcrypt*

	mov	-4($ctx),%eax		# pull 32-bit counter
	bswap	%eax
	add	\$0x10000,%eax
	bswap	%eax
	mov	%eax,-4($ctx)

	mov	%r10,$len		# restore parameters
	sub	%r11,$len
	mov	\$`16*0x10000`,$chunk
	jz	.L${mode}_exit
	cmp	$chunk,$len
	jae	.L${mode}_aligned_loop

.L${mode}_aligned_skip:
___
$code.=<<___ if ($PADLOCK_PREFETCH{$mode});
	lea	($inp,$len),%rbp
	neg	%rbp
	and	\$0xfff,%rbp		# distance to page boundary
	xor	%eax,%eax
	cmp	\$$PADLOCK_PREFETCH{$mode},%rbp
	mov	\$$PADLOCK_PREFETCH{$mode}-1,%rbp
	cmovae	%rax,%rbp
	and	$len,%rbp		# remainder
	sub	%rbp,$len
	jz	.L${mode}_aligned_tail
___
$code.=<<___;
	lea	-16($ctx),%rax		# ivp
	lea	16($ctx),%rbx		# key
	shr	\$4,$len		# len/=AES_BLOCK_SIZE
	.byte	0xf3,0x0f,0xa7,$opcode	# rep xcrypt*
___
$code.=<<___ if ($mode !~ /ecb|ctr/);
	movdqa	(%rax),%xmm0
	movdqa	%xmm0,-16($ctx)		# copy [or refresh] iv
___
$code.=<<___ if ($PADLOCK_PREFETCH{$mode});
	test	%rbp,%rbp		# check remainder
	jz	.L${mode}_exit

.L${mode}_aligned_tail:
	mov	$out,%r8
	mov	%rbp,$chunk
	mov	%rbp,$len
	lea	(%rsp),%rbp
	sub	$len,%rsp
	shr	\$3,$len
	lea	(%rsp),$out
	.byte	0xf3,0x48,0xa5		# rep movsq
	lea	(%r8),$out
	lea	(%rsp),$inp
	mov	$chunk,$len
	jmp	.L${mode}_loop
___
$code.=<<___;
.L${mode}_exit:
	mov	\$1,%eax
	lea	8(%rsp),%rsp
.L${mode}_abort:
	pop	%rbx
	pop	%rbp
	ret
.size	padlock_${mode}_encrypt,.-padlock_${mode}_encrypt
___
}

# Instantiate one encrypt routine per mode; second argument is the
# XCRYPT opcode byte for that mode.
generate_mode("ecb",0xc8);
generate_mode("cbc",0xd0);
generate_mode("cfb",0xe0);
generate_mode("ofb",0xe8);
generate_mode("ctr32",0xd8);	# all 64-bit CPUs have working CTR...

$code.=<<___;
.asciz	"VIA Padlock x86_64 module, CRYPTOGAMS by <appro\@openssl.org>"
.align	16
.data
.align	8
.Lpadlock_saved_context:
	.quad	0
___

# Expand `...` escapes (perlasm convention) by eval'ing their contents.
$code =~ s/\`([^\`]*)\`/eval($1)/gem;

print $code;

# The translator sits on the other end of the STDOUT pipe; a failed
# close means the output is incomplete, so report it instead of
# silently producing a truncated .s file.
close STDOUT or die "error closing STDOUT: $!";