#! /usr/bin/env perl
# Copyright 2011-2023 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License").  You may not use
# this file except in compliance with the License.  You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html


# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================

# September 2011
#
# Assembler helpers for Padlock engine. See even e_padlock-x86.pl for
# details.

$flavour = shift;
$output  = shift;
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }

$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

# Locate the perlasm translator relative to this script and pipe all
# generated code through it; everything printed to STDOUT below ends up
# in $output after translation.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../crypto/perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""
	or die "can't call $xlate: $!";
*STDOUT=*OUT;

$code=".text\n";

%PADLOCK_PREFETCH=(ecb=>128, cbc=>64, ctr32=>32);	# prefetch errata
$PADLOCK_CHUNK=512;	# Must be a power of 2 between 32 and 2^20

$ctx="%rdx";
$out="%rdi";
$inp="%rsi";
$len="%rcx";
$chunk="%rbx";

($arg1,$arg2,$arg3,$arg4)=$win64?("%rcx","%rdx","%r8", "%r9") :	# Win64 order
                                 ("%rdi","%rsi","%rdx","%rcx");	# Unix order

$code.=<<___;
.globl	padlock_capability
.type	padlock_capability,\@abi-omnipotent
.align	16
padlock_capability:
	mov	%rbx,%r8
	xor	%eax,%eax
	cpuid
	xor	%eax,%eax
	cmp	\$`"0x".unpack("H*",'tneC')`,%ebx
	jne	.Lzhaoxin
	cmp	\$`"0x".unpack("H*",'Hrua')`,%edx
	jne	.Lnoluck
	cmp	\$`"0x".unpack("H*",'slua')`,%ecx
	jne	.Lnoluck
	jmp	.LzhaoxinEnd
.Lzhaoxin:
	cmp	\$`"0x".unpack("H*",'hS  ')`,%ebx
	jne	.Lnoluck
	cmp	\$`"0x".unpack("H*",'hgna')`,%edx
	jne	.Lnoluck
	cmp	\$`"0x".unpack("H*",'  ia')`,%ecx
	jne	.Lnoluck
.LzhaoxinEnd:
	mov	\$0xC0000000,%eax
	cpuid
	mov	%eax,%edx
	xor	%eax,%eax
	cmp	\$0xC0000001,%edx
	jb	.Lnoluck
	mov	\$0xC0000001,%eax
	cpuid
	mov	%edx,%eax
	and	\$0xffffffef,%eax
	or	\$0x10,%eax		# set Nano bit#4
.Lnoluck:
	mov	%r8,%rbx
	ret
.size	padlock_capability,.-padlock_capability

.globl	padlock_key_bswap
.type	padlock_key_bswap,\@abi-omnipotent,0
.align	16
padlock_key_bswap:
	mov	240($arg1),%edx
	inc	%edx
	shl	\$2,%edx
.Lbswap_loop:
	mov	($arg1),%eax
	bswap	%eax
	mov	%eax,($arg1)
	lea	4($arg1),$arg1
	sub	\$1,%edx
	jnz	.Lbswap_loop
	ret
.size	padlock_key_bswap,.-padlock_key_bswap

.globl	padlock_verify_context
.type	padlock_verify_context,\@abi-omnipotent
.align	16
padlock_verify_context:
	mov	$arg1,$ctx
	pushf
	lea	.Lpadlock_saved_context(%rip),%rax
	call	_padlock_verify_ctx
	lea	8(%rsp),%rsp
	ret
.size	padlock_verify_context,.-padlock_verify_context

.type	_padlock_verify_ctx,\@abi-omnipotent
.align	16
_padlock_verify_ctx:
	mov	8(%rsp),%r8
	bt	\$30,%r8
	jnc	.Lverified
	cmp	(%rax),$ctx
	je	.Lverified
	pushf
	popf
.Lverified:
	mov	$ctx,(%rax)
	ret
.size	_padlock_verify_ctx,.-_padlock_verify_ctx

.globl	padlock_reload_key
.type	padlock_reload_key,\@abi-omnipotent
.align	16
padlock_reload_key:
	pushf
	popf
	ret
.size	padlock_reload_key,.-padlock_reload_key

.globl	padlock_aes_block
.type	padlock_aes_block,\@function,3
.align	16
padlock_aes_block:
	mov	%rbx,%r8
	mov	\$1,$len
	lea	32($ctx),%rbx		# key
	lea	16($ctx),$ctx		# control word
	.byte	0xf3,0x0f,0xa7,0xc8	# rep xcryptecb
	mov	%r8,%rbx
	ret
.size	padlock_aes_block,.-padlock_aes_block

.globl	padlock_xstore
.type	padlock_xstore,\@function,2
.align	16
padlock_xstore:
	mov	%esi,%edx
	.byte	0x0f,0xa7,0xc0		# xstore
	ret
.size	padlock_xstore,.-padlock_xstore

.globl	padlock_sha1_oneshot
.type	padlock_sha1_oneshot,\@function,3
.align	16
padlock_sha1_oneshot:
	mov	%rdx,%rcx
	mov	%rdi,%rdx		# put aside %rdi
	movups	(%rdi),%xmm0		# copy-in context
	sub	\$128+8,%rsp
	mov	16(%rdi),%eax
	movaps	%xmm0,(%rsp)
	mov	%rsp,%rdi
	mov	%eax,16(%rsp)
	xor	%rax,%rax
	.byte	0xf3,0x0f,0xa6,0xc8	# rep xsha1
	movaps	(%rsp),%xmm0
	mov	16(%rsp),%eax
	add	\$128+8,%rsp
	movups	%xmm0,(%rdx)		# copy-out context
	mov	%eax,16(%rdx)
	ret
.size	padlock_sha1_oneshot,.-padlock_sha1_oneshot

.globl	padlock_sha1_blocks
.type	padlock_sha1_blocks,\@function,3
.align	16
padlock_sha1_blocks:
	mov	%rdx,%rcx
	mov	%rdi,%rdx		# put aside %rdi
	movups	(%rdi),%xmm0		# copy-in context
	sub	\$128+8,%rsp
	mov	16(%rdi),%eax
	movaps	%xmm0,(%rsp)
	mov	%rsp,%rdi
	mov	%eax,16(%rsp)
	mov	\$-1,%rax
	.byte	0xf3,0x0f,0xa6,0xc8	# rep xsha1
	movaps	(%rsp),%xmm0
	mov	16(%rsp),%eax
	add	\$128+8,%rsp
	movups	%xmm0,(%rdx)		# copy-out context
	mov	%eax,16(%rdx)
	ret
.size	padlock_sha1_blocks,.-padlock_sha1_blocks

.globl	padlock_sha256_oneshot
.type	padlock_sha256_oneshot,\@function,3
.align	16
padlock_sha256_oneshot:
	mov	%rdx,%rcx
	mov	%rdi,%rdx		# put aside %rdi
	movups	(%rdi),%xmm0		# copy-in context
	sub	\$128+8,%rsp
	movups	16(%rdi),%xmm1
	movaps	%xmm0,(%rsp)
	mov	%rsp,%rdi
	movaps	%xmm1,16(%rsp)
	xor	%rax,%rax
	.byte	0xf3,0x0f,0xa6,0xd0	# rep xsha256
	movaps	(%rsp),%xmm0
	movaps	16(%rsp),%xmm1
	add	\$128+8,%rsp
	movups	%xmm0,(%rdx)		# copy-out context
	movups	%xmm1,16(%rdx)
	ret
.size	padlock_sha256_oneshot,.-padlock_sha256_oneshot

.globl	padlock_sha256_blocks
.type	padlock_sha256_blocks,\@function,3
.align	16
padlock_sha256_blocks:
	mov	%rdx,%rcx
	mov	%rdi,%rdx		# put aside %rdi
	movups	(%rdi),%xmm0		# copy-in context
	sub	\$128+8,%rsp
	movups	16(%rdi),%xmm1
	movaps	%xmm0,(%rsp)
	mov	%rsp,%rdi
	movaps	%xmm1,16(%rsp)
	mov	\$-1,%rax
	.byte	0xf3,0x0f,0xa6,0xd0	# rep xsha256
	movaps	(%rsp),%xmm0
	movaps	16(%rsp),%xmm1
	add	\$128+8,%rsp
	movups	%xmm0,(%rdx)		# copy-out context
	movups	%xmm1,16(%rdx)
	ret
.size	padlock_sha256_blocks,.-padlock_sha256_blocks

.globl	padlock_sha512_blocks
.type	padlock_sha512_blocks,\@function,3
.align	16
padlock_sha512_blocks:
	mov	%rdx,%rcx
	mov	%rdi,%rdx		# put aside %rdi
	movups	(%rdi),%xmm0		# copy-in context
	sub	\$128+8,%rsp
	movups	16(%rdi),%xmm1
	movups	32(%rdi),%xmm2
	movups	48(%rdi),%xmm3
	movaps	%xmm0,(%rsp)
	mov	%rsp,%rdi
	movaps	%xmm1,16(%rsp)
	movaps	%xmm2,32(%rsp)
	movaps	%xmm3,48(%rsp)
	.byte	0xf3,0x0f,0xa6,0xe0	# rep xsha512
	movaps	(%rsp),%xmm0
	movaps	16(%rsp),%xmm1
	movaps	32(%rsp),%xmm2
	movaps	48(%rsp),%xmm3
	add	\$128+8,%rsp
	movups	%xmm0,(%rdx)		# copy-out context
	movups	%xmm1,16(%rdx)
	movups	%xmm2,32(%rdx)
	movups	%xmm3,48(%rdx)
	ret
.size	padlock_sha512_blocks,.-padlock_sha512_blocks
___

# Emit one padlock_${mode}_encrypt routine for the given mode name and
# xcrypt opcode byte.  Mode-specific fragments (ctr32 counter handling,
# prefetch-errata workarounds, iv refresh) are spliced in conditionally.
sub generate_mode {
my ($mode,$opcode) = @_;
# int padlock_$mode_encrypt(void *out, const void *inp,
#		struct padlock_cipher_data *ctx, size_t len);
$code.=<<___;
.globl	padlock_${mode}_encrypt
.type	padlock_${mode}_encrypt,\@function,4
.align	16
padlock_${mode}_encrypt:
	push	%rbp
	push	%rbx

	xor	%eax,%eax
	test	\$15,$ctx
	jnz	.L${mode}_abort
	test	\$15,$len
	jnz	.L${mode}_abort
	lea	.Lpadlock_saved_context(%rip),%rax
	pushf
	cld
	call	_padlock_verify_ctx
	lea	16($ctx),$ctx		# control word
	xor	%eax,%eax
	xor	%ebx,%ebx
	testl	\$`1<<5`,($ctx)		# align bit in control word
	jnz	.L${mode}_aligned
	test	\$0x0f,$out
	setz	%al			# !out_misaligned
	test	\$0x0f,$inp
	setz	%bl			# !inp_misaligned
	test	%ebx,%eax
	jnz	.L${mode}_aligned
	neg	%rax
	mov	\$$PADLOCK_CHUNK,$chunk
	not	%rax			# out_misaligned?-1:0
	lea	(%rsp),%rbp
	cmp	$chunk,$len
	cmovc	$len,$chunk		# chunk=len>PADLOCK_CHUNK?PADLOCK_CHUNK:len
	and	$chunk,%rax		# out_misaligned?chunk:0
	mov	$len,$chunk
	neg	%rax
	and	\$$PADLOCK_CHUNK-1,$chunk	# chunk%=PADLOCK_CHUNK
	lea	(%rax,%rbp),%rsp
	mov	\$$PADLOCK_CHUNK,%rax
	cmovz	%rax,$chunk		# chunk=chunk?:PADLOCK_CHUNK
___
$code.=<<___ if ($mode eq "ctr32");
.L${mode}_reenter:
	mov	-4($ctx),%eax		# pull 32-bit counter
	bswap	%eax
	neg	%eax
	and	\$`$PADLOCK_CHUNK/16-1`,%eax
	mov	\$$PADLOCK_CHUNK,$chunk
	shl	\$4,%eax
	cmovz	$chunk,%rax
	cmp	%rax,$len
	cmova	%rax,$chunk		# don't let counter cross PADLOCK_CHUNK
	cmovbe	$len,$chunk
___
$code.=<<___ if ($PADLOCK_PREFETCH{$mode});
	cmp	$chunk,$len
	ja	.L${mode}_loop
	mov	$inp,%rax		# check if prefetch crosses page
	cmp	%rsp,%rbp
	cmove	$out,%rax
	add	$len,%rax
	neg	%rax
	and	\$0xfff,%rax		# distance to page boundary
	cmp	\$$PADLOCK_PREFETCH{$mode},%rax
	mov	\$-$PADLOCK_PREFETCH{$mode},%rax
	cmovae	$chunk,%rax		# mask=distance<prefetch?-prefetch:-1
	and	%rax,$chunk
	jz	.L${mode}_unaligned_tail
___
$code.=<<___;
	jmp	.L${mode}_loop
.align	16
.L${mode}_loop:
	cmp	$len,$chunk		# ctr32 artefact
	cmova	$len,$chunk		# ctr32 artefact
	mov	$out,%r8		# save parameters
	mov	$inp,%r9
	mov	$len,%r10
	mov	$chunk,$len
	mov	$chunk,%r11
	test	\$0x0f,$out		# out_misaligned
	cmovnz	%rsp,$out
	test	\$0x0f,$inp		# inp_misaligned
	jz	.L${mode}_inp_aligned
	shr	\$3,$len
	.byte	0xf3,0x48,0xa5		# rep movsq
	sub	$chunk,$out
	mov	$chunk,$len
	mov	$out,$inp
.L${mode}_inp_aligned:
	lea	-16($ctx),%rax		# ivp
	lea	16($ctx),%rbx		# key
	shr	\$4,$len
	.byte	0xf3,0x0f,0xa7,$opcode	# rep xcrypt*
___
$code.=<<___ if ($mode !~ /ecb|ctr/);
	movdqa	(%rax),%xmm0
	movdqa	%xmm0,-16($ctx)		# copy [or refresh] iv
___
$code.=<<___ if ($mode eq "ctr32");
	mov	-4($ctx),%eax		# pull 32-bit counter
	test	\$0xffff0000,%eax
	jnz	.L${mode}_no_carry
	bswap	%eax
	add	\$0x10000,%eax
	bswap	%eax
	mov	%eax,-4($ctx)
.L${mode}_no_carry:
___
$code.=<<___;
	mov	%r8,$out		# restore parameters
	mov	%r11,$chunk
	test	\$0x0f,$out
	jz	.L${mode}_out_aligned
	mov	$chunk,$len
	lea	(%rsp),$inp
	shr	\$3,$len
	.byte	0xf3,0x48,0xa5		# rep movsq
	sub	$chunk,$out
.L${mode}_out_aligned:
	mov	%r9,$inp
	mov	%r10,$len
	add	$chunk,$out
	add	$chunk,$inp
	sub	$chunk,$len
	mov	\$$PADLOCK_CHUNK,$chunk
___
				if (!$PADLOCK_PREFETCH{$mode}) {
$code.=<<___;
	jnz	.L${mode}_loop
___
				} else {
$code.=<<___;
	jz	.L${mode}_break
	cmp	$chunk,$len
	jae	.L${mode}_loop
___
$code.=<<___ if ($mode eq "ctr32");
	mov	$len,$chunk
	mov	$inp,%rax		# check if prefetch crosses page
	cmp	%rsp,%rbp
	cmove	$out,%rax
	add	$len,%rax
	neg	%rax
	and	\$0xfff,%rax		# distance to page boundary
	cmp	\$$PADLOCK_PREFETCH{$mode},%rax
	mov	\$-$PADLOCK_PREFETCH{$mode},%rax
	cmovae	$chunk,%rax
	and	%rax,$chunk
	jnz	.L${mode}_loop
___
$code.=<<___;
.L${mode}_unaligned_tail:
	xor	%eax,%eax
	cmp	%rsp,%rbp
	cmove	$len,%rax
	mov	$out,%r8		# save parameters
	mov	$len,$chunk
	sub	%rax,%rsp		# alloca
	shr	\$3,$len
	lea	(%rsp),$out
	.byte	0xf3,0x48,0xa5		# rep movsq
	mov	%rsp,$inp
	mov	%r8, $out		# restore parameters
	mov	$chunk,$len
	jmp	.L${mode}_loop
.align	16
.L${mode}_break:
___
				}
$code.=<<___;
	cmp	%rbp,%rsp
	je	.L${mode}_done

	pxor	%xmm0,%xmm0
	lea	(%rsp),%rax
.L${mode}_bzero:
	movaps	%xmm0,(%rax)
	lea	16(%rax),%rax
	cmp	%rax,%rbp
	ja	.L${mode}_bzero

.L${mode}_done:
	lea	(%rbp),%rsp
	jmp	.L${mode}_exit

.align	16
.L${mode}_aligned:
___
$code.=<<___ if ($mode eq "ctr32");
	mov	-4($ctx),%eax		# pull 32-bit counter
	bswap	%eax
	neg	%eax
	and	\$0xffff,%eax
	mov	\$`16*0x10000`,$chunk
	shl	\$4,%eax
	cmovz	$chunk,%rax
	cmp	%rax,$len
	cmova	%rax,$chunk		# don't let counter cross 2^16
	cmovbe	$len,$chunk
	jbe	.L${mode}_aligned_skip

.L${mode}_aligned_loop:
	mov	$len,%r10		# save parameters
	mov	$chunk,$len
	mov	$chunk,%r11

	lea	-16($ctx),%rax		# ivp
	lea	16($ctx),%rbx		# key
	shr	\$4,$len		# len/=AES_BLOCK_SIZE
	.byte	0xf3,0x0f,0xa7,$opcode	# rep xcrypt*

	mov	-4($ctx),%eax		# pull 32-bit counter
	bswap	%eax
	add	\$0x10000,%eax
	bswap	%eax
	mov	%eax,-4($ctx)

	mov	%r10,$len		# restore parameters
	sub	%r11,$len
	mov	\$`16*0x10000`,$chunk
	jz	.L${mode}_exit
	cmp	$chunk,$len
	jae	.L${mode}_aligned_loop

.L${mode}_aligned_skip:
___
$code.=<<___ if ($PADLOCK_PREFETCH{$mode});
	lea	($inp,$len),%rbp
	neg	%rbp
	and	\$0xfff,%rbp		# distance to page boundary
	xor	%eax,%eax
	cmp	\$$PADLOCK_PREFETCH{$mode},%rbp
	mov	\$$PADLOCK_PREFETCH{$mode}-1,%rbp
	cmovae	%rax,%rbp
	and	$len,%rbp		# remainder
	sub	%rbp,$len
	jz	.L${mode}_aligned_tail
___
$code.=<<___;
	lea	-16($ctx),%rax		# ivp
	lea	16($ctx),%rbx		# key
	shr	\$4,$len		# len/=AES_BLOCK_SIZE
	.byte	0xf3,0x0f,0xa7,$opcode	# rep xcrypt*
___
$code.=<<___ if ($mode !~ /ecb|ctr/);
	movdqa	(%rax),%xmm0
	movdqa	%xmm0,-16($ctx)		# copy [or refresh] iv
___
$code.=<<___ if ($PADLOCK_PREFETCH{$mode});
	test	%rbp,%rbp		# check remainder
	jz	.L${mode}_exit

.L${mode}_aligned_tail:
	mov	$out,%r8
	mov	%rbp,$chunk
	mov	%rbp,$len
	lea	(%rsp),%rbp
	sub	$len,%rsp
	shr	\$3,$len
	lea	(%rsp),$out
	.byte	0xf3,0x48,0xa5		# rep movsq
	lea	(%r8),$out
	lea	(%rsp),$inp
	mov	$chunk,$len
	jmp	.L${mode}_loop
___
$code.=<<___;
.L${mode}_exit:
	mov	\$1,%eax
	lea	8(%rsp),%rsp
.L${mode}_abort:
	pop	%rbx
	pop	%rbp
	ret
.size	padlock_${mode}_encrypt,.-padlock_${mode}_encrypt
___
}

&generate_mode("ecb",0xc8);
&generate_mode("cbc",0xd0);
&generate_mode("cfb",0xe0);
&generate_mode("ofb",0xe8);
&generate_mode("ctr32",0xd8);	# all 64-bit CPUs have working CTR...

$code.=<<___;
.asciz	"VIA Padlock x86_64 module, CRYPTOGAMS by <appro\@openssl.org>"
.align	16
.data
.align	8
.Lpadlock_saved_context:
	.quad	0
___
# Expand `...` constant expressions (e.g. the CPUID vendor-id words) at
# the very end, so they are evaluated exactly once over the whole text.
$code =~ s/\`([^\`]*)\`/eval($1)/gem;

print $code;

close STDOUT or die "error closing STDOUT: $!";