#! /usr/bin/env perl
# Copyright 2011-2018 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License").  You may not use
# this file except in compliance with the License.  You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html


# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================

# September 2011
#
# Assembler helpers for Padlock engine. See even e_padlock-x86.pl for
# details.

use strict;
use warnings;

my $flavour = shift;
my $output  = shift;
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }

my $win64 = 0;
$win64 = 1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

# Locate the perlasm translator next to this script or in the tree.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; my $dir = $1;
my $xlate;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../crypto/perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""
	or die "can't call $xlate: $!";
*STDOUT=*OUT;

my $code=".text\n";

my %PADLOCK_PREFETCH=(ecb=>128, cbc=>64, ctr32=>32);	# prefetch errata
my $PADLOCK_CHUNK=512;	# Must be a power of 2 between 32 and 2^20

# Register assignments used throughout the generated code.
my $ctx="%rdx";
my $out="%rdi";
my $inp="%rsi";
my $len="%rcx";
my $chunk="%rbx";

my ($arg1,$arg2,$arg3,$arg4)=$win64?("%rcx","%rdx","%r8", "%r9") :	# Win64 order
                                    ("%rdi","%rsi","%rdx","%rcx");	# Unix order

$code.=<<___;
# int padlock_capability(void);
# Returns the PadLock/ACE capability word (CPUID leaf 0xC0000001 %edx)
# for VIA ("CentaurHauls") and Zhaoxin ("  Shanghai  ") CPUs, with
# Nano bit#4 forced on; returns 0 on any other vendor.
.globl	padlock_capability
.type	padlock_capability,\@abi-omnipotent
.align	16
padlock_capability:
	mov	%rbx,%r8		# cpuid clobbers %rbx; preserve it
	xor	%eax,%eax
	cpuid				# leaf 0: vendor string in ebx:edx:ecx
	xor	%eax,%eax
	cmp	\$`"0x".unpack("H*",'tneC')`,%ebx	# "Cent" (little-endian)
	jne	.Lzhaoxin
	cmp	\$`"0x".unpack("H*",'Hrua')`,%edx	# "aurH"
	jne	.Lnoluck
	cmp	\$`"0x".unpack("H*",'slua')`,%ecx	# "auls"
	jne	.Lnoluck
	jmp	.LzhaoxinEnd
.Lzhaoxin:
	cmp	\$`"0x".unpack("H*",'hS  ')`,%ebx	# "  Sh"
	jne	.Lnoluck
	cmp	\$`"0x".unpack("H*",'hgna')`,%edx	# "angh"
	jne	.Lnoluck
	cmp	\$`"0x".unpack("H*",'  ia')`,%ecx	# "ai  "
	jne	.Lnoluck
.LzhaoxinEnd:
	mov	\$0xC0000000,%eax
	cpuid				# max extended Centaur leaf in %eax
	mov	%eax,%edx
	xor	%eax,%eax
	cmp	\$0xC0000001,%edx
	jb	.Lnoluck		# capability leaf not supported
	mov	\$0xC0000001,%eax
	cpuid
	mov	%edx,%eax
	and	\$0xffffffef,%eax
	or	\$0x10,%eax		# set Nano bit#4
.Lnoluck:
	mov	%r8,%rbx
	ret
.size	padlock_capability,.-padlock_capability

# void padlock_key_bswap(AES_KEY *key);
# Byte-swap the whole expanded AES key schedule in place: the schedule
# holds 4*(rounds+1) 32-bit words, with the round count stored at
# offset 240 (AES_KEY.rounds).
.globl	padlock_key_bswap
.type	padlock_key_bswap,\@abi-omnipotent,0
.align	16
padlock_key_bswap:
	mov	240($arg1),%edx		# pull number of rounds
	inc	%edx			# rounds+1 round keys...
	shl	\$2,%edx		# ...of 4 words each = word count
.Lbswap_loop:
	mov	($arg1),%eax
	bswap	%eax
	mov	%eax,($arg1)
	lea	4($arg1),$arg1
	sub	\$1,%edx
	jnz	.Lbswap_loop
	ret
.size	padlock_key_bswap,.-padlock_key_bswap

# void padlock_verify_context(struct padlock_cipher_data *ctx);
# Ensure the CPU picks up a new context: reload control word via
# pushf/popf if EFLAGS bit 30 is set or the saved context differs.
.globl	padlock_verify_context
.type	padlock_verify_context,\@abi-omnipotent
.align	16
padlock_verify_context:
	mov	$arg1,$ctx
	pushf				# flags examined by _padlock_verify_ctx
	lea	.Lpadlock_saved_context(%rip),%rax
	call	_padlock_verify_ctx
	lea	8(%rsp),%rsp		# drop saved flags
	ret
.size	padlock_verify_context,.-padlock_verify_context

# Internal helper: %rax -> saved-context slot, $ctx = new context,
# caller's saved flags at 8(%rsp).
.type	_padlock_verify_ctx,\@abi-omnipotent
.align	16
_padlock_verify_ctx:
	mov	8(%rsp),%r8		# caller-saved flags
	bt	\$30,%r8		# "context dirty" flag?
	jnc	.Lverified
	cmp	(%rax),$ctx		# same context as last time?
	je	.Lverified
	pushf				# force control word reload
	popf
.Lverified:
	mov	$ctx,(%rax)		# remember current context
	ret
.size	_padlock_verify_ctx,.-_padlock_verify_ctx

# void padlock_reload_key(void);
# pushf/popf forces the PadLock unit to reload the key on next xcrypt.
.globl	padlock_reload_key
.type	padlock_reload_key,\@abi-omnipotent
.align	16
padlock_reload_key:
	pushf
	popf
	ret
.size	padlock_reload_key,.-padlock_reload_key

# void padlock_aes_block(void *out, const void *inp,
#			struct padlock_cipher_data *ctx);
# Encrypt/decrypt a single 16-byte block with rep xcryptecb.
.globl	padlock_aes_block
.type	padlock_aes_block,\@function,3
.align	16
padlock_aes_block:
	mov	%rbx,%r8		# xcrypt uses %rbx; preserve it
	mov	\$1,$len		# one block
	lea	32($ctx),%rbx		# key
	lea	16($ctx),$ctx		# control word
	.byte	0xf3,0x0f,0xa7,0xc8	# rep xcryptecb
	mov	%r8,%rbx
	ret
.size	padlock_aes_block,.-padlock_aes_block

# void padlock_xstore(void *out, int edx);
# Store RNG output via the xstore instruction.
.globl	padlock_xstore
.type	padlock_xstore,\@function,2
.align	16
padlock_xstore:
	mov	%esi,%edx
	.byte	0x0f,0xa7,0xc0		# xstore
	ret
.size	padlock_xstore,.-padlock_xstore

# void padlock_sha1_oneshot(void *ctx, const void *inp, size_t len);
# xsha1 demands a 128-byte aligned scratch context, so the 20-byte
# SHA-1 state is bounced through the stack around the instruction.
.globl	padlock_sha1_oneshot
.type	padlock_sha1_oneshot,\@function,3
.align	16
padlock_sha1_oneshot:
	mov	%rdx,%rcx
	mov	%rdi,%rdx		# put aside %rdi
	movups	(%rdi),%xmm0		# copy-in context
	sub	\$128+8,%rsp
	mov	16(%rdi),%eax
	movaps	%xmm0,(%rsp)
	mov	%rsp,%rdi
	mov	%eax,16(%rsp)
	xor	%rax,%rax		# %rax=0: finalize (one-shot)
	.byte	0xf3,0x0f,0xa6,0xc8	# rep xsha1
	movaps	(%rsp),%xmm0
	mov	16(%rsp),%eax
	add	\$128+8,%rsp
	movups	%xmm0,(%rdx)		# copy-out context
	mov	%eax,16(%rdx)
	ret
.size	padlock_sha1_oneshot,.-padlock_sha1_oneshot

# void padlock_sha1_blocks(void *ctx, const void *inp, size_t len);
# Same as oneshot but %rax=-1 selects multi-block (no padding) mode.
.globl	padlock_sha1_blocks
.type	padlock_sha1_blocks,\@function,3
.align	16
padlock_sha1_blocks:
	mov	%rdx,%rcx
	mov	%rdi,%rdx		# put aside %rdi
	movups	(%rdi),%xmm0		# copy-in context
	sub	\$128+8,%rsp
	mov	16(%rdi),%eax
	movaps	%xmm0,(%rsp)
	mov	%rsp,%rdi
	mov	%eax,16(%rsp)
	mov	\$-1,%rax		# multi-block mode
	.byte	0xf3,0x0f,0xa6,0xc8	# rep xsha1
	movaps	(%rsp),%xmm0
	mov	16(%rsp),%eax
	add	\$128+8,%rsp
	movups	%xmm0,(%rdx)		# copy-out context
	mov	%eax,16(%rdx)
	ret
.size	padlock_sha1_blocks,.-padlock_sha1_blocks

# void padlock_sha256_oneshot(void *ctx, const void *inp, size_t len);
# 32-byte SHA-256 state bounced through aligned stack scratch.
.globl	padlock_sha256_oneshot
.type	padlock_sha256_oneshot,\@function,3
.align	16
padlock_sha256_oneshot:
	mov	%rdx,%rcx
	mov	%rdi,%rdx		# put aside %rdi
	movups	(%rdi),%xmm0		# copy-in context
	sub	\$128+8,%rsp
	movups	16(%rdi),%xmm1
	movaps	%xmm0,(%rsp)
	mov	%rsp,%rdi
	movaps	%xmm1,16(%rsp)
	xor	%rax,%rax		# %rax=0: finalize (one-shot)
	.byte	0xf3,0x0f,0xa6,0xd0	# rep xsha256
	movaps	(%rsp),%xmm0
	movaps	16(%rsp),%xmm1
	add	\$128+8,%rsp
	movups	%xmm0,(%rdx)		# copy-out context
	movups	%xmm1,16(%rdx)
	ret
.size	padlock_sha256_oneshot,.-padlock_sha256_oneshot

# void padlock_sha256_blocks(void *ctx, const void *inp, size_t len);
.globl	padlock_sha256_blocks
.type	padlock_sha256_blocks,\@function,3
.align	16
padlock_sha256_blocks:
	mov	%rdx,%rcx
	mov	%rdi,%rdx		# put aside %rdi
	movups	(%rdi),%xmm0		# copy-in context
	sub	\$128+8,%rsp
	movups	16(%rdi),%xmm1
	movaps	%xmm0,(%rsp)
	mov	%rsp,%rdi
	movaps	%xmm1,16(%rsp)
	mov	\$-1,%rax		# multi-block mode
	.byte	0xf3,0x0f,0xa6,0xd0	# rep xsha256
	movaps	(%rsp),%xmm0
	movaps	16(%rsp),%xmm1
	add	\$128+8,%rsp
	movups	%xmm0,(%rdx)		# copy-out context
	movups	%xmm1,16(%rdx)
	ret
.size	padlock_sha256_blocks,.-padlock_sha256_blocks

# void padlock_sha512_blocks(void *ctx, const void *inp, size_t len);
# 64-byte SHA-512 state bounced through aligned stack scratch.
.globl	padlock_sha512_blocks
.type	padlock_sha512_blocks,\@function,3
.align	16
padlock_sha512_blocks:
	mov	%rdx,%rcx
	mov	%rdi,%rdx		# put aside %rdi
	movups	(%rdi),%xmm0		# copy-in context
	sub	\$128+8,%rsp
	movups	16(%rdi),%xmm1
	movups	32(%rdi),%xmm2
	movups	48(%rdi),%xmm3
	movaps	%xmm0,(%rsp)
	mov	%rsp,%rdi
	movaps	%xmm1,16(%rsp)
	movaps	%xmm2,32(%rsp)
	movaps	%xmm3,48(%rsp)
	.byte	0xf3,0x0f,0xa6,0xe0	# rep xsha512
	movaps	(%rsp),%xmm0
	movaps	16(%rsp),%xmm1
	movaps	32(%rsp),%xmm2
	movaps	48(%rsp),%xmm3
	add	\$128+8,%rsp
	movups	%xmm0,(%rdx)		# copy-out context
	movups	%xmm1,16(%rdx)
	movups	%xmm2,32(%rdx)
	movups	%xmm3,48(%rdx)
	ret
.size	padlock_sha512_blocks,.-padlock_sha512_blocks
___

# Emit one padlock_${mode}_encrypt routine.  $opcode is the last byte
# of the corresponding "rep xcrypt*" instruction encoding.  Misaligned
# input/output is bounced through a stack bounce-buffer processed in
# $PADLOCK_CHUNK-sized pieces; ctr32 additionally limits each chunk so
# the hardware's 32-bit big-endian counter never wraps mid-chunk.
sub generate_mode {
my ($mode,$opcode) = @_;
# int padlock_$mode_encrypt(void *out, const void *inp,
#		struct padlock_cipher_data *ctx, size_t len);
$code.=<<___;
.globl	padlock_${mode}_encrypt
.type	padlock_${mode}_encrypt,\@function,4
.align	16
padlock_${mode}_encrypt:
	push	%rbp
	push	%rbx

	xor	%eax,%eax
	test	\$15,$ctx		# ctx must be 16-byte aligned
	jnz	.L${mode}_abort
	test	\$15,$len		# len must be a block multiple
	jnz	.L${mode}_abort
	lea	.Lpadlock_saved_context(%rip),%rax
	pushf
	cld
	call	_padlock_verify_ctx
	lea	16($ctx),$ctx		# control word
	xor	%eax,%eax
	xor	%ebx,%ebx
	testl	\$`1<<5`,($ctx)		# align bit in control word
	jnz	.L${mode}_aligned
	test	\$0x0f,$out
	setz	%al			# !out_misaligned
	test	\$0x0f,$inp
	setz	%bl			# !inp_misaligned
	test	%ebx,%eax
	jnz	.L${mode}_aligned
	neg	%rax
	mov	\$$PADLOCK_CHUNK,$chunk
	not	%rax			# out_misaligned?-1:0
	lea	(%rsp),%rbp		# remember frame for bzero/restore
	cmp	$chunk,$len
	cmovc	$len,$chunk		# chunk=len>PADLOCK_CHUNK?PADLOCK_CHUNK:len
	and	$chunk,%rax		# out_misaligned?chunk:0
	mov	$len,$chunk
	neg	%rax
	and	\$$PADLOCK_CHUNK-1,$chunk	# chunk%=PADLOCK_CHUNK
	lea	(%rax,%rbp),%rsp	# alloca bounce buffer if needed
	mov	\$$PADLOCK_CHUNK,%rax
	cmovz	%rax,$chunk		# chunk=chunk?:PADLOCK_CHUNK
___
$code.=<<___				if ($mode eq "ctr32");
.L${mode}_reenter:
	mov	-4($ctx),%eax		# pull 32-bit counter
	bswap	%eax
	neg	%eax
	and	\$`$PADLOCK_CHUNK/16-1`,%eax
	mov	\$$PADLOCK_CHUNK,$chunk
	shl	\$4,%eax
	cmovz	$chunk,%rax
	cmp	%rax,$len
	cmova	%rax,$chunk		# don't let counter cross PADLOCK_CHUNK
	cmovbe	$len,$chunk
___
$code.=<<___				if ($PADLOCK_PREFETCH{$mode});
	cmp	$chunk,$len
	ja	.L${mode}_loop
	mov	$inp,%rax		# check if prefetch crosses page
	cmp	%rsp,%rbp
	cmove	$out,%rax
	add	$len,%rax
	neg	%rax
	and	\$0xfff,%rax		# distance to page boundary
	cmp	\$$PADLOCK_PREFETCH{$mode},%rax
	mov	\$-$PADLOCK_PREFETCH{$mode},%rax
	cmovae	$chunk,%rax		# mask=distance<prefetch?-prefetch:-1
	and	%rax,$chunk
	jz	.L${mode}_unaligned_tail
___
$code.=<<___;
	jmp	.L${mode}_loop
.align	16
.L${mode}_loop:
	cmp	$len,$chunk		# ctr32 artefact
	cmova	$len,$chunk		# ctr32 artefact
	mov	$out,%r8		# save parameters
	mov	$inp,%r9
	mov	$len,%r10
	mov	$chunk,$len
	mov	$chunk,%r11
	test	\$0x0f,$out		# out_misaligned
	cmovnz	%rsp,$out		# redirect output to bounce buffer
	test	\$0x0f,$inp		# inp_misaligned
	jz	.L${mode}_inp_aligned
	shr	\$3,$len
	.byte	0xf3,0x48,0xa5		# rep movsq: copy input to buffer
	sub	$chunk,$out
	mov	$chunk,$len
	mov	$out,$inp		# read back from the aligned copy
.L${mode}_inp_aligned:
	lea	-16($ctx),%rax		# ivp
	lea	16($ctx),%rbx		# key
	shr	\$4,$len
	.byte	0xf3,0x0f,0xa7,$opcode	# rep xcrypt*
___
$code.=<<___				if ($mode !~ /ecb|ctr/);
	movdqa	(%rax),%xmm0
	movdqa	%xmm0,-16($ctx)		# copy [or refresh] iv
___
$code.=<<___				if ($mode eq "ctr32");
	mov	-4($ctx),%eax		# pull 32-bit counter
	test	\$0xffff0000,%eax
	jnz	.L${mode}_no_carry
	bswap	%eax
	add	\$0x10000,%eax		# propagate carry into upper half
	bswap	%eax
	mov	%eax,-4($ctx)
.L${mode}_no_carry:
___
$code.=<<___;
	mov	%r8,$out		# restore parameters
	mov	%r11,$chunk
	test	\$0x0f,$out
	jz	.L${mode}_out_aligned
	mov	$chunk,$len
	lea	(%rsp),$inp
	shr	\$3,$len
	.byte	0xf3,0x48,0xa5		# rep movsq: flush bounce buffer
	sub	$chunk,$out
.L${mode}_out_aligned:
	mov	%r9,$inp
	mov	%r10,$len
	add	$chunk,$out		# advance past processed chunk
	add	$chunk,$inp
	sub	$chunk,$len
	mov	\$$PADLOCK_CHUNK,$chunk
___
				if (!$PADLOCK_PREFETCH{$mode}) {
$code.=<<___;
	jnz	.L${mode}_loop
___
				} else {
$code.=<<___;
	jz	.L${mode}_break
	cmp	$chunk,$len
	jae	.L${mode}_loop
___
$code.=<<___				if ($mode eq "ctr32");
	mov	$len,$chunk
	mov	$inp,%rax		# check if prefetch crosses page
	cmp	%rsp,%rbp
	cmove	$out,%rax
	add	$len,%rax
	neg	%rax
	and	\$0xfff,%rax		# distance to page boundary
	cmp	\$$PADLOCK_PREFETCH{$mode},%rax
	mov	\$-$PADLOCK_PREFETCH{$mode},%rax
	cmovae	$chunk,%rax
	and	%rax,$chunk
	jnz	.L${mode}_loop
___
$code.=<<___;
.L${mode}_unaligned_tail:
	xor	%eax,%eax
	cmp	%rsp,%rbp
	cmove	$len,%rax
	mov	$out,%r8		# save parameters
	mov	$len,$chunk
	sub	%rax,%rsp		# alloca
	shr	\$3,$len
	lea	(%rsp),$out
	.byte	0xf3,0x48,0xa5		# rep movsq
	mov	%rsp,$inp
	mov	%r8, $out		# restore parameters
	mov	$chunk,$len
	jmp	.L${mode}_loop
.align	16
.L${mode}_break:
___
				}
$code.=<<___;
	cmp	%rbp,%rsp		# was a bounce buffer allocated?
	je	.L${mode}_done

	pxor	%xmm0,%xmm0		# wipe key material off the stack
	lea	(%rsp),%rax
.L${mode}_bzero:
	movaps	%xmm0,(%rax)
	lea	16(%rax),%rax
	cmp	%rax,%rbp
	ja	.L${mode}_bzero

.L${mode}_done:
	lea	(%rbp),%rsp		# restore stack pointer
	jmp	.L${mode}_exit

.align	16
.L${mode}_aligned:
___
$code.=<<___				if ($mode eq "ctr32");
	mov	-4($ctx),%eax		# pull 32-bit counter
	bswap	%eax
	neg	%eax
	and	\$0xffff,%eax
	mov	\$`16*0x10000`,$chunk
	shl	\$4,%eax
	cmovz	$chunk,%rax
	cmp	%rax,$len
	cmova	%rax,$chunk		# don't let counter cross 2^16
	cmovbe	$len,$chunk
	jbe	.L${mode}_aligned_skip

.L${mode}_aligned_loop:
	mov	$len,%r10		# save parameters
	mov	$chunk,$len
	mov	$chunk,%r11

	lea	-16($ctx),%rax		# ivp
	lea	16($ctx),%rbx		# key
	shr	\$4,$len		# len/=AES_BLOCK_SIZE
	.byte	0xf3,0x0f,0xa7,$opcode	# rep xcrypt*

	mov	-4($ctx),%eax		# pull 32-bit counter
	bswap	%eax
	add	\$0x10000,%eax
	bswap	%eax
	mov	%eax,-4($ctx)

	mov	%r10,$len		# restore parameters
	sub	%r11,$len
	mov	\$`16*0x10000`,$chunk
	jz	.L${mode}_exit
	cmp	$chunk,$len
	jae	.L${mode}_aligned_loop

.L${mode}_aligned_skip:
___
$code.=<<___				if ($PADLOCK_PREFETCH{$mode});
	lea	($inp,$len),%rbp
	neg	%rbp
	and	\$0xfff,%rbp		# distance to page boundary
	xor	%eax,%eax
	cmp	\$$PADLOCK_PREFETCH{$mode},%rbp
	mov	\$$PADLOCK_PREFETCH{$mode}-1,%rbp
	cmovae	%rax,%rbp
	and	$len,%rbp		# remainder
	sub	%rbp,$len
	jz	.L${mode}_aligned_tail
___
$code.=<<___;
	lea	-16($ctx),%rax		# ivp
	lea	16($ctx),%rbx		# key
	shr	\$4,$len		# len/=AES_BLOCK_SIZE
	.byte	0xf3,0x0f,0xa7,$opcode	# rep xcrypt*
___
$code.=<<___				if ($mode !~ /ecb|ctr/);
	movdqa	(%rax),%xmm0
	movdqa	%xmm0,-16($ctx)		# copy [or refresh] iv
___
$code.=<<___				if ($PADLOCK_PREFETCH{$mode});
	test	%rbp,%rbp		# check remainder
	jz	.L${mode}_exit

.L${mode}_aligned_tail:
	mov	$out,%r8
	mov	%rbp,$chunk
	mov	%rbp,$len
	lea	(%rsp),%rbp
	sub	$len,%rsp		# alloca
	shr	\$3,$len
	lea	(%rsp),$out
	.byte	0xf3,0x48,0xa5		# rep movsq
	lea	(%r8),$out
	lea	(%rsp),$inp
	mov	$chunk,$len
	jmp	.L${mode}_loop
___
$code.=<<___;
.L${mode}_exit:
	mov	\$1,%eax		# success
	lea	8(%rsp),%rsp		# drop flags saved by pushf
.L${mode}_abort:
	pop	%rbx
	pop	%rbp
	ret
.size	padlock_${mode}_encrypt,.-padlock_${mode}_encrypt
___
}

generate_mode("ecb",0xc8);
generate_mode("cbc",0xd0);
generate_mode("cfb",0xe0);
generate_mode("ofb",0xe8);
generate_mode("ctr32",0xd8);	# all 64-bit CPUs have working CTR...

$code.=<<___;
.asciz	"VIA Padlock x86_64 module, CRYPTOGAMS by <appro\@openssl.org>"
.align	16
.data
.align	8
.Lpadlock_saved_context:
	.quad	0
___
# Expand `...` constant expressions that survived heredoc interpolation.
$code =~ s/\`([^\`]*)\`/eval($1)/gem;

print $code;

close STDOUT or die "error closing STDOUT: $!";