#! /usr/bin/env perl
# Copyright 2011-2023 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html


# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================

# September 2011
#
# Assembler helpers for the Padlock engine. Compared to the original
# engine version relying on inline assembler and compiled with gcc
# 3.4.6, this module was measured to provide ~100% improvement on
# misaligned data in ECB mode and ~75% in CBC mode. For aligned data
# the improvement can be observed for short inputs only, e.g. 45% for
# 64-byte messages in ECB mode, 20% in CBC. The difference in
# performance for aligned vs. misaligned data depends on the degree of
# misalignment and is either ~1.8x or ~2.9x. These are approximately
# the same factors as for the hardware's own alignment support, so
# there is little reason to rely on the latter. On the contrary, it
# might actually hurt performance on a mixture of aligned and
# misaligned buffers, because a) if you choose to flip the 'align'
# flag in the control word on a per-buffer basis, you have to reload
# the key context, which incurs a penalty; b) if you choose to set the
# 'align' flag permanently, it limits performance even for aligned
# data to ~1/2. All above-mentioned results were collected on a 1.5GHz
# C7. Nano, on the other hand, handles unaligned data more gracefully.
# Depending on the algorithm and the degree of misalignment, hardware
# can be up to 70% more efficient than the software alignment
# procedures below, nor does the 'align' flag have any effect on
# aligned performance [if it has any meaning at all]. The suggestion
# is therefore to unconditionally set the 'align' flag on Nano for
# optimal performance.
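
# The script takes the usual perlasm arguments: the first argument is
# the output flavour handed to asm_init() and the last argument names
# the output file, e.g. something like
#
#	perl e_padlock-x86.pl elf e_padlock-x86.S
#
# [the exact flavour and output path are supplied by the build system].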

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
push(@INC,"${dir}","${dir}../../crypto/perlasm");
require "x86asm.pl";

$output=pop;
open STDOUT,">$output";

&asm_init($ARGV[0]);

%PADLOCK_PREFETCH=(ecb=>128, cbc=>64);	# prefetch errata
$PADLOCK_CHUNK=512;	# Must be a power of 2 larger than 16

$ctx="edx";
$out="edi";
$inp="esi";
$len="ecx";
$chunk="ebx";

&function_begin_B("padlock_capability");
	&push	("ebx");
	&pushf	();
	&pop	("eax");
	&mov	("ecx","eax");
	&xor	("eax",1<<21);
	&push	("eax");
	&popf	();
	&pushf	();
	&pop	("eax");
	&xor	("ecx","eax");
	&xor	("eax","eax");
	&bt	("ecx",21);
	&jnc	(&label("noluck"));
	&cpuid	();
	&xor	("eax","eax");
	&cmp	("ebx","0x".unpack("H*",'tneC'));
	&jne	(&label("zhaoxin"));
	&cmp	("edx","0x".unpack("H*",'Hrua'));
	&jne	(&label("noluck"));
	&cmp	("ecx","0x".unpack("H*",'slua'));
	&jne	(&label("noluck"));
	&jmp	(&label("zhaoxinEnd"));
&set_label("zhaoxin");
	&cmp	("ebx","0x".unpack("H*",'hS  '));
	&jne	(&label("noluck"));
	&cmp	("edx","0x".unpack("H*",'hgna'));
	&jne	(&label("noluck"));
	&cmp	("ecx","0x".unpack("H*",'  ia'));
	&jne	(&label("noluck"));
&set_label("zhaoxinEnd");
	&mov	("eax",0xC0000000);
	&cpuid	();
	&mov	("edx","eax");
	&xor	("eax","eax");
	&cmp	("edx",0xC0000001);
	&jb	(&label("noluck"));
	&mov	("eax",1);
	&cpuid	();
	&or	("eax",0x0f);
	&xor	("ebx","ebx");
	&and	("eax",0x0fff);
	&cmp	("eax",0x06ff);			# check for Nano
	&sete	("bl");
	&mov	("eax",0xC0000001);
	&push	("ebx");
	&cpuid	();
	&pop	("ebx");
	&mov	("eax","edx");
	&shl	("ebx",4);			# bit#4 denotes Nano
	&and	("eax",0xffffffef);
	&or	("eax","ebx");
&set_label("noluck");
	&pop	("ebx");
	&ret	();
&function_end_B("padlock_capability");

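# void padlock_key_bswap(AES_KEY *key);
#
# Byte-swaps the expanded key schedule in place, i.e. (rounds+1)*4
# 32-bit words, with the round count taken from offset 240 of the
# argument. [The C prototype is given only as a hint at how the engine
# code is expected to call this helper.]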
&function_begin_B("padlock_key_bswap");
	&mov	("edx",&wparam(0));
	&mov	("ecx",&DWP(240,"edx"));
	&inc	("ecx");
	&shl	("ecx",2);
&set_label("bswap_loop");
	&mov	("eax",&DWP(0,"edx"));
	&bswap	("eax");
	&mov	(&DWP(0,"edx"),"eax");
	&lea	("edx",&DWP(4,"edx"));
	&sub	("ecx",1);
	&jnz	(&label("bswap_loop"));
	&ret	();
&function_end_B("padlock_key_bswap");

# This is heuristic key context tracing. At first one
# believes that one should use atomic swap instructions,
# but it's not actually necessary. Point is that if
# padlock_saved_context was changed by another thread
# after we've read it and before we compare it with ctx,
# our key *shall* be reloaded upon thread context switch
# and we are therefore set in either case...
&static_label("padlock_saved_context");

&function_begin_B("padlock_verify_context");
	&mov	($ctx,&wparam(0));
	&lea	("eax",($::win32 or $::coff) ? &DWP(&label("padlock_saved_context")) :
			&DWP(&label("padlock_saved_context")."-".&label("verify_pic_point")));
	&pushf	();
	&call	("_padlock_verify_ctx");
&set_label("verify_pic_point");
	&lea	("esp",&DWP(4,"esp"));
	&ret	();
&function_end_B("padlock_verify_context");

&function_begin_B("_padlock_verify_ctx");
	&add	("eax",&DWP(0,"esp")) if (!($::win32 or $::coff));# &padlock_saved_context
	&bt	(&DWP(4,"esp"),30);		# eflags
	&jnc	(&label("verified"));
	&cmp	($ctx,&DWP(0,"eax"));
	&je	(&label("verified"));
	&pushf	();
	&popf	();
&set_label("verified");
	&mov	(&DWP(0,"eax"),$ctx);
	&ret	();
&function_end_B("_padlock_verify_ctx");

&function_begin_B("padlock_reload_key");
	&pushf	();
	&popf	();
	&ret	();
&function_end_B("padlock_reload_key");

&function_begin_B("padlock_aes_block");
	&push	("edi");
	&push	("esi");
	&push	("ebx");
	&mov	($out,&wparam(0));		# must be 16-byte aligned
	&mov	($inp,&wparam(1));		# must be 16-byte aligned
	&mov	($ctx,&wparam(2));
	&mov	($len,1);
	&lea	("ebx",&DWP(32,$ctx));		# key
	&lea	($ctx,&DWP(16,$ctx));		# control word
	&data_byte(0xf3,0x0f,0xa7,0xc8);	# rep xcryptecb
	&pop	("ebx");
	&pop	("esi");
	&pop	("edi");
	&ret	();
&function_end_B("padlock_aes_block");

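# generate_mode() emits the body of one padlock_${mode}_encrypt routine.
# In rough terms the code below works like this: data is processed in
# chunks of at most $PADLOCK_CHUNK bytes; when either buffer is
# misaligned [and the 'align' bit in the control word is clear] a chunk
# is bounced through a 16-byte-aligned scratch area carved out of the
# stack with "rep movsl". For the modes listed in %PADLOCK_PREFETCH the
# last few blocks are likewise split off and processed from the scratch
# area whenever the hardware prefetch (128 bytes for ECB, 64 for CBC)
# would otherwise reach past a page boundary. ctr32 assembles its
# counter blocks on the stack, encrypts them with the ECB opcode and
# xors the result into the data stream.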
sub generate_mode {
my ($mode,$opcode) = @_;
# int padlock_$mode_encrypt(void *out, const void *inp,
#				struct padlock_cipher_data *ctx, size_t len);
&function_begin("padlock_${mode}_encrypt");
	&mov	($out,&wparam(0));
	&mov	($inp,&wparam(1));
	&mov	($ctx,&wparam(2));
	&mov	($len,&wparam(3));
	&test	($ctx,15);
	&jnz	(&label("${mode}_abort"));
	&test	($len,15);
	&jnz	(&label("${mode}_abort"));
	&lea	("eax",($::win32 or $::coff) ? &DWP(&label("padlock_saved_context")) :
			&DWP(&label("padlock_saved_context")."-".&label("${mode}_pic_point")));
	&pushf	();
	&cld	();
	&call	("_padlock_verify_ctx");
&set_label("${mode}_pic_point");
	&lea	($ctx,&DWP(16,$ctx));		# control word
	&xor	("eax","eax");
	if ($mode eq "ctr32") {
	&movq	("mm0",&QWP(-16,$ctx));		# load [upper part of] counter
	} else {
	&xor	("ebx","ebx");
	&test	(&DWP(0,$ctx),1<<5);		# align bit in control word
	&jnz	(&label("${mode}_aligned"));
	&test	($out,0x0f);
	&setz	("al");				# !out_misaligned
	&test	($inp,0x0f);
	&setz	("bl");				# !inp_misaligned
	&test	("eax","ebx");
	&jnz	(&label("${mode}_aligned"));
	&neg	("eax");
	}
	&mov	($chunk,$PADLOCK_CHUNK);
	&not	("eax");			# out_misaligned?-1:0
	&lea	("ebp",&DWP(-24,"esp"));
	&cmp	($len,$chunk);
	&cmovc	($chunk,$len);			# chunk=len>PADLOCK_CHUNK?PADLOCK_CHUNK:len
	&and	("eax",$chunk);			# out_misaligned?chunk:0
	&mov	($chunk,$len);
	&neg	("eax");
	&and	($chunk,$PADLOCK_CHUNK-1);	# chunk=len%PADLOCK_CHUNK
	&lea	("esp",&DWP(0,"eax","ebp"));	# alloca
	&mov	("eax",$PADLOCK_CHUNK);
	&cmovz	($chunk,"eax");			# chunk=chunk?:PADLOCK_CHUNK
	&mov	("eax","ebp");
	&and	("ebp",-16);
	&and	("esp",-16);
	&mov	(&DWP(16,"ebp"),"eax");
	if ($PADLOCK_PREFETCH{$mode}) {
	&cmp	($len,$chunk);
	&ja	(&label("${mode}_loop"));
	&mov	("eax",$inp);			# check if prefetch crosses page
	&cmp	("ebp","esp");
	&cmove	("eax",$out);
	&add	("eax",$len);
	&neg	("eax");
	&and	("eax",0xfff);			# distance to page boundary
	&cmp	("eax",$PADLOCK_PREFETCH{$mode});
	&mov	("eax",-$PADLOCK_PREFETCH{$mode});
	&cmovae	("eax",$chunk);			# mask=distance<prefetch?-prefetch:-1
	&and	($chunk,"eax");
	&jz	(&label("${mode}_unaligned_tail"));
	}
	&jmp	(&label("${mode}_loop"));

&set_label("${mode}_loop",16);
	&mov	(&DWP(0,"ebp"),$out);		# save parameters
	&mov	(&DWP(4,"ebp"),$inp);
	&mov	(&DWP(8,"ebp"),$len);
	&mov	($len,$chunk);
	&mov	(&DWP(12,"ebp"),$chunk);	# chunk
	if ($mode eq "ctr32") {
	&mov	("ecx",&DWP(-4,$ctx));
	&xor	($out,$out);
	&mov	("eax",&DWP(-8,$ctx));		# borrow $len
&set_label("${mode}_prepare");
	&mov	(&DWP(12,"esp",$out),"ecx");
	&bswap	("ecx");
	&movq	(&QWP(0,"esp",$out),"mm0");
	&inc	("ecx");
	&mov	(&DWP(8,"esp",$out),"eax");
	&bswap	("ecx");
	&lea	($out,&DWP(16,$out));
	&cmp	($out,$chunk);
	&jb	(&label("${mode}_prepare"));

	&mov	(&DWP(-4,$ctx),"ecx");
	&lea	($inp,&DWP(0,"esp"));
	&lea	($out,&DWP(0,"esp"));
	&mov	($len,$chunk);
	} else {
	&test	($out,0x0f);			# out_misaligned
	&cmovnz	($out,"esp");
	&test	($inp,0x0f);			# inp_misaligned
	&jz	(&label("${mode}_inp_aligned"));
	&shr	($len,2);
	&data_byte(0xf3,0xa5);			# rep movsl
	&sub	($out,$chunk);
	&mov	($len,$chunk);
	&mov	($inp,$out);
&set_label("${mode}_inp_aligned");
	}
	&lea	("eax",&DWP(-16,$ctx));		# ivp
	&lea	("ebx",&DWP(16,$ctx));		# key
	&shr	($len,4);			# len/=AES_BLOCK_SIZE
	&data_byte(0xf3,0x0f,0xa7,$opcode);	# rep xcrypt*
	if ($mode !~ /ecb|ctr/) {
	&movaps	("xmm0",&QWP(0,"eax"));
	&movaps	(&QWP(-16,$ctx),"xmm0");	# copy [or refresh] iv
	}
	&mov	($out,&DWP(0,"ebp"));		# restore parameters
	&mov	($chunk,&DWP(12,"ebp"));
	if ($mode eq "ctr32") {
	&mov	($inp,&DWP(4,"ebp"));
	&xor	($len,$len);
&set_label("${mode}_xor");
	&movups	("xmm1",&QWP(0,$inp,$len));
	&lea	($len,&DWP(16,$len));
	&pxor	("xmm1",&QWP(-16,"esp",$len));
	&movups	(&QWP(-16,$out,$len),"xmm1");
	&cmp	($len,$chunk);
	&jb	(&label("${mode}_xor"));
	} else {
	&test	($out,0x0f);
	&jz	(&label("${mode}_out_aligned"));
	&mov	($len,$chunk);
	&lea	($inp,&DWP(0,"esp"));
	&shr	($len,2);
	&data_byte(0xf3,0xa5);			# rep movsl
	&sub	($out,$chunk);
&set_label("${mode}_out_aligned");
	&mov	($inp,&DWP(4,"ebp"));
	}
	&mov	($len,&DWP(8,"ebp"));
	&add	($out,$chunk);
	&add	($inp,$chunk);
	&sub	($len,$chunk);
	&mov	($chunk,$PADLOCK_CHUNK);
	if (!$PADLOCK_PREFETCH{$mode}) {
	&jnz	(&label("${mode}_loop"));
	} else {
	&jz	(&label("${mode}_break"));
	&cmp	($len,$chunk);
	&jae	(&label("${mode}_loop"));

&set_label("${mode}_unaligned_tail");
	&xor	("eax","eax");
	&cmp	("esp","ebp");
	&cmove	("eax",$len);
	&sub	("esp","eax");			# alloca
	&mov	("eax",$out);			# save parameters
	&mov	($chunk,$len);
	&shr	($len,2);
	&lea	($out,&DWP(0,"esp"));
	&data_byte(0xf3,0xa5);			# rep movsl
	&mov	($inp,"esp");
	&mov	($out,"eax");			# restore parameters
	&mov	($len,$chunk);
	&jmp	(&label("${mode}_loop"));

&set_label("${mode}_break",16);
	}
	if ($mode ne "ctr32") {
	&cmp	("esp","ebp");
	&je	(&label("${mode}_done"));
	}
	&pxor	("xmm0","xmm0");
	&lea	("eax",&DWP(0,"esp"));
&set_label("${mode}_bzero");
	&movaps	(&QWP(0,"eax"),"xmm0");
	&lea	("eax",&DWP(16,"eax"));
	&cmp	("ebp","eax");
	&ja	(&label("${mode}_bzero"));

&set_label("${mode}_done");
	&mov	("ebp",&DWP(16,"ebp"));
	&lea	("esp",&DWP(24,"ebp"));
	if ($mode ne "ctr32") {
	&jmp	(&label("${mode}_exit"));

&set_label("${mode}_aligned",16);
	if ($PADLOCK_PREFETCH{$mode}) {
	&lea	("ebp",&DWP(0,$inp,$len));
	&neg	("ebp");
	&and	("ebp",0xfff);			# distance to page boundary
	&xor	("eax","eax");
	&cmp	("ebp",$PADLOCK_PREFETCH{$mode});
	&mov	("ebp",$PADLOCK_PREFETCH{$mode}-1);
	&cmovae	("ebp","eax");
	&and	("ebp",$len);			# remainder
	&sub	($len,"ebp");
	&jz	(&label("${mode}_aligned_tail"));
	}
	&lea	("eax",&DWP(-16,$ctx));		# ivp
	&lea	("ebx",&DWP(16,$ctx));		# key
	&shr	($len,4);			# len/=AES_BLOCK_SIZE
	&data_byte(0xf3,0x0f,0xa7,$opcode);	# rep xcrypt*
	if ($mode ne "ecb") {
	&movaps	("xmm0",&QWP(0,"eax"));
	&movaps	(&QWP(-16,$ctx),"xmm0");	# copy [or refresh] iv
	}
	if ($PADLOCK_PREFETCH{$mode}) {
	&test	("ebp","ebp");
	&jz	(&label("${mode}_exit"));

&set_label("${mode}_aligned_tail");
	&mov	($len,"ebp");
	&lea	("ebp",&DWP(-24,"esp"));
	&mov	("esp","ebp");
	&mov	("eax","ebp");
	&sub	("esp",$len);
	&and	("ebp",-16);
	&and	("esp",-16);
	&mov	(&DWP(16,"ebp"),"eax");
	&mov	("eax",$out);			# save parameters
	&mov	($chunk,$len);
	&shr	($len,2);
	&lea	($out,&DWP(0,"esp"));
	&data_byte(0xf3,0xa5);			# rep movsl
	&mov	($inp,"esp");
	&mov	($out,"eax");			# restore parameters
	&mov	($len,$chunk);
	&jmp	(&label("${mode}_loop"));
	}
&set_label("${mode}_exit");		}
	&mov	("eax",1);
	&lea	("esp",&DWP(4,"esp"));		# popf
	&emms	() if ($mode eq "ctr32");
&set_label("${mode}_abort");
&function_end("padlock_${mode}_encrypt");
}

&generate_mode("ecb",0xc8);
&generate_mode("cbc",0xd0);
&generate_mode("cfb",0xe0);
&generate_mode("ofb",0xe8);
&generate_mode("ctr32",0xc8);	# yes, it implements its own CTR with the ECB
				# opcode, because hardware CTR was introduced
				# later and even has errata on certain C7
				# steppings. Own implementation *always* works,
				# though ~15% slower than dedicated hardware...

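# unsigned int padlock_xstore(void *out, unsigned int edx);
#
# Executes a single (non-rep) xstore with %edi pointing at the output
# buffer and %edx holding the caller-supplied control word; whatever
# the instruction leaves in %eax is returned to the caller. [The
# prototype is only a sketch of how the engine's RNG path is assumed
# to call this helper.]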
&function_begin_B("padlock_xstore");
	&push	("edi");
	&mov	("edi",&wparam(0));
	&mov	("edx",&wparam(1));
	&data_byte(0x0f,0xa7,0xc0);		# xstore
	&pop	("edi");
	&ret	();
&function_end_B("padlock_xstore");

&function_begin_B("_win32_segv_handler");
	&mov	("eax",1);			# ExceptionContinueSearch
	&mov	("edx",&wparam(0));		# *ExceptionRecord
	&mov	("ecx",&wparam(2));		# *ContextRecord
	&cmp	(&DWP(0,"edx"),0xC0000005);	# ExceptionRecord->ExceptionCode == STATUS_ACCESS_VIOLATION
	&jne	(&label("ret"));
	&add	(&DWP(184,"ecx"),4);		# skip over rep sha*
	&mov	("eax",0);			# ExceptionContinueExecution
&set_label("ret");
	&ret	();
&function_end_B("_win32_segv_handler");
&safeseh("_win32_segv_handler")		if ($::win32);

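# The one-shot SHA entry points below run "rep xsha*" on caller-provided
# input. On Win32/COFF they first push _win32_segv_handler (defined
# above) onto the %fs:0 exception-handler chain, so that an access
# violation raised while the instruction executes is answered by bumping
# the saved %eip past the 4-byte instruction and resuming
# (ExceptionContinueExecution) rather than by terminating the process;
# the handler is popped again once the hash has been computed.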
&function_begin_B("padlock_sha1_oneshot");
	&push	("edi");
	&push	("esi");
	&xor	("eax","eax");
	&mov	("edi",&wparam(0));
	&mov	("esi",&wparam(1));
	&mov	("ecx",&wparam(2));
	if ($::win32 or $::coff) {
	&push	(&::islabel("_win32_segv_handler"));
	&data_byte(0x64,0xff,0x30);		# push	%fs:(%eax)
	&data_byte(0x64,0x89,0x20);		# mov	%esp,%fs:(%eax)
	}
	&mov	("edx","esp");			# put aside %esp
	&add	("esp",-128);			# 32 is enough but spec says 128
	&movups	("xmm0",&QWP(0,"edi"));		# copy-in context
	&and	("esp",-16);
	&mov	("eax",&DWP(16,"edi"));
	&movaps	(&QWP(0,"esp"),"xmm0");
	&mov	("edi","esp");
	&mov	(&DWP(16,"esp"),"eax");
	&xor	("eax","eax");
	&data_byte(0xf3,0x0f,0xa6,0xc8);	# rep xsha1
	&movaps	("xmm0",&QWP(0,"esp"));
	&mov	("eax",&DWP(16,"esp"));
	&mov	("esp","edx");			# restore %esp
	if ($::win32 or $::coff) {
	&data_byte(0x64,0x8f,0x05,0,0,0,0);	# pop	%fs:0
	&lea	("esp",&DWP(4,"esp"));
	}
	&mov	("edi",&wparam(0));
	&movups	(&QWP(0,"edi"),"xmm0");		# copy-out context
	&mov	(&DWP(16,"edi"),"eax");
	&pop	("esi");
	&pop	("edi");
	&ret	();
&function_end_B("padlock_sha1_oneshot");

&function_begin_B("padlock_sha1_blocks");
	&push	("edi");
	&push	("esi");
	&mov	("edi",&wparam(0));
	&mov	("esi",&wparam(1));
	&mov	("edx","esp");			# put aside %esp
	&mov	("ecx",&wparam(2));
	&add	("esp",-128);
	&movups	("xmm0",&QWP(0,"edi"));		# copy-in context
	&and	("esp",-16);
	&mov	("eax",&DWP(16,"edi"));
	&movaps	(&QWP(0,"esp"),"xmm0");
	&mov	("edi","esp");
	&mov	(&DWP(16,"esp"),"eax");
	&mov	("eax",-1);
	&data_byte(0xf3,0x0f,0xa6,0xc8);	# rep xsha1
	&movaps	("xmm0",&QWP(0,"esp"));
	&mov	("eax",&DWP(16,"esp"));
	&mov	("esp","edx");			# restore %esp
	&mov	("edi",&wparam(0));
	&movups	(&QWP(0,"edi"),"xmm0");		# copy-out context
	&mov	(&DWP(16,"edi"),"eax");
	&pop	("esi");
	&pop	("edi");
	&ret	();
&function_end_B("padlock_sha1_blocks");

&function_begin_B("padlock_sha256_oneshot");
	&push	("edi");
	&push	("esi");
	&xor	("eax","eax");
	&mov	("edi",&wparam(0));
	&mov	("esi",&wparam(1));
	&mov	("ecx",&wparam(2));
	if ($::win32 or $::coff) {
	&push	(&::islabel("_win32_segv_handler"));
	&data_byte(0x64,0xff,0x30);		# push	%fs:(%eax)
	&data_byte(0x64,0x89,0x20);		# mov	%esp,%fs:(%eax)
	}
	&mov	("edx","esp");			# put aside %esp
	&add	("esp",-128);
	&movups	("xmm0",&QWP(0,"edi"));		# copy-in context
	&and	("esp",-16);
	&movups	("xmm1",&QWP(16,"edi"));
	&movaps	(&QWP(0,"esp"),"xmm0");
	&mov	("edi","esp");
	&movaps	(&QWP(16,"esp"),"xmm1");
	&xor	("eax","eax");
	&data_byte(0xf3,0x0f,0xa6,0xd0);	# rep xsha256
	&movaps	("xmm0",&QWP(0,"esp"));
	&movaps	("xmm1",&QWP(16,"esp"));
	&mov	("esp","edx");			# restore %esp
	if ($::win32 or $::coff) {
	&data_byte(0x64,0x8f,0x05,0,0,0,0);	# pop	%fs:0
	&lea	("esp",&DWP(4,"esp"));
	}
	&mov	("edi",&wparam(0));
	&movups	(&QWP(0,"edi"),"xmm0");		# copy-out context
	&movups	(&QWP(16,"edi"),"xmm1");
	&pop	("esi");
	&pop	("edi");
	&ret	();
&function_end_B("padlock_sha256_oneshot");

&function_begin_B("padlock_sha256_blocks");
	&push	("edi");
	&push	("esi");
	&mov	("edi",&wparam(0));
	&mov	("esi",&wparam(1));
	&mov	("ecx",&wparam(2));
	&mov	("edx","esp");			# put aside %esp
	&add	("esp",-128);
	&movups	("xmm0",&QWP(0,"edi"));		# copy-in context
	&and	("esp",-16);
	&movups	("xmm1",&QWP(16,"edi"));
	&movaps	(&QWP(0,"esp"),"xmm0");
	&mov	("edi","esp");
	&movaps	(&QWP(16,"esp"),"xmm1");
	&mov	("eax",-1);
	&data_byte(0xf3,0x0f,0xa6,0xd0);	# rep xsha256
	&movaps	("xmm0",&QWP(0,"esp"));
	&movaps	("xmm1",&QWP(16,"esp"));
	&mov	("esp","edx");			# restore %esp
	&mov	("edi",&wparam(0));
	&movups	(&QWP(0,"edi"),"xmm0");		# copy-out context
	&movups	(&QWP(16,"edi"),"xmm1");
	&pop	("esi");
	&pop	("edi");
	&ret	();
&function_end_B("padlock_sha256_blocks");

&function_begin_B("padlock_sha512_blocks");
	&push	("edi");
	&push	("esi");
	&mov	("edi",&wparam(0));
	&mov	("esi",&wparam(1));
	&mov	("ecx",&wparam(2));
	&mov	("edx","esp");			# put aside %esp
	&add	("esp",-128);
	&movups	("xmm0",&QWP(0,"edi"));		# copy-in context
	&and	("esp",-16);
	&movups	("xmm1",&QWP(16,"edi"));
	&movups	("xmm2",&QWP(32,"edi"));
	&movups	("xmm3",&QWP(48,"edi"));
	&movaps	(&QWP(0,"esp"),"xmm0");
	&mov	("edi","esp");
	&movaps	(&QWP(16,"esp"),"xmm1");
	&movaps	(&QWP(32,"esp"),"xmm2");
	&movaps	(&QWP(48,"esp"),"xmm3");
	&data_byte(0xf3,0x0f,0xa6,0xe0);	# rep xsha512
	&movaps	("xmm0",&QWP(0,"esp"));
	&movaps	("xmm1",&QWP(16,"esp"));
	&movaps	("xmm2",&QWP(32,"esp"));
	&movaps	("xmm3",&QWP(48,"esp"));
	&mov	("esp","edx");			# restore %esp
	&mov	("edi",&wparam(0));
	&movups	(&QWP(0,"edi"),"xmm0");		# copy-out context
	&movups	(&QWP(16,"edi"),"xmm1");
	&movups	(&QWP(32,"edi"),"xmm2");
	&movups	(&QWP(48,"edi"),"xmm3");
	&pop	("esi");
	&pop	("edi");
	&ret	();
&function_end_B("padlock_sha512_blocks");

&asciz	("VIA Padlock x86 module, CRYPTOGAMS by <appro\@openssl.org>");
&align	(16);

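# In C terms the word reserved below amounts to something like
#	static void *padlock_saved_context;
# i.e. a single pointer-sized slot remembering which cipher context was
# last presented to the xcrypt unit [see the key context tracing note
# above padlock_verify_context].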
&dataseg();
# Essentially this variable belongs in thread local storage.
# Having this variable global on the other hand can only cause
# a few bogus key reloads [if any at all on a single-CPU system],
# so we accept the penalty...
&set_label("padlock_saved_context",4);
&data_word(0);

&asm_finish();

close STDOUT;