#! /usr/bin/env perl
# Copyright 2011-2018 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License").  You may not use
# this file except in compliance with the License.  You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html


# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================

# September 2011
#
# Assembler helpers for Padlock engine. Compared to original engine
# version relying on inline assembler and compiled with gcc 3.4.6 it
# was measured to provide ~100% improvement on misaligned data in ECB
# mode and ~75% in CBC mode. For aligned data improvement can be
# observed for short inputs only, e.g. 45% for 64-byte messages in
# ECB mode, 20% in CBC. Difference in performance for aligned vs.
# misaligned data depends on misalignment and is either ~1.8x or 2.9x.
# These are approximately same factors as for hardware support, so
# there is little reason to rely on the latter. On the contrary, it
# might actually hurt performance in mixture of aligned and misaligned
# buffers, because a) if you choose to flip 'align' flag in control
# word on per-buffer basis, then you'd have to reload key context,
# which incurs penalty; b) if you choose to set 'align' flag
# permanently, it limits performance even for aligned data to ~1/2.
# All above mentioned results were collected on 1.5GHz C7. Nano on the
# other hand handles unaligned data more gracefully. Depending on
# algorithm and how unaligned data is, hardware can be up to 70% more
# efficient than below software alignment procedures, nor does 'align'
# flag have effect on aligned performance [if it has any meaning at
# all]. Therefore suggestion is to unconditionally set 'align' flag on
# Nano for optimal performance.

# Usage: the LAST command-line argument is the output file name; the
# first remaining argument is handed to asm_init().
#
# Every emitter below is called with the explicit '&' sigil.  This is
# required, not a style choice: many mnemonics (and, or, not, xor, push,
# pop, sub, cmp, ...) are also Perl built-ins or operators.  It also
# means every statement MUST end with ';', otherwise the '&' that starts
# the next line is parsed as the binary bitwise-AND operator.

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
push(@INC,"${dir}","${dir}../../crypto/perlasm");
require "x86asm.pl";

$output=pop;
# Three-argument open, checked: a failed open would otherwise leave
# STDOUT closed and the generated assembler would silently vanish.
open STDOUT,'>',$output or die "can't open $output: $!";

&asm_init($ARGV[0]);

%PADLOCK_PREFETCH=(ecb=>128, cbc=>64);	# prefetch errata
$PADLOCK_CHUNK=512;	# Must be a power of 2 larger than 16

# Register assignment shared by the routines below.
$ctx="edx";
$out="edi";
$inp="esi";
$len="ecx";
$chunk="ebx";

# padlock_capability
#
# Returns 0 in eax when no usable unit is detected.  Otherwise returns
# edx of CPUID leaf 0xC0000001 with bit#4 replaced by the "Nano" flag
# computed below.  Detection steps, in order:
#   1. EFLAGS bit 21 must be toggleable (otherwise cpuid is skipped);
#   2. the cpuid vendor registers must spell "CentaurHauls" or
#      "  Shanghai  " (Zhaoxin);
#   3. cpuid(0xC0000000) must report at least leaf 0xC0000001.
&function_begin_B("padlock_capability");
	&push	("ebx");
	&pushf	();
	&pop	("eax");
	&mov	("ecx","eax");
	&xor	("eax",1<<21);
	&push	("eax");
	&popf	();
	&pushf	();
	&pop	("eax");
	&xor	("ecx","eax");
	&xor	("eax","eax");
	&bt	("ecx",21);
	&jnc	(&label("noluck"));
	&cpuid	();
	&xor	("eax","eax");
	# Each literal is the register's 4 vendor bytes in reverse order, so
	# that unpack("H*") yields the little-endian 32-bit value.
	&cmp	("ebx","0x".unpack("H*",'tneC'));
	&jne	(&label("zhaoxin"));
	&cmp	("edx","0x".unpack("H*",'Hrua'));
	&jne	(&label("noluck"));
	&cmp	("ecx","0x".unpack("H*",'slua'));
	&jne	(&label("noluck"));
	&jmp	(&label("zhaoxinEnd"));
&set_label("zhaoxin");
	# "  Shanghai  ": the literals must be exactly 4 characters long,
	# i.e. 'hS' is followed by TWO spaces and 'ia' is preceded by TWO.
	&cmp	("ebx","0x".unpack("H*",'hS  '));
	&jne	(&label("noluck"));
	&cmp	("edx","0x".unpack("H*",'hgna'));
	&jne	(&label("noluck"));
	&cmp	("ecx","0x".unpack("H*",'  ia'));
	&jne	(&label("noluck"));
&set_label("zhaoxinEnd");
	&mov	("eax",0xC0000000);
	&cpuid	();
	&mov	("edx","eax");
	&xor	("eax","eax");
	&cmp	("edx",0xC0000001);
	&jb	(&label("noluck"));
	&mov	("eax",1);
	&cpuid	();
	&or	("eax",0x0f);
	&xor	("ebx","ebx");
	&and	("eax",0x0fff);
	&cmp	("eax",0x06ff);		# check for Nano
	&sete	("bl");
	&mov	("eax",0xC0000001);
	&push	("ebx");
	&cpuid	();
	&pop	("ebx");
	&mov	("eax","edx");
	&shl	("ebx",4);		# bit#4 denotes Nano
	&and	("eax",0xffffffef);
	&or	("eax","ebx");
&set_label("noluck");
	&pop	("ebx");
	&ret	();
&function_end_B("padlock_capability");

# padlock_key_bswap
#
# Byte-swaps consecutive 32-bit words in place, starting at the pointer
# passed as the first argument.  The iteration count is the dword
# stored at offset 240 from that pointer.
# NOTE(review): exactly one dword is swapped per iteration and the
# count is used as loaded - confirm against the caller that this covers
# the whole key schedule and not just the first 'count' words.
&function_begin_B("padlock_key_bswap");
	&mov	("edx",&wparam(0));
	&mov	("ecx",&DWP(240,"edx"));
&set_label("bswap_loop");
	&mov	("eax",&DWP(0,"edx"));
	&bswap	("eax");
	&mov	(&DWP(0,"edx"),"eax");
	&lea	("edx",&DWP(4,"edx"));
	&sub	("ecx",1);
	&jnz	(&label("bswap_loop"));
	&ret	();
&function_end_B("padlock_key_bswap");

# This is heuristic key context tracing. At first one
# believes that one should use atomic swap instructions,
# but it's not actually necessary. Point is that if
# padlock_saved_context was changed by another thread
# after we've read it and before we compare it with ctx,
# our key *shall* be reloaded upon thread context switch
# and we are therefore set in either case...
&static_label("padlock_saved_context");

# padlock_verify_context(ctx)
#
# Public wrapper around _padlock_verify_ctx: loads ctx, puts the
# [position-independent, unless win32/coff] address of
# padlock_saved_context in eax, pushes EFLAGS and calls the helper.
&function_begin_B("padlock_verify_context");
	&mov	($ctx,&wparam(0));
	&lea	("eax",($::win32 or $::coff) ? &DWP(&label("padlock_saved_context")) :
		&DWP(&label("padlock_saved_context")."-".&label("verify_pic_point")));
	&pushf	();
	&call	("_padlock_verify_ctx");
&set_label("verify_pic_point");
	&lea	("esp",&DWP(4,"esp"));	# drop the EFLAGS image pushed above
	&ret	();
&function_end_B("padlock_verify_context");

# _padlock_verify_ctx
#
# Internal helper with a private calling convention:
#   $ctx    context pointer to verify;
#   eax     &padlock_saved_context, or on non-win32/coff targets its
#           offset from the caller's *_pic_point label, which is made
#           absolute by adding the return address found at 0(%esp);
#   4(%esp) EFLAGS image pushed by the caller.
# If bit 30 of that image is set and $ctx differs from the saved
# context, a pushf/popf pair is issued (the same sequence as
# padlock_reload_key).  $ctx is then recorded as the saved context.
&function_begin_B("_padlock_verify_ctx");
	&add	("eax",&DWP(0,"esp")) if(!($::win32 or $::coff));# &padlock_saved_context
	&bt	(&DWP(4,"esp"),30);		# eflags
	&jnc	(&label("verified"));
	&cmp	($ctx,&DWP(0,"eax"));
	&je	(&label("verified"));
	&pushf	();
	&popf	();
&set_label("verified");
	&mov	(&DWP(0,"eax"),$ctx);
	&ret	();
&function_end_B("_padlock_verify_ctx");

# padlock_reload_key
#
# Emits nothing but a pushf/popf pair followed by ret.
&function_begin_B("padlock_reload_key");
	&pushf	();
	&popf	();
	&ret	();
&function_end_B("padlock_reload_key");

# padlock_aes_block(out, inp, ctx)
#
# Processes a single block (count register set to 1) with
# "rep xcryptecb".  The control word is taken from ctx+16 and the key
# from ctx+32.
&function_begin_B("padlock_aes_block");
	&push	("edi");
	&push	("esi");
	&push	("ebx");
	&mov	($out,&wparam(0));		# must be 16-byte aligned
	&mov	($inp,&wparam(1));		# must be 16-byte aligned
	&mov	($ctx,&wparam(2));
	&mov	($len,1);
	&lea	("ebx",&DWP(32,$ctx));		# key
	&lea	($ctx,&DWP(16,$ctx));		# control word
	&data_byte(0xf3,0x0f,0xa7,0xc8);	# rep xcryptecb
	&pop	("ebx");
	&pop	("esi");
	&pop	("edi");
	&ret	();
&function_end_B("padlock_aes_block");

# generate_mode($mode, $opcode)
#
# Emits padlock_${mode}_encrypt.  $opcode is the last byte of the
# "rep xcrypt*" encoding used for the mode.  Modes listed in
# %PADLOCK_PREFETCH additionally get page-boundary handling for the
# prefetch errata.  "ctr32" builds counter blocks on the stack, runs
# them through the given opcode and xors the result with the input.
#
# The generated function returns 1 in eax on success and 0 if ctx or
# len is not a multiple of 16.
sub generate_mode {
my ($mode,$opcode) = @_;
# int padlock_$mode_encrypt(void *out, const void *inp,
#		struct padlock_cipher_data *ctx, size_t len);
&function_begin("padlock_${mode}_encrypt");
	&mov	($out,&wparam(0));
	&mov	($inp,&wparam(1));
	&mov	($ctx,&wparam(2));
	&mov	($len,&wparam(3));
	# eax is otherwise not written before the abort branches, which
	# would leave the return value undefined on that path.
	&xor	("eax","eax");
	&test	($ctx,15);
	&jnz	(&label("${mode}_abort"));
	&test	($len,15);
	&jnz	(&label("${mode}_abort"));
	&lea	("eax",($::win32 or $::coff) ? &DWP(&label("padlock_saved_context")) :
		&DWP(&label("padlock_saved_context")."-".&label("${mode}_pic_point")));
	&pushf	();
	&cld	();
	&call	("_padlock_verify_ctx");
&set_label("${mode}_pic_point");
	&lea	($ctx,&DWP(16,$ctx));	# control word
	&xor	("eax","eax");
    if ($mode eq "ctr32") {
	&movq	("mm0",&QWP(-16,$ctx));	# load [upper part of] counter
    } else {
	&xor	("ebx","ebx");
	&test	(&DWP(0,$ctx),1<<5);	# align bit in control word
	&jnz	(&label("${mode}_aligned"));
	&test	($out,0x0f);
	&setz	("al");			# !out_misaligned
	&test	($inp,0x0f);
	&setz	("bl");			# !inp_misaligned
	&test	("eax","ebx");
	&jnz	(&label("${mode}_aligned"));
	&neg	("eax");
    }
	&mov	($chunk,$PADLOCK_CHUNK);
	&not	("eax");		# out_misaligned?-1:0
	&lea	("ebp",&DWP(-24,"esp"));
	&cmp	($len,$chunk);
	&cmovc	($chunk,$len);		# chunk=len>PADLOCK_CHUNK?PADLOCK_CHUNK:len
	&and	("eax",$chunk);		# out_misaligned?chunk:0
	&mov	($chunk,$len);
	&neg	("eax");
	&and	($chunk,$PADLOCK_CHUNK-1);	# chunk=len%PADLOCK_CHUNK
	&lea	("esp",&DWP(0,"eax","ebp"));	# alloca
	&mov	("eax",$PADLOCK_CHUNK);
	&cmovz	($chunk,"eax");		# chunk=chunk?:PADLOCK_CHUNK
	&mov	("eax","ebp");
	&and	("ebp",-16);
	&and	("esp",-16);
	&mov	(&DWP(16,"ebp"),"eax");	# original frame value, reloaded at *_done
    if ($PADLOCK_PREFETCH{$mode}) {
	&cmp	($len,$chunk);
	&ja	(&label("${mode}_loop"));
	&mov	("eax",$inp);		# check if prefetch crosses page
	&cmp	("ebp","esp");
	&cmove	("eax",$out);
	&add	("eax",$len);
	&neg	("eax");
	&and	("eax",0xfff);		# distance to page boundary
	&cmp	("eax",$PADLOCK_PREFETCH{$mode});
	&mov	("eax",-$PADLOCK_PREFETCH{$mode});
	&cmovae	("eax",$chunk);		# mask=distance<prefetch?-prefetch:-1
	&and	($chunk,"eax");
	&jz	(&label("${mode}_unaligned_tail"));
    }
	&jmp	(&label("${mode}_loop"));

&set_label("${mode}_loop",16);
	&mov	(&DWP(0,"ebp"),$out);	# save parameters
	&mov	(&DWP(4,"ebp"),$inp);
	&mov	(&DWP(8,"ebp"),$len);
	&mov	($len,$chunk);
	&mov	(&DWP(12,"ebp"),$chunk);	# chunk
    if ($mode eq "ctr32") {
	&mov	("ecx",&DWP(-4,$ctx));
	&xor	($out,$out);
	&mov	("eax",&DWP(-8,$ctx));	# borrow $len
&set_label("${mode}_prepare");
	&mov	(&DWP(12,"esp",$out),"ecx");
	&bswap	("ecx");
	&movq	(&QWP(0,"esp",$out),"mm0");
	&inc	("ecx");
	&mov	(&DWP(8,"esp",$out),"eax");
	&bswap	("ecx");
	&lea	($out,&DWP(16,$out));
	&cmp	($out,$chunk);
	&jb	(&label("${mode}_prepare"));

	&mov	(&DWP(-4,$ctx),"ecx");
	&lea	($inp,&DWP(0,"esp"));
	&lea	($out,&DWP(0,"esp"));
	&mov	($len,$chunk);
    } else {
	&test	($out,0x0f);		# out_misaligned
	&cmovnz	($out,"esp");
	&test	($inp,0x0f);		# inp_misaligned
	&jz	(&label("${mode}_inp_aligned"));
	&shr	($len,2);
	&data_byte(0xf3,0xa5);		# rep movsl
	&sub	($out,$chunk);
	&mov	($len,$chunk);
	&mov	($inp,$out);
&set_label("${mode}_inp_aligned");
    }
	&lea	("eax",&DWP(-16,$ctx));	# ivp
	&lea	("ebx",&DWP(16,$ctx));	# key
	&shr	($len,4);		# len/=AES_BLOCK_SIZE
	&data_byte(0xf3,0x0f,0xa7,$opcode);	# rep xcrypt*
    if ($mode !~ /ecb|ctr/) {
	&movaps	("xmm0",&QWP(0,"eax"));
	&movaps	(&QWP(-16,$ctx),"xmm0");	# copy [or refresh] iv
    }
	&mov	($out,&DWP(0,"ebp"));	# restore parameters
	&mov	($chunk,&DWP(12,"ebp"));
    if ($mode eq "ctr32") {
	&mov	($inp,&DWP(4,"ebp"));
	&xor	($len,$len);
&set_label("${mode}_xor");
	&movups	("xmm1",&QWP(0,$inp,$len));
	&lea	($len,&DWP(16,$len));
	&pxor	("xmm1",&QWP(-16,"esp",$len));
	&movups	(&QWP(-16,$out,$len),"xmm1");
	&cmp	($len,$chunk);
	&jb	(&label("${mode}_xor"));
    } else {
	&test	($out,0x0f);
	&jz	(&label("${mode}_out_aligned"));
	&mov	($len,$chunk);
	&lea	($inp,&DWP(0,"esp"));
	&shr	($len,2);
	&data_byte(0xf3,0xa5);		# rep movsl
	&sub	($out,$chunk);
&set_label("${mode}_out_aligned");
	&mov	($inp,&DWP(4,"ebp"));
    }
	&mov	($len,&DWP(8,"ebp"));
	&add	($out,$chunk);
	&add	($inp,$chunk);
	&sub	($len,$chunk);
	&mov	($chunk,$PADLOCK_CHUNK);
    if (!$PADLOCK_PREFETCH{$mode}) {
	&jnz	(&label("${mode}_loop"));
    } else {
	&jz	(&label("${mode}_break"));
	&cmp	($len,$chunk);
	&jae	(&label("${mode}_loop"));

&set_label("${mode}_unaligned_tail");
	&xor	("eax","eax");
	&cmp	("esp","ebp");
	&cmove	("eax",$len);
	&sub	("esp","eax");		# alloca
	&mov	("eax", $out);		# save parameters
	&mov	($chunk,$len);
	&shr	($len,2);
	&lea	($out,&DWP(0,"esp"));
	&data_byte(0xf3,0xa5);		# rep movsl
	&mov	($inp,"esp");
	&mov	($out,"eax");		# restore parameters
	&mov	($len,$chunk);
	&jmp	(&label("${mode}_loop"));

&set_label("${mode}_break",16);
    }
    if ($mode ne "ctr32") {
	&cmp	("esp","ebp");
	&je	(&label("${mode}_done"));
    }
	# Wipe the stack scratch area between esp and ebp.
	&pxor	("xmm0","xmm0");
	&lea	("eax",&DWP(0,"esp"));
&set_label("${mode}_bzero");
	&movaps	(&QWP(0,"eax"),"xmm0");
	&lea	("eax",&DWP(16,"eax"));
	&cmp	("ebp","eax");
	&ja	(&label("${mode}_bzero"));

&set_label("${mode}_done");
	&mov	("ebp",&DWP(16,"ebp"));
	&lea	("esp",&DWP(24,"ebp"));
    if ($mode ne "ctr32") {
	&jmp	(&label("${mode}_exit"));

&set_label("${mode}_aligned",16);
	if ($PADLOCK_PREFETCH{$mode}) {
	&lea	("ebp",&DWP(0,$inp,$len));
	&neg	("ebp");
	&and	("ebp",0xfff);		# distance to page boundary
	&xor	("eax","eax");
	&cmp	("ebp",$PADLOCK_PREFETCH{$mode});
	&mov	("ebp",$PADLOCK_PREFETCH{$mode}-1);
	&cmovae	("ebp","eax");
	&and	("ebp",$len);		# remainder
	&sub	($len,"ebp");
	&jz	(&label("${mode}_aligned_tail"));
	}
	&lea	("eax",&DWP(-16,$ctx));	# ivp
	&lea	("ebx",&DWP(16,$ctx));	# key
	&shr	($len,4);		# len/=AES_BLOCK_SIZE
	&data_byte(0xf3,0x0f,0xa7,$opcode);	# rep xcrypt*
	if ($mode ne "ecb") {
	&movaps	("xmm0",&QWP(0,"eax"));
	&movaps	(&QWP(-16,$ctx),"xmm0");	# copy [or refresh] iv
	}
	if ($PADLOCK_PREFETCH{$mode}) {
	&test	("ebp","ebp");
	&jz	(&label("${mode}_exit"));

&set_label("${mode}_aligned_tail");
	&mov	($len,"ebp");
	&lea	("ebp",&DWP(-24,"esp"));
	&mov	("esp","ebp");
	&mov	("eax","ebp");
	&sub	("esp",$len);
	&and	("ebp",-16);
	&and	("esp",-16);
	&mov	(&DWP(16,"ebp"),"eax");
	&mov	("eax", $out);		# save parameters
	&mov	($chunk,$len);
	&shr	($len,2);
	&lea	($out,&DWP(0,"esp"));
	&data_byte(0xf3,0xa5);		# rep movsl
	&mov	($inp,"esp");
	&mov	($out,"eax");		# restore parameters
	&mov	($len,$chunk);
	&jmp	(&label("${mode}_loop"));
	}
&set_label("${mode}_exit");
    }
	&mov	("eax",1);
	&lea	("esp",&DWP(4,"esp"));	# popf
	&emms	()	if ($mode eq "ctr32");
&set_label("${mode}_abort");
&function_end("padlock_${mode}_encrypt");
}

&generate_mode("ecb",0xc8);
&generate_mode("cbc",0xd0);
&generate_mode("cfb",0xe0);
&generate_mode("ofb",0xe8);
&generate_mode("ctr32",0xc8);	# yes, it implements own CTR with ECB opcode,
				# because hardware CTR was introduced later
				# and even has errata on certain C7 stepping.
				# own implementation *always* works, though
				# ~15% slower than dedicated hardware...

# padlock_xstore(arg0, arg1)
#
# Loads the first argument into edi and the second into edx, then
# issues the xstore instruction.
&function_begin_B("padlock_xstore");
	&push	("edi");
	&mov	("edi",&wparam(0));
	&mov	("edx",&wparam(1));
	&data_byte(0x0f,0xa7,0xc0);		# xstore
	&pop	("edi");
	&ret	();
&function_end_B("padlock_xstore");

# _win32_segv_handler
#
# Exception handler installed by the *_oneshot routines on win32/coff
# targets.  For an access violation it advances the dword at offset 184
# of the context record by 4 and asks for execution to continue; any
# other exception is passed on.
&function_begin_B("_win32_segv_handler");
	&mov	("eax",1);			# ExceptionContinueSearch
	&mov	("edx",&wparam(0));		# *ExceptionRecord
	&mov	("ecx",&wparam(2));		# *ContextRecord
	&cmp	(&DWP(0,"edx"),0xC0000005);	# ExceptionRecord->ExceptionCode == STATUS_ACCESS_VIOLATION
	&jne	(&label("ret"));
	&add	(&DWP(184,"ecx"),4);		# skip over rep sha*
	&mov	("eax",0);			# ExceptionContinueExecution
&set_label("ret");
	&ret	();
&function_end_B("_win32_segv_handler");
&safeseh("_win32_segv_handler")			if ($::win32);

# padlock_sha1_oneshot(ctx, inp, len)
#
# Copies the 20-byte state at ctx to a 16-byte aligned scratch area on
# the stack, runs "rep xsha1" with eax=0, then copies the state back.
# On win32/coff targets _win32_segv_handler is installed around the
# instruction.
&function_begin_B("padlock_sha1_oneshot");
	&push	("edi");
	&push	("esi");
	&xor	("eax","eax");
	&mov	("edi",&wparam(0));
	&mov	("esi",&wparam(1));
	&mov	("ecx",&wparam(2));
    if ($::win32 or $::coff) {
	&push	(&::islabel("_win32_segv_handler"));
	&data_byte(0x64,0xff,0x30);		# push	%fs:(%eax)
	&data_byte(0x64,0x89,0x20);		# mov	%esp,%fs:(%eax)
    }
	&mov	("edx","esp");			# put aside %esp
	&add	("esp",-128);			# 32 is enough but spec says 128
	&movups	("xmm0",&QWP(0,"edi"));		# copy-in context
	&and	("esp",-16);
	&mov	("eax",&DWP(16,"edi"));
	&movaps	(&QWP(0,"esp"),"xmm0");
	&mov	("edi","esp");
	&mov	(&DWP(16,"esp"),"eax");
	&xor	("eax","eax");
	&data_byte(0xf3,0x0f,0xa6,0xc8);	# rep xsha1
	&movaps	("xmm0",&QWP(0,"esp"));
	&mov	("eax",&DWP(16,"esp"));
	&mov	("esp","edx");			# restore %esp
    if ($::win32 or $::coff) {
	&data_byte(0x64,0x8f,0x05,0,0,0,0);	# pop	%fs:0
	&lea	("esp",&DWP(4,"esp"));
    }
	&mov	("edi",&wparam(0));
	&movups	(&QWP(0,"edi"),"xmm0");		# copy-out context
	&mov	(&DWP(16,"edi"),"eax");
	&pop	("esi");
	&pop	("edi");
	&ret	();
&function_end_B("padlock_sha1_oneshot");

# padlock_sha1_blocks(ctx, inp, len)
#
# Same state copy-in/copy-out as padlock_sha1_oneshot, but runs
# "rep xsha1" with eax=-1 and installs no exception handler.
&function_begin_B("padlock_sha1_blocks");
	&push	("edi");
	&push	("esi");
	&mov	("edi",&wparam(0));
	&mov	("esi",&wparam(1));
	&mov	("edx","esp");			# put aside %esp
	&mov	("ecx",&wparam(2));
	&add	("esp",-128);
	&movups	("xmm0",&QWP(0,"edi"));		# copy-in context
	&and	("esp",-16);
	&mov	("eax",&DWP(16,"edi"));
	&movaps	(&QWP(0,"esp"),"xmm0");
	&mov	("edi","esp");
	&mov	(&DWP(16,"esp"),"eax");
	&mov	("eax",-1);
	&data_byte(0xf3,0x0f,0xa6,0xc8);	# rep xsha1
	&movaps	("xmm0",&QWP(0,"esp"));
	&mov	("eax",&DWP(16,"esp"));
	&mov	("esp","edx");			# restore %esp
	&mov	("edi",&wparam(0));
	&movups	(&QWP(0,"edi"),"xmm0");		# copy-out context
	&mov	(&DWP(16,"edi"),"eax");
	&pop	("esi");
	&pop	("edi");
	&ret	();
&function_end_B("padlock_sha1_blocks");

# padlock_sha256_oneshot(ctx, inp, len)
#
# Copies the 32-byte state at ctx to a 16-byte aligned scratch area on
# the stack, runs "rep xsha256" with eax=0, then copies the state back.
# On win32/coff targets _win32_segv_handler is installed around the
# instruction.
&function_begin_B("padlock_sha256_oneshot");
	&push	("edi");
	&push	("esi");
	&xor	("eax","eax");
	&mov	("edi",&wparam(0));
	&mov	("esi",&wparam(1));
	&mov	("ecx",&wparam(2));
    if ($::win32 or $::coff) {
	&push	(&::islabel("_win32_segv_handler"));
	&data_byte(0x64,0xff,0x30);		# push	%fs:(%eax)
	&data_byte(0x64,0x89,0x20);		# mov	%esp,%fs:(%eax)
    }
	&mov	("edx","esp");			# put aside %esp
	&add	("esp",-128);
	&movups	("xmm0",&QWP(0,"edi"));		# copy-in context
	&and	("esp",-16);
	&movups	("xmm1",&QWP(16,"edi"));
	&movaps	(&QWP(0,"esp"),"xmm0");
	&mov	("edi","esp");
	&movaps	(&QWP(16,"esp"),"xmm1");
	&xor	("eax","eax");
	&data_byte(0xf3,0x0f,0xa6,0xd0);	# rep xsha256
	&movaps	("xmm0",&QWP(0,"esp"));
	&movaps	("xmm1",&QWP(16,"esp"));
	&mov	("esp","edx");			# restore %esp
    if ($::win32 or $::coff) {
	&data_byte(0x64,0x8f,0x05,0,0,0,0);	# pop	%fs:0
	&lea	("esp",&DWP(4,"esp"));
    }
	&mov	("edi",&wparam(0));
	&movups	(&QWP(0,"edi"),"xmm0");		# copy-out context
	&movups	(&QWP(16,"edi"),"xmm1");
	&pop	("esi");
	&pop	("edi");
	&ret	();
&function_end_B("padlock_sha256_oneshot");

# padlock_sha256_blocks(ctx, inp, len)
#
# Same state copy-in/copy-out as padlock_sha256_oneshot, but runs
# "rep xsha256" with eax=-1 and installs no exception handler.
&function_begin_B("padlock_sha256_blocks");
	&push	("edi");
	&push	("esi");
	&mov	("edi",&wparam(0));
	&mov	("esi",&wparam(1));
	&mov	("ecx",&wparam(2));
	&mov	("edx","esp");			# put aside %esp
	&add	("esp",-128);
	&movups	("xmm0",&QWP(0,"edi"));		# copy-in context
	&and	("esp",-16);
	&movups	("xmm1",&QWP(16,"edi"));
	&movaps	(&QWP(0,"esp"),"xmm0");
	&mov	("edi","esp");
	&movaps	(&QWP(16,"esp"),"xmm1");
	&mov	("eax",-1);
	&data_byte(0xf3,0x0f,0xa6,0xd0);	# rep xsha256
	&movaps	("xmm0",&QWP(0,"esp"));
	&movaps	("xmm1",&QWP(16,"esp"));
	&mov	("esp","edx");			# restore %esp
	&mov	("edi",&wparam(0));
	&movups	(&QWP(0,"edi"),"xmm0");		# copy-out context
	&movups	(&QWP(16,"edi"),"xmm1");
	&pop	("esi");
	&pop	("edi");
	&ret	();
&function_end_B("padlock_sha256_blocks");

# padlock_sha512_blocks(ctx, inp, len)
#
# Copies the 64-byte state at ctx to a 16-byte aligned scratch area on
# the stack, runs "rep xsha512", then copies the state back.
&function_begin_B("padlock_sha512_blocks");
	&push	("edi");
	&push	("esi");
	&mov	("edi",&wparam(0));
	&mov	("esi",&wparam(1));
	&mov	("ecx",&wparam(2));
	&mov	("edx","esp");			# put aside %esp
	&add	("esp",-128);
	&movups	("xmm0",&QWP(0,"edi"));		# copy-in context
	&and	("esp",-16);
	&movups	("xmm1",&QWP(16,"edi"));
	&movups	("xmm2",&QWP(32,"edi"));
	&movups	("xmm3",&QWP(48,"edi"));
	&movaps	(&QWP(0,"esp"),"xmm0");
	&mov	("edi","esp");
	&movaps	(&QWP(16,"esp"),"xmm1");
	&movaps	(&QWP(32,"esp"),"xmm2");
	&movaps	(&QWP(48,"esp"),"xmm3");
	&data_byte(0xf3,0x0f,0xa6,0xe0);	# rep xsha512
	&movaps	("xmm0",&QWP(0,"esp"));
	&movaps	("xmm1",&QWP(16,"esp"));
	&movaps	("xmm2",&QWP(32,"esp"));
	&movaps	("xmm3",&QWP(48,"esp"));
	&mov	("esp","edx");			# restore %esp
	&mov	("edi",&wparam(0));
	&movups	(&QWP(0,"edi"),"xmm0");		# copy-out context
	&movups	(&QWP(16,"edi"),"xmm1");
	&movups	(&QWP(32,"edi"),"xmm2");
	&movups	(&QWP(48,"edi"),"xmm3");
	&pop	("esi");
	&pop	("edi");
	&ret	();
&function_end_B("padlock_sha512_blocks");

&asciz	("VIA Padlock x86 module, CRYPTOGAMS by <appro\@openssl.org>");
&align	(16);

&dataseg();
# Essentially this variable belongs in thread local storage.
# Having this variable global on the other hand can only cause
# few bogus key reloads [if any at all on single-CPU system],
# so we accept the penalty...
&set_label("padlock_saved_context",4);
&data_word(0);

&asm_finish();

# Buffered write errors only surface at close, so check it.
close STDOUT or die "error closing STDOUT: $!";