#! /usr/bin/env perl
# Copyright 2011-2023 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html


# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================

# September 2011
#
# Assembler helpers for Padlock engine. Compared to original engine
# version relying on inline assembler and compiled with gcc 3.4.6 it
# was measured to provide ~100% improvement on misaligned data in ECB
# mode and ~75% in CBC mode. For aligned data improvement can be
# observed for short inputs only, e.g. 45% for 64-byte messages in
# ECB mode, 20% in CBC. Difference in performance for aligned vs.
# misaligned data depends on misalignment and is either ~1.8x or 2.9x.
# These are approximately same factors as for hardware support, so
# there is little reason to rely on the latter. On the contrary, it
# might actually hurt performance in mixture of aligned and misaligned
# buffers, because a) if you choose to flip 'align' flag in control
# word on per-buffer basis, then you'd have to reload key context,
# which incurs penalty; b) if you choose to set 'align' flag
# permanently, it limits performance even for aligned data to ~1/2.
# All above mentioned results were collected on 1.5GHz C7. Nano on the
# other hand handles unaligned data more gracefully. Depending on
# algorithm and how unaligned data is, hardware can be up to 70% more
# efficient than below software alignment procedures, nor does 'align'
# flag have effect on aligned performance [if has any meaning at all].
# Therefore suggestion is to unconditionally set 'align' flag on Nano
# for optimal performance.

# Make the perlasm helpers reachable relative to this script's own location.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
push(@INC,"${dir}","${dir}../../crypto/perlasm");
require "x86asm.pl";

# The last command-line argument, when present, names the output file.
# Fail loudly instead of silently writing to the inherited STDOUT.
$output=pop;
if ($output) {
	open STDOUT,">",$output or die "can't open $output: $!";
}

&asm_init($ARGV[0]);

%PADLOCK_PREFETCH=(ecb=>128, cbc=>64);	# prefetch errata
$PADLOCK_CHUNK=512;	# Must be a power of 2 larger than 16

# Register roles shared by the routines below.
$ctx="edx";
$out="edi";
$inp="esi";
$len="ecx";
$chunk="ebx";

# padlock_capability
#
# Returns 0 in eax unless all of the following hold:
#   - EFLAGS bit 21 can be toggled (i.e. CPUID is usable);
#   - the CPUID vendor string is "CentaurHauls" or "  Shanghai  "
#     (the literals are byte-reversed because they are compared as
#     little-endian 32-bit immediates, and the Zhaoxin ones carry two
#     blanks each so that they are exactly four bytes long);
#   - CPUID leaf 0xC0000000 reports a maximum leaf >= 0xC0000001.
# Otherwise returns edx of leaf 0xC0000001 with bit 4 replaced by the
# outcome of the "Nano" check on leaf 1.
&function_begin_B("padlock_capability");
	&push	("ebx");
	&pushf	();
	&pop	("eax");
	&mov	("ecx","eax");
	&xor	("eax",1<<21);			# try to flip EFLAGS bit 21
	&push	("eax");
	&popf	();
	&pushf	();
	&pop	("eax");
	&xor	("ecx","eax");			# ecx = bits that actually changed
	&xor	("eax","eax");
	&bt	("ecx",21);
	&jnc	(&label("noluck"));
	&cpuid	();				# leaf 0: vendor string
	&xor	("eax","eax");
	&cmp	("ebx","0x".unpack("H*",'tneC'));
	&jne	(&label("zhaoxin"));
	&cmp	("edx","0x".unpack("H*",'Hrua'));
	&jne	(&label("noluck"));
	&cmp	("ecx","0x".unpack("H*",'slua'));
	&jne	(&label("noluck"));
	&jmp	(&label("zhaoxinEnd"));
&set_label("zhaoxin");
	&cmp	("ebx","0x".unpack("H*",'hS  '));
	&jne	(&label("noluck"));
	&cmp	("edx","0x".unpack("H*",'hgna'));
	&jne	(&label("noluck"));
	&cmp	("ecx","0x".unpack("H*",'  ia'));
	&jne	(&label("noluck"));
&set_label("zhaoxinEnd");
	&mov	("eax",0xC0000000);
	&cpuid	();
	&mov	("edx","eax");
	&xor	("eax","eax");
	&cmp	("edx",0xC0000001);
	&jb	(&label("noluck"));
	&mov	("eax",1);
	&cpuid	();
	&or	("eax",0x0f);
	&xor	("ebx","ebx");
	&and	("eax",0x0fff);
	&cmp	("eax",0x06ff);			# check for Nano
	&sete	("bl");
	&mov	("eax",0xC0000001);
	&push	("ebx");			# cpuid clobbers ebx
	&cpuid	();
	&pop	("ebx");
	&mov	("eax","edx");
	&shl	("ebx",4);			# bit#4 denotes Nano
	&and	("eax",0xffffffef);
	&or	("eax","ebx");
&set_label("noluck");
	&pop	("ebx");
	&ret	();
&function_end_B("padlock_capability");

# padlock_key_bswap
#
# Argument 0 is a pointer.  Reads the dword n at offset 240 of that
# buffer and byte-swaps (n+1)*4 consecutive 32-bit words in place,
# starting at offset 0.
# NOTE(review): offset 240 is presumably the AES round count of the key
# schedule -- confirm against the caller's structure layout.
&function_begin_B("padlock_key_bswap");
	&mov	("edx",&wparam(0));
	&mov	("ecx",&DWP(240,"edx"));
	&inc	("ecx");
	&shl	("ecx",2);			# number of dwords to swap
&set_label("bswap_loop");
	&mov	("eax",&DWP(0,"edx"));
	&bswap	("eax");
	&mov	(&DWP(0,"edx"),"eax");
	&lea	("edx",&DWP(4,"edx"));
	&sub	("ecx",1);
	&jnz	(&label("bswap_loop"));
	&ret	();
&function_end_B("padlock_key_bswap");

# This is heuristic key context tracing. At first one
# believes that one should use atomic swap instructions,
# but it's not actually necessary. Point is that if
# padlock_saved_context was changed by another thread
# after we've read it and before we compare it with ctx,
# our key *shall* be reloaded upon thread context switch
# and we are therefore set in either case...
&static_label("padlock_saved_context");

# padlock_verify_context
#
# Public wrapper: loads the context pointer (argument 0) into $ctx and
# the address of padlock_saved_context into eax (absolute on win32/coff,
# otherwise relative to verify_pic_point and fixed up by the callee),
# pushes EFLAGS, calls _padlock_verify_ctx and drops the saved EFLAGS.
&function_begin_B("padlock_verify_context");
	&mov	($ctx,&wparam(0));
	&lea	("eax",($::win32 or $::coff) ? &DWP(&label("padlock_saved_context")) :
		       &DWP(&label("padlock_saved_context")."-".&label("verify_pic_point")));
	&pushf	();
	&call	("_padlock_verify_ctx");
&set_label("verify_pic_point");
	&lea	("esp",&DWP(4,"esp"));		# discard pushed EFLAGS
	&ret	();
&function_end_B("padlock_verify_context");

# _padlock_verify_ctx
#
# Internal helper.  On entry: $ctx = context pointer, eax = address of
# padlock_saved_context (or its offset from the return address in the
# PIC case), 4(%esp) = caller's saved EFLAGS.  If bit 30 of the saved
# EFLAGS is set and the saved context differs from $ctx, executes
# pushf/popf (the same sequence padlock_reload_key uses).  In all cases
# records $ctx in padlock_saved_context.
&function_begin_B("_padlock_verify_ctx");
	&add	("eax",&DWP(0,"esp"))	if(!($::win32 or $::coff));# &padlock_saved_context
	&bt	(&DWP(4,"esp"),30);		# eflags
	&jnc	(&label("verified"));
	&cmp	($ctx,&DWP(0,"eax"));
	&je	(&label("verified"));
	&pushf	();
	&popf	();
&set_label("verified");
	&mov	(&DWP(0,"eax"),$ctx);
	&ret	();
&function_end_B("_padlock_verify_ctx");

# padlock_reload_key
#
# Executes pushf/popf and returns; takes no arguments.
&function_begin_B("padlock_reload_key");
	&pushf	();
	&popf	();
	&ret	();
&function_end_B("padlock_reload_key");

# padlock_aes_block
#
# Arguments: 0 = out, 1 = inp, 2 = ctx.  Runs "rep xcryptecb" for a
# single block ($len = 1) with the control word at ctx+16 and the key
# at ctx+32.  Saves and restores edi/esi/ebx.
&function_begin_B("padlock_aes_block");
	&push	("edi");
	&push	("esi");
	&push	("ebx");
	&mov	($out,&wparam(0));		# must be 16-byte aligned
	&mov	($inp,&wparam(1));		# must be 16-byte aligned
	&mov	($ctx,&wparam(2));
	&mov	($len,1);
	&lea	("ebx",&DWP(32,$ctx));		# key
	&lea	($ctx,&DWP(16,$ctx));		# control word
	&data_byte(0xf3,0x0f,0xa7,0xc8);	# rep xcryptecb
	&pop	("ebx");
	&pop	("esi");
	&pop	("edi");
	&ret	();
&function_end_B("padlock_aes_block");

# generate_mode($mode,$opcode)
#
# Emits padlock_${mode}_encrypt.  $mode selects the code paths below
# ("ctr32" builds counter blocks itself; modes listed in
# %PADLOCK_PREFETCH get the page-boundary handling), $opcode is the
# last byte of the "rep xcrypt*" instruction encoding.
#
# The generated routine jumps straight to the abort label (without
# setting eax) when ctx is not 16-byte aligned or len is not a multiple
# of 16; on the normal path it returns 1 in eax.
#
# Scratch frame kept at %ebp while looping:
#   0(%ebp)  saved out     4(%ebp)  saved inp     8(%ebp)  saved len
#   12(%ebp) current chunk 16(%ebp) original frame base for unwinding
sub generate_mode {
my ($mode,$opcode) = @_;
# int padlock_$mode_encrypt(void *out, const void *inp,
#		struct padlock_cipher_data *ctx, size_t len);
&function_begin("padlock_${mode}_encrypt");
	&mov	($out,&wparam(0));
	&mov	($inp,&wparam(1));
	&mov	($ctx,&wparam(2));
	&mov	($len,&wparam(3));
	&test	($ctx,15);
	&jnz	(&label("${mode}_abort"));
	&test	($len,15);
	&jnz	(&label("${mode}_abort"));
	&lea	("eax",($::win32 or $::coff) ? &DWP(&label("padlock_saved_context")) :
		       &DWP(&label("padlock_saved_context")."-".&label("${mode}_pic_point")));
	&pushf	();
	&cld	();
	&call	("_padlock_verify_ctx");
&set_label("${mode}_pic_point");
	&lea	($ctx,&DWP(16,$ctx));		# control word
	&xor	("eax","eax");
    if ($mode eq "ctr32") {
	&movq	("mm0",&QWP(-16,$ctx));		# load [upper part of] counter
    } else {
	&xor	("ebx","ebx");
	&test	(&DWP(0,$ctx),1<<5);		# align bit in control word
	&jnz	(&label("${mode}_aligned"));
	&test	($out,0x0f);
	&setz	("al");				# !out_misaligned
	&test	($inp,0x0f);
	&setz	("bl");				# !inp_misaligned
	&test	("eax","ebx");
	&jnz	(&label("${mode}_aligned"));
	&neg	("eax");
    }
	&mov	($chunk,$PADLOCK_CHUNK);
	&not	("eax");			# out_misaligned?-1:0
	&lea	("ebp",&DWP(-24,"esp"));
	&cmp	($len,$chunk);
	&cmovc	($chunk,$len);			# chunk=len>PADLOCK_CHUNK?PADLOCK_CHUNK:len
	&and	("eax",$chunk);			# out_misaligned?chunk:0
	&mov	($chunk,$len);
	&neg	("eax");
	&and	($chunk,$PADLOCK_CHUNK-1);	# chunk=len%PADLOCK_CHUNK
	&lea	("esp",&DWP(0,"eax","ebp"));	# alloca
	&mov	("eax",$PADLOCK_CHUNK);
	&cmovz	($chunk,"eax");			# chunk=chunk?:PADLOCK_CHUNK
	&mov	("eax","ebp");
	&and	("ebp",-16);
	&and	("esp",-16);
	&mov	(&DWP(16,"ebp"),"eax");		# remember unaligned frame base
    if ($PADLOCK_PREFETCH{$mode}) {
	&cmp	($len,$chunk);
	&ja	(&label("${mode}_loop"));
	&mov	("eax",$inp);			# check if prefetch crosses page
	&cmp	("ebp","esp");
	&cmove	("eax",$out);
	&add	("eax",$len);
	&neg	("eax");
	&and	("eax",0xfff);			# distance to page boundary
	&cmp	("eax",$PADLOCK_PREFETCH{$mode});
	&mov	("eax",-$PADLOCK_PREFETCH{$mode});
	&cmovae	("eax",$chunk);			# mask=distance<prefetch?-prefetch:-1
	&and	($chunk,"eax");
	&jz	(&label("${mode}_unaligned_tail"));
    }
	&jmp	(&label("${mode}_loop"));

&set_label("${mode}_loop",16);
	&mov	(&DWP(0,"ebp"),$out);		# save parameters
	&mov	(&DWP(4,"ebp"),$inp);
	&mov	(&DWP(8,"ebp"),$len);
	&mov	($len,$chunk);
	&mov	(&DWP(12,"ebp"),$chunk);	# chunk
    if ($mode eq "ctr32") {
	# Fill the stack buffer with successive counter blocks; the low
	# 32 bits are kept big-endian, hence the bswap around the inc.
	&mov	("ecx",&DWP(-4,$ctx));
	&xor	($out,$out);
	&mov	("eax",&DWP(-8,$ctx));		# borrow $len
&set_label("${mode}_prepare");
	&mov	(&DWP(12,"esp",$out),"ecx");
	&bswap	("ecx");
	&movq	(&QWP(0,"esp",$out),"mm0");
	&inc	("ecx");
	&mov	(&DWP(8,"esp",$out),"eax");
	&bswap	("ecx");
	&lea	($out,&DWP(16,$out));
	&cmp	($out,$chunk);
	&jb	(&label("${mode}_prepare"));

	&mov	(&DWP(-4,$ctx),"ecx");		# store advanced counter
	&lea	($inp,&DWP(0,"esp"));
	&lea	($out,&DWP(0,"esp"));
	&mov	($len,$chunk);
    } else {
	&test	($out,0x0f);			# out_misaligned
	&cmovnz	($out,"esp");
	&test	($inp,0x0f);			# inp_misaligned
	&jz	(&label("${mode}_inp_aligned"));
	&shr	($len,2);
	&data_byte(0xf3,0xa5);			# rep movsl
	&sub	($out,$chunk);
	&mov	($len,$chunk);
	&mov	($inp,$out);
&set_label("${mode}_inp_aligned");
    }
	&lea	("eax",&DWP(-16,$ctx));		# ivp
	&lea	("ebx",&DWP(16,$ctx));		# key
	&shr	($len,4);			# len/=AES_BLOCK_SIZE
	&data_byte(0xf3,0x0f,0xa7,$opcode);	# rep xcrypt*
    if ($mode !~ /ecb|ctr/) {
	&movaps	("xmm0",&QWP(0,"eax"));
	&movaps	(&QWP(-16,$ctx),"xmm0");	# copy [or refresh] iv
    }
	&mov	($out,&DWP(0,"ebp"));		# restore parameters
	&mov	($chunk,&DWP(12,"ebp"));
    if ($mode eq "ctr32") {
	# XOR the encrypted counter blocks on the stack with the input.
	&mov	($inp,&DWP(4,"ebp"));
	&xor	($len,$len);
&set_label("${mode}_xor");
	&movups	("xmm1",&QWP(0,$inp,$len));
	&lea	($len,&DWP(16,$len));
	&pxor	("xmm1",&QWP(-16,"esp",$len));
	&movups	(&QWP(-16,$out,$len),"xmm1");
	&cmp	($len,$chunk);
	&jb	(&label("${mode}_xor"));
    } else {
	&test	($out,0x0f);
	&jz	(&label("${mode}_out_aligned"));
	&mov	($len,$chunk);
	&lea	($inp,&DWP(0,"esp"));
	&shr	($len,2);
	&data_byte(0xf3,0xa5);			# rep movsl
	&sub	($out,$chunk);
&set_label("${mode}_out_aligned");
	&mov	($inp,&DWP(4,"ebp"));
    }
	&mov	($len,&DWP(8,"ebp"));
	&add	($out,$chunk);
	&add	($inp,$chunk);
	&sub	($len,$chunk);
	&mov	($chunk,$PADLOCK_CHUNK);
    if (!$PADLOCK_PREFETCH{$mode}) {
	&jnz	(&label("${mode}_loop"));
    } else {
	&jz	(&label("${mode}_break"));
	&cmp	($len,$chunk);
	&jae	(&label("${mode}_loop"));

&set_label("${mode}_unaligned_tail");
	&xor	("eax","eax");
	&cmp	("esp","ebp");
	&cmove	("eax",$len);
	&sub	("esp","eax");			# alloca
	&mov	("eax", $out);			# save parameters
	&mov	($chunk,$len);
	&shr	($len,2);
	&lea	($out,&DWP(0,"esp"));
	&data_byte(0xf3,0xa5);			# rep movsl
	&mov	($inp,"esp");
	&mov	($out,"eax");			# restore parameters
	&mov	($len,$chunk);
	&jmp	(&label("${mode}_loop"));

&set_label("${mode}_break",16);
    }
    if ($mode ne "ctr32") {
	&cmp	("esp","ebp");			# no stack buffer was allocated
	&je	(&label("${mode}_done"));
    }
	# Wipe the stack buffer between %esp and %ebp before releasing it.
	&pxor	("xmm0","xmm0");
	&lea	("eax",&DWP(0,"esp"));
&set_label("${mode}_bzero");
	&movaps	(&QWP(0,"eax"),"xmm0");
	&lea	("eax",&DWP(16,"eax"));
	&cmp	("ebp","eax");
	&ja	(&label("${mode}_bzero"));

&set_label("${mode}_done");
	&mov	("ebp",&DWP(16,"ebp"));
	&lea	("esp",&DWP(24,"ebp"));
    if ($mode ne "ctr32") {
	&jmp	(&label("${mode}_exit"));

&set_label("${mode}_aligned",16);
	if ($PADLOCK_PREFETCH{$mode}) {
	&lea	("ebp",&DWP(0,$inp,$len));
	&neg	("ebp");
	&and	("ebp",0xfff);			# distance to page boundary
	&xor	("eax","eax");
	&cmp	("ebp",$PADLOCK_PREFETCH{$mode});
	&mov	("ebp",$PADLOCK_PREFETCH{$mode}-1);
	&cmovae	("ebp","eax");
	&and	("ebp",$len);			# remainder
	&sub	($len,"ebp");
	&jz	(&label("${mode}_aligned_tail"));
	}
	&lea	("eax",&DWP(-16,$ctx));		# ivp
	&lea	("ebx",&DWP(16,$ctx));		# key
	&shr	($len,4);			# len/=AES_BLOCK_SIZE
	&data_byte(0xf3,0x0f,0xa7,$opcode);	# rep xcrypt*
	if ($mode ne "ecb") {
	&movaps	("xmm0",&QWP(0,"eax"));
	&movaps	(&QWP(-16,$ctx),"xmm0");	# copy [or refresh] iv
	}
	if ($PADLOCK_PREFETCH{$mode}) {
	&test	("ebp","ebp");
	&jz	(&label("${mode}_exit"));

&set_label("${mode}_aligned_tail");
	&mov	($len,"ebp");
	&lea	("ebp",&DWP(-24,"esp"));
	&mov	("esp","ebp");
	&mov	("eax","ebp");
	&sub	("esp",$len);
	&and	("ebp",-16);
	&and	("esp",-16);
	&mov	(&DWP(16,"ebp"),"eax");
	&mov	("eax", $out);			# save parameters
	&mov	($chunk,$len);
	&shr	($len,2);
	&lea	($out,&DWP(0,"esp"));
	&data_byte(0xf3,0xa5);			# rep movsl
	&mov	($inp,"esp");
	&mov	($out,"eax");			# restore parameters
	&mov	($len,$chunk);
	&jmp	(&label("${mode}_loop"));
	}
&set_label("${mode}_exit");
    }
	&mov	("eax",1);
	&lea	("esp",&DWP(4,"esp"));		# popf
	&emms	()				if ($mode eq "ctr32");
&set_label("${mode}_abort");
&function_end("padlock_${mode}_encrypt");
}

&generate_mode("ecb",0xc8);
&generate_mode("cbc",0xd0);
&generate_mode("cfb",0xe0);
&generate_mode("ofb",0xe8);
&generate_mode("ctr32",0xc8);	# yes, it implements own CTR with ECB opcode,
				# because hardware CTR was introduced later
				# and even has errata on certain C7 stepping.
				# own implementation *always* works, though
				# ~15% slower than dedicated hardware...

# padlock_xstore
#
# Arguments: 0 -> edi, 1 -> edx.  Executes "xstore" and returns with
# eax as left by the instruction.  edi is saved and restored.
&function_begin_B("padlock_xstore");
	&push	("edi");
	&mov	("edi",&wparam(0));
	&mov	("edx",&wparam(1));
	&data_byte(0x0f,0xa7,0xc0);		# xstore
	&pop	("edi");
	&ret	();
&function_end_B("padlock_xstore");

# _win32_segv_handler
#
# Exception handler installed by the *_oneshot routines on win32/coff.
# Returns 1 (ExceptionContinueSearch) unless the exception code is
# 0xC0000005, in which case it adds 4 to the dword at offset 184 of the
# context record and returns 0 (ExceptionContinueExecution).
# NOTE(review): offset 184 is presumably CONTEXT.Eip and 4 the length of
# the "rep xsha*" encoding -- confirm against the Win32 headers.
&function_begin_B("_win32_segv_handler");
	&mov	("eax",1);			# ExceptionContinueSearch
	&mov	("edx",&wparam(0));		# *ExceptionRecord
	&mov	("ecx",&wparam(2));		# *ContextRecord
	&cmp	(&DWP(0,"edx"),0xC0000005);	# ExceptionRecord->ExceptionCode == STATUS_ACCESS_VIOLATION
	&jne	(&label("ret"));
	&add	(&DWP(184,"ecx"),4);		# skip over rep sha*
	&mov	("eax",0);			# ExceptionContinueExecution
&set_label("ret");
	&ret	();
&function_end_B("_win32_segv_handler");
&safeseh("_win32_segv_handler")			if ($::win32);

# padlock_sha1_oneshot
#
# Arguments: 0 = context, 1 -> esi, 2 -> ecx.  Copies 20 bytes of
# context to a 16-byte aligned scratch area on the stack, runs
# "rep xsha1" with eax = 0, then copies the 20 bytes back.  On
# win32/coff an exception registration pointing at _win32_segv_handler
# is linked in at %fs:0 for the duration of the call.
&function_begin_B("padlock_sha1_oneshot");
	&push	("edi");
	&push	("esi");
	&xor	("eax","eax");
	&mov	("edi",&wparam(0));
	&mov	("esi",&wparam(1));
	&mov	("ecx",&wparam(2));
    if ($::win32 or $::coff) {
	&push	(&::islabel("_win32_segv_handler"));
	&data_byte(0x64,0xff,0x30);		# push	%fs:(%eax)
	&data_byte(0x64,0x89,0x20);		# mov	%esp,%fs:(%eax)
    }
	&mov	("edx","esp");			# put aside %esp
	&add	("esp",-128);			# 32 is enough but spec says 128
	&movups	("xmm0",&QWP(0,"edi"));		# copy-in context
	&and	("esp",-16);
	&mov	("eax",&DWP(16,"edi"));
	&movaps	(&QWP(0,"esp"),"xmm0");
	&mov	("edi","esp");
	&mov	(&DWP(16,"esp"),"eax");
	&xor	("eax","eax");
	&data_byte(0xf3,0x0f,0xa6,0xc8);	# rep xsha1
	&movaps	("xmm0",&QWP(0,"esp"));
	&mov	("eax",&DWP(16,"esp"));
	&mov	("esp","edx");			# restore %esp
    if ($::win32 or $::coff) {
	&data_byte(0x64,0x8f,0x05,0,0,0,0);	# pop	%fs:0
	&lea	("esp",&DWP(4,"esp"));
    }
	&mov	("edi",&wparam(0));
	&movups	(&QWP(0,"edi"),"xmm0");		# copy-out context
	&mov	(&DWP(16,"edi"),"eax");
	&pop	("esi");
	&pop	("edi");
	&ret	();
&function_end_B("padlock_sha1_oneshot");

# padlock_sha1_blocks
#
# Same copy-in / "rep xsha1" / copy-out sequence as the oneshot variant,
# but with eax = -1 when the instruction runs and without the win32
# exception registration.
&function_begin_B("padlock_sha1_blocks");
	&push	("edi");
	&push	("esi");
	&mov	("edi",&wparam(0));
	&mov	("esi",&wparam(1));
	&mov	("edx","esp");			# put aside %esp
	&mov	("ecx",&wparam(2));
	&add	("esp",-128);
	&movups	("xmm0",&QWP(0,"edi"));		# copy-in context
	&and	("esp",-16);
	&mov	("eax",&DWP(16,"edi"));
	&movaps	(&QWP(0,"esp"),"xmm0");
	&mov	("edi","esp");
	&mov	(&DWP(16,"esp"),"eax");
	&mov	("eax",-1);
	&data_byte(0xf3,0x0f,0xa6,0xc8);	# rep xsha1
	&movaps	("xmm0",&QWP(0,"esp"));
	&mov	("eax",&DWP(16,"esp"));
	&mov	("esp","edx");			# restore %esp
	&mov	("edi",&wparam(0));
	&movups	(&QWP(0,"edi"),"xmm0");		# copy-out context
	&mov	(&DWP(16,"edi"),"eax");
	&pop	("esi");
	&pop	("edi");
	&ret	();
&function_end_B("padlock_sha1_blocks");

# padlock_sha256_oneshot
#
# Arguments: 0 = context, 1 -> esi, 2 -> ecx.  Copies 32 bytes of
# context to an aligned stack scratch area, runs "rep xsha256" with
# eax = 0 and copies the 32 bytes back.  On win32/coff the call is
# wrapped in the same %fs:0 exception registration as the SHA-1 oneshot.
&function_begin_B("padlock_sha256_oneshot");
	&push	("edi");
	&push	("esi");
	&xor	("eax","eax");
	&mov	("edi",&wparam(0));
	&mov	("esi",&wparam(1));
	&mov	("ecx",&wparam(2));
    if ($::win32 or $::coff) {
	&push	(&::islabel("_win32_segv_handler"));
	&data_byte(0x64,0xff,0x30);		# push	%fs:(%eax)
	&data_byte(0x64,0x89,0x20);		# mov	%esp,%fs:(%eax)
    }
	&mov	("edx","esp");			# put aside %esp
	&add	("esp",-128);
	&movups	("xmm0",&QWP(0,"edi"));		# copy-in context
	&and	("esp",-16);
	&movups	("xmm1",&QWP(16,"edi"));
	&movaps	(&QWP(0,"esp"),"xmm0");
	&mov	("edi","esp");
	&movaps	(&QWP(16,"esp"),"xmm1");
	&xor	("eax","eax");
	&data_byte(0xf3,0x0f,0xa6,0xd0);	# rep xsha256
	&movaps	("xmm0",&QWP(0,"esp"));
	&movaps	("xmm1",&QWP(16,"esp"));
	&mov	("esp","edx");			# restore %esp
    if ($::win32 or $::coff) {
	&data_byte(0x64,0x8f,0x05,0,0,0,0);	# pop	%fs:0
	&lea	("esp",&DWP(4,"esp"));
    }
	&mov	("edi",&wparam(0));
	&movups	(&QWP(0,"edi"),"xmm0");		# copy-out context
	&movups	(&QWP(16,"edi"),"xmm1");
	&pop	("esi");
	&pop	("edi");
	&ret	();
&function_end_B("padlock_sha256_oneshot");

# padlock_sha256_blocks
#
# Same as padlock_sha256_oneshot but with eax = -1 when "rep xsha256"
# runs and without the win32 exception registration.
&function_begin_B("padlock_sha256_blocks");
	&push	("edi");
	&push	("esi");
	&mov	("edi",&wparam(0));
	&mov	("esi",&wparam(1));
	&mov	("ecx",&wparam(2));
	&mov	("edx","esp");			# put aside %esp
	&add	("esp",-128);
	&movups	("xmm0",&QWP(0,"edi"));		# copy-in context
	&and	("esp",-16);
	&movups	("xmm1",&QWP(16,"edi"));
	&movaps	(&QWP(0,"esp"),"xmm0");
	&mov	("edi","esp");
	&movaps	(&QWP(16,"esp"),"xmm1");
	&mov	("eax",-1);
	&data_byte(0xf3,0x0f,0xa6,0xd0);	# rep xsha256
	&movaps	("xmm0",&QWP(0,"esp"));
	&movaps	("xmm1",&QWP(16,"esp"));
	&mov	("esp","edx");			# restore %esp
	&mov	("edi",&wparam(0));
	&movups	(&QWP(0,"edi"),"xmm0");		# copy-out context
	&movups	(&QWP(16,"edi"),"xmm1");
	&pop	("esi");
	&pop	("edi");
	&ret	();
&function_end_B("padlock_sha256_blocks");

# padlock_sha512_blocks
#
# Arguments: 0 = context, 1 -> esi, 2 -> ecx.  Copies 64 bytes of
# context to an aligned stack scratch area, runs "rep xsha512" and
# copies the 64 bytes back.  eax is not initialised before the
# instruction here, unlike the SHA-1/SHA-256 variants.
&function_begin_B("padlock_sha512_blocks");
	&push	("edi");
	&push	("esi");
	&mov	("edi",&wparam(0));
	&mov	("esi",&wparam(1));
	&mov	("ecx",&wparam(2));
	&mov	("edx","esp");			# put aside %esp
	&add	("esp",-128);
	&movups	("xmm0",&QWP(0,"edi"));		# copy-in context
	&and	("esp",-16);
	&movups	("xmm1",&QWP(16,"edi"));
	&movups	("xmm2",&QWP(32,"edi"));
	&movups	("xmm3",&QWP(48,"edi"));
	&movaps	(&QWP(0,"esp"),"xmm0");
	&mov	("edi","esp");
	&movaps	(&QWP(16,"esp"),"xmm1");
	&movaps	(&QWP(32,"esp"),"xmm2");
	&movaps	(&QWP(48,"esp"),"xmm3");
	&data_byte(0xf3,0x0f,0xa6,0xe0);	# rep xsha512
	&movaps	("xmm0",&QWP(0,"esp"));
	&movaps	("xmm1",&QWP(16,"esp"));
	&movaps	("xmm2",&QWP(32,"esp"));
	&movaps	("xmm3",&QWP(48,"esp"));
	&mov	("esp","edx");			# restore %esp
	&mov	("edi",&wparam(0));
	&movups	(&QWP(0,"edi"),"xmm0");		# copy-out context
	&movups	(&QWP(16,"edi"),"xmm1");
	&movups	(&QWP(32,"edi"),"xmm2");
	&movups	(&QWP(48,"edi"),"xmm3");
	&pop	("esi");
	&pop	("edi");
	&ret	();
&function_end_B("padlock_sha512_blocks");

&asciz	("VIA Padlock x86 module, CRYPTOGAMS by <appro\@openssl.org>");
&align	(16);

&dataseg();
# Essentially this variable belongs in thread local storage.
# Having this variable global on the other hand can only cause
# few bogus key reloads [if any at all on single-CPU system],
# so we accept the penalty...
&set_label("padlock_saved_context",4);
&data_word(0);

&asm_finish();

# Buffered write errors only surface at close time, so check it.
close STDOUT or die "error closing STDOUT: $!";