xref: /freebsd/crypto/openssl/engines/asm/e_padlock-x86.pl (revision e71b70530d95c4f34d8bdbd78d1242df1ba4a945)
1*e71b7053SJung-uk Kim#! /usr/bin/env perl
2*e71b7053SJung-uk Kim# Copyright 2011-2018 The OpenSSL Project Authors. All Rights Reserved.
3*e71b7053SJung-uk Kim#
4*e71b7053SJung-uk Kim# Licensed under the OpenSSL license (the "License").  You may not use
5*e71b7053SJung-uk Kim# this file except in compliance with the License.  You can obtain a copy
6*e71b7053SJung-uk Kim# in the file LICENSE in the source distribution or at
7*e71b7053SJung-uk Kim# https://www.openssl.org/source/license.html
8*e71b7053SJung-uk Kim
9*e71b7053SJung-uk Kim
10*e71b7053SJung-uk Kim# ====================================================================
11*e71b7053SJung-uk Kim# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
12*e71b7053SJung-uk Kim# project. The module is, however, dual licensed under OpenSSL and
13*e71b7053SJung-uk Kim# CRYPTOGAMS licenses depending on where you obtain it. For further
14*e71b7053SJung-uk Kim# details see http://www.openssl.org/~appro/cryptogams/.
15*e71b7053SJung-uk Kim# ====================================================================
16*e71b7053SJung-uk Kim
17*e71b7053SJung-uk Kim# September 2011
18*e71b7053SJung-uk Kim#
19*e71b7053SJung-uk Kim# Assembler helpers for Padlock engine. Compared to original engine
20*e71b7053SJung-uk Kim# version relying on inline assembler and compiled with gcc 3.4.6 it
21*e71b7053SJung-uk Kim# was measured to provide ~100% improvement on misaligned data in ECB
22*e71b7053SJung-uk Kim# mode and ~75% in CBC mode. For aligned data improvement can be
23*e71b7053SJung-uk Kim# observed for short inputs only, e.g. 45% for 64-byte messages in
24*e71b7053SJung-uk Kim# ECB mode, 20% in CBC. Difference in performance for aligned vs.
25*e71b7053SJung-uk Kim# misaligned data depends on misalignment and is either ~1.8x or 2.9x.
26*e71b7053SJung-uk Kim# These are approximately same factors as for hardware support, so
27*e71b7053SJung-uk Kim# there is little reason to rely on the latter. On the contrary, it
28*e71b7053SJung-uk Kim# might actually hurt performance in mixture of aligned and misaligned
29*e71b7053SJung-uk Kim# buffers, because a) if you choose to flip 'align' flag in control
30*e71b7053SJung-uk Kim# word on per-buffer basis, then you'd have to reload key context,
31*e71b7053SJung-uk Kim# which incurs penalty; b) if you choose to set 'align' flag
32*e71b7053SJung-uk Kim# permanently, it limits performance even for aligned data to ~1/2.
33*e71b7053SJung-uk Kim# All above mentioned results were collected on 1.5GHz C7. Nano on the
34*e71b7053SJung-uk Kim# other hand handles unaligned data more gracefully. Depending on
35*e71b7053SJung-uk Kim# algorithm and how unaligned data is, hardware can be up to 70% more
36*e71b7053SJung-uk Kim# efficient than below software alignment procedures, nor does 'align'
37*e71b7053SJung-uk Kim# flag have any effect on aligned performance [if it has any meaning at all].
38*e71b7053SJung-uk Kim# Therefore suggestion is to unconditionally set 'align' flag on Nano
39*e71b7053SJung-uk Kim# for optimal performance.
40*e71b7053SJung-uk Kim
41*e71b7053SJung-uk Kim$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;	# directory part of the script path
42*e71b7053SJung-uk Kimpush(@INC,"${dir}","${dir}../../crypto/perlasm");	# where x86asm.pl is searched for
43*e71b7053SJung-uk Kimrequire "x86asm.pl";
44*e71b7053SJung-uk Kim
45*e71b7053SJung-uk Kim$output=pop;	# last command-line argument is the output file name
46*e71b7053SJung-uk Kimopen STDOUT,">$output" or die "can't open $output: $!";	# fail loudly instead of silently emitting nothing
47*e71b7053SJung-uk Kim
48*e71b7053SJung-uk Kim&asm_init($ARGV[0]);
49*e71b7053SJung-uk Kim
50*e71b7053SJung-uk Kim%PADLOCK_PREFETCH=(ecb=>128, cbc=>64);	# prefetch errata
51*e71b7053SJung-uk Kim$PADLOCK_CHUNK=512;	# Must be a power of 2 larger than 16
52*e71b7053SJung-uk Kim
53*e71b7053SJung-uk Kim$ctx="edx";	# register aliases used by the cipher routines below
54*e71b7053SJung-uk Kim$out="edi";
55*e71b7053SJung-uk Kim$inp="esi";
56*e71b7053SJung-uk Kim$len="ecx";
57*e71b7053SJung-uk Kim$chunk="ebx";
58*e71b7053SJung-uk Kim
59*e71b7053SJung-uk Kim&function_begin_B("padlock_capability");	# eax = edx of cpuid 0xC0000001 with bit 4 replaced by a Nano flag, or 0
60*e71b7053SJung-uk Kim	&push	("ebx");			# ebx is overwritten by cpuid below
61*e71b7053SJung-uk Kim	&pushf	();
62*e71b7053SJung-uk Kim	&pop	("eax");
63*e71b7053SJung-uk Kim	&mov	("ecx","eax");			# keep original eflags image
64*e71b7053SJung-uk Kim	&xor	("eax",1<<21);			# try to flip eflags bit 21
65*e71b7053SJung-uk Kim	&push	("eax");
66*e71b7053SJung-uk Kim	&popf	();
67*e71b7053SJung-uk Kim	&pushf	();
68*e71b7053SJung-uk Kim	&pop	("eax");
69*e71b7053SJung-uk Kim	&xor	("ecx","eax");			# ecx = bits that really changed
70*e71b7053SJung-uk Kim	&xor	("eax","eax");			# eax = 0: return value for noluck and cpuid leaf 0
71*e71b7053SJung-uk Kim	&bt	("ecx",21);
72*e71b7053SJung-uk Kim	&jnc	(&label("noluck"));		# bit 21 did not toggle
73*e71b7053SJung-uk Kim	&cpuid	();
74*e71b7053SJung-uk Kim	&xor	("eax","eax");
75*e71b7053SJung-uk Kim	&cmp	("ebx","0x".unpack("H*",'tneC'));	# "Cent" as a little-endian dword
76*e71b7053SJung-uk Kim	&jne	(&label("zhaoxin"));
77*e71b7053SJung-uk Kim	&cmp	("edx","0x".unpack("H*",'Hrua'));	# "aurH"
78*e71b7053SJung-uk Kim	&jne	(&label("noluck"));
79*e71b7053SJung-uk Kim	&cmp	("ecx","0x".unpack("H*",'slua'));	# "auls"
80*e71b7053SJung-uk Kim	&jne	(&label("noluck"));
81*e71b7053SJung-uk Kim	&jmp	(&label("zhaoxinEnd"));
82*e71b7053SJung-uk Kim&set_label("zhaoxin");
83*e71b7053SJung-uk Kim	&cmp	("ebx","0x".unpack("H*",'hS  '));	# "  Sh"
84*e71b7053SJung-uk Kim	&jne	(&label("noluck"));
85*e71b7053SJung-uk Kim	&cmp	("edx","0x".unpack("H*",'hgna'));	# "angh"
86*e71b7053SJung-uk Kim	&jne	(&label("noluck"));
87*e71b7053SJung-uk Kim	&cmp	("ecx","0x".unpack("H*",'  ia'));	# "ai  "
88*e71b7053SJung-uk Kim	&jne	(&label("noluck"));
89*e71b7053SJung-uk Kim&set_label("zhaoxinEnd");
90*e71b7053SJung-uk Kim	&mov	("eax",0xC0000000);
91*e71b7053SJung-uk Kim	&cpuid	();
92*e71b7053SJung-uk Kim	&mov	("edx","eax");
93*e71b7053SJung-uk Kim	&xor	("eax","eax");
94*e71b7053SJung-uk Kim	&cmp	("edx",0xC0000001);		# leaf 0xC0000001 must be reported as available
95*e71b7053SJung-uk Kim	&jb	(&label("noluck"));
96*e71b7053SJung-uk Kim	&mov	("eax",1);
97*e71b7053SJung-uk Kim	&cpuid	();
98*e71b7053SJung-uk Kim	&or	("eax",0x0f);			# force low nibble so it is ignored by the compare
99*e71b7053SJung-uk Kim	&xor	("ebx","ebx");
100*e71b7053SJung-uk Kim	&and	("eax",0x0fff);
101*e71b7053SJung-uk Kim	&cmp	("eax",0x06ff);		# check for Nano
102*e71b7053SJung-uk Kim	&sete	("bl");
103*e71b7053SJung-uk Kim	&mov	("eax",0xC0000001);
104*e71b7053SJung-uk Kim	&push	("ebx");			# keep Nano flag across cpuid
105*e71b7053SJung-uk Kim	&cpuid	();
106*e71b7053SJung-uk Kim	&pop	("ebx");
107*e71b7053SJung-uk Kim	&mov	("eax","edx");
108*e71b7053SJung-uk Kim	&shl	("ebx",4);		# bit#4 denotes Nano
109*e71b7053SJung-uk Kim	&and	("eax",0xffffffef);
110*e71b7053SJung-uk Kim	&or	("eax","ebx");
111*e71b7053SJung-uk Kim&set_label("noluck");
112*e71b7053SJung-uk Kim	&pop	("ebx");
113*e71b7053SJung-uk Kim	&ret	();
114*e71b7053SJung-uk Kim&function_end_B("padlock_capability");
115*e71b7053SJung-uk Kim
116*e71b7053SJung-uk Kim&function_begin_B("padlock_key_bswap");	# byte-swap, in place, the 32-bit words of the buffer passed as argument 0
117*e71b7053SJung-uk Kim	&mov	("edx",&wparam(0));
118*e71b7053SJung-uk Kim	&mov	("ecx",&DWP(240,"edx"));	# word count is read from offset 240 of the buffer; presumably must be non-zero - confirm with caller
119*e71b7053SJung-uk Kim&set_label("bswap_loop");
120*e71b7053SJung-uk Kim	&mov	("eax",&DWP(0,"edx"));
121*e71b7053SJung-uk Kim	&bswap	("eax");
122*e71b7053SJung-uk Kim	&mov	(&DWP(0,"edx"),"eax");
123*e71b7053SJung-uk Kim	&lea	("edx",&DWP(4,"edx"));		# advance to next word
124*e71b7053SJung-uk Kim	&sub	("ecx",1);
125*e71b7053SJung-uk Kim	&jnz	(&label("bswap_loop"));
126*e71b7053SJung-uk Kim	&ret	();
127*e71b7053SJung-uk Kim&function_end_B("padlock_key_bswap");
128*e71b7053SJung-uk Kim
129*e71b7053SJung-uk Kim# This is heuristic key context tracing. At first one
130*e71b7053SJung-uk Kim# believes that one should use atomic swap instructions,
131*e71b7053SJung-uk Kim# but it's not actually necessary. Point is that if
132*e71b7053SJung-uk Kim# padlock_saved_context was changed by another thread
133*e71b7053SJung-uk Kim# after we've read it and before we compare it with ctx,
134*e71b7053SJung-uk Kim# our key *shall* be reloaded upon thread context switch
135*e71b7053SJung-uk Kim# and we are therefore set in either case...
136*e71b7053SJung-uk Kim&static_label("padlock_saved_context");
137*e71b7053SJung-uk Kim
138*e71b7053SJung-uk Kim&function_begin_B("padlock_verify_context");	# public wrapper: argument 0 is the context to verify
139*e71b7053SJung-uk Kim	&mov	($ctx,&wparam(0));
140*e71b7053SJung-uk Kim	&lea	("eax",($::win32 or $::coff) ? &DWP(&label("padlock_saved_context")) :
141*e71b7053SJung-uk Kim		       &DWP(&label("padlock_saved_context")."-".&label("verify_pic_point")));
142*e71b7053SJung-uk Kim	&pushf	();				# eflags image inspected by _padlock_verify_ctx
143*e71b7053SJung-uk Kim	&call	("_padlock_verify_ctx");
144*e71b7053SJung-uk Kim&set_label("verify_pic_point");		# return address of the call; base for the position-independent offset above
145*e71b7053SJung-uk Kim	&lea	("esp",&DWP(4,"esp"));		# drop the pushed eflags without reloading them
146*e71b7053SJung-uk Kim	&ret	();
147*e71b7053SJung-uk Kim&function_end_B("padlock_verify_context");
148*e71b7053SJung-uk Kim
149*e71b7053SJung-uk Kim&function_begin_B("_padlock_verify_ctx");	# in: $ctx, eax = address (or PIC offset) of padlock_saved_context, eflags image at 4(esp)
150*e71b7053SJung-uk Kim	&add	("eax",&DWP(0,"esp")) if(!($::win32 or $::coff));# &padlock_saved_context
151*e71b7053SJung-uk Kim	&bt	(&DWP(4,"esp"),30);		# eflags
152*e71b7053SJung-uk Kim	&jnc	(&label("verified"));		# bit 30 clear: nothing to compare, just record ctx
153*e71b7053SJung-uk Kim	&cmp	($ctx,&DWP(0,"eax"));
154*e71b7053SJung-uk Kim	&je	(&label("verified"));		# same context as last time
155*e71b7053SJung-uk Kim	&pushf	();				# same pushf/popf pair as padlock_reload_key
156*e71b7053SJung-uk Kim	&popf	();
157*e71b7053SJung-uk Kim&set_label("verified");
158*e71b7053SJung-uk Kim	&mov	(&DWP(0,"eax"),$ctx);		# remember ctx in padlock_saved_context
159*e71b7053SJung-uk Kim	&ret	();
160*e71b7053SJung-uk Kim&function_end_B("_padlock_verify_ctx");
161*e71b7053SJung-uk Kim
162*e71b7053SJung-uk Kim&function_begin_B("padlock_reload_key");	# takes no arguments
163*e71b7053SJung-uk Kim	&pushf	();				# rewriting eflags presumably makes the hardware reload its key - confirm against PadLock docs
164*e71b7053SJung-uk Kim	&popf	();
165*e71b7053SJung-uk Kim	&ret	();
166*e71b7053SJung-uk Kim&function_end_B("padlock_reload_key");
167*e71b7053SJung-uk Kim
168*e71b7053SJung-uk Kim&function_begin_B("padlock_aes_block");	# process a single block: arguments are (out, inp, ctx)
169*e71b7053SJung-uk Kim	&push	("edi");
170*e71b7053SJung-uk Kim	&push	("esi");
171*e71b7053SJung-uk Kim	&push	("ebx");
172*e71b7053SJung-uk Kim	&mov	($out,&wparam(0));		# must be 16-byte aligned
173*e71b7053SJung-uk Kim	&mov	($inp,&wparam(1));		# must be 16-byte aligned
174*e71b7053SJung-uk Kim	&mov	($ctx,&wparam(2));
175*e71b7053SJung-uk Kim	&mov	($len,1);			# block count for the rep prefix
176*e71b7053SJung-uk Kim	&lea	("ebx",&DWP(32,$ctx));		# key
177*e71b7053SJung-uk Kim	&lea	($ctx,&DWP(16,$ctx));		# control word
178*e71b7053SJung-uk Kim	&data_byte(0xf3,0x0f,0xa7,0xc8);	# rep xcryptecb
179*e71b7053SJung-uk Kim	&pop	("ebx");
180*e71b7053SJung-uk Kim	&pop	("esi");
181*e71b7053SJung-uk Kim	&pop	("edi");
182*e71b7053SJung-uk Kim	&ret	();
183*e71b7053SJung-uk Kim&function_end_B("padlock_aes_block");
184*e71b7053SJung-uk Kim
185*e71b7053SJung-uk Kimsub generate_mode {	# emit padlock_${mode}_encrypt; $opcode is the last byte of the rep xcrypt* encoding
186*e71b7053SJung-uk Kimmy ($mode,$opcode) = @_;
187*e71b7053SJung-uk Kim# int padlock_$mode_encrypt(void *out, const void *inp,
188*e71b7053SJung-uk Kim#		struct padlock_cipher_data *ctx, size_t len);
189*e71b7053SJung-uk Kim&function_begin("padlock_${mode}_encrypt");
190*e71b7053SJung-uk Kim	&mov	($out,&wparam(0));
191*e71b7053SJung-uk Kim	&mov	($inp,&wparam(1));
192*e71b7053SJung-uk Kim	&mov	($ctx,&wparam(2));
193*e71b7053SJung-uk Kim	&mov	($len,&wparam(3));
	&xor	("eax","eax");		# return 0 if either check below aborts (eax was previously left unset)
194*e71b7053SJung-uk Kim	&test	($ctx,15);		# ctx must be 16-byte aligned
195*e71b7053SJung-uk Kim	&jnz	(&label("${mode}_abort"));
196*e71b7053SJung-uk Kim	&test	($len,15);		# len must be a multiple of 16
197*e71b7053SJung-uk Kim	&jnz	(&label("${mode}_abort"));
198*e71b7053SJung-uk Kim	&lea	("eax",($::win32 or $::coff) ? &DWP(&label("padlock_saved_context")) :
199*e71b7053SJung-uk Kim		       &DWP(&label("padlock_saved_context")."-".&label("${mode}_pic_point")));
200*e71b7053SJung-uk Kim	&pushf	();			# eflags image for _padlock_verify_ctx; dropped at ${mode}_exit
201*e71b7053SJung-uk Kim	&cld	();			# rep movsl below must copy forwards
202*e71b7053SJung-uk Kim	&call	("_padlock_verify_ctx");
203*e71b7053SJung-uk Kim&set_label("${mode}_pic_point");
204*e71b7053SJung-uk Kim	&lea	($ctx,&DWP(16,$ctx));	# control word
205*e71b7053SJung-uk Kim	&xor	("eax","eax");
206*e71b7053SJung-uk Kim					if ($mode eq "ctr32") {
207*e71b7053SJung-uk Kim	&movq	("mm0",&QWP(-16,$ctx));	# load [upper part of] counter
208*e71b7053SJung-uk Kim					} else {
209*e71b7053SJung-uk Kim	&xor	("ebx","ebx");
210*e71b7053SJung-uk Kim	&test	(&DWP(0,$ctx),1<<5);	# align bit in control word
211*e71b7053SJung-uk Kim	&jnz	(&label("${mode}_aligned"));
212*e71b7053SJung-uk Kim	&test	($out,0x0f);
213*e71b7053SJung-uk Kim	&setz	("al");			# !out_misaligned
214*e71b7053SJung-uk Kim	&test	($inp,0x0f);
215*e71b7053SJung-uk Kim	&setz	("bl");			# !inp_misaligned
216*e71b7053SJung-uk Kim	&test	("eax","ebx");
217*e71b7053SJung-uk Kim	&jnz	(&label("${mode}_aligned"));
218*e71b7053SJung-uk Kim	&neg	("eax");
219*e71b7053SJung-uk Kim					}
220*e71b7053SJung-uk Kim	&mov	($chunk,$PADLOCK_CHUNK);
221*e71b7053SJung-uk Kim	&not	("eax");		# out_misaligned?-1:0
222*e71b7053SJung-uk Kim	&lea	("ebp",&DWP(-24,"esp"));
223*e71b7053SJung-uk Kim	&cmp	($len,$chunk);
224*e71b7053SJung-uk Kim	&cmovc	($chunk,$len);		# chunk=len>PADLOCK_CHUNK?PADLOCK_CHUNK:len
225*e71b7053SJung-uk Kim	&and	("eax",$chunk);		# out_misaligned?chunk:0
226*e71b7053SJung-uk Kim	&mov	($chunk,$len);
227*e71b7053SJung-uk Kim	&neg	("eax");
228*e71b7053SJung-uk Kim	&and	($chunk,$PADLOCK_CHUNK-1);	# chunk=len%PADLOCK_CHUNK
229*e71b7053SJung-uk Kim	&lea	("esp",&DWP(0,"eax","ebp"));	# alloca
230*e71b7053SJung-uk Kim	&mov	("eax",$PADLOCK_CHUNK);
231*e71b7053SJung-uk Kim	&cmovz	($chunk,"eax");			# chunk=chunk?:PADLOCK_CHUNK
232*e71b7053SJung-uk Kim	&mov	("eax","ebp");
233*e71b7053SJung-uk Kim	&and	("ebp",-16);
234*e71b7053SJung-uk Kim	&and	("esp",-16);
235*e71b7053SJung-uk Kim	&mov	(&DWP(16,"ebp"),"eax");		# unaligned frame pointer, used by ${mode}_done to restore esp
236*e71b7053SJung-uk Kim    if ($PADLOCK_PREFETCH{$mode}) {
237*e71b7053SJung-uk Kim	&cmp	($len,$chunk);
238*e71b7053SJung-uk Kim	&ja	(&label("${mode}_loop"));
239*e71b7053SJung-uk Kim	&mov	("eax",$inp);		# check if prefetch crosses page
240*e71b7053SJung-uk Kim	&cmp	("ebp","esp");
241*e71b7053SJung-uk Kim	&cmove	("eax",$out);
242*e71b7053SJung-uk Kim	&add	("eax",$len);
243*e71b7053SJung-uk Kim	&neg	("eax");
244*e71b7053SJung-uk Kim	&and	("eax",0xfff);		# distance to page boundary
245*e71b7053SJung-uk Kim	&cmp	("eax",$PADLOCK_PREFETCH{$mode});
246*e71b7053SJung-uk Kim	&mov	("eax",-$PADLOCK_PREFETCH{$mode});
247*e71b7053SJung-uk Kim	&cmovae	("eax",$chunk);		# mask=distance<prefetch?-prefetch:-1
248*e71b7053SJung-uk Kim	&and	($chunk,"eax");
249*e71b7053SJung-uk Kim	&jz	(&label("${mode}_unaligned_tail"));
250*e71b7053SJung-uk Kim    }
251*e71b7053SJung-uk Kim	&jmp	(&label("${mode}_loop"));
252*e71b7053SJung-uk Kim
253*e71b7053SJung-uk Kim&set_label("${mode}_loop",16);
254*e71b7053SJung-uk Kim	&mov	(&DWP(0,"ebp"),$out);		# save parameters
255*e71b7053SJung-uk Kim	&mov	(&DWP(4,"ebp"),$inp);
256*e71b7053SJung-uk Kim	&mov	(&DWP(8,"ebp"),$len);
257*e71b7053SJung-uk Kim	&mov	($len,$chunk);
258*e71b7053SJung-uk Kim	&mov	(&DWP(12,"ebp"),$chunk);	# chunk
259*e71b7053SJung-uk Kim						if ($mode eq "ctr32") {
260*e71b7053SJung-uk Kim	&mov	("ecx",&DWP(-4,$ctx));
261*e71b7053SJung-uk Kim	&xor	($out,$out);
262*e71b7053SJung-uk Kim	&mov	("eax",&DWP(-8,$ctx));		# borrow $len
263*e71b7053SJung-uk Kim&set_label("${mode}_prepare");
264*e71b7053SJung-uk Kim	&mov	(&DWP(12,"esp",$out),"ecx");
265*e71b7053SJung-uk Kim	&bswap	("ecx");
266*e71b7053SJung-uk Kim	&movq	(&QWP(0,"esp",$out),"mm0");
267*e71b7053SJung-uk Kim	&inc	("ecx");
268*e71b7053SJung-uk Kim	&mov	(&DWP(8,"esp",$out),"eax");
269*e71b7053SJung-uk Kim	&bswap	("ecx");
270*e71b7053SJung-uk Kim	&lea	($out,&DWP(16,$out));
271*e71b7053SJung-uk Kim	&cmp	($out,$chunk);
272*e71b7053SJung-uk Kim	&jb	(&label("${mode}_prepare"));
273*e71b7053SJung-uk Kim
274*e71b7053SJung-uk Kim	&mov	(&DWP(-4,$ctx),"ecx");		# store advanced counter word back
275*e71b7053SJung-uk Kim	&lea	($inp,&DWP(0,"esp"));
276*e71b7053SJung-uk Kim	&lea	($out,&DWP(0,"esp"));
277*e71b7053SJung-uk Kim	&mov	($len,$chunk);
278*e71b7053SJung-uk Kim						} else {
279*e71b7053SJung-uk Kim	&test	($out,0x0f);			# out_misaligned
280*e71b7053SJung-uk Kim	&cmovnz	($out,"esp");
281*e71b7053SJung-uk Kim	&test	($inp,0x0f);			# inp_misaligned
282*e71b7053SJung-uk Kim	&jz	(&label("${mode}_inp_aligned"));
283*e71b7053SJung-uk Kim	&shr	($len,2);
284*e71b7053SJung-uk Kim	&data_byte(0xf3,0xa5);			# rep movsl
285*e71b7053SJung-uk Kim	&sub	($out,$chunk);
286*e71b7053SJung-uk Kim	&mov	($len,$chunk);
287*e71b7053SJung-uk Kim	&mov	($inp,$out);
288*e71b7053SJung-uk Kim&set_label("${mode}_inp_aligned");
289*e71b7053SJung-uk Kim						}
290*e71b7053SJung-uk Kim	&lea	("eax",&DWP(-16,$ctx));		# ivp
291*e71b7053SJung-uk Kim	&lea	("ebx",&DWP(16,$ctx));		# key
292*e71b7053SJung-uk Kim	&shr	($len,4);			# len/=AES_BLOCK_SIZE
293*e71b7053SJung-uk Kim	&data_byte(0xf3,0x0f,0xa7,$opcode);	# rep xcrypt*
294*e71b7053SJung-uk Kim						if ($mode !~ /ecb|ctr/) {
295*e71b7053SJung-uk Kim	&movaps	("xmm0",&QWP(0,"eax"));
296*e71b7053SJung-uk Kim	&movaps	(&QWP(-16,$ctx),"xmm0");	# copy [or refresh] iv
297*e71b7053SJung-uk Kim						}
298*e71b7053SJung-uk Kim	&mov	($out,&DWP(0,"ebp"));		# restore parameters
299*e71b7053SJung-uk Kim	&mov	($chunk,&DWP(12,"ebp"));
300*e71b7053SJung-uk Kim						if ($mode eq "ctr32") {
301*e71b7053SJung-uk Kim	&mov	($inp,&DWP(4,"ebp"));
302*e71b7053SJung-uk Kim	&xor	($len,$len);
303*e71b7053SJung-uk Kim&set_label("${mode}_xor");
304*e71b7053SJung-uk Kim	&movups	("xmm1",&QWP(0,$inp,$len));
305*e71b7053SJung-uk Kim	&lea	($len,&DWP(16,$len));
306*e71b7053SJung-uk Kim	&pxor	("xmm1",&QWP(-16,"esp",$len));
307*e71b7053SJung-uk Kim	&movups	(&QWP(-16,$out,$len),"xmm1");
308*e71b7053SJung-uk Kim	&cmp	($len,$chunk);
309*e71b7053SJung-uk Kim	&jb	(&label("${mode}_xor"));
310*e71b7053SJung-uk Kim						} else {
311*e71b7053SJung-uk Kim	&test	($out,0x0f);
312*e71b7053SJung-uk Kim	&jz	(&label("${mode}_out_aligned"));
313*e71b7053SJung-uk Kim	&mov	($len,$chunk);
314*e71b7053SJung-uk Kim	&lea	($inp,&DWP(0,"esp"));
315*e71b7053SJung-uk Kim	&shr	($len,2);
316*e71b7053SJung-uk Kim	&data_byte(0xf3,0xa5);			# rep movsl
317*e71b7053SJung-uk Kim	&sub	($out,$chunk);
318*e71b7053SJung-uk Kim&set_label("${mode}_out_aligned");
319*e71b7053SJung-uk Kim	&mov	($inp,&DWP(4,"ebp"));
320*e71b7053SJung-uk Kim						}
321*e71b7053SJung-uk Kim	&mov	($len,&DWP(8,"ebp"));
322*e71b7053SJung-uk Kim	&add	($out,$chunk);
323*e71b7053SJung-uk Kim	&add	($inp,$chunk);
324*e71b7053SJung-uk Kim	&sub	($len,$chunk);			# sets ZF when everything has been processed
325*e71b7053SJung-uk Kim	&mov	($chunk,$PADLOCK_CHUNK);
326*e71b7053SJung-uk Kim    if (!$PADLOCK_PREFETCH{$mode}) {
327*e71b7053SJung-uk Kim	&jnz	(&label("${mode}_loop"));
328*e71b7053SJung-uk Kim    } else {
329*e71b7053SJung-uk Kim	&jz	(&label("${mode}_break"));
330*e71b7053SJung-uk Kim	&cmp	($len,$chunk);
331*e71b7053SJung-uk Kim	&jae	(&label("${mode}_loop"));
332*e71b7053SJung-uk Kim
333*e71b7053SJung-uk Kim&set_label("${mode}_unaligned_tail");
334*e71b7053SJung-uk Kim	&xor	("eax","eax");
335*e71b7053SJung-uk Kim	&cmp	("esp","ebp");
336*e71b7053SJung-uk Kim	&cmove	("eax",$len);
337*e71b7053SJung-uk Kim	&sub	("esp","eax");			# alloca
338*e71b7053SJung-uk Kim	&mov	("eax", $out);			# save parameters
339*e71b7053SJung-uk Kim	&mov	($chunk,$len);
340*e71b7053SJung-uk Kim	&shr	($len,2);
341*e71b7053SJung-uk Kim	&lea	($out,&DWP(0,"esp"));
342*e71b7053SJung-uk Kim	&data_byte(0xf3,0xa5);			# rep movsl
343*e71b7053SJung-uk Kim	&mov	($inp,"esp");
344*e71b7053SJung-uk Kim	&mov	($out,"eax");			# restore parameters
345*e71b7053SJung-uk Kim	&mov	($len,$chunk);
346*e71b7053SJung-uk Kim	&jmp	(&label("${mode}_loop"));
347*e71b7053SJung-uk Kim
348*e71b7053SJung-uk Kim&set_label("${mode}_break",16);
349*e71b7053SJung-uk Kim    }
350*e71b7053SJung-uk Kim						if ($mode ne "ctr32") {
351*e71b7053SJung-uk Kim	&cmp	("esp","ebp");
352*e71b7053SJung-uk Kim	&je	(&label("${mode}_done"));	# no stack buffer was allocated, nothing to wipe
353*e71b7053SJung-uk Kim						}
354*e71b7053SJung-uk Kim	&pxor	("xmm0","xmm0");
355*e71b7053SJung-uk Kim	&lea	("eax",&DWP(0,"esp"));
356*e71b7053SJung-uk Kim&set_label("${mode}_bzero");			# zero the stack buffer between esp and ebp
357*e71b7053SJung-uk Kim	&movaps	(&QWP(0,"eax"),"xmm0");
358*e71b7053SJung-uk Kim	&lea	("eax",&DWP(16,"eax"));
359*e71b7053SJung-uk Kim	&cmp	("ebp","eax");
360*e71b7053SJung-uk Kim	&ja	(&label("${mode}_bzero"));
361*e71b7053SJung-uk Kim
362*e71b7053SJung-uk Kim&set_label("${mode}_done");
363*e71b7053SJung-uk Kim	&mov	("ebp",&DWP(16,"ebp"));		# unaligned frame pointer saved at setup
364*e71b7053SJung-uk Kim	&lea	("esp",&DWP(24,"ebp"));		# undo the -24 applied at setup
365*e71b7053SJung-uk Kim						if ($mode ne "ctr32") {
366*e71b7053SJung-uk Kim	&jmp	(&label("${mode}_exit"));
367*e71b7053SJung-uk Kim
368*e71b7053SJung-uk Kim&set_label("${mode}_aligned",16);
369*e71b7053SJung-uk Kim    if ($PADLOCK_PREFETCH{$mode}) {
370*e71b7053SJung-uk Kim	&lea	("ebp",&DWP(0,$inp,$len));
371*e71b7053SJung-uk Kim	&neg	("ebp");
372*e71b7053SJung-uk Kim	&and	("ebp",0xfff);			# distance to page boundary
373*e71b7053SJung-uk Kim	&xor	("eax","eax");
374*e71b7053SJung-uk Kim	&cmp	("ebp",$PADLOCK_PREFETCH{$mode});
375*e71b7053SJung-uk Kim	&mov	("ebp",$PADLOCK_PREFETCH{$mode}-1);
376*e71b7053SJung-uk Kim	&cmovae	("ebp","eax");
377*e71b7053SJung-uk Kim	&and	("ebp",$len);			# remainder
378*e71b7053SJung-uk Kim	&sub	($len,"ebp");
379*e71b7053SJung-uk Kim	&jz	(&label("${mode}_aligned_tail"));
380*e71b7053SJung-uk Kim    }
381*e71b7053SJung-uk Kim	&lea	("eax",&DWP(-16,$ctx));		# ivp
382*e71b7053SJung-uk Kim	&lea	("ebx",&DWP(16,$ctx));		# key
383*e71b7053SJung-uk Kim	&shr	($len,4);			# len/=AES_BLOCK_SIZE
384*e71b7053SJung-uk Kim	&data_byte(0xf3,0x0f,0xa7,$opcode);	# rep xcrypt*
385*e71b7053SJung-uk Kim						if ($mode ne "ecb") {
386*e71b7053SJung-uk Kim	&movaps	("xmm0",&QWP(0,"eax"));
387*e71b7053SJung-uk Kim	&movaps	(&QWP(-16,$ctx),"xmm0");	# copy [or refresh] iv
388*e71b7053SJung-uk Kim						}
389*e71b7053SJung-uk Kim    if ($PADLOCK_PREFETCH{$mode}) {
390*e71b7053SJung-uk Kim	&test	("ebp","ebp");
391*e71b7053SJung-uk Kim	&jz	(&label("${mode}_exit"));
392*e71b7053SJung-uk Kim
393*e71b7053SJung-uk Kim&set_label("${mode}_aligned_tail");		# copy the remainder to the stack and run it through ${mode}_loop
394*e71b7053SJung-uk Kim	&mov	($len,"ebp");
395*e71b7053SJung-uk Kim	&lea	("ebp",&DWP(-24,"esp"));
396*e71b7053SJung-uk Kim	&mov	("esp","ebp");
397*e71b7053SJung-uk Kim	&mov	("eax","ebp");
398*e71b7053SJung-uk Kim	&sub	("esp",$len);
399*e71b7053SJung-uk Kim	&and	("ebp",-16);
400*e71b7053SJung-uk Kim	&and	("esp",-16);
401*e71b7053SJung-uk Kim	&mov	(&DWP(16,"ebp"),"eax");
402*e71b7053SJung-uk Kim	&mov	("eax", $out);			# save parameters
403*e71b7053SJung-uk Kim	&mov	($chunk,$len);
404*e71b7053SJung-uk Kim	&shr	($len,2);
405*e71b7053SJung-uk Kim	&lea	($out,&DWP(0,"esp"));
406*e71b7053SJung-uk Kim	&data_byte(0xf3,0xa5);			# rep movsl
407*e71b7053SJung-uk Kim	&mov	($inp,"esp");
408*e71b7053SJung-uk Kim	&mov	($out,"eax");			# restore parameters
409*e71b7053SJung-uk Kim	&mov	($len,$chunk);
410*e71b7053SJung-uk Kim	&jmp	(&label("${mode}_loop"));
411*e71b7053SJung-uk Kim    }
412*e71b7053SJung-uk Kim&set_label("${mode}_exit");			}
413*e71b7053SJung-uk Kim	&mov	("eax",1);			# success
414*e71b7053SJung-uk Kim	&lea	("esp",&DWP(4,"esp"));		# popf
415*e71b7053SJung-uk Kim	&emms	()				if ($mode eq "ctr32");
416*e71b7053SJung-uk Kim&set_label("${mode}_abort");
417*e71b7053SJung-uk Kim&function_end("padlock_${mode}_encrypt");
418*e71b7053SJung-uk Kim}
419*e71b7053SJung-uk Kim
420*e71b7053SJung-uk Kim&generate_mode("ecb",0xc8);	# 0xc8 is the final byte of rep xcryptecb, as in padlock_aes_block
421*e71b7053SJung-uk Kim&generate_mode("cbc",0xd0);
422*e71b7053SJung-uk Kim&generate_mode("cfb",0xe0);
423*e71b7053SJung-uk Kim&generate_mode("ofb",0xe8);
424*e71b7053SJung-uk Kim&generate_mode("ctr32",0xc8);	# yes, it implements own CTR with ECB opcode,
425*e71b7053SJung-uk Kim				# because hardware CTR was introduced later
426*e71b7053SJung-uk Kim				# and even has errata on certain C7 stepping.
427*e71b7053SJung-uk Kim				# own implementation *always* works, though
428*e71b7053SJung-uk Kim				# ~15% slower than dedicated hardware...
429*e71b7053SJung-uk Kim
430*e71b7053SJung-uk Kim&function_begin_B("padlock_xstore");	# issue xstore with edi = argument 0 and edx = argument 1
431*e71b7053SJung-uk Kim	&push	("edi");
432*e71b7053SJung-uk Kim	&mov	("edi",&wparam(0));		# presumably the destination buffer - confirm
433*e71b7053SJung-uk Kim	&mov	("edx",&wparam(1));
434*e71b7053SJung-uk Kim	&data_byte(0x0f,0xa7,0xc0);		# xstore
435*e71b7053SJung-uk Kim	&pop	("edi");
436*e71b7053SJung-uk Kim	&ret	();				# eax is not set here; it holds whatever xstore left in it
437*e71b7053SJung-uk Kim&function_end_B("padlock_xstore");
438*e71b7053SJung-uk Kim
439*e71b7053SJung-uk Kim&function_begin_B("_win32_segv_handler");	# exception handler installed by the *_oneshot routines on win32/coff
440*e71b7053SJung-uk Kim	&mov	("eax",1);			# ExceptionContinueSearch
441*e71b7053SJung-uk Kim	&mov	("edx",&wparam(0));		# *ExceptionRecord
442*e71b7053SJung-uk Kim	&mov	("ecx",&wparam(2));		# *ContextRecord
443*e71b7053SJung-uk Kim	&cmp	(&DWP(0,"edx"),0xC0000005);	# ExceptionRecord->ExceptionCode == STATUS_ACCESS_VIOLATION
444*e71b7053SJung-uk Kim	&jne	(&label("ret"));		# other exceptions are left to outer handlers
445*e71b7053SJung-uk Kim	&add	(&DWP(184,"ecx"),4);		# skip over rep sha*
446*e71b7053SJung-uk Kim	&mov	("eax",0);			# ExceptionContinueExecution
447*e71b7053SJung-uk Kim&set_label("ret");
448*e71b7053SJung-uk Kim	&ret	();
449*e71b7053SJung-uk Kim&function_end_B("_win32_segv_handler");
450*e71b7053SJung-uk Kim&safeseh("_win32_segv_handler")			if ($::win32);
451*e71b7053SJung-uk Kim
452*e71b7053SJung-uk Kim&function_begin_B("padlock_sha1_oneshot");	# arguments: (20-byte context, input pointer, count); context is updated in place
453*e71b7053SJung-uk Kim	&push	("edi");
454*e71b7053SJung-uk Kim	&push	("esi");
455*e71b7053SJung-uk Kim	&xor	("eax","eax");			# eax = 0 so that %fs:(%eax) below addresses %fs:0
456*e71b7053SJung-uk Kim	&mov	("edi",&wparam(0));
457*e71b7053SJung-uk Kim	&mov	("esi",&wparam(1));
458*e71b7053SJung-uk Kim	&mov	("ecx",&wparam(2));
459*e71b7053SJung-uk Kim    if ($::win32 or $::coff) {
460*e71b7053SJung-uk Kim    	&push	(&::islabel("_win32_segv_handler"));
461*e71b7053SJung-uk Kim	&data_byte(0x64,0xff,0x30);		# push	%fs:(%eax)
462*e71b7053SJung-uk Kim	&data_byte(0x64,0x89,0x20);		# mov	%esp,%fs:(%eax)
463*e71b7053SJung-uk Kim    }
464*e71b7053SJung-uk Kim	&mov	("edx","esp");			# put aside %esp
465*e71b7053SJung-uk Kim	&add	("esp",-128);			# 32 is enough but spec says 128
466*e71b7053SJung-uk Kim	&movups	("xmm0",&QWP(0,"edi"));		# copy-in context
467*e71b7053SJung-uk Kim	&and	("esp",-16);			# 16-byte aligned scratch copy of the context
468*e71b7053SJung-uk Kim	&mov	("eax",&DWP(16,"edi"));
469*e71b7053SJung-uk Kim	&movaps	(&QWP(0,"esp"),"xmm0");
470*e71b7053SJung-uk Kim	&mov	("edi","esp");
471*e71b7053SJung-uk Kim	&mov	(&DWP(16,"esp"),"eax");
472*e71b7053SJung-uk Kim	&xor	("eax","eax");			# eax = 0 here vs -1 in padlock_sha1_blocks; presumably selects the instruction's mode - confirm
473*e71b7053SJung-uk Kim	&data_byte(0xf3,0x0f,0xa6,0xc8);	# rep xsha1
474*e71b7053SJung-uk Kim	&movaps	("xmm0",&QWP(0,"esp"));
475*e71b7053SJung-uk Kim	&mov	("eax",&DWP(16,"esp"));
476*e71b7053SJung-uk Kim	&mov	("esp","edx");			# restore %esp
477*e71b7053SJung-uk Kim    if ($::win32 or $::coff) {
478*e71b7053SJung-uk Kim	&data_byte(0x64,0x8f,0x05,0,0,0,0);	# pop	%fs:0
479*e71b7053SJung-uk Kim	&lea	("esp",&DWP(4,"esp"));		# drop the pushed handler address
480*e71b7053SJung-uk Kim    }
481*e71b7053SJung-uk Kim	&mov	("edi",&wparam(0));
482*e71b7053SJung-uk Kim	&movups	(&QWP(0,"edi"),"xmm0");		# copy-out context
483*e71b7053SJung-uk Kim	&mov	(&DWP(16,"edi"),"eax");
484*e71b7053SJung-uk Kim	&pop	("esi");
485*e71b7053SJung-uk Kim	&pop	("edi");
486*e71b7053SJung-uk Kim	&ret	();
487*e71b7053SJung-uk Kim&function_end_B("padlock_sha1_oneshot");
488*e71b7053SJung-uk Kim
489*e71b7053SJung-uk Kim&function_begin_B("padlock_sha1_blocks");	# arguments: (20-byte context, input pointer, count); context is updated in place
490*e71b7053SJung-uk Kim	&push	("edi");
491*e71b7053SJung-uk Kim	&push	("esi");
492*e71b7053SJung-uk Kim	&mov	("edi",&wparam(0));
493*e71b7053SJung-uk Kim	&mov	("esi",&wparam(1));
494*e71b7053SJung-uk Kim	&mov	("edx","esp");			# put aside %esp
495*e71b7053SJung-uk Kim	&mov	("ecx",&wparam(2));
496*e71b7053SJung-uk Kim	&add	("esp",-128);
497*e71b7053SJung-uk Kim	&movups	("xmm0",&QWP(0,"edi"));		# copy-in context
498*e71b7053SJung-uk Kim	&and	("esp",-16);			# 16-byte aligned scratch copy of the context
499*e71b7053SJung-uk Kim	&mov	("eax",&DWP(16,"edi"));
500*e71b7053SJung-uk Kim	&movaps	(&QWP(0,"esp"),"xmm0");
501*e71b7053SJung-uk Kim	&mov	("edi","esp");
502*e71b7053SJung-uk Kim	&mov	(&DWP(16,"esp"),"eax");
503*e71b7053SJung-uk Kim	&mov	("eax",-1);			# eax = -1 here vs 0 in padlock_sha1_oneshot; presumably selects the instruction's mode - confirm
504*e71b7053SJung-uk Kim	&data_byte(0xf3,0x0f,0xa6,0xc8);	# rep xsha1
505*e71b7053SJung-uk Kim	&movaps	("xmm0",&QWP(0,"esp"));
506*e71b7053SJung-uk Kim	&mov	("eax",&DWP(16,"esp"));
507*e71b7053SJung-uk Kim	&mov	("esp","edx");			# restore %esp
508*e71b7053SJung-uk Kim	&mov	("edi",&wparam(0));
509*e71b7053SJung-uk Kim	&movups	(&QWP(0,"edi"),"xmm0");		# copy-out context
510*e71b7053SJung-uk Kim	&mov	(&DWP(16,"edi"),"eax");
511*e71b7053SJung-uk Kim 	&pop	("esi");
512*e71b7053SJung-uk Kim	&pop	("edi");
513*e71b7053SJung-uk Kim	&ret	();
514*e71b7053SJung-uk Kim&function_end_B("padlock_sha1_blocks");
515*e71b7053SJung-uk Kim
516*e71b7053SJung-uk Kim&function_begin_B("padlock_sha256_oneshot");	# arguments: (32-byte context, input pointer, count); context is updated in place
517*e71b7053SJung-uk Kim	&push	("edi");
518*e71b7053SJung-uk Kim	&push	("esi");
519*e71b7053SJung-uk Kim	&xor	("eax","eax");			# eax = 0 so that %fs:(%eax) below addresses %fs:0
520*e71b7053SJung-uk Kim	&mov	("edi",&wparam(0));
521*e71b7053SJung-uk Kim	&mov	("esi",&wparam(1));
522*e71b7053SJung-uk Kim	&mov	("ecx",&wparam(2));
523*e71b7053SJung-uk Kim    if ($::win32 or $::coff) {
524*e71b7053SJung-uk Kim    	&push	(&::islabel("_win32_segv_handler"));
525*e71b7053SJung-uk Kim	&data_byte(0x64,0xff,0x30);		# push	%fs:(%eax)
526*e71b7053SJung-uk Kim	&data_byte(0x64,0x89,0x20);		# mov	%esp,%fs:(%eax)
527*e71b7053SJung-uk Kim    }
528*e71b7053SJung-uk Kim	&mov	("edx","esp");			# put aside %esp
529*e71b7053SJung-uk Kim	&add	("esp",-128);
530*e71b7053SJung-uk Kim	&movups	("xmm0",&QWP(0,"edi"));		# copy-in context
531*e71b7053SJung-uk Kim	&and	("esp",-16);			# 16-byte aligned scratch copy of the context
532*e71b7053SJung-uk Kim	&movups	("xmm1",&QWP(16,"edi"));
533*e71b7053SJung-uk Kim	&movaps	(&QWP(0,"esp"),"xmm0");
534*e71b7053SJung-uk Kim	&mov	("edi","esp");
535*e71b7053SJung-uk Kim	&movaps	(&QWP(16,"esp"),"xmm1");
536*e71b7053SJung-uk Kim	&xor	("eax","eax");			# eax = 0 here vs -1 in padlock_sha256_blocks; presumably selects the instruction's mode - confirm
537*e71b7053SJung-uk Kim	&data_byte(0xf3,0x0f,0xa6,0xd0);	# rep xsha256
538*e71b7053SJung-uk Kim	&movaps	("xmm0",&QWP(0,"esp"));
539*e71b7053SJung-uk Kim	&movaps	("xmm1",&QWP(16,"esp"));
540*e71b7053SJung-uk Kim	&mov	("esp","edx");			# restore %esp
541*e71b7053SJung-uk Kim    if ($::win32 or $::coff) {
542*e71b7053SJung-uk Kim	&data_byte(0x64,0x8f,0x05,0,0,0,0);	# pop	%fs:0
543*e71b7053SJung-uk Kim	&lea	("esp",&DWP(4,"esp"));		# drop the pushed handler address
544*e71b7053SJung-uk Kim    }
545*e71b7053SJung-uk Kim	&mov	("edi",&wparam(0));
546*e71b7053SJung-uk Kim	&movups	(&QWP(0,"edi"),"xmm0");		# copy-out context
547*e71b7053SJung-uk Kim	&movups	(&QWP(16,"edi"),"xmm1");
548*e71b7053SJung-uk Kim	&pop	("esi");
549*e71b7053SJung-uk Kim	&pop	("edi");
550*e71b7053SJung-uk Kim	&ret	();
551*e71b7053SJung-uk Kim&function_end_B("padlock_sha256_oneshot");
552*e71b7053SJung-uk Kim
553*e71b7053SJung-uk Kim&function_begin_B("padlock_sha256_blocks");	# arguments: (32-byte context, input pointer, count); context is updated in place
554*e71b7053SJung-uk Kim	&push	("edi");
555*e71b7053SJung-uk Kim	&push	("esi");
556*e71b7053SJung-uk Kim	&mov	("edi",&wparam(0));
557*e71b7053SJung-uk Kim	&mov	("esi",&wparam(1));
558*e71b7053SJung-uk Kim	&mov	("ecx",&wparam(2));
559*e71b7053SJung-uk Kim	&mov	("edx","esp");			# put aside %esp
560*e71b7053SJung-uk Kim	&add	("esp",-128);
561*e71b7053SJung-uk Kim	&movups	("xmm0",&QWP(0,"edi"));		# copy-in context
562*e71b7053SJung-uk Kim	&and	("esp",-16);			# 16-byte aligned scratch copy of the context
563*e71b7053SJung-uk Kim	&movups	("xmm1",&QWP(16,"edi"));
564*e71b7053SJung-uk Kim	&movaps	(&QWP(0,"esp"),"xmm0");
565*e71b7053SJung-uk Kim	&mov	("edi","esp");
566*e71b7053SJung-uk Kim	&movaps	(&QWP(16,"esp"),"xmm1");
567*e71b7053SJung-uk Kim	&mov	("eax",-1);			# eax = -1 here vs 0 in padlock_sha256_oneshot; presumably selects the instruction's mode - confirm
568*e71b7053SJung-uk Kim	&data_byte(0xf3,0x0f,0xa6,0xd0);	# rep xsha256
569*e71b7053SJung-uk Kim	&movaps	("xmm0",&QWP(0,"esp"));
570*e71b7053SJung-uk Kim	&movaps	("xmm1",&QWP(16,"esp"));
571*e71b7053SJung-uk Kim	&mov	("esp","edx");			# restore %esp
572*e71b7053SJung-uk Kim	&mov	("edi",&wparam(0));
573*e71b7053SJung-uk Kim	&movups	(&QWP(0,"edi"),"xmm0");		# copy-out context
574*e71b7053SJung-uk Kim	&movups	(&QWP(16,"edi"),"xmm1");
575*e71b7053SJung-uk Kim	&pop	("esi");
576*e71b7053SJung-uk Kim	&pop	("edi");
577*e71b7053SJung-uk Kim	&ret	();
578*e71b7053SJung-uk Kim&function_end_B("padlock_sha256_blocks");
579*e71b7053SJung-uk Kim
580*e71b7053SJung-uk Kim&function_begin_B("padlock_sha512_blocks");	# arguments: (64-byte context, input pointer, count); context is updated in place
581*e71b7053SJung-uk Kim	&push	("edi");
582*e71b7053SJung-uk Kim	&push	("esi");
583*e71b7053SJung-uk Kim	&mov	("edi",&wparam(0));
584*e71b7053SJung-uk Kim	&mov	("esi",&wparam(1));
585*e71b7053SJung-uk Kim	&mov	("ecx",&wparam(2));
586*e71b7053SJung-uk Kim	&mov	("edx","esp");			# put aside %esp
587*e71b7053SJung-uk Kim	&add	("esp",-128);
588*e71b7053SJung-uk Kim	&movups	("xmm0",&QWP(0,"edi"));		# copy-in context
589*e71b7053SJung-uk Kim	&and	("esp",-16);			# 16-byte aligned scratch copy of the context
590*e71b7053SJung-uk Kim	&movups	("xmm1",&QWP(16,"edi"));
591*e71b7053SJung-uk Kim	&movups	("xmm2",&QWP(32,"edi"));
592*e71b7053SJung-uk Kim	&movups	("xmm3",&QWP(48,"edi"));
593*e71b7053SJung-uk Kim	&movaps	(&QWP(0,"esp"),"xmm0");
594*e71b7053SJung-uk Kim	&mov	("edi","esp");
595*e71b7053SJung-uk Kim	&movaps	(&QWP(16,"esp"),"xmm1");
596*e71b7053SJung-uk Kim	&movaps	(&QWP(32,"esp"),"xmm2");
597*e71b7053SJung-uk Kim	&movaps	(&QWP(48,"esp"),"xmm3");
598*e71b7053SJung-uk Kim	&data_byte(0xf3,0x0f,0xa6,0xe0);	# rep xsha512
599*e71b7053SJung-uk Kim	&movaps	("xmm0",&QWP(0,"esp"));		# NOTE(review): unlike the sha1/sha256 block routines, eax is not loaded before the instruction above
600*e71b7053SJung-uk Kim	&movaps	("xmm1",&QWP(16,"esp"));
601*e71b7053SJung-uk Kim	&movaps	("xmm2",&QWP(32,"esp"));
602*e71b7053SJung-uk Kim	&movaps	("xmm3",&QWP(48,"esp"));
603*e71b7053SJung-uk Kim	&mov	("esp","edx");			# restore %esp
604*e71b7053SJung-uk Kim	&mov	("edi",&wparam(0));
605*e71b7053SJung-uk Kim	&movups	(&QWP(0,"edi"),"xmm0");		# copy-out context
606*e71b7053SJung-uk Kim	&movups	(&QWP(16,"edi"),"xmm1");
607*e71b7053SJung-uk Kim	&movups	(&QWP(32,"edi"),"xmm2");
608*e71b7053SJung-uk Kim	&movups	(&QWP(48,"edi"),"xmm3");
609*e71b7053SJung-uk Kim	&pop	("esi");
610*e71b7053SJung-uk Kim	&pop	("edi");
611*e71b7053SJung-uk Kim	&ret	();
612*e71b7053SJung-uk Kim&function_end_B("padlock_sha512_blocks");
613*e71b7053SJung-uk Kim
614*e71b7053SJung-uk Kim&asciz	("VIA Padlock x86 module, CRYPTOGAMS by <appro\@openssl.org>");
615*e71b7053SJung-uk Kim&align	(16);
616*e71b7053SJung-uk Kim
617*e71b7053SJung-uk Kim&dataseg();
618*e71b7053SJung-uk Kim# Essentially this variable belongs in thread local storage.
619*e71b7053SJung-uk Kim# Having this variable global on the other hand can only cause
620*e71b7053SJung-uk Kim# few bogus key reloads [if any at all on single-CPU system],
621*e71b7053SJung-uk Kim# so we accept the penalty...
622*e71b7053SJung-uk Kim&set_label("padlock_saved_context",4);
623*e71b7053SJung-uk Kim&data_word(0);
624*e71b7053SJung-uk Kim
625*e71b7053SJung-uk Kim&asm_finish();
626*e71b7053SJung-uk Kim
627*e71b7053SJung-uk Kimclose STDOUT or die "error closing STDOUT: $!";	# a failed flush must not leave a truncated output file unnoticed
628