#!/usr/bin/env perl
# xref: /freebsd/crypto/openssl/crypto/sha/asm/keccak1600-mmx.pl (revision b077aed33b7b6aefca7b17ddb250cf521f938613)
# Copyright 2017-2020 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License").  You may not use
# this file except in compliance with the License.  You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html
#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# Keccak-1600 for x86 MMX.
#
# June 2017.
#
# The code below is a KECCAK_2X implementation (see sha/keccak1600.c)
# with C[5] held in the register bank and D[5] offloaded to memory.
# Instead of actually unrolling the loop pair-wise, though, I simply flip
# the pointers to T[][] and A[][] at the end of each round. Since the
# number of rounds is even, the last round writes to A[][] and everything
# works out. It's argued that MMX is the only code path meaningful to
# implement for x86. This is because non-MMX-capable processors are an
# extinct breed, and they might as well keep executing compiler-generated
# code. For reference, gcc-5.x-generated KECCAK_2X code takes 89 cycles
# per processed byte on Pentium, which is a fair result. But older
# compilers produce worse code. On the other hand, one can wonder: why
# not 128-bit SSE2? Well, SSE2 won't provide a twofold improvement, far
# from it, if any at all on some processors, because it would take extra
# permutations and inter-bank data transfers. Besides, contemporary CPUs
# are better off executing 64-bit code, and it makes less sense to invest
# in fancy 32-bit code. And the decision doesn't seem to be inadequate,
# if one compares the results below to the "64-bit platforms in 32-bit
# mode" SIMD data points available at
# http://keccak.noekeon.org/sw_performance.html.
#
########################################################################
# Numbers are cycles per processed byte for a large message.
#
#			r=1088(i)
#
# PIII			30/+150%
# Pentium M		27/+150%
# P4			40/+85%
# Core 2		19/+170%
# Sandy Bridge(ii)	18/+140%
# Atom			33/+180%
# Silvermont(ii)	30/+180%
# VIA Nano(ii)		43/+60%
# Sledgehammer(ii)(iii)	24/+130%
#
# (i)	Corresponds to SHA3-256. Numbers after the slash are improvement
#	coefficients over KECCAK_2X [with bit interleave and lane
#	complementing] position-independent *scalar* code generated
#	by gcc-5.x. It's not exactly a fair comparison, but it's a
#	datapoint...
# (ii)	64-bit processor executing 32-bit code.
# (iii)	Result is considered to be representative even for older AMD
#	processors.

# Make the perlasm helper visible: look both in the directory this script
# lives in and in ../../perlasm relative to it, then load it.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
push(@INC,"${dir}","${dir}../../perlasm");
require "x86asm.pl";

# The last command-line argument, when present, names the output file and
# is removed from @ARGV before the remaining arguments are examined.
# Use the three-argument form of open and fail loudly: an unchecked
# failure here would silently leave the generated assembly on the
# original STDOUT.
$output=pop;
if ($output) {
    open(STDOUT,">",$output) or die "can't open $output: $!";
}

# First argument is passed through unchanged; the second argument is true
# when the last remaining argument is the literal string "386".
# NOTE(review): presumably (output flavour, 386-only flag) -- confirm
# against asm_init in x86asm.pl.
&asm_init($ARGV[0],$ARGV[$#ARGV] eq "386");

# Register and memory layout used by every code generator below.
#
# @C        mm0..mm4, the five working lanes (column sums during Theta,
#           one row of the state during Rho/Chi).
# @T        mm5..mm7, scratch registers.
# @A        $A[$y][$x] is the byte displacement of lane ($y,$x):
#           8*(5*$y+$x)-100.  The -100 bias matches the "lea 100(...)"
#           adjustment applied to the state pointers by the callers and
#           keeps all 25 displacements in the range -100..92.
# @D        Displacements (relative to "esp") of the five D[] values
#           that Theta parks in memory: 4, 12, 20, 28, 36.
#           NOTE(review): the extra 4 presumably skips the return
#           address pushed by the call into _KeccakF1600 -- confirm.
# @rhotates Rho rotation amounts, indexed as $rhotates[$y][$x].
my @C = map("mm$_",(0..4));
my @T = map("mm$_",(5..7));
my @A = map([ 8*$_-100, 8*($_+1)-100, 8*($_+2)-100,
              8*($_+3)-100, 8*($_+4)-100 ], (0,5,10,15,20));
my @D = map(8*$_+4, (0..4));
my @rhotates = ([  0,  1, 62, 28, 27 ],
                [ 36, 44,  6, 55, 20 ],
                [  3, 10, 43, 25, 39 ],
                [ 41, 45, 15, 21,  8 ],
                [ 18,  2, 61, 56, 14 ]);

&static_label("iotas");

# _KeccakF1600 -- internal 24-round permutation core.
#
# Register contract, as used by the code below:
#   esi  source state pointer, biased by +100 (see @A)
#   edi  destination state pointer, biased by +100
#   ebx  pointer to the current 8-byte entry of the iotas table
#   esp  base for the five D[] scratch slots (see @D)
#   ecx  clobbered (round counter)
# Each round reads lanes through esi and writes them through edi; the two
# pointers are exchanged at the end of every round.
&function_begin_B("_KeccakF1600");
	# Seed the column accumulators with row 4; the loop XORs in rows
	# 0..3.  On later iterations the last Chi step leaves row 4 of the
	# freshly computed state in @C, so this load is needed only once.
	&movq	(@C[0],&QWP($A[4][0],"esi"));
	&movq	(@C[1],&QWP($A[4][1],"esi"));
	&movq	(@C[2],&QWP($A[4][2],"esi"));
	&movq	(@C[3],&QWP($A[4][3],"esi"));
	&movq	(@C[4],&QWP($A[4][4],"esi"));

	&mov	("ecx",24);			# loop counter
	&jmp	(&label("loop"));

    &set_label("loop",16);
	######################################### Theta
	&pxor	(@C[0],&QWP($A[0][0],"esi"));
	&pxor	(@C[1],&QWP($A[0][1],"esi"));
	&pxor	(@C[2],&QWP($A[0][2],"esi"));
	&pxor	(@C[3],&QWP($A[0][3],"esi"));
	&pxor	(@C[4],&QWP($A[0][4],"esi"));

	&pxor	(@C[0],&QWP($A[1][0],"esi"));
	&pxor	(@C[1],&QWP($A[1][1],"esi"));
	&pxor	(@C[2],&QWP($A[1][2],"esi"));
	&pxor	(@C[3],&QWP($A[1][3],"esi"));
	&pxor	(@C[4],&QWP($A[1][4],"esi"));

	&pxor	(@C[0],&QWP($A[2][0],"esi"));
	&pxor	(@C[1],&QWP($A[2][1],"esi"));
	&pxor	(@C[2],&QWP($A[2][2],"esi"));
	&pxor	(@C[3],&QWP($A[2][3],"esi"));
	&pxor	(@C[4],&QWP($A[2][4],"esi"));

	&pxor	(@C[2],&QWP($A[3][2],"esi"));
	&pxor	(@C[0],&QWP($A[3][0],"esi"));
	&pxor	(@C[1],&QWP($A[3][1],"esi"));
	&pxor	(@C[3],&QWP($A[3][3],"esi"));
	 &movq	(@T[0],@C[2]);
	&pxor	(@C[4],&QWP($A[3][4],"esi"));

	# MMX has no 64-bit rotate: every ROL64(v, n) below is built from
	# a psllq by n, a psrlq by 64-n and a combining pxor/por.
	 &movq	(@T[2],@C[2]);
	 &psrlq	(@T[0],63);
	&movq	(@T[1],@C[0]);
	 &psllq	(@T[2],1);
	 &pxor	(@T[0],@C[0]);
	&psrlq	(@C[0],63);
	 &pxor	(@T[0],@T[2]);
	&psllq	(@T[1],1);
	 &movq	(@T[2],@C[1]);
	 &movq	(&QWP(@D[1],"esp"),@T[0]);	# D[1] = E[0] = ROL64(C[2], 1) ^ C[0];

	&pxor	(@T[1],@C[0]);
	 &psrlq	(@T[2],63);
	&pxor	(@T[1],@C[3]);
	 &movq	(@C[0],@C[1]);
	&movq	(&QWP(@D[4],"esp"),@T[1]);	# D[4] = E[1] = ROL64(C[0], 1) ^ C[3];

	 &psllq	(@C[0],1);
	 &pxor	(@T[2],@C[4]);
	 &pxor	(@C[0],@T[2]);

	&movq	(@T[2],@C[3]);
	&psrlq	(@C[3],63);
	 &movq	(&QWP(@D[0],"esp"),@C[0]);	# D[0] = C[0] = ROL64(C[1], 1) ^ C[4];
	&psllq	(@T[2],1);
	 &movq	(@T[0],@C[4]);
	 &psrlq	(@C[4],63);
	&pxor	(@C[1],@C[3]);
	 &psllq	(@T[0],1);
	&pxor	(@C[1],@T[2]);
	 &pxor	(@C[2],@C[4]);
	&movq	(&QWP(@D[2],"esp"),@C[1]);	# D[2] = C[1] = ROL64(C[3], 1) ^ C[1];
	 &pxor	(@C[2],@T[0]);

	######################################### first Rho(0) is special
	# The D[] values are still live in registers here (D[0] in C[0],
	# D[2] in C[1], D[3] in C[2], D[4] in T[1]), so only D[1] has to
	# be reloaded from memory.  Lanes are taken from the diagonal
	# A[i][i].
	&movq	(@C[3],&QWP($A[3][3],"esi"));
	 &movq	(&QWP(@D[3],"esp"),@C[2]);	# D[3] = C[2] = ROL64(C[4], 1) ^ C[2];
	&pxor	(@C[3],@C[2]);
	 &movq	(@C[4],&QWP($A[4][4],"esi"));
	&movq	(@T[2],@C[3]);
	&psrlq	(@C[3],64-$rhotates[3][3]);
	 &pxor	(@C[4],@T[1]);
	&psllq	(@T[2],$rhotates[3][3]);
	 &movq	(@T[1],@C[4]);
	 &psrlq	(@C[4],64-$rhotates[4][4]);
	&por	(@C[3],@T[2]);		# C[3] = ROL64(A[3][3] ^ C[2], rhotates[3][3]);   /* D[3] */
	 &psllq	(@T[1],$rhotates[4][4]);

	&movq	(@C[2],&QWP($A[2][2],"esi"));
	 &por	(@C[4],@T[1]);		# C[4] = ROL64(A[4][4] ^ E[1], rhotates[4][4]);   /* D[4] */
	&pxor	(@C[2],@C[1]);
	 &movq	(@C[1],&QWP($A[1][1],"esi"));
	&movq	(@T[1],@C[2]);
	&psrlq	(@C[2],64-$rhotates[2][2]);
	 &pxor	(@C[1],&QWP(@D[1],"esp"));
	&psllq	(@T[1],$rhotates[2][2]);

	 &movq	(@T[2],@C[1]);
	 &psrlq	(@C[1],64-$rhotates[1][1]);
	&por	(@C[2],@T[1]);		# C[2] = ROL64(A[2][2] ^ C[1], rhotates[2][2]);   /* D[2] */
	 &psllq	(@T[2],$rhotates[1][1]);
	&pxor	(@C[0],&QWP($A[0][0],"esi")); # /* rotate by 0 */  /* D[0] */
	 &por	(@C[1],@T[2]);		# C[1] = ROL64(A[1][1] ^ D[1], rhotates[1][1]);

# Chi($y [, $xrho]) -- emit the regular Chi step for output row $y.
#
# On entry @C[0..4] hold the five rotated lanes of the row.  The emitted
# code computes, with indices taken modulo 5,
#     R[$y][x] = C[x] ^ (~C[x+1] & C[x+2])
# and stores the five results through "edi".  For $y == 0 the 8-byte
# word at (ebx) is additionally XORed into lane 0 and "ebx" is advanced
# by 8 (round constant, see the iotas table).
#
# $xrho is optional.  When defined, the first two instructions of the
# following Rho($xrho) -- loading A[0][$xrho] and XORing in D[$xrho] --
# are interleaved here, leaving the value in @T[2].
#
# The sub takes arguments, so it is declared without a prototype (the
# former empty "()" prototype was only harmless because every call uses
# the "&Chi(...)" form, which bypasses prototypes).
sub Chi {				######### regular Chi step
    my ($y,$xrho) = @_;

	&movq	(@T[0],@C[1]);
	 &movq	(@T[1],@C[2]);
	&pandn	(@T[0],@C[2]);
	 &pandn	(@C[2],@C[3]);
	&pxor	(@T[0],@C[0]);
	 &pxor	(@C[2],@C[1]);
	&pxor	(@T[0],&QWP(0,"ebx"))		if ($y == 0);
	&lea	("ebx",&DWP(8,"ebx"))		if ($y == 0);

	&movq	(@T[2],@C[3]);
	&movq	(&QWP($A[$y][0],"edi"),@T[0]);	# R[y][0] = C[0] ^ (~C[1] & C[2]) [^ iotas[i] if y == 0];
	 &movq	(@T[0],@C[4]);
	&pandn	(@C[3],@C[4]);
	 &pandn	(@C[4],@C[0]);
	&pxor	(@C[3],@T[1]);
	 &movq	(&QWP($A[$y][1],"edi"),@C[2]);	# R[y][1] = C[1] ^ (~C[2] & C[3]);
	 &pxor	(@C[4],@T[2]);
	  &movq	(@T[2],&QWP($A[0][$xrho],"esi"))	if (defined($xrho));

	 &movq	(&QWP($A[$y][2],"edi"),@C[3]);	# R[y][2] = C[2] ^ (~C[3] & C[4]);
	&pandn	(@C[0],@C[1]);
	 &movq	(&QWP($A[$y][3],"edi"),@C[4]);	# R[y][3] = C[3] ^ (~C[4] & C[0]);
	&pxor	(@C[0],@T[0]);
	  &pxor	(@T[2],&QWP(@D[$xrho],"esp"))		if (defined($xrho));
	&movq	(&QWP($A[$y][4],"edi"),@C[0]);	# R[y][4] = C[4] ^ (~C[0] & C[1]);
}
	&Chi	(0, 3);

# Rho($x) -- emit the regular Rho step that prepares one row for Chi.
#
# On entry @T[2] must already hold A[0][$x] ^ D[$x]; the preceding Chi
# call emits that load when given $x as its second argument.  For
# i = 0..4, with j = ($x+i)%5, the emitted code leaves
#     C[i] = ROL64(A[i][j] ^ D[j], rhotates[i][j])
# in @C[i].  Lanes are read through "esi", D[] through "esp".  The
# trailing comments on the "por" lines spell out the $x == 3 case.
#
# Declared without a prototype because it takes an argument; all calls
# use the "&Rho(...)" form.
sub Rho {				######### regular Rho step
    my $x = shift;

	#&movq	(@T[2],&QWP($A[0][$x],"esi"));	# moved to Chi
	#&pxor	(@T[2],&QWP(@D[$x],"esp"));	# moved to Chi
	&movq	(@C[0],@T[2]);
	&psrlq	(@T[2],64-$rhotates[0][$x]);
	 &movq	(@C[1],&QWP($A[1][($x+1)%5],"esi"));
	&psllq	(@C[0],$rhotates[0][$x]);
	 &pxor	(@C[1],&QWP(@D[($x+1)%5],"esp"));
	&por	(@C[0],@T[2]);		# C[0] = ROL64(A[0][3] ^ D[3], rhotates[0][3]);

	 &movq	(@T[1],@C[1]);
	 &psrlq	(@C[1],64-$rhotates[1][($x+1)%5]);
	&movq	(@C[2],&QWP($A[2][($x+2)%5],"esi"));
	 &psllq	(@T[1],$rhotates[1][($x+1)%5]);
	&pxor	(@C[2],&QWP(@D[($x+2)%5],"esp"));
	 &por	(@C[1],@T[1]);		# C[1] = ROL64(A[1][4] ^ D[4], rhotates[1][4]);

	&movq	(@T[2],@C[2]);
	&psrlq	(@C[2],64-$rhotates[2][($x+2)%5]);
	 &movq	(@C[3],&QWP($A[3][($x+3)%5],"esi"));
	&psllq	(@T[2],$rhotates[2][($x+2)%5]);
	 &pxor	(@C[3],&QWP(@D[($x+3)%5],"esp"));
	&por	(@C[2],@T[2]);		# C[2] = ROL64(A[2][0] ^ D[0], rhotates[2][0]);

	 &movq	(@T[0],@C[3]);
	 &psrlq	(@C[3],64-$rhotates[3][($x+3)%5]);
	&movq	(@C[4],&QWP($A[4][($x+4)%5],"esi"));
	 &psllq	(@T[0],$rhotates[3][($x+3)%5]);
	&pxor	(@C[4],&QWP(@D[($x+4)%5],"esp"));
	 &por	(@C[3],@T[0]);		# C[3] = ROL64(A[3][1] ^ D[1], rhotates[3][1]);

	&movq	(@T[1],@C[4]);
	&psrlq	(@C[4],64-$rhotates[4][($x+4)%5]);
	&psllq	(@T[1],$rhotates[4][($x+4)%5]);
	&por	(@C[4],@T[1]);		# C[4] = ROL64(A[4][2] ^ D[2], rhotates[4][2]);
}
# Rows 1..3 of the round: each Rho(x) builds the row that the following
# Chi(y, next_x) stores, and Chi pre-loads the first lane for the next
# Rho.  The Rho for row 4 is followed by a hand-written Chi below.
	&Rho	(3);	&Chi	(1, 1);
	&Rho	(1);	&Chi	(2, 4);
	&Rho	(4);	&Chi	(3, 2);
	&Rho	(2);	###&Chi	(4);

	# Unlike the regular Chi, this variant keeps the five results in
	# @C[0..4] as well as storing them, so the next iteration's Theta
	# can start from row 4 without reloading it.  The original C[0]
	# is preserved in T[0] and the original C[1] in the D[1] stack
	# slot (referred to as D[0]/D[1] in the trailing comments).
	# esi and edi are exchanged with three XORs, so the stores below
	# go through the new esi, i.e. the buffer that was this round's
	# destination and is the next round's source.
	&movq	(@T[0],@C[0]);		######### last Chi(4) is special
	 &xor	("edi","esi");		# &xchg	("esi","edi");
	&movq	(&QWP(@D[1],"esp"),@C[1]);
	 &xor	("esi","edi");
	 &xor	("edi","esi");

	&movq	(@T[1],@C[1]);
	 &movq	(@T[2],@C[2]);
	&pandn	(@T[1],@C[2]);
	 &pandn	(@T[2],@C[3]);
	&pxor	(@C[0],@T[1]);
	 &pxor	(@C[1],@T[2]);

	&movq	(@T[1],@C[3]);
	 &movq	(&QWP($A[4][0],"esi"),@C[0]);	# R[4][0] = C[0] ^= (~C[1] & C[2]);
	&pandn	(@T[1],@C[4]);
	 &movq	(&QWP($A[4][1],"esi"),@C[1]);	# R[4][1] = C[1] ^= (~C[2] & C[3]);
	&pxor	(@C[2],@T[1]);
	 &movq	(@T[2],@C[4]);
	&movq	(&QWP($A[4][2],"esi"),@C[2]);	# R[4][2] = C[2] ^= (~C[3] & C[4]);

	&pandn	(@T[2],@T[0]);
	 &pandn	(@T[0],&QWP(@D[1],"esp"));
	&pxor	(@C[3],@T[2]);
	 &pxor	(@C[4],@T[0]);
	&movq	(&QWP($A[4][3],"esi"),@C[3]);	# R[4][3] = C[3] ^= (~C[4] & D[0]);
	&sub	("ecx",1);
	 &movq	(&QWP($A[4][4],"esi"),@C[4]);	# R[4][4] = C[4] ^= (~D[0] & D[1]);
	&jnz	(&label("loop"));

	# 24 rounds advanced ebx by 24*8 = 192 bytes; restore it so the
	# caller can invoke the permutation again without recomputing it.
	&lea	("ebx",&DWP(-192,"ebx"));	# rewind iotas
	&ret	();
&function_end_B("_KeccakF1600");

# KeccakF1600(A) -- public entry point: run the permutation on the state
# whose address is the first argument.
#
# Sets up the register contract expected by _KeccakF1600:
#   - 240 bytes of stack, aligned down to 8: 8*5 bytes for D[] followed
#     by 8*25 bytes for the temporary state T[][];
#   - ebx = address of the iotas table, computed position-independently
#     from the return address of the call to "pic_point";
#   - esi = A + 100 and edi = T + 100 (the bias matching @A).
# The original stack pointer is kept in ebp and restored afterwards, and
# emms is issued before returning.
&function_begin("KeccakF1600");
	&mov	("esi",&wparam(0));
	&mov	("ebp","esp");
	&sub	("esp",240);
	&call	(&label("pic_point"));
    &set_label("pic_point");
	&blindpop("ebx");
	&lea	("ebx",&DWP(&label("iotas")."-".&label("pic_point"),"ebx"));
	&and	("esp",-8);
	&lea	("esi",&DWP(100,"esi"));	# size optimization
	&lea	("edi",&DWP(8*5+100,"esp"));	# size optimization

	&call	("_KeccakF1600");

	&mov	("esp","ebp");
	&emms	();
&function_end("KeccakF1600");

# SHA3_absorb(A, inp, len, bsz) -- XOR input into the state one block of
# bsz bytes at a time, running the permutation after each block.
#
# Returns in eax the number of input bytes left over, i.e. the final
# value of len once it has dropped below bsz.
#
# Stack frame: same 240-byte D[]/T[][] area as KeccakF1600 plus 8 extra
# bytes; the two dwords just below ebp hold bsz (-4) and len (-8) across
# the call to _KeccakF1600.
# NOTE(review): a block is consumed as bsz/8 eight-byte words, so bsz is
# assumed to be a non-zero multiple of 8 -- confirm against callers.
&function_begin("SHA3_absorb");
	&mov	("esi",&wparam(0));		# A[][]
	&mov	("eax",&wparam(1));		# inp
	&mov	("ecx",&wparam(2));		# len
	&mov	("edx",&wparam(3));		# bsz
	&mov	("ebp","esp");
	&sub	("esp",240+8);
	&call	(&label("pic_point"));
    &set_label("pic_point");
	&blindpop("ebx");
	&lea	("ebx",&DWP(&label("iotas")."-".&label("pic_point"),"ebx"));
	&and	("esp",-8);

	&mov	("edi","esi");
	&lea	("esi",&DWP(100,"esi"));	# size optimization
	&mov	(&DWP(-4,"ebp"),"edx");		# save bsz
	&jmp	(&label("loop"));

&set_label("loop",16);
	&cmp	("ecx","edx");			# len < bsz?
	&jc	(&label("absorbed"));

	&shr	("edx",3);			# bsz /= 8
&set_label("block");
	# edi walks the unbiased state, eax walks the input.
	&movq	("mm0",&QWP(0,"eax"));
	&lea	("eax",&DWP(8,"eax"));
	&pxor	("mm0",&QWP(0,"edi"));
	&lea	("edi",&DWP(8,"edi"));
	&sub	("ecx",8);			# len -= 8
	&movq	(&QWP(-8,"edi"),"mm0");
	&dec	("edx");			# bsz--
	&jnz	(&label("block"));

	&lea	("edi",&DWP(8*5+100,"esp"));	# size optimization
	&mov	(&DWP(-8,"ebp"),"ecx");		# save len
	&call	("_KeccakF1600");
	&mov	("ecx",&DWP(-8,"ebp"));		# pull len
	&mov	("edx",&DWP(-4,"ebp"));		# pull bsz
	&lea	("edi",&DWP(-100,"esi"));	# edi = start of state again
	&jmp	(&label("loop"));

&set_label("absorbed",16);
	&mov	("eax","ecx");			# return value
	&mov	("esp","ebp");
	&emms	();
&function_end("SHA3_absorb");

# SHA3_squeeze(A, out, len, bsz) -- copy len bytes of output from the
# state, running the permutation each time bsz bytes have been produced
# and more output is still required.
#
# Output is copied eight bytes at a time; a final remainder of fewer
# than eight bytes is copied with "rep movsb".  No permutation is run
# when len reaches zero exactly at the end of a block.
#
# Stack frame: same layout as SHA3_absorb.  Note that the dword at
# -4(ebp), labelled "bsz", holds bsz/8 (the per-block word count).
&function_begin("SHA3_squeeze");
	&mov	("esi",&wparam(0));		# A[][]
	&mov	("eax",&wparam(1));		# out
	&mov	("ecx",&wparam(2));		# len
	&mov	("edx",&wparam(3));		# bsz
	&mov	("ebp","esp");
	&sub	("esp",240+8);
	&call	(&label("pic_point"));
    &set_label("pic_point");
	&blindpop("ebx");
	&lea	("ebx",&DWP(&label("iotas")."-".&label("pic_point"),"ebx"));
	&and	("esp",-8);

	&shr	("edx",3);			# bsz /= 8
	&mov	("edi","esi");
	&lea	("esi",&DWP(100,"esi"));	# size optimization
	&mov	(&DWP(-4,"ebp"),"edx");		# save bsz
	&jmp	(&label("loop"));

&set_label("loop",16);
	&cmp	("ecx",8);			# len < 8?
	&jc	(&label("tail"));

	# edi walks the unbiased state, eax walks the output buffer.
	&movq	("mm0",&QWP(0,"edi"));
	&lea	("edi",&DWP(8,"edi"));
	&movq	(&QWP(0,"eax"),"mm0");
	&lea	("eax",&DWP(8,"eax"));
	&sub	("ecx",8);			# len -= 8
	&jz	(&label("done"));

	&dec	("edx");			# bsz--
	&jnz	(&label("loop"));

	&lea	("edi",&DWP(8*5+100,"esp"));	# size optimization
	&mov	(&DWP(-8,"ebp"),"ecx");		# save len
	&call	("_KeccakF1600");
	&mov	("ecx",&DWP(-8,"ebp"));		# pull len
	&mov	("edx",&DWP(-4,"ebp"));		# pull bsz
	&lea	("edi",&DWP(-100,"esi"));	# edi = start of state again
	&jmp	(&label("loop"));

&set_label("tail",16);
	# ecx (< 8) is the byte count; source is the current state
	# position, destination the current output position.
	&mov	("esi","edi");
	&mov	("edi","eax");
	&data_word("0xA4F39066");		# rep movsb

&set_label("done");
	&mov	("esp","ebp");
	&emms	();
&function_end("SHA3_squeeze");

# Round constants, one 64-bit value per round, each emitted as a
# (low 32 bits, high 32 bits) pair.  24 entries of 8 bytes = 192 bytes,
# which is the amount _KeccakF1600 rewinds ebx by when it finishes.
&set_label("iotas",32);
	&data_word(0x00000001,0x00000000);
	&data_word(0x00008082,0x00000000);
	&data_word(0x0000808a,0x80000000);
	&data_word(0x80008000,0x80000000);
	&data_word(0x0000808b,0x00000000);
	&data_word(0x80000001,0x00000000);
	&data_word(0x80008081,0x80000000);
	&data_word(0x00008009,0x80000000);
	&data_word(0x0000008a,0x00000000);
	&data_word(0x00000088,0x00000000);
	&data_word(0x80008009,0x00000000);
	&data_word(0x8000000a,0x00000000);
	&data_word(0x8000808b,0x00000000);
	&data_word(0x0000008b,0x80000000);
	&data_word(0x00008089,0x80000000);
	&data_word(0x00008003,0x80000000);
	&data_word(0x00008002,0x80000000);
	&data_word(0x00000080,0x80000000);
	&data_word(0x0000800a,0x00000000);
	&data_word(0x8000000a,0x80000000);
	&data_word(0x80008081,0x80000000);
	&data_word(0x00008080,0x80000000);
	&data_word(0x80000001,0x00000000);
	&data_word(0x80008008,0x80000000);
# Identification string embedded in the generated object.
&asciz("Keccak-1600 absorb and squeeze for MMX, CRYPTOGAMS by <appro\@openssl.org>");

# Flush everything accumulated by the generator calls above.
&asm_finish();

# Buffered write errors only surface at close time, so check it.
close STDOUT or die "error closing STDOUT: $!";