#!/usr/bin/env perl
# Copyright 2017-2020 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License").  You may not use
# this file except in compliance with the License.  You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html
#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# Keccak-1600 for x86 MMX.
#
# June 2017.
#
# Below code is KECCAK_2X implementation (see sha/keccak1600.c) with
# C[5] held in register bank and D[5] offloaded to memory. Though
# instead of actually unrolling the loop pair-wise I simply flip
# pointers to T[][] and A[][] at the end of round. Since number of
# rounds is even, last round writes to A[][] and everything works out.
# It's argued that MMX is the only code path meaningful to implement
# for x86. This is because non-MMX-capable processors are an extinct
# breed, and they as well can lurk executing compiler-generated code.
# For reference gcc-5.x-generated KECCAK_2X code takes 89 cycles per
# processed byte on Pentium. Which is a fair result. But older compilers
# produce worse code. On the other hand one can wonder why not 128-bit
# SSE2? Well, SSE2 won't provide double improvement, rather far from
# that, if any at all on some processors, because it will take extra
# permutations and inter-bank data transfers. Besides, contemporary
# CPUs are better off executing 64-bit code, and it makes lesser sense
# to invest into fancy 32-bit code. And the decision doesn't seem to
# be inadequate, if one compares below results to "64-bit platforms in
# 32-bit mode" SIMD data points available at
# http://keccak.noekeon.org/sw_performance.html.
#
########################################################################
# Numbers are cycles per processed byte out of large message.
#
#			r=1088(i)
#
# PIII			30/+150%
# Pentium M		27/+150%
# P4			40/+85%
# Core 2		19/+170%
# Sandy Bridge(ii)	18/+140%
# Atom			33/+180%
# Silvermont(ii)	30/+180%
# VIA Nano(ii)		43/+60%
# Sledgehammer(ii)(iii)	24/+130%
#
# (i)	Corresponds to SHA3-256. Numbers after slash are improvement
#	coefficients over KECCAK_2X [with bit interleave and lane
#	complementing] position-independent *scalar* code generated
#	by gcc-5.x. It's not exactly fair comparison, but it's a
#	datapoint...
# (ii)	64-bit processor executing 32-bit code.
# (iii)	Result is considered to be representative even for older AMD
#	processors.

# Locate the perlasm helpers relative to this script's own directory.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
push(@INC,"${dir}","${dir}../../perlasm");
require "x86asm.pl";

# The last command-line argument, when present, names the output file.
# Three-argument open keeps the file name from being parsed for mode
# characters, and a failed open is reported instead of being ignored.
# With no (or an empty) argument the inherited STDOUT is used as before.
$output=pop;
if ($output) {
    open(STDOUT,">",$output) or die "can't open $output: $!";
}

&asm_init($ARGV[0],$ARGV[$#ARGV] eq "386");

# Register and memory naming used by the emitters below:
#   @C - five MMX registers holding the working row / column parities
#   @T - three MMX scratch registers
#   @A - byte offsets of the 5x5 64-bit lanes, biased by -100 because the
#        base pointers are pre-advanced by 100 ("size optimization")
#   @D - offsets of the five D[] slots relative to esp inside
#        _KeccakF1600 (the +4 skips the return address pushed by call)
my @C = map("mm$_",(0..4));
my @T = map("mm$_",(5..7));
my @A = map([ 8*$_-100, 8*($_+1)-100, 8*($_+2)-100,
              8*($_+3)-100, 8*($_+4)-100 ], (0,5,10,15,20));
my @D = map(8*$_+4, (0..4));
my @rhotates = ([  0,  1, 62, 28, 27 ],
                [ 36, 44,  6, 55, 20 ],
                [  3, 10, 43, 25, 39 ],
                [ 41, 45, 15, 21,  8 ],
                [ 18,  2, 61, 56, 14 ]);

&static_label("iotas");

# _KeccakF1600: internal 24-round permutation.
# On entry, as set up by the callers in this file:
#   esi - state A[][] + 100
#   edi - temporary T[][] + 100 (in the caller's stack frame)
#   ebx - pointer to the iotas table
#   esp - D[] slots live at esp+4 .. esp+43
# esi and edi are exchanged at the end of every round; since the number
# of rounds is even they are back in place on return. ebx is rewound to
# the start of iotas before returning. ecx is used as the round counter.
&function_begin_B("_KeccakF1600");
	&movq	(@C[0],&QWP($A[4][0],"esi"));
	&movq	(@C[1],&QWP($A[4][1],"esi"));
	&movq	(@C[2],&QWP($A[4][2],"esi"));
	&movq	(@C[3],&QWP($A[4][3],"esi"));
	&movq	(@C[4],&QWP($A[4][4],"esi"));

	&mov	("ecx",24);			# loop counter
	&jmp	(&label("loop"));

    &set_label("loop",16);
	######################################### Theta
	&pxor	(@C[0],&QWP($A[0][0],"esi"));
	&pxor	(@C[1],&QWP($A[0][1],"esi"));
	&pxor	(@C[2],&QWP($A[0][2],"esi"));
	&pxor	(@C[3],&QWP($A[0][3],"esi"));
	&pxor	(@C[4],&QWP($A[0][4],"esi"));

	&pxor	(@C[0],&QWP($A[1][0],"esi"));
	&pxor	(@C[1],&QWP($A[1][1],"esi"));
	&pxor	(@C[2],&QWP($A[1][2],"esi"));
	&pxor	(@C[3],&QWP($A[1][3],"esi"));
	&pxor	(@C[4],&QWP($A[1][4],"esi"));

	&pxor	(@C[0],&QWP($A[2][0],"esi"));
	&pxor	(@C[1],&QWP($A[2][1],"esi"));
	&pxor	(@C[2],&QWP($A[2][2],"esi"));
	&pxor	(@C[3],&QWP($A[2][3],"esi"));
	&pxor	(@C[4],&QWP($A[2][4],"esi"));

	&pxor	(@C[2],&QWP($A[3][2],"esi"));
	&pxor	(@C[0],&QWP($A[3][0],"esi"));
	&pxor	(@C[1],&QWP($A[3][1],"esi"));
	&pxor	(@C[3],&QWP($A[3][3],"esi"));
	 &movq	(@T[0],@C[2]);
	&pxor	(@C[4],&QWP($A[3][4],"esi"));

	 &movq	(@T[2],@C[2]);
	 &psrlq	(@T[0],63);
	&movq	(@T[1],@C[0]);
	 &psllq	(@T[2],1);
	 &pxor	(@T[0],@C[0]);
	&psrlq	(@C[0],63);
	 &pxor	(@T[0],@T[2]);
	&psllq	(@T[1],1);
	 &movq	(@T[2],@C[1]);
	 &movq	(&QWP(@D[1],"esp"),@T[0]);	# D[1] = E[0] = ROL64(C[2], 1) ^ C[0];

	&pxor	(@T[1],@C[0]);
	 &psrlq	(@T[2],63);
	&pxor	(@T[1],@C[3]);
	 &movq	(@C[0],@C[1]);
	&movq	(&QWP(@D[4],"esp"),@T[1]);	# D[4] = E[1] = ROL64(C[0], 1) ^ C[3];

	 &psllq	(@C[0],1);
	 &pxor	(@T[2],@C[4]);
	 &pxor	(@C[0],@T[2]);

	&movq	(@T[2],@C[3]);
	&psrlq	(@C[3],63);
	 &movq	(&QWP(@D[0],"esp"),@C[0]);	# D[0] = C[0] = ROL64(C[1], 1) ^ C[4];
	&psllq	(@T[2],1);
	 &movq	(@T[0],@C[4]);
	 &psrlq	(@C[4],63);
	&pxor	(@C[1],@C[3]);
	 &psllq	(@T[0],1);
	&pxor	(@C[1],@T[2]);
	 &pxor	(@C[2],@C[4]);
	&movq	(&QWP(@D[2],"esp"),@C[1]);	# D[2] = C[1] = ROL64(C[3], 1) ^ C[1];
	 &pxor	(@C[2],@T[0]);

	######################################### first Rho(0) is special
	&movq	(@C[3],&QWP($A[3][3],"esi"));
	 &movq	(&QWP(@D[3],"esp"),@C[2]);	# D[3] = C[2] = ROL64(C[4], 1) ^ C[2];
	&pxor	(@C[3],@C[2]);
	 &movq	(@C[4],&QWP($A[4][4],"esi"));
	&movq	(@T[2],@C[3]);
	&psrlq	(@C[3],64-$rhotates[3][3]);
	 &pxor	(@C[4],@T[1]);
	&psllq	(@T[2],$rhotates[3][3]);
	 &movq	(@T[1],@C[4]);
	 &psrlq	(@C[4],64-$rhotates[4][4]);
	&por	(@C[3],@T[2]);		# C[3] = ROL64(A[3][3] ^ C[2], rhotates[3][3]);	/* D[3] */
	 &psllq	(@T[1],$rhotates[4][4]);

	&movq	(@C[2],&QWP($A[2][2],"esi"));
	 &por	(@C[4],@T[1]);		# C[4] = ROL64(A[4][4] ^ E[1], rhotates[4][4]);	/* D[4] */
	&pxor	(@C[2],@C[1]);
	 &movq	(@C[1],&QWP($A[1][1],"esi"));
	&movq	(@T[1],@C[2]);
	&psrlq	(@C[2],64-$rhotates[2][2]);
	 &pxor	(@C[1],&QWP(@D[1],"esp"));
	&psllq	(@T[1],$rhotates[2][2]);

	 &movq	(@T[2],@C[1]);
	 &psrlq	(@C[1],64-$rhotates[1][1]);
	&por	(@C[2],@T[1]);		# C[2] = ROL64(A[2][2] ^ C[1], rhotates[2][2]);	/* D[2] */
	 &psllq	(@T[2],$rhotates[1][1]);
	&pxor	(@C[0],&QWP($A[0][0],"esi")); # /* rotate by 0 */  /* D[0] */
	 &por	(@C[1],@T[2]);		# C[1] = ROL64(A[1][1] ^ D[1], rhotates[1][1]);

# Chi($y, $xrho): emit the Chi step for output row $y, storing the five
# result lanes through edi. For $y == 0 the round constant at (ebx) is
# folded in and ebx advances by 8. When $xrho is defined, the first
# load/xor of the following Rho($xrho) is interleaved here, leaving
# A[0][$xrho] ^ D[$xrho] in T[2]. Must be invoked as &Chi(...): the
# empty prototype is bypassed by the & call form.
sub Chi() {				######### regular Chi step
    my ($y,$xrho) = @_;

	&movq	(@T[0],@C[1]);
	 &movq	(@T[1],@C[2]);
	&pandn	(@T[0],@C[2]);
	 &pandn	(@C[2],@C[3]);
	&pxor	(@T[0],@C[0]);
	 &pxor	(@C[2],@C[1]);
	&pxor	(@T[0],&QWP(0,"ebx"))		if ($y == 0);
	&lea	("ebx",&DWP(8,"ebx"))		if ($y == 0);

	&movq	(@T[2],@C[3]);
	&movq	(&QWP($A[$y][0],"edi"),@T[0]);	# R[0][0] = C[0] ^ (~C[1] & C[2]) ^ iotas[i];
	 &movq	(@T[0],@C[4]);
	&pandn	(@C[3],@C[4]);
	 &pandn	(@C[4],@C[0]);
	&pxor	(@C[3],@T[1]);
	 &movq	(&QWP($A[$y][1],"edi"),@C[2]);	# R[0][1] = C[1] ^ (~C[2] & C[3]);
	 &pxor	(@C[4],@T[2]);
	  &movq	(@T[2],&QWP($A[0][$xrho],"esi"))	if (defined($xrho));

	 &movq	(&QWP($A[$y][2],"edi"),@C[3]);	# R[0][2] = C[2] ^ (~C[3] & C[4]);
	&pandn	(@C[0],@C[1]);
	 &movq	(&QWP($A[$y][3],"edi"),@C[4]);	# R[0][3] = C[3] ^ (~C[4] & C[0]);
	&pxor	(@C[0],@T[0]);
	  &pxor	(@T[2],&QWP(@D[$xrho],"esp"))	if (defined($xrho));
	&movq	(&QWP($A[$y][4],"edi"),@C[0]);	# R[0][4] = C[4] ^ (~C[0] & C[1]);
}
	&Chi	(0, 3);

# Rho($x): emit the rotations that build the next working row in C[0..4]
# from A[i][($x+i)%5] ^ D[($x+i)%5]. The value for i == 0 is expected in
# T[2], where the preceding Chi call left it. Must be invoked as
# &Rho(...) for the same reason as Chi.
sub Rho() {				######### regular Rho step
    my $x = shift;

	#&movq	(@T[2],&QWP($A[0][$x],"esi"));	# moved to Chi
	#&pxor	(@T[2],&QWP(@D[$x],"esp"));	# moved to Chi
	&movq	(@C[0],@T[2]);
	&psrlq	(@T[2],64-$rhotates[0][$x]);
	 &movq	(@C[1],&QWP($A[1][($x+1)%5],"esi"));
	&psllq	(@C[0],$rhotates[0][$x]);
	 &pxor	(@C[1],&QWP(@D[($x+1)%5],"esp"));
	&por	(@C[0],@T[2]);		# C[0] = ROL64(A[0][3] ^ D[3], rhotates[0][3]);

	 &movq	(@T[1],@C[1]);
	 &psrlq	(@C[1],64-$rhotates[1][($x+1)%5]);
	&movq	(@C[2],&QWP($A[2][($x+2)%5],"esi"));
	 &psllq	(@T[1],$rhotates[1][($x+1)%5]);
	&pxor	(@C[2],&QWP(@D[($x+2)%5],"esp"));
	 &por	(@C[1],@T[1]);		# C[1] = ROL64(A[1][4] ^ D[4], rhotates[1][4]);

	&movq	(@T[2],@C[2]);
	&psrlq	(@C[2],64-$rhotates[2][($x+2)%5]);
	 &movq	(@C[3],&QWP($A[3][($x+3)%5],"esi"));
	&psllq	(@T[2],$rhotates[2][($x+2)%5]);
	 &pxor	(@C[3],&QWP(@D[($x+3)%5],"esp"));
	&por	(@C[2],@T[2]);		# C[2] = ROL64(A[2][0] ^ D[0], rhotates[2][0]);

	 &movq	(@T[0],@C[3]);
	 &psrlq	(@C[3],64-$rhotates[3][($x+3)%5]);
	&movq	(@C[4],&QWP($A[4][($x+4)%5],"esi"));
	 &psllq	(@T[0],$rhotates[3][($x+3)%5]);
	&pxor	(@C[4],&QWP(@D[($x+4)%5],"esp"));
	 &por	(@C[3],@T[0]);		# C[3] = ROL64(A[3][1] ^ D[1], rhotates[3][1]);

	&movq	(@T[1],@C[4]);
	&psrlq	(@C[4],64-$rhotates[4][($x+4)%5]);
	&psllq	(@T[1],$rhotates[4][($x+4)%5]);
	&por	(@C[4],@T[1]);		# C[4] = ROL64(A[4][2] ^ D[2], rhotates[4][2]);
}
	&Rho	(3);	&Chi	(1, 1);
	&Rho	(1);	&Chi	(2, 4);
	&Rho	(4);	&Chi	(3, 2);
	&Rho	(2);	###&Chi	(4);

	# The last Chi keeps its results in C[0..4]: they are the A[4][]
	# values that the next round's Theta starts from. The source and
	# destination pointers are swapped here, so the stores below go
	# through esi, which by then addresses this round's output buffer.
	&movq	(@T[0],@C[0]);		######### last Chi(4) is special
	 &xor	("edi","esi");		# &xchg	("esi","edi");
	&movq	(&QWP(@D[1],"esp"),@C[1]);
	 &xor	("esi","edi");
	 &xor	("edi","esi");

	&movq	(@T[1],@C[1]);
	 &movq	(@T[2],@C[2]);
	&pandn	(@T[1],@C[2]);
	 &pandn	(@T[2],@C[3]);
	&pxor	(@C[0],@T[1]);
	 &pxor	(@C[1],@T[2]);

	&movq	(@T[1],@C[3]);
	 &movq	(&QWP($A[4][0],"esi"),@C[0]);	# R[4][0] = C[0] ^= (~C[1] & C[2]);
	&pandn	(@T[1],@C[4]);
	 &movq	(&QWP($A[4][1],"esi"),@C[1]);	# R[4][1] = C[1] ^= (~C[2] & C[3]);
	&pxor	(@C[2],@T[1]);
	 &movq	(@T[2],@C[4]);
	&movq	(&QWP($A[4][2],"esi"),@C[2]);	# R[4][2] = C[2] ^= (~C[3] & C[4]);

	&pandn	(@T[2],@T[0]);
	 &pandn	(@T[0],&QWP(@D[1],"esp"));
	&pxor	(@C[3],@T[2]);
	 &pxor	(@C[4],@T[0]);
	&movq	(&QWP($A[4][3],"esi"),@C[3]);	# R[4][3] = C[3] ^= (~C[4] & D[0]);
	&sub	("ecx",1);
	 &movq	(&QWP($A[4][4],"esi"),@C[4]);	# R[4][4] = C[4] ^= (~D[0] & D[1]);
	&jnz	(&label("loop"));

	&lea	("ebx",&DWP(-192,"ebx"));	# rewind iotas
	&ret	();
&function_end_B("_KeccakF1600");

# KeccakF1600(A): apply the permutation to the state passed as the
# first argument. The 240-byte, 8-byte-aligned frame holds D[5]
# (8*5 bytes) followed by T[5][5] (200 bytes). ebp preserves the
# original stack pointer across the alignment.
&function_begin("KeccakF1600");
	&mov	("esi",&wparam(0));
	&mov	("ebp","esp");
	&sub	("esp",240);
	&call	(&label("pic_point"));
    &set_label("pic_point");
	&blindpop("ebx");
	&lea	("ebx",&DWP(&label("iotas")."-".&label("pic_point"),"ebx"));
	&and	("esp",-8);
	&lea	("esi",&DWP(100,"esi"));	# size optimization
	&lea	("edi",&DWP(8*5+100,"esp"));	# size optimization

	&call	("_KeccakF1600");

	&mov	("esp","ebp");
	&emms	();
&function_end("KeccakF1600");

# SHA3_absorb(A, inp, len, bsz): while at least bsz bytes of input
# remain, xor one bsz-byte block into the state eight bytes at a time
# and run the permutation. Returns (in eax) the number of input bytes
# left unprocessed. The frame has 8 extra bytes, used to keep bsz at
# -4(ebp) and len at -8(ebp) across the call.
&function_begin("SHA3_absorb");
	&mov	("esi",&wparam(0));		# A[][]
	&mov	("eax",&wparam(1));		# inp
	&mov	("ecx",&wparam(2));		# len
	&mov	("edx",&wparam(3));		# bsz
	&mov	("ebp","esp");
	&sub	("esp",240+8);
	&call	(&label("pic_point"));
    &set_label("pic_point");
	&blindpop("ebx");
	&lea	("ebx",&DWP(&label("iotas")."-".&label("pic_point"),"ebx"));
	&and	("esp",-8);

	&mov	("edi","esi");
	&lea	("esi",&DWP(100,"esi"));	# size optimization
	&mov	(&DWP(-4,"ebp"),"edx");		# save bsz
	&jmp	(&label("loop"));

&set_label("loop",16);
	&cmp	("ecx","edx");			# len < bsz?
	&jc	(&label("absorbed"));

	&shr	("edx",3);			# bsz /= 8
&set_label("block");
	&movq	("mm0",&QWP(0,"eax"));
	&lea	("eax",&DWP(8,"eax"));
	&pxor	("mm0",&QWP(0,"edi"));
	&lea	("edi",&DWP(8,"edi"));
	&sub	("ecx",8);			# len -= 8
	&movq	(&QWP(-8,"edi"),"mm0");
	&dec	("edx");			# bsz--
	&jnz	(&label("block"));

	&lea	("edi",&DWP(8*5+100,"esp"));	# size optimization
	&mov	(&DWP(-8,"ebp"),"ecx");		# save len
	&call	("_KeccakF1600");
	&mov	("ecx",&DWP(-8,"ebp"));		# pull len
	&mov	("edx",&DWP(-4,"ebp"));		# pull bsz
	&lea	("edi",&DWP(-100,"esi"));	# back to start of A[][]
	&jmp	(&label("loop"));

&set_label("absorbed",16);
	&mov	("eax","ecx");			# return value
	&mov	("esp","ebp");
	&emms	();
&function_end("SHA3_absorb");

# SHA3_squeeze(A, out, len, bsz): copy len bytes of output from the
# state, eight bytes at a time, running the permutation each time
# bsz/8 lanes have been consumed and more output is still needed.
# A final partial lane (len % 8 bytes) is copied with rep movsb.
&function_begin("SHA3_squeeze");
	&mov	("esi",&wparam(0));		# A[][]
	&mov	("eax",&wparam(1));		# out
	&mov	("ecx",&wparam(2));		# len
	&mov	("edx",&wparam(3));		# bsz
	&mov	("ebp","esp");
	&sub	("esp",240+8);
	&call	(&label("pic_point"));
    &set_label("pic_point");
	&blindpop("ebx");
	&lea	("ebx",&DWP(&label("iotas")."-".&label("pic_point"),"ebx"));
	&and	("esp",-8);

	&shr	("edx",3);			# bsz /= 8
	&mov	("edi","esi");
	&lea	("esi",&DWP(100,"esi"));	# size optimization
	&mov	(&DWP(-4,"ebp"),"edx");		# save bsz
	&jmp	(&label("loop"));

&set_label("loop",16);
	&cmp	("ecx",8);			# len < 8?
	&jc	(&label("tail"));

	&movq	("mm0",&QWP(0,"edi"));
	&lea	("edi",&DWP(8,"edi"));
	&movq	(&QWP(0,"eax"),"mm0");
	&lea	("eax",&DWP(8,"eax"));
	&sub	("ecx",8);			# len -= 8
	&jz	(&label("done"));

	&dec	("edx");			# bsz--
	&jnz	(&label("loop"));

	&lea	("edi",&DWP(8*5+100,"esp"));	# size optimization
	&mov	(&DWP(-8,"ebp"),"ecx");		# save len
	&call	("_KeccakF1600");
	&mov	("ecx",&DWP(-8,"ebp"));		# pull len
	&mov	("edx",&DWP(-4,"ebp"));		# pull bsz
	&lea	("edi",&DWP(-100,"esi"));	# back to start of A[][]
	&jmp	(&label("loop"));

&set_label("tail",16);
	&mov	("esi","edi");			# source: current position in A[][]
	&mov	("edi","eax");			# destination: out
	&data_word("0xA4F39066");		# rep movsb

&set_label("done");
	&mov	("esp","ebp");
	&emms	();
&function_end("SHA3_squeeze");

# Round constants, one 64-bit value per round, stored as (low, high)
# 32-bit words.
&set_label("iotas",32);
	&data_word(0x00000001,0x00000000);
	&data_word(0x00008082,0x00000000);
	&data_word(0x0000808a,0x80000000);
	&data_word(0x80008000,0x80000000);
	&data_word(0x0000808b,0x00000000);
	&data_word(0x80000001,0x00000000);
	&data_word(0x80008081,0x80000000);
	&data_word(0x00008009,0x80000000);
	&data_word(0x0000008a,0x00000000);
	&data_word(0x00000088,0x00000000);
	&data_word(0x80008009,0x00000000);
	&data_word(0x8000000a,0x00000000);
	&data_word(0x8000808b,0x00000000);
	&data_word(0x0000008b,0x80000000);
	&data_word(0x00008089,0x80000000);
	&data_word(0x00008003,0x80000000);
	&data_word(0x00008002,0x80000000);
	&data_word(0x00000080,0x80000000);
	&data_word(0x0000800a,0x00000000);
	&data_word(0x8000000a,0x80000000);
	&data_word(0x80008081,0x80000000);
	&data_word(0x00008080,0x80000000);
	&data_word(0x80000001,0x00000000);
	&data_word(0x80008008,0x80000000);
&asciz("Keccak-1600 absorb and squeeze for MMX, CRYPTOGAMS by <appro\@openssl.org>");

&asm_finish();

close STDOUT or die "error closing STDOUT: $!";