#!/usr/bin/env perl
# Copyright 2017-2018 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html
#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# Keccak-1600 for AVX512VL.
#
# December 2017.
#
# This is an adaptation of the AVX2 module that reuses the register data
# layout, but utilizes the new 256-bit AVX512VL instructions. See the AVX2
# module for further information on the layout.
#
########################################################################
# Numbers are cycles per processed byte out of a large message.
#
#			r=1088(*)
#
# Skylake-X		6.4/+47%
#
# (*)	Corresponds to SHA3-256. The percentage after the slash is the
#	improvement over the scalar keccak1600-x86_64.pl.

# Digits in the variables' names denote the right-most coordinates:

my ($A00,	# [0][0] [0][0] [0][0] [0][0]		# %ymm0
    $A01,	# [0][4] [0][3] [0][2] [0][1]		# %ymm1
    $A20,	# [3][0] [1][0] [4][0] [2][0]		# %ymm2
    $A31,	# [2][4] [4][3] [1][2] [3][1]		# %ymm3
    $A21,	# [3][4] [1][3] [4][2] [2][1]		# %ymm4
    $A41,	# [1][4] [2][3] [3][2] [4][1]		# %ymm5
    $A11) =	# [4][4] [3][3] [2][2] [1][1]		# %ymm6
    map("%ymm$_",(0..6));

# We also need to map the magic order into offsets within the structure:

my @A_jagged = ([0,0], [1,0], [1,1], [1,2], [1,3],	# [0][0..4]
		[2,2], [6,0], [3,1], [4,2], [5,3],	# [1][0..4]
		[2,0], [4,0], [6,1], [5,2], [3,3],	# [2][0..4]
		[2,3], [3,0], [5,1], [6,2], [4,3],	# [3][0..4]
		[2,1], [5,0], [4,1], [3,2], [6,3]);	# [4][0..4]
   @A_jagged = map(8*($$_[0]*4+$$_[1]), @A_jagged);	# ... and now linear
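
# A worked example of the mapping above (for illustration only): lane [1][2]
# is @A_jagged entry 1*5+2 = 7, i.e. [3,1] -- slot 1 of register $A31 -- and
# the map turns it into the linear byte offset 8*(3*4+1) = 104. These are the
# offsets into the 32-byte-per-register transfer area used by SHA3_absorb
# below; SHA3_squeeze applies them with a small adjustment, because lane
# [0][0] occupies only 8 bytes in the flat in-memory state.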

my @T = map("%ymm$_",(7..15));
my ($C14,$C00,$D00,$D14) = @T[5..8];
my ($R20,$R01,$R31,$R21,$R41,$R11) = map("%ymm$_",(16..21));

$code.=<<___;
.text

.type	__KeccakF1600,\@function
.align	32
__KeccakF1600:
	lea		iotas(%rip),%r10
	mov		\$24,%eax
	jmp		.Loop_avx512vl

.align	32
.Loop_avx512vl:
	######################################### Theta
	vpshufd		\$0b01001110,$A20,$C00
	vpxor		$A31,$A41,$C14
	vpxor		$A11,$A21,@T[2]
	vpternlogq	\$0x96,$A01,@T[2],$C14	# C[1..4]

	vpxor		$A20,$C00,$C00
	vpermq		\$0b01001110,$C00,@T[0]

	vpermq		\$0b10010011,$C14,@T[4]
	vprolq		\$1,$C14,@T[1]		# ROL64(C[1..4],1)

	vpermq		\$0b00111001,@T[1],$D14
	vpxor		@T[4],@T[1],$D00
	vpermq		\$0b00000000,$D00,$D00	# D[0..0] = ROL64(C[1],1) ^ C[4]

	vpternlogq	\$0x96,@T[0],$A00,$C00	# C[0..0]
	vprolq		\$1,$C00,@T[1]		# ROL64(C[0..0],1)

	vpxor		$D00,$A00,$A00		# ^= D[0..0]

	vpblendd	\$0b11000000,@T[1],$D14,$D14
	vpblendd	\$0b00000011,$C00,@T[4],@T[0]

	######################################### Rho + Pi + pre-Chi shuffle
	vpxor		$D00,$A20,$A20		# ^= D[0..0] from Theta
	vprolvq		$R20,$A20,$A20

	vpternlogq	\$0x96,@T[0],$D14,$A31	# ^= D[1..4] from Theta
	vprolvq		$R31,$A31,$A31

	vpternlogq	\$0x96,@T[0],$D14,$A21	# ^= D[1..4] from Theta
	vprolvq		$R21,$A21,$A21

	vpternlogq	\$0x96,@T[0],$D14,$A41	# ^= D[1..4] from Theta
	vprolvq		$R41,$A41,$A41

	vpermq		\$0b10001101,$A20,@T[3]	# $A20 -> future $A31
	vpermq		\$0b10001101,$A31,@T[4]	# $A31 -> future $A21
	vpternlogq	\$0x96,@T[0],$D14,$A11	# ^= D[1..4] from Theta
	vprolvq		$R11,$A11,@T[1]		# $A11 -> future $A01

	vpermq		\$0b00011011,$A21,@T[5]	# $A21 -> future $A41
	vpermq		\$0b01110010,$A41,@T[6]	# $A41 -> future $A11
	vpternlogq	\$0x96,@T[0],$D14,$A01	# ^= D[1..4] from Theta
	vprolvq		$R01,$A01,@T[2]		# $A01 -> future $A20

	######################################### Chi
	vpblendd	\$0b00001100,@T[6],@T[2],$A31	#               [4][4] [2][0]
	vpblendd	\$0b00001100,@T[2],@T[4],@T[8]	#               [4][0] [2][1]
	vpblendd	\$0b00001100,@T[4],@T[3],$A41	#               [4][2] [2][4]
	vpblendd	\$0b00001100,@T[3],@T[2],@T[7]	#               [4][3] [2][0]
	vpblendd	\$0b00110000,@T[4],$A31,$A31	#        [1][3] [4][4] [2][0]
	vpblendd	\$0b00110000,@T[5],@T[8],@T[8]	#        [1][4] [4][0] [2][1]
	vpblendd	\$0b00110000,@T[2],$A41,$A41	#        [1][0] [4][2] [2][4]
	vpblendd	\$0b00110000,@T[6],@T[7],@T[7]	#        [1][1] [4][3] [2][0]
	vpblendd	\$0b11000000,@T[5],$A31,$A31	# [3][2] [1][3] [4][4] [2][0]
	vpblendd	\$0b11000000,@T[6],@T[8],@T[8]	# [3][3] [1][4] [4][0] [2][1]
	vpblendd	\$0b11000000,@T[6],$A41,$A41	# [3][3] [1][0] [4][2] [2][4]
	vpblendd	\$0b11000000,@T[4],@T[7],@T[7]	# [3][4] [1][1] [4][3] [2][0]
	vpternlogq	\$0xC6,@T[8],@T[3],$A31		# [3][1] [1][2] [4][3] [2][4]
	vpternlogq	\$0xC6,@T[7],@T[5],$A41		# [3][2] [1][4] [4][1] [2][3]

	vpsrldq		\$8,@T[1],@T[0]
	vpandn		@T[0],@T[1],@T[0]	# targeting [0][0] [0][0] [0][0] [0][0]

	vpblendd	\$0b00001100,@T[2],@T[5],$A11	#               [4][0] [2][3]
	vpblendd	\$0b00001100,@T[5],@T[3],@T[8]	#               [4][1] [2][4]
	vpblendd	\$0b00110000,@T[3],$A11,$A11	#        [1][2] [4][0] [2][3]
	vpblendd	\$0b00110000,@T[4],@T[8],@T[8]	#        [1][3] [4][1] [2][4]
	vpblendd	\$0b11000000,@T[4],$A11,$A11	# [3][4] [1][2] [4][0] [2][3]
	vpblendd	\$0b11000000,@T[2],@T[8],@T[8]	# [3][0] [1][3] [4][1] [2][4]
	vpternlogq	\$0xC6,@T[8],@T[6],$A11		# [3][3] [1][1] [4][4] [2][2]

	vpermq		\$0b00011110,@T[1],$A21		# [0][1] [0][2] [0][4] [0][3]
	vpblendd	\$0b00110000,$A00,$A21,@T[8]	# [0][1] [0][0] [0][4] [0][3]
	vpermq		\$0b00111001,@T[1],$A01		# [0][1] [0][4] [0][3] [0][2]
	vpblendd	\$0b11000000,$A00,$A01,$A01	# [0][0] [0][4] [0][3] [0][2]

	vpblendd	\$0b00001100,@T[5],@T[4],$A20	#               [4][1] [2][1]
	vpblendd	\$0b00001100,@T[4],@T[6],@T[7]	#               [4][2] [2][2]
	vpblendd	\$0b00110000,@T[6],$A20,$A20	#        [1][1] [4][1] [2][1]
	vpblendd	\$0b00110000,@T[3],@T[7],@T[7]	#        [1][2] [4][2] [2][2]
	vpblendd	\$0b11000000,@T[3],$A20,$A20	# [3][1] [1][1] [4][1] [2][1]
	vpblendd	\$0b11000000,@T[5],@T[7],@T[7]	# [3][2] [1][2] [4][2] [2][2]
	vpternlogq	\$0xC6,@T[7],@T[2],$A20		# [3][0] [1][0] [4][0] [2][0]

	vpermq		\$0b00000000,@T[0],@T[0]	# [0][0] [0][0] [0][0] [0][0]
	vpermq		\$0b00011011,$A31,$A31		# post-Chi shuffle
	vpermq		\$0b10001101,$A41,$A41
	vpermq		\$0b01110010,$A11,$A11

	vpblendd	\$0b00001100,@T[3],@T[6],$A21	#               [4][3] [2][2]
	vpblendd	\$0b00001100,@T[6],@T[5],@T[7]	#               [4][4] [2][3]
	vpblendd	\$0b00110000,@T[5],$A21,$A21	#        [1][4] [4][3] [2][2]
	vpblendd	\$0b00110000,@T[2],@T[7],@T[7]	#        [1][0] [4][4] [2][3]
	vpblendd	\$0b11000000,@T[2],$A21,$A21	# [3][0] [1][4] [4][3] [2][2]
	vpblendd	\$0b11000000,@T[3],@T[7],@T[7]	# [3][1] [1][0] [4][4] [2][3]

	vpternlogq	\$0xC6,@T[8],@T[1],$A01		# [0][4] [0][3] [0][2] [0][1]
	vpternlogq	\$0xC6,@T[7],@T[4],$A21		# [3][4] [1][3] [4][2] [2][1]

	######################################### Iota
	vpternlogq	\$0x96,(%r10),@T[0],$A00
	lea		32(%r10),%r10

	dec		%eax
	jnz		.Loop_avx512vl

	ret
.size	__KeccakF1600,.-__KeccakF1600
___
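
# For reference only: a minimal plain-Perl sketch of one Keccak-f[1600]
# round, added to illustrate the Theta, Rho, Pi, Chi and Iota steps that
# __KeccakF1600 above performs on the interleaved 4-lane register layout.
# It is never called and contributes nothing to the generated assembly, and
# it assumes a 64-bit perl. The helper names ROL64 and KeccakF1600_round_ref
# are local to this sketch; @rho holds the standard rotation constants
# indexed as [row][column], matching the [i][j] comments used throughout
# this module, and $rc is one round constant from the iotas table below.
{
my @rho = ([ 0,  1, 62, 28, 27],
	   [36, 44,  6, 55, 20],
	   [ 3, 10, 43, 25, 39],
	   [41, 45, 15, 21,  8],
	   [18,  2, 61, 56, 14]);

sub ROL64 { my ($v,$n) = @_; $n %= 64;
	    $n ? ((($v<<$n) | ($v>>(64-$n))) & 0xFFFFFFFFFFFFFFFF) : $v; }

sub KeccakF1600_round_ref {
    my ($A,$rc) = @_;			# $A->[$row][$col], 25 64-bit lanes
    my (@C,@D,@B);

    for my $x (0..4) {			# Theta: column parities...
	$C[$x] = $A->[0][$x]^$A->[1][$x]^$A->[2][$x]^$A->[3][$x]^$A->[4][$x];
    }
    for my $x (0..4) {			# ...combined and folded back in
	$D[$x] = $C[($x+4)%5] ^ ROL64($C[($x+1)%5],1);
	$A->[$_][$x] ^= $D[$x] for (0..4);
    }
    for my $y (0..4) {			# Rho + Pi: rotate and permute lanes
	for my $x (0..4) {
	    $B[(2*$x+3*$y)%5][$y] = ROL64($A->[$y][$x],$rho[$y][$x]);
	}
    }
    for my $y (0..4) {			# Chi: non-linear mix within each row
	for my $x (0..4) {
	    $A->[$y][$x] = $B[$y][$x] ^ (~$B[$y][($x+1)%5] & $B[$y][($x+2)%5]);
	}
    }
    $A->[0][0] ^= $rc;			# Iota: inject the round constant
}
}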

my ($A_flat,$inp,$len,$bsz) = ("%rdi","%rsi","%rdx","%rcx");
my $out = $inp;	# in squeeze
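
# SHA3_absorb and SHA3_squeeze below share this argument layout: $A_flat
# (%rdi) points to the opaque 200-byte Keccak-1600 state, which this module
# keeps in the jagged order described above; $inp/$out (%rsi) is the data
# pointer, $len (%rdx) the number of bytes to process, and $bsz (%rcx) the
# block (rate) size in bytes. SHA3_absorb returns in %rax the number of
# trailing input bytes that did not fill a complete block, while
# SHA3_squeeze simply writes $len output bytes, running __KeccakF1600
# whenever a block's worth of output has been exhausted.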

$code.=<<___;
.globl	SHA3_absorb
.type	SHA3_absorb,\@function
.align	32
SHA3_absorb:
	mov		%rsp,%r11

	lea		-240(%rsp),%rsp
	and		\$-32,%rsp

	lea		96($A_flat),$A_flat
	lea		96($inp),$inp
	lea		96(%rsp),%r10
	lea		rhotates_left(%rip),%r8

	vzeroupper

	vpbroadcastq	-96($A_flat),$A00	# load A[5][5]
	vmovdqu		8+32*0-96($A_flat),$A01
	vmovdqu		8+32*1-96($A_flat),$A20
	vmovdqu		8+32*2-96($A_flat),$A31
	vmovdqu		8+32*3-96($A_flat),$A21
	vmovdqu		8+32*4-96($A_flat),$A41
	vmovdqu		8+32*5-96($A_flat),$A11

	vmovdqa64	0*32(%r8),$R20		# load "rhotate" indices
	vmovdqa64	1*32(%r8),$R01
	vmovdqa64	2*32(%r8),$R31
	vmovdqa64	3*32(%r8),$R21
	vmovdqa64	4*32(%r8),$R41
	vmovdqa64	5*32(%r8),$R11

	vpxor		@T[0],@T[0],@T[0]
	vmovdqa		@T[0],32*2-96(%r10)	# zero transfer area on stack
	vmovdqa		@T[0],32*3-96(%r10)
	vmovdqa		@T[0],32*4-96(%r10)
	vmovdqa		@T[0],32*5-96(%r10)
	vmovdqa		@T[0],32*6-96(%r10)

.Loop_absorb_avx512vl:
	mov		$bsz,%rax
	sub		$bsz,$len
	jc		.Ldone_absorb_avx512vl

	shr		\$3,%eax
	vpbroadcastq	0-96($inp),@T[0]
	vmovdqu		8-96($inp),@T[1]
	sub		\$4,%eax
___
for(my $i=5; $i<25; $i++) {
$code.=<<___
	dec		%eax
	jz		.Labsorbed_avx512vl
	mov		8*$i-96($inp),%r8
	mov		%r8,$A_jagged[$i]-96(%r10)
___
}
$code.=<<___;
.Labsorbed_avx512vl:
	lea		($inp,$bsz),$inp

	vpxor		@T[0],$A00,$A00
	vpxor		@T[1],$A01,$A01
	vpxor		32*2-96(%r10),$A20,$A20
	vpxor		32*3-96(%r10),$A31,$A31
	vpxor		32*4-96(%r10),$A21,$A21
	vpxor		32*5-96(%r10),$A41,$A41
	vpxor		32*6-96(%r10),$A11,$A11

	call		__KeccakF1600

	lea		96(%rsp),%r10
	jmp		.Loop_absorb_avx512vl

.Ldone_absorb_avx512vl:
	vmovq		%xmm0,-96($A_flat)
	vmovdqu		$A01,8+32*0-96($A_flat)
	vmovdqu		$A20,8+32*1-96($A_flat)
	vmovdqu		$A31,8+32*2-96($A_flat)
	vmovdqu		$A21,8+32*3-96($A_flat)
	vmovdqu		$A41,8+32*4-96($A_flat)
	vmovdqu		$A11,8+32*5-96($A_flat)

	vzeroupper

	lea		(%r11),%rsp
	lea		($len,$bsz),%rax	# return value
	ret
.size	SHA3_absorb,.-SHA3_absorb

.globl	SHA3_squeeze
.type	SHA3_squeeze,\@function
.align	32
SHA3_squeeze:
	mov		%rsp,%r11

	lea		96($A_flat),$A_flat
	lea		rhotates_left(%rip),%r8
	shr		\$3,$bsz

	vzeroupper

	vpbroadcastq	-96($A_flat),$A00
	vpxor		@T[0],@T[0],@T[0]
	vmovdqu		8+32*0-96($A_flat),$A01
	vmovdqu		8+32*1-96($A_flat),$A20
	vmovdqu		8+32*2-96($A_flat),$A31
	vmovdqu		8+32*3-96($A_flat),$A21
	vmovdqu		8+32*4-96($A_flat),$A41
	vmovdqu		8+32*5-96($A_flat),$A11

	vmovdqa64	0*32(%r8),$R20		# load "rhotate" indices
	vmovdqa64	1*32(%r8),$R01
	vmovdqa64	2*32(%r8),$R31
	vmovdqa64	3*32(%r8),$R21
	vmovdqa64	4*32(%r8),$R41
	vmovdqa64	5*32(%r8),$R11

	mov		$bsz,%rax

.Loop_squeeze_avx512vl:
	mov		@A_jagged[0]-96($A_flat),%r8
___
for (my $i=0; $i<25; $i++) {
$code.=<<___;
	sub		\$8,$len
	jc		.Ltail_squeeze_avx512vl
	mov		%r8,($out)
	lea		8($out),$out
	je		.Ldone_squeeze_avx512vl
	dec		%eax
	je		.Lextend_output_avx512vl
	mov		@A_jagged[$i+1]-120($A_flat),%r8
___
}
$code.=<<___;
.Lextend_output_avx512vl:
	call		__KeccakF1600

	vmovq		%xmm0,-96($A_flat)
	vmovdqu		$A01,8+32*0-96($A_flat)
	vmovdqu		$A20,8+32*1-96($A_flat)
	vmovdqu		$A31,8+32*2-96($A_flat)
	vmovdqu		$A21,8+32*3-96($A_flat)
	vmovdqu		$A41,8+32*4-96($A_flat)
	vmovdqu		$A11,8+32*5-96($A_flat)

	mov		$bsz,%rax
	jmp		.Loop_squeeze_avx512vl

.Ltail_squeeze_avx512vl:
	add		\$8,$len
.Loop_tail_avx512vl:
	mov		%r8b,($out)
	lea		1($out),$out
	shr		\$8,%r8
	dec		$len
	jnz		.Loop_tail_avx512vl

.Ldone_squeeze_avx512vl:
	vzeroupper

	lea		(%r11),%rsp
	ret
.size	SHA3_squeeze,.-SHA3_squeeze

.align	64
rhotates_left:
	.quad	3,	18,	36,	41	# [2][0] [4][0] [1][0] [3][0]
	.quad	1,	62,	28,	27	# [0][1] [0][2] [0][3] [0][4]
	.quad	45,	6,	56,	39	# [3][1] [1][2] [4][3] [2][4]
	.quad	10,	61,	55,	8	# [2][1] [4][2] [1][3] [3][4]
	.quad	2,	15,	25,	20	# [4][1] [3][2] [2][3] [1][4]
	.quad	44,	43,	21,	14	# [1][1] [2][2] [3][3] [4][4]
iotas:
	.quad	0x0000000000000001, 0x0000000000000001, 0x0000000000000001, 0x0000000000000001
	.quad	0x0000000000008082, 0x0000000000008082, 0x0000000000008082, 0x0000000000008082
	.quad	0x800000000000808a, 0x800000000000808a, 0x800000000000808a, 0x800000000000808a
	.quad	0x8000000080008000, 0x8000000080008000, 0x8000000080008000, 0x8000000080008000
	.quad	0x000000000000808b, 0x000000000000808b, 0x000000000000808b, 0x000000000000808b
	.quad	0x0000000080000001, 0x0000000080000001, 0x0000000080000001, 0x0000000080000001
	.quad	0x8000000080008081, 0x8000000080008081, 0x8000000080008081, 0x8000000080008081
	.quad	0x8000000000008009, 0x8000000000008009, 0x8000000000008009, 0x8000000000008009
	.quad	0x000000000000008a, 0x000000000000008a, 0x000000000000008a, 0x000000000000008a
	.quad	0x0000000000000088, 0x0000000000000088, 0x0000000000000088, 0x0000000000000088
	.quad	0x0000000080008009, 0x0000000080008009, 0x0000000080008009, 0x0000000080008009
	.quad	0x000000008000000a, 0x000000008000000a, 0x000000008000000a, 0x000000008000000a
	.quad	0x000000008000808b, 0x000000008000808b, 0x000000008000808b, 0x000000008000808b
	.quad	0x800000000000008b, 0x800000000000008b, 0x800000000000008b, 0x800000000000008b
	.quad	0x8000000000008089, 0x8000000000008089, 0x8000000000008089, 0x8000000000008089
	.quad	0x8000000000008003, 0x8000000000008003, 0x8000000000008003, 0x8000000000008003
	.quad	0x8000000000008002, 0x8000000000008002, 0x8000000000008002, 0x8000000000008002
	.quad	0x8000000000000080, 0x8000000000000080, 0x8000000000000080, 0x8000000000000080
	.quad	0x000000000000800a, 0x000000000000800a, 0x000000000000800a, 0x000000000000800a
	.quad	0x800000008000000a, 0x800000008000000a, 0x800000008000000a, 0x800000008000000a
	.quad	0x8000000080008081, 0x8000000080008081, 0x8000000080008081, 0x8000000080008081
	.quad	0x8000000000008080, 0x8000000000008080, 0x8000000000008080, 0x8000000000008080
	.quad	0x0000000080000001, 0x0000000080000001, 0x0000000080000001, 0x0000000080000001
	.quad	0x8000000080008008, 0x8000000080008008, 0x8000000080008008, 0x8000000080008008

.asciz	"Keccak-1600 absorb and squeeze for AVX512VL, CRYPTOGAMS by <appro\@openssl.org>"
___

$output=pop;
open STDOUT,">$output";
print $code;
close STDOUT;