#!/usr/bin/env perl
# Copyright 2017-2020 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License").  You may not use
# this file except in compliance with the License.  You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html
#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# Keccak-1600 for ARMv8.
#
# June 2017.
#
# This is a straightforward KECCAK_1X_ALT implementation. It makes no
# sense to attempt a SIMD/NEON implementation for the following reason.
# 64-bit lanes of vector registers can't be addressed as easily as in
# 32-bit mode. This means that 64-bit NEON is bound to be slower than
# 32-bit NEON, and this implementation is faster than 32-bit NEON on
# the same processor. Even though it takes more scalar xor's and andn's,
# it gets compensated by availability of rotate. Not to forget that
# most processors achieve higher issue rate with scalar instructions.
#
# February 2018.
#
# Add hardware-assisted ARMv8.2 implementation. It's a KECCAK_1X_ALT
# variant with a register permutation/rotation twist that allows
# eliminating copies to temporary registers. If you look closely you'll
# notice that it uses only one lane of vector registers. The new
# instructions effectively facilitate parallel hashing, which we don't
# support [yet?]. But the lowest-level core procedure is prepared for it.
# The inner round is 67 [vector] instructions, so it's not actually
# obvious that it will provide performance improvement [in serial
# hash] as long as vector instructions issue rate is limited to 1 per
# cycle...
#
######################################################################
# Numbers are cycles per processed byte.
#
#		r=1088(*)
#
# Cortex-A53	13
# Cortex-A57	12
# X-Gene	14
# Mongoose	10
# Kryo		12
# Denver	7.8
# Apple A7	7.2
# ThunderX2	9.7
#
# (*)	Corresponds to SHA3-256. No improvement coefficients are listed
#	because they vary too much from compiler to compiler. Newer
#	compiler does much better and improvement varies from 5% on
#	Cortex-A57 to 25% on Cortex-A53. While in comparison to older
#	compiler this code is at least 2x faster...

# $output is the last argument if it looks like a file (it has an extension)
# $flavour is the first argument if it doesn't look like a file
$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;

# Locate the arm-xlate.pl translator, first next to this script and
# then in ../../perlasm/ relative to it.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
die "can't locate arm-xlate.pl";

# Everything printed to STDOUT from here on is piped through the
# translator.  $xlate is quoted, like $^X and $output, so that a source
# tree located under a path containing spaces does not split the
# command line.
open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""
    or die "can't call $xlate: $!";
*STDOUT=*OUT;

# Rotation amounts applied in the Rho step, indexed the same way as the
# state array, i.e. $rhotates[i][j] belongs to A[i][j].
my @rhotates = ([  0,  1, 62, 28, 27 ],
                [ 36, 44,  6, 55, 20 ],
                [  3, 10, 43, 25, 39 ],
                [ 41, 45, 15, 21,  8 ],
                [ 18,  2, 61, 56, 14 ]);

# Round constants for the Iota step.  The table is preceded by 64 bytes
# of zero padding and holds 24 8-byte constants (192 bytes), 256 bytes
# in total.  Presumably ".align 8" yields 256-byte alignment on this
# target, in which case the post-incremented table pointer has its low
# 8 bits clear exactly after the last constant has been fetched;
# KeccakF1600_int relies on that ("tst ...,#255") to end its loop.
$code.=<<___;
#include "arm_arch.h"

.text

.align 8	// strategic alignment and padding that allows to use
		// address value as loop termination condition...
	.quad	0,0,0,0,0,0,0,0
.type	iotas,%object
iotas:
	.quad	0x0000000000000001
	.quad	0x0000000000008082
	.quad	0x800000000000808a
	.quad	0x8000000080008000
	.quad	0x000000000000808b
	.quad	0x0000000080000001
	.quad	0x8000000080008081
	.quad	0x8000000000008009
	.quad	0x000000000000008a
	.quad	0x0000000000000088
	.quad	0x0000000080008009
	.quad	0x000000008000000a
	.quad	0x000000008000808b
	.quad	0x800000000000008b
	.quad	0x8000000000008089
	.quad	0x8000000000008003
	.quad	0x8000000000008002
	.quad	0x8000000000000080
	.quad	0x000000000000800a
	.quad	0x800000008000000a
	.quad	0x8000000080008081
	.quad	0x8000000000008080
	.quad	0x0000000080000001
	.quad	0x8000000080008008
.size	iotas,.-iotas
___
								{{{
######################################################################
# Scalar implementation.
#
# The 5x5 state lives in general-purpose registers: $A[i][j] is
# x(5*i+j), except that $A[3][3] is x25 because x18 is reserved.
# @C are four scratch registers (x26, x27, x28, x30).
my @A = map([ "x$_", "x".($_+1), "x".($_+2), "x".($_+3), "x".($_+4) ],
            (0, 5, 10, 15, 20));
   $A[3][3] = "x25"; # x18 is reserved

my @C = map("x$_", (26,27,28,30));

# KeccakF1600_int: permutation core operating on the state held in
# @A registers.  It uses the 32 bytes at [sp,#0]..[sp,#31] provided by
# its caller: [sp,#0] temporarily holds A[0][4] and A[1][4],
# [sp,#16] the running pointer into iotas and [sp,#24] the return
# address.  The loop runs until the iotas pointer's low 8 bits become
# zero (see the note at the iotas table).
$code.=<<___;
.type	KeccakF1600_int,%function
.align	5
KeccakF1600_int:
	AARCH64_SIGN_LINK_REGISTER
	adr	$C[2],iotas
	stp	$C[2],x30,[sp,#16]		// 32 bytes on top are mine
	b	.Loop
.align	4
.Loop:
	////////////////////////////////////////// Theta
	eor	$C[0],$A[0][0],$A[1][0]
	stp	$A[0][4],$A[1][4],[sp,#0]	// offload pair...
	eor	$C[1],$A[0][1],$A[1][1]
	eor	$C[2],$A[0][2],$A[1][2]
	eor	$C[3],$A[0][3],$A[1][3]
___
	# A[0][4] and A[1][4] were just saved on the stack, so their
	# registers double as two extra scratch registers until the pair
	# is re-loaded below.
	$C[4]=$A[0][4];
	$C[5]=$A[1][4];
$code.=<<___;
	eor	$C[4],$A[0][4],$A[1][4]
	eor	$C[0],$C[0],$A[2][0]
	eor	$C[1],$C[1],$A[2][1]
	eor	$C[2],$C[2],$A[2][2]
	eor	$C[3],$C[3],$A[2][3]
	eor	$C[4],$C[4],$A[2][4]
	eor	$C[0],$C[0],$A[3][0]
	eor	$C[1],$C[1],$A[3][1]
	eor	$C[2],$C[2],$A[3][2]
	eor	$C[3],$C[3],$A[3][3]
	eor	$C[4],$C[4],$A[3][4]
	eor	$C[0],$C[0],$A[4][0]
	eor	$C[2],$C[2],$A[4][2]
	eor	$C[1],$C[1],$A[4][1]
	eor	$C[3],$C[3],$A[4][3]
	eor	$C[4],$C[4],$A[4][4]

	eor	$C[5],$C[0],$C[2],ror#63

	eor	$A[0][1],$A[0][1],$C[5]
	eor	$A[1][1],$A[1][1],$C[5]
	eor	$A[2][1],$A[2][1],$C[5]
	eor	$A[3][1],$A[3][1],$C[5]
	eor	$A[4][1],$A[4][1],$C[5]

	eor	$C[5],$C[1],$C[3],ror#63
	eor	$C[2],$C[2],$C[4],ror#63
	eor	$C[3],$C[3],$C[0],ror#63
	eor	$C[4],$C[4],$C[1],ror#63

	eor	$C[1],   $A[0][2],$C[5]		// mov	$C[1],$A[0][2]
	eor	$A[1][2],$A[1][2],$C[5]
	eor	$A[2][2],$A[2][2],$C[5]
	eor	$A[3][2],$A[3][2],$C[5]
	eor	$A[4][2],$A[4][2],$C[5]

	eor	$A[0][0],$A[0][0],$C[4]
	eor	$A[1][0],$A[1][0],$C[4]
	eor	$A[2][0],$A[2][0],$C[4]
	eor	$A[3][0],$A[3][0],$C[4]
	eor	$A[4][0],$A[4][0],$C[4]
___
	# The borrowed registers go back to being A[0][4] and A[1][4].
	$C[4]=undef;
	$C[5]=undef;
$code.=<<___;
	ldp	$A[0][4],$A[1][4],[sp,#0]	// re-load offloaded data
	eor	$C[0],   $A[0][3],$C[2]		// mov	$C[0],$A[0][3]
	eor	$A[1][3],$A[1][3],$C[2]
	eor	$A[2][3],$A[2][3],$C[2]
	eor	$A[3][3],$A[3][3],$C[2]
	eor	$A[4][3],$A[4][3],$C[2]

	eor	$C[2],   $A[0][4],$C[3]		// mov	$C[2],$A[0][4]
	eor	$A[1][4],$A[1][4],$C[3]
	eor	$A[2][4],$A[2][4],$C[3]
	eor	$A[3][4],$A[3][4],$C[3]
	eor	$A[4][4],$A[4][4],$C[3]

	////////////////////////////////////////// Rho+Pi
	mov	$C[3],$A[0][1]
	ror	$A[0][1],$A[1][1],#64-$rhotates[1][1]
	//mov	$C[1],$A[0][2]
	ror	$A[0][2],$A[2][2],#64-$rhotates[2][2]
	//mov	$C[0],$A[0][3]
	ror	$A[0][3],$A[3][3],#64-$rhotates[3][3]
	//mov	$C[2],$A[0][4]
	ror	$A[0][4],$A[4][4],#64-$rhotates[4][4]

	ror	$A[1][1],$A[1][4],#64-$rhotates[1][4]
	ror	$A[2][2],$A[2][3],#64-$rhotates[2][3]
	ror	$A[3][3],$A[3][2],#64-$rhotates[3][2]
	ror	$A[4][4],$A[4][1],#64-$rhotates[4][1]

	ror	$A[1][4],$A[4][2],#64-$rhotates[4][2]
	ror	$A[2][3],$A[3][4],#64-$rhotates[3][4]
	ror	$A[3][2],$A[2][1],#64-$rhotates[2][1]
	ror	$A[4][1],$A[1][3],#64-$rhotates[1][3]

	ror	$A[4][2],$A[2][4],#64-$rhotates[2][4]
	ror	$A[3][4],$A[4][3],#64-$rhotates[4][3]
	ror	$A[2][1],$A[1][2],#64-$rhotates[1][2]
	ror	$A[1][3],$A[3][1],#64-$rhotates[3][1]

	ror	$A[2][4],$A[4][0],#64-$rhotates[4][0]
	ror	$A[4][3],$A[3][0],#64-$rhotates[3][0]
	ror	$A[1][2],$A[2][0],#64-$rhotates[2][0]
	ror	$A[3][1],$A[1][0],#64-$rhotates[1][0]

	ror	$A[1][0],$C[0],#64-$rhotates[0][3]
	ror	$A[2][0],$C[3],#64-$rhotates[0][1]
	ror	$A[3][0],$C[2],#64-$rhotates[0][4]
	ror	$A[4][0],$C[1],#64-$rhotates[0][2]

	////////////////////////////////////////// Chi+Iota
	bic	$C[0],$A[0][2],$A[0][1]
	bic	$C[1],$A[0][3],$A[0][2]
	bic	$C[2],$A[0][0],$A[0][4]
	bic	$C[3],$A[0][1],$A[0][0]
	eor	$A[0][0],$A[0][0],$C[0]
	bic	$C[0],$A[0][4],$A[0][3]
	eor	$A[0][1],$A[0][1],$C[1]
	 ldr	$C[1],[sp,#16]
	eor	$A[0][3],$A[0][3],$C[2]
	eor	$A[0][4],$A[0][4],$C[3]
	eor	$A[0][2],$A[0][2],$C[0]
	 ldr	$C[3],[$C[1]],#8		// Iota[i++]

	bic	$C[0],$A[1][2],$A[1][1]
	 tst	$C[1],#255			// are we done?
	 str	$C[1],[sp,#16]
	bic	$C[1],$A[1][3],$A[1][2]
	bic	$C[2],$A[1][0],$A[1][4]
	 eor	$A[0][0],$A[0][0],$C[3]		// A[0][0] ^= Iota
	bic	$C[3],$A[1][1],$A[1][0]
	eor	$A[1][0],$A[1][0],$C[0]
	bic	$C[0],$A[1][4],$A[1][3]
	eor	$A[1][1],$A[1][1],$C[1]
	eor	$A[1][3],$A[1][3],$C[2]
	eor	$A[1][4],$A[1][4],$C[3]
	eor	$A[1][2],$A[1][2],$C[0]

	bic	$C[0],$A[2][2],$A[2][1]
	bic	$C[1],$A[2][3],$A[2][2]
	bic	$C[2],$A[2][0],$A[2][4]
	bic	$C[3],$A[2][1],$A[2][0]
	eor	$A[2][0],$A[2][0],$C[0]
	bic	$C[0],$A[2][4],$A[2][3]
	eor	$A[2][1],$A[2][1],$C[1]
	eor	$A[2][3],$A[2][3],$C[2]
	eor	$A[2][4],$A[2][4],$C[3]
	eor	$A[2][2],$A[2][2],$C[0]

	bic	$C[0],$A[3][2],$A[3][1]
	bic	$C[1],$A[3][3],$A[3][2]
	bic	$C[2],$A[3][0],$A[3][4]
	bic	$C[3],$A[3][1],$A[3][0]
	eor	$A[3][0],$A[3][0],$C[0]
	bic	$C[0],$A[3][4],$A[3][3]
	eor	$A[3][1],$A[3][1],$C[1]
	eor	$A[3][3],$A[3][3],$C[2]
	eor	$A[3][4],$A[3][4],$C[3]
	eor	$A[3][2],$A[3][2],$C[0]

	bic	$C[0],$A[4][2],$A[4][1]
	bic	$C[1],$A[4][3],$A[4][2]
	bic	$C[2],$A[4][0],$A[4][4]
	bic	$C[3],$A[4][1],$A[4][0]
	eor	$A[4][0],$A[4][0],$C[0]
	bic	$C[0],$A[4][4],$A[4][3]
	eor	$A[4][1],$A[4][1],$C[1]
	eor	$A[4][3],$A[4][3],$C[2]
	eor	$A[4][4],$A[4][4],$C[3]
	eor	$A[4][2],$A[4][2],$C[0]

	bne	.Loop

	ldr	x30,[sp,#24]
	AARCH64_VALIDATE_LINK_REGISTER
	ret
.size	KeccakF1600_int,.-KeccakF1600_int

___

# KeccakF1600: file-local wrapper (no .globl) taking the pointer to the
# 25-lane state in x0.  It saves x19-x28 and x29/x30, reserves 48 bytes
# of stack (32 for KeccakF1600_int plus the saved argument at
# [sp,#32]), loads the state into @A, runs KeccakF1600_int and stores
# the state back.
$code.=<<___;
.type	KeccakF1600,%function
.align	5
KeccakF1600:
	AARCH64_SIGN_LINK_REGISTER
	stp	x29,x30,[sp,#-128]!
	add	x29,sp,#0
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]
	stp	x23,x24,[sp,#48]
	stp	x25,x26,[sp,#64]
	stp	x27,x28,[sp,#80]
	sub	sp,sp,#48

	str	x0,[sp,#32]			// offload argument
	mov	$C[0],x0
	ldp	$A[0][0],$A[0][1],[x0,#16*0]
	ldp	$A[0][2],$A[0][3],[$C[0],#16*1]
	ldp	$A[0][4],$A[1][0],[$C[0],#16*2]
	ldp	$A[1][1],$A[1][2],[$C[0],#16*3]
	ldp	$A[1][3],$A[1][4],[$C[0],#16*4]
	ldp	$A[2][0],$A[2][1],[$C[0],#16*5]
	ldp	$A[2][2],$A[2][3],[$C[0],#16*6]
	ldp	$A[2][4],$A[3][0],[$C[0],#16*7]
	ldp	$A[3][1],$A[3][2],[$C[0],#16*8]
	ldp	$A[3][3],$A[3][4],[$C[0],#16*9]
	ldp	$A[4][0],$A[4][1],[$C[0],#16*10]
	ldp	$A[4][2],$A[4][3],[$C[0],#16*11]
	ldr	$A[4][4],[$C[0],#16*12]

	bl	KeccakF1600_int

	ldr	$C[0],[sp,#32]
	stp	$A[0][0],$A[0][1],[$C[0],#16*0]
	stp	$A[0][2],$A[0][3],[$C[0],#16*1]
	stp	$A[0][4],$A[1][0],[$C[0],#16*2]
	stp	$A[1][1],$A[1][2],[$C[0],#16*3]
	stp	$A[1][3],$A[1][4],[$C[0],#16*4]
	stp	$A[2][0],$A[2][1],[$C[0],#16*5]
	stp	$A[2][2],$A[2][3],[$C[0],#16*6]
	stp	$A[2][4],$A[3][0],[$C[0],#16*7]
	stp	$A[3][1],$A[3][2],[$C[0],#16*8]
	stp	$A[3][3],$A[3][4],[$C[0],#16*9]
	stp	$A[4][0],$A[4][1],[$C[0],#16*10]
	stp	$A[4][2],$A[4][3],[$C[0],#16*11]
	str	$A[4][4],[$C[0],#16*12]

	ldp	x19,x20,[x29,#16]
	add	sp,sp,#48
	ldp	x21,x22,[x29,#32]
	ldp	x23,x24,[x29,#48]
	ldp	x25,x26,[x29,#64]
	ldp	x27,x28,[x29,#80]
	ldp	x29,x30,[sp],#128
	AARCH64_VALIDATE_LINK_REGISTER
	ret
.size	KeccakF1600,.-KeccakF1600

___

# SHA3_absorb(x0 = uint64_t A[5][5], x1 = const void *inp,
#             x2 = size_t len, x3 = size_t bsz)
#
# While len >= bsz, XORs bsz bytes of input into the state one 64-bit
# lane at a time (byte-reversed on big-endian builds) and runs
# KeccakF1600_int.  The unrolled lane loop below compares bsz against
# the number of bytes consumed so far and branches to .Lprocess_block
# as soon as a whole block has been absorbed.  The number of bytes
# left unprocessed is returned in x0.
$code.=<<___;
.globl	SHA3_absorb
.type	SHA3_absorb,%function
.align	5
SHA3_absorb:
	AARCH64_SIGN_LINK_REGISTER
	stp	x29,x30,[sp,#-128]!
	add	x29,sp,#0
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]
	stp	x23,x24,[sp,#48]
	stp	x25,x26,[sp,#64]
	stp	x27,x28,[sp,#80]
	sub	sp,sp,#64

	stp	x0,x1,[sp,#32]			// offload arguments
	stp	x2,x3,[sp,#48]

	mov	$C[0],x0			// uint64_t A[5][5]
	mov	$C[1],x1			// const void *inp
	mov	$C[2],x2			// size_t len
	mov	$C[3],x3			// size_t bsz
	ldp	$A[0][0],$A[0][1],[$C[0],#16*0]
	ldp	$A[0][2],$A[0][3],[$C[0],#16*1]
	ldp	$A[0][4],$A[1][0],[$C[0],#16*2]
	ldp	$A[1][1],$A[1][2],[$C[0],#16*3]
	ldp	$A[1][3],$A[1][4],[$C[0],#16*4]
	ldp	$A[2][0],$A[2][1],[$C[0],#16*5]
	ldp	$A[2][2],$A[2][3],[$C[0],#16*6]
	ldp	$A[2][4],$A[3][0],[$C[0],#16*7]
	ldp	$A[3][1],$A[3][2],[$C[0],#16*8]
	ldp	$A[3][3],$A[3][4],[$C[0],#16*9]
	ldp	$A[4][0],$A[4][1],[$C[0],#16*10]
	ldp	$A[4][2],$A[4][3],[$C[0],#16*11]
	ldr	$A[4][4],[$C[0],#16*12]
	b	.Loop_absorb

.align	4
.Loop_absorb:
	subs	$C[0],$C[2],$C[3]		// len - bsz
	blo	.Labsorbed

	str	$C[0],[sp,#48]			// save len - bsz
___
# Lanes 0..23 are absorbed in pairs; after each pair member the flags
# of "cmp bsz,#8*($i+2)" decide whether the block is complete.
for (my $i=0; $i<24; $i+=2) {
my $j = $i+1;
$code.=<<___;
	ldr	$C[0],[$C[1]],#8		// *inp++
#ifdef	__AARCH64EB__
	rev	$C[0],$C[0]
#endif
	eor	$A[$i/5][$i%5],$A[$i/5][$i%5],$C[0]
	cmp	$C[3],#8*($i+2)
	blo	.Lprocess_block
	ldr	$C[0],[$C[1]],#8		// *inp++
#ifdef	__AARCH64EB__
	rev	$C[0],$C[0]
#endif
	eor	$A[$j/5][$j%5],$A[$j/5][$j%5],$C[0]
	beq	.Lprocess_block
___
}
$code.=<<___;
	ldr	$C[0],[$C[1]],#8		// *inp++
#ifdef	__AARCH64EB__
	rev	$C[0],$C[0]
#endif
	eor	$A[4][4],$A[4][4],$C[0]

.Lprocess_block:
	str	$C[1],[sp,#40]			// save inp

	bl	KeccakF1600_int

	ldr	$C[1],[sp,#40]			// restore arguments
	ldp	$C[2],$C[3],[sp,#48]
	b	.Loop_absorb

.align	4
.Labsorbed:
	ldr	$C[1],[sp,#32]
	stp	$A[0][0],$A[0][1],[$C[1],#16*0]
	stp	$A[0][2],$A[0][3],[$C[1],#16*1]
	stp	$A[0][4],$A[1][0],[$C[1],#16*2]
	stp	$A[1][1],$A[1][2],[$C[1],#16*3]
	stp	$A[1][3],$A[1][4],[$C[1],#16*4]
	stp	$A[2][0],$A[2][1],[$C[1],#16*5]
	stp	$A[2][2],$A[2][3],[$C[1],#16*6]
	stp	$A[2][4],$A[3][0],[$C[1],#16*7]
	stp	$A[3][1],$A[3][2],[$C[1],#16*8]
	stp	$A[3][3],$A[3][4],[$C[1],#16*9]
	stp	$A[4][0],$A[4][1],[$C[1],#16*10]
	stp	$A[4][2],$A[4][3],[$C[1],#16*11]
	str	$A[4][4],[$C[1],#16*12]

	mov	x0,$C[2]			// return value
	ldp	x19,x20,[x29,#16]
	add	sp,sp,#64
	ldp	x21,x22,[x29,#32]
	ldp	x23,x24,[x29,#48]
	ldp	x25,x26,[x29,#64]
	ldp	x27,x28,[x29,#80]
	ldp	x29,x30,[sp],#128
	AARCH64_VALIDATE_LINK_REGISTER
	ret
.size	SHA3_absorb,.-SHA3_absorb
___
{
# SHA3_squeeze(x0 = state, x1 = out, x2 = len, x3 = bsz)
#
# Copies len bytes of the state to out, 8 bytes at a time with a
# byte-wise tail for the last 1..7 bytes.  Whenever bsz bytes have been
# emitted and more output is wanted, KeccakF1600 is called on the state
# and copying restarts from its beginning.  The arguments are kept in
# x19-x22 across the call.
my ($A_flat,$out,$len,$bsz) = map("x$_",(19..22));
$code.=<<___;
.globl	SHA3_squeeze
.type	SHA3_squeeze,%function
.align	5
SHA3_squeeze:
	AARCH64_SIGN_LINK_REGISTER
	stp	x29,x30,[sp,#-48]!
	add	x29,sp,#0
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]

	mov	$A_flat,x0			// put aside arguments
	mov	$out,x1
	mov	$len,x2
	mov	$bsz,x3

.Loop_squeeze:
	ldr	x4,[x0],#8
	cmp	$len,#8
	blo	.Lsqueeze_tail
#ifdef	__AARCH64EB__
	rev	x4,x4
#endif
	str	x4,[$out],#8
	subs	$len,$len,#8
	beq	.Lsqueeze_done

	subs	x3,x3,#8
	bhi	.Loop_squeeze

	mov	x0,$A_flat
	bl	KeccakF1600
	mov	x0,$A_flat
	mov	x3,$bsz
	b	.Loop_squeeze

.align	4
.Lsqueeze_tail:
	strb	w4,[$out],#1
	lsr	x4,x4,#8
	subs	$len,$len,#1
	beq	.Lsqueeze_done
	strb	w4,[$out],#1
	lsr	x4,x4,#8
	subs	$len,$len,#1
	beq	.Lsqueeze_done
	strb	w4,[$out],#1
	lsr	x4,x4,#8
	subs	$len,$len,#1
	beq	.Lsqueeze_done
	strb	w4,[$out],#1
	lsr	x4,x4,#8
	subs	$len,$len,#1
	beq	.Lsqueeze_done
	strb	w4,[$out],#1
	lsr	x4,x4,#8
	subs	$len,$len,#1
	beq	.Lsqueeze_done
	strb	w4,[$out],#1
	lsr	x4,x4,#8
	subs	$len,$len,#1
	beq	.Lsqueeze_done
	strb	w4,[$out],#1

.Lsqueeze_done:
	ldp	x19,x20,[sp,#16]
	ldp	x21,x22,[sp,#32]
	ldp	x29,x30,[sp],#48
	AARCH64_VALIDATE_LINK_REGISTER
	ret
.size	SHA3_squeeze,.-SHA3_squeeze
___
}								}}}
								{{{
######################################################################
# Hardware-assisted implementation built on eor3/rax1/xar/bcax.
#
# The state lives in vector registers: $A[i][j] is v(5*i+j).16b.
# @C are v25..v31; @D aliases five of them for the Theta D[] values.
# The four mnemonics are turned into raw .inst words by unsha3() at the
# end of this script.
my @A = map([ "v".$_.".16b", "v".($_+1).".16b", "v".($_+2).".16b",
                             "v".($_+3).".16b", "v".($_+4).".16b" ],
            (0, 5, 10, 15, 20));

my @C = map("v$_.16b", (25..31));
my @D = @C[4,5,6,2,3];

# KeccakF1600_ce: 24 rounds (counted in x9) over the state in v0-v24,
# fetching round constants through x10.  Several results are parked in
# registers other than their final home during Theta+Rho+Pi; the
# trailing "X=Y" comments in the assembly track which lane a register
# holds at that point.
$code.=<<___;
.type	KeccakF1600_ce,%function
.align	5
KeccakF1600_ce:
	mov	x9,#24
	adr	x10,iotas
	b	.Loop_ce
.align	4
.Loop_ce:
	////////////////////////////////////////////////// Theta
	eor3	$C[0],$A[4][0],$A[3][0],$A[2][0]
	eor3	$C[1],$A[4][1],$A[3][1],$A[2][1]
	eor3	$C[2],$A[4][2],$A[3][2],$A[2][2]
	eor3	$C[3],$A[4][3],$A[3][3],$A[2][3]
	eor3	$C[4],$A[4][4],$A[3][4],$A[2][4]
	eor3	$C[0],$C[0],   $A[1][0],$A[0][0]
	eor3	$C[1],$C[1],   $A[1][1],$A[0][1]
	eor3	$C[2],$C[2],   $A[1][2],$A[0][2]
	eor3	$C[3],$C[3],   $A[1][3],$A[0][3]
	eor3	$C[4],$C[4],   $A[1][4],$A[0][4]

	rax1	$C[5],$C[0],$C[2]			// D[1]
	rax1	$C[6],$C[1],$C[3]			// D[2]
	rax1	$C[2],$C[2],$C[4]			// D[3]
	rax1	$C[3],$C[3],$C[0]			// D[4]
	rax1	$C[4],$C[4],$C[1]			// D[0]

	////////////////////////////////////////////////// Theta+Rho+Pi
	xar	$C[0],   $A[0][1],$D[1],#64-$rhotates[0][1] // C[0]=A[2][0]

	xar	$A[0][1],$A[1][1],$D[1],#64-$rhotates[1][1]
	xar	$A[1][1],$A[1][4],$D[4],#64-$rhotates[1][4]
	xar	$A[1][4],$A[4][2],$D[2],#64-$rhotates[4][2]
	xar	$A[4][2],$A[2][4],$D[4],#64-$rhotates[2][4]
	xar	$A[2][4],$A[4][0],$D[0],#64-$rhotates[4][0]

	xar	$C[1],   $A[0][2],$D[2],#64-$rhotates[0][2] // C[1]=A[4][0]

	xar	$A[0][2],$A[2][2],$D[2],#64-$rhotates[2][2]
	xar	$A[2][2],$A[2][3],$D[3],#64-$rhotates[2][3]
	xar	$A[2][3],$A[3][4],$D[4],#64-$rhotates[3][4]
	xar	$A[3][4],$A[4][3],$D[3],#64-$rhotates[4][3]
	xar	$A[4][3],$A[3][0],$D[0],#64-$rhotates[3][0]

	xar	$A[3][0],$A[0][4],$D[4],#64-$rhotates[0][4]

	xar	$D[4],   $A[4][4],$D[4],#64-$rhotates[4][4] // D[4]=A[0][4]
	xar	$A[4][4],$A[4][1],$D[1],#64-$rhotates[4][1]
	xar	$A[1][3],$A[1][3],$D[3],#64-$rhotates[1][3] // A[1][3]=A[4][1]
	xar	$A[0][4],$A[3][1],$D[1],#64-$rhotates[3][1] // A[0][4]=A[1][3]
	xar	$A[3][1],$A[1][0],$D[0],#64-$rhotates[1][0]

	xar	$A[1][0],$A[0][3],$D[3],#64-$rhotates[0][3]

	eor	$A[0][0],$A[0][0],$D[0]

	xar	$D[3],   $A[3][3],$D[3],#64-$rhotates[3][3] // D[3]=A[0][3]
	xar	$A[0][3],$A[3][2],$D[2],#64-$rhotates[3][2] // A[0][3]=A[3][3]
	xar	$D[1],   $A[2][1],$D[1],#64-$rhotates[2][1] // D[1]=A[3][2]
	xar	$D[2],   $A[1][2],$D[2],#64-$rhotates[1][2] // D[2]=A[2][1]
	xar	$D[0],   $A[2][0],$D[0],#64-$rhotates[2][0] // D[0]=A[1][2]

	////////////////////////////////////////////////// Chi+Iota
	bcax	$A[4][0],$C[1],   $A[4][2],$A[1][3]	// A[1][3]=A[4][1]
	bcax	$A[4][1],$A[1][3],$A[4][3],$A[4][2]	// A[1][3]=A[4][1]
	bcax	$A[4][2],$A[4][2],$A[4][4],$A[4][3]
	bcax	$A[4][3],$A[4][3],$C[1],   $A[4][4]
	bcax	$A[4][4],$A[4][4],$A[1][3],$C[1]	// A[1][3]=A[4][1]

	ld1r	{$C[1]},[x10],#8

	bcax	$A[3][2],$D[1],   $A[3][4],$A[0][3]	// A[0][3]=A[3][3]
	bcax	$A[3][3],$A[0][3],$A[3][0],$A[3][4]	// A[0][3]=A[3][3]
	bcax	$A[3][4],$A[3][4],$A[3][1],$A[3][0]
	bcax	$A[3][0],$A[3][0],$D[1],   $A[3][1]
	bcax	$A[3][1],$A[3][1],$A[0][3],$D[1]	// A[0][3]=A[3][3]

	bcax	$A[2][0],$C[0],   $A[2][2],$D[2]
	bcax	$A[2][1],$D[2],   $A[2][3],$A[2][2]
	bcax	$A[2][2],$A[2][2],$A[2][4],$A[2][3]
	bcax	$A[2][3],$A[2][3],$C[0],   $A[2][4]
	bcax	$A[2][4],$A[2][4],$D[2],   $C[0]

	bcax	$A[1][2],$D[0],   $A[1][4],$A[0][4]	// A[0][4]=A[1][3]
	bcax	$A[1][3],$A[0][4],$A[1][0],$A[1][4]	// A[0][4]=A[1][3]
	bcax	$A[1][4],$A[1][4],$A[1][1],$A[1][0]
	bcax	$A[1][0],$A[1][0],$D[0],   $A[1][1]
	bcax	$A[1][1],$A[1][1],$A[0][4],$D[0]	// A[0][4]=A[1][3]

	bcax	$A[0][3],$D[3],   $A[0][0],$D[4]
	bcax	$A[0][4],$D[4],   $A[0][1],$A[0][0]
	bcax	$A[0][0],$A[0][0],$A[0][2],$A[0][1]
	bcax	$A[0][1],$A[0][1],$D[3],   $A[0][2]
	bcax	$A[0][2],$A[0][2],$D[4],   $D[3]

	eor	$A[0][0],$A[0][0],$C[1]

	subs	x9,x9,#1
	bne	.Loop_ce

	ret
.size	KeccakF1600_ce,.-KeccakF1600_ce

.type	KeccakF1600_cext,%function
.align	5
KeccakF1600_cext:
	AARCH64_SIGN_LINK_REGISTER
	stp	x29,x30,[sp,#-80]!
	add	x29,sp,#0
	stp	d8,d9,[sp,#16]		// per ABI requirement
	stp	d10,d11,[sp,#32]
	stp	d12,d13,[sp,#48]
	stp	d14,d15,[sp,#64]
___
# KeccakF1600_cext (started above): file-local wrapper that loads the
# 25 lanes from [x0] into d0-d24, runs KeccakF1600_ce and stores them
# back.  The load/store loops leave $i at 24, which the statement
# following each loop uses to address the 25th lane.
for($i=0; $i<24; $i+=2) {		# load A[5][5]
my $j=$i+1;
$code.=<<___;
	ldp	d$i,d$j,[x0,#8*$i]
___
}
$code.=<<___;
	ldr	d24,[x0,#8*$i]
	bl	KeccakF1600_ce
	ldr	x30,[sp,#8]
___
for($i=0; $i<24; $i+=2) {		# store A[5][5]
my $j=$i+1;
$code.=<<___;
	stp	d$i,d$j,[x0,#8*$i]
___
}
$code.=<<___;
	str	d24,[x0,#8*$i]

	ldp	d8,d9,[sp,#16]
	ldp	d10,d11,[sp,#32]
	ldp	d12,d13,[sp,#48]
	ldp	d14,d15,[sp,#64]
	ldr	x29,[sp],#80
	AARCH64_VALIDATE_LINK_REGISTER
	ret
.size	KeccakF1600_cext,.-KeccakF1600_cext
___

{
# SHA3_absorb_cext(x0 = state, x1 = inp, x2 = len, x3 = bsz)
#
# Vector-register counterpart of SHA3_absorb: same block loop and the
# same bsz comparison chain, with input lanes XORed in through v31.
# len is decremented by bsz before each block, so the return value is
# reconstructed as len + bsz once the subtraction borrows.
my ($ctx,$inp,$len,$bsz) = map("x$_",(0..3));

$code.=<<___;
.globl	SHA3_absorb_cext
.type	SHA3_absorb_cext,%function
.align	5
SHA3_absorb_cext:
	AARCH64_SIGN_LINK_REGISTER
	stp	x29,x30,[sp,#-80]!
	add	x29,sp,#0
	stp	d8,d9,[sp,#16]		// per ABI requirement
	stp	d10,d11,[sp,#32]
	stp	d12,d13,[sp,#48]
	stp	d14,d15,[sp,#64]
___
for($i=0; $i<24; $i+=2) {		# load A[5][5]
my $j=$i+1;
$code.=<<___;
	ldp	d$i,d$j,[x0,#8*$i]
___
}
$code.=<<___;
	ldr	d24,[x0,#8*$i]
	b	.Loop_absorb_ce

.align	4
.Loop_absorb_ce:
	subs	$len,$len,$bsz		// len - bsz
	blo	.Labsorbed_ce
___
for (my $i=0; $i<24; $i+=2) {
my $j = $i+1;
$code.=<<___;
	ldr	d31,[$inp],#8		// *inp++
#ifdef	__AARCH64EB__
	rev64	v31.16b,v31.16b
#endif
	eor	$A[$i/5][$i%5],$A[$i/5][$i%5],v31.16b
	cmp	$bsz,#8*($i+2)
	blo	.Lprocess_block_ce
	ldr	d31,[$inp],#8		// *inp++
#ifdef	__AARCH64EB__
	rev64	v31.16b,v31.16b
#endif
	eor	$A[$j/5][$j%5],$A[$j/5][$j%5],v31.16b
	beq	.Lprocess_block_ce
___
}
$code.=<<___;
	ldr	d31,[$inp],#8		// *inp++
#ifdef	__AARCH64EB__
	rev64	v31.16b,v31.16b
#endif
	eor	$A[4][4],$A[4][4],v31.16b

.Lprocess_block_ce:

	bl	KeccakF1600_ce

	b	.Loop_absorb_ce

.align	4
.Labsorbed_ce:
___
for($i=0; $i<24; $i+=2) {		# store A[5][5]
my $j=$i+1;
$code.=<<___;
	stp	d$i,d$j,[x0,#8*$i]
___
}
$code.=<<___;
	str	d24,[x0,#8*$i]
	add	x0,$len,$bsz		// return value

	ldp	d8,d9,[sp,#16]
	ldp	d10,d11,[sp,#32]
	ldp	d12,d13,[sp,#48]
	ldp	d14,d15,[sp,#64]
	ldp	x29,x30,[sp],#80
	AARCH64_VALIDATE_LINK_REGISTER
	ret
.size	SHA3_absorb_cext,.-SHA3_absorb_cext
___
}
{
# SHA3_squeeze_cext(x0 = state, x1 = out, x2 = len, x3 = bsz)
#
# Counterpart of SHA3_squeeze that calls KeccakF1600_cext between
# blocks.  x9 walks the state and x10 counts down the block, both
# being re-seeded from x0/x3 after every permutation call; x30 is
# re-loaded from the frame after the call.
my ($ctx,$out,$len,$bsz) = map("x$_",(0..3));
$code.=<<___;
.globl	SHA3_squeeze_cext
.type	SHA3_squeeze_cext,%function
.align	5
SHA3_squeeze_cext:
	AARCH64_SIGN_LINK_REGISTER
	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0
	mov	x9,$ctx
	mov	x10,$bsz

.Loop_squeeze_ce:
	ldr	x4,[x9],#8
	cmp	$len,#8
	blo	.Lsqueeze_tail_ce
#ifdef	__AARCH64EB__
	rev	x4,x4
#endif
	str	x4,[$out],#8
	beq	.Lsqueeze_done_ce

	sub	$len,$len,#8
	subs	x10,x10,#8
	bhi	.Loop_squeeze_ce

	bl	KeccakF1600_cext
	ldr	x30,[sp,#8]
	mov	x9,$ctx
	mov	x10,$bsz
	b	.Loop_squeeze_ce

.align	4
.Lsqueeze_tail_ce:
	strb	w4,[$out],#1
	lsr	x4,x4,#8
	subs	$len,$len,#1
	beq	.Lsqueeze_done_ce
	strb	w4,[$out],#1
	lsr	x4,x4,#8
	subs	$len,$len,#1
	beq	.Lsqueeze_done_ce
	strb	w4,[$out],#1
	lsr	x4,x4,#8
	subs	$len,$len,#1
	beq	.Lsqueeze_done_ce
	strb	w4,[$out],#1
	lsr	x4,x4,#8
	subs	$len,$len,#1
	beq	.Lsqueeze_done_ce
	strb	w4,[$out],#1
	lsr	x4,x4,#8
	subs	$len,$len,#1
	beq	.Lsqueeze_done_ce
	strb	w4,[$out],#1
	lsr	x4,x4,#8
	subs	$len,$len,#1
	beq	.Lsqueeze_done_ce
	strb	w4,[$out],#1

.Lsqueeze_done_ce:
	ldr	x29,[sp],#16
	AARCH64_VALIDATE_LINK_REGISTER
	ret
.size	SHA3_squeeze_cext,.-SHA3_squeeze_cext
___
}								}}}
$code.=<<___;
.asciz	"Keccak-1600 absorb and squeeze for ARMv8, CRYPTOGAMS by <appro\@openssl.org>"
___

{   # Base opcodes of the four SHA3 instructions emitted as raw words.
    my %opcode = (
	"rax1"	=> 0xce608c00,	"eor3"	=> 0xce000000,
	"bcax"	=> 0xce200000,	"xar"	=> 0xce800000	);

    # unsha3($mnemonic, $arg) - encode one eor3/rax1/xar/bcax instruction.
    #
    # $arg is the operand text following the mnemonic (a trailing
    # assembly comment is tolerated).  The first three operands are
    # register numbers placed at bits 0, 5 and 16; the optional fourth
    # operand, either a register number or a "#a-b" style immediate
    # that is evaluated here, is placed at bit 10.  Returns a
    # ".inst" line carrying the original text as a comment.
    #
    # Dies if the operands cannot be parsed: returning an empty string
    # would make the caller's s///e silently drop the instruction from
    # the generated code.
    sub unsha3 {
	my ($mnemonic,$arg)=@_;

	my @field = ($arg =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)[^,]*(?:,\s*[qv]([0-9]+)[^,]*(?:,\s*[qv#]([0-9\-]+))?)?/);

	@field or die "unsha3: can't parse operands in \"$mnemonic $arg\"";

	my ($rd,$rn,$rm,$ra) = @field;
	$rm = 0 unless defined $rm;
	# The character class above restricts $ra to digits and '-',
	# so the string eval only ever sees simple arithmetic.
	$ra = defined $ra ? eval($ra) : 0;

	return sprintf ".inst\t0x%08x\t//%s %s",
			$opcode{$mnemonic}|$rd|($rn<<5)|($rm<<16)|($ra<<10),
			$mnemonic,$arg;
    }
}

# Post-process the generated text line by line: evaluate `...`
# expressions, rewrite the arrangement on ld1r lines from .16b to .2d,
# and replace SHA3 mnemonics with their .inst encodings.
foreach(split("\n",$code)) {

	s/\`([^\`]*)\`/eval($1)/ge;

	m/\bld1r\b/ and s/\.16b/.2d/g	or
	s/\b(eor3|rax1|xar|bcax)\s+(v.*)/unsha3($1,$2)/ge;

	print $_,"\n";
}

close STDOUT or die "error closing STDOUT: $!";