1#!/usr/bin/env perl 2# Copyright 2017-2020 The OpenSSL Project Authors. All Rights Reserved. 3# 4# Licensed under the Apache License 2.0 (the "License"). You may not use 5# this file except in compliance with the License. You can obtain a copy 6# in the file LICENSE in the source distribution or at 7# https://www.openssl.org/source/license.html 8# 9# ==================================================================== 10# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL 11# project. The module is, however, dual licensed under OpenSSL and 12# CRYPTOGAMS licenses depending on where you obtain it. For further 13# details see http://www.openssl.org/~appro/cryptogams/. 14# ==================================================================== 15# 16# Keccak-1600 for ARMv4. 17# 18# June 2017. 19# 20# Non-NEON code is KECCAK_1X variant (see sha/keccak1600.c) with bit 21# interleaving. How does it compare to Keccak Code Package? It's as 22# fast, but several times smaller, and is endian- and ISA-neutral. ISA 23# neutrality means that minimum ISA requirement is ARMv4, yet it can 24# be assembled even as Thumb-2. NEON code path is KECCAK_1X_ALT with 25# register layout taken from Keccak Code Package. It's also as fast, 26# in fact faster by 10-15% on some processors, and endian-neutral. 27# 28# August 2017. 29# 30# Switch to KECCAK_2X variant for non-NEON code and merge almost 1/2 31# of rotate instructions with logical ones. This resulted in ~10% 32# improvement on most processors. Switch to KECCAK_2X effectively 33# minimizes re-loads from temporary storage, and merged rotates just 34# eliminate corresponding instructions. As for latter. When examining 35# code you'll notice commented ror instructions. These are eliminated 36# ones, and you should trace destination register below to see what's 37# going on. Just in case, why not all rotates are eliminated. Trouble 38# is that you have operations that require both inputs to be rotated, 39# e.g. 'eor a,b>>>x,c>>>y'. This conundrum is resolved by using 40# 'eor a,b,c>>>(x-y)' and then merge-rotating 'a' in next operation 41# that takes 'a' as input. And thing is that this next operation can 42# be in next round. It's totally possible to "carry" rotate "factors" 43# to the next round, but it makes code more complex. And the last word 44# is the keyword, i.e. "almost 1/2" is kind of complexity cap [for the 45# time being]... 46# 47# Reduce per-round instruction count in Thumb-2 case by 16%. This is 48# achieved by folding ldr/str pairs to their double-word counterparts. 49# Theoretically this should have improved performance on single-issue 50# cores, such as Cortex-A5/A7, by 19%. Reality is a bit different, as 51# usual... 52# 53######################################################################## 54# Numbers are cycles per processed byte. Non-NEON results account even 55# for input bit interleaving. 56# 57# r=1088(*) Thumb-2(**) NEON 58# 59# ARM11xx 82/+150% 60# Cortex-A5 88/+160%, 86, 36 61# Cortex-A7 78/+160%, 68, 34 62# Cortex-A8 51/+230%, 57, 30 63# Cortex-A9 53/+210%, 51, 26 64# Cortex-A15 42/+160%, 38, 18 65# Snapdragon S4 43/+210%, 38, 24 66# 67# (*) Corresponds to SHA3-256. Percentage after slash is improvement 68# over compiler-generated KECCAK_2X reference code. 69# (**) Thumb-2 results for Cortex-A5/A7 are likely to apply even to 70# Cortex-Mx, x>=3. Otherwise, non-NEON results for NEON-capable 71# processors are presented mostly for reference purposes. 72 73# $output is the last argument if it looks like a file (it has an extension) 74# $flavour is the first argument if it doesn't look like a file 75$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef; 76$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef; 77 78if ($flavour && $flavour ne "void") { 79 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; 80 ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or 81 ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or 82 die "can't locate arm-xlate.pl"; 83 84 open STDOUT,"| \"$^X\" $xlate $flavour \"$output\"" 85 or die "can't call $xlate: $!"; 86} else { 87 $output and open STDOUT,">$output"; 88} 89 90my @C = map("r$_",(0..9)); 91my @E = map("r$_",(10..12,14)); 92 93######################################################################## 94# Stack layout 95# ----->+-----------------------+ 96# | uint64_t A[5][5] | 97# | ... | 98# +200->+-----------------------+ 99# | uint64_t D[5] | 100# | ... | 101# +240->+-----------------------+ 102# | uint64_t T[5][5] | 103# | ... | 104# +440->+-----------------------+ 105# | saved lr | 106# +444->+-----------------------+ 107# | loop counter | 108# +448->+-----------------------+ 109# | ... 110 111my @A = map([ 8*$_, 8*($_+1), 8*($_+2), 8*($_+3), 8*($_+4) ], (0,5,10,15,20)); 112my @D = map(8*$_, (25..29)); 113my @T = map([ 8*$_, 8*($_+1), 8*($_+2), 8*($_+3), 8*($_+4) ], (30,35,40,45,50)); 114 115$code.=<<___; 116#include "arm_arch.h" 117 118#if defined(__thumb2__) 119.syntax unified 120.thumb 121#else 122.code 32 123#endif 124 125.text 126 127.type iotas32, %object 128.align 5 129iotas32: 130 .long 0x00000001, 0x00000000 131 .long 0x00000000, 0x00000089 132 .long 0x00000000, 0x8000008b 133 .long 0x00000000, 0x80008080 134 .long 0x00000001, 0x0000008b 135 .long 0x00000001, 0x00008000 136 .long 0x00000001, 0x80008088 137 .long 0x00000001, 0x80000082 138 .long 0x00000000, 0x0000000b 139 .long 0x00000000, 0x0000000a 140 .long 0x00000001, 0x00008082 141 .long 0x00000000, 0x00008003 142 .long 0x00000001, 0x0000808b 143 .long 0x00000001, 0x8000000b 144 .long 0x00000001, 0x8000008a 145 .long 0x00000001, 0x80000081 146 .long 0x00000000, 0x80000081 147 .long 0x00000000, 0x80000008 148 .long 0x00000000, 0x00000083 149 .long 0x00000000, 0x80008003 150 .long 0x00000001, 0x80008088 151 .long 0x00000000, 0x80000088 152 .long 0x00000001, 0x00008000 153 .long 0x00000000, 0x80008082 154.size iotas32,.-iotas32 155 156.type KeccakF1600_int, %function 157.align 5 158KeccakF1600_int: 159 add @C[9],sp,#$A[4][2] 160 add @E[2],sp,#$A[0][0] 161 add @E[0],sp,#$A[1][0] 162 ldmia @C[9],{@C[4]-@C[9]} @ A[4][2..4] 163KeccakF1600_enter: 164 str lr,[sp,#440] 165 eor @E[1],@E[1],@E[1] 166 str @E[1],[sp,#444] 167 b .Lround2x 168 169.align 4 170.Lround2x: 171___ 172sub Round { 173my (@A,@R); (@A[0..4],@R) = @_; 174 175$code.=<<___; 176 ldmia @E[2],{@C[0]-@C[3]} @ A[0][0..1] 177 ldmia @E[0],{@E[0]-@E[2],@E[3]} @ A[1][0..1] 178#ifdef __thumb2__ 179 eor @C[0],@C[0],@E[0] 180 eor @C[1],@C[1],@E[1] 181 eor @C[2],@C[2],@E[2] 182 ldrd @E[0],@E[1],[sp,#$A[1][2]] 183 eor @C[3],@C[3],@E[3] 184 ldrd @E[2],@E[3],[sp,#$A[1][3]] 185 eor @C[4],@C[4],@E[0] 186 eor @C[5],@C[5],@E[1] 187 eor @C[6],@C[6],@E[2] 188 ldrd @E[0],@E[1],[sp,#$A[1][4]] 189 eor @C[7],@C[7],@E[3] 190 ldrd @E[2],@E[3],[sp,#$A[2][0]] 191 eor @C[8],@C[8],@E[0] 192 eor @C[9],@C[9],@E[1] 193 eor @C[0],@C[0],@E[2] 194 ldrd @E[0],@E[1],[sp,#$A[2][1]] 195 eor @C[1],@C[1],@E[3] 196 ldrd @E[2],@E[3],[sp,#$A[2][2]] 197 eor @C[2],@C[2],@E[0] 198 eor @C[3],@C[3],@E[1] 199 eor @C[4],@C[4],@E[2] 200 ldrd @E[0],@E[1],[sp,#$A[2][3]] 201 eor @C[5],@C[5],@E[3] 202 ldrd @E[2],@E[3],[sp,#$A[2][4]] 203 eor @C[6],@C[6],@E[0] 204 eor @C[7],@C[7],@E[1] 205 eor @C[8],@C[8],@E[2] 206 ldrd @E[0],@E[1],[sp,#$A[3][0]] 207 eor @C[9],@C[9],@E[3] 208 ldrd @E[2],@E[3],[sp,#$A[3][1]] 209 eor @C[0],@C[0],@E[0] 210 eor @C[1],@C[1],@E[1] 211 eor @C[2],@C[2],@E[2] 212 ldrd @E[0],@E[1],[sp,#$A[3][2]] 213 eor @C[3],@C[3],@E[3] 214 ldrd @E[2],@E[3],[sp,#$A[3][3]] 215 eor @C[4],@C[4],@E[0] 216 eor @C[5],@C[5],@E[1] 217 eor @C[6],@C[6],@E[2] 218 ldrd @E[0],@E[1],[sp,#$A[3][4]] 219 eor @C[7],@C[7],@E[3] 220 ldrd @E[2],@E[3],[sp,#$A[4][0]] 221 eor @C[8],@C[8],@E[0] 222 eor @C[9],@C[9],@E[1] 223 eor @C[0],@C[0],@E[2] 224 ldrd @E[0],@E[1],[sp,#$A[4][1]] 225 eor @C[1],@C[1],@E[3] 226 ldrd @E[2],@E[3],[sp,#$A[0][2]] 227 eor @C[2],@C[2],@E[0] 228 eor @C[3],@C[3],@E[1] 229 eor @C[4],@C[4],@E[2] 230 ldrd @E[0],@E[1],[sp,#$A[0][3]] 231 eor @C[5],@C[5],@E[3] 232 ldrd @E[2],@E[3],[sp,#$A[0][4]] 233#else 234 eor @C[0],@C[0],@E[0] 235 add @E[0],sp,#$A[1][2] 236 eor @C[1],@C[1],@E[1] 237 eor @C[2],@C[2],@E[2] 238 eor @C[3],@C[3],@E[3] 239 ldmia @E[0],{@E[0]-@E[2],@E[3]} @ A[1][2..3] 240 eor @C[4],@C[4],@E[0] 241 add @E[0],sp,#$A[1][4] 242 eor @C[5],@C[5],@E[1] 243 eor @C[6],@C[6],@E[2] 244 eor @C[7],@C[7],@E[3] 245 ldmia @E[0],{@E[0]-@E[2],@E[3]} @ A[1][4]..A[2][0] 246 eor @C[8],@C[8],@E[0] 247 add @E[0],sp,#$A[2][1] 248 eor @C[9],@C[9],@E[1] 249 eor @C[0],@C[0],@E[2] 250 eor @C[1],@C[1],@E[3] 251 ldmia @E[0],{@E[0]-@E[2],@E[3]} @ A[2][1..2] 252 eor @C[2],@C[2],@E[0] 253 add @E[0],sp,#$A[2][3] 254 eor @C[3],@C[3],@E[1] 255 eor @C[4],@C[4],@E[2] 256 eor @C[5],@C[5],@E[3] 257 ldmia @E[0],{@E[0]-@E[2],@E[3]} @ A[2][3..4] 258 eor @C[6],@C[6],@E[0] 259 add @E[0],sp,#$A[3][0] 260 eor @C[7],@C[7],@E[1] 261 eor @C[8],@C[8],@E[2] 262 eor @C[9],@C[9],@E[3] 263 ldmia @E[0],{@E[0]-@E[2],@E[3]} @ A[3][0..1] 264 eor @C[0],@C[0],@E[0] 265 add @E[0],sp,#$A[3][2] 266 eor @C[1],@C[1],@E[1] 267 eor @C[2],@C[2],@E[2] 268 eor @C[3],@C[3],@E[3] 269 ldmia @E[0],{@E[0]-@E[2],@E[3]} @ A[3][2..3] 270 eor @C[4],@C[4],@E[0] 271 add @E[0],sp,#$A[3][4] 272 eor @C[5],@C[5],@E[1] 273 eor @C[6],@C[6],@E[2] 274 eor @C[7],@C[7],@E[3] 275 ldmia @E[0],{@E[0]-@E[2],@E[3]} @ A[3][4]..A[4][0] 276 eor @C[8],@C[8],@E[0] 277 ldr @E[0],[sp,#$A[4][1]] @ A[4][1] 278 eor @C[9],@C[9],@E[1] 279 ldr @E[1],[sp,#$A[4][1]+4] 280 eor @C[0],@C[0],@E[2] 281 ldr @E[2],[sp,#$A[0][2]] @ A[0][2] 282 eor @C[1],@C[1],@E[3] 283 ldr @E[3],[sp,#$A[0][2]+4] 284 eor @C[2],@C[2],@E[0] 285 add @E[0],sp,#$A[0][3] 286 eor @C[3],@C[3],@E[1] 287 eor @C[4],@C[4],@E[2] 288 eor @C[5],@C[5],@E[3] 289 ldmia @E[0],{@E[0]-@E[2],@E[3]} @ A[0][3..4] 290#endif 291 eor @C[6],@C[6],@E[0] 292 eor @C[7],@C[7],@E[1] 293 eor @C[8],@C[8],@E[2] 294 eor @C[9],@C[9],@E[3] 295 296 eor @E[0],@C[0],@C[5],ror#32-1 @ E[0] = ROL64(C[2], 1) ^ C[0]; 297 str.l @E[0],[sp,#$D[1]] @ D[1] = E[0] 298 eor @E[1],@C[1],@C[4] 299 str.h @E[1],[sp,#$D[1]+4] 300 eor @E[2],@C[6],@C[1],ror#32-1 @ E[1] = ROL64(C[0], 1) ^ C[3]; 301 eor @E[3],@C[7],@C[0] 302 str.l @E[2],[sp,#$D[4]] @ D[4] = E[1] 303 eor @C[0],@C[8],@C[3],ror#32-1 @ C[0] = ROL64(C[1], 1) ^ C[4]; 304 str.h @E[3],[sp,#$D[4]+4] 305 eor @C[1],@C[9],@C[2] 306 str.l @C[0],[sp,#$D[0]] @ D[0] = C[0] 307 eor @C[2],@C[2],@C[7],ror#32-1 @ C[1] = ROL64(C[3], 1) ^ C[1]; 308 ldr.l @C[7],[sp,#$A[3][3]] 309 eor @C[3],@C[3],@C[6] 310 str.h @C[1],[sp,#$D[0]+4] 311 ldr.h @C[6],[sp,#$A[3][3]+4] 312 str.l @C[2],[sp,#$D[2]] @ D[2] = C[1] 313 eor @C[4],@C[4],@C[9],ror#32-1 @ C[2] = ROL64(C[4], 1) ^ C[2]; 314 str.h @C[3],[sp,#$D[2]+4] 315 eor @C[5],@C[5],@C[8] 316 317 ldr.l @C[8],[sp,#$A[4][4]] 318 ldr.h @C[9],[sp,#$A[4][4]+4] 319 str.l @C[4],[sp,#$D[3]] @ D[3] = C[2] 320 eor @C[7],@C[7],@C[4] 321 str.h @C[5],[sp,#$D[3]+4] 322 eor @C[6],@C[6],@C[5] 323 ldr.l @C[4],[sp,#$A[0][0]] 324 @ ror @C[7],@C[7],#32-10 @ C[3] = ROL64(A[3][3] ^ C[2], rhotates[3][3]); /* D[3] */ 325 @ ror @C[6],@C[6],#32-11 326 ldr.h @C[5],[sp,#$A[0][0]+4] 327 eor @C[8],@C[8],@E[2] 328 eor @C[9],@C[9],@E[3] 329 ldr.l @E[2],[sp,#$A[2][2]] 330 eor @C[0],@C[0],@C[4] 331 ldr.h @E[3],[sp,#$A[2][2]+4] 332 @ ror @C[8],@C[8],#32-7 @ C[4] = ROL64(A[4][4] ^ E[1], rhotates[4][4]); /* D[4] */ 333 @ ror @C[9],@C[9],#32-7 334 eor @C[1],@C[1],@C[5] @ C[0] = A[0][0] ^ C[0]; /* rotate by 0 */ /* D[0] */ 335 eor @E[2],@E[2],@C[2] 336 ldr.l @C[2],[sp,#$A[1][1]] 337 eor @E[3],@E[3],@C[3] 338 ldr.h @C[3],[sp,#$A[1][1]+4] 339 ror @C[5],@E[2],#32-21 @ C[2] = ROL64(A[2][2] ^ C[1], rhotates[2][2]); /* D[2] */ 340 ldr @E[2],[sp,#444] @ load counter 341 eor @C[2],@C[2],@E[0] 342 adr @E[0],iotas32 343 ror @C[4],@E[3],#32-22 344 add @E[3],@E[0],@E[2] 345 eor @C[3],@C[3],@E[1] 346___ 347$code.=<<___ if ($A[0][0] != $T[0][0]); 348 ldmia @E[3],{@E[0],@E[1]} @ iotas[i] 349___ 350$code.=<<___ if ($A[0][0] == $T[0][0]); 351 ldr.l @E[0],[@E[3],#8] @ iotas[i].lo 352 add @E[2],@E[2],#16 353 ldr.h @E[1],[@E[3],#12] @ iotas[i].hi 354 cmp @E[2],#192 355 str @E[2],[sp,#444] @ store counter 356___ 357$code.=<<___; 358 bic @E[2],@C[4],@C[2],ror#32-22 359 bic @E[3],@C[5],@C[3],ror#32-22 360 ror @C[2],@C[2],#32-22 @ C[1] = ROL64(A[1][1] ^ E[0], rhotates[1][1]); /* D[1] */ 361 ror @C[3],@C[3],#32-22 362 eor @E[2],@E[2],@C[0] 363 eor @E[3],@E[3],@C[1] 364 eor @E[0],@E[0],@E[2] 365 eor @E[1],@E[1],@E[3] 366 str.l @E[0],[sp,#$R[0][0]] @ R[0][0] = C[0] ^ (~C[1] & C[2]) ^ iotas[i]; 367 bic @E[2],@C[6],@C[4],ror#11 368 str.h @E[1],[sp,#$R[0][0]+4] 369 bic @E[3],@C[7],@C[5],ror#10 370 bic @E[0],@C[8],@C[6],ror#32-(11-7) 371 bic @E[1],@C[9],@C[7],ror#32-(10-7) 372 eor @E[2],@C[2],@E[2],ror#32-11 373 str.l @E[2],[sp,#$R[0][1]] @ R[0][1] = C[1] ^ (~C[2] & C[3]); 374 eor @E[3],@C[3],@E[3],ror#32-10 375 str.h @E[3],[sp,#$R[0][1]+4] 376 eor @E[0],@C[4],@E[0],ror#32-7 377 eor @E[1],@C[5],@E[1],ror#32-7 378 str.l @E[0],[sp,#$R[0][2]] @ R[0][2] = C[2] ^ (~C[3] & C[4]); 379 bic @E[2],@C[0],@C[8],ror#32-7 380 str.h @E[1],[sp,#$R[0][2]+4] 381 bic @E[3],@C[1],@C[9],ror#32-7 382 eor @E[2],@E[2],@C[6],ror#32-11 383 str.l @E[2],[sp,#$R[0][3]] @ R[0][3] = C[3] ^ (~C[4] & C[0]); 384 eor @E[3],@E[3],@C[7],ror#32-10 385 str.h @E[3],[sp,#$R[0][3]+4] 386 bic @E[0],@C[2],@C[0] 387 add @E[3],sp,#$D[3] 388 ldr.l @C[0],[sp,#$A[0][3]] @ A[0][3] 389 bic @E[1],@C[3],@C[1] 390 ldr.h @C[1],[sp,#$A[0][3]+4] 391 eor @E[0],@E[0],@C[8],ror#32-7 392 eor @E[1],@E[1],@C[9],ror#32-7 393 str.l @E[0],[sp,#$R[0][4]] @ R[0][4] = C[4] ^ (~C[0] & C[1]); 394 add @C[9],sp,#$D[0] 395 str.h @E[1],[sp,#$R[0][4]+4] 396 397 ldmia @E[3],{@E[0]-@E[2],@E[3]} @ D[3..4] 398 ldmia @C[9],{@C[6]-@C[9]} @ D[0..1] 399 400 ldr.l @C[2],[sp,#$A[1][4]] @ A[1][4] 401 eor @C[0],@C[0],@E[0] 402 ldr.h @C[3],[sp,#$A[1][4]+4] 403 eor @C[1],@C[1],@E[1] 404 @ ror @C[0],@C[0],#32-14 @ C[0] = ROL64(A[0][3] ^ D[3], rhotates[0][3]); 405 ldr.l @E[0],[sp,#$A[3][1]] @ A[3][1] 406 @ ror @C[1],@C[1],#32-14 407 ldr.h @E[1],[sp,#$A[3][1]+4] 408 409 eor @C[2],@C[2],@E[2] 410 ldr.l @C[4],[sp,#$A[2][0]] @ A[2][0] 411 eor @C[3],@C[3],@E[3] 412 ldr.h @C[5],[sp,#$A[2][0]+4] 413 @ ror @C[2],@C[2],#32-10 @ C[1] = ROL64(A[1][4] ^ D[4], rhotates[1][4]); 414 @ ror @C[3],@C[3],#32-10 415 416 eor @C[6],@C[6],@C[4] 417 ldr.l @E[2],[sp,#$D[2]] @ D[2] 418 eor @C[7],@C[7],@C[5] 419 ldr.h @E[3],[sp,#$D[2]+4] 420 ror @C[5],@C[6],#32-1 @ C[2] = ROL64(A[2][0] ^ D[0], rhotates[2][0]); 421 ror @C[4],@C[7],#32-2 422 423 eor @E[0],@E[0],@C[8] 424 ldr.l @C[8],[sp,#$A[4][2]] @ A[4][2] 425 eor @E[1],@E[1],@C[9] 426 ldr.h @C[9],[sp,#$A[4][2]+4] 427 ror @C[7],@E[0],#32-22 @ C[3] = ROL64(A[3][1] ^ D[1], rhotates[3][1]); 428 ror @C[6],@E[1],#32-23 429 430 bic @E[0],@C[4],@C[2],ror#32-10 431 bic @E[1],@C[5],@C[3],ror#32-10 432 eor @E[2],@E[2],@C[8] 433 eor @E[3],@E[3],@C[9] 434 ror @C[9],@E[2],#32-30 @ C[4] = ROL64(A[4][2] ^ D[2], rhotates[4][2]); 435 ror @C[8],@E[3],#32-31 436 eor @E[0],@E[0],@C[0],ror#32-14 437 eor @E[1],@E[1],@C[1],ror#32-14 438 str.l @E[0],[sp,#$R[1][0]] @ R[1][0] = C[0] ^ (~C[1] & C[2]) 439 bic @E[2],@C[6],@C[4] 440 str.h @E[1],[sp,#$R[1][0]+4] 441 bic @E[3],@C[7],@C[5] 442 eor @E[2],@E[2],@C[2],ror#32-10 443 str.l @E[2],[sp,#$R[1][1]] @ R[1][1] = C[1] ^ (~C[2] & C[3]); 444 eor @E[3],@E[3],@C[3],ror#32-10 445 str.h @E[3],[sp,#$R[1][1]+4] 446 bic @E[0],@C[8],@C[6] 447 bic @E[1],@C[9],@C[7] 448 bic @E[2],@C[0],@C[8],ror#14 449 bic @E[3],@C[1],@C[9],ror#14 450 eor @E[0],@E[0],@C[4] 451 eor @E[1],@E[1],@C[5] 452 str.l @E[0],[sp,#$R[1][2]] @ R[1][2] = C[2] ^ (~C[3] & C[4]); 453 bic @C[2],@C[2],@C[0],ror#32-(14-10) 454 str.h @E[1],[sp,#$R[1][2]+4] 455 eor @E[2],@C[6],@E[2],ror#32-14 456 bic @E[1],@C[3],@C[1],ror#32-(14-10) 457 str.l @E[2],[sp,#$R[1][3]] @ R[1][3] = C[3] ^ (~C[4] & C[0]); 458 eor @E[3],@C[7],@E[3],ror#32-14 459 str.h @E[3],[sp,#$R[1][3]+4] 460 add @E[2],sp,#$D[1] 461 ldr.l @C[1],[sp,#$A[0][1]] @ A[0][1] 462 eor @E[0],@C[8],@C[2],ror#32-10 463 ldr.h @C[0],[sp,#$A[0][1]+4] 464 eor @E[1],@C[9],@E[1],ror#32-10 465 str.l @E[0],[sp,#$R[1][4]] @ R[1][4] = C[4] ^ (~C[0] & C[1]); 466 str.h @E[1],[sp,#$R[1][4]+4] 467 468 add @C[9],sp,#$D[3] 469 ldmia @E[2],{@E[0]-@E[2],@E[3]} @ D[1..2] 470 ldr.l @C[2],[sp,#$A[1][2]] @ A[1][2] 471 ldr.h @C[3],[sp,#$A[1][2]+4] 472 ldmia @C[9],{@C[6]-@C[9]} @ D[3..4] 473 474 eor @C[1],@C[1],@E[0] 475 ldr.l @C[4],[sp,#$A[2][3]] @ A[2][3] 476 eor @C[0],@C[0],@E[1] 477 ldr.h @C[5],[sp,#$A[2][3]+4] 478 ror @C[0],@C[0],#32-1 @ C[0] = ROL64(A[0][1] ^ D[1], rhotates[0][1]); 479 480 eor @C[2],@C[2],@E[2] 481 ldr.l @E[0],[sp,#$A[3][4]] @ A[3][4] 482 eor @C[3],@C[3],@E[3] 483 ldr.h @E[1],[sp,#$A[3][4]+4] 484 @ ror @C[2],@C[2],#32-3 @ C[1] = ROL64(A[1][2] ^ D[2], rhotates[1][2]); 485 ldr.l @E[2],[sp,#$D[0]] @ D[0] 486 @ ror @C[3],@C[3],#32-3 487 ldr.h @E[3],[sp,#$D[0]+4] 488 489 eor @C[4],@C[4],@C[6] 490 eor @C[5],@C[5],@C[7] 491 @ ror @C[5],@C[6],#32-12 @ C[2] = ROL64(A[2][3] ^ D[3], rhotates[2][3]); 492 @ ror @C[4],@C[7],#32-13 @ [track reverse order below] 493 494 eor @E[0],@E[0],@C[8] 495 ldr.l @C[8],[sp,#$A[4][0]] @ A[4][0] 496 eor @E[1],@E[1],@C[9] 497 ldr.h @C[9],[sp,#$A[4][0]+4] 498 ror @C[6],@E[0],#32-4 @ C[3] = ROL64(A[3][4] ^ D[4], rhotates[3][4]); 499 ror @C[7],@E[1],#32-4 500 501 eor @E[2],@E[2],@C[8] 502 eor @E[3],@E[3],@C[9] 503 ror @C[8],@E[2],#32-9 @ C[4] = ROL64(A[4][0] ^ D[0], rhotates[4][0]); 504 ror @C[9],@E[3],#32-9 505 506 bic @E[0],@C[5],@C[2],ror#13-3 507 bic @E[1],@C[4],@C[3],ror#12-3 508 bic @E[2],@C[6],@C[5],ror#32-13 509 bic @E[3],@C[7],@C[4],ror#32-12 510 eor @E[0],@C[0],@E[0],ror#32-13 511 eor @E[1],@C[1],@E[1],ror#32-12 512 str.l @E[0],[sp,#$R[2][0]] @ R[2][0] = C[0] ^ (~C[1] & C[2]) 513 eor @E[2],@E[2],@C[2],ror#32-3 514 str.h @E[1],[sp,#$R[2][0]+4] 515 eor @E[3],@E[3],@C[3],ror#32-3 516 str.l @E[2],[sp,#$R[2][1]] @ R[2][1] = C[1] ^ (~C[2] & C[3]); 517 bic @E[0],@C[8],@C[6] 518 bic @E[1],@C[9],@C[7] 519 str.h @E[3],[sp,#$R[2][1]+4] 520 eor @E[0],@E[0],@C[5],ror#32-13 521 eor @E[1],@E[1],@C[4],ror#32-12 522 str.l @E[0],[sp,#$R[2][2]] @ R[2][2] = C[2] ^ (~C[3] & C[4]); 523 bic @E[2],@C[0],@C[8] 524 str.h @E[1],[sp,#$R[2][2]+4] 525 bic @E[3],@C[1],@C[9] 526 eor @E[2],@E[2],@C[6] 527 eor @E[3],@E[3],@C[7] 528 str.l @E[2],[sp,#$R[2][3]] @ R[2][3] = C[3] ^ (~C[4] & C[0]); 529 bic @E[0],@C[2],@C[0],ror#3 530 str.h @E[3],[sp,#$R[2][3]+4] 531 bic @E[1],@C[3],@C[1],ror#3 532 ldr.l @C[1],[sp,#$A[0][4]] @ A[0][4] [in reverse order] 533 eor @E[0],@C[8],@E[0],ror#32-3 534 ldr.h @C[0],[sp,#$A[0][4]+4] 535 eor @E[1],@C[9],@E[1],ror#32-3 536 str.l @E[0],[sp,#$R[2][4]] @ R[2][4] = C[4] ^ (~C[0] & C[1]); 537 add @C[9],sp,#$D[1] 538 str.h @E[1],[sp,#$R[2][4]+4] 539 540 ldr.l @E[0],[sp,#$D[4]] @ D[4] 541 ldr.h @E[1],[sp,#$D[4]+4] 542 ldr.l @E[2],[sp,#$D[0]] @ D[0] 543 ldr.h @E[3],[sp,#$D[0]+4] 544 545 ldmia @C[9],{@C[6]-@C[9]} @ D[1..2] 546 547 eor @C[1],@C[1],@E[0] 548 ldr.l @C[2],[sp,#$A[1][0]] @ A[1][0] 549 eor @C[0],@C[0],@E[1] 550 ldr.h @C[3],[sp,#$A[1][0]+4] 551 @ ror @C[1],@E[0],#32-13 @ C[0] = ROL64(A[0][4] ^ D[4], rhotates[0][4]); 552 ldr.l @C[4],[sp,#$A[2][1]] @ A[2][1] 553 @ ror @C[0],@E[1],#32-14 @ [was loaded in reverse order] 554 ldr.h @C[5],[sp,#$A[2][1]+4] 555 556 eor @C[2],@C[2],@E[2] 557 ldr.l @E[0],[sp,#$A[3][2]] @ A[3][2] 558 eor @C[3],@C[3],@E[3] 559 ldr.h @E[1],[sp,#$A[3][2]+4] 560 @ ror @C[2],@C[2],#32-18 @ C[1] = ROL64(A[1][0] ^ D[0], rhotates[1][0]); 561 ldr.l @E[2],[sp,#$D[3]] @ D[3] 562 @ ror @C[3],@C[3],#32-18 563 ldr.h @E[3],[sp,#$D[3]+4] 564 565 eor @C[6],@C[6],@C[4] 566 eor @C[7],@C[7],@C[5] 567 ror @C[4],@C[6],#32-5 @ C[2] = ROL64(A[2][1] ^ D[1], rhotates[2][1]); 568 ror @C[5],@C[7],#32-5 569 570 eor @E[0],@E[0],@C[8] 571 ldr.l @C[8],[sp,#$A[4][3]] @ A[4][3] 572 eor @E[1],@E[1],@C[9] 573 ldr.h @C[9],[sp,#$A[4][3]+4] 574 ror @C[7],@E[0],#32-7 @ C[3] = ROL64(A[3][2] ^ D[2], rhotates[3][2]); 575 ror @C[6],@E[1],#32-8 576 577 eor @E[2],@E[2],@C[8] 578 eor @E[3],@E[3],@C[9] 579 ror @C[8],@E[2],#32-28 @ C[4] = ROL64(A[4][3] ^ D[3], rhotates[4][3]); 580 ror @C[9],@E[3],#32-28 581 582 bic @E[0],@C[4],@C[2],ror#32-18 583 bic @E[1],@C[5],@C[3],ror#32-18 584 eor @E[0],@E[0],@C[0],ror#32-14 585 eor @E[1],@E[1],@C[1],ror#32-13 586 str.l @E[0],[sp,#$R[3][0]] @ R[3][0] = C[0] ^ (~C[1] & C[2]) 587 bic @E[2],@C[6],@C[4] 588 str.h @E[1],[sp,#$R[3][0]+4] 589 bic @E[3],@C[7],@C[5] 590 eor @E[2],@E[2],@C[2],ror#32-18 591 str.l @E[2],[sp,#$R[3][1]] @ R[3][1] = C[1] ^ (~C[2] & C[3]); 592 eor @E[3],@E[3],@C[3],ror#32-18 593 str.h @E[3],[sp,#$R[3][1]+4] 594 bic @E[0],@C[8],@C[6] 595 bic @E[1],@C[9],@C[7] 596 bic @E[2],@C[0],@C[8],ror#14 597 bic @E[3],@C[1],@C[9],ror#13 598 eor @E[0],@E[0],@C[4] 599 eor @E[1],@E[1],@C[5] 600 str.l @E[0],[sp,#$R[3][2]] @ R[3][2] = C[2] ^ (~C[3] & C[4]); 601 bic @C[2],@C[2],@C[0],ror#18-14 602 str.h @E[1],[sp,#$R[3][2]+4] 603 eor @E[2],@C[6],@E[2],ror#32-14 604 bic @E[1],@C[3],@C[1],ror#18-13 605 eor @E[3],@C[7],@E[3],ror#32-13 606 str.l @E[2],[sp,#$R[3][3]] @ R[3][3] = C[3] ^ (~C[4] & C[0]); 607 str.h @E[3],[sp,#$R[3][3]+4] 608 add @E[3],sp,#$D[2] 609 ldr.l @C[0],[sp,#$A[0][2]] @ A[0][2] 610 eor @E[0],@C[8],@C[2],ror#32-18 611 ldr.h @C[1],[sp,#$A[0][2]+4] 612 eor @E[1],@C[9],@E[1],ror#32-18 613 str.l @E[0],[sp,#$R[3][4]] @ R[3][4] = C[4] ^ (~C[0] & C[1]); 614 str.h @E[1],[sp,#$R[3][4]+4] 615 616 ldmia @E[3],{@E[0]-@E[2],@E[3]} @ D[2..3] 617 ldr.l @C[2],[sp,#$A[1][3]] @ A[1][3] 618 ldr.h @C[3],[sp,#$A[1][3]+4] 619 ldr.l @C[6],[sp,#$D[4]] @ D[4] 620 ldr.h @C[7],[sp,#$D[4]+4] 621 622 eor @C[0],@C[0],@E[0] 623 ldr.l @C[4],[sp,#$A[2][4]] @ A[2][4] 624 eor @C[1],@C[1],@E[1] 625 ldr.h @C[5],[sp,#$A[2][4]+4] 626 @ ror @C[0],@C[0],#32-31 @ C[0] = ROL64(A[0][2] ^ D[2], rhotates[0][2]); 627 ldr.l @C[8],[sp,#$D[0]] @ D[0] 628 @ ror @C[1],@C[1],#32-31 629 ldr.h @C[9],[sp,#$D[0]+4] 630 631 eor @E[2],@E[2],@C[2] 632 ldr.l @E[0],[sp,#$A[3][0]] @ A[3][0] 633 eor @E[3],@E[3],@C[3] 634 ldr.h @E[1],[sp,#$A[3][0]+4] 635 ror @C[3],@E[2],#32-27 @ C[1] = ROL64(A[1][3] ^ D[3], rhotates[1][3]); 636 ldr.l @E[2],[sp,#$D[1]] @ D[1] 637 ror @C[2],@E[3],#32-28 638 ldr.h @E[3],[sp,#$D[1]+4] 639 640 eor @C[6],@C[6],@C[4] 641 eor @C[7],@C[7],@C[5] 642 ror @C[5],@C[6],#32-19 @ C[2] = ROL64(A[2][4] ^ D[4], rhotates[2][4]); 643 ror @C[4],@C[7],#32-20 644 645 eor @E[0],@E[0],@C[8] 646 ldr.l @C[8],[sp,#$A[4][1]] @ A[4][1] 647 eor @E[1],@E[1],@C[9] 648 ldr.h @C[9],[sp,#$A[4][1]+4] 649 ror @C[7],@E[0],#32-20 @ C[3] = ROL64(A[3][0] ^ D[0], rhotates[3][0]); 650 ror @C[6],@E[1],#32-21 651 652 eor @C[8],@C[8],@E[2] 653 eor @C[9],@C[9],@E[3] 654 @ ror @C[8],@C[2],#32-1 @ C[4] = ROL64(A[4][1] ^ D[1], rhotates[4][1]); 655 @ ror @C[9],@C[3],#32-1 656 657 bic @E[0],@C[4],@C[2] 658 bic @E[1],@C[5],@C[3] 659 eor @E[0],@E[0],@C[0],ror#32-31 660 str.l @E[0],[sp,#$R[4][0]] @ R[4][0] = C[0] ^ (~C[1] & C[2]) 661 eor @E[1],@E[1],@C[1],ror#32-31 662 str.h @E[1],[sp,#$R[4][0]+4] 663 bic @E[2],@C[6],@C[4] 664 bic @E[3],@C[7],@C[5] 665 eor @E[2],@E[2],@C[2] 666 eor @E[3],@E[3],@C[3] 667 str.l @E[2],[sp,#$R[4][1]] @ R[4][1] = C[1] ^ (~C[2] & C[3]); 668 bic @E[0],@C[8],@C[6],ror#1 669 str.h @E[3],[sp,#$R[4][1]+4] 670 bic @E[1],@C[9],@C[7],ror#1 671 bic @E[2],@C[0],@C[8],ror#31-1 672 bic @E[3],@C[1],@C[9],ror#31-1 673 eor @C[4],@C[4],@E[0],ror#32-1 674 str.l @C[4],[sp,#$R[4][2]] @ R[4][2] = C[2] ^= (~C[3] & C[4]); 675 eor @C[5],@C[5],@E[1],ror#32-1 676 str.h @C[5],[sp,#$R[4][2]+4] 677 eor @C[6],@C[6],@E[2],ror#32-31 678 eor @C[7],@C[7],@E[3],ror#32-31 679 str.l @C[6],[sp,#$R[4][3]] @ R[4][3] = C[3] ^= (~C[4] & C[0]); 680 bic @E[0],@C[2],@C[0],ror#32-31 681 str.h @C[7],[sp,#$R[4][3]+4] 682 bic @E[1],@C[3],@C[1],ror#32-31 683 add @E[2],sp,#$R[0][0] 684 eor @C[8],@E[0],@C[8],ror#32-1 685 add @E[0],sp,#$R[1][0] 686 eor @C[9],@E[1],@C[9],ror#32-1 687 str.l @C[8],[sp,#$R[4][4]] @ R[4][4] = C[4] ^= (~C[0] & C[1]); 688 str.h @C[9],[sp,#$R[4][4]+4] 689___ 690} 691 Round(@A,@T); 692 Round(@T,@A); 693$code.=<<___; 694 blo .Lround2x 695 696#if __ARM_ARCH__>=5 697 ldr pc,[sp,#440] 698#else 699 ldr lr,[sp,#440] 700 tst lr,#1 701 moveq pc,lr @ be binary compatible with V4, yet 702 bx lr @ interoperable with Thumb ISA:-) 703#endif 704.size KeccakF1600_int,.-KeccakF1600_int 705 706.type KeccakF1600, %function 707.align 5 708KeccakF1600: 709 stmdb sp!,{r0,r4-r11,lr} 710 sub sp,sp,#440+16 @ space for A[5][5],D[5],T[5][5],... 711 712 add @E[0],r0,#$A[1][0] 713 add @E[1],sp,#$A[1][0] 714 ldmia r0, {@C[0]-@C[9]} @ copy A[5][5] to stack 715 stmia sp, {@C[0]-@C[9]} 716 ldmia @E[0]!,{@C[0]-@C[9]} 717 stmia @E[1]!,{@C[0]-@C[9]} 718 ldmia @E[0]!,{@C[0]-@C[9]} 719 stmia @E[1]!,{@C[0]-@C[9]} 720 ldmia @E[0]!,{@C[0]-@C[9]} 721 stmia @E[1]!,{@C[0]-@C[9]} 722 ldmia @E[0], {@C[0]-@C[9]} 723 add @E[2],sp,#$A[0][0] 724 add @E[0],sp,#$A[1][0] 725 stmia @E[1], {@C[0]-@C[9]} 726 727 bl KeccakF1600_enter 728 729 ldr @E[1], [sp,#440+16] @ restore pointer to A 730 ldmia sp, {@C[0]-@C[9]} 731 stmia @E[1]!,{@C[0]-@C[9]} @ return A[5][5] 732 ldmia @E[0]!,{@C[0]-@C[9]} 733 stmia @E[1]!,{@C[0]-@C[9]} 734 ldmia @E[0]!,{@C[0]-@C[9]} 735 stmia @E[1]!,{@C[0]-@C[9]} 736 ldmia @E[0]!,{@C[0]-@C[9]} 737 stmia @E[1]!,{@C[0]-@C[9]} 738 ldmia @E[0], {@C[0]-@C[9]} 739 stmia @E[1], {@C[0]-@C[9]} 740 741 add sp,sp,#440+20 742#if __ARM_ARCH__>=5 743 ldmia sp!,{r4-r11,pc} 744#else 745 ldmia sp!,{r4-r11,lr} 746 tst lr,#1 747 moveq pc,lr @ be binary compatible with V4, yet 748 bx lr @ interoperable with Thumb ISA:-) 749#endif 750.size KeccakF1600,.-KeccakF1600 751___ 752{ my ($A_flat,$inp,$len,$bsz) = map("r$_",(10..12,14)); 753 754######################################################################## 755# Stack layout 756# ----->+-----------------------+ 757# | uint64_t A[5][5] | 758# | ... | 759# | ... | 760# +456->+-----------------------+ 761# | 0x55555555 | 762# +460->+-----------------------+ 763# | 0x33333333 | 764# +464->+-----------------------+ 765# | 0x0f0f0f0f | 766# +468->+-----------------------+ 767# | 0x00ff00ff | 768# +472->+-----------------------+ 769# | uint64_t *A | 770# +476->+-----------------------+ 771# | const void *inp | 772# +480->+-----------------------+ 773# | size_t len | 774# +484->+-----------------------+ 775# | size_t bs | 776# +488->+-----------------------+ 777# | .... 778 779$code.=<<___; 780.global SHA3_absorb 781.type SHA3_absorb,%function 782.align 5 783SHA3_absorb: 784 stmdb sp!,{r0-r12,lr} 785 sub sp,sp,#456+16 786 787 add $A_flat,r0,#$A[1][0] 788 @ mov $inp,r1 789 mov $len,r2 790 mov $bsz,r3 791 cmp r2,r3 792 blo .Labsorb_abort 793 794 add $inp,sp,#0 795 ldmia r0, {@C[0]-@C[9]} @ copy A[5][5] to stack 796 stmia $inp!, {@C[0]-@C[9]} 797 ldmia $A_flat!,{@C[0]-@C[9]} 798 stmia $inp!, {@C[0]-@C[9]} 799 ldmia $A_flat!,{@C[0]-@C[9]} 800 stmia $inp!, {@C[0]-@C[9]} 801 ldmia $A_flat!,{@C[0]-@C[9]} 802 stmia $inp!, {@C[0]-@C[9]} 803 ldmia $A_flat!,{@C[0]-@C[9]} 804 stmia $inp, {@C[0]-@C[9]} 805 806 ldr $inp,[sp,#476] @ restore $inp 807#ifdef __thumb2__ 808 mov r9,#0x00ff00ff 809 mov r8,#0x0f0f0f0f 810 mov r7,#0x33333333 811 mov r6,#0x55555555 812#else 813 mov r6,#0x11 @ compose constants 814 mov r8,#0x0f 815 mov r9,#0xff 816 orr r6,r6,r6,lsl#8 817 orr r8,r8,r8,lsl#8 818 orr r6,r6,r6,lsl#16 @ 0x11111111 819 orr r9,r9,r9,lsl#16 @ 0x00ff00ff 820 orr r8,r8,r8,lsl#16 @ 0x0f0f0f0f 821 orr r7,r6,r6,lsl#1 @ 0x33333333 822 orr r6,r6,r6,lsl#2 @ 0x55555555 823#endif 824 str r9,[sp,#468] 825 str r8,[sp,#464] 826 str r7,[sp,#460] 827 str r6,[sp,#456] 828 b .Loop_absorb 829 830.align 4 831.Loop_absorb: 832 subs r0,$len,$bsz 833 blo .Labsorbed 834 add $A_flat,sp,#0 835 str r0,[sp,#480] @ save len - bsz 836 837.align 4 838.Loop_block: 839 ldrb r0,[$inp],#1 840 ldrb r1,[$inp],#1 841 ldrb r2,[$inp],#1 842 ldrb r3,[$inp],#1 843 ldrb r4,[$inp],#1 844 orr r0,r0,r1,lsl#8 845 ldrb r1,[$inp],#1 846 orr r0,r0,r2,lsl#16 847 ldrb r2,[$inp],#1 848 orr r0,r0,r3,lsl#24 @ lo 849 ldrb r3,[$inp],#1 850 orr r1,r4,r1,lsl#8 851 orr r1,r1,r2,lsl#16 852 orr r1,r1,r3,lsl#24 @ hi 853 854 and r2,r0,r6 @ &=0x55555555 855 and r0,r0,r6,lsl#1 @ &=0xaaaaaaaa 856 and r3,r1,r6 @ &=0x55555555 857 and r1,r1,r6,lsl#1 @ &=0xaaaaaaaa 858 orr r2,r2,r2,lsr#1 859 orr r0,r0,r0,lsl#1 860 orr r3,r3,r3,lsr#1 861 orr r1,r1,r1,lsl#1 862 and r2,r2,r7 @ &=0x33333333 863 and r0,r0,r7,lsl#2 @ &=0xcccccccc 864 and r3,r3,r7 @ &=0x33333333 865 and r1,r1,r7,lsl#2 @ &=0xcccccccc 866 orr r2,r2,r2,lsr#2 867 orr r0,r0,r0,lsl#2 868 orr r3,r3,r3,lsr#2 869 orr r1,r1,r1,lsl#2 870 and r2,r2,r8 @ &=0x0f0f0f0f 871 and r0,r0,r8,lsl#4 @ &=0xf0f0f0f0 872 and r3,r3,r8 @ &=0x0f0f0f0f 873 and r1,r1,r8,lsl#4 @ &=0xf0f0f0f0 874 ldmia $A_flat,{r4-r5} @ A_flat[i] 875 orr r2,r2,r2,lsr#4 876 orr r0,r0,r0,lsl#4 877 orr r3,r3,r3,lsr#4 878 orr r1,r1,r1,lsl#4 879 and r2,r2,r9 @ &=0x00ff00ff 880 and r0,r0,r9,lsl#8 @ &=0xff00ff00 881 and r3,r3,r9 @ &=0x00ff00ff 882 and r1,r1,r9,lsl#8 @ &=0xff00ff00 883 orr r2,r2,r2,lsr#8 884 orr r0,r0,r0,lsl#8 885 orr r3,r3,r3,lsr#8 886 orr r1,r1,r1,lsl#8 887 888 lsl r2,r2,#16 889 lsr r1,r1,#16 890 eor r4,r4,r3,lsl#16 891 eor r5,r5,r0,lsr#16 892 eor r4,r4,r2,lsr#16 893 eor r5,r5,r1,lsl#16 894 stmia $A_flat!,{r4-r5} @ A_flat[i++] ^= BitInterleave(inp[0..7]) 895 896 subs $bsz,$bsz,#8 897 bhi .Loop_block 898 899 str $inp,[sp,#476] 900 901 bl KeccakF1600_int 902 903 add r14,sp,#456 904 ldmia r14,{r6-r12,r14} @ restore constants and variables 905 b .Loop_absorb 906 907.align 4 908.Labsorbed: 909 add $inp,sp,#$A[1][0] 910 ldmia sp, {@C[0]-@C[9]} 911 stmia $A_flat!,{@C[0]-@C[9]} @ return A[5][5] 912 ldmia $inp!, {@C[0]-@C[9]} 913 stmia $A_flat!,{@C[0]-@C[9]} 914 ldmia $inp!, {@C[0]-@C[9]} 915 stmia $A_flat!,{@C[0]-@C[9]} 916 ldmia $inp!, {@C[0]-@C[9]} 917 stmia $A_flat!,{@C[0]-@C[9]} 918 ldmia $inp, {@C[0]-@C[9]} 919 stmia $A_flat, {@C[0]-@C[9]} 920 921.Labsorb_abort: 922 add sp,sp,#456+32 923 mov r0,$len @ return value 924#if __ARM_ARCH__>=5 925 ldmia sp!,{r4-r12,pc} 926#else 927 ldmia sp!,{r4-r12,lr} 928 tst lr,#1 929 moveq pc,lr @ be binary compatible with V4, yet 930 bx lr @ interoperable with Thumb ISA:-) 931#endif 932.size SHA3_absorb,.-SHA3_absorb 933___ 934} 935{ my ($out,$len,$A_flat,$bsz) = map("r$_", (4,5,10,12)); 936 937$code.=<<___; 938.global SHA3_squeeze 939.type SHA3_squeeze,%function 940.align 5 941SHA3_squeeze: 942 stmdb sp!,{r0,r3-r10,lr} 943 944 mov $A_flat,r0 945 mov $out,r1 946 mov $len,r2 947 mov $bsz,r3 948 949#ifdef __thumb2__ 950 mov r9,#0x00ff00ff 951 mov r8,#0x0f0f0f0f 952 mov r7,#0x33333333 953 mov r6,#0x55555555 954#else 955 mov r6,#0x11 @ compose constants 956 mov r8,#0x0f 957 mov r9,#0xff 958 orr r6,r6,r6,lsl#8 959 orr r8,r8,r8,lsl#8 960 orr r6,r6,r6,lsl#16 @ 0x11111111 961 orr r9,r9,r9,lsl#16 @ 0x00ff00ff 962 orr r8,r8,r8,lsl#16 @ 0x0f0f0f0f 963 orr r7,r6,r6,lsl#1 @ 0x33333333 964 orr r6,r6,r6,lsl#2 @ 0x55555555 965#endif 966 stmdb sp!,{r6-r9} 967 968 mov r14,$A_flat 969 b .Loop_squeeze 970 971.align 4 972.Loop_squeeze: 973 ldmia $A_flat!,{r0,r1} @ A_flat[i++] 974 975 lsl r2,r0,#16 976 lsl r3,r1,#16 @ r3 = r1 << 16 977 lsr r2,r2,#16 @ r2 = r0 & 0x0000ffff 978 lsr r1,r1,#16 979 lsr r0,r0,#16 @ r0 = r0 >> 16 980 lsl r1,r1,#16 @ r1 = r1 & 0xffff0000 981 982 orr r2,r2,r2,lsl#8 983 orr r3,r3,r3,lsr#8 984 orr r0,r0,r0,lsl#8 985 orr r1,r1,r1,lsr#8 986 and r2,r2,r9 @ &=0x00ff00ff 987 and r3,r3,r9,lsl#8 @ &=0xff00ff00 988 and r0,r0,r9 @ &=0x00ff00ff 989 and r1,r1,r9,lsl#8 @ &=0xff00ff00 990 orr r2,r2,r2,lsl#4 991 orr r3,r3,r3,lsr#4 992 orr r0,r0,r0,lsl#4 993 orr r1,r1,r1,lsr#4 994 and r2,r2,r8 @ &=0x0f0f0f0f 995 and r3,r3,r8,lsl#4 @ &=0xf0f0f0f0 996 and r0,r0,r8 @ &=0x0f0f0f0f 997 and r1,r1,r8,lsl#4 @ &=0xf0f0f0f0 998 orr r2,r2,r2,lsl#2 999 orr r3,r3,r3,lsr#2 1000 orr r0,r0,r0,lsl#2 1001 orr r1,r1,r1,lsr#2 1002 and r2,r2,r7 @ &=0x33333333 1003 and r3,r3,r7,lsl#2 @ &=0xcccccccc 1004 and r0,r0,r7 @ &=0x33333333 1005 and r1,r1,r7,lsl#2 @ &=0xcccccccc 1006 orr r2,r2,r2,lsl#1 1007 orr r3,r3,r3,lsr#1 1008 orr r0,r0,r0,lsl#1 1009 orr r1,r1,r1,lsr#1 1010 and r2,r2,r6 @ &=0x55555555 1011 and r3,r3,r6,lsl#1 @ &=0xaaaaaaaa 1012 and r0,r0,r6 @ &=0x55555555 1013 and r1,r1,r6,lsl#1 @ &=0xaaaaaaaa 1014 1015 orr r2,r2,r3 1016 orr r0,r0,r1 1017 1018 cmp $len,#8 1019 blo .Lsqueeze_tail 1020 lsr r1,r2,#8 1021 strb r2,[$out],#1 1022 lsr r3,r2,#16 1023 strb r1,[$out],#1 1024 lsr r2,r2,#24 1025 strb r3,[$out],#1 1026 strb r2,[$out],#1 1027 1028 lsr r1,r0,#8 1029 strb r0,[$out],#1 1030 lsr r3,r0,#16 1031 strb r1,[$out],#1 1032 lsr r0,r0,#24 1033 strb r3,[$out],#1 1034 strb r0,[$out],#1 1035 subs $len,$len,#8 1036 beq .Lsqueeze_done 1037 1038 subs $bsz,$bsz,#8 @ bsz -= 8 1039 bhi .Loop_squeeze 1040 1041 mov r0,r14 @ original $A_flat 1042 1043 bl KeccakF1600 1044 1045 ldmia sp,{r6-r10,r12} @ restore constants and variables 1046 mov r14,$A_flat 1047 b .Loop_squeeze 1048 1049.align 4 1050.Lsqueeze_tail: 1051 strb r2,[$out],#1 1052 lsr r2,r2,#8 1053 subs $len,$len,#1 1054 beq .Lsqueeze_done 1055 strb r2,[$out],#1 1056 lsr r2,r2,#8 1057 subs $len,$len,#1 1058 beq .Lsqueeze_done 1059 strb r2,[$out],#1 1060 lsr r2,r2,#8 1061 subs $len,$len,#1 1062 beq .Lsqueeze_done 1063 strb r2,[$out],#1 1064 subs $len,$len,#1 1065 beq .Lsqueeze_done 1066 1067 strb r0,[$out],#1 1068 lsr r0,r0,#8 1069 subs $len,$len,#1 1070 beq .Lsqueeze_done 1071 strb r0,[$out],#1 1072 lsr r0,r0,#8 1073 subs $len,$len,#1 1074 beq .Lsqueeze_done 1075 strb r0,[$out] 1076 b .Lsqueeze_done 1077 1078.align 4 1079.Lsqueeze_done: 1080 add sp,sp,#24 1081#if __ARM_ARCH__>=5 1082 ldmia sp!,{r4-r10,pc} 1083#else 1084 ldmia sp!,{r4-r10,lr} 1085 tst lr,#1 1086 moveq pc,lr @ be binary compatible with V4, yet 1087 bx lr @ interoperable with Thumb ISA:-) 1088#endif 1089.size SHA3_squeeze,.-SHA3_squeeze 1090___ 1091} 1092 1093$code.=<<___; 1094#if __ARM_MAX_ARCH__>=7 1095.fpu neon 1096 1097.type iotas64, %object 1098.align 5 1099iotas64: 1100 .quad 0x0000000000000001 1101 .quad 0x0000000000008082 1102 .quad 0x800000000000808a 1103 .quad 0x8000000080008000 1104 .quad 0x000000000000808b 1105 .quad 0x0000000080000001 1106 .quad 0x8000000080008081 1107 .quad 0x8000000000008009 1108 .quad 0x000000000000008a 1109 .quad 0x0000000000000088 1110 .quad 0x0000000080008009 1111 .quad 0x000000008000000a 1112 .quad 0x000000008000808b 1113 .quad 0x800000000000008b 1114 .quad 0x8000000000008089 1115 .quad 0x8000000000008003 1116 .quad 0x8000000000008002 1117 .quad 0x8000000000000080 1118 .quad 0x000000000000800a 1119 .quad 0x800000008000000a 1120 .quad 0x8000000080008081 1121 .quad 0x8000000000008080 1122 .quad 0x0000000080000001 1123 .quad 0x8000000080008008 1124.size iotas64,.-iotas64 1125 1126.type KeccakF1600_neon, %function 1127.align 5 1128KeccakF1600_neon: 1129 add r1, r0, #16 1130 adr r2, iotas64 1131 mov r3, #24 @ loop counter 1132 b .Loop_neon 1133 1134.align 4 1135.Loop_neon: 1136 @ Theta 1137 vst1.64 {q4}, [r0,:64] @ offload A[0..1][4] 1138 veor q13, q0, q5 @ A[0..1][0]^A[2..3][0] 1139 vst1.64 {d18}, [r1,:64] @ offload A[2][4] 1140 veor q14, q1, q6 @ A[0..1][1]^A[2..3][1] 1141 veor q15, q2, q7 @ A[0..1][2]^A[2..3][2] 1142 veor d26, d26, d27 @ C[0]=A[0][0]^A[1][0]^A[2][0]^A[3][0] 1143 veor d27, d28, d29 @ C[1]=A[0][1]^A[1][1]^A[2][1]^A[3][1] 1144 veor q14, q3, q8 @ A[0..1][3]^A[2..3][3] 1145 veor q4, q4, q9 @ A[0..1][4]^A[2..3][4] 1146 veor d30, d30, d31 @ C[2]=A[0][2]^A[1][2]^A[2][2]^A[3][2] 1147 veor d31, d28, d29 @ C[3]=A[0][3]^A[1][3]^A[2][3]^A[3][3] 1148 veor d25, d8, d9 @ C[4]=A[0][4]^A[1][4]^A[2][4]^A[3][4] 1149 veor q13, q13, q10 @ C[0..1]^=A[4][0..1] 1150 veor q14, q15, q11 @ C[2..3]^=A[4][2..3] 1151 veor d25, d25, d24 @ C[4]^=A[4][4] 1152 1153 vadd.u64 q4, q13, q13 @ C[0..1]<<1 1154 vadd.u64 q15, q14, q14 @ C[2..3]<<1 1155 vadd.u64 d18, d25, d25 @ C[4]<<1 1156 vsri.u64 q4, q13, #63 @ ROL64(C[0..1],1) 1157 vsri.u64 q15, q14, #63 @ ROL64(C[2..3],1) 1158 vsri.u64 d18, d25, #63 @ ROL64(C[4],1) 1159 veor d25, d25, d9 @ D[0] = C[4] ^= ROL64(C[1],1) 1160 veor q13, q13, q15 @ D[1..2] = C[0..1] ^ ROL64(C[2..3],1) 1161 veor d28, d28, d18 @ D[3] = C[2] ^= ROL64(C[4],1) 1162 veor d29, d29, d8 @ D[4] = C[3] ^= ROL64(C[0],1) 1163 1164 veor d0, d0, d25 @ A[0][0] ^= C[4] 1165 veor d1, d1, d25 @ A[1][0] ^= C[4] 1166 veor d10, d10, d25 @ A[2][0] ^= C[4] 1167 veor d11, d11, d25 @ A[3][0] ^= C[4] 1168 veor d20, d20, d25 @ A[4][0] ^= C[4] 1169 1170 veor d2, d2, d26 @ A[0][1] ^= D[1] 1171 veor d3, d3, d26 @ A[1][1] ^= D[1] 1172 veor d12, d12, d26 @ A[2][1] ^= D[1] 1173 veor d13, d13, d26 @ A[3][1] ^= D[1] 1174 veor d21, d21, d26 @ A[4][1] ^= D[1] 1175 vmov d26, d27 1176 1177 veor d6, d6, d28 @ A[0][3] ^= C[2] 1178 veor d7, d7, d28 @ A[1][3] ^= C[2] 1179 veor d16, d16, d28 @ A[2][3] ^= C[2] 1180 veor d17, d17, d28 @ A[3][3] ^= C[2] 1181 veor d23, d23, d28 @ A[4][3] ^= C[2] 1182 vld1.64 {q4}, [r0,:64] @ restore A[0..1][4] 1183 vmov d28, d29 1184 1185 vld1.64 {d18}, [r1,:64] @ restore A[2][4] 1186 veor q2, q2, q13 @ A[0..1][2] ^= D[2] 1187 veor q7, q7, q13 @ A[2..3][2] ^= D[2] 1188 veor d22, d22, d27 @ A[4][2] ^= D[2] 1189 1190 veor q4, q4, q14 @ A[0..1][4] ^= C[3] 1191 veor q9, q9, q14 @ A[2..3][4] ^= C[3] 1192 veor d24, d24, d29 @ A[4][4] ^= C[3] 1193 1194 @ Rho + Pi 1195 vmov d26, d2 @ C[1] = A[0][1] 1196 vshl.u64 d2, d3, #44 1197 vmov d27, d4 @ C[2] = A[0][2] 1198 vshl.u64 d4, d14, #43 1199 vmov d28, d6 @ C[3] = A[0][3] 1200 vshl.u64 d6, d17, #21 1201 vmov d29, d8 @ C[4] = A[0][4] 1202 vshl.u64 d8, d24, #14 1203 vsri.u64 d2, d3, #64-44 @ A[0][1] = ROL64(A[1][1], rhotates[1][1]) 1204 vsri.u64 d4, d14, #64-43 @ A[0][2] = ROL64(A[2][2], rhotates[2][2]) 1205 vsri.u64 d6, d17, #64-21 @ A[0][3] = ROL64(A[3][3], rhotates[3][3]) 1206 vsri.u64 d8, d24, #64-14 @ A[0][4] = ROL64(A[4][4], rhotates[4][4]) 1207 1208 vshl.u64 d3, d9, #20 1209 vshl.u64 d14, d16, #25 1210 vshl.u64 d17, d15, #15 1211 vshl.u64 d24, d21, #2 1212 vsri.u64 d3, d9, #64-20 @ A[1][1] = ROL64(A[1][4], rhotates[1][4]) 1213 vsri.u64 d14, d16, #64-25 @ A[2][2] = ROL64(A[2][3], rhotates[2][3]) 1214 vsri.u64 d17, d15, #64-15 @ A[3][3] = ROL64(A[3][2], rhotates[3][2]) 1215 vsri.u64 d24, d21, #64-2 @ A[4][4] = ROL64(A[4][1], rhotates[4][1]) 1216 1217 vshl.u64 d9, d22, #61 1218 @ vshl.u64 d16, d19, #8 1219 vshl.u64 d15, d12, #10 1220 vshl.u64 d21, d7, #55 1221 vsri.u64 d9, d22, #64-61 @ A[1][4] = ROL64(A[4][2], rhotates[4][2]) 1222 vext.8 d16, d19, d19, #8-1 @ A[2][3] = ROL64(A[3][4], rhotates[3][4]) 1223 vsri.u64 d15, d12, #64-10 @ A[3][2] = ROL64(A[2][1], rhotates[2][1]) 1224 vsri.u64 d21, d7, #64-55 @ A[4][1] = ROL64(A[1][3], rhotates[1][3]) 1225 1226 vshl.u64 d22, d18, #39 1227 @ vshl.u64 d19, d23, #56 1228 vshl.u64 d12, d5, #6 1229 vshl.u64 d7, d13, #45 1230 vsri.u64 d22, d18, #64-39 @ A[4][2] = ROL64(A[2][4], rhotates[2][4]) 1231 vext.8 d19, d23, d23, #8-7 @ A[3][4] = ROL64(A[4][3], rhotates[4][3]) 1232 vsri.u64 d12, d5, #64-6 @ A[2][1] = ROL64(A[1][2], rhotates[1][2]) 1233 vsri.u64 d7, d13, #64-45 @ A[1][3] = ROL64(A[3][1], rhotates[3][1]) 1234 1235 vshl.u64 d18, d20, #18 1236 vshl.u64 d23, d11, #41 1237 vshl.u64 d5, d10, #3 1238 vshl.u64 d13, d1, #36 1239 vsri.u64 d18, d20, #64-18 @ A[2][4] = ROL64(A[4][0], rhotates[4][0]) 1240 vsri.u64 d23, d11, #64-41 @ A[4][3] = ROL64(A[3][0], rhotates[3][0]) 1241 vsri.u64 d5, d10, #64-3 @ A[1][2] = ROL64(A[2][0], rhotates[2][0]) 1242 vsri.u64 d13, d1, #64-36 @ A[3][1] = ROL64(A[1][0], rhotates[1][0]) 1243 1244 vshl.u64 d1, d28, #28 1245 vshl.u64 d10, d26, #1 1246 vshl.u64 d11, d29, #27 1247 vshl.u64 d20, d27, #62 1248 vsri.u64 d1, d28, #64-28 @ A[1][0] = ROL64(C[3], rhotates[0][3]) 1249 vsri.u64 d10, d26, #64-1 @ A[2][0] = ROL64(C[1], rhotates[0][1]) 1250 vsri.u64 d11, d29, #64-27 @ A[3][0] = ROL64(C[4], rhotates[0][4]) 1251 vsri.u64 d20, d27, #64-62 @ A[4][0] = ROL64(C[2], rhotates[0][2]) 1252 1253 @ Chi + Iota 1254 vbic q13, q2, q1 1255 vbic q14, q3, q2 1256 vbic q15, q4, q3 1257 veor q13, q13, q0 @ A[0..1][0] ^ (~A[0..1][1] & A[0..1][2]) 1258 veor q14, q14, q1 @ A[0..1][1] ^ (~A[0..1][2] & A[0..1][3]) 1259 veor q2, q2, q15 @ A[0..1][2] ^= (~A[0..1][3] & A[0..1][4]) 1260 vst1.64 {q13}, [r0,:64] @ offload A[0..1][0] 1261 vbic q13, q0, q4 1262 vbic q15, q1, q0 1263 vmov q1, q14 @ A[0..1][1] 1264 veor q3, q3, q13 @ A[0..1][3] ^= (~A[0..1][4] & A[0..1][0]) 1265 veor q4, q4, q15 @ A[0..1][4] ^= (~A[0..1][0] & A[0..1][1]) 1266 1267 vbic q13, q7, q6 1268 vmov q0, q5 @ A[2..3][0] 1269 vbic q14, q8, q7 1270 vmov q15, q6 @ A[2..3][1] 1271 veor q5, q5, q13 @ A[2..3][0] ^= (~A[2..3][1] & A[2..3][2]) 1272 vbic q13, q9, q8 1273 veor q6, q6, q14 @ A[2..3][1] ^= (~A[2..3][2] & A[2..3][3]) 1274 vbic q14, q0, q9 1275 veor q7, q7, q13 @ A[2..3][2] ^= (~A[2..3][3] & A[2..3][4]) 1276 vbic q13, q15, q0 1277 veor q8, q8, q14 @ A[2..3][3] ^= (~A[2..3][4] & A[2..3][0]) 1278 vmov q14, q10 @ A[4][0..1] 1279 veor q9, q9, q13 @ A[2..3][4] ^= (~A[2..3][0] & A[2..3][1]) 1280 1281 vld1.64 d25, [r2,:64]! @ Iota[i++] 1282 vbic d26, d22, d21 1283 vbic d27, d23, d22 1284 vld1.64 {q0}, [r0,:64] @ restore A[0..1][0] 1285 veor d20, d20, d26 @ A[4][0] ^= (~A[4][1] & A[4][2]) 1286 vbic d26, d24, d23 1287 veor d21, d21, d27 @ A[4][1] ^= (~A[4][2] & A[4][3]) 1288 vbic d27, d28, d24 1289 veor d22, d22, d26 @ A[4][2] ^= (~A[4][3] & A[4][4]) 1290 vbic d26, d29, d28 1291 veor d23, d23, d27 @ A[4][3] ^= (~A[4][4] & A[4][0]) 1292 veor d0, d0, d25 @ A[0][0] ^= Iota[i] 1293 veor d24, d24, d26 @ A[4][4] ^= (~A[4][0] & A[4][1]) 1294 1295 subs r3, r3, #1 1296 bne .Loop_neon 1297 1298 ret 1299.size KeccakF1600_neon,.-KeccakF1600_neon 1300 1301.global SHA3_absorb_neon 1302.type SHA3_absorb_neon, %function 1303.align 5 1304SHA3_absorb_neon: 1305 stmdb sp!, {r4-r6,lr} 1306 vstmdb sp!, {d8-d15} 1307 1308 mov r4, r1 @ inp 1309 mov r5, r2 @ len 1310 mov r6, r3 @ bsz 1311 1312 vld1.32 {d0}, [r0,:64]! @ A[0][0] 1313 vld1.32 {d2}, [r0,:64]! @ A[0][1] 1314 vld1.32 {d4}, [r0,:64]! @ A[0][2] 1315 vld1.32 {d6}, [r0,:64]! @ A[0][3] 1316 vld1.32 {d8}, [r0,:64]! @ A[0][4] 1317 1318 vld1.32 {d1}, [r0,:64]! @ A[1][0] 1319 vld1.32 {d3}, [r0,:64]! @ A[1][1] 1320 vld1.32 {d5}, [r0,:64]! @ A[1][2] 1321 vld1.32 {d7}, [r0,:64]! @ A[1][3] 1322 vld1.32 {d9}, [r0,:64]! @ A[1][4] 1323 1324 vld1.32 {d10}, [r0,:64]! @ A[2][0] 1325 vld1.32 {d12}, [r0,:64]! @ A[2][1] 1326 vld1.32 {d14}, [r0,:64]! @ A[2][2] 1327 vld1.32 {d16}, [r0,:64]! @ A[2][3] 1328 vld1.32 {d18}, [r0,:64]! @ A[2][4] 1329 1330 vld1.32 {d11}, [r0,:64]! @ A[3][0] 1331 vld1.32 {d13}, [r0,:64]! @ A[3][1] 1332 vld1.32 {d15}, [r0,:64]! @ A[3][2] 1333 vld1.32 {d17}, [r0,:64]! @ A[3][3] 1334 vld1.32 {d19}, [r0,:64]! @ A[3][4] 1335 1336 vld1.32 {d20-d23}, [r0,:64]! @ A[4][0..3] 1337 vld1.32 {d24}, [r0,:64] @ A[4][4] 1338 sub r0, r0, #24*8 @ rewind 1339 b .Loop_absorb_neon 1340 1341.align 4 1342.Loop_absorb_neon: 1343 subs r12, r5, r6 @ len - bsz 1344 blo .Labsorbed_neon 1345 mov r5, r12 1346 1347 vld1.8 {d31}, [r4]! @ endian-neutral loads... 1348 cmp r6, #8*2 1349 veor d0, d0, d31 @ A[0][0] ^= *inp++ 1350 blo .Lprocess_neon 1351 vld1.8 {d31}, [r4]! 1352 veor d2, d2, d31 @ A[0][1] ^= *inp++ 1353 beq .Lprocess_neon 1354 vld1.8 {d31}, [r4]! 1355 cmp r6, #8*4 1356 veor d4, d4, d31 @ A[0][2] ^= *inp++ 1357 blo .Lprocess_neon 1358 vld1.8 {d31}, [r4]! 1359 veor d6, d6, d31 @ A[0][3] ^= *inp++ 1360 beq .Lprocess_neon 1361 vld1.8 {d31},[r4]! 1362 cmp r6, #8*6 1363 veor d8, d8, d31 @ A[0][4] ^= *inp++ 1364 blo .Lprocess_neon 1365 1366 vld1.8 {d31}, [r4]! 1367 veor d1, d1, d31 @ A[1][0] ^= *inp++ 1368 beq .Lprocess_neon 1369 vld1.8 {d31}, [r4]! 1370 cmp r6, #8*8 1371 veor d3, d3, d31 @ A[1][1] ^= *inp++ 1372 blo .Lprocess_neon 1373 vld1.8 {d31}, [r4]! 1374 veor d5, d5, d31 @ A[1][2] ^= *inp++ 1375 beq .Lprocess_neon 1376 vld1.8 {d31}, [r4]! 1377 cmp r6, #8*10 1378 veor d7, d7, d31 @ A[1][3] ^= *inp++ 1379 blo .Lprocess_neon 1380 vld1.8 {d31}, [r4]! 1381 veor d9, d9, d31 @ A[1][4] ^= *inp++ 1382 beq .Lprocess_neon 1383 1384 vld1.8 {d31}, [r4]! 1385 cmp r6, #8*12 1386 veor d10, d10, d31 @ A[2][0] ^= *inp++ 1387 blo .Lprocess_neon 1388 vld1.8 {d31}, [r4]! 1389 veor d12, d12, d31 @ A[2][1] ^= *inp++ 1390 beq .Lprocess_neon 1391 vld1.8 {d31}, [r4]! 1392 cmp r6, #8*14 1393 veor d14, d14, d31 @ A[2][2] ^= *inp++ 1394 blo .Lprocess_neon 1395 vld1.8 {d31}, [r4]! 1396 veor d16, d16, d31 @ A[2][3] ^= *inp++ 1397 beq .Lprocess_neon 1398 vld1.8 {d31}, [r4]! 1399 cmp r6, #8*16 1400 veor d18, d18, d31 @ A[2][4] ^= *inp++ 1401 blo .Lprocess_neon 1402 1403 vld1.8 {d31}, [r4]! 1404 veor d11, d11, d31 @ A[3][0] ^= *inp++ 1405 beq .Lprocess_neon 1406 vld1.8 {d31}, [r4]! 1407 cmp r6, #8*18 1408 veor d13, d13, d31 @ A[3][1] ^= *inp++ 1409 blo .Lprocess_neon 1410 vld1.8 {d31}, [r4]! 1411 veor d15, d15, d31 @ A[3][2] ^= *inp++ 1412 beq .Lprocess_neon 1413 vld1.8 {d31}, [r4]! 1414 cmp r6, #8*20 1415 veor d17, d17, d31 @ A[3][3] ^= *inp++ 1416 blo .Lprocess_neon 1417 vld1.8 {d31}, [r4]! 1418 veor d19, d19, d31 @ A[3][4] ^= *inp++ 1419 beq .Lprocess_neon 1420 1421 vld1.8 {d31}, [r4]! 1422 cmp r6, #8*22 1423 veor d20, d20, d31 @ A[4][0] ^= *inp++ 1424 blo .Lprocess_neon 1425 vld1.8 {d31}, [r4]! 1426 veor d21, d21, d31 @ A[4][1] ^= *inp++ 1427 beq .Lprocess_neon 1428 vld1.8 {d31}, [r4]! 1429 cmp r6, #8*24 1430 veor d22, d22, d31 @ A[4][2] ^= *inp++ 1431 blo .Lprocess_neon 1432 vld1.8 {d31}, [r4]! 1433 veor d23, d23, d31 @ A[4][3] ^= *inp++ 1434 beq .Lprocess_neon 1435 vld1.8 {d31}, [r4]! 1436 veor d24, d24, d31 @ A[4][4] ^= *inp++ 1437 1438.Lprocess_neon: 1439 bl KeccakF1600_neon 1440 b .Loop_absorb_neon 1441 1442.align 4 1443.Labsorbed_neon: 1444 vst1.32 {d0}, [r0,:64]! @ A[0][0..4] 1445 vst1.32 {d2}, [r0,:64]! 1446 vst1.32 {d4}, [r0,:64]! 1447 vst1.32 {d6}, [r0,:64]! 1448 vst1.32 {d8}, [r0,:64]! 1449 1450 vst1.32 {d1}, [r0,:64]! @ A[1][0..4] 1451 vst1.32 {d3}, [r0,:64]! 1452 vst1.32 {d5}, [r0,:64]! 1453 vst1.32 {d7}, [r0,:64]! 1454 vst1.32 {d9}, [r0,:64]! 1455 1456 vst1.32 {d10}, [r0,:64]! @ A[2][0..4] 1457 vst1.32 {d12}, [r0,:64]! 1458 vst1.32 {d14}, [r0,:64]! 1459 vst1.32 {d16}, [r0,:64]! 1460 vst1.32 {d18}, [r0,:64]! 1461 1462 vst1.32 {d11}, [r0,:64]! @ A[3][0..4] 1463 vst1.32 {d13}, [r0,:64]! 1464 vst1.32 {d15}, [r0,:64]! 1465 vst1.32 {d17}, [r0,:64]! 1466 vst1.32 {d19}, [r0,:64]! 1467 1468 vst1.32 {d20-d23}, [r0,:64]! @ A[4][0..4] 1469 vst1.32 {d24}, [r0,:64] 1470 1471 mov r0, r5 @ return value 1472 vldmia sp!, {d8-d15} 1473 ldmia sp!, {r4-r6,pc} 1474.size SHA3_absorb_neon,.-SHA3_absorb_neon 1475 1476.global SHA3_squeeze_neon 1477.type SHA3_squeeze_neon, %function 1478.align 5 1479SHA3_squeeze_neon: 1480 stmdb sp!, {r4-r6,lr} 1481 1482 mov r4, r1 @ out 1483 mov r5, r2 @ len 1484 mov r6, r3 @ bsz 1485 mov r12, r0 @ A_flat 1486 mov r14, r3 @ bsz 1487 b .Loop_squeeze_neon 1488 1489.align 4 1490.Loop_squeeze_neon: 1491 cmp r5, #8 1492 blo .Lsqueeze_neon_tail 1493 vld1.32 {d0}, [r12]! 1494 vst1.8 {d0}, [r4]! @ endian-neutral store 1495 1496 subs r5, r5, #8 @ len -= 8 1497 beq .Lsqueeze_neon_done 1498 1499 subs r14, r14, #8 @ bsz -= 8 1500 bhi .Loop_squeeze_neon 1501 1502 vstmdb sp!, {d8-d15} 1503 1504 vld1.32 {d0}, [r0,:64]! @ A[0][0..4] 1505 vld1.32 {d2}, [r0,:64]! 1506 vld1.32 {d4}, [r0,:64]! 1507 vld1.32 {d6}, [r0,:64]! 1508 vld1.32 {d8}, [r0,:64]! 1509 1510 vld1.32 {d1}, [r0,:64]! @ A[1][0..4] 1511 vld1.32 {d3}, [r0,:64]! 1512 vld1.32 {d5}, [r0,:64]! 1513 vld1.32 {d7}, [r0,:64]! 1514 vld1.32 {d9}, [r0,:64]! 1515 1516 vld1.32 {d10}, [r0,:64]! @ A[2][0..4] 1517 vld1.32 {d12}, [r0,:64]! 1518 vld1.32 {d14}, [r0,:64]! 1519 vld1.32 {d16}, [r0,:64]! 1520 vld1.32 {d18}, [r0,:64]! 1521 1522 vld1.32 {d11}, [r0,:64]! @ A[3][0..4] 1523 vld1.32 {d13}, [r0,:64]! 1524 vld1.32 {d15}, [r0,:64]! 1525 vld1.32 {d17}, [r0,:64]! 1526 vld1.32 {d19}, [r0,:64]! 1527 1528 vld1.32 {d20-d23}, [r0,:64]! @ A[4][0..4] 1529 vld1.32 {d24}, [r0,:64] 1530 sub r0, r0, #24*8 @ rewind 1531 1532 bl KeccakF1600_neon 1533 1534 mov r12, r0 @ A_flat 1535 vst1.32 {d0}, [r0,:64]! @ A[0][0..4] 1536 vst1.32 {d2}, [r0,:64]! 1537 vst1.32 {d4}, [r0,:64]! 1538 vst1.32 {d6}, [r0,:64]! 1539 vst1.32 {d8}, [r0,:64]! 1540 1541 vst1.32 {d1}, [r0,:64]! @ A[1][0..4] 1542 vst1.32 {d3}, [r0,:64]! 1543 vst1.32 {d5}, [r0,:64]! 1544 vst1.32 {d7}, [r0,:64]! 1545 vst1.32 {d9}, [r0,:64]! 1546 1547 vst1.32 {d10}, [r0,:64]! @ A[2][0..4] 1548 vst1.32 {d12}, [r0,:64]! 1549 vst1.32 {d14}, [r0,:64]! 1550 vst1.32 {d16}, [r0,:64]! 1551 vst1.32 {d18}, [r0,:64]! 1552 1553 vst1.32 {d11}, [r0,:64]! @ A[3][0..4] 1554 vst1.32 {d13}, [r0,:64]! 1555 vst1.32 {d15}, [r0,:64]! 1556 vst1.32 {d17}, [r0,:64]! 1557 vst1.32 {d19}, [r0,:64]! 1558 1559 vst1.32 {d20-d23}, [r0,:64]! @ A[4][0..4] 1560 mov r14, r6 @ bsz 1561 vst1.32 {d24}, [r0,:64] 1562 mov r0, r12 @ rewind 1563 1564 vldmia sp!, {d8-d15} 1565 b .Loop_squeeze_neon 1566 1567.align 4 1568.Lsqueeze_neon_tail: 1569 ldmia r12, {r2,r3} 1570 cmp r5, #2 1571 strb r2, [r4],#1 @ endian-neutral store 1572 lsr r2, r2, #8 1573 blo .Lsqueeze_neon_done 1574 strb r2, [r4], #1 1575 lsr r2, r2, #8 1576 beq .Lsqueeze_neon_done 1577 strb r2, [r4], #1 1578 lsr r2, r2, #8 1579 cmp r5, #4 1580 blo .Lsqueeze_neon_done 1581 strb r2, [r4], #1 1582 beq .Lsqueeze_neon_done 1583 1584 strb r3, [r4], #1 1585 lsr r3, r3, #8 1586 cmp r5, #6 1587 blo .Lsqueeze_neon_done 1588 strb r3, [r4], #1 1589 lsr r3, r3, #8 1590 beq .Lsqueeze_neon_done 1591 strb r3, [r4], #1 1592 1593.Lsqueeze_neon_done: 1594 ldmia sp!, {r4-r6,pc} 1595.size SHA3_squeeze_neon,.-SHA3_squeeze_neon 1596#endif 1597.asciz "Keccak-1600 absorb and squeeze for ARMv4/NEON, CRYPTOGAMS by <appro\@openssl.org>" 1598.align 2 1599___ 1600 1601{ 1602 my %ldr, %str; 1603 1604 sub ldrd { 1605 my ($mnemonic,$half,$reg,$ea) = @_; 1606 my $op = $mnemonic eq "ldr" ? \%ldr : \%str; 1607 1608 if ($half eq "l") { 1609 $$op{reg} = $reg; 1610 $$op{ea} = $ea; 1611 sprintf "#ifndef __thumb2__\n" . 1612 " %s\t%s,%s\n" . 1613 "#endif", $mnemonic,$reg,$ea; 1614 } else { 1615 sprintf "#ifndef __thumb2__\n" . 1616 " %s\t%s,%s\n" . 1617 "#else\n" . 1618 " %sd\t%s,%s,%s\n" . 1619 "#endif", $mnemonic,$reg,$ea, 1620 $mnemonic,$$op{reg},$reg,$$op{ea}; 1621 } 1622 } 1623} 1624 1625foreach (split($/,$code)) { 1626 s/\`([^\`]*)\`/eval $1/ge; 1627 1628 s/^\s+(ldr|str)\.([lh])\s+(r[0-9]+),\s*(\[.*)/ldrd($1,$2,$3,$4)/ge or 1629 s/\b(ror|ls[rl])\s+(r[0-9]+.*)#/mov $2$1#/g or 1630 s/\bret\b/bx lr/g or 1631 s/\bbx\s+lr\b/.word\t0xe12fff1e/g; # make it possible to compile with -march=armv4 1632 1633 print $_,"\n"; 1634} 1635 1636close STDOUT or die "error closing STDOUT: $!"; # enforce flush 1637