#! /usr/bin/env perl
# Copyright 2015-2020 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html


# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# ECP_NISTZ256 module for ARMv4.
#
# October 2014.
#
# Original ECP_NISTZ256 submission targeting x86_64 is detailed in
# http://eprint.iacr.org/2013/816. In the process of adaptation
# original .c module was made 32-bit savvy in order to make this
# implementation possible.
#
#			with/without -DECP_NISTZ256_ASM
# Cortex-A8		+53-170%
# Cortex-A9		+76-205%
# Cortex-A15		+100-316%
# Snapdragon S4		+66-187%
#
# Ranges denote minimum and maximum improvement coefficients depending
# on benchmark. Lower coefficients are for ECDSA sign, server-side
# operation. Keep in mind that +200% means 3x improvement.

$flavour = shift;
if ($flavour=~/\w[\w\-]*\.\w+$/) { $output=$flavour; undef $flavour; }
else { while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {} }

if ($flavour && $flavour ne "void") {
    $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
    ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
    ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
    die "can't locate arm-xlate.pl";

    open STDOUT,"| \"$^X\" $xlate $flavour $output";
} else {
    open STDOUT,">$output";
}

$code.=<<___;
#include "arm_arch.h"

.text
#if defined(__thumb2__)
.syntax	unified
.thumb
#else
.code	32
#endif
___
########################################################################
# Convert ecp_nistz256_table.c to layout expected by ecp_nistz_gather_w7
#
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
open TABLE,"<ecp_nistz256_table.c"		or
open TABLE,"<${dir}../ecp_nistz256_table.c"	or
die "failed to open ecp_nistz256_table.c:",$!;

use integer;

foreach(<TABLE>) {
	s/TOBN\(\s*(0x[0-9a-f]+),\s*(0x[0-9a-f]+)\s*\)/push @arr,hex($2),hex($1)/geo;
}
close TABLE;

# See ecp_nistz256_table.c for explanation for why it's 64*16*37.
# 64*16*37-1 is because $#arr returns the last valid index of @arr, not
# the number of elements.
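#
# For reference, the factorization spelled out in C (an illustrative
# breakdown only, assuming the usual reading of the table: 37 windows of
# 64 precomputed affine points, each point two 256-bit coordinates):
#
#	enum {
#	    P256_LIMBS        = 8,	/* 32-bit words per 256-bit value   */
#	    POINT_WORDS       = 2 * P256_LIMBS,	/* affine X and Y, 16 words */
#	    POINTS_PER_WINDOW = 64,	/* 2^(7-1) entries per w=7 window   */
#	    WINDOW_COUNT      = 37,	/* ceil(256/7) windows              */
#	    TABLE_WORDS = WINDOW_COUNT * POINTS_PER_WINDOW * POINT_WORDS
#	};
#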
80die "insane number of elements" if ($#arr != 64*16*37-1); 81 82$code.=<<___; 83.globl ecp_nistz256_precomputed 84.type ecp_nistz256_precomputed,%object 85.align 12 86ecp_nistz256_precomputed: 87___ 88######################################################################## 89# this conversion smashes P256_POINT_AFFINE by individual bytes with 90# 64 byte interval, similar to 91# 1111222233334444 92# 1234123412341234 93for(1..37) { 94 @tbl = splice(@arr,0,64*16); 95 for($i=0;$i<64;$i++) { 96 undef @line; 97 for($j=0;$j<64;$j++) { 98 push @line,(@tbl[$j*16+$i/4]>>(($i%4)*8))&0xff; 99 } 100 $code.=".byte\t"; 101 $code.=join(',',map { sprintf "0x%02x",$_} @line); 102 $code.="\n"; 103 } 104} 105$code.=<<___; 106.size ecp_nistz256_precomputed,.-ecp_nistz256_precomputed 107.align 5 108.LRR: @ 2^512 mod P precomputed for NIST P256 polynomial 109.long 0x00000003, 0x00000000, 0xffffffff, 0xfffffffb 110.long 0xfffffffe, 0xffffffff, 0xfffffffd, 0x00000004 111.Lone: 112.long 1,0,0,0,0,0,0,0 113.asciz "ECP_NISTZ256 for ARMv4, CRYPTOGAMS by <appro\@openssl.org>" 114.align 6 115___ 116 117######################################################################## 118# common register layout, note that $t2 is link register, so that if 119# internal subroutine uses $t2, then it has to offload lr... 120 121($r_ptr,$a_ptr,$b_ptr,$ff,$a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7,$t1,$t2)= 122 map("r$_",(0..12,14)); 123($t0,$t3)=($ff,$a_ptr); 124 125$code.=<<___; 126@ void ecp_nistz256_to_mont(BN_ULONG r0[8],const BN_ULONG r1[8]); 127.globl ecp_nistz256_to_mont 128.type ecp_nistz256_to_mont,%function 129ecp_nistz256_to_mont: 130 adr $b_ptr,.LRR 131 b .Lecp_nistz256_mul_mont 132.size ecp_nistz256_to_mont,.-ecp_nistz256_to_mont 133 134@ void ecp_nistz256_from_mont(BN_ULONG r0[8],const BN_ULONG r1[8]); 135.globl ecp_nistz256_from_mont 136.type ecp_nistz256_from_mont,%function 137ecp_nistz256_from_mont: 138 adr $b_ptr,.Lone 139 b .Lecp_nistz256_mul_mont 140.size ecp_nistz256_from_mont,.-ecp_nistz256_from_mont 141 142@ void ecp_nistz256_mul_by_2(BN_ULONG r0[8],const BN_ULONG r1[8]); 143.globl ecp_nistz256_mul_by_2 144.type ecp_nistz256_mul_by_2,%function 145.align 4 146ecp_nistz256_mul_by_2: 147 stmdb sp!,{r4-r12,lr} 148 bl __ecp_nistz256_mul_by_2 149#if __ARM_ARCH__>=5 || !defined(__thumb__) 150 ldmia sp!,{r4-r12,pc} 151#else 152 ldmia sp!,{r4-r12,lr} 153 bx lr @ interoperable with Thumb ISA:-) 154#endif 155.size ecp_nistz256_mul_by_2,.-ecp_nistz256_mul_by_2 156 157.type __ecp_nistz256_mul_by_2,%function 158.align 4 159__ecp_nistz256_mul_by_2: 160 ldr $a0,[$a_ptr,#0] 161 ldr $a1,[$a_ptr,#4] 162 ldr $a2,[$a_ptr,#8] 163 adds $a0,$a0,$a0 @ a[0:7]+=a[0:7], i.e. 
add with itself 164 ldr $a3,[$a_ptr,#12] 165 adcs $a1,$a1,$a1 166 ldr $a4,[$a_ptr,#16] 167 adcs $a2,$a2,$a2 168 ldr $a5,[$a_ptr,#20] 169 adcs $a3,$a3,$a3 170 ldr $a6,[$a_ptr,#24] 171 adcs $a4,$a4,$a4 172 ldr $a7,[$a_ptr,#28] 173 adcs $a5,$a5,$a5 174 adcs $a6,$a6,$a6 175 mov $ff,#0 176 adcs $a7,$a7,$a7 177 adc $ff,$ff,#0 178 179 b .Lreduce_by_sub 180.size __ecp_nistz256_mul_by_2,.-__ecp_nistz256_mul_by_2 181 182@ void ecp_nistz256_add(BN_ULONG r0[8],const BN_ULONG r1[8], 183@ const BN_ULONG r2[8]); 184.globl ecp_nistz256_add 185.type ecp_nistz256_add,%function 186.align 4 187ecp_nistz256_add: 188 stmdb sp!,{r4-r12,lr} 189 bl __ecp_nistz256_add 190#if __ARM_ARCH__>=5 || !defined(__thumb__) 191 ldmia sp!,{r4-r12,pc} 192#else 193 ldmia sp!,{r4-r12,lr} 194 bx lr @ interoperable with Thumb ISA:-) 195#endif 196.size ecp_nistz256_add,.-ecp_nistz256_add 197 198.type __ecp_nistz256_add,%function 199.align 4 200__ecp_nistz256_add: 201 str lr,[sp,#-4]! @ push lr 202 203 ldr $a0,[$a_ptr,#0] 204 ldr $a1,[$a_ptr,#4] 205 ldr $a2,[$a_ptr,#8] 206 ldr $a3,[$a_ptr,#12] 207 ldr $a4,[$a_ptr,#16] 208 ldr $t0,[$b_ptr,#0] 209 ldr $a5,[$a_ptr,#20] 210 ldr $t1,[$b_ptr,#4] 211 ldr $a6,[$a_ptr,#24] 212 ldr $t2,[$b_ptr,#8] 213 ldr $a7,[$a_ptr,#28] 214 ldr $t3,[$b_ptr,#12] 215 adds $a0,$a0,$t0 216 ldr $t0,[$b_ptr,#16] 217 adcs $a1,$a1,$t1 218 ldr $t1,[$b_ptr,#20] 219 adcs $a2,$a2,$t2 220 ldr $t2,[$b_ptr,#24] 221 adcs $a3,$a3,$t3 222 ldr $t3,[$b_ptr,#28] 223 adcs $a4,$a4,$t0 224 adcs $a5,$a5,$t1 225 adcs $a6,$a6,$t2 226 mov $ff,#0 227 adcs $a7,$a7,$t3 228 adc $ff,$ff,#0 229 ldr lr,[sp],#4 @ pop lr 230 231.Lreduce_by_sub: 232 233 @ if a+b >= modulus, subtract modulus. 234 @ 235 @ But since comparison implies subtraction, we subtract 236 @ modulus and then add it back if subtraction borrowed. 237 238 subs $a0,$a0,#-1 239 sbcs $a1,$a1,#-1 240 sbcs $a2,$a2,#-1 241 sbcs $a3,$a3,#0 242 sbcs $a4,$a4,#0 243 sbcs $a5,$a5,#0 244 sbcs $a6,$a6,#1 245 sbcs $a7,$a7,#-1 246 sbc $ff,$ff,#0 247 248 @ Note that because mod has special form, i.e. consists of 249 @ 0xffffffff, 1 and 0s, we can conditionally synthesize it by 250 @ using value of borrow as a whole or extracting single bit. 251 @ Follow $ff register... 252 253 adds $a0,$a0,$ff @ add synthesized modulus 254 adcs $a1,$a1,$ff 255 str $a0,[$r_ptr,#0] 256 adcs $a2,$a2,$ff 257 str $a1,[$r_ptr,#4] 258 adcs $a3,$a3,#0 259 str $a2,[$r_ptr,#8] 260 adcs $a4,$a4,#0 261 str $a3,[$r_ptr,#12] 262 adcs $a5,$a5,#0 263 str $a4,[$r_ptr,#16] 264 adcs $a6,$a6,$ff,lsr#31 265 str $a5,[$r_ptr,#20] 266 adcs $a7,$a7,$ff 267 str $a6,[$r_ptr,#24] 268 str $a7,[$r_ptr,#28] 269 270 mov pc,lr 271.size __ecp_nistz256_add,.-__ecp_nistz256_add 272 273@ void ecp_nistz256_mul_by_3(BN_ULONG r0[8],const BN_ULONG r1[8]); 274.globl ecp_nistz256_mul_by_3 275.type ecp_nistz256_mul_by_3,%function 276.align 4 277ecp_nistz256_mul_by_3: 278 stmdb sp!,{r4-r12,lr} 279 bl __ecp_nistz256_mul_by_3 280#if __ARM_ARCH__>=5 || !defined(__thumb__) 281 ldmia sp!,{r4-r12,pc} 282#else 283 ldmia sp!,{r4-r12,lr} 284 bx lr @ interoperable with Thumb ISA:-) 285#endif 286.size ecp_nistz256_mul_by_3,.-ecp_nistz256_mul_by_3 287 288.type __ecp_nistz256_mul_by_3,%function 289.align 4 290__ecp_nistz256_mul_by_3: 291 str lr,[sp,#-4]! @ push lr 292 293 @ As multiplication by 3 is performed as 2*n+n, below are inline 294 @ copies of __ecp_nistz256_mul_by_2 and __ecp_nistz256_add, see 295 @ corresponding subroutines for details. 
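___
########################################################################
# For reference, a standalone C sketch of the add-then-reduce-by-sub
# pattern behind .Lreduce_by_sub and the inline copy below. This is an
# illustration only (hypothetical helper names, not the actual
# ecp_nistz256.c code); the assembly synthesizes the masked modulus from
# the broadcast borrow instead of loading it from a table:
#
#	#include <stdint.h>
#
#	/* little-endian 32-bit limbs of p = 2^256-2^224+2^192+2^96-1 */
#	static const uint32_t P256[8] = {
#	    0xffffffff, 0xffffffff, 0xffffffff, 0x00000000,
#	    0x00000000, 0x00000000, 0x00000001, 0xffffffff
#	};
#
#	/* r = (a+b) mod p, for a,b < p: add, subtract p unconditionally,
#	 * then add p back if the subtraction borrowed */
#	static void p256_add_ref(uint32_t r[8], const uint32_t a[8],
#	                         const uint32_t b[8])
#	{
#	    uint32_t carry = 0, borrow = 0, mask;
#	    uint64_t t;
#	    int i;
#
#	    for (i = 0; i < 8; i++) {		/* r = a + b */
#	        t = (uint64_t)a[i] + b[i] + carry;
#	        r[i] = (uint32_t)t; carry = (uint32_t)(t >> 32);
#	    }
#	    for (i = 0; i < 8; i++) {		/* r -= p */
#	        t = (uint64_t)r[i] - P256[i] - borrow;
#	        r[i] = (uint32_t)t; borrow = (uint32_t)(t >> 63);
#	    }
#	    mask = 0U - (borrow & (carry ^ 1));	/* all-ones iff a+b < p */
#	    for (carry = 0, i = 0; i < 8; i++) {/* r += p & mask */
#	        t = (uint64_t)r[i] + (P256[i] & mask) + carry;
#	        r[i] = (uint32_t)t; carry = (uint32_t)(t >> 32);
#	    }
#	}
$code.=<<___;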
296 297 ldr $a0,[$a_ptr,#0] 298 ldr $a1,[$a_ptr,#4] 299 ldr $a2,[$a_ptr,#8] 300 adds $a0,$a0,$a0 @ a[0:7]+=a[0:7] 301 ldr $a3,[$a_ptr,#12] 302 adcs $a1,$a1,$a1 303 ldr $a4,[$a_ptr,#16] 304 adcs $a2,$a2,$a2 305 ldr $a5,[$a_ptr,#20] 306 adcs $a3,$a3,$a3 307 ldr $a6,[$a_ptr,#24] 308 adcs $a4,$a4,$a4 309 ldr $a7,[$a_ptr,#28] 310 adcs $a5,$a5,$a5 311 adcs $a6,$a6,$a6 312 mov $ff,#0 313 adcs $a7,$a7,$a7 314 adc $ff,$ff,#0 315 316 subs $a0,$a0,#-1 @ .Lreduce_by_sub but without stores 317 sbcs $a1,$a1,#-1 318 sbcs $a2,$a2,#-1 319 sbcs $a3,$a3,#0 320 sbcs $a4,$a4,#0 321 sbcs $a5,$a5,#0 322 sbcs $a6,$a6,#1 323 sbcs $a7,$a7,#-1 324 sbc $ff,$ff,#0 325 326 adds $a0,$a0,$ff @ add synthesized modulus 327 adcs $a1,$a1,$ff 328 adcs $a2,$a2,$ff 329 adcs $a3,$a3,#0 330 adcs $a4,$a4,#0 331 ldr $b_ptr,[$a_ptr,#0] 332 adcs $a5,$a5,#0 333 ldr $t1,[$a_ptr,#4] 334 adcs $a6,$a6,$ff,lsr#31 335 ldr $t2,[$a_ptr,#8] 336 adc $a7,$a7,$ff 337 338 ldr $t0,[$a_ptr,#12] 339 adds $a0,$a0,$b_ptr @ 2*a[0:7]+=a[0:7] 340 ldr $b_ptr,[$a_ptr,#16] 341 adcs $a1,$a1,$t1 342 ldr $t1,[$a_ptr,#20] 343 adcs $a2,$a2,$t2 344 ldr $t2,[$a_ptr,#24] 345 adcs $a3,$a3,$t0 346 ldr $t3,[$a_ptr,#28] 347 adcs $a4,$a4,$b_ptr 348 adcs $a5,$a5,$t1 349 adcs $a6,$a6,$t2 350 mov $ff,#0 351 adcs $a7,$a7,$t3 352 adc $ff,$ff,#0 353 ldr lr,[sp],#4 @ pop lr 354 355 b .Lreduce_by_sub 356.size ecp_nistz256_mul_by_3,.-ecp_nistz256_mul_by_3 357 358@ void ecp_nistz256_div_by_2(BN_ULONG r0[8],const BN_ULONG r1[8]); 359.globl ecp_nistz256_div_by_2 360.type ecp_nistz256_div_by_2,%function 361.align 4 362ecp_nistz256_div_by_2: 363 stmdb sp!,{r4-r12,lr} 364 bl __ecp_nistz256_div_by_2 365#if __ARM_ARCH__>=5 || !defined(__thumb__) 366 ldmia sp!,{r4-r12,pc} 367#else 368 ldmia sp!,{r4-r12,lr} 369 bx lr @ interoperable with Thumb ISA:-) 370#endif 371.size ecp_nistz256_div_by_2,.-ecp_nistz256_div_by_2 372 373.type __ecp_nistz256_div_by_2,%function 374.align 4 375__ecp_nistz256_div_by_2: 376 @ ret = (a is odd ? a+mod : a) >> 1 377 378 ldr $a0,[$a_ptr,#0] 379 ldr $a1,[$a_ptr,#4] 380 ldr $a2,[$a_ptr,#8] 381 mov $ff,$a0,lsl#31 @ place least significant bit to most 382 @ significant position, now arithmetic 383 @ right shift by 31 will produce -1 or 384 @ 0, while logical right shift 1 or 0, 385 @ this is how modulus is conditionally 386 @ synthesized in this case... 
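___
# A C sketch of the same operation (hypothetical helper names, for
# illustration only, not the actual ecp_nistz256.c code): when a is odd,
# adding p makes the sum even without changing its residue, and the
# 257-bit result is shifted right by one, the extra bit coming from the
# carry out of the addition:
#
#	#include <stdint.h>
#
#	/* little-endian 32-bit limbs of p = 2^256-2^224+2^192+2^96-1 */
#	static const uint32_t P256[8] = {
#	    0xffffffff, 0xffffffff, 0xffffffff, 0x00000000,
#	    0x00000000, 0x00000000, 0x00000001, 0xffffffff
#	};
#
#	static void p256_div_by_2_ref(uint32_t r[8], const uint32_t a[8])
#	{
#	    uint32_t mask = 0U - (a[0] & 1);	/* all-ones iff a is odd */
#	    uint32_t carry = 0;
#	    uint64_t t;
#	    int i;
#
#	    for (i = 0; i < 8; i++) {		/* r = a + (p & mask) */
#	        t = (uint64_t)a[i] + (P256[i] & mask) + carry;
#	        r[i] = (uint32_t)t; carry = (uint32_t)(t >> 32);
#	    }
#	    for (i = 0; i < 7; i++)		/* r >>= 1 */
#	        r[i] = (r[i] >> 1) | (r[i + 1] << 31);
#	    r[7] = (r[7] >> 1) | (carry << 31);	/* top bit is the carry */
#	}
$code.=<<___;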
387 ldr $a3,[$a_ptr,#12] 388 adds $a0,$a0,$ff,asr#31 389 ldr $a4,[$a_ptr,#16] 390 adcs $a1,$a1,$ff,asr#31 391 ldr $a5,[$a_ptr,#20] 392 adcs $a2,$a2,$ff,asr#31 393 ldr $a6,[$a_ptr,#24] 394 adcs $a3,$a3,#0 395 ldr $a7,[$a_ptr,#28] 396 adcs $a4,$a4,#0 397 mov $a0,$a0,lsr#1 @ a[0:7]>>=1, we can start early 398 @ because it doesn't affect flags 399 adcs $a5,$a5,#0 400 orr $a0,$a0,$a1,lsl#31 401 adcs $a6,$a6,$ff,lsr#31 402 mov $b_ptr,#0 403 adcs $a7,$a7,$ff,asr#31 404 mov $a1,$a1,lsr#1 405 adc $b_ptr,$b_ptr,#0 @ top-most carry bit from addition 406 407 orr $a1,$a1,$a2,lsl#31 408 mov $a2,$a2,lsr#1 409 str $a0,[$r_ptr,#0] 410 orr $a2,$a2,$a3,lsl#31 411 mov $a3,$a3,lsr#1 412 str $a1,[$r_ptr,#4] 413 orr $a3,$a3,$a4,lsl#31 414 mov $a4,$a4,lsr#1 415 str $a2,[$r_ptr,#8] 416 orr $a4,$a4,$a5,lsl#31 417 mov $a5,$a5,lsr#1 418 str $a3,[$r_ptr,#12] 419 orr $a5,$a5,$a6,lsl#31 420 mov $a6,$a6,lsr#1 421 str $a4,[$r_ptr,#16] 422 orr $a6,$a6,$a7,lsl#31 423 mov $a7,$a7,lsr#1 424 str $a5,[$r_ptr,#20] 425 orr $a7,$a7,$b_ptr,lsl#31 @ don't forget the top-most carry bit 426 str $a6,[$r_ptr,#24] 427 str $a7,[$r_ptr,#28] 428 429 mov pc,lr 430.size __ecp_nistz256_div_by_2,.-__ecp_nistz256_div_by_2 431 432@ void ecp_nistz256_sub(BN_ULONG r0[8],const BN_ULONG r1[8], 433@ const BN_ULONG r2[8]); 434.globl ecp_nistz256_sub 435.type ecp_nistz256_sub,%function 436.align 4 437ecp_nistz256_sub: 438 stmdb sp!,{r4-r12,lr} 439 bl __ecp_nistz256_sub 440#if __ARM_ARCH__>=5 || !defined(__thumb__) 441 ldmia sp!,{r4-r12,pc} 442#else 443 ldmia sp!,{r4-r12,lr} 444 bx lr @ interoperable with Thumb ISA:-) 445#endif 446.size ecp_nistz256_sub,.-ecp_nistz256_sub 447 448.type __ecp_nistz256_sub,%function 449.align 4 450__ecp_nistz256_sub: 451 str lr,[sp,#-4]! @ push lr 452 453 ldr $a0,[$a_ptr,#0] 454 ldr $a1,[$a_ptr,#4] 455 ldr $a2,[$a_ptr,#8] 456 ldr $a3,[$a_ptr,#12] 457 ldr $a4,[$a_ptr,#16] 458 ldr $t0,[$b_ptr,#0] 459 ldr $a5,[$a_ptr,#20] 460 ldr $t1,[$b_ptr,#4] 461 ldr $a6,[$a_ptr,#24] 462 ldr $t2,[$b_ptr,#8] 463 ldr $a7,[$a_ptr,#28] 464 ldr $t3,[$b_ptr,#12] 465 subs $a0,$a0,$t0 466 ldr $t0,[$b_ptr,#16] 467 sbcs $a1,$a1,$t1 468 ldr $t1,[$b_ptr,#20] 469 sbcs $a2,$a2,$t2 470 ldr $t2,[$b_ptr,#24] 471 sbcs $a3,$a3,$t3 472 ldr $t3,[$b_ptr,#28] 473 sbcs $a4,$a4,$t0 474 sbcs $a5,$a5,$t1 475 sbcs $a6,$a6,$t2 476 sbcs $a7,$a7,$t3 477 sbc $ff,$ff,$ff @ broadcast borrow bit 478 ldr lr,[sp],#4 @ pop lr 479 480.Lreduce_by_add: 481 482 @ if a-b borrows, add modulus. 483 @ 484 @ Note that because mod has special form, i.e. consists of 485 @ 0xffffffff, 1 and 0s, we can conditionally synthesize it by 486 @ broadcasting borrow bit to a register, $ff, and using it as 487 @ a whole or extracting single bit. 
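___
# The "special form" trick in C terms (hypothetical helper, illustration
# only): given a 0/-1 mask m, all eight limbs of p&m can be derived from
# m itself, which is exactly what the adcs sequence below does with $ff:
#
#	#include <stdint.h>
#
#	/* build p & m, where m is 0 or 0xffffffff, without loading p */
#	static void p256_synthesize(uint32_t m, uint32_t out[8])
#	{
#	    out[0] = m;		/* 0xffffffff or 0 */
#	    out[1] = m;
#	    out[2] = m;
#	    out[3] = 0;
#	    out[4] = 0;
#	    out[5] = 0;
#	    out[6] = m >> 31;	/* 1 or 0 */
#	    out[7] = m;
#	}
$code.=<<___;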
488 489 adds $a0,$a0,$ff @ add synthesized modulus 490 adcs $a1,$a1,$ff 491 str $a0,[$r_ptr,#0] 492 adcs $a2,$a2,$ff 493 str $a1,[$r_ptr,#4] 494 adcs $a3,$a3,#0 495 str $a2,[$r_ptr,#8] 496 adcs $a4,$a4,#0 497 str $a3,[$r_ptr,#12] 498 adcs $a5,$a5,#0 499 str $a4,[$r_ptr,#16] 500 adcs $a6,$a6,$ff,lsr#31 501 str $a5,[$r_ptr,#20] 502 adcs $a7,$a7,$ff 503 str $a6,[$r_ptr,#24] 504 str $a7,[$r_ptr,#28] 505 506 mov pc,lr 507.size __ecp_nistz256_sub,.-__ecp_nistz256_sub 508 509@ void ecp_nistz256_neg(BN_ULONG r0[8],const BN_ULONG r1[8]); 510.globl ecp_nistz256_neg 511.type ecp_nistz256_neg,%function 512.align 4 513ecp_nistz256_neg: 514 stmdb sp!,{r4-r12,lr} 515 bl __ecp_nistz256_neg 516#if __ARM_ARCH__>=5 || !defined(__thumb__) 517 ldmia sp!,{r4-r12,pc} 518#else 519 ldmia sp!,{r4-r12,lr} 520 bx lr @ interoperable with Thumb ISA:-) 521#endif 522.size ecp_nistz256_neg,.-ecp_nistz256_neg 523 524.type __ecp_nistz256_neg,%function 525.align 4 526__ecp_nistz256_neg: 527 ldr $a0,[$a_ptr,#0] 528 eor $ff,$ff,$ff 529 ldr $a1,[$a_ptr,#4] 530 ldr $a2,[$a_ptr,#8] 531 subs $a0,$ff,$a0 532 ldr $a3,[$a_ptr,#12] 533 sbcs $a1,$ff,$a1 534 ldr $a4,[$a_ptr,#16] 535 sbcs $a2,$ff,$a2 536 ldr $a5,[$a_ptr,#20] 537 sbcs $a3,$ff,$a3 538 ldr $a6,[$a_ptr,#24] 539 sbcs $a4,$ff,$a4 540 ldr $a7,[$a_ptr,#28] 541 sbcs $a5,$ff,$a5 542 sbcs $a6,$ff,$a6 543 sbcs $a7,$ff,$a7 544 sbc $ff,$ff,$ff 545 546 b .Lreduce_by_add 547.size __ecp_nistz256_neg,.-__ecp_nistz256_neg 548___ 549{ 550my @acc=map("r$_",(3..11)); 551my ($t0,$t1,$bj,$t2,$t3)=map("r$_",(0,1,2,12,14)); 552 553$code.=<<___; 554@ void ecp_nistz256_sqr_mont(BN_ULONG r0[8],const BN_ULONG r1[8]); 555.globl ecp_nistz256_sqr_mont 556.type ecp_nistz256_sqr_mont,%function 557.align 4 558ecp_nistz256_sqr_mont: 559 mov $b_ptr,$a_ptr 560 b .Lecp_nistz256_mul_mont 561.size ecp_nistz256_sqr_mont,.-ecp_nistz256_sqr_mont 562 563@ void ecp_nistz256_mul_mont(BN_ULONG r0[8],const BN_ULONG r1[8], 564@ const BN_ULONG r2[8]); 565.globl ecp_nistz256_mul_mont 566.type ecp_nistz256_mul_mont,%function 567.align 4 568ecp_nistz256_mul_mont: 569.Lecp_nistz256_mul_mont: 570 stmdb sp!,{r4-r12,lr} 571 bl __ecp_nistz256_mul_mont 572#if __ARM_ARCH__>=5 || !defined(__thumb__) 573 ldmia sp!,{r4-r12,pc} 574#else 575 ldmia sp!,{r4-r12,lr} 576 bx lr @ interoperable with Thumb ISA:-) 577#endif 578.size ecp_nistz256_mul_mont,.-ecp_nistz256_mul_mont 579 580.type __ecp_nistz256_mul_mont,%function 581.align 4 582__ecp_nistz256_mul_mont: 583 stmdb sp!,{r0-r2,lr} @ make a copy of arguments too 584 585 ldr $bj,[$b_ptr,#0] @ b[0] 586 ldmia $a_ptr,{@acc[1]-@acc[8]} 587 588 umull @acc[0],$t3,@acc[1],$bj @ r[0]=a[0]*b[0] 589 stmdb sp!,{$acc[1]-@acc[8]} @ copy a[0-7] to stack, so 590 @ that it can be addressed 591 @ without spending register 592 @ on address 593 umull @acc[1],$t0,@acc[2],$bj @ r[1]=a[1]*b[0] 594 umull @acc[2],$t1,@acc[3],$bj 595 adds @acc[1],@acc[1],$t3 @ accumulate high part of mult 596 umull @acc[3],$t2,@acc[4],$bj 597 adcs @acc[2],@acc[2],$t0 598 umull @acc[4],$t3,@acc[5],$bj 599 adcs @acc[3],@acc[3],$t1 600 umull @acc[5],$t0,@acc[6],$bj 601 adcs @acc[4],@acc[4],$t2 602 umull @acc[6],$t1,@acc[7],$bj 603 adcs @acc[5],@acc[5],$t3 604 umull @acc[7],$t2,@acc[8],$bj 605 adcs @acc[6],@acc[6],$t0 606 adcs @acc[7],@acc[7],$t1 607 eor $t3,$t3,$t3 @ first overflow bit is zero 608 adc @acc[8],$t2,#0 609___ 610for(my $i=1;$i<8;$i++) { 611my $t4=@acc[0]; 612 613 # Reduction iteration is normally performed by accumulating 614 # result of multiplication of modulus by "magic" digit [and 615 # omitting least significant 
word, which is guaranteed to 616 # be 0], but thanks to special form of modulus and "magic" 617 # digit being equal to least significant word, it can be 618 # performed with additions and subtractions alone. Indeed: 619 # 620 # ffff.0001.0000.0000.0000.ffff.ffff.ffff 621 # * abcd 622 # + xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.abcd 623 # 624 # Now observing that ff..ff*x = (2^n-1)*x = 2^n*x-x, we 625 # rewrite above as: 626 # 627 # xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.abcd 628 # + abcd.0000.abcd.0000.0000.abcd.0000.0000.0000 629 # - abcd.0000.0000.0000.0000.0000.0000.abcd 630 # 631 # or marking redundant operations: 632 # 633 # xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.---- 634 # + abcd.0000.abcd.0000.0000.abcd.----.----.---- 635 # - abcd.----.----.----.----.----.----.---- 636 637$code.=<<___; 638 @ multiplication-less reduction $i 639 adds @acc[3],@acc[3],@acc[0] @ r[3]+=r[0] 640 ldr $bj,[sp,#40] @ restore b_ptr 641 adcs @acc[4],@acc[4],#0 @ r[4]+=0 642 adcs @acc[5],@acc[5],#0 @ r[5]+=0 643 adcs @acc[6],@acc[6],@acc[0] @ r[6]+=r[0] 644 ldr $t1,[sp,#0] @ load a[0] 645 adcs @acc[7],@acc[7],#0 @ r[7]+=0 646 ldr $bj,[$bj,#4*$i] @ load b[i] 647 adcs @acc[8],@acc[8],@acc[0] @ r[8]+=r[0] 648 eor $t0,$t0,$t0 649 adc $t3,$t3,#0 @ overflow bit 650 subs @acc[7],@acc[7],@acc[0] @ r[7]-=r[0] 651 ldr $t2,[sp,#4] @ a[1] 652 sbcs @acc[8],@acc[8],#0 @ r[8]-=0 653 umlal @acc[1],$t0,$t1,$bj @ "r[0]"+=a[0]*b[i] 654 eor $t1,$t1,$t1 655 sbc @acc[0],$t3,#0 @ overflow bit, keep in mind 656 @ that netto result is 657 @ addition of a value which 658 @ makes underflow impossible 659 660 ldr $t3,[sp,#8] @ a[2] 661 umlal @acc[2],$t1,$t2,$bj @ "r[1]"+=a[1]*b[i] 662 str @acc[0],[sp,#36] @ temporarily offload overflow 663 eor $t2,$t2,$t2 664 ldr $t4,[sp,#12] @ a[3], $t4 is alias @acc[0] 665 umlal @acc[3],$t2,$t3,$bj @ "r[2]"+=a[2]*b[i] 666 eor $t3,$t3,$t3 667 adds @acc[2],@acc[2],$t0 @ accumulate high part of mult 668 ldr $t0,[sp,#16] @ a[4] 669 umlal @acc[4],$t3,$t4,$bj @ "r[3]"+=a[3]*b[i] 670 eor $t4,$t4,$t4 671 adcs @acc[3],@acc[3],$t1 672 ldr $t1,[sp,#20] @ a[5] 673 umlal @acc[5],$t4,$t0,$bj @ "r[4]"+=a[4]*b[i] 674 eor $t0,$t0,$t0 675 adcs @acc[4],@acc[4],$t2 676 ldr $t2,[sp,#24] @ a[6] 677 umlal @acc[6],$t0,$t1,$bj @ "r[5]"+=a[5]*b[i] 678 eor $t1,$t1,$t1 679 adcs @acc[5],@acc[5],$t3 680 ldr $t3,[sp,#28] @ a[7] 681 umlal @acc[7],$t1,$t2,$bj @ "r[6]"+=a[6]*b[i] 682 eor $t2,$t2,$t2 683 adcs @acc[6],@acc[6],$t4 684 ldr @acc[0],[sp,#36] @ restore overflow bit 685 umlal @acc[8],$t2,$t3,$bj @ "r[7]"+=a[7]*b[i] 686 eor $t3,$t3,$t3 687 adcs @acc[7],@acc[7],$t0 688 adcs @acc[8],@acc[8],$t1 689 adcs @acc[0],$acc[0],$t2 690 adc $t3,$t3,#0 @ new overflow bit 691___ 692 push(@acc,shift(@acc)); # rotate registers, so that 693 # "r[i]" becomes r[i] 694} 695$code.=<<___; 696 @ last multiplication-less reduction 697 adds @acc[3],@acc[3],@acc[0] 698 ldr $r_ptr,[sp,#32] @ restore r_ptr 699 adcs @acc[4],@acc[4],#0 700 adcs @acc[5],@acc[5],#0 701 adcs @acc[6],@acc[6],@acc[0] 702 adcs @acc[7],@acc[7],#0 703 adcs @acc[8],@acc[8],@acc[0] 704 adc $t3,$t3,#0 705 subs @acc[7],@acc[7],@acc[0] 706 sbcs @acc[8],@acc[8],#0 707 sbc @acc[0],$t3,#0 @ overflow bit 708 709 @ Final step is "if result > mod, subtract mod", but we do it 710 @ "other way around", namely subtract modulus from result 711 @ and if it borrowed, add modulus back. 
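___
# For reference, one reduction step of the loop above expressed in C
# (illustrative helper, not the actual ecp_nistz256.c code). Since
# p mod 2^32 == -1, the Montgomery "magic" factor for radix 2^32 is 1,
# so the digit to eliminate is simply the lowest accumulator word, and
# adding digit*p only touches words 0, 3, 6, 7 and 8. The real loop above
# additionally interleaves the next a[]*b[i] accumulation and carries a
# one-bit overflow between iterations:
#
#	#include <stdint.h>
#
#	static const uint32_t P256[8] = {
#	    0xffffffff, 0xffffffff, 0xffffffff, 0x00000000,
#	    0x00000000, 0x00000000, 0x00000001, 0xffffffff
#	};
#
#	/* acc is a little-endian 9-word accumulator, acc < 2^288 */
#	static void p256_reduce_step(uint32_t acc[9])
#	{
#	    uint32_t digit = acc[0];	/* acc[0] * (-p^-1 mod 2^32) == acc[0] */
#	    uint64_t t = 0;
#	    int i;
#
#	    for (i = 0; i < 8; i++) {	/* acc += digit * p */
#	        t += (uint64_t)acc[i] + (uint64_t)digit * P256[i];
#	        acc[i] = (uint32_t)t; t >>= 32;
#	    }
#	    t += acc[8];
#	    acc[8] = (uint32_t)t;		/* acc[0] is now zero */
#	    for (i = 0; i < 8; i++)		/* divide by 2^32: drop it */
#	        acc[i] = acc[i + 1];
#	    acc[8] = (uint32_t)(t >> 32);	/* keep the overflow bit on top */
#	}
$code.=<<___;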
712 713 adds @acc[1],@acc[1],#1 @ subs @acc[1],@acc[1],#-1 714 adcs @acc[2],@acc[2],#0 @ sbcs @acc[2],@acc[2],#-1 715 adcs @acc[3],@acc[3],#0 @ sbcs @acc[3],@acc[3],#-1 716 sbcs @acc[4],@acc[4],#0 717 sbcs @acc[5],@acc[5],#0 718 sbcs @acc[6],@acc[6],#0 719 sbcs @acc[7],@acc[7],#1 720 adcs @acc[8],@acc[8],#0 @ sbcs @acc[8],@acc[8],#-1 721 ldr lr,[sp,#44] @ restore lr 722 sbc @acc[0],@acc[0],#0 @ broadcast borrow bit 723 add sp,sp,#48 724 725 @ Note that because mod has special form, i.e. consists of 726 @ 0xffffffff, 1 and 0s, we can conditionally synthesize it by 727 @ broadcasting borrow bit to a register, @acc[0], and using it as 728 @ a whole or extracting single bit. 729 730 adds @acc[1],@acc[1],@acc[0] @ add modulus or zero 731 adcs @acc[2],@acc[2],@acc[0] 732 str @acc[1],[$r_ptr,#0] 733 adcs @acc[3],@acc[3],@acc[0] 734 str @acc[2],[$r_ptr,#4] 735 adcs @acc[4],@acc[4],#0 736 str @acc[3],[$r_ptr,#8] 737 adcs @acc[5],@acc[5],#0 738 str @acc[4],[$r_ptr,#12] 739 adcs @acc[6],@acc[6],#0 740 str @acc[5],[$r_ptr,#16] 741 adcs @acc[7],@acc[7],@acc[0],lsr#31 742 str @acc[6],[$r_ptr,#20] 743 adc @acc[8],@acc[8],@acc[0] 744 str @acc[7],[$r_ptr,#24] 745 str @acc[8],[$r_ptr,#28] 746 747 mov pc,lr 748.size __ecp_nistz256_mul_mont,.-__ecp_nistz256_mul_mont 749___ 750} 751 752{ 753my ($out,$inp,$index,$mask)=map("r$_",(0..3)); 754$code.=<<___; 755@ void ecp_nistz256_scatter_w5(void *r0,const P256_POINT *r1, 756@ int r2); 757.globl ecp_nistz256_scatter_w5 758.type ecp_nistz256_scatter_w5,%function 759.align 5 760ecp_nistz256_scatter_w5: 761 stmdb sp!,{r4-r11} 762 763 add $out,$out,$index,lsl#2 764 765 ldmia $inp!,{r4-r11} @ X 766 str r4,[$out,#64*0-4] 767 str r5,[$out,#64*1-4] 768 str r6,[$out,#64*2-4] 769 str r7,[$out,#64*3-4] 770 str r8,[$out,#64*4-4] 771 str r9,[$out,#64*5-4] 772 str r10,[$out,#64*6-4] 773 str r11,[$out,#64*7-4] 774 add $out,$out,#64*8 775 776 ldmia $inp!,{r4-r11} @ Y 777 str r4,[$out,#64*0-4] 778 str r5,[$out,#64*1-4] 779 str r6,[$out,#64*2-4] 780 str r7,[$out,#64*3-4] 781 str r8,[$out,#64*4-4] 782 str r9,[$out,#64*5-4] 783 str r10,[$out,#64*6-4] 784 str r11,[$out,#64*7-4] 785 add $out,$out,#64*8 786 787 ldmia $inp,{r4-r11} @ Z 788 str r4,[$out,#64*0-4] 789 str r5,[$out,#64*1-4] 790 str r6,[$out,#64*2-4] 791 str r7,[$out,#64*3-4] 792 str r8,[$out,#64*4-4] 793 str r9,[$out,#64*5-4] 794 str r10,[$out,#64*6-4] 795 str r11,[$out,#64*7-4] 796 797 ldmia sp!,{r4-r11} 798#if __ARM_ARCH__>=5 || defined(__thumb__) 799 bx lr 800#else 801 mov pc,lr 802#endif 803.size ecp_nistz256_scatter_w5,.-ecp_nistz256_scatter_w5 804 805@ void ecp_nistz256_gather_w5(P256_POINT *r0,const void *r1, 806@ int r2); 807.globl ecp_nistz256_gather_w5 808.type ecp_nistz256_gather_w5,%function 809.align 5 810ecp_nistz256_gather_w5: 811 stmdb sp!,{r4-r11} 812 813 cmp $index,#0 814 mov $mask,#0 815#ifdef __thumb2__ 816 itt ne 817#endif 818 subne $index,$index,#1 819 movne $mask,#-1 820 add $inp,$inp,$index,lsl#2 821 822 ldr r4,[$inp,#64*0] 823 ldr r5,[$inp,#64*1] 824 ldr r6,[$inp,#64*2] 825 and r4,r4,$mask 826 ldr r7,[$inp,#64*3] 827 and r5,r5,$mask 828 ldr r8,[$inp,#64*4] 829 and r6,r6,$mask 830 ldr r9,[$inp,#64*5] 831 and r7,r7,$mask 832 ldr r10,[$inp,#64*6] 833 and r8,r8,$mask 834 ldr r11,[$inp,#64*7] 835 add $inp,$inp,#64*8 836 and r9,r9,$mask 837 and r10,r10,$mask 838 and r11,r11,$mask 839 stmia $out!,{r4-r11} @ X 840 841 ldr r4,[$inp,#64*0] 842 ldr r5,[$inp,#64*1] 843 ldr r6,[$inp,#64*2] 844 and r4,r4,$mask 845 ldr r7,[$inp,#64*3] 846 and r5,r5,$mask 847 ldr r8,[$inp,#64*4] 848 and r6,r6,$mask 849 ldr 
r9,[$inp,#64*5] 850 and r7,r7,$mask 851 ldr r10,[$inp,#64*6] 852 and r8,r8,$mask 853 ldr r11,[$inp,#64*7] 854 add $inp,$inp,#64*8 855 and r9,r9,$mask 856 and r10,r10,$mask 857 and r11,r11,$mask 858 stmia $out!,{r4-r11} @ Y 859 860 ldr r4,[$inp,#64*0] 861 ldr r5,[$inp,#64*1] 862 ldr r6,[$inp,#64*2] 863 and r4,r4,$mask 864 ldr r7,[$inp,#64*3] 865 and r5,r5,$mask 866 ldr r8,[$inp,#64*4] 867 and r6,r6,$mask 868 ldr r9,[$inp,#64*5] 869 and r7,r7,$mask 870 ldr r10,[$inp,#64*6] 871 and r8,r8,$mask 872 ldr r11,[$inp,#64*7] 873 and r9,r9,$mask 874 and r10,r10,$mask 875 and r11,r11,$mask 876 stmia $out,{r4-r11} @ Z 877 878 ldmia sp!,{r4-r11} 879#if __ARM_ARCH__>=5 || defined(__thumb__) 880 bx lr 881#else 882 mov pc,lr 883#endif 884.size ecp_nistz256_gather_w5,.-ecp_nistz256_gather_w5 885 886@ void ecp_nistz256_scatter_w7(void *r0,const P256_POINT_AFFINE *r1, 887@ int r2); 888.globl ecp_nistz256_scatter_w7 889.type ecp_nistz256_scatter_w7,%function 890.align 5 891ecp_nistz256_scatter_w7: 892 add $out,$out,$index 893 mov $index,#64/4 894.Loop_scatter_w7: 895 ldr $mask,[$inp],#4 896 subs $index,$index,#1 897 strb $mask,[$out,#64*0] 898 mov $mask,$mask,lsr#8 899 strb $mask,[$out,#64*1] 900 mov $mask,$mask,lsr#8 901 strb $mask,[$out,#64*2] 902 mov $mask,$mask,lsr#8 903 strb $mask,[$out,#64*3] 904 add $out,$out,#64*4 905 bne .Loop_scatter_w7 906 907#if __ARM_ARCH__>=5 || defined(__thumb__) 908 bx lr 909#else 910 mov pc,lr 911#endif 912.size ecp_nistz256_scatter_w7,.-ecp_nistz256_scatter_w7 913 914@ void ecp_nistz256_gather_w7(P256_POINT_AFFINE *r0,const void *r1, 915@ int r2); 916.globl ecp_nistz256_gather_w7 917.type ecp_nistz256_gather_w7,%function 918.align 5 919ecp_nistz256_gather_w7: 920 stmdb sp!,{r4-r7} 921 922 cmp $index,#0 923 mov $mask,#0 924#ifdef __thumb2__ 925 itt ne 926#endif 927 subne $index,$index,#1 928 movne $mask,#-1 929 add $inp,$inp,$index 930 mov $index,#64/4 931 nop 932.Loop_gather_w7: 933 ldrb r4,[$inp,#64*0] 934 subs $index,$index,#1 935 ldrb r5,[$inp,#64*1] 936 ldrb r6,[$inp,#64*2] 937 ldrb r7,[$inp,#64*3] 938 add $inp,$inp,#64*4 939 orr r4,r4,r5,lsl#8 940 orr r4,r4,r6,lsl#16 941 orr r4,r4,r7,lsl#24 942 and r4,r4,$mask 943 str r4,[$out],#4 944 bne .Loop_gather_w7 945 946 ldmia sp!,{r4-r7} 947#if __ARM_ARCH__>=5 || defined(__thumb__) 948 bx lr 949#else 950 mov pc,lr 951#endif 952.size ecp_nistz256_gather_w7,.-ecp_nistz256_gather_w7 953___ 954} 955if (0) { 956# In comparison to integer-only equivalent of below subroutine: 957# 958# Cortex-A8 +10% 959# Cortex-A9 -10% 960# Snapdragon S4 +5% 961# 962# As not all time is spent in multiplication, overall impact is deemed 963# too low to care about. 964 965my ($A0,$A1,$A2,$A3,$Bi,$zero,$temp)=map("d$_",(0..7)); 966my $mask="q4"; 967my $mult="q5"; 968my @AxB=map("q$_",(8..15)); 969 970my ($rptr,$aptr,$bptr,$toutptr)=map("r$_",(0..3)); 971 972$code.=<<___; 973#if __ARM_ARCH__>=7 974.fpu neon 975 976.globl ecp_nistz256_mul_mont_neon 977.type ecp_nistz256_mul_mont_neon,%function 978.align 5 979ecp_nistz256_mul_mont_neon: 980 mov ip,sp 981 stmdb sp!,{r4-r9} 982 vstmdb sp!,{q4-q5} @ ABI specification says so 983 984 sub $toutptr,sp,#40 985 vld1.32 {${Bi}[0]},[$bptr,:32]! 
986 veor $zero,$zero,$zero 987 vld1.32 {$A0-$A3}, [$aptr] @ can't specify :32 :-( 988 vzip.16 $Bi,$zero 989 mov sp,$toutptr @ alloca 990 vmov.i64 $mask,#0xffff 991 992 vmull.u32 @AxB[0],$Bi,${A0}[0] 993 vmull.u32 @AxB[1],$Bi,${A0}[1] 994 vmull.u32 @AxB[2],$Bi,${A1}[0] 995 vmull.u32 @AxB[3],$Bi,${A1}[1] 996 vshr.u64 $temp,@AxB[0]#lo,#16 997 vmull.u32 @AxB[4],$Bi,${A2}[0] 998 vadd.u64 @AxB[0]#hi,@AxB[0]#hi,$temp 999 vmull.u32 @AxB[5],$Bi,${A2}[1] 1000 vshr.u64 $temp,@AxB[0]#hi,#16 @ upper 32 bits of a[0]*b[0] 1001 vmull.u32 @AxB[6],$Bi,${A3}[0] 1002 vand.u64 @AxB[0],@AxB[0],$mask @ lower 32 bits of a[0]*b[0] 1003 vmull.u32 @AxB[7],$Bi,${A3}[1] 1004___ 1005for($i=1;$i<8;$i++) { 1006$code.=<<___; 1007 vld1.32 {${Bi}[0]},[$bptr,:32]! 1008 veor $zero,$zero,$zero 1009 vadd.u64 @AxB[1]#lo,@AxB[1]#lo,$temp @ reduction 1010 vshl.u64 $mult,@AxB[0],#32 1011 vadd.u64 @AxB[3],@AxB[3],@AxB[0] 1012 vsub.u64 $mult,$mult,@AxB[0] 1013 vzip.16 $Bi,$zero 1014 vadd.u64 @AxB[6],@AxB[6],@AxB[0] 1015 vadd.u64 @AxB[7],@AxB[7],$mult 1016___ 1017 push(@AxB,shift(@AxB)); 1018$code.=<<___; 1019 vmlal.u32 @AxB[0],$Bi,${A0}[0] 1020 vmlal.u32 @AxB[1],$Bi,${A0}[1] 1021 vmlal.u32 @AxB[2],$Bi,${A1}[0] 1022 vmlal.u32 @AxB[3],$Bi,${A1}[1] 1023 vshr.u64 $temp,@AxB[0]#lo,#16 1024 vmlal.u32 @AxB[4],$Bi,${A2}[0] 1025 vadd.u64 @AxB[0]#hi,@AxB[0]#hi,$temp 1026 vmlal.u32 @AxB[5],$Bi,${A2}[1] 1027 vshr.u64 $temp,@AxB[0]#hi,#16 @ upper 33 bits of a[0]*b[i]+t[0] 1028 vmlal.u32 @AxB[6],$Bi,${A3}[0] 1029 vand.u64 @AxB[0],@AxB[0],$mask @ lower 32 bits of a[0]*b[0] 1030 vmull.u32 @AxB[7],$Bi,${A3}[1] 1031___ 1032} 1033$code.=<<___; 1034 vadd.u64 @AxB[1]#lo,@AxB[1]#lo,$temp @ last reduction 1035 vshl.u64 $mult,@AxB[0],#32 1036 vadd.u64 @AxB[3],@AxB[3],@AxB[0] 1037 vsub.u64 $mult,$mult,@AxB[0] 1038 vadd.u64 @AxB[6],@AxB[6],@AxB[0] 1039 vadd.u64 @AxB[7],@AxB[7],$mult 1040 1041 vshr.u64 $temp,@AxB[1]#lo,#16 @ convert 1042 vadd.u64 @AxB[1]#hi,@AxB[1]#hi,$temp 1043 vshr.u64 $temp,@AxB[1]#hi,#16 1044 vzip.16 @AxB[1]#lo,@AxB[1]#hi 1045___ 1046foreach (2..7) { 1047$code.=<<___; 1048 vadd.u64 @AxB[$_]#lo,@AxB[$_]#lo,$temp 1049 vst1.32 {@AxB[$_-1]#lo[0]},[$toutptr,:32]! 1050 vshr.u64 $temp,@AxB[$_]#lo,#16 1051 vadd.u64 @AxB[$_]#hi,@AxB[$_]#hi,$temp 1052 vshr.u64 $temp,@AxB[$_]#hi,#16 1053 vzip.16 @AxB[$_]#lo,@AxB[$_]#hi 1054___ 1055} 1056$code.=<<___; 1057 vst1.32 {@AxB[7]#lo[0]},[$toutptr,:32]! 
1058 vst1.32 {$temp},[$toutptr] @ upper 33 bits 1059 1060 ldr r1,[sp,#0] 1061 ldr r2,[sp,#4] 1062 ldr r3,[sp,#8] 1063 subs r1,r1,#-1 1064 ldr r4,[sp,#12] 1065 sbcs r2,r2,#-1 1066 ldr r5,[sp,#16] 1067 sbcs r3,r3,#-1 1068 ldr r6,[sp,#20] 1069 sbcs r4,r4,#0 1070 ldr r7,[sp,#24] 1071 sbcs r5,r5,#0 1072 ldr r8,[sp,#28] 1073 sbcs r6,r6,#0 1074 ldr r9,[sp,#32] @ top-most bit 1075 sbcs r7,r7,#1 1076 sub sp,ip,#40+16 1077 sbcs r8,r8,#-1 1078 sbc r9,r9,#0 1079 vldmia sp!,{q4-q5} 1080 1081 adds r1,r1,r9 1082 adcs r2,r2,r9 1083 str r1,[$rptr,#0] 1084 adcs r3,r3,r9 1085 str r2,[$rptr,#4] 1086 adcs r4,r4,#0 1087 str r3,[$rptr,#8] 1088 adcs r5,r5,#0 1089 str r4,[$rptr,#12] 1090 adcs r6,r6,#0 1091 str r5,[$rptr,#16] 1092 adcs r7,r7,r9,lsr#31 1093 str r6,[$rptr,#20] 1094 adcs r8,r8,r9 1095 str r7,[$rptr,#24] 1096 str r8,[$rptr,#28] 1097 1098 ldmia sp!,{r4-r9} 1099 bx lr 1100.size ecp_nistz256_mul_mont_neon,.-ecp_nistz256_mul_mont_neon 1101#endif 1102___ 1103} 1104 1105{{{ 1106######################################################################## 1107# Below $aN assignment matches order in which 256-bit result appears in 1108# register bank at return from __ecp_nistz256_mul_mont, so that we can 1109# skip over reloading it from memory. This means that below functions 1110# use custom calling sequence accepting 256-bit input in registers, 1111# output pointer in r0, $r_ptr, and optional pointer in r2, $b_ptr. 1112# 1113# See their "normal" counterparts for insights on calculations. 1114 1115my ($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7, 1116 $t0,$t1,$t2,$t3)=map("r$_",(11,3..10,12,14,1)); 1117my $ff=$b_ptr; 1118 1119$code.=<<___; 1120.type __ecp_nistz256_sub_from,%function 1121.align 5 1122__ecp_nistz256_sub_from: 1123 str lr,[sp,#-4]! @ push lr 1124 1125 ldr $t0,[$b_ptr,#0] 1126 ldr $t1,[$b_ptr,#4] 1127 ldr $t2,[$b_ptr,#8] 1128 ldr $t3,[$b_ptr,#12] 1129 subs $a0,$a0,$t0 1130 ldr $t0,[$b_ptr,#16] 1131 sbcs $a1,$a1,$t1 1132 ldr $t1,[$b_ptr,#20] 1133 sbcs $a2,$a2,$t2 1134 ldr $t2,[$b_ptr,#24] 1135 sbcs $a3,$a3,$t3 1136 ldr $t3,[$b_ptr,#28] 1137 sbcs $a4,$a4,$t0 1138 sbcs $a5,$a5,$t1 1139 sbcs $a6,$a6,$t2 1140 sbcs $a7,$a7,$t3 1141 sbc $ff,$ff,$ff @ broadcast borrow bit 1142 ldr lr,[sp],#4 @ pop lr 1143 1144 adds $a0,$a0,$ff @ add synthesized modulus 1145 adcs $a1,$a1,$ff 1146 str $a0,[$r_ptr,#0] 1147 adcs $a2,$a2,$ff 1148 str $a1,[$r_ptr,#4] 1149 adcs $a3,$a3,#0 1150 str $a2,[$r_ptr,#8] 1151 adcs $a4,$a4,#0 1152 str $a3,[$r_ptr,#12] 1153 adcs $a5,$a5,#0 1154 str $a4,[$r_ptr,#16] 1155 adcs $a6,$a6,$ff,lsr#31 1156 str $a5,[$r_ptr,#20] 1157 adcs $a7,$a7,$ff 1158 str $a6,[$r_ptr,#24] 1159 str $a7,[$r_ptr,#28] 1160 1161 mov pc,lr 1162.size __ecp_nistz256_sub_from,.-__ecp_nistz256_sub_from 1163 1164.type __ecp_nistz256_sub_morf,%function 1165.align 5 1166__ecp_nistz256_sub_morf: 1167 str lr,[sp,#-4]! 
@ push lr 1168 1169 ldr $t0,[$b_ptr,#0] 1170 ldr $t1,[$b_ptr,#4] 1171 ldr $t2,[$b_ptr,#8] 1172 ldr $t3,[$b_ptr,#12] 1173 subs $a0,$t0,$a0 1174 ldr $t0,[$b_ptr,#16] 1175 sbcs $a1,$t1,$a1 1176 ldr $t1,[$b_ptr,#20] 1177 sbcs $a2,$t2,$a2 1178 ldr $t2,[$b_ptr,#24] 1179 sbcs $a3,$t3,$a3 1180 ldr $t3,[$b_ptr,#28] 1181 sbcs $a4,$t0,$a4 1182 sbcs $a5,$t1,$a5 1183 sbcs $a6,$t2,$a6 1184 sbcs $a7,$t3,$a7 1185 sbc $ff,$ff,$ff @ broadcast borrow bit 1186 ldr lr,[sp],#4 @ pop lr 1187 1188 adds $a0,$a0,$ff @ add synthesized modulus 1189 adcs $a1,$a1,$ff 1190 str $a0,[$r_ptr,#0] 1191 adcs $a2,$a2,$ff 1192 str $a1,[$r_ptr,#4] 1193 adcs $a3,$a3,#0 1194 str $a2,[$r_ptr,#8] 1195 adcs $a4,$a4,#0 1196 str $a3,[$r_ptr,#12] 1197 adcs $a5,$a5,#0 1198 str $a4,[$r_ptr,#16] 1199 adcs $a6,$a6,$ff,lsr#31 1200 str $a5,[$r_ptr,#20] 1201 adcs $a7,$a7,$ff 1202 str $a6,[$r_ptr,#24] 1203 str $a7,[$r_ptr,#28] 1204 1205 mov pc,lr 1206.size __ecp_nistz256_sub_morf,.-__ecp_nistz256_sub_morf 1207 1208.type __ecp_nistz256_add_self,%function 1209.align 4 1210__ecp_nistz256_add_self: 1211 adds $a0,$a0,$a0 @ a[0:7]+=a[0:7] 1212 adcs $a1,$a1,$a1 1213 adcs $a2,$a2,$a2 1214 adcs $a3,$a3,$a3 1215 adcs $a4,$a4,$a4 1216 adcs $a5,$a5,$a5 1217 adcs $a6,$a6,$a6 1218 mov $ff,#0 1219 adcs $a7,$a7,$a7 1220 adc $ff,$ff,#0 1221 1222 @ if a+b >= modulus, subtract modulus. 1223 @ 1224 @ But since comparison implies subtraction, we subtract 1225 @ modulus and then add it back if subtraction borrowed. 1226 1227 subs $a0,$a0,#-1 1228 sbcs $a1,$a1,#-1 1229 sbcs $a2,$a2,#-1 1230 sbcs $a3,$a3,#0 1231 sbcs $a4,$a4,#0 1232 sbcs $a5,$a5,#0 1233 sbcs $a6,$a6,#1 1234 sbcs $a7,$a7,#-1 1235 sbc $ff,$ff,#0 1236 1237 @ Note that because mod has special form, i.e. consists of 1238 @ 0xffffffff, 1 and 0s, we can conditionally synthesize it by 1239 @ using value of borrow as a whole or extracting single bit. 1240 @ Follow $ff register... 1241 1242 adds $a0,$a0,$ff @ add synthesized modulus 1243 adcs $a1,$a1,$ff 1244 str $a0,[$r_ptr,#0] 1245 adcs $a2,$a2,$ff 1246 str $a1,[$r_ptr,#4] 1247 adcs $a3,$a3,#0 1248 str $a2,[$r_ptr,#8] 1249 adcs $a4,$a4,#0 1250 str $a3,[$r_ptr,#12] 1251 adcs $a5,$a5,#0 1252 str $a4,[$r_ptr,#16] 1253 adcs $a6,$a6,$ff,lsr#31 1254 str $a5,[$r_ptr,#20] 1255 adcs $a7,$a7,$ff 1256 str $a6,[$r_ptr,#24] 1257 str $a7,[$r_ptr,#28] 1258 1259 mov pc,lr 1260.size __ecp_nistz256_add_self,.-__ecp_nistz256_add_self 1261 1262___ 1263 1264######################################################################## 1265# following subroutines are "literal" implementation of those found in 1266# ecp_nistz256.c 1267# 1268######################################################################## 1269# void ecp_nistz256_point_double(P256_POINT *out,const P256_POINT *inp); 1270# 1271{ 1272my ($S,$M,$Zsqr,$in_x,$tmp0)=map(32*$_,(0..4)); 1273# above map() describes stack layout with 5 temporary 1274# 256-bit vectors on top. Then note that we push 1275# starting from r0, which means that we have copy of 1276# input arguments just below these temporary vectors. 
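# As a picture, the frame built by the prologue below (offsets are from
# sp after "sub sp,sp,#32*5"; the struct is a hypothetical illustration,
# not a type used by ecp_nistz256.c):
#
#	#include <stdint.h>
#
#	struct point_double_frame {
#	    uint32_t S[8];		/* sp+0   */
#	    uint32_t M[8];		/* sp+32  */
#	    uint32_t Zsqr[8];		/* sp+64  */
#	    uint32_t in_x[8];		/* sp+96  */
#	    uint32_t tmp0[8];		/* sp+128 */
#	    uint32_t saved[14];		/* sp+160: r0-r12,lr from stmdb */
#	};				/* [sp,#32*5]=r0, [sp,#32*5+4]=r1 */
#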
1277 1278$code.=<<___; 1279.globl ecp_nistz256_point_double 1280.type ecp_nistz256_point_double,%function 1281.align 5 1282ecp_nistz256_point_double: 1283 stmdb sp!,{r0-r12,lr} @ push from r0, unusual, but intentional 1284 sub sp,sp,#32*5 1285 1286.Lpoint_double_shortcut: 1287 add r3,sp,#$in_x 1288 ldmia $a_ptr!,{r4-r11} @ copy in_x 1289 stmia r3,{r4-r11} 1290 1291 add $r_ptr,sp,#$S 1292 bl __ecp_nistz256_mul_by_2 @ p256_mul_by_2(S, in_y); 1293 1294 add $b_ptr,$a_ptr,#32 1295 add $a_ptr,$a_ptr,#32 1296 add $r_ptr,sp,#$Zsqr 1297 bl __ecp_nistz256_mul_mont @ p256_sqr_mont(Zsqr, in_z); 1298 1299 add $a_ptr,sp,#$S 1300 add $b_ptr,sp,#$S 1301 add $r_ptr,sp,#$S 1302 bl __ecp_nistz256_mul_mont @ p256_sqr_mont(S, S); 1303 1304 ldr $b_ptr,[sp,#32*5+4] 1305 add $a_ptr,$b_ptr,#32 1306 add $b_ptr,$b_ptr,#64 1307 add $r_ptr,sp,#$tmp0 1308 bl __ecp_nistz256_mul_mont @ p256_mul_mont(tmp0, in_z, in_y); 1309 1310 ldr $r_ptr,[sp,#32*5] 1311 add $r_ptr,$r_ptr,#64 1312 bl __ecp_nistz256_add_self @ p256_mul_by_2(res_z, tmp0); 1313 1314 add $a_ptr,sp,#$in_x 1315 add $b_ptr,sp,#$Zsqr 1316 add $r_ptr,sp,#$M 1317 bl __ecp_nistz256_add @ p256_add(M, in_x, Zsqr); 1318 1319 add $a_ptr,sp,#$in_x 1320 add $b_ptr,sp,#$Zsqr 1321 add $r_ptr,sp,#$Zsqr 1322 bl __ecp_nistz256_sub @ p256_sub(Zsqr, in_x, Zsqr); 1323 1324 add $a_ptr,sp,#$S 1325 add $b_ptr,sp,#$S 1326 add $r_ptr,sp,#$tmp0 1327 bl __ecp_nistz256_mul_mont @ p256_sqr_mont(tmp0, S); 1328 1329 add $a_ptr,sp,#$Zsqr 1330 add $b_ptr,sp,#$M 1331 add $r_ptr,sp,#$M 1332 bl __ecp_nistz256_mul_mont @ p256_mul_mont(M, M, Zsqr); 1333 1334 ldr $r_ptr,[sp,#32*5] 1335 add $a_ptr,sp,#$tmp0 1336 add $r_ptr,$r_ptr,#32 1337 bl __ecp_nistz256_div_by_2 @ p256_div_by_2(res_y, tmp0); 1338 1339 add $a_ptr,sp,#$M 1340 add $r_ptr,sp,#$M 1341 bl __ecp_nistz256_mul_by_3 @ p256_mul_by_3(M, M); 1342 1343 add $a_ptr,sp,#$in_x 1344 add $b_ptr,sp,#$S 1345 add $r_ptr,sp,#$S 1346 bl __ecp_nistz256_mul_mont @ p256_mul_mont(S, S, in_x); 1347 1348 add $r_ptr,sp,#$tmp0 1349 bl __ecp_nistz256_add_self @ p256_mul_by_2(tmp0, S); 1350 1351 ldr $r_ptr,[sp,#32*5] 1352 add $a_ptr,sp,#$M 1353 add $b_ptr,sp,#$M 1354 bl __ecp_nistz256_mul_mont @ p256_sqr_mont(res_x, M); 1355 1356 add $b_ptr,sp,#$tmp0 1357 bl __ecp_nistz256_sub_from @ p256_sub(res_x, res_x, tmp0); 1358 1359 add $b_ptr,sp,#$S 1360 add $r_ptr,sp,#$S 1361 bl __ecp_nistz256_sub_morf @ p256_sub(S, S, res_x); 1362 1363 add $a_ptr,sp,#$M 1364 add $b_ptr,sp,#$S 1365 bl __ecp_nistz256_mul_mont @ p256_mul_mont(S, S, M); 1366 1367 ldr $r_ptr,[sp,#32*5] 1368 add $b_ptr,$r_ptr,#32 1369 add $r_ptr,$r_ptr,#32 1370 bl __ecp_nistz256_sub_from @ p256_sub(res_y, S, res_y); 1371 1372 add sp,sp,#32*5+16 @ +16 means "skip even over saved r0-r3" 1373#if __ARM_ARCH__>=5 || !defined(__thumb__) 1374 ldmia sp!,{r4-r12,pc} 1375#else 1376 ldmia sp!,{r4-r12,lr} 1377 bx lr @ interoperable with Thumb ISA:-) 1378#endif 1379.size ecp_nistz256_point_double,.-ecp_nistz256_point_double 1380___ 1381} 1382 1383######################################################################## 1384# void ecp_nistz256_point_add(P256_POINT *out,const P256_POINT *in1, 1385# const P256_POINT *in2); 1386{ 1387my ($res_x,$res_y,$res_z, 1388 $in1_x,$in1_y,$in1_z, 1389 $in2_x,$in2_y,$in2_z, 1390 $H,$Hsqr,$R,$Rsqr,$Hcub, 1391 $U1,$U2,$S1,$S2)=map(32*$_,(0..17)); 1392my ($Z1sqr, $Z2sqr) = ($Hsqr, $Rsqr); 1393# above map() describes stack layout with 18 temporary 1394# 256-bit vectors on top. 
Then note that we push 1395# starting from r0, which means that we have copy of 1396# input arguments just below these temporary vectors. 1397# We use three of them for ~in1infty, ~in2infty and 1398# result of check for zero. 1399 1400$code.=<<___; 1401.globl ecp_nistz256_point_add 1402.type ecp_nistz256_point_add,%function 1403.align 5 1404ecp_nistz256_point_add: 1405 stmdb sp!,{r0-r12,lr} @ push from r0, unusual, but intentional 1406 sub sp,sp,#32*18+16 1407 1408 ldmia $b_ptr!,{r4-r11} @ copy in2_x 1409 add r3,sp,#$in2_x 1410 stmia r3!,{r4-r11} 1411 ldmia $b_ptr!,{r4-r11} @ copy in2_y 1412 stmia r3!,{r4-r11} 1413 ldmia $b_ptr,{r4-r11} @ copy in2_z 1414 orr r12,r4,r5 1415 orr r12,r12,r6 1416 orr r12,r12,r7 1417 orr r12,r12,r8 1418 orr r12,r12,r9 1419 orr r12,r12,r10 1420 orr r12,r12,r11 1421 cmp r12,#0 1422#ifdef __thumb2__ 1423 it ne 1424#endif 1425 movne r12,#-1 1426 stmia r3,{r4-r11} 1427 str r12,[sp,#32*18+8] @ ~in2infty 1428 1429 ldmia $a_ptr!,{r4-r11} @ copy in1_x 1430 add r3,sp,#$in1_x 1431 stmia r3!,{r4-r11} 1432 ldmia $a_ptr!,{r4-r11} @ copy in1_y 1433 stmia r3!,{r4-r11} 1434 ldmia $a_ptr,{r4-r11} @ copy in1_z 1435 orr r12,r4,r5 1436 orr r12,r12,r6 1437 orr r12,r12,r7 1438 orr r12,r12,r8 1439 orr r12,r12,r9 1440 orr r12,r12,r10 1441 orr r12,r12,r11 1442 cmp r12,#0 1443#ifdef __thumb2__ 1444 it ne 1445#endif 1446 movne r12,#-1 1447 stmia r3,{r4-r11} 1448 str r12,[sp,#32*18+4] @ ~in1infty 1449 1450 add $a_ptr,sp,#$in2_z 1451 add $b_ptr,sp,#$in2_z 1452 add $r_ptr,sp,#$Z2sqr 1453 bl __ecp_nistz256_mul_mont @ p256_sqr_mont(Z2sqr, in2_z); 1454 1455 add $a_ptr,sp,#$in1_z 1456 add $b_ptr,sp,#$in1_z 1457 add $r_ptr,sp,#$Z1sqr 1458 bl __ecp_nistz256_mul_mont @ p256_sqr_mont(Z1sqr, in1_z); 1459 1460 add $a_ptr,sp,#$in2_z 1461 add $b_ptr,sp,#$Z2sqr 1462 add $r_ptr,sp,#$S1 1463 bl __ecp_nistz256_mul_mont @ p256_mul_mont(S1, Z2sqr, in2_z); 1464 1465 add $a_ptr,sp,#$in1_z 1466 add $b_ptr,sp,#$Z1sqr 1467 add $r_ptr,sp,#$S2 1468 bl __ecp_nistz256_mul_mont @ p256_mul_mont(S2, Z1sqr, in1_z); 1469 1470 add $a_ptr,sp,#$in1_y 1471 add $b_ptr,sp,#$S1 1472 add $r_ptr,sp,#$S1 1473 bl __ecp_nistz256_mul_mont @ p256_mul_mont(S1, S1, in1_y); 1474 1475 add $a_ptr,sp,#$in2_y 1476 add $b_ptr,sp,#$S2 1477 add $r_ptr,sp,#$S2 1478 bl __ecp_nistz256_mul_mont @ p256_mul_mont(S2, S2, in2_y); 1479 1480 add $b_ptr,sp,#$S1 1481 add $r_ptr,sp,#$R 1482 bl __ecp_nistz256_sub_from @ p256_sub(R, S2, S1); 1483 1484 orr $a0,$a0,$a1 @ see if result is zero 1485 orr $a2,$a2,$a3 1486 orr $a4,$a4,$a5 1487 orr $a0,$a0,$a2 1488 orr $a4,$a4,$a6 1489 orr $a0,$a0,$a7 1490 add $a_ptr,sp,#$in1_x 1491 orr $a0,$a0,$a4 1492 add $b_ptr,sp,#$Z2sqr 1493 str $a0,[sp,#32*18+12] 1494 1495 add $r_ptr,sp,#$U1 1496 bl __ecp_nistz256_mul_mont @ p256_mul_mont(U1, in1_x, Z2sqr); 1497 1498 add $a_ptr,sp,#$in2_x 1499 add $b_ptr,sp,#$Z1sqr 1500 add $r_ptr,sp,#$U2 1501 bl __ecp_nistz256_mul_mont @ p256_mul_mont(U2, in2_x, Z1sqr); 1502 1503 add $b_ptr,sp,#$U1 1504 add $r_ptr,sp,#$H 1505 bl __ecp_nistz256_sub_from @ p256_sub(H, U2, U1); 1506 1507 orr $a0,$a0,$a1 @ see if result is zero 1508 orr $a2,$a2,$a3 1509 orr $a4,$a4,$a5 1510 orr $a0,$a0,$a2 1511 orr $a4,$a4,$a6 1512 orr $a0,$a0,$a7 1513 orr $a0,$a0,$a4 @ ~is_equal(U1,U2) 1514 1515 ldr $t0,[sp,#32*18+4] @ ~in1infty 1516 ldr $t1,[sp,#32*18+8] @ ~in2infty 1517 ldr $t2,[sp,#32*18+12] @ ~is_equal(S1,S2) 1518 mvn $t0,$t0 @ -1/0 -> 0/-1 1519 mvn $t1,$t1 @ -1/0 -> 0/-1 1520 orr $a0,$t0 1521 orr $a0,$t1 1522 orrs $a0,$t2 @ set flags 1523 1524 @ if(~is_equal(U1,U2) | in1infty | in2infty | ~is_equal(S1,S2)) 
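___
# In C terms the dispatch above is (each argument is zero or nonzero as
# computed; hypothetical helper for illustration only):
#
#	#include <stdint.h>
#
#	/* take the .Ladd_double shortcut only when both inputs are finite
#	 * and their affine coordinates agree, i.e. U1==U2 and S1==S2 */
#	static int p256_add_must_proceed(uint32_t x_differs, uint32_t y_differs,
#	                                 uint32_t in1infty, uint32_t in2infty)
#	{
#	    return (x_differs | y_differs | in1infty | in2infty) != 0;
#	}
$code.=<<___;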
1525 bne .Ladd_proceed 1526 1527.Ladd_double: 1528 ldr $a_ptr,[sp,#32*18+20] 1529 add sp,sp,#32*(18-5)+16 @ difference in frame sizes 1530 b .Lpoint_double_shortcut 1531 1532.align 4 1533.Ladd_proceed: 1534 add $a_ptr,sp,#$R 1535 add $b_ptr,sp,#$R 1536 add $r_ptr,sp,#$Rsqr 1537 bl __ecp_nistz256_mul_mont @ p256_sqr_mont(Rsqr, R); 1538 1539 add $a_ptr,sp,#$H 1540 add $b_ptr,sp,#$in1_z 1541 add $r_ptr,sp,#$res_z 1542 bl __ecp_nistz256_mul_mont @ p256_mul_mont(res_z, H, in1_z); 1543 1544 add $a_ptr,sp,#$H 1545 add $b_ptr,sp,#$H 1546 add $r_ptr,sp,#$Hsqr 1547 bl __ecp_nistz256_mul_mont @ p256_sqr_mont(Hsqr, H); 1548 1549 add $a_ptr,sp,#$in2_z 1550 add $b_ptr,sp,#$res_z 1551 add $r_ptr,sp,#$res_z 1552 bl __ecp_nistz256_mul_mont @ p256_mul_mont(res_z, res_z, in2_z); 1553 1554 add $a_ptr,sp,#$H 1555 add $b_ptr,sp,#$Hsqr 1556 add $r_ptr,sp,#$Hcub 1557 bl __ecp_nistz256_mul_mont @ p256_mul_mont(Hcub, Hsqr, H); 1558 1559 add $a_ptr,sp,#$Hsqr 1560 add $b_ptr,sp,#$U1 1561 add $r_ptr,sp,#$U2 1562 bl __ecp_nistz256_mul_mont @ p256_mul_mont(U2, U1, Hsqr); 1563 1564 add $r_ptr,sp,#$Hsqr 1565 bl __ecp_nistz256_add_self @ p256_mul_by_2(Hsqr, U2); 1566 1567 add $b_ptr,sp,#$Rsqr 1568 add $r_ptr,sp,#$res_x 1569 bl __ecp_nistz256_sub_morf @ p256_sub(res_x, Rsqr, Hsqr); 1570 1571 add $b_ptr,sp,#$Hcub 1572 bl __ecp_nistz256_sub_from @ p256_sub(res_x, res_x, Hcub); 1573 1574 add $b_ptr,sp,#$U2 1575 add $r_ptr,sp,#$res_y 1576 bl __ecp_nistz256_sub_morf @ p256_sub(res_y, U2, res_x); 1577 1578 add $a_ptr,sp,#$Hcub 1579 add $b_ptr,sp,#$S1 1580 add $r_ptr,sp,#$S2 1581 bl __ecp_nistz256_mul_mont @ p256_mul_mont(S2, S1, Hcub); 1582 1583 add $a_ptr,sp,#$R 1584 add $b_ptr,sp,#$res_y 1585 add $r_ptr,sp,#$res_y 1586 bl __ecp_nistz256_mul_mont @ p256_mul_mont(res_y, res_y, R); 1587 1588 add $b_ptr,sp,#$S2 1589 bl __ecp_nistz256_sub_from @ p256_sub(res_y, res_y, S2); 1590 1591 ldr r11,[sp,#32*18+4] @ ~in1infty 1592 ldr r12,[sp,#32*18+8] @ ~in2infty 1593 add r1,sp,#$res_x 1594 add r2,sp,#$in2_x 1595 and r10,r11,r12 @ ~in1infty & ~in2infty 1596 mvn r11,r11 1597 add r3,sp,#$in1_x 1598 and r11,r11,r12 @ in1infty & ~in2infty 1599 mvn r12,r12 @ in2infty 1600 ldr $r_ptr,[sp,#32*18+16] 1601___ 1602for($i=0;$i<96;$i+=8) { # conditional moves 1603$code.=<<___; 1604 ldmia r1!,{r4-r5} @ res_x 1605 ldmia r2!,{r6-r7} @ in2_x 1606 ldmia r3!,{r8-r9} @ in1_x 1607 and r4,r4,r10 @ ~in1infty & ~in2infty 1608 and r5,r5,r10 1609 and r6,r6,r11 @ in1infty & ~in2infty 1610 and r7,r7,r11 1611 and r8,r8,r12 @ in2infty 1612 and r9,r9,r12 1613 orr r4,r4,r6 1614 orr r5,r5,r7 1615 orr r4,r4,r8 1616 orr r5,r5,r9 1617 stmia $r_ptr!,{r4-r5} 1618___ 1619} 1620$code.=<<___; 1621.Ladd_done: 1622 add sp,sp,#32*18+16+16 @ +16 means "skip even over saved r0-r3" 1623#if __ARM_ARCH__>=5 || !defined(__thumb__) 1624 ldmia sp!,{r4-r12,pc} 1625#else 1626 ldmia sp!,{r4-r12,lr} 1627 bx lr @ interoperable with Thumb ISA:-) 1628#endif 1629.size ecp_nistz256_point_add,.-ecp_nistz256_point_add 1630___ 1631} 1632 1633######################################################################## 1634# void ecp_nistz256_point_add_affine(P256_POINT *out,const P256_POINT *in1, 1635# const P256_POINT_AFFINE *in2); 1636{ 1637my ($res_x,$res_y,$res_z, 1638 $in1_x,$in1_y,$in1_z, 1639 $in2_x,$in2_y, 1640 $U2,$S2,$H,$R,$Hsqr,$Hcub,$Rsqr)=map(32*$_,(0..14)); 1641my $Z1sqr = $S2; 1642# above map() describes stack layout with 18 temporary 1643# 256-bit vectors on top. 
Then note that we push 1644# starting from r0, which means that we have copy of 1645# input arguments just below these temporary vectors. 1646# We use two of them for ~in1infty, ~in2infty. 1647 1648my @ONE_mont=(1,0,0,-1,-1,-1,-2,0); 1649 1650$code.=<<___; 1651.globl ecp_nistz256_point_add_affine 1652.type ecp_nistz256_point_add_affine,%function 1653.align 5 1654ecp_nistz256_point_add_affine: 1655 stmdb sp!,{r0-r12,lr} @ push from r0, unusual, but intentional 1656 sub sp,sp,#32*15 1657 1658 ldmia $a_ptr!,{r4-r11} @ copy in1_x 1659 add r3,sp,#$in1_x 1660 stmia r3!,{r4-r11} 1661 ldmia $a_ptr!,{r4-r11} @ copy in1_y 1662 stmia r3!,{r4-r11} 1663 ldmia $a_ptr,{r4-r11} @ copy in1_z 1664 orr r12,r4,r5 1665 orr r12,r12,r6 1666 orr r12,r12,r7 1667 orr r12,r12,r8 1668 orr r12,r12,r9 1669 orr r12,r12,r10 1670 orr r12,r12,r11 1671 cmp r12,#0 1672#ifdef __thumb2__ 1673 it ne 1674#endif 1675 movne r12,#-1 1676 stmia r3,{r4-r11} 1677 str r12,[sp,#32*15+4] @ ~in1infty 1678 1679 ldmia $b_ptr!,{r4-r11} @ copy in2_x 1680 add r3,sp,#$in2_x 1681 orr r12,r4,r5 1682 orr r12,r12,r6 1683 orr r12,r12,r7 1684 orr r12,r12,r8 1685 orr r12,r12,r9 1686 orr r12,r12,r10 1687 orr r12,r12,r11 1688 stmia r3!,{r4-r11} 1689 ldmia $b_ptr!,{r4-r11} @ copy in2_y 1690 orr r12,r12,r4 1691 orr r12,r12,r5 1692 orr r12,r12,r6 1693 orr r12,r12,r7 1694 orr r12,r12,r8 1695 orr r12,r12,r9 1696 orr r12,r12,r10 1697 orr r12,r12,r11 1698 stmia r3!,{r4-r11} 1699 cmp r12,#0 1700#ifdef __thumb2__ 1701 it ne 1702#endif 1703 movne r12,#-1 1704 str r12,[sp,#32*15+8] @ ~in2infty 1705 1706 add $a_ptr,sp,#$in1_z 1707 add $b_ptr,sp,#$in1_z 1708 add $r_ptr,sp,#$Z1sqr 1709 bl __ecp_nistz256_mul_mont @ p256_sqr_mont(Z1sqr, in1_z); 1710 1711 add $a_ptr,sp,#$Z1sqr 1712 add $b_ptr,sp,#$in2_x 1713 add $r_ptr,sp,#$U2 1714 bl __ecp_nistz256_mul_mont @ p256_mul_mont(U2, Z1sqr, in2_x); 1715 1716 add $b_ptr,sp,#$in1_x 1717 add $r_ptr,sp,#$H 1718 bl __ecp_nistz256_sub_from @ p256_sub(H, U2, in1_x); 1719 1720 add $a_ptr,sp,#$Z1sqr 1721 add $b_ptr,sp,#$in1_z 1722 add $r_ptr,sp,#$S2 1723 bl __ecp_nistz256_mul_mont @ p256_mul_mont(S2, Z1sqr, in1_z); 1724 1725 add $a_ptr,sp,#$H 1726 add $b_ptr,sp,#$in1_z 1727 add $r_ptr,sp,#$res_z 1728 bl __ecp_nistz256_mul_mont @ p256_mul_mont(res_z, H, in1_z); 1729 1730 add $a_ptr,sp,#$in2_y 1731 add $b_ptr,sp,#$S2 1732 add $r_ptr,sp,#$S2 1733 bl __ecp_nistz256_mul_mont @ p256_mul_mont(S2, S2, in2_y); 1734 1735 add $b_ptr,sp,#$in1_y 1736 add $r_ptr,sp,#$R 1737 bl __ecp_nistz256_sub_from @ p256_sub(R, S2, in1_y); 1738 1739 add $a_ptr,sp,#$H 1740 add $b_ptr,sp,#$H 1741 add $r_ptr,sp,#$Hsqr 1742 bl __ecp_nistz256_mul_mont @ p256_sqr_mont(Hsqr, H); 1743 1744 add $a_ptr,sp,#$R 1745 add $b_ptr,sp,#$R 1746 add $r_ptr,sp,#$Rsqr 1747 bl __ecp_nistz256_mul_mont @ p256_sqr_mont(Rsqr, R); 1748 1749 add $a_ptr,sp,#$H 1750 add $b_ptr,sp,#$Hsqr 1751 add $r_ptr,sp,#$Hcub 1752 bl __ecp_nistz256_mul_mont @ p256_mul_mont(Hcub, Hsqr, H); 1753 1754 add $a_ptr,sp,#$Hsqr 1755 add $b_ptr,sp,#$in1_x 1756 add $r_ptr,sp,#$U2 1757 bl __ecp_nistz256_mul_mont @ p256_mul_mont(U2, in1_x, Hsqr); 1758 1759 add $r_ptr,sp,#$Hsqr 1760 bl __ecp_nistz256_add_self @ p256_mul_by_2(Hsqr, U2); 1761 1762 add $b_ptr,sp,#$Rsqr 1763 add $r_ptr,sp,#$res_x 1764 bl __ecp_nistz256_sub_morf @ p256_sub(res_x, Rsqr, Hsqr); 1765 1766 add $b_ptr,sp,#$Hcub 1767 bl __ecp_nistz256_sub_from @ p256_sub(res_x, res_x, Hcub); 1768 1769 add $b_ptr,sp,#$U2 1770 add $r_ptr,sp,#$res_y 1771 bl __ecp_nistz256_sub_morf @ p256_sub(res_y, U2, res_x); 1772 1773 add $a_ptr,sp,#$Hcub 1774 add 
$b_ptr,sp,#$in1_y 1775 add $r_ptr,sp,#$S2 1776 bl __ecp_nistz256_mul_mont @ p256_mul_mont(S2, in1_y, Hcub); 1777 1778 add $a_ptr,sp,#$R 1779 add $b_ptr,sp,#$res_y 1780 add $r_ptr,sp,#$res_y 1781 bl __ecp_nistz256_mul_mont @ p256_mul_mont(res_y, res_y, R); 1782 1783 add $b_ptr,sp,#$S2 1784 bl __ecp_nistz256_sub_from @ p256_sub(res_y, res_y, S2); 1785 1786 ldr r11,[sp,#32*15+4] @ ~in1infty 1787 ldr r12,[sp,#32*15+8] @ ~in2infty 1788 add r1,sp,#$res_x 1789 add r2,sp,#$in2_x 1790 and r10,r11,r12 @ ~in1infty & ~in2infty 1791 mvn r11,r11 1792 add r3,sp,#$in1_x 1793 and r11,r11,r12 @ in1infty & ~in2infty 1794 mvn r12,r12 @ in2infty 1795 ldr $r_ptr,[sp,#32*15] 1796___ 1797for($i=0;$i<64;$i+=8) { # conditional moves 1798$code.=<<___; 1799 ldmia r1!,{r4-r5} @ res_x 1800 ldmia r2!,{r6-r7} @ in2_x 1801 ldmia r3!,{r8-r9} @ in1_x 1802 and r4,r4,r10 @ ~in1infty & ~in2infty 1803 and r5,r5,r10 1804 and r6,r6,r11 @ in1infty & ~in2infty 1805 and r7,r7,r11 1806 and r8,r8,r12 @ in2infty 1807 and r9,r9,r12 1808 orr r4,r4,r6 1809 orr r5,r5,r7 1810 orr r4,r4,r8 1811 orr r5,r5,r9 1812 stmia $r_ptr!,{r4-r5} 1813___ 1814} 1815for(;$i<96;$i+=8) { 1816my $j=($i-64)/4; 1817$code.=<<___; 1818 ldmia r1!,{r4-r5} @ res_z 1819 ldmia r3!,{r8-r9} @ in1_z 1820 and r4,r4,r10 1821 and r5,r5,r10 1822 and r6,r11,#@ONE_mont[$j] 1823 and r7,r11,#@ONE_mont[$j+1] 1824 and r8,r8,r12 1825 and r9,r9,r12 1826 orr r4,r4,r6 1827 orr r5,r5,r7 1828 orr r4,r4,r8 1829 orr r5,r5,r9 1830 stmia $r_ptr!,{r4-r5} 1831___ 1832} 1833$code.=<<___; 1834 add sp,sp,#32*15+16 @ +16 means "skip even over saved r0-r3" 1835#if __ARM_ARCH__>=5 || !defined(__thumb__) 1836 ldmia sp!,{r4-r12,pc} 1837#else 1838 ldmia sp!,{r4-r12,lr} 1839 bx lr @ interoperable with Thumb ISA:-) 1840#endif 1841.size ecp_nistz256_point_add_affine,.-ecp_nistz256_point_add_affine 1842___ 1843} }}} 1844 1845foreach (split("\n",$code)) { 1846 s/\`([^\`]*)\`/eval $1/geo; 1847 1848 s/\bq([0-9]+)#(lo|hi)/sprintf "d%d",2*$1+($2 eq "hi")/geo; 1849 1850 print $_,"\n"; 1851} 1852close STDOUT or die "error closing STDOUT: $!"; # enforce flush 1853