#! /usr/bin/env perl
# Copyright 2015-2018 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html


# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# ECP_NISTZ256 module for ARMv4.
#
# October 2014.
#
# Original ECP_NISTZ256 submission targeting x86_64 is detailed in
# http://eprint.iacr.org/2013/816. In the process of adaptation
# original .c module was made 32-bit savvy in order to make this
# implementation possible.
#
#			with/without -DECP_NISTZ256_ASM
# Cortex-A8		+53-170%
# Cortex-A9		+76-205%
# Cortex-A15		+100-316%
# Snapdragon S4		+66-187%
#
# Ranges denote minimum and maximum improvement coefficients depending
# on benchmark. Lower coefficients are for ECDSA sign, server-side
# operation. Keep in mind that +200% means 3x improvement.

$flavour = shift;
if ($flavour=~/\w[\w\-]*\.\w+$/) { $output=$flavour; undef $flavour; }
else { while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {} }

if ($flavour && $flavour ne "void") {
    $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
    ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
    ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
    die "can't locate arm-xlate.pl";

    open STDOUT,"| \"$^X\" $xlate $flavour $output";
} else {
    open STDOUT,">$output";
}

$code.=<<___;
#include "arm_arch.h"

.text
#if defined(__thumb2__)
.syntax	unified
.thumb
#else
.code	32
#endif
___
########################################################################
# Convert ecp_nistz256_table.c to layout expected by ecp_nistz_gather_w7
#
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
open TABLE,"<ecp_nistz256_table.c"		or
open TABLE,"<${dir}../ecp_nistz256_table.c"	or
die "failed to open ecp_nistz256_table.c:",$!;

use integer;

foreach(<TABLE>) {
	s/TOBN\(\s*(0x[0-9a-f]+),\s*(0x[0-9a-f]+)\s*\)/push @arr,hex($2),hex($1)/geo;
}
close TABLE;

# See ecp_nistz256_table.c for explanation for why it's 64*16*37.
# 64*16*37-1 is because $#arr returns last valid index of @arr, not
# amount of elements.
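# (That is 37 windows of 64 precomputed points, each affine point being
# 2*256 bits, i.e. 16 32-bit words.)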
die "insane number of elements" if ($#arr != 64*16*37-1);

$code.=<<___;
.globl	ecp_nistz256_precomputed
.type	ecp_nistz256_precomputed,%object
.align	12
ecp_nistz256_precomputed:
___
########################################################################
# this conversion smashes P256_POINT_AFFINE by individual bytes with
# 64 byte interval, similar to
#	1111222233334444
#	1234123412341234
for(1..37) {
	@tbl = splice(@arr,0,64*16);
	for($i=0;$i<64;$i++) {
		undef @line;
		for($j=0;$j<64;$j++) {
			push @line,(@tbl[$j*16+$i/4]>>(($i%4)*8))&0xff;
		}
		$code.=".byte\t";
		$code.=join(',',map { sprintf "0x%02x",$_} @line);
		$code.="\n";
	}
}
$code.=<<___;
.size	ecp_nistz256_precomputed,.-ecp_nistz256_precomputed
.align	5
.LRR:	@ 2^512 mod P precomputed for NIST P256 polynomial
.long	0x00000003, 0x00000000, 0xffffffff, 0xfffffffb
.long	0xfffffffe, 0xffffffff, 0xfffffffd, 0x00000004
.Lone:
.long	1,0,0,0,0,0,0,0
.asciz	"ECP_NISTZ256 for ARMv4, CRYPTOGAMS by <appro\@openssl.org>"
.align	6
___

########################################################################
# common register layout, note that $t2 is link register, so that if
# internal subroutine uses $t2, then it has to offload lr...

($r_ptr,$a_ptr,$b_ptr,$ff,$a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7,$t1,$t2)=
		map("r$_",(0..12,14));
($t0,$t3)=($ff,$a_ptr);

$code.=<<___;
@ void	ecp_nistz256_to_mont(BN_ULONG r0[8],const BN_ULONG r1[8]);
.globl	ecp_nistz256_to_mont
.type	ecp_nistz256_to_mont,%function
ecp_nistz256_to_mont:
	adr	$b_ptr,.LRR
	b	.Lecp_nistz256_mul_mont
.size	ecp_nistz256_to_mont,.-ecp_nistz256_to_mont

@ void	ecp_nistz256_from_mont(BN_ULONG r0[8],const BN_ULONG r1[8]);
.globl	ecp_nistz256_from_mont
.type	ecp_nistz256_from_mont,%function
ecp_nistz256_from_mont:
	adr	$b_ptr,.Lone
	b	.Lecp_nistz256_mul_mont
.size	ecp_nistz256_from_mont,.-ecp_nistz256_from_mont

@ void	ecp_nistz256_mul_by_2(BN_ULONG r0[8],const BN_ULONG r1[8]);
.globl	ecp_nistz256_mul_by_2
.type	ecp_nistz256_mul_by_2,%function
.align	4
ecp_nistz256_mul_by_2:
	stmdb	sp!,{r4-r12,lr}
	bl	__ecp_nistz256_mul_by_2
#if __ARM_ARCH__>=5 || !defined(__thumb__)
	ldmia	sp!,{r4-r12,pc}
#else
	ldmia	sp!,{r4-r12,lr}
	bx	lr			@ interoperable with Thumb ISA:-)
#endif
.size	ecp_nistz256_mul_by_2,.-ecp_nistz256_mul_by_2

.type	__ecp_nistz256_mul_by_2,%function
.align	4
__ecp_nistz256_mul_by_2:
	ldr	$a0,[$a_ptr,#0]
	ldr	$a1,[$a_ptr,#4]
	ldr	$a2,[$a_ptr,#8]
	adds	$a0,$a0,$a0		@ a[0:7]+=a[0:7], i.e. add with itself
	ldr	$a3,[$a_ptr,#12]
	adcs	$a1,$a1,$a1
	ldr	$a4,[$a_ptr,#16]
	adcs	$a2,$a2,$a2
	ldr	$a5,[$a_ptr,#20]
	adcs	$a3,$a3,$a3
	ldr	$a6,[$a_ptr,#24]
	adcs	$a4,$a4,$a4
	ldr	$a7,[$a_ptr,#28]
	adcs	$a5,$a5,$a5
	adcs	$a6,$a6,$a6
	mov	$ff,#0
	adcs	$a7,$a7,$a7
	adc	$ff,$ff,#0

	b	.Lreduce_by_sub
.size	__ecp_nistz256_mul_by_2,.-__ecp_nistz256_mul_by_2

@ void	ecp_nistz256_add(BN_ULONG r0[8],const BN_ULONG r1[8],
@					const BN_ULONG r2[8]);
.globl	ecp_nistz256_add
.type	ecp_nistz256_add,%function
.align	4
ecp_nistz256_add:
	stmdb	sp!,{r4-r12,lr}
	bl	__ecp_nistz256_add
#if __ARM_ARCH__>=5 || !defined(__thumb__)
	ldmia	sp!,{r4-r12,pc}
#else
	ldmia	sp!,{r4-r12,lr}
	bx	lr			@ interoperable with Thumb ISA:-)
#endif
.size	ecp_nistz256_add,.-ecp_nistz256_add

.type	__ecp_nistz256_add,%function
.align	4
__ecp_nistz256_add:
	str	lr,[sp,#-4]!		@ push lr

	ldr	$a0,[$a_ptr,#0]
	ldr	$a1,[$a_ptr,#4]
	ldr	$a2,[$a_ptr,#8]
	ldr	$a3,[$a_ptr,#12]
	ldr	$a4,[$a_ptr,#16]
	ldr	$t0,[$b_ptr,#0]
	ldr	$a5,[$a_ptr,#20]
	ldr	$t1,[$b_ptr,#4]
	ldr	$a6,[$a_ptr,#24]
	ldr	$t2,[$b_ptr,#8]
	ldr	$a7,[$a_ptr,#28]
	ldr	$t3,[$b_ptr,#12]
	adds	$a0,$a0,$t0
	ldr	$t0,[$b_ptr,#16]
	adcs	$a1,$a1,$t1
	ldr	$t1,[$b_ptr,#20]
	adcs	$a2,$a2,$t2
	ldr	$t2,[$b_ptr,#24]
	adcs	$a3,$a3,$t3
	ldr	$t3,[$b_ptr,#28]
	adcs	$a4,$a4,$t0
	adcs	$a5,$a5,$t1
	adcs	$a6,$a6,$t2
	mov	$ff,#0
	adcs	$a7,$a7,$t3
	adc	$ff,$ff,#0
	ldr	lr,[sp],#4		@ pop lr

.Lreduce_by_sub:

	@ if a+b >= modulus, subtract modulus.
	@
	@ But since comparison implies subtraction, we subtract
	@ modulus and then add it back if subtraction borrowed.

	subs	$a0,$a0,#-1
	sbcs	$a1,$a1,#-1
	sbcs	$a2,$a2,#-1
	sbcs	$a3,$a3,#0
	sbcs	$a4,$a4,#0
	sbcs	$a5,$a5,#0
	sbcs	$a6,$a6,#1
	sbcs	$a7,$a7,#-1
	sbc	$ff,$ff,#0

	@ Note that because mod has special form, i.e. consists of
	@ 0xffffffff, 1 and 0s, we can conditionally synthesize it by
	@ using value of borrow as a whole or extracting single bit.
	@ Follow $ff register...

	adds	$a0,$a0,$ff		@ add synthesized modulus
	adcs	$a1,$a1,$ff
	str	$a0,[$r_ptr,#0]
	adcs	$a2,$a2,$ff
	str	$a1,[$r_ptr,#4]
	adcs	$a3,$a3,#0
	str	$a2,[$r_ptr,#8]
	adcs	$a4,$a4,#0
	str	$a3,[$r_ptr,#12]
	adcs	$a5,$a5,#0
	str	$a4,[$r_ptr,#16]
	adcs	$a6,$a6,$ff,lsr#31
	str	$a5,[$r_ptr,#20]
	adcs	$a7,$a7,$ff
	str	$a6,[$r_ptr,#24]
	str	$a7,[$r_ptr,#28]

	mov	pc,lr
.size	__ecp_nistz256_add,.-__ecp_nistz256_add

@ void	ecp_nistz256_mul_by_3(BN_ULONG r0[8],const BN_ULONG r1[8]);
.globl	ecp_nistz256_mul_by_3
.type	ecp_nistz256_mul_by_3,%function
.align	4
ecp_nistz256_mul_by_3:
	stmdb	sp!,{r4-r12,lr}
	bl	__ecp_nistz256_mul_by_3
#if __ARM_ARCH__>=5 || !defined(__thumb__)
	ldmia	sp!,{r4-r12,pc}
#else
	ldmia	sp!,{r4-r12,lr}
	bx	lr			@ interoperable with Thumb ISA:-)
#endif
.size	ecp_nistz256_mul_by_3,.-ecp_nistz256_mul_by_3

.type	__ecp_nistz256_mul_by_3,%function
.align	4
__ecp_nistz256_mul_by_3:
	str	lr,[sp,#-4]!		@ push lr

	@ As multiplication by 3 is performed as 2*n+n, below are inline
	@ copies of __ecp_nistz256_mul_by_2 and __ecp_nistz256_add, see
	@ corresponding subroutines for details.

	ldr	$a0,[$a_ptr,#0]
	ldr	$a1,[$a_ptr,#4]
	ldr	$a2,[$a_ptr,#8]
	adds	$a0,$a0,$a0		@ a[0:7]+=a[0:7]
	ldr	$a3,[$a_ptr,#12]
	adcs	$a1,$a1,$a1
	ldr	$a4,[$a_ptr,#16]
	adcs	$a2,$a2,$a2
	ldr	$a5,[$a_ptr,#20]
	adcs	$a3,$a3,$a3
	ldr	$a6,[$a_ptr,#24]
	adcs	$a4,$a4,$a4
	ldr	$a7,[$a_ptr,#28]
	adcs	$a5,$a5,$a5
	adcs	$a6,$a6,$a6
	mov	$ff,#0
	adcs	$a7,$a7,$a7
	adc	$ff,$ff,#0

	subs	$a0,$a0,#-1		@ .Lreduce_by_sub but without stores
	sbcs	$a1,$a1,#-1
	sbcs	$a2,$a2,#-1
	sbcs	$a3,$a3,#0
	sbcs	$a4,$a4,#0
	sbcs	$a5,$a5,#0
	sbcs	$a6,$a6,#1
	sbcs	$a7,$a7,#-1
	sbc	$ff,$ff,#0

	adds	$a0,$a0,$ff		@ add synthesized modulus
	adcs	$a1,$a1,$ff
	adcs	$a2,$a2,$ff
	adcs	$a3,$a3,#0
	adcs	$a4,$a4,#0
	ldr	$b_ptr,[$a_ptr,#0]
	adcs	$a5,$a5,#0
	ldr	$t1,[$a_ptr,#4]
	adcs	$a6,$a6,$ff,lsr#31
	ldr	$t2,[$a_ptr,#8]
	adc	$a7,$a7,$ff

	ldr	$t0,[$a_ptr,#12]
	adds	$a0,$a0,$b_ptr		@ 2*a[0:7]+=a[0:7]
	ldr	$b_ptr,[$a_ptr,#16]
	adcs	$a1,$a1,$t1
	ldr	$t1,[$a_ptr,#20]
	adcs	$a2,$a2,$t2
	ldr	$t2,[$a_ptr,#24]
	adcs	$a3,$a3,$t0
	ldr	$t3,[$a_ptr,#28]
	adcs	$a4,$a4,$b_ptr
	adcs	$a5,$a5,$t1
	adcs	$a6,$a6,$t2
	mov	$ff,#0
	adcs	$a7,$a7,$t3
	adc	$ff,$ff,#0
	ldr	lr,[sp],#4		@ pop lr

	b	.Lreduce_by_sub
.size	ecp_nistz256_mul_by_3,.-ecp_nistz256_mul_by_3

@ void	ecp_nistz256_div_by_2(BN_ULONG r0[8],const BN_ULONG r1[8]);
.globl	ecp_nistz256_div_by_2
.type	ecp_nistz256_div_by_2,%function
.align	4
ecp_nistz256_div_by_2:
	stmdb	sp!,{r4-r12,lr}
	bl	__ecp_nistz256_div_by_2
#if __ARM_ARCH__>=5 || !defined(__thumb__)
	ldmia	sp!,{r4-r12,pc}
#else
	ldmia	sp!,{r4-r12,lr}
	bx	lr			@ interoperable with Thumb ISA:-)
#endif
.size	ecp_nistz256_div_by_2,.-ecp_nistz256_div_by_2

.type	__ecp_nistz256_div_by_2,%function
.align	4
__ecp_nistz256_div_by_2:
	@ ret = (a is odd ? a+mod : a) >> 1

	ldr	$a0,[$a_ptr,#0]
	ldr	$a1,[$a_ptr,#4]
	ldr	$a2,[$a_ptr,#8]
	mov	$ff,$a0,lsl#31		@ place least significant bit to most
					@ significant position, now arithmetic
					@ right shift by 31 will produce -1 or
					@ 0, while logical right shift 1 or 0,
					@ this is how modulus is conditionally
					@ synthesized in this case...
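					@ (since the modulus is odd, exactly
					@ one of a and a+mod is even, so the
					@ shifted sum, with its 33rd bit kept,
					@ is a/2 mod p whenever a is odd)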
	ldr	$a3,[$a_ptr,#12]
	adds	$a0,$a0,$ff,asr#31
	ldr	$a4,[$a_ptr,#16]
	adcs	$a1,$a1,$ff,asr#31
	ldr	$a5,[$a_ptr,#20]
	adcs	$a2,$a2,$ff,asr#31
	ldr	$a6,[$a_ptr,#24]
	adcs	$a3,$a3,#0
	ldr	$a7,[$a_ptr,#28]
	adcs	$a4,$a4,#0
	mov	$a0,$a0,lsr#1		@ a[0:7]>>=1, we can start early
					@ because it doesn't affect flags
	adcs	$a5,$a5,#0
	orr	$a0,$a0,$a1,lsl#31
	adcs	$a6,$a6,$ff,lsr#31
	mov	$b_ptr,#0
	adcs	$a7,$a7,$ff,asr#31
	mov	$a1,$a1,lsr#1
	adc	$b_ptr,$b_ptr,#0	@ top-most carry bit from addition

	orr	$a1,$a1,$a2,lsl#31
	mov	$a2,$a2,lsr#1
	str	$a0,[$r_ptr,#0]
	orr	$a2,$a2,$a3,lsl#31
	mov	$a3,$a3,lsr#1
	str	$a1,[$r_ptr,#4]
	orr	$a3,$a3,$a4,lsl#31
	mov	$a4,$a4,lsr#1
	str	$a2,[$r_ptr,#8]
	orr	$a4,$a4,$a5,lsl#31
	mov	$a5,$a5,lsr#1
	str	$a3,[$r_ptr,#12]
	orr	$a5,$a5,$a6,lsl#31
	mov	$a6,$a6,lsr#1
	str	$a4,[$r_ptr,#16]
	orr	$a6,$a6,$a7,lsl#31
	mov	$a7,$a7,lsr#1
	str	$a5,[$r_ptr,#20]
	orr	$a7,$a7,$b_ptr,lsl#31	@ don't forget the top-most carry bit
	str	$a6,[$r_ptr,#24]
	str	$a7,[$r_ptr,#28]

	mov	pc,lr
.size	__ecp_nistz256_div_by_2,.-__ecp_nistz256_div_by_2

@ void	ecp_nistz256_sub(BN_ULONG r0[8],const BN_ULONG r1[8],
@					const BN_ULONG r2[8]);
.globl	ecp_nistz256_sub
.type	ecp_nistz256_sub,%function
.align	4
ecp_nistz256_sub:
	stmdb	sp!,{r4-r12,lr}
	bl	__ecp_nistz256_sub
#if __ARM_ARCH__>=5 || !defined(__thumb__)
	ldmia	sp!,{r4-r12,pc}
#else
	ldmia	sp!,{r4-r12,lr}
	bx	lr			@ interoperable with Thumb ISA:-)
#endif
.size	ecp_nistz256_sub,.-ecp_nistz256_sub

.type	__ecp_nistz256_sub,%function
.align	4
__ecp_nistz256_sub:
	str	lr,[sp,#-4]!		@ push lr

	ldr	$a0,[$a_ptr,#0]
	ldr	$a1,[$a_ptr,#4]
	ldr	$a2,[$a_ptr,#8]
	ldr	$a3,[$a_ptr,#12]
	ldr	$a4,[$a_ptr,#16]
	ldr	$t0,[$b_ptr,#0]
	ldr	$a5,[$a_ptr,#20]
	ldr	$t1,[$b_ptr,#4]
	ldr	$a6,[$a_ptr,#24]
	ldr	$t2,[$b_ptr,#8]
	ldr	$a7,[$a_ptr,#28]
	ldr	$t3,[$b_ptr,#12]
	subs	$a0,$a0,$t0
	ldr	$t0,[$b_ptr,#16]
	sbcs	$a1,$a1,$t1
	ldr	$t1,[$b_ptr,#20]
	sbcs	$a2,$a2,$t2
	ldr	$t2,[$b_ptr,#24]
	sbcs	$a3,$a3,$t3
	ldr	$t3,[$b_ptr,#28]
	sbcs	$a4,$a4,$t0
	sbcs	$a5,$a5,$t1
	sbcs	$a6,$a6,$t2
	sbcs	$a7,$a7,$t3
	sbc	$ff,$ff,$ff		@ broadcast borrow bit
	ldr	lr,[sp],#4		@ pop lr

.Lreduce_by_add:

	@ if a-b borrows, add modulus.
	@
	@ Note that because mod has special form, i.e. consists of
	@ 0xffffffff, 1 and 0s, we can conditionally synthesize it by
	@ broadcasting borrow bit to a register, $ff, and using it as
	@ a whole or extracting single bit.
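	@ (modulus words, least to most significant, are 0xffffffff,
	@ 0xffffffff, 0xffffffff, 0, 0, 0, 1, 0xffffffff, hence the
	@ $ff, $ff, $ff, 0, 0, 0, $ff>>31, $ff pattern below)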

	adds	$a0,$a0,$ff		@ add synthesized modulus
	adcs	$a1,$a1,$ff
	str	$a0,[$r_ptr,#0]
	adcs	$a2,$a2,$ff
	str	$a1,[$r_ptr,#4]
	adcs	$a3,$a3,#0
	str	$a2,[$r_ptr,#8]
	adcs	$a4,$a4,#0
	str	$a3,[$r_ptr,#12]
	adcs	$a5,$a5,#0
	str	$a4,[$r_ptr,#16]
	adcs	$a6,$a6,$ff,lsr#31
	str	$a5,[$r_ptr,#20]
	adcs	$a7,$a7,$ff
	str	$a6,[$r_ptr,#24]
	str	$a7,[$r_ptr,#28]

	mov	pc,lr
.size	__ecp_nistz256_sub,.-__ecp_nistz256_sub

@ void	ecp_nistz256_neg(BN_ULONG r0[8],const BN_ULONG r1[8]);
.globl	ecp_nistz256_neg
.type	ecp_nistz256_neg,%function
.align	4
ecp_nistz256_neg:
	stmdb	sp!,{r4-r12,lr}
	bl	__ecp_nistz256_neg
#if __ARM_ARCH__>=5 || !defined(__thumb__)
	ldmia	sp!,{r4-r12,pc}
#else
	ldmia	sp!,{r4-r12,lr}
	bx	lr			@ interoperable with Thumb ISA:-)
#endif
.size	ecp_nistz256_neg,.-ecp_nistz256_neg

.type	__ecp_nistz256_neg,%function
.align	4
__ecp_nistz256_neg:
	ldr	$a0,[$a_ptr,#0]
	eor	$ff,$ff,$ff
	ldr	$a1,[$a_ptr,#4]
	ldr	$a2,[$a_ptr,#8]
	subs	$a0,$ff,$a0
	ldr	$a3,[$a_ptr,#12]
	sbcs	$a1,$ff,$a1
	ldr	$a4,[$a_ptr,#16]
	sbcs	$a2,$ff,$a2
	ldr	$a5,[$a_ptr,#20]
	sbcs	$a3,$ff,$a3
	ldr	$a6,[$a_ptr,#24]
	sbcs	$a4,$ff,$a4
	ldr	$a7,[$a_ptr,#28]
	sbcs	$a5,$ff,$a5
	sbcs	$a6,$ff,$a6
	sbcs	$a7,$ff,$a7
	sbc	$ff,$ff,$ff

	b	.Lreduce_by_add
.size	__ecp_nistz256_neg,.-__ecp_nistz256_neg
___
{
my @acc=map("r$_",(3..11));
my ($t0,$t1,$bj,$t2,$t3)=map("r$_",(0,1,2,12,14));

$code.=<<___;
@ void	ecp_nistz256_sqr_mont(BN_ULONG r0[8],const BN_ULONG r1[8]);
.globl	ecp_nistz256_sqr_mont
.type	ecp_nistz256_sqr_mont,%function
.align	4
ecp_nistz256_sqr_mont:
	mov	$b_ptr,$a_ptr
	b	.Lecp_nistz256_mul_mont
.size	ecp_nistz256_sqr_mont,.-ecp_nistz256_sqr_mont

@ void	ecp_nistz256_mul_mont(BN_ULONG r0[8],const BN_ULONG r1[8],
@					     const BN_ULONG r2[8]);
.globl	ecp_nistz256_mul_mont
.type	ecp_nistz256_mul_mont,%function
.align	4
ecp_nistz256_mul_mont:
.Lecp_nistz256_mul_mont:
	stmdb	sp!,{r4-r12,lr}
	bl	__ecp_nistz256_mul_mont
#if __ARM_ARCH__>=5 || !defined(__thumb__)
	ldmia	sp!,{r4-r12,pc}
#else
	ldmia	sp!,{r4-r12,lr}
	bx	lr			@ interoperable with Thumb ISA:-)
#endif
.size	ecp_nistz256_mul_mont,.-ecp_nistz256_mul_mont

.type	__ecp_nistz256_mul_mont,%function
.align	4
__ecp_nistz256_mul_mont:
	stmdb	sp!,{r0-r2,lr}		@ make a copy of arguments too

	ldr	$bj,[$b_ptr,#0]		@ b[0]
	ldmia	$a_ptr,{@acc[1]-@acc[8]}

	umull	@acc[0],$t3,@acc[1],$bj	@ r[0]=a[0]*b[0]
	stmdb	sp!,{$acc[1]-@acc[8]}	@ copy a[0-7] to stack, so
					@ that it can be addressed
					@ without spending register
					@ on address
	umull	@acc[1],$t0,@acc[2],$bj	@ r[1]=a[1]*b[0]
	umull	@acc[2],$t1,@acc[3],$bj
	adds	@acc[1],@acc[1],$t3	@ accumulate high part of mult
	umull	@acc[3],$t2,@acc[4],$bj
	adcs	@acc[2],@acc[2],$t0
	umull	@acc[4],$t3,@acc[5],$bj
	adcs	@acc[3],@acc[3],$t1
	umull	@acc[5],$t0,@acc[6],$bj
	adcs	@acc[4],@acc[4],$t2
	umull	@acc[6],$t1,@acc[7],$bj
	adcs	@acc[5],@acc[5],$t3
	umull	@acc[7],$t2,@acc[8],$bj
	adcs	@acc[6],@acc[6],$t0
	adcs	@acc[7],@acc[7],$t1
	eor	$t3,$t3,$t3		@ first overflow bit is zero
	adc	@acc[8],$t2,#0
___
for(my $i=1;$i<8;$i++) {
my $t4=@acc[0];

	# Reduction iteration is normally performed by accumulating
	# result of multiplication of modulus by "magic" digit [and
	# omitting least significant word, which is guaranteed to
	# be 0], but thanks to special form of modulus and "magic"
	# digit being equal to least significant word, it can be
	# performed with additions and subtractions alone. Indeed:
	#
	#        ffff.0001.0000.0000.0000.ffff.ffff.ffff
	#      *                                    abcd
	#      + xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.abcd
	#
	# Now observing that ff..ff*x = (2^n-1)*x = 2^n*x-x, we
	# rewrite above as:
	#
	#        xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.abcd
	#      + abcd.0000.abcd.0000.0000.abcd.0000.0000.0000
	#      - abcd.0000.0000.0000.0000.0000.0000.abcd
	#
	# or marking redundant operations:
	#
	#        xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.----
	#      + abcd.0000.abcd.0000.0000.abcd.----.----.----
	#      - abcd.----.----.----.----.----.----.----

$code.=<<___;
	@ multiplication-less reduction $i
	adds	@acc[3],@acc[3],@acc[0]		@ r[3]+=r[0]
	ldr	$bj,[sp,#40]			@ restore b_ptr
	adcs	@acc[4],@acc[4],#0		@ r[4]+=0
	adcs	@acc[5],@acc[5],#0		@ r[5]+=0
	adcs	@acc[6],@acc[6],@acc[0]		@ r[6]+=r[0]
	ldr	$t1,[sp,#0]			@ load a[0]
	adcs	@acc[7],@acc[7],#0		@ r[7]+=0
	ldr	$bj,[$bj,#4*$i]			@ load b[i]
	adcs	@acc[8],@acc[8],@acc[0]		@ r[8]+=r[0]
	eor	$t0,$t0,$t0
	adc	$t3,$t3,#0			@ overflow bit
	subs	@acc[7],@acc[7],@acc[0]		@ r[7]-=r[0]
	ldr	$t2,[sp,#4]			@ a[1]
	sbcs	@acc[8],@acc[8],#0		@ r[8]-=0
	umlal	@acc[1],$t0,$t1,$bj		@ "r[0]"+=a[0]*b[i]
	eor	$t1,$t1,$t1
	sbc	@acc[0],$t3,#0			@ overflow bit, keep in mind
						@ that net result is
						@ addition of a value which
						@ makes underflow impossible

	ldr	$t3,[sp,#8]			@ a[2]
	umlal	@acc[2],$t1,$t2,$bj		@ "r[1]"+=a[1]*b[i]
	str	@acc[0],[sp,#36]		@ temporarily offload overflow
	eor	$t2,$t2,$t2
	ldr	$t4,[sp,#12]			@ a[3], $t4 is alias @acc[0]
	umlal	@acc[3],$t2,$t3,$bj		@ "r[2]"+=a[2]*b[i]
	eor	$t3,$t3,$t3
	adds	@acc[2],@acc[2],$t0		@ accumulate high part of mult
	ldr	$t0,[sp,#16]			@ a[4]
	umlal	@acc[4],$t3,$t4,$bj		@ "r[3]"+=a[3]*b[i]
	eor	$t4,$t4,$t4
	adcs	@acc[3],@acc[3],$t1
	ldr	$t1,[sp,#20]			@ a[5]
	umlal	@acc[5],$t4,$t0,$bj		@ "r[4]"+=a[4]*b[i]
	eor	$t0,$t0,$t0
	adcs	@acc[4],@acc[4],$t2
	ldr	$t2,[sp,#24]			@ a[6]
	umlal	@acc[6],$t0,$t1,$bj		@ "r[5]"+=a[5]*b[i]
	eor	$t1,$t1,$t1
	adcs	@acc[5],@acc[5],$t3
	ldr	$t3,[sp,#28]			@ a[7]
	umlal	@acc[7],$t1,$t2,$bj		@ "r[6]"+=a[6]*b[i]
	eor	$t2,$t2,$t2
	adcs	@acc[6],@acc[6],$t4
	ldr	@acc[0],[sp,#36]		@ restore overflow bit
	umlal	@acc[8],$t2,$t3,$bj		@ "r[7]"+=a[7]*b[i]
	eor	$t3,$t3,$t3
	adcs	@acc[7],@acc[7],$t0
	adcs	@acc[8],@acc[8],$t1
	adcs	@acc[0],$acc[0],$t2
	adc	$t3,$t3,#0			@ new overflow bit
___
	push(@acc,shift(@acc));			# rotate registers, so that
						# "r[i]" becomes r[i]
}
$code.=<<___;
	@ last multiplication-less reduction
	adds	@acc[3],@acc[3],@acc[0]
	ldr	$r_ptr,[sp,#32]			@ restore r_ptr
	adcs	@acc[4],@acc[4],#0
	adcs	@acc[5],@acc[5],#0
	adcs	@acc[6],@acc[6],@acc[0]
	adcs	@acc[7],@acc[7],#0
	adcs	@acc[8],@acc[8],@acc[0]
	adc	$t3,$t3,#0
	subs	@acc[7],@acc[7],@acc[0]
	sbcs	@acc[8],@acc[8],#0
	sbc	@acc[0],$t3,#0			@ overflow bit

	@ Final step is "if result > mod, subtract mod", but we do it
	@ "other way around", namely subtract modulus from result
	@ and if it borrowed, add modulus back.
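	@ (the word-by-word constants used in the subtraction below,
	@ -1,-1,-1,0,0,0,1,-1 least to most significant, are exactly
	@ the modulus words)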

	adds	@acc[1],@acc[1],#1	@ subs	@acc[1],@acc[1],#-1
	adcs	@acc[2],@acc[2],#0	@ sbcs	@acc[2],@acc[2],#-1
	adcs	@acc[3],@acc[3],#0	@ sbcs	@acc[3],@acc[3],#-1
	sbcs	@acc[4],@acc[4],#0
	sbcs	@acc[5],@acc[5],#0
	sbcs	@acc[6],@acc[6],#0
	sbcs	@acc[7],@acc[7],#1
	adcs	@acc[8],@acc[8],#0	@ sbcs	@acc[8],@acc[8],#-1
	ldr	lr,[sp,#44]		@ restore lr
	sbc	@acc[0],@acc[0],#0	@ broadcast borrow bit
	add	sp,sp,#48

	@ Note that because mod has special form, i.e. consists of
	@ 0xffffffff, 1 and 0s, we can conditionally synthesize it by
	@ broadcasting borrow bit to a register, @acc[0], and using it as
	@ a whole or extracting single bit.

	adds	@acc[1],@acc[1],@acc[0]	@ add modulus or zero
	adcs	@acc[2],@acc[2],@acc[0]
	str	@acc[1],[$r_ptr,#0]
	adcs	@acc[3],@acc[3],@acc[0]
	str	@acc[2],[$r_ptr,#4]
	adcs	@acc[4],@acc[4],#0
	str	@acc[3],[$r_ptr,#8]
	adcs	@acc[5],@acc[5],#0
	str	@acc[4],[$r_ptr,#12]
	adcs	@acc[6],@acc[6],#0
	str	@acc[5],[$r_ptr,#16]
	adcs	@acc[7],@acc[7],@acc[0],lsr#31
	str	@acc[6],[$r_ptr,#20]
	adc	@acc[8],@acc[8],@acc[0]
	str	@acc[7],[$r_ptr,#24]
	str	@acc[8],[$r_ptr,#28]

	mov	pc,lr
.size	__ecp_nistz256_mul_mont,.-__ecp_nistz256_mul_mont
___
}

{
my ($out,$inp,$index,$mask)=map("r$_",(0..3));
$code.=<<___;
@ void	ecp_nistz256_scatter_w5(void *r0,const P256_POINT *r1,
@					 int r2);
.globl	ecp_nistz256_scatter_w5
.type	ecp_nistz256_scatter_w5,%function
.align	5
ecp_nistz256_scatter_w5:
	stmdb	sp!,{r4-r11}

	add	$out,$out,$index,lsl#2

	ldmia	$inp!,{r4-r11}		@ X
	str	r4,[$out,#64*0-4]
	str	r5,[$out,#64*1-4]
	str	r6,[$out,#64*2-4]
	str	r7,[$out,#64*3-4]
	str	r8,[$out,#64*4-4]
	str	r9,[$out,#64*5-4]
	str	r10,[$out,#64*6-4]
	str	r11,[$out,#64*7-4]
	add	$out,$out,#64*8

	ldmia	$inp!,{r4-r11}		@ Y
	str	r4,[$out,#64*0-4]
	str	r5,[$out,#64*1-4]
	str	r6,[$out,#64*2-4]
	str	r7,[$out,#64*3-4]
	str	r8,[$out,#64*4-4]
	str	r9,[$out,#64*5-4]
	str	r10,[$out,#64*6-4]
	str	r11,[$out,#64*7-4]
	add	$out,$out,#64*8

	ldmia	$inp,{r4-r11}		@ Z
	str	r4,[$out,#64*0-4]
	str	r5,[$out,#64*1-4]
	str	r6,[$out,#64*2-4]
	str	r7,[$out,#64*3-4]
	str	r8,[$out,#64*4-4]
	str	r9,[$out,#64*5-4]
	str	r10,[$out,#64*6-4]
	str	r11,[$out,#64*7-4]

	ldmia	sp!,{r4-r11}
#if __ARM_ARCH__>=5 || defined(__thumb__)
	bx	lr
#else
	mov	pc,lr
#endif
.size	ecp_nistz256_scatter_w5,.-ecp_nistz256_scatter_w5

@ void	ecp_nistz256_gather_w5(P256_POINT *r0,const void *r1,
@					      int r2);
.globl	ecp_nistz256_gather_w5
.type	ecp_nistz256_gather_w5,%function
.align	5
ecp_nistz256_gather_w5:
	stmdb	sp!,{r4-r11}

	cmp	$index,#0
	mov	$mask,#0
#ifdef __thumb2__
	itt	ne
#endif
	subne	$index,$index,#1
	movne	$mask,#-1
	add	$inp,$inp,$index,lsl#2

	ldr	r4,[$inp,#64*0]
	ldr	r5,[$inp,#64*1]
	ldr	r6,[$inp,#64*2]
	and	r4,r4,$mask
	ldr	r7,[$inp,#64*3]
	and	r5,r5,$mask
	ldr	r8,[$inp,#64*4]
	and	r6,r6,$mask
	ldr	r9,[$inp,#64*5]
	and	r7,r7,$mask
	ldr	r10,[$inp,#64*6]
	and	r8,r8,$mask
	ldr	r11,[$inp,#64*7]
	add	$inp,$inp,#64*8
	and	r9,r9,$mask
	and	r10,r10,$mask
	and	r11,r11,$mask
	stmia	$out!,{r4-r11}		@ X

	ldr	r4,[$inp,#64*0]
	ldr	r5,[$inp,#64*1]
	ldr	r6,[$inp,#64*2]
	and	r4,r4,$mask
	ldr	r7,[$inp,#64*3]
	and	r5,r5,$mask
	ldr	r8,[$inp,#64*4]
	and	r6,r6,$mask
	ldr	r9,[$inp,#64*5]
	and	r7,r7,$mask
	ldr	r10,[$inp,#64*6]
	and	r8,r8,$mask
	ldr	r11,[$inp,#64*7]
	add	$inp,$inp,#64*8
	and	r9,r9,$mask
	and	r10,r10,$mask
	and	r11,r11,$mask
	stmia	$out!,{r4-r11}		@ Y

	ldr	r4,[$inp,#64*0]
	ldr	r5,[$inp,#64*1]
	ldr	r6,[$inp,#64*2]
	and	r4,r4,$mask
	ldr	r7,[$inp,#64*3]
	and	r5,r5,$mask
	ldr	r8,[$inp,#64*4]
	and	r6,r6,$mask
	ldr	r9,[$inp,#64*5]
	and	r7,r7,$mask
	ldr	r10,[$inp,#64*6]
	and	r8,r8,$mask
	ldr	r11,[$inp,#64*7]
	and	r9,r9,$mask
	and	r10,r10,$mask
	and	r11,r11,$mask
	stmia	$out,{r4-r11}		@ Z

	ldmia	sp!,{r4-r11}
#if __ARM_ARCH__>=5 || defined(__thumb__)
	bx	lr
#else
	mov	pc,lr
#endif
.size	ecp_nistz256_gather_w5,.-ecp_nistz256_gather_w5

@ void	ecp_nistz256_scatter_w7(void *r0,const P256_POINT_AFFINE *r1,
@					 int r2);
.globl	ecp_nistz256_scatter_w7
.type	ecp_nistz256_scatter_w7,%function
.align	5
ecp_nistz256_scatter_w7:
	add	$out,$out,$index
	mov	$index,#64/4
.Loop_scatter_w7:
	ldr	$mask,[$inp],#4
	subs	$index,$index,#1
	strb	$mask,[$out,#64*0]
	mov	$mask,$mask,lsr#8
	strb	$mask,[$out,#64*1]
	mov	$mask,$mask,lsr#8
	strb	$mask,[$out,#64*2]
	mov	$mask,$mask,lsr#8
	strb	$mask,[$out,#64*3]
	add	$out,$out,#64*4
	bne	.Loop_scatter_w7

#if __ARM_ARCH__>=5 || defined(__thumb__)
	bx	lr
#else
	mov	pc,lr
#endif
.size	ecp_nistz256_scatter_w7,.-ecp_nistz256_scatter_w7

@ void	ecp_nistz256_gather_w7(P256_POINT_AFFINE *r0,const void *r1,
@						      int r2);
.globl	ecp_nistz256_gather_w7
.type	ecp_nistz256_gather_w7,%function
.align	5
ecp_nistz256_gather_w7:
	stmdb	sp!,{r4-r7}

	cmp	$index,#0
	mov	$mask,#0
#ifdef __thumb2__
	itt	ne
#endif
	subne	$index,$index,#1
	movne	$mask,#-1
	add	$inp,$inp,$index
	mov	$index,#64/4
	nop
.Loop_gather_w7:
	ldrb	r4,[$inp,#64*0]
	subs	$index,$index,#1
	ldrb	r5,[$inp,#64*1]
	ldrb	r6,[$inp,#64*2]
	ldrb	r7,[$inp,#64*3]
	add	$inp,$inp,#64*4
	orr	r4,r4,r5,lsl#8
	orr	r4,r4,r6,lsl#16
	orr	r4,r4,r7,lsl#24
	and	r4,r4,$mask
	str	r4,[$out],#4
	bne	.Loop_gather_w7

	ldmia	sp!,{r4-r7}
#if __ARM_ARCH__>=5 || defined(__thumb__)
	bx	lr
#else
	mov	pc,lr
#endif
.size	ecp_nistz256_gather_w7,.-ecp_nistz256_gather_w7
___
}
if (0) {
# In comparison to integer-only equivalent of below subroutine:
#
# Cortex-A8	+10%
# Cortex-A9	-10%
# Snapdragon S4	+5%
#
# As not all time is spent in multiplication, overall impact is deemed
# too low to care about.

my ($A0,$A1,$A2,$A3,$Bi,$zero,$temp)=map("d$_",(0..7));
my $mask="q4";
my $mult="q5";
my @AxB=map("q$_",(8..15));

my ($rptr,$aptr,$bptr,$toutptr)=map("r$_",(0..3));

$code.=<<___;
#if __ARM_ARCH__>=7
.fpu	neon

.globl	ecp_nistz256_mul_mont_neon
.type	ecp_nistz256_mul_mont_neon,%function
.align	5
ecp_nistz256_mul_mont_neon:
	mov	ip,sp
	stmdb	sp!,{r4-r9}
	vstmdb	sp!,{q4-q5}		@ ABI specification says so

	sub	$toutptr,sp,#40
	vld1.32	{${Bi}[0]},[$bptr,:32]!
	veor	$zero,$zero,$zero
	vld1.32	{$A0-$A3}, [$aptr]		@ can't specify :32 :-(
	vzip.16	$Bi,$zero
	mov	sp,$toutptr			@ alloca
	vmov.i64 $mask,#0xffff

	vmull.u32 @AxB[0],$Bi,${A0}[0]
	vmull.u32 @AxB[1],$Bi,${A0}[1]
	vmull.u32 @AxB[2],$Bi,${A1}[0]
	vmull.u32 @AxB[3],$Bi,${A1}[1]
	vshr.u64 $temp,@AxB[0]#lo,#16
	vmull.u32 @AxB[4],$Bi,${A2}[0]
	vadd.u64 @AxB[0]#hi,@AxB[0]#hi,$temp
	vmull.u32 @AxB[5],$Bi,${A2}[1]
	vshr.u64 $temp,@AxB[0]#hi,#16		@ upper 32 bits of a[0]*b[0]
	vmull.u32 @AxB[6],$Bi,${A3}[0]
	vand.u64 @AxB[0],@AxB[0],$mask		@ lower 32 bits of a[0]*b[0]
	vmull.u32 @AxB[7],$Bi,${A3}[1]
___
for($i=1;$i<8;$i++) {
$code.=<<___;
	vld1.32	{${Bi}[0]},[$bptr,:32]!
	veor	$zero,$zero,$zero
	vadd.u64 @AxB[1]#lo,@AxB[1]#lo,$temp	@ reduction
	vshl.u64 $mult,@AxB[0],#32
	vadd.u64 @AxB[3],@AxB[3],@AxB[0]
	vsub.u64 $mult,$mult,@AxB[0]
	vzip.16	$Bi,$zero
	vadd.u64 @AxB[6],@AxB[6],@AxB[0]
	vadd.u64 @AxB[7],@AxB[7],$mult
___
	push(@AxB,shift(@AxB));
$code.=<<___;
	vmlal.u32 @AxB[0],$Bi,${A0}[0]
	vmlal.u32 @AxB[1],$Bi,${A0}[1]
	vmlal.u32 @AxB[2],$Bi,${A1}[0]
	vmlal.u32 @AxB[3],$Bi,${A1}[1]
	vshr.u64 $temp,@AxB[0]#lo,#16
	vmlal.u32 @AxB[4],$Bi,${A2}[0]
	vadd.u64 @AxB[0]#hi,@AxB[0]#hi,$temp
	vmlal.u32 @AxB[5],$Bi,${A2}[1]
	vshr.u64 $temp,@AxB[0]#hi,#16		@ upper 33 bits of a[0]*b[i]+t[0]
	vmlal.u32 @AxB[6],$Bi,${A3}[0]
	vand.u64 @AxB[0],@AxB[0],$mask		@ lower 32 bits of a[0]*b[0]
	vmull.u32 @AxB[7],$Bi,${A3}[1]
___
}
$code.=<<___;
	vadd.u64 @AxB[1]#lo,@AxB[1]#lo,$temp	@ last reduction
	vshl.u64 $mult,@AxB[0],#32
	vadd.u64 @AxB[3],@AxB[3],@AxB[0]
	vsub.u64 $mult,$mult,@AxB[0]
	vadd.u64 @AxB[6],@AxB[6],@AxB[0]
	vadd.u64 @AxB[7],@AxB[7],$mult

	vshr.u64 $temp,@AxB[1]#lo,#16		@ convert
	vadd.u64 @AxB[1]#hi,@AxB[1]#hi,$temp
	vshr.u64 $temp,@AxB[1]#hi,#16
	vzip.16	@AxB[1]#lo,@AxB[1]#hi
___
foreach (2..7) {
$code.=<<___;
	vadd.u64 @AxB[$_]#lo,@AxB[$_]#lo,$temp
	vst1.32	{@AxB[$_-1]#lo[0]},[$toutptr,:32]!
	vshr.u64 $temp,@AxB[$_]#lo,#16
	vadd.u64 @AxB[$_]#hi,@AxB[$_]#hi,$temp
	vshr.u64 $temp,@AxB[$_]#hi,#16
	vzip.16	@AxB[$_]#lo,@AxB[$_]#hi
___
}
$code.=<<___;
	vst1.32	{@AxB[7]#lo[0]},[$toutptr,:32]!
	vst1.32	{$temp},[$toutptr]		@ upper 33 bits

	ldr	r1,[sp,#0]
	ldr	r2,[sp,#4]
	ldr	r3,[sp,#8]
	subs	r1,r1,#-1
	ldr	r4,[sp,#12]
	sbcs	r2,r2,#-1
	ldr	r5,[sp,#16]
	sbcs	r3,r3,#-1
	ldr	r6,[sp,#20]
	sbcs	r4,r4,#0
	ldr	r7,[sp,#24]
	sbcs	r5,r5,#0
	ldr	r8,[sp,#28]
	sbcs	r6,r6,#0
	ldr	r9,[sp,#32]			@ top-most bit
	sbcs	r7,r7,#1
	sub	sp,ip,#40+16
	sbcs	r8,r8,#-1
	sbc	r9,r9,#0
	vldmia	sp!,{q4-q5}

	adds	r1,r1,r9
	adcs	r2,r2,r9
	str	r1,[$rptr,#0]
	adcs	r3,r3,r9
	str	r2,[$rptr,#4]
	adcs	r4,r4,#0
	str	r3,[$rptr,#8]
	adcs	r5,r5,#0
	str	r4,[$rptr,#12]
	adcs	r6,r6,#0
	str	r5,[$rptr,#16]
	adcs	r7,r7,r9,lsr#31
	str	r6,[$rptr,#20]
	adcs	r8,r8,r9
	str	r7,[$rptr,#24]
	str	r8,[$rptr,#28]

	ldmia	sp!,{r4-r9}
	bx	lr
.size	ecp_nistz256_mul_mont_neon,.-ecp_nistz256_mul_mont_neon
#endif
___
}

{{{
########################################################################
# Below $aN assignment matches order in which 256-bit result appears in
# register bank at return from __ecp_nistz256_mul_mont, so that we can
# skip over reloading it from memory. This means that below functions
# use custom calling sequence accepting 256-bit input in registers,
# output pointer in r0, $r_ptr, and optional pointer in r2, $b_ptr.
#
# See their "normal" counterparts for insights on calculations.

my ($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7,
    $t0,$t1,$t2,$t3)=map("r$_",(11,3..10,12,14,1));
my $ff=$b_ptr;

$code.=<<___;
.type	__ecp_nistz256_sub_from,%function
.align	5
__ecp_nistz256_sub_from:
	str	lr,[sp,#-4]!		@ push lr

	ldr	$t0,[$b_ptr,#0]
	ldr	$t1,[$b_ptr,#4]
	ldr	$t2,[$b_ptr,#8]
	ldr	$t3,[$b_ptr,#12]
	subs	$a0,$a0,$t0
	ldr	$t0,[$b_ptr,#16]
	sbcs	$a1,$a1,$t1
	ldr	$t1,[$b_ptr,#20]
	sbcs	$a2,$a2,$t2
	ldr	$t2,[$b_ptr,#24]
	sbcs	$a3,$a3,$t3
	ldr	$t3,[$b_ptr,#28]
	sbcs	$a4,$a4,$t0
	sbcs	$a5,$a5,$t1
	sbcs	$a6,$a6,$t2
	sbcs	$a7,$a7,$t3
	sbc	$ff,$ff,$ff		@ broadcast borrow bit
	ldr	lr,[sp],#4		@ pop lr

	adds	$a0,$a0,$ff		@ add synthesized modulus
	adcs	$a1,$a1,$ff
	str	$a0,[$r_ptr,#0]
	adcs	$a2,$a2,$ff
	str	$a1,[$r_ptr,#4]
	adcs	$a3,$a3,#0
	str	$a2,[$r_ptr,#8]
	adcs	$a4,$a4,#0
	str	$a3,[$r_ptr,#12]
	adcs	$a5,$a5,#0
	str	$a4,[$r_ptr,#16]
	adcs	$a6,$a6,$ff,lsr#31
	str	$a5,[$r_ptr,#20]
	adcs	$a7,$a7,$ff
	str	$a6,[$r_ptr,#24]
	str	$a7,[$r_ptr,#28]

	mov	pc,lr
.size	__ecp_nistz256_sub_from,.-__ecp_nistz256_sub_from

.type	__ecp_nistz256_sub_morf,%function
.align	5
__ecp_nistz256_sub_morf:
	str	lr,[sp,#-4]!		@ push lr

	ldr	$t0,[$b_ptr,#0]
	ldr	$t1,[$b_ptr,#4]
	ldr	$t2,[$b_ptr,#8]
	ldr	$t3,[$b_ptr,#12]
	subs	$a0,$t0,$a0
	ldr	$t0,[$b_ptr,#16]
	sbcs	$a1,$t1,$a1
	ldr	$t1,[$b_ptr,#20]
	sbcs	$a2,$t2,$a2
	ldr	$t2,[$b_ptr,#24]
	sbcs	$a3,$t3,$a3
	ldr	$t3,[$b_ptr,#28]
	sbcs	$a4,$t0,$a4
	sbcs	$a5,$t1,$a5
	sbcs	$a6,$t2,$a6
	sbcs	$a7,$t3,$a7
	sbc	$ff,$ff,$ff		@ broadcast borrow bit
	ldr	lr,[sp],#4		@ pop lr

	adds	$a0,$a0,$ff		@ add synthesized modulus
	adcs	$a1,$a1,$ff
	str	$a0,[$r_ptr,#0]
	adcs	$a2,$a2,$ff
	str	$a1,[$r_ptr,#4]
	adcs	$a3,$a3,#0
	str	$a2,[$r_ptr,#8]
	adcs	$a4,$a4,#0
	str	$a3,[$r_ptr,#12]
	adcs	$a5,$a5,#0
	str	$a4,[$r_ptr,#16]
	adcs	$a6,$a6,$ff,lsr#31
	str	$a5,[$r_ptr,#20]
	adcs	$a7,$a7,$ff
	str	$a6,[$r_ptr,#24]
	str	$a7,[$r_ptr,#28]

	mov	pc,lr
.size	__ecp_nistz256_sub_morf,.-__ecp_nistz256_sub_morf

.type	__ecp_nistz256_add_self,%function
.align	4
__ecp_nistz256_add_self:
	adds	$a0,$a0,$a0		@ a[0:7]+=a[0:7]
	adcs	$a1,$a1,$a1
	adcs	$a2,$a2,$a2
	adcs	$a3,$a3,$a3
	adcs	$a4,$a4,$a4
	adcs	$a5,$a5,$a5
	adcs	$a6,$a6,$a6
	mov	$ff,#0
	adcs	$a7,$a7,$a7
	adc	$ff,$ff,#0

	@ if a+b >= modulus, subtract modulus.
	@
	@ But since comparison implies subtraction, we subtract
	@ modulus and then add it back if subtraction borrowed.

	subs	$a0,$a0,#-1
	sbcs	$a1,$a1,#-1
	sbcs	$a2,$a2,#-1
	sbcs	$a3,$a3,#0
	sbcs	$a4,$a4,#0
	sbcs	$a5,$a5,#0
	sbcs	$a6,$a6,#1
	sbcs	$a7,$a7,#-1
	sbc	$ff,$ff,#0

	@ Note that because mod has special form, i.e. consists of
	@ 0xffffffff, 1 and 0s, we can conditionally synthesize it by
	@ using value of borrow as a whole or extracting single bit.
	@ Follow $ff register...

	adds	$a0,$a0,$ff		@ add synthesized modulus
	adcs	$a1,$a1,$ff
	str	$a0,[$r_ptr,#0]
	adcs	$a2,$a2,$ff
	str	$a1,[$r_ptr,#4]
	adcs	$a3,$a3,#0
	str	$a2,[$r_ptr,#8]
	adcs	$a4,$a4,#0
	str	$a3,[$r_ptr,#12]
	adcs	$a5,$a5,#0
	str	$a4,[$r_ptr,#16]
	adcs	$a6,$a6,$ff,lsr#31
	str	$a5,[$r_ptr,#20]
	adcs	$a7,$a7,$ff
	str	$a6,[$r_ptr,#24]
	str	$a7,[$r_ptr,#28]

	mov	pc,lr
.size	__ecp_nistz256_add_self,.-__ecp_nistz256_add_self

___

########################################################################
# following subroutines are "literal" implementation of those found in
# ecp_nistz256.c
#
########################################################################
# void ecp_nistz256_point_double(P256_POINT *out,const P256_POINT *inp);
#
{
my ($S,$M,$Zsqr,$in_x,$tmp0)=map(32*$_,(0..4));
# above map() describes stack layout with 5 temporary
# 256-bit vectors on top. Then note that we push
# starting from r0, which means that we have copy of
# input arguments just below these temporary vectors.
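#
# In other words, the call sequence below computes, in Jacobian
# coordinates with a=-3,
#	S = 4*X*Y^2, M = 3*(X-Z^2)*(X+Z^2),
#	res_x = M^2 - 2*S, res_y = M*(S - res_x) - 8*Y^4, res_z = 2*Y*Z,
# same as ecp_nistz256_point_double in ecp_nistz256.c.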

$code.=<<___;
.globl	ecp_nistz256_point_double
.type	ecp_nistz256_point_double,%function
.align	5
ecp_nistz256_point_double:
	stmdb	sp!,{r0-r12,lr}		@ push from r0, unusual, but intentional
	sub	sp,sp,#32*5

.Lpoint_double_shortcut:
	add	r3,sp,#$in_x
	ldmia	$a_ptr!,{r4-r11}	@ copy in_x
	stmia	r3,{r4-r11}

	add	$r_ptr,sp,#$S
	bl	__ecp_nistz256_mul_by_2	@ p256_mul_by_2(S, in_y);

	add	$b_ptr,$a_ptr,#32
	add	$a_ptr,$a_ptr,#32
	add	$r_ptr,sp,#$Zsqr
	bl	__ecp_nistz256_mul_mont	@ p256_sqr_mont(Zsqr, in_z);

	add	$a_ptr,sp,#$S
	add	$b_ptr,sp,#$S
	add	$r_ptr,sp,#$S
	bl	__ecp_nistz256_mul_mont	@ p256_sqr_mont(S, S);

	ldr	$b_ptr,[sp,#32*5+4]
	add	$a_ptr,$b_ptr,#32
	add	$b_ptr,$b_ptr,#64
	add	$r_ptr,sp,#$tmp0
	bl	__ecp_nistz256_mul_mont	@ p256_mul_mont(tmp0, in_z, in_y);

	ldr	$r_ptr,[sp,#32*5]
	add	$r_ptr,$r_ptr,#64
	bl	__ecp_nistz256_add_self	@ p256_mul_by_2(res_z, tmp0);

	add	$a_ptr,sp,#$in_x
	add	$b_ptr,sp,#$Zsqr
	add	$r_ptr,sp,#$M
	bl	__ecp_nistz256_add	@ p256_add(M, in_x, Zsqr);

	add	$a_ptr,sp,#$in_x
	add	$b_ptr,sp,#$Zsqr
	add	$r_ptr,sp,#$Zsqr
	bl	__ecp_nistz256_sub	@ p256_sub(Zsqr, in_x, Zsqr);

	add	$a_ptr,sp,#$S
	add	$b_ptr,sp,#$S
	add	$r_ptr,sp,#$tmp0
	bl	__ecp_nistz256_mul_mont	@ p256_sqr_mont(tmp0, S);

	add	$a_ptr,sp,#$Zsqr
	add	$b_ptr,sp,#$M
	add	$r_ptr,sp,#$M
	bl	__ecp_nistz256_mul_mont	@ p256_mul_mont(M, M, Zsqr);

	ldr	$r_ptr,[sp,#32*5]
	add	$a_ptr,sp,#$tmp0
	add	$r_ptr,$r_ptr,#32
	bl	__ecp_nistz256_div_by_2	@ p256_div_by_2(res_y, tmp0);

	add	$a_ptr,sp,#$M
	add	$r_ptr,sp,#$M
	bl	__ecp_nistz256_mul_by_3	@ p256_mul_by_3(M, M);

	add	$a_ptr,sp,#$in_x
	add	$b_ptr,sp,#$S
	add	$r_ptr,sp,#$S
	bl	__ecp_nistz256_mul_mont	@ p256_mul_mont(S, S, in_x);

	add	$r_ptr,sp,#$tmp0
	bl	__ecp_nistz256_add_self	@ p256_mul_by_2(tmp0, S);

	ldr	$r_ptr,[sp,#32*5]
	add	$a_ptr,sp,#$M
	add	$b_ptr,sp,#$M
	bl	__ecp_nistz256_mul_mont	@ p256_sqr_mont(res_x, M);

	add	$b_ptr,sp,#$tmp0
	bl	__ecp_nistz256_sub_from	@ p256_sub(res_x, res_x, tmp0);

	add	$b_ptr,sp,#$S
	add	$r_ptr,sp,#$S
	bl	__ecp_nistz256_sub_morf	@ p256_sub(S, S, res_x);

	add	$a_ptr,sp,#$M
	add	$b_ptr,sp,#$S
	bl	__ecp_nistz256_mul_mont	@ p256_mul_mont(S, S, M);

	ldr	$r_ptr,[sp,#32*5]
	add	$b_ptr,$r_ptr,#32
	add	$r_ptr,$r_ptr,#32
	bl	__ecp_nistz256_sub_from	@ p256_sub(res_y, S, res_y);

	add	sp,sp,#32*5+16		@ +16 means "skip even over saved r0-r3"
#if __ARM_ARCH__>=5 || !defined(__thumb__)
	ldmia	sp!,{r4-r12,pc}
#else
	ldmia	sp!,{r4-r12,lr}
	bx	lr			@ interoperable with Thumb ISA:-)
#endif
.size	ecp_nistz256_point_double,.-ecp_nistz256_point_double
___
}

########################################################################
# void ecp_nistz256_point_add(P256_POINT *out,const P256_POINT *in1,
#			      const P256_POINT *in2);
{
my ($res_x,$res_y,$res_z,
    $in1_x,$in1_y,$in1_z,
    $in2_x,$in2_y,$in2_z,
    $H,$Hsqr,$R,$Rsqr,$Hcub,
    $U1,$U2,$S1,$S2)=map(32*$_,(0..17));
my ($Z1sqr, $Z2sqr) = ($Hsqr, $Rsqr);
# above map() describes stack layout with 18 temporary
# 256-bit vectors on top. Then note that we push
# starting from r0, which means that we have copy of
# input arguments just below these temporary vectors.
# We use three of them for !in1infty, !in2infty and
# result of check for zero.

$code.=<<___;
.globl	ecp_nistz256_point_add
.type	ecp_nistz256_point_add,%function
.align	5
ecp_nistz256_point_add:
	stmdb	sp!,{r0-r12,lr}		@ push from r0, unusual, but intentional
	sub	sp,sp,#32*18+16

	ldmia	$b_ptr!,{r4-r11}	@ copy in2_x
	add	r3,sp,#$in2_x
	stmia	r3!,{r4-r11}
	ldmia	$b_ptr!,{r4-r11}	@ copy in2_y
	stmia	r3!,{r4-r11}
	ldmia	$b_ptr,{r4-r11}		@ copy in2_z
	orr	r12,r4,r5
	orr	r12,r12,r6
	orr	r12,r12,r7
	orr	r12,r12,r8
	orr	r12,r12,r9
	orr	r12,r12,r10
	orr	r12,r12,r11
	cmp	r12,#0
#ifdef __thumb2__
	it	ne
#endif
	movne	r12,#-1
	stmia	r3,{r4-r11}
	str	r12,[sp,#32*18+8]	@ !in2infty

	ldmia	$a_ptr!,{r4-r11}	@ copy in1_x
	add	r3,sp,#$in1_x
	stmia	r3!,{r4-r11}
	ldmia	$a_ptr!,{r4-r11}	@ copy in1_y
	stmia	r3!,{r4-r11}
	ldmia	$a_ptr,{r4-r11}		@ copy in1_z
	orr	r12,r4,r5
	orr	r12,r12,r6
	orr	r12,r12,r7
	orr	r12,r12,r8
	orr	r12,r12,r9
	orr	r12,r12,r10
	orr	r12,r12,r11
	cmp	r12,#0
#ifdef __thumb2__
	it	ne
#endif
	movne	r12,#-1
	stmia	r3,{r4-r11}
	str	r12,[sp,#32*18+4]	@ !in1infty

	add	$a_ptr,sp,#$in2_z
	add	$b_ptr,sp,#$in2_z
	add	$r_ptr,sp,#$Z2sqr
	bl	__ecp_nistz256_mul_mont	@ p256_sqr_mont(Z2sqr, in2_z);

	add	$a_ptr,sp,#$in1_z
	add	$b_ptr,sp,#$in1_z
	add	$r_ptr,sp,#$Z1sqr
	bl	__ecp_nistz256_mul_mont	@ p256_sqr_mont(Z1sqr, in1_z);

	add	$a_ptr,sp,#$in2_z
	add	$b_ptr,sp,#$Z2sqr
	add	$r_ptr,sp,#$S1
	bl	__ecp_nistz256_mul_mont	@ p256_mul_mont(S1, Z2sqr, in2_z);

	add	$a_ptr,sp,#$in1_z
	add	$b_ptr,sp,#$Z1sqr
	add	$r_ptr,sp,#$S2
	bl	__ecp_nistz256_mul_mont	@ p256_mul_mont(S2, Z1sqr, in1_z);

	add	$a_ptr,sp,#$in1_y
	add	$b_ptr,sp,#$S1
	add	$r_ptr,sp,#$S1
	bl	__ecp_nistz256_mul_mont	@ p256_mul_mont(S1, S1, in1_y);

	add	$a_ptr,sp,#$in2_y
	add	$b_ptr,sp,#$S2
	add	$r_ptr,sp,#$S2
	bl	__ecp_nistz256_mul_mont	@ p256_mul_mont(S2, S2, in2_y);

	add	$b_ptr,sp,#$S1
	add	$r_ptr,sp,#$R
	bl	__ecp_nistz256_sub_from	@ p256_sub(R, S2, S1);

	orr	$a0,$a0,$a1	@ see if result is zero
	orr	$a2,$a2,$a3
	orr	$a4,$a4,$a5
	orr	$a0,$a0,$a2
	orr	$a4,$a4,$a6
	orr	$a0,$a0,$a7
	add	$a_ptr,sp,#$in1_x
	orr	$a0,$a0,$a4
	add	$b_ptr,sp,#$Z2sqr
	str	$a0,[sp,#32*18+12]

	add	$r_ptr,sp,#$U1
	bl	__ecp_nistz256_mul_mont	@ p256_mul_mont(U1, in1_x, Z2sqr);

	add	$a_ptr,sp,#$in2_x
	add	$b_ptr,sp,#$Z1sqr
	add	$r_ptr,sp,#$U2
	bl	__ecp_nistz256_mul_mont	@ p256_mul_mont(U2, in2_x, Z1sqr);

	add	$b_ptr,sp,#$U1
	add	$r_ptr,sp,#$H
	bl	__ecp_nistz256_sub_from	@ p256_sub(H, U2, U1);

	orr	$a0,$a0,$a1	@ see if result is zero
	orr	$a2,$a2,$a3
	orr	$a4,$a4,$a5
	orr	$a0,$a0,$a2
	orr	$a4,$a4,$a6
	orr	$a0,$a0,$a7
	orrs	$a0,$a0,$a4

	bne	.Ladd_proceed	@ is_equal(U1,U2)?

	ldr	$t0,[sp,#32*18+4]
	ldr	$t1,[sp,#32*18+8]
	ldr	$t2,[sp,#32*18+12]
	tst	$t0,$t1
	beq	.Ladd_proceed	@ (in1infty || in2infty)?
	tst	$t2,$t2
	beq	.Ladd_double	@ is_equal(S1,S2)?

	ldr	$r_ptr,[sp,#32*18+16]
	eor	r4,r4,r4
	eor	r5,r5,r5
	eor	r6,r6,r6
	eor	r7,r7,r7
	eor	r8,r8,r8
	eor	r9,r9,r9
	eor	r10,r10,r10
	eor	r11,r11,r11
	stmia	$r_ptr!,{r4-r11}
	stmia	$r_ptr!,{r4-r11}
	stmia	$r_ptr!,{r4-r11}
	b	.Ladd_done

.align	4
.Ladd_double:
	ldr	$a_ptr,[sp,#32*18+20]
	add	sp,sp,#32*(18-5)+16	@ difference in frame sizes
	b	.Lpoint_double_shortcut

.align	4
.Ladd_proceed:
	add	$a_ptr,sp,#$R
	add	$b_ptr,sp,#$R
	add	$r_ptr,sp,#$Rsqr
	bl	__ecp_nistz256_mul_mont	@ p256_sqr_mont(Rsqr, R);

	add	$a_ptr,sp,#$H
	add	$b_ptr,sp,#$in1_z
	add	$r_ptr,sp,#$res_z
	bl	__ecp_nistz256_mul_mont	@ p256_mul_mont(res_z, H, in1_z);

	add	$a_ptr,sp,#$H
	add	$b_ptr,sp,#$H
	add	$r_ptr,sp,#$Hsqr
	bl	__ecp_nistz256_mul_mont	@ p256_sqr_mont(Hsqr, H);

	add	$a_ptr,sp,#$in2_z
	add	$b_ptr,sp,#$res_z
	add	$r_ptr,sp,#$res_z
	bl	__ecp_nistz256_mul_mont	@ p256_mul_mont(res_z, res_z, in2_z);

	add	$a_ptr,sp,#$H
	add	$b_ptr,sp,#$Hsqr
	add	$r_ptr,sp,#$Hcub
	bl	__ecp_nistz256_mul_mont	@ p256_mul_mont(Hcub, Hsqr, H);

	add	$a_ptr,sp,#$Hsqr
	add	$b_ptr,sp,#$U1
	add	$r_ptr,sp,#$U2
	bl	__ecp_nistz256_mul_mont	@ p256_mul_mont(U2, U1, Hsqr);

	add	$r_ptr,sp,#$Hsqr
	bl	__ecp_nistz256_add_self	@ p256_mul_by_2(Hsqr, U2);

	add	$b_ptr,sp,#$Rsqr
	add	$r_ptr,sp,#$res_x
	bl	__ecp_nistz256_sub_morf	@ p256_sub(res_x, Rsqr, Hsqr);

	add	$b_ptr,sp,#$Hcub
	bl	__ecp_nistz256_sub_from	@ p256_sub(res_x, res_x, Hcub);

	add	$b_ptr,sp,#$U2
	add	$r_ptr,sp,#$res_y
	bl	__ecp_nistz256_sub_morf	@ p256_sub(res_y, U2, res_x);

	add	$a_ptr,sp,#$Hcub
	add	$b_ptr,sp,#$S1
	add	$r_ptr,sp,#$S2
	bl	__ecp_nistz256_mul_mont	@ p256_mul_mont(S2, S1, Hcub);

	add	$a_ptr,sp,#$R
	add	$b_ptr,sp,#$res_y
	add	$r_ptr,sp,#$res_y
	bl	__ecp_nistz256_mul_mont	@ p256_mul_mont(res_y, res_y, R);

	add	$b_ptr,sp,#$S2
	bl	__ecp_nistz256_sub_from	@ p256_sub(res_y, res_y, S2);

	ldr	r11,[sp,#32*18+4]	@ !in1infty
	ldr	r12,[sp,#32*18+8]	@ !in2infty
	add	r1,sp,#$res_x
	add	r2,sp,#$in2_x
	and	r10,r11,r12
	mvn	r11,r11
	add	r3,sp,#$in1_x
	and	r11,r11,r12
	mvn	r12,r12
	ldr	$r_ptr,[sp,#32*18+16]
___
for($i=0;$i<96;$i+=8) {			# conditional moves
$code.=<<___;
	ldmia	r1!,{r4-r5}		@ res_x
	ldmia	r2!,{r6-r7}		@ in2_x
	ldmia	r3!,{r8-r9}		@ in1_x
	and	r4,r4,r10
	and	r5,r5,r10
	and	r6,r6,r11
	and	r7,r7,r11
	and	r8,r8,r12
	and	r9,r9,r12
	orr	r4,r4,r6
	orr	r5,r5,r7
	orr	r4,r4,r8
	orr	r5,r5,r9
	stmia	$r_ptr!,{r4-r5}
___
}
$code.=<<___;
.Ladd_done:
	add	sp,sp,#32*18+16+16	@ +16 means "skip even over saved r0-r3"
#if __ARM_ARCH__>=5 || !defined(__thumb__)
	ldmia	sp!,{r4-r12,pc}
#else
	ldmia	sp!,{r4-r12,lr}
	bx	lr			@ interoperable with Thumb ISA:-)
#endif
.size	ecp_nistz256_point_add,.-ecp_nistz256_point_add
___
}

########################################################################
# void ecp_nistz256_point_add_affine(P256_POINT *out,const P256_POINT *in1,
#				     const P256_POINT_AFFINE *in2);
{
my ($res_x,$res_y,$res_z,
    $in1_x,$in1_y,$in1_z,
    $in2_x,$in2_y,
    $U2,$S2,$H,$R,$Hsqr,$Hcub,$Rsqr)=map(32*$_,(0..14));
my $Z1sqr = $S2;
# above map() describes stack layout with 15 temporary
# 256-bit vectors on top. Then note that we push
# starting from r0, which means that we have copy of
# input arguments just below these temporary vectors.
# We use two of them for !in1infty, !in2infty.

my @ONE_mont=(1,0,0,-1,-1,-1,-2,0);

$code.=<<___;
.globl	ecp_nistz256_point_add_affine
.type	ecp_nistz256_point_add_affine,%function
.align	5
ecp_nistz256_point_add_affine:
	stmdb	sp!,{r0-r12,lr}		@ push from r0, unusual, but intentional
	sub	sp,sp,#32*15

	ldmia	$a_ptr!,{r4-r11}	@ copy in1_x
	add	r3,sp,#$in1_x
	stmia	r3!,{r4-r11}
	ldmia	$a_ptr!,{r4-r11}	@ copy in1_y
	stmia	r3!,{r4-r11}
	ldmia	$a_ptr,{r4-r11}		@ copy in1_z
	orr	r12,r4,r5
	orr	r12,r12,r6
	orr	r12,r12,r7
	orr	r12,r12,r8
	orr	r12,r12,r9
	orr	r12,r12,r10
	orr	r12,r12,r11
	cmp	r12,#0
#ifdef __thumb2__
	it	ne
#endif
	movne	r12,#-1
	stmia	r3,{r4-r11}
	str	r12,[sp,#32*15+4]	@ !in1infty

	ldmia	$b_ptr!,{r4-r11}	@ copy in2_x
	add	r3,sp,#$in2_x
	orr	r12,r4,r5
	orr	r12,r12,r6
	orr	r12,r12,r7
	orr	r12,r12,r8
	orr	r12,r12,r9
	orr	r12,r12,r10
	orr	r12,r12,r11
	stmia	r3!,{r4-r11}
	ldmia	$b_ptr!,{r4-r11}	@ copy in2_y
	orr	r12,r12,r4
	orr	r12,r12,r5
	orr	r12,r12,r6
	orr	r12,r12,r7
	orr	r12,r12,r8
	orr	r12,r12,r9
	orr	r12,r12,r10
	orr	r12,r12,r11
	stmia	r3!,{r4-r11}
	cmp	r12,#0
#ifdef __thumb2__
	it	ne
#endif
	movne	r12,#-1
	str	r12,[sp,#32*15+8]	@ !in2infty

	add	$a_ptr,sp,#$in1_z
	add	$b_ptr,sp,#$in1_z
	add	$r_ptr,sp,#$Z1sqr
	bl	__ecp_nistz256_mul_mont	@ p256_sqr_mont(Z1sqr, in1_z);

	add	$a_ptr,sp,#$Z1sqr
	add	$b_ptr,sp,#$in2_x
	add	$r_ptr,sp,#$U2
	bl	__ecp_nistz256_mul_mont	@ p256_mul_mont(U2, Z1sqr, in2_x);

	add	$b_ptr,sp,#$in1_x
	add	$r_ptr,sp,#$H
	bl	__ecp_nistz256_sub_from	@ p256_sub(H, U2, in1_x);

	add	$a_ptr,sp,#$Z1sqr
	add	$b_ptr,sp,#$in1_z
	add	$r_ptr,sp,#$S2
	bl	__ecp_nistz256_mul_mont	@ p256_mul_mont(S2, Z1sqr, in1_z);

	add	$a_ptr,sp,#$H
	add	$b_ptr,sp,#$in1_z
	add	$r_ptr,sp,#$res_z
	bl	__ecp_nistz256_mul_mont	@ p256_mul_mont(res_z, H, in1_z);

	add	$a_ptr,sp,#$in2_y
	add	$b_ptr,sp,#$S2
	add	$r_ptr,sp,#$S2
	bl	__ecp_nistz256_mul_mont	@ p256_mul_mont(S2, S2, in2_y);

	add	$b_ptr,sp,#$in1_y
	add	$r_ptr,sp,#$R
	bl	__ecp_nistz256_sub_from	@ p256_sub(R, S2, in1_y);

	add	$a_ptr,sp,#$H
	add	$b_ptr,sp,#$H
	add	$r_ptr,sp,#$Hsqr
	bl	__ecp_nistz256_mul_mont	@ p256_sqr_mont(Hsqr, H);

	add	$a_ptr,sp,#$R
	add	$b_ptr,sp,#$R
	add	$r_ptr,sp,#$Rsqr
	bl	__ecp_nistz256_mul_mont	@ p256_sqr_mont(Rsqr, R);

	add	$a_ptr,sp,#$H
	add	$b_ptr,sp,#$Hsqr
	add	$r_ptr,sp,#$Hcub
	bl	__ecp_nistz256_mul_mont	@ p256_mul_mont(Hcub, Hsqr, H);

	add	$a_ptr,sp,#$Hsqr
	add	$b_ptr,sp,#$in1_x
	add	$r_ptr,sp,#$U2
	bl	__ecp_nistz256_mul_mont	@ p256_mul_mont(U2, in1_x, Hsqr);

	add	$r_ptr,sp,#$Hsqr
	bl	__ecp_nistz256_add_self	@ p256_mul_by_2(Hsqr, U2);

	add	$b_ptr,sp,#$Rsqr
	add	$r_ptr,sp,#$res_x
	bl	__ecp_nistz256_sub_morf	@ p256_sub(res_x, Rsqr, Hsqr);

	add	$b_ptr,sp,#$Hcub
	bl	__ecp_nistz256_sub_from	@ p256_sub(res_x, res_x, Hcub);

	add	$b_ptr,sp,#$U2
	add	$r_ptr,sp,#$res_y
	bl	__ecp_nistz256_sub_morf	@ p256_sub(res_y, U2, res_x);

	add	$a_ptr,sp,#$Hcub
	add	$b_ptr,sp,#$in1_y
	add	$r_ptr,sp,#$S2
	bl	__ecp_nistz256_mul_mont	@ p256_mul_mont(S2, in1_y, Hcub);

	add	$a_ptr,sp,#$R
	add	$b_ptr,sp,#$res_y
	add	$r_ptr,sp,#$res_y
	bl	__ecp_nistz256_mul_mont	@ p256_mul_mont(res_y, res_y, R);

	add	$b_ptr,sp,#$S2
	bl	__ecp_nistz256_sub_from	@ p256_sub(res_y, res_y, S2);

	ldr	r11,[sp,#32*15+4]	@ !in1infty
	ldr	r12,[sp,#32*15+8]	@ !in2infty
	add	r1,sp,#$res_x
	add	r2,sp,#$in2_x
	and	r10,r11,r12
	mvn	r11,r11
	add	r3,sp,#$in1_x
	and	r11,r11,r12
	mvn	r12,r12
	ldr	$r_ptr,[sp,#32*15]
___
for($i=0;$i<64;$i+=8) {			# conditional moves
$code.=<<___;
	ldmia	r1!,{r4-r5}		@ res_x
	ldmia	r2!,{r6-r7}		@ in2_x
	ldmia	r3!,{r8-r9}		@ in1_x
	and	r4,r4,r10
	and	r5,r5,r10
	and	r6,r6,r11
	and	r7,r7,r11
	and	r8,r8,r12
	and	r9,r9,r12
	orr	r4,r4,r6
	orr	r5,r5,r7
	orr	r4,r4,r8
	orr	r5,r5,r9
	stmia	$r_ptr!,{r4-r5}
___
}
for(;$i<96;$i+=8) {
my $j=($i-64)/4;
$code.=<<___;
	ldmia	r1!,{r4-r5}		@ res_z
	ldmia	r3!,{r8-r9}		@ in1_z
	and	r4,r4,r10
	and	r5,r5,r10
	and	r6,r11,#@ONE_mont[$j]
	and	r7,r11,#@ONE_mont[$j+1]
	and	r8,r8,r12
	and	r9,r9,r12
	orr	r4,r4,r6
	orr	r5,r5,r7
	orr	r4,r4,r8
	orr	r5,r5,r9
	stmia	$r_ptr!,{r4-r5}
___
}
$code.=<<___;
	add	sp,sp,#32*15+16		@ +16 means "skip even over saved r0-r3"
#if __ARM_ARCH__>=5 || !defined(__thumb__)
	ldmia	sp!,{r4-r12,pc}
#else
	ldmia	sp!,{r4-r12,lr}
	bx	lr			@ interoperable with Thumb ISA:-)
#endif
.size	ecp_nistz256_point_add_affine,.-ecp_nistz256_point_add_affine
___
}	}}}

foreach (split("\n",$code)) {
	s/\`([^\`]*)\`/eval $1/geo;

	s/\bq([0-9]+)#(lo|hi)/sprintf "d%d",2*$1+($2 eq "hi")/geo;

	print $_,"\n";
}
close STDOUT;	# enforce flush