#! /usr/bin/env perl
# Copyright 2015-2020 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License").  You may not use
# this file except in compliance with the License.  You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html


# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# ECP_NISTZ256 module for ARMv4.
#
# October 2014.
#
# Original ECP_NISTZ256 submission targeting x86_64 is detailed in
# http://eprint.iacr.org/2013/816. In the process of adaptation
# original .c module was made 32-bit savvy in order to make this
# implementation possible.
#
#			with/without -DECP_NISTZ256_ASM
# Cortex-A8		+53-170%
# Cortex-A9		+76-205%
# Cortex-A15		+100-316%
# Snapdragon S4		+66-187%
#
# Ranges denote minimum and maximum improvement coefficients depending
# on benchmark. Lower coefficients are for ECDSA sign, server-side
# operation. Keep in mind that +200% means 3x improvement.

# $output is the last argument if it looks like a file (it has an extension)
# $flavour is the first argument if it doesn't look like a file
$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;

# With a real flavour the generated text is piped through arm-xlate.pl,
# which is looked up next to this script and then in ../../perlasm/.
# Otherwise the text goes straight to $output (or to the inherited
# STDOUT when no output file was given).
if ($flavour && $flavour ne "void") {
    $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
    ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
    ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
    die "can't locate arm-xlate.pl";

    open STDOUT,"| \"$^X\" $xlate $flavour \"$output\""
        or die "can't call $xlate: $!";
} else {
    # Fail loudly instead of silently writing to the inherited STDOUT
    # when the requested output file cannot be created.
    if ($output) {
        open STDOUT,'>',$output
            or die "can't open $output: $!";
    }
}

$code.=<<___;
#include "arm_arch.h"

#if defined(__thumb2__)
.syntax	unified
.thumb
#else
.code	32
#endif
___
########################################################################
# Convert ecp_nistz256_table.c to layout expected by ecp_nistz_gather_w7
#
# The table is looked up in the current directory first and then one
# level above the directory this script lives in.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
my $table;
open($table,'<',"ecp_nistz256_table.c") or
open($table,'<',"${dir}../ecp_nistz256_table.c") or
die "failed to open ecp_nistz256_table.c:",$!;

use integer;

# Every TOBN(hi,lo) pair found in the table contributes two elements to
# @arr, low word first.
while (my $line = <$table>) {
	while ($line =~ /TOBN\(\s*(0x[0-9a-f]+),\s*(0x[0-9a-f]+)\s*\)/g) {
		push @arr,hex($2),hex($1);
	}
}
close $table;

# See ecp_nistz256_table.c for explanation for why it's 64*16*37.
# 64*16*37-1 is because $#arr returns last valid index of @arr, not
# amount of elements.
die "insane number of elements" if ($#arr != 64*16*37-1);

$code.=<<___;
.rodata
.globl	ecp_nistz256_precomputed
.type	ecp_nistz256_precomputed,%object
.align	12
ecp_nistz256_precomputed:
___
########################################################################
# this conversion smashes P256_POINT_AFFINE by individual bytes with
# 64 byte interval, similar to
#	1111222233334444
#	1234123412341234
#
# Each of the 37 sub-tables holds 64 entries of 16 words.  Output row $i
# carries byte $i (byte $i%4 of word $i/4) of each of the 64 entries.
for(1..37) {
	@tbl = splice(@arr,0,64*16);
	for($i=0;$i<64;$i++) {
		undef @line;
		for($j=0;$j<64;$j++) {
			push @line,($tbl[$j*16+$i/4]>>(($i%4)*8))&0xff;
		}
		$code.=".byte\t";
		$code.=join(',',map { sprintf "0x%02x",$_} @line);
		$code.="\n";
	}
}
$code.=<<___;
.size	ecp_nistz256_precomputed,.-ecp_nistz256_precomputed

.text
.align	5
.LRR:	@ 2^512 mod P precomputed for NIST P256 polynomial
.long	0x00000003, 0x00000000, 0xffffffff, 0xfffffffb
.long	0xfffffffe, 0xffffffff, 0xfffffffd, 0x00000004
.Lone:
.long	1,0,0,0,0,0,0,0
.asciz	"ECP_NISTZ256 for ARMv4, CRYPTOGAMS by <appro\@openssl.org>"
.align	6
___

########################################################################
# common register layout, note that $t2 is link register, so that if
# internal subroutine uses $t2, then it has to offload lr...

($r_ptr,$a_ptr,$b_ptr,$ff,$a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7,$t1,$t2)=
		map("r$_",(0..12,14));
($t0,$t3)=($ff,$a_ptr);

########################################################################
# Simple modular helpers.  Each public entry point below saves r4-r12
# and lr, calls the "__"-prefixed routine of the same name and returns;
# to_mont and from_mont only load the second operand and branch into
# the mul_mont entry point.  The "__" routines that add or double end
# in .Lreduce_by_sub, those that subtract or negate in .Lreduce_by_add.
$code.=<<___;
@ void	ecp_nistz256_to_mont(BN_ULONG r0[8],const BN_ULONG r1[8]);
.globl	ecp_nistz256_to_mont
.type	ecp_nistz256_to_mont,%function
ecp_nistz256_to_mont:
	adr	$b_ptr,.LRR
	b	.Lecp_nistz256_mul_mont
.size	ecp_nistz256_to_mont,.-ecp_nistz256_to_mont

@ void	ecp_nistz256_from_mont(BN_ULONG r0[8],const BN_ULONG r1[8]);
.globl	ecp_nistz256_from_mont
.type	ecp_nistz256_from_mont,%function
ecp_nistz256_from_mont:
	adr	$b_ptr,.Lone
	b	.Lecp_nistz256_mul_mont
.size	ecp_nistz256_from_mont,.-ecp_nistz256_from_mont

@ void	ecp_nistz256_mul_by_2(BN_ULONG r0[8],const BN_ULONG r1[8]);
.globl	ecp_nistz256_mul_by_2
.type	ecp_nistz256_mul_by_2,%function
.align	4
ecp_nistz256_mul_by_2:
	stmdb	sp!,{r4-r12,lr}
	bl	__ecp_nistz256_mul_by_2
#if __ARM_ARCH__>=5 || !defined(__thumb__)
	ldmia	sp!,{r4-r12,pc}
#else
	ldmia	sp!,{r4-r12,lr}
	bx	lr			@ interoperable with Thumb ISA:-)
#endif
.size	ecp_nistz256_mul_by_2,.-ecp_nistz256_mul_by_2

.type	__ecp_nistz256_mul_by_2,%function
.align	4
__ecp_nistz256_mul_by_2:
	ldr	$a0,[$a_ptr,#0]
	ldr	$a1,[$a_ptr,#4]
	ldr	$a2,[$a_ptr,#8]
	adds	$a0,$a0,$a0		@ a[0:7]+=a[0:7], i.e. add with itself
	ldr	$a3,[$a_ptr,#12]
	adcs	$a1,$a1,$a1
	ldr	$a4,[$a_ptr,#16]
	adcs	$a2,$a2,$a2
	ldr	$a5,[$a_ptr,#20]
	adcs	$a3,$a3,$a3
	ldr	$a6,[$a_ptr,#24]
	adcs	$a4,$a4,$a4
	ldr	$a7,[$a_ptr,#28]
	adcs	$a5,$a5,$a5
	adcs	$a6,$a6,$a6
	mov	$ff,#0
	adcs	$a7,$a7,$a7
	adc	$ff,$ff,#0

	b	.Lreduce_by_sub
.size	__ecp_nistz256_mul_by_2,.-__ecp_nistz256_mul_by_2

@ void	ecp_nistz256_add(BN_ULONG r0[8],const BN_ULONG r1[8],
@					const BN_ULONG r2[8]);
.globl	ecp_nistz256_add
.type	ecp_nistz256_add,%function
.align	4
ecp_nistz256_add:
	stmdb	sp!,{r4-r12,lr}
	bl	__ecp_nistz256_add
#if __ARM_ARCH__>=5 || !defined(__thumb__)
	ldmia	sp!,{r4-r12,pc}
#else
	ldmia	sp!,{r4-r12,lr}
	bx	lr			@ interoperable with Thumb ISA:-)
#endif
.size	ecp_nistz256_add,.-ecp_nistz256_add

.type	__ecp_nistz256_add,%function
.align	4
__ecp_nistz256_add:
	str	lr,[sp,#-4]!		@ push lr

	ldr	$a0,[$a_ptr,#0]
	ldr	$a1,[$a_ptr,#4]
	ldr	$a2,[$a_ptr,#8]
	ldr	$a3,[$a_ptr,#12]
	ldr	$a4,[$a_ptr,#16]
	 ldr	$t0,[$b_ptr,#0]
	ldr	$a5,[$a_ptr,#20]
	 ldr	$t1,[$b_ptr,#4]
	ldr	$a6,[$a_ptr,#24]
	 ldr	$t2,[$b_ptr,#8]
	ldr	$a7,[$a_ptr,#28]
	 ldr	$t3,[$b_ptr,#12]
	adds	$a0,$a0,$t0
	 ldr	$t0,[$b_ptr,#16]
	adcs	$a1,$a1,$t1
	 ldr	$t1,[$b_ptr,#20]
	adcs	$a2,$a2,$t2
	 ldr	$t2,[$b_ptr,#24]
	adcs	$a3,$a3,$t3
	 ldr	$t3,[$b_ptr,#28]
	adcs	$a4,$a4,$t0
	adcs	$a5,$a5,$t1
	adcs	$a6,$a6,$t2
	mov	$ff,#0
	adcs	$a7,$a7,$t3
	adc	$ff,$ff,#0
	ldr	lr,[sp],#4		@ pop lr

.Lreduce_by_sub:

	@ if a+b >= modulus, subtract modulus.
	@
	@ But since comparison implies subtraction, we subtract
	@ modulus and then add it back if subtraction borrowed.

	subs	$a0,$a0,#-1
	sbcs	$a1,$a1,#-1
	sbcs	$a2,$a2,#-1
	sbcs	$a3,$a3,#0
	sbcs	$a4,$a4,#0
	sbcs	$a5,$a5,#0
	sbcs	$a6,$a6,#1
	sbcs	$a7,$a7,#-1
	sbc	$ff,$ff,#0

	@ Note that because mod has special form, i.e. consists of
	@ 0xffffffff, 1 and 0s, we can conditionally synthesize it by
	@ using value of borrow as a whole or extracting single bit.
	@ Follow $ff register...

	adds	$a0,$a0,$ff		@ add synthesized modulus
	adcs	$a1,$a1,$ff
	str	$a0,[$r_ptr,#0]
	adcs	$a2,$a2,$ff
	str	$a1,[$r_ptr,#4]
	adcs	$a3,$a3,#0
	str	$a2,[$r_ptr,#8]
	adcs	$a4,$a4,#0
	str	$a3,[$r_ptr,#12]
	adcs	$a5,$a5,#0
	str	$a4,[$r_ptr,#16]
	adcs	$a6,$a6,$ff,lsr#31
	str	$a5,[$r_ptr,#20]
	adcs	$a7,$a7,$ff
	str	$a6,[$r_ptr,#24]
	str	$a7,[$r_ptr,#28]

	mov	pc,lr
.size	__ecp_nistz256_add,.-__ecp_nistz256_add

@ void	ecp_nistz256_mul_by_3(BN_ULONG r0[8],const BN_ULONG r1[8]);
.globl	ecp_nistz256_mul_by_3
.type	ecp_nistz256_mul_by_3,%function
.align	4
ecp_nistz256_mul_by_3:
	stmdb	sp!,{r4-r12,lr}
	bl	__ecp_nistz256_mul_by_3
#if __ARM_ARCH__>=5 || !defined(__thumb__)
	ldmia	sp!,{r4-r12,pc}
#else
	ldmia	sp!,{r4-r12,lr}
	bx	lr			@ interoperable with Thumb ISA:-)
#endif
.size	ecp_nistz256_mul_by_3,.-ecp_nistz256_mul_by_3

.type	__ecp_nistz256_mul_by_3,%function
.align	4
__ecp_nistz256_mul_by_3:
	str	lr,[sp,#-4]!		@ push lr

	@ As multiplication by 3 is performed as 2*n+n, below are inline
	@ copies of __ecp_nistz256_mul_by_2 and __ecp_nistz256_add, see
	@ corresponding subroutines for details.

	ldr	$a0,[$a_ptr,#0]
	ldr	$a1,[$a_ptr,#4]
	ldr	$a2,[$a_ptr,#8]
	adds	$a0,$a0,$a0		@ a[0:7]+=a[0:7]
	ldr	$a3,[$a_ptr,#12]
	adcs	$a1,$a1,$a1
	ldr	$a4,[$a_ptr,#16]
	adcs	$a2,$a2,$a2
	ldr	$a5,[$a_ptr,#20]
	adcs	$a3,$a3,$a3
	ldr	$a6,[$a_ptr,#24]
	adcs	$a4,$a4,$a4
	ldr	$a7,[$a_ptr,#28]
	adcs	$a5,$a5,$a5
	adcs	$a6,$a6,$a6
	mov	$ff,#0
	adcs	$a7,$a7,$a7
	adc	$ff,$ff,#0

	subs	$a0,$a0,#-1		@ .Lreduce_by_sub but without stores
	sbcs	$a1,$a1,#-1
	sbcs	$a2,$a2,#-1
	sbcs	$a3,$a3,#0
	sbcs	$a4,$a4,#0
	sbcs	$a5,$a5,#0
	sbcs	$a6,$a6,#1
	sbcs	$a7,$a7,#-1
	sbc	$ff,$ff,#0

	adds	$a0,$a0,$ff		@ add synthesized modulus
	adcs	$a1,$a1,$ff
	adcs	$a2,$a2,$ff
	adcs	$a3,$a3,#0
	adcs	$a4,$a4,#0
	 ldr	$b_ptr,[$a_ptr,#0]
	adcs	$a5,$a5,#0
	 ldr	$t1,[$a_ptr,#4]
	adcs	$a6,$a6,$ff,lsr#31
	 ldr	$t2,[$a_ptr,#8]
	adc	$a7,$a7,$ff

	ldr	$t0,[$a_ptr,#12]
	adds	$a0,$a0,$b_ptr		@ 2*a[0:7]+=a[0:7]
	ldr	$b_ptr,[$a_ptr,#16]
	adcs	$a1,$a1,$t1
	ldr	$t1,[$a_ptr,#20]
	adcs	$a2,$a2,$t2
	ldr	$t2,[$a_ptr,#24]
	adcs	$a3,$a3,$t0
	ldr	$t3,[$a_ptr,#28]
	adcs	$a4,$a4,$b_ptr
	adcs	$a5,$a5,$t1
	adcs	$a6,$a6,$t2
	mov	$ff,#0
	adcs	$a7,$a7,$t3
	adc	$ff,$ff,#0
	ldr	lr,[sp],#4		@ pop lr

	b	.Lreduce_by_sub
.size	__ecp_nistz256_mul_by_3,.-__ecp_nistz256_mul_by_3

@ void	ecp_nistz256_div_by_2(BN_ULONG r0[8],const BN_ULONG r1[8]);
.globl	ecp_nistz256_div_by_2
.type	ecp_nistz256_div_by_2,%function
.align	4
ecp_nistz256_div_by_2:
	stmdb	sp!,{r4-r12,lr}
	bl	__ecp_nistz256_div_by_2
#if __ARM_ARCH__>=5 || !defined(__thumb__)
	ldmia	sp!,{r4-r12,pc}
#else
	ldmia	sp!,{r4-r12,lr}
	bx	lr			@ interoperable with Thumb ISA:-)
#endif
.size	ecp_nistz256_div_by_2,.-ecp_nistz256_div_by_2

.type	__ecp_nistz256_div_by_2,%function
.align	4
__ecp_nistz256_div_by_2:
	@ ret = (a is odd ? a+mod : a) >> 1

	ldr	$a0,[$a_ptr,#0]
	ldr	$a1,[$a_ptr,#4]
	ldr	$a2,[$a_ptr,#8]
	mov	$ff,$a0,lsl#31		@ place least significant bit to most
					@ significant position, now arithmetic
					@ right shift by 31 will produce -1 or
					@ 0, while logical right shift 1 or 0,
					@ this is how modulus is conditionally
					@ synthesized in this case...
	ldr	$a3,[$a_ptr,#12]
	adds	$a0,$a0,$ff,asr#31
	ldr	$a4,[$a_ptr,#16]
	adcs	$a1,$a1,$ff,asr#31
	ldr	$a5,[$a_ptr,#20]
	adcs	$a2,$a2,$ff,asr#31
	ldr	$a6,[$a_ptr,#24]
	adcs	$a3,$a3,#0
	ldr	$a7,[$a_ptr,#28]
	adcs	$a4,$a4,#0
	 mov	$a0,$a0,lsr#1		@ a[0:7]>>=1, we can start early
					@ because it doesn't affect flags
	adcs	$a5,$a5,#0
	 orr	$a0,$a0,$a1,lsl#31
	adcs	$a6,$a6,$ff,lsr#31
	mov	$b_ptr,#0
	adcs	$a7,$a7,$ff,asr#31
	 mov	$a1,$a1,lsr#1
	adc	$b_ptr,$b_ptr,#0	@ top-most carry bit from addition

	orr	$a1,$a1,$a2,lsl#31
	mov	$a2,$a2,lsr#1
	str	$a0,[$r_ptr,#0]
	orr	$a2,$a2,$a3,lsl#31
	mov	$a3,$a3,lsr#1
	str	$a1,[$r_ptr,#4]
	orr	$a3,$a3,$a4,lsl#31
	mov	$a4,$a4,lsr#1
	str	$a2,[$r_ptr,#8]
	orr	$a4,$a4,$a5,lsl#31
	mov	$a5,$a5,lsr#1
	str	$a3,[$r_ptr,#12]
	orr	$a5,$a5,$a6,lsl#31
	mov	$a6,$a6,lsr#1
	str	$a4,[$r_ptr,#16]
	orr	$a6,$a6,$a7,lsl#31
	mov	$a7,$a7,lsr#1
	str	$a5,[$r_ptr,#20]
	orr	$a7,$a7,$b_ptr,lsl#31	@ don't forget the top-most carry bit
	str	$a6,[$r_ptr,#24]
	str	$a7,[$r_ptr,#28]

	mov	pc,lr
.size	__ecp_nistz256_div_by_2,.-__ecp_nistz256_div_by_2

@ void	ecp_nistz256_sub(BN_ULONG r0[8],const BN_ULONG r1[8],
@				        const BN_ULONG r2[8]);
.globl	ecp_nistz256_sub
.type	ecp_nistz256_sub,%function
.align	4
ecp_nistz256_sub:
	stmdb	sp!,{r4-r12,lr}
	bl	__ecp_nistz256_sub
#if __ARM_ARCH__>=5 || !defined(__thumb__)
	ldmia	sp!,{r4-r12,pc}
#else
	ldmia	sp!,{r4-r12,lr}
	bx	lr			@ interoperable with Thumb ISA:-)
#endif
.size	ecp_nistz256_sub,.-ecp_nistz256_sub

.type	__ecp_nistz256_sub,%function
.align	4
__ecp_nistz256_sub:
	str	lr,[sp,#-4]!		@ push lr

	ldr	$a0,[$a_ptr,#0]
	ldr	$a1,[$a_ptr,#4]
	ldr	$a2,[$a_ptr,#8]
	ldr	$a3,[$a_ptr,#12]
	ldr	$a4,[$a_ptr,#16]
	 ldr	$t0,[$b_ptr,#0]
	ldr	$a5,[$a_ptr,#20]
	 ldr	$t1,[$b_ptr,#4]
	ldr	$a6,[$a_ptr,#24]
	 ldr	$t2,[$b_ptr,#8]
	ldr	$a7,[$a_ptr,#28]
	 ldr	$t3,[$b_ptr,#12]
	subs	$a0,$a0,$t0
	 ldr	$t0,[$b_ptr,#16]
	sbcs	$a1,$a1,$t1
	 ldr	$t1,[$b_ptr,#20]
	sbcs	$a2,$a2,$t2
	 ldr	$t2,[$b_ptr,#24]
	sbcs	$a3,$a3,$t3
	 ldr	$t3,[$b_ptr,#28]
	sbcs	$a4,$a4,$t0
	sbcs	$a5,$a5,$t1
	sbcs	$a6,$a6,$t2
	sbcs	$a7,$a7,$t3
	sbc	$ff,$ff,$ff		@ broadcast borrow bit
	ldr	lr,[sp],#4		@ pop lr

.Lreduce_by_add:

	@ if a-b borrows, add modulus.
	@
	@ Note that because mod has special form, i.e. consists of
	@ 0xffffffff, 1 and 0s, we can conditionally synthesize it by
	@ broadcasting borrow bit to a register, $ff, and using it as
	@ a whole or extracting single bit.

	adds	$a0,$a0,$ff		@ add synthesized modulus
	adcs	$a1,$a1,$ff
	str	$a0,[$r_ptr,#0]
	adcs	$a2,$a2,$ff
	str	$a1,[$r_ptr,#4]
	adcs	$a3,$a3,#0
	str	$a2,[$r_ptr,#8]
	adcs	$a4,$a4,#0
	str	$a3,[$r_ptr,#12]
	adcs	$a5,$a5,#0
	str	$a4,[$r_ptr,#16]
	adcs	$a6,$a6,$ff,lsr#31
	str	$a5,[$r_ptr,#20]
	adcs	$a7,$a7,$ff
	str	$a6,[$r_ptr,#24]
	str	$a7,[$r_ptr,#28]

	mov	pc,lr
.size	__ecp_nistz256_sub,.-__ecp_nistz256_sub

@ void	ecp_nistz256_neg(BN_ULONG r0[8],const BN_ULONG r1[8]);
.globl	ecp_nistz256_neg
.type	ecp_nistz256_neg,%function
.align	4
ecp_nistz256_neg:
	stmdb	sp!,{r4-r12,lr}
	bl	__ecp_nistz256_neg
#if __ARM_ARCH__>=5 || !defined(__thumb__)
	ldmia	sp!,{r4-r12,pc}
#else
	ldmia	sp!,{r4-r12,lr}
	bx	lr			@ interoperable with Thumb ISA:-)
#endif
.size	ecp_nistz256_neg,.-ecp_nistz256_neg

.type	__ecp_nistz256_neg,%function
.align	4
__ecp_nistz256_neg:
	ldr	$a0,[$a_ptr,#0]
	eor	$ff,$ff,$ff
	ldr	$a1,[$a_ptr,#4]
	ldr	$a2,[$a_ptr,#8]
	subs	$a0,$ff,$a0
	ldr	$a3,[$a_ptr,#12]
	sbcs	$a1,$ff,$a1
	ldr	$a4,[$a_ptr,#16]
	sbcs	$a2,$ff,$a2
	ldr	$a5,[$a_ptr,#20]
	sbcs	$a3,$ff,$a3
	ldr	$a6,[$a_ptr,#24]
	sbcs	$a4,$ff,$a4
	ldr	$a7,[$a_ptr,#28]
	sbcs	$a5,$ff,$a5
	sbcs	$a6,$ff,$a6
	sbcs	$a7,$ff,$a7
	sbc	$ff,$ff,$ff

	b	.Lreduce_by_add
.size	__ecp_nistz256_neg,.-__ecp_nistz256_neg
___
{
# Montgomery multiplication uses its own register naming: @acc is a
# window of nine accumulator registers (r3-r11) that is rotated by one
# position after every iteration of the loop below.
my @acc=map("r$_",(3..11));
my ($t0,$t1,$bj,$t2,$t3)=map("r$_",(0,1,2,12,14));

$code.=<<___;
@ void	ecp_nistz256_sqr_mont(BN_ULONG r0[8],const BN_ULONG r1[8]);
.globl	ecp_nistz256_sqr_mont
.type	ecp_nistz256_sqr_mont,%function
.align	4
ecp_nistz256_sqr_mont:
	mov	$b_ptr,$a_ptr
	b	.Lecp_nistz256_mul_mont
.size	ecp_nistz256_sqr_mont,.-ecp_nistz256_sqr_mont

@ void	ecp_nistz256_mul_mont(BN_ULONG r0[8],const BN_ULONG r1[8],
@					     const BN_ULONG r2[8]);
.globl	ecp_nistz256_mul_mont
.type	ecp_nistz256_mul_mont,%function
.align	4
ecp_nistz256_mul_mont:
.Lecp_nistz256_mul_mont:
	stmdb	sp!,{r4-r12,lr}
	bl	__ecp_nistz256_mul_mont
#if __ARM_ARCH__>=5 || !defined(__thumb__)
	ldmia	sp!,{r4-r12,pc}
#else
	ldmia	sp!,{r4-r12,lr}
	bx	lr			@ interoperable with Thumb ISA:-)
#endif
.size	ecp_nistz256_mul_mont,.-ecp_nistz256_mul_mont

.type	__ecp_nistz256_mul_mont,%function
.align	4
__ecp_nistz256_mul_mont:
	stmdb	sp!,{r0-r2,lr}			@ make a copy of arguments too

	ldr	$bj,[$b_ptr,#0]			@ b[0]
	ldmia	$a_ptr,{@acc[1]-@acc[8]}

	umull	@acc[0],$t3,@acc[1],$bj		@ r[0]=a[0]*b[0]
	stmdb	sp!,{$acc[1]-@acc[8]}		@ copy a[0-7] to stack, so
						@ that it can be addressed
						@ without spending register
						@ on address
	umull	@acc[1],$t0,@acc[2],$bj		@ r[1]=a[1]*b[0]
	umull	@acc[2],$t1,@acc[3],$bj
	adds	@acc[1],@acc[1],$t3		@ accumulate high part of mult
	umull	@acc[3],$t2,@acc[4],$bj
	adcs	@acc[2],@acc[2],$t0
	umull	@acc[4],$t3,@acc[5],$bj
	adcs	@acc[3],@acc[3],$t1
	umull	@acc[5],$t0,@acc[6],$bj
	adcs	@acc[4],@acc[4],$t2
	umull	@acc[6],$t1,@acc[7],$bj
	adcs	@acc[5],@acc[5],$t3
	umull	@acc[7],$t2,@acc[8],$bj
	adcs	@acc[6],@acc[6],$t0
	adcs	@acc[7],@acc[7],$t1
	eor	$t3,$t3,$t3			@ first overflow bit is zero
	adc	@acc[8],$t2,#0
___
for(my $i=1;$i<8;$i++) {
my $t4=@acc[0];

	# Reduction iteration is normally performed by accumulating
	# result of multiplication of modulus by "magic" digit [and
	# omitting least significant word, which is guaranteed to
	# be 0], but thanks to special form of modulus and "magic"
	# digit being equal to least significant word, it can be
	# performed with additions and subtractions alone. Indeed:
	#
	#        ffff.0001.0000.0000.0000.ffff.ffff.ffff
	# *                                         abcd
	# + xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.abcd
	#
	# Now observing that ff..ff*x = (2^n-1)*x = 2^n*x-x, we
	# rewrite above as:
	#
	#   xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.abcd
	# + abcd.0000.abcd.0000.0000.abcd.0000.0000.0000
	# -      abcd.0000.0000.0000.0000.0000.0000.abcd
	#
	# or marking redundant operations:
	#
	#   xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.----
	# + abcd.0000.abcd.0000.0000.abcd.----.----.----
	# -      abcd.----.----.----.----.----.----.----

$code.=<<___;
	@ multiplication-less reduction $i
	adds	@acc[3],@acc[3],@acc[0]		@ r[3]+=r[0]
	 ldr	$bj,[sp,#40]			@ restore b_ptr
	adcs	@acc[4],@acc[4],#0		@ r[4]+=0
	adcs	@acc[5],@acc[5],#0		@ r[5]+=0
	adcs	@acc[6],@acc[6],@acc[0]		@ r[6]+=r[0]
	 ldr	$t1,[sp,#0]			@ load a[0]
	adcs	@acc[7],@acc[7],#0		@ r[7]+=0
	 ldr	$bj,[$bj,#4*$i]			@ load b[i]
	adcs	@acc[8],@acc[8],@acc[0]		@ r[8]+=r[0]
	 eor	$t0,$t0,$t0
	adc	$t3,$t3,#0			@ overflow bit
	subs	@acc[7],@acc[7],@acc[0]		@ r[7]-=r[0]
	 ldr	$t2,[sp,#4]			@ a[1]
	sbcs	@acc[8],@acc[8],#0		@ r[8]-=0
	 umlal	@acc[1],$t0,$t1,$bj		@ "r[0]"+=a[0]*b[i]
	 eor	$t1,$t1,$t1
	sbc	@acc[0],$t3,#0			@ overflow bit, keep in mind
						@ that netto result is
						@ addition of a value which
						@ makes underflow impossible

	ldr	$t3,[sp,#8]			@ a[2]
	umlal	@acc[2],$t1,$t2,$bj		@ "r[1]"+=a[1]*b[i]
	 str	@acc[0],[sp,#36]		@ temporarily offload overflow
	eor	$t2,$t2,$t2
	ldr	$t4,[sp,#12]			@ a[3], $t4 is alias @acc[0]
	umlal	@acc[3],$t2,$t3,$bj		@ "r[2]"+=a[2]*b[i]
	eor	$t3,$t3,$t3
	adds	@acc[2],@acc[2],$t0		@ accumulate high part of mult
	ldr	$t0,[sp,#16]			@ a[4]
	umlal	@acc[4],$t3,$t4,$bj		@ "r[3]"+=a[3]*b[i]
	eor	$t4,$t4,$t4
	adcs	@acc[3],@acc[3],$t1
	ldr	$t1,[sp,#20]			@ a[5]
	umlal	@acc[5],$t4,$t0,$bj		@ "r[4]"+=a[4]*b[i]
	eor	$t0,$t0,$t0
	adcs	@acc[4],@acc[4],$t2
	ldr	$t2,[sp,#24]			@ a[6]
	umlal	@acc[6],$t0,$t1,$bj		@ "r[5]"+=a[5]*b[i]
	eor	$t1,$t1,$t1
	adcs	@acc[5],@acc[5],$t3
	ldr	$t3,[sp,#28]			@ a[7]
	umlal	@acc[7],$t1,$t2,$bj		@ "r[6]"+=a[6]*b[i]
	eor	$t2,$t2,$t2
	adcs	@acc[6],@acc[6],$t4
	 ldr	@acc[0],[sp,#36]		@ restore overflow bit
	umlal	@acc[8],$t2,$t3,$bj		@ "r[7]"+=a[7]*b[i]
	eor	$t3,$t3,$t3
	adcs	@acc[7],@acc[7],$t0
	adcs	@acc[8],@acc[8],$t1
	adcs	@acc[0],$acc[0],$t2
	adc	$t3,$t3,#0			@ new overflow bit
___
	push(@acc,shift(@acc));	# rotate registers, so that
				# "r[i]" becomes r[i]
}
$code.=<<___;
	@ last multiplication-less reduction
	adds	@acc[3],@acc[3],@acc[0]
	ldr	$r_ptr,[sp,#32]			@ restore r_ptr
	adcs	@acc[4],@acc[4],#0
	adcs	@acc[5],@acc[5],#0
	adcs	@acc[6],@acc[6],@acc[0]
	adcs	@acc[7],@acc[7],#0
	adcs	@acc[8],@acc[8],@acc[0]
	adc	$t3,$t3,#0
	subs	@acc[7],@acc[7],@acc[0]
	sbcs	@acc[8],@acc[8],#0
	sbc	@acc[0],$t3,#0			@ overflow bit

	@ Final step is "if result > mod, subtract mod", but we do it
	@ "other way around", namely subtract modulus from result
	@ and if it borrowed, add modulus back.

	adds	@acc[1],@acc[1],#1		@ subs	@acc[1],@acc[1],#-1
	adcs	@acc[2],@acc[2],#0		@ sbcs	@acc[2],@acc[2],#-1
	adcs	@acc[3],@acc[3],#0		@ sbcs	@acc[3],@acc[3],#-1
	sbcs	@acc[4],@acc[4],#0
	sbcs	@acc[5],@acc[5],#0
	sbcs	@acc[6],@acc[6],#0
	sbcs	@acc[7],@acc[7],#1
	adcs	@acc[8],@acc[8],#0		@ sbcs	@acc[8],@acc[8],#-1
	ldr	lr,[sp,#44]			@ restore lr
	sbc	@acc[0],@acc[0],#0		@ broadcast borrow bit
	add	sp,sp,#48

	@ Note that because mod has special form, i.e. consists of
	@ 0xffffffff, 1 and 0s, we can conditionally synthesize it by
	@ broadcasting borrow bit to a register, @acc[0], and using it as
	@ a whole or extracting single bit.
733e71b7053SJung-uk Kim 734e71b7053SJung-uk Kim adds @acc[1],@acc[1],@acc[0] @ add modulus or zero 735e71b7053SJung-uk Kim adcs @acc[2],@acc[2],@acc[0] 736e71b7053SJung-uk Kim str @acc[1],[$r_ptr,#0] 737e71b7053SJung-uk Kim adcs @acc[3],@acc[3],@acc[0] 738e71b7053SJung-uk Kim str @acc[2],[$r_ptr,#4] 739e71b7053SJung-uk Kim adcs @acc[4],@acc[4],#0 740e71b7053SJung-uk Kim str @acc[3],[$r_ptr,#8] 741e71b7053SJung-uk Kim adcs @acc[5],@acc[5],#0 742e71b7053SJung-uk Kim str @acc[4],[$r_ptr,#12] 743e71b7053SJung-uk Kim adcs @acc[6],@acc[6],#0 744e71b7053SJung-uk Kim str @acc[5],[$r_ptr,#16] 745e71b7053SJung-uk Kim adcs @acc[7],@acc[7],@acc[0],lsr#31 746e71b7053SJung-uk Kim str @acc[6],[$r_ptr,#20] 747e71b7053SJung-uk Kim adc @acc[8],@acc[8],@acc[0] 748e71b7053SJung-uk Kim str @acc[7],[$r_ptr,#24] 749e71b7053SJung-uk Kim str @acc[8],[$r_ptr,#28] 750e71b7053SJung-uk Kim 751e71b7053SJung-uk Kim mov pc,lr 752e71b7053SJung-uk Kim.size __ecp_nistz256_mul_mont,.-__ecp_nistz256_mul_mont 753e71b7053SJung-uk Kim___ 754e71b7053SJung-uk Kim} 755e71b7053SJung-uk Kim 756e71b7053SJung-uk Kim{ 757e71b7053SJung-uk Kimmy ($out,$inp,$index,$mask)=map("r$_",(0..3)); 758e71b7053SJung-uk Kim$code.=<<___; 759e71b7053SJung-uk Kim@ void ecp_nistz256_scatter_w5(void *r0,const P256_POINT *r1, 760e71b7053SJung-uk Kim@ int r2); 761e71b7053SJung-uk Kim.globl ecp_nistz256_scatter_w5 762e71b7053SJung-uk Kim.type ecp_nistz256_scatter_w5,%function 763e71b7053SJung-uk Kim.align 5 764e71b7053SJung-uk Kimecp_nistz256_scatter_w5: 765e71b7053SJung-uk Kim stmdb sp!,{r4-r11} 766e71b7053SJung-uk Kim 767e71b7053SJung-uk Kim add $out,$out,$index,lsl#2 768e71b7053SJung-uk Kim 769e71b7053SJung-uk Kim ldmia $inp!,{r4-r11} @ X 770e71b7053SJung-uk Kim str r4,[$out,#64*0-4] 771e71b7053SJung-uk Kim str r5,[$out,#64*1-4] 772e71b7053SJung-uk Kim str r6,[$out,#64*2-4] 773e71b7053SJung-uk Kim str r7,[$out,#64*3-4] 774e71b7053SJung-uk Kim str r8,[$out,#64*4-4] 775e71b7053SJung-uk Kim str r9,[$out,#64*5-4] 776e71b7053SJung-uk 
Kim str r10,[$out,#64*6-4] 777e71b7053SJung-uk Kim str r11,[$out,#64*7-4] 778e71b7053SJung-uk Kim add $out,$out,#64*8 779e71b7053SJung-uk Kim 780e71b7053SJung-uk Kim ldmia $inp!,{r4-r11} @ Y 781e71b7053SJung-uk Kim str r4,[$out,#64*0-4] 782e71b7053SJung-uk Kim str r5,[$out,#64*1-4] 783e71b7053SJung-uk Kim str r6,[$out,#64*2-4] 784e71b7053SJung-uk Kim str r7,[$out,#64*3-4] 785e71b7053SJung-uk Kim str r8,[$out,#64*4-4] 786e71b7053SJung-uk Kim str r9,[$out,#64*5-4] 787e71b7053SJung-uk Kim str r10,[$out,#64*6-4] 788e71b7053SJung-uk Kim str r11,[$out,#64*7-4] 789e71b7053SJung-uk Kim add $out,$out,#64*8 790e71b7053SJung-uk Kim 791e71b7053SJung-uk Kim ldmia $inp,{r4-r11} @ Z 792e71b7053SJung-uk Kim str r4,[$out,#64*0-4] 793e71b7053SJung-uk Kim str r5,[$out,#64*1-4] 794e71b7053SJung-uk Kim str r6,[$out,#64*2-4] 795e71b7053SJung-uk Kim str r7,[$out,#64*3-4] 796e71b7053SJung-uk Kim str r8,[$out,#64*4-4] 797e71b7053SJung-uk Kim str r9,[$out,#64*5-4] 798e71b7053SJung-uk Kim str r10,[$out,#64*6-4] 799e71b7053SJung-uk Kim str r11,[$out,#64*7-4] 800e71b7053SJung-uk Kim 801e71b7053SJung-uk Kim ldmia sp!,{r4-r11} 802e71b7053SJung-uk Kim#if __ARM_ARCH__>=5 || defined(__thumb__) 803e71b7053SJung-uk Kim bx lr 804e71b7053SJung-uk Kim#else 805e71b7053SJung-uk Kim mov pc,lr 806e71b7053SJung-uk Kim#endif 807e71b7053SJung-uk Kim.size ecp_nistz256_scatter_w5,.-ecp_nistz256_scatter_w5 808e71b7053SJung-uk Kim 809e71b7053SJung-uk Kim@ void ecp_nistz256_gather_w5(P256_POINT *r0,const void *r1, 810e71b7053SJung-uk Kim@ int r2); 811e71b7053SJung-uk Kim.globl ecp_nistz256_gather_w5 812e71b7053SJung-uk Kim.type ecp_nistz256_gather_w5,%function 813e71b7053SJung-uk Kim.align 5 814e71b7053SJung-uk Kimecp_nistz256_gather_w5: 815e71b7053SJung-uk Kim stmdb sp!,{r4-r11} 816e71b7053SJung-uk Kim 817e71b7053SJung-uk Kim cmp $index,#0 818e71b7053SJung-uk Kim mov $mask,#0 819e71b7053SJung-uk Kim#ifdef __thumb2__ 820e71b7053SJung-uk Kim itt ne 821e71b7053SJung-uk Kim#endif 822e71b7053SJung-uk Kim subne 
$index,$index,#1 823e71b7053SJung-uk Kim movne $mask,#-1 824e71b7053SJung-uk Kim add $inp,$inp,$index,lsl#2 825e71b7053SJung-uk Kim 826e71b7053SJung-uk Kim ldr r4,[$inp,#64*0] 827e71b7053SJung-uk Kim ldr r5,[$inp,#64*1] 828e71b7053SJung-uk Kim ldr r6,[$inp,#64*2] 829e71b7053SJung-uk Kim and r4,r4,$mask 830e71b7053SJung-uk Kim ldr r7,[$inp,#64*3] 831e71b7053SJung-uk Kim and r5,r5,$mask 832e71b7053SJung-uk Kim ldr r8,[$inp,#64*4] 833e71b7053SJung-uk Kim and r6,r6,$mask 834e71b7053SJung-uk Kim ldr r9,[$inp,#64*5] 835e71b7053SJung-uk Kim and r7,r7,$mask 836e71b7053SJung-uk Kim ldr r10,[$inp,#64*6] 837e71b7053SJung-uk Kim and r8,r8,$mask 838e71b7053SJung-uk Kim ldr r11,[$inp,#64*7] 839e71b7053SJung-uk Kim add $inp,$inp,#64*8 840e71b7053SJung-uk Kim and r9,r9,$mask 841e71b7053SJung-uk Kim and r10,r10,$mask 842e71b7053SJung-uk Kim and r11,r11,$mask 843e71b7053SJung-uk Kim stmia $out!,{r4-r11} @ X 844e71b7053SJung-uk Kim 845e71b7053SJung-uk Kim ldr r4,[$inp,#64*0] 846e71b7053SJung-uk Kim ldr r5,[$inp,#64*1] 847e71b7053SJung-uk Kim ldr r6,[$inp,#64*2] 848e71b7053SJung-uk Kim and r4,r4,$mask 849e71b7053SJung-uk Kim ldr r7,[$inp,#64*3] 850e71b7053SJung-uk Kim and r5,r5,$mask 851e71b7053SJung-uk Kim ldr r8,[$inp,#64*4] 852e71b7053SJung-uk Kim and r6,r6,$mask 853e71b7053SJung-uk Kim ldr r9,[$inp,#64*5] 854e71b7053SJung-uk Kim and r7,r7,$mask 855e71b7053SJung-uk Kim ldr r10,[$inp,#64*6] 856e71b7053SJung-uk Kim and r8,r8,$mask 857e71b7053SJung-uk Kim ldr r11,[$inp,#64*7] 858e71b7053SJung-uk Kim add $inp,$inp,#64*8 859e71b7053SJung-uk Kim and r9,r9,$mask 860e71b7053SJung-uk Kim and r10,r10,$mask 861e71b7053SJung-uk Kim and r11,r11,$mask 862e71b7053SJung-uk Kim stmia $out!,{r4-r11} @ Y 863e71b7053SJung-uk Kim 864e71b7053SJung-uk Kim ldr r4,[$inp,#64*0] 865e71b7053SJung-uk Kim ldr r5,[$inp,#64*1] 866e71b7053SJung-uk Kim ldr r6,[$inp,#64*2] 867e71b7053SJung-uk Kim and r4,r4,$mask 868e71b7053SJung-uk Kim ldr r7,[$inp,#64*3] 869e71b7053SJung-uk Kim and r5,r5,$mask 870e71b7053SJung-uk 
Kim ldr r8,[$inp,#64*4] 871e71b7053SJung-uk Kim and r6,r6,$mask 872e71b7053SJung-uk Kim ldr r9,[$inp,#64*5] 873e71b7053SJung-uk Kim and r7,r7,$mask 874e71b7053SJung-uk Kim ldr r10,[$inp,#64*6] 875e71b7053SJung-uk Kim and r8,r8,$mask 876e71b7053SJung-uk Kim ldr r11,[$inp,#64*7] 877e71b7053SJung-uk Kim and r9,r9,$mask 878e71b7053SJung-uk Kim and r10,r10,$mask 879e71b7053SJung-uk Kim and r11,r11,$mask 880e71b7053SJung-uk Kim stmia $out,{r4-r11} @ Z 881e71b7053SJung-uk Kim 882e71b7053SJung-uk Kim ldmia sp!,{r4-r11} 883e71b7053SJung-uk Kim#if __ARM_ARCH__>=5 || defined(__thumb__) 884e71b7053SJung-uk Kim bx lr 885e71b7053SJung-uk Kim#else 886e71b7053SJung-uk Kim mov pc,lr 887e71b7053SJung-uk Kim#endif 888e71b7053SJung-uk Kim.size ecp_nistz256_gather_w5,.-ecp_nistz256_gather_w5 889e71b7053SJung-uk Kim 890e71b7053SJung-uk Kim@ void ecp_nistz256_scatter_w7(void *r0,const P256_POINT_AFFINE *r1, 891e71b7053SJung-uk Kim@ int r2); 892e71b7053SJung-uk Kim.globl ecp_nistz256_scatter_w7 893e71b7053SJung-uk Kim.type ecp_nistz256_scatter_w7,%function 894e71b7053SJung-uk Kim.align 5 895e71b7053SJung-uk Kimecp_nistz256_scatter_w7: 896e71b7053SJung-uk Kim add $out,$out,$index 897e71b7053SJung-uk Kim mov $index,#64/4 898e71b7053SJung-uk Kim.Loop_scatter_w7: 899e71b7053SJung-uk Kim ldr $mask,[$inp],#4 900e71b7053SJung-uk Kim subs $index,$index,#1 901e71b7053SJung-uk Kim strb $mask,[$out,#64*0] 902e71b7053SJung-uk Kim mov $mask,$mask,lsr#8 903e71b7053SJung-uk Kim strb $mask,[$out,#64*1] 904e71b7053SJung-uk Kim mov $mask,$mask,lsr#8 905e71b7053SJung-uk Kim strb $mask,[$out,#64*2] 906e71b7053SJung-uk Kim mov $mask,$mask,lsr#8 907e71b7053SJung-uk Kim strb $mask,[$out,#64*3] 908e71b7053SJung-uk Kim add $out,$out,#64*4 909e71b7053SJung-uk Kim bne .Loop_scatter_w7 910e71b7053SJung-uk Kim 911e71b7053SJung-uk Kim#if __ARM_ARCH__>=5 || defined(__thumb__) 912e71b7053SJung-uk Kim bx lr 913e71b7053SJung-uk Kim#else 914e71b7053SJung-uk Kim mov pc,lr 915e71b7053SJung-uk Kim#endif 916e71b7053SJung-uk 
Kim.size ecp_nistz256_scatter_w7,.-ecp_nistz256_scatter_w7 917e71b7053SJung-uk Kim 918e71b7053SJung-uk Kim@ void ecp_nistz256_gather_w7(P256_POINT_AFFINE *r0,const void *r1, 919e71b7053SJung-uk Kim@ int r2); 920e71b7053SJung-uk Kim.globl ecp_nistz256_gather_w7 921e71b7053SJung-uk Kim.type ecp_nistz256_gather_w7,%function 922e71b7053SJung-uk Kim.align 5 923e71b7053SJung-uk Kimecp_nistz256_gather_w7: 924e71b7053SJung-uk Kim stmdb sp!,{r4-r7} 925e71b7053SJung-uk Kim 926e71b7053SJung-uk Kim cmp $index,#0 927e71b7053SJung-uk Kim mov $mask,#0 928e71b7053SJung-uk Kim#ifdef __thumb2__ 929e71b7053SJung-uk Kim itt ne 930e71b7053SJung-uk Kim#endif 931e71b7053SJung-uk Kim subne $index,$index,#1 932e71b7053SJung-uk Kim movne $mask,#-1 933e71b7053SJung-uk Kim add $inp,$inp,$index 934e71b7053SJung-uk Kim mov $index,#64/4 935e71b7053SJung-uk Kim nop 936e71b7053SJung-uk Kim.Loop_gather_w7: 937e71b7053SJung-uk Kim ldrb r4,[$inp,#64*0] 938e71b7053SJung-uk Kim subs $index,$index,#1 939e71b7053SJung-uk Kim ldrb r5,[$inp,#64*1] 940e71b7053SJung-uk Kim ldrb r6,[$inp,#64*2] 941e71b7053SJung-uk Kim ldrb r7,[$inp,#64*3] 942e71b7053SJung-uk Kim add $inp,$inp,#64*4 943e71b7053SJung-uk Kim orr r4,r4,r5,lsl#8 944e71b7053SJung-uk Kim orr r4,r4,r6,lsl#16 945e71b7053SJung-uk Kim orr r4,r4,r7,lsl#24 946e71b7053SJung-uk Kim and r4,r4,$mask 947e71b7053SJung-uk Kim str r4,[$out],#4 948e71b7053SJung-uk Kim bne .Loop_gather_w7 949e71b7053SJung-uk Kim 950e71b7053SJung-uk Kim ldmia sp!,{r4-r7} 951e71b7053SJung-uk Kim#if __ARM_ARCH__>=5 || defined(__thumb__) 952e71b7053SJung-uk Kim bx lr 953e71b7053SJung-uk Kim#else 954e71b7053SJung-uk Kim mov pc,lr 955e71b7053SJung-uk Kim#endif 956e71b7053SJung-uk Kim.size ecp_nistz256_gather_w7,.-ecp_nistz256_gather_w7 957e71b7053SJung-uk Kim___ 958e71b7053SJung-uk Kim} 959e71b7053SJung-uk Kimif (0) { 960e71b7053SJung-uk Kim# In comparison to integer-only equivalent of below subroutine: 961e71b7053SJung-uk Kim# 962e71b7053SJung-uk Kim# Cortex-A8 +10% 963e71b7053SJung-uk 
Kim# Cortex-A9 -10% 964e71b7053SJung-uk Kim# Snapdragon S4 +5% 965e71b7053SJung-uk Kim# 966e71b7053SJung-uk Kim# As not all time is spent in multiplication, overall impact is deemed 967e71b7053SJung-uk Kim# too low to care about. 968e71b7053SJung-uk Kim 969e71b7053SJung-uk Kimmy ($A0,$A1,$A2,$A3,$Bi,$zero,$temp)=map("d$_",(0..7)); 970e71b7053SJung-uk Kimmy $mask="q4"; 971e71b7053SJung-uk Kimmy $mult="q5"; 972e71b7053SJung-uk Kimmy @AxB=map("q$_",(8..15)); 973e71b7053SJung-uk Kim 974e71b7053SJung-uk Kimmy ($rptr,$aptr,$bptr,$toutptr)=map("r$_",(0..3)); 975e71b7053SJung-uk Kim 976e71b7053SJung-uk Kim$code.=<<___; 977e71b7053SJung-uk Kim#if __ARM_ARCH__>=7 978e71b7053SJung-uk Kim.fpu neon 979e71b7053SJung-uk Kim 980e71b7053SJung-uk Kim.globl ecp_nistz256_mul_mont_neon 981e71b7053SJung-uk Kim.type ecp_nistz256_mul_mont_neon,%function 982e71b7053SJung-uk Kim.align 5 983e71b7053SJung-uk Kimecp_nistz256_mul_mont_neon: 984e71b7053SJung-uk Kim mov ip,sp 985e71b7053SJung-uk Kim stmdb sp!,{r4-r9} 986e71b7053SJung-uk Kim vstmdb sp!,{q4-q5} @ ABI specification says so 987e71b7053SJung-uk Kim 988e71b7053SJung-uk Kim sub $toutptr,sp,#40 989e71b7053SJung-uk Kim vld1.32 {${Bi}[0]},[$bptr,:32]! 
990e71b7053SJung-uk Kim veor $zero,$zero,$zero 991e71b7053SJung-uk Kim vld1.32 {$A0-$A3}, [$aptr] @ can't specify :32 :-( 992e71b7053SJung-uk Kim vzip.16 $Bi,$zero 993e71b7053SJung-uk Kim mov sp,$toutptr @ alloca 994e71b7053SJung-uk Kim vmov.i64 $mask,#0xffff 995e71b7053SJung-uk Kim 996e71b7053SJung-uk Kim vmull.u32 @AxB[0],$Bi,${A0}[0] 997e71b7053SJung-uk Kim vmull.u32 @AxB[1],$Bi,${A0}[1] 998e71b7053SJung-uk Kim vmull.u32 @AxB[2],$Bi,${A1}[0] 999e71b7053SJung-uk Kim vmull.u32 @AxB[3],$Bi,${A1}[1] 1000e71b7053SJung-uk Kim vshr.u64 $temp,@AxB[0]#lo,#16 1001e71b7053SJung-uk Kim vmull.u32 @AxB[4],$Bi,${A2}[0] 1002e71b7053SJung-uk Kim vadd.u64 @AxB[0]#hi,@AxB[0]#hi,$temp 1003e71b7053SJung-uk Kim vmull.u32 @AxB[5],$Bi,${A2}[1] 1004e71b7053SJung-uk Kim vshr.u64 $temp,@AxB[0]#hi,#16 @ upper 32 bits of a[0]*b[0] 1005e71b7053SJung-uk Kim vmull.u32 @AxB[6],$Bi,${A3}[0] 1006e71b7053SJung-uk Kim vand.u64 @AxB[0],@AxB[0],$mask @ lower 32 bits of a[0]*b[0] 1007e71b7053SJung-uk Kim vmull.u32 @AxB[7],$Bi,${A3}[1] 1008e71b7053SJung-uk Kim___ 1009e71b7053SJung-uk Kimfor($i=1;$i<8;$i++) { 1010e71b7053SJung-uk Kim$code.=<<___; 1011e71b7053SJung-uk Kim vld1.32 {${Bi}[0]},[$bptr,:32]! 
1012e71b7053SJung-uk Kim veor $zero,$zero,$zero 1013e71b7053SJung-uk Kim vadd.u64 @AxB[1]#lo,@AxB[1]#lo,$temp @ reduction 1014e71b7053SJung-uk Kim vshl.u64 $mult,@AxB[0],#32 1015e71b7053SJung-uk Kim vadd.u64 @AxB[3],@AxB[3],@AxB[0] 1016e71b7053SJung-uk Kim vsub.u64 $mult,$mult,@AxB[0] 1017e71b7053SJung-uk Kim vzip.16 $Bi,$zero 1018e71b7053SJung-uk Kim vadd.u64 @AxB[6],@AxB[6],@AxB[0] 1019e71b7053SJung-uk Kim vadd.u64 @AxB[7],@AxB[7],$mult 1020e71b7053SJung-uk Kim___ 1021e71b7053SJung-uk Kim push(@AxB,shift(@AxB)); 1022e71b7053SJung-uk Kim$code.=<<___; 1023e71b7053SJung-uk Kim vmlal.u32 @AxB[0],$Bi,${A0}[0] 1024e71b7053SJung-uk Kim vmlal.u32 @AxB[1],$Bi,${A0}[1] 1025e71b7053SJung-uk Kim vmlal.u32 @AxB[2],$Bi,${A1}[0] 1026e71b7053SJung-uk Kim vmlal.u32 @AxB[3],$Bi,${A1}[1] 1027e71b7053SJung-uk Kim vshr.u64 $temp,@AxB[0]#lo,#16 1028e71b7053SJung-uk Kim vmlal.u32 @AxB[4],$Bi,${A2}[0] 1029e71b7053SJung-uk Kim vadd.u64 @AxB[0]#hi,@AxB[0]#hi,$temp 1030e71b7053SJung-uk Kim vmlal.u32 @AxB[5],$Bi,${A2}[1] 1031e71b7053SJung-uk Kim vshr.u64 $temp,@AxB[0]#hi,#16 @ upper 33 bits of a[0]*b[i]+t[0] 1032e71b7053SJung-uk Kim vmlal.u32 @AxB[6],$Bi,${A3}[0] 1033e71b7053SJung-uk Kim vand.u64 @AxB[0],@AxB[0],$mask @ lower 32 bits of a[0]*b[0] 1034e71b7053SJung-uk Kim vmull.u32 @AxB[7],$Bi,${A3}[1] 1035e71b7053SJung-uk Kim___ 1036e71b7053SJung-uk Kim} 1037e71b7053SJung-uk Kim$code.=<<___; 1038e71b7053SJung-uk Kim vadd.u64 @AxB[1]#lo,@AxB[1]#lo,$temp @ last reduction 1039e71b7053SJung-uk Kim vshl.u64 $mult,@AxB[0],#32 1040e71b7053SJung-uk Kim vadd.u64 @AxB[3],@AxB[3],@AxB[0] 1041e71b7053SJung-uk Kim vsub.u64 $mult,$mult,@AxB[0] 1042e71b7053SJung-uk Kim vadd.u64 @AxB[6],@AxB[6],@AxB[0] 1043e71b7053SJung-uk Kim vadd.u64 @AxB[7],@AxB[7],$mult 1044e71b7053SJung-uk Kim 1045e71b7053SJung-uk Kim vshr.u64 $temp,@AxB[1]#lo,#16 @ convert 1046e71b7053SJung-uk Kim vadd.u64 @AxB[1]#hi,@AxB[1]#hi,$temp 1047e71b7053SJung-uk Kim vshr.u64 $temp,@AxB[1]#hi,#16 1048e71b7053SJung-uk Kim vzip.16 
@AxB[1]#lo,@AxB[1]#hi 1049e71b7053SJung-uk Kim___ 1050e71b7053SJung-uk Kimforeach (2..7) { 1051e71b7053SJung-uk Kim$code.=<<___; 1052e71b7053SJung-uk Kim vadd.u64 @AxB[$_]#lo,@AxB[$_]#lo,$temp 1053e71b7053SJung-uk Kim vst1.32 {@AxB[$_-1]#lo[0]},[$toutptr,:32]! 1054e71b7053SJung-uk Kim vshr.u64 $temp,@AxB[$_]#lo,#16 1055e71b7053SJung-uk Kim vadd.u64 @AxB[$_]#hi,@AxB[$_]#hi,$temp 1056e71b7053SJung-uk Kim vshr.u64 $temp,@AxB[$_]#hi,#16 1057e71b7053SJung-uk Kim vzip.16 @AxB[$_]#lo,@AxB[$_]#hi 1058e71b7053SJung-uk Kim___ 1059e71b7053SJung-uk Kim} 1060e71b7053SJung-uk Kim$code.=<<___; 1061e71b7053SJung-uk Kim vst1.32 {@AxB[7]#lo[0]},[$toutptr,:32]! 1062e71b7053SJung-uk Kim vst1.32 {$temp},[$toutptr] @ upper 33 bits 1063e71b7053SJung-uk Kim 1064e71b7053SJung-uk Kim ldr r1,[sp,#0] 1065e71b7053SJung-uk Kim ldr r2,[sp,#4] 1066e71b7053SJung-uk Kim ldr r3,[sp,#8] 1067e71b7053SJung-uk Kim subs r1,r1,#-1 1068e71b7053SJung-uk Kim ldr r4,[sp,#12] 1069e71b7053SJung-uk Kim sbcs r2,r2,#-1 1070e71b7053SJung-uk Kim ldr r5,[sp,#16] 1071e71b7053SJung-uk Kim sbcs r3,r3,#-1 1072e71b7053SJung-uk Kim ldr r6,[sp,#20] 1073e71b7053SJung-uk Kim sbcs r4,r4,#0 1074e71b7053SJung-uk Kim ldr r7,[sp,#24] 1075e71b7053SJung-uk Kim sbcs r5,r5,#0 1076e71b7053SJung-uk Kim ldr r8,[sp,#28] 1077e71b7053SJung-uk Kim sbcs r6,r6,#0 1078e71b7053SJung-uk Kim ldr r9,[sp,#32] @ top-most bit 1079e71b7053SJung-uk Kim sbcs r7,r7,#1 1080e71b7053SJung-uk Kim sub sp,ip,#40+16 1081e71b7053SJung-uk Kim sbcs r8,r8,#-1 1082e71b7053SJung-uk Kim sbc r9,r9,#0 1083e71b7053SJung-uk Kim vldmia sp!,{q4-q5} 1084e71b7053SJung-uk Kim 1085e71b7053SJung-uk Kim adds r1,r1,r9 1086e71b7053SJung-uk Kim adcs r2,r2,r9 1087e71b7053SJung-uk Kim str r1,[$rptr,#0] 1088e71b7053SJung-uk Kim adcs r3,r3,r9 1089e71b7053SJung-uk Kim str r2,[$rptr,#4] 1090e71b7053SJung-uk Kim adcs r4,r4,#0 1091e71b7053SJung-uk Kim str r3,[$rptr,#8] 1092e71b7053SJung-uk Kim adcs r5,r5,#0 1093e71b7053SJung-uk Kim str r4,[$rptr,#12] 1094e71b7053SJung-uk Kim adcs r6,r6,#0 
1095e71b7053SJung-uk Kim str r5,[$rptr,#16] 1096e71b7053SJung-uk Kim adcs r7,r7,r9,lsr#31 1097e71b7053SJung-uk Kim str r6,[$rptr,#20] 1098e71b7053SJung-uk Kim adcs r8,r8,r9 1099e71b7053SJung-uk Kim str r7,[$rptr,#24] 1100e71b7053SJung-uk Kim str r8,[$rptr,#28] 1101e71b7053SJung-uk Kim 1102e71b7053SJung-uk Kim ldmia sp!,{r4-r9} 1103e71b7053SJung-uk Kim bx lr 1104e71b7053SJung-uk Kim.size ecp_nistz256_mul_mont_neon,.-ecp_nistz256_mul_mont_neon 1105e71b7053SJung-uk Kim#endif 1106e71b7053SJung-uk Kim___ 1107e71b7053SJung-uk Kim} 1108e71b7053SJung-uk Kim 1109e71b7053SJung-uk Kim{{{ 1110e71b7053SJung-uk Kim######################################################################## 1111e71b7053SJung-uk Kim# Below $aN assignment matches order in which 256-bit result appears in 1112e71b7053SJung-uk Kim# register bank at return from __ecp_nistz256_mul_mont, so that we can 1113e71b7053SJung-uk Kim# skip over reloading it from memory. This means that below functions 1114e71b7053SJung-uk Kim# use custom calling sequence accepting 256-bit input in registers, 1115e71b7053SJung-uk Kim# output pointer in r0, $r_ptr, and optional pointer in r2, $b_ptr. 1116e71b7053SJung-uk Kim# 1117e71b7053SJung-uk Kim# See their "normal" counterparts for insights on calculations. 1118e71b7053SJung-uk Kim 1119e71b7053SJung-uk Kimmy ($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7, 1120e71b7053SJung-uk Kim $t0,$t1,$t2,$t3)=map("r$_",(11,3..10,12,14,1)); 1121e71b7053SJung-uk Kimmy $ff=$b_ptr; 1122e71b7053SJung-uk Kim 1123e71b7053SJung-uk Kim$code.=<<___; 1124e71b7053SJung-uk Kim.type __ecp_nistz256_sub_from,%function 1125e71b7053SJung-uk Kim.align 5 1126e71b7053SJung-uk Kim__ecp_nistz256_sub_from: 1127e71b7053SJung-uk Kim str lr,[sp,#-4]! 
@ push lr 1128e71b7053SJung-uk Kim 1129e71b7053SJung-uk Kim ldr $t0,[$b_ptr,#0] 1130e71b7053SJung-uk Kim ldr $t1,[$b_ptr,#4] 1131e71b7053SJung-uk Kim ldr $t2,[$b_ptr,#8] 1132e71b7053SJung-uk Kim ldr $t3,[$b_ptr,#12] 1133e71b7053SJung-uk Kim subs $a0,$a0,$t0 1134e71b7053SJung-uk Kim ldr $t0,[$b_ptr,#16] 1135e71b7053SJung-uk Kim sbcs $a1,$a1,$t1 1136e71b7053SJung-uk Kim ldr $t1,[$b_ptr,#20] 1137e71b7053SJung-uk Kim sbcs $a2,$a2,$t2 1138e71b7053SJung-uk Kim ldr $t2,[$b_ptr,#24] 1139e71b7053SJung-uk Kim sbcs $a3,$a3,$t3 1140e71b7053SJung-uk Kim ldr $t3,[$b_ptr,#28] 1141e71b7053SJung-uk Kim sbcs $a4,$a4,$t0 1142e71b7053SJung-uk Kim sbcs $a5,$a5,$t1 1143e71b7053SJung-uk Kim sbcs $a6,$a6,$t2 1144e71b7053SJung-uk Kim sbcs $a7,$a7,$t3 1145e71b7053SJung-uk Kim sbc $ff,$ff,$ff @ broadcast borrow bit 1146e71b7053SJung-uk Kim ldr lr,[sp],#4 @ pop lr 1147e71b7053SJung-uk Kim 1148e71b7053SJung-uk Kim adds $a0,$a0,$ff @ add synthesized modulus 1149e71b7053SJung-uk Kim adcs $a1,$a1,$ff 1150e71b7053SJung-uk Kim str $a0,[$r_ptr,#0] 1151e71b7053SJung-uk Kim adcs $a2,$a2,$ff 1152e71b7053SJung-uk Kim str $a1,[$r_ptr,#4] 1153e71b7053SJung-uk Kim adcs $a3,$a3,#0 1154e71b7053SJung-uk Kim str $a2,[$r_ptr,#8] 1155e71b7053SJung-uk Kim adcs $a4,$a4,#0 1156e71b7053SJung-uk Kim str $a3,[$r_ptr,#12] 1157e71b7053SJung-uk Kim adcs $a5,$a5,#0 1158e71b7053SJung-uk Kim str $a4,[$r_ptr,#16] 1159e71b7053SJung-uk Kim adcs $a6,$a6,$ff,lsr#31 1160e71b7053SJung-uk Kim str $a5,[$r_ptr,#20] 1161e71b7053SJung-uk Kim adcs $a7,$a7,$ff 1162e71b7053SJung-uk Kim str $a6,[$r_ptr,#24] 1163e71b7053SJung-uk Kim str $a7,[$r_ptr,#28] 1164e71b7053SJung-uk Kim 1165e71b7053SJung-uk Kim mov pc,lr 1166e71b7053SJung-uk Kim.size __ecp_nistz256_sub_from,.-__ecp_nistz256_sub_from 1167e71b7053SJung-uk Kim 1168e71b7053SJung-uk Kim.type __ecp_nistz256_sub_morf,%function 1169e71b7053SJung-uk Kim.align 5 1170e71b7053SJung-uk Kim__ecp_nistz256_sub_morf: 1171e71b7053SJung-uk Kim str lr,[sp,#-4]! 
@ push lr 1172e71b7053SJung-uk Kim 1173e71b7053SJung-uk Kim ldr $t0,[$b_ptr,#0] 1174e71b7053SJung-uk Kim ldr $t1,[$b_ptr,#4] 1175e71b7053SJung-uk Kim ldr $t2,[$b_ptr,#8] 1176e71b7053SJung-uk Kim ldr $t3,[$b_ptr,#12] 1177e71b7053SJung-uk Kim subs $a0,$t0,$a0 1178e71b7053SJung-uk Kim ldr $t0,[$b_ptr,#16] 1179e71b7053SJung-uk Kim sbcs $a1,$t1,$a1 1180e71b7053SJung-uk Kim ldr $t1,[$b_ptr,#20] 1181e71b7053SJung-uk Kim sbcs $a2,$t2,$a2 1182e71b7053SJung-uk Kim ldr $t2,[$b_ptr,#24] 1183e71b7053SJung-uk Kim sbcs $a3,$t3,$a3 1184e71b7053SJung-uk Kim ldr $t3,[$b_ptr,#28] 1185e71b7053SJung-uk Kim sbcs $a4,$t0,$a4 1186e71b7053SJung-uk Kim sbcs $a5,$t1,$a5 1187e71b7053SJung-uk Kim sbcs $a6,$t2,$a6 1188e71b7053SJung-uk Kim sbcs $a7,$t3,$a7 1189e71b7053SJung-uk Kim sbc $ff,$ff,$ff @ broadcast borrow bit 1190e71b7053SJung-uk Kim ldr lr,[sp],#4 @ pop lr 1191e71b7053SJung-uk Kim 1192e71b7053SJung-uk Kim adds $a0,$a0,$ff @ add synthesized modulus 1193e71b7053SJung-uk Kim adcs $a1,$a1,$ff 1194e71b7053SJung-uk Kim str $a0,[$r_ptr,#0] 1195e71b7053SJung-uk Kim adcs $a2,$a2,$ff 1196e71b7053SJung-uk Kim str $a1,[$r_ptr,#4] 1197e71b7053SJung-uk Kim adcs $a3,$a3,#0 1198e71b7053SJung-uk Kim str $a2,[$r_ptr,#8] 1199e71b7053SJung-uk Kim adcs $a4,$a4,#0 1200e71b7053SJung-uk Kim str $a3,[$r_ptr,#12] 1201e71b7053SJung-uk Kim adcs $a5,$a5,#0 1202e71b7053SJung-uk Kim str $a4,[$r_ptr,#16] 1203e71b7053SJung-uk Kim adcs $a6,$a6,$ff,lsr#31 1204e71b7053SJung-uk Kim str $a5,[$r_ptr,#20] 1205e71b7053SJung-uk Kim adcs $a7,$a7,$ff 1206e71b7053SJung-uk Kim str $a6,[$r_ptr,#24] 1207e71b7053SJung-uk Kim str $a7,[$r_ptr,#28] 1208e71b7053SJung-uk Kim 1209e71b7053SJung-uk Kim mov pc,lr 1210e71b7053SJung-uk Kim.size __ecp_nistz256_sub_morf,.-__ecp_nistz256_sub_morf 1211e71b7053SJung-uk Kim 1212e71b7053SJung-uk Kim.type __ecp_nistz256_add_self,%function 1213e71b7053SJung-uk Kim.align 4 1214e71b7053SJung-uk Kim__ecp_nistz256_add_self: 1215e71b7053SJung-uk Kim adds $a0,$a0,$a0 @ a[0:7]+=a[0:7] 1216e71b7053SJung-uk Kim 
adcs $a1,$a1,$a1 1217e71b7053SJung-uk Kim adcs $a2,$a2,$a2 1218e71b7053SJung-uk Kim adcs $a3,$a3,$a3 1219e71b7053SJung-uk Kim adcs $a4,$a4,$a4 1220e71b7053SJung-uk Kim adcs $a5,$a5,$a5 1221e71b7053SJung-uk Kim adcs $a6,$a6,$a6 1222e71b7053SJung-uk Kim mov $ff,#0 1223e71b7053SJung-uk Kim adcs $a7,$a7,$a7 1224e71b7053SJung-uk Kim adc $ff,$ff,#0 1225e71b7053SJung-uk Kim 1226e71b7053SJung-uk Kim @ if a+b >= modulus, subtract modulus. 1227e71b7053SJung-uk Kim @ 1228e71b7053SJung-uk Kim @ But since comparison implies subtraction, we subtract 1229e71b7053SJung-uk Kim @ modulus and then add it back if subtraction borrowed. 1230e71b7053SJung-uk Kim 1231e71b7053SJung-uk Kim subs $a0,$a0,#-1 1232e71b7053SJung-uk Kim sbcs $a1,$a1,#-1 1233e71b7053SJung-uk Kim sbcs $a2,$a2,#-1 1234e71b7053SJung-uk Kim sbcs $a3,$a3,#0 1235e71b7053SJung-uk Kim sbcs $a4,$a4,#0 1236e71b7053SJung-uk Kim sbcs $a5,$a5,#0 1237e71b7053SJung-uk Kim sbcs $a6,$a6,#1 1238e71b7053SJung-uk Kim sbcs $a7,$a7,#-1 1239e71b7053SJung-uk Kim sbc $ff,$ff,#0 1240e71b7053SJung-uk Kim 1241e71b7053SJung-uk Kim @ Note that because mod has special form, i.e. consists of 1242e71b7053SJung-uk Kim @ 0xffffffff, 1 and 0s, we can conditionally synthesize it by 1243e71b7053SJung-uk Kim @ using value of borrow as a whole or extracting single bit. 1244e71b7053SJung-uk Kim @ Follow $ff register... 
1245e71b7053SJung-uk Kim 1246e71b7053SJung-uk Kim adds $a0,$a0,$ff @ add synthesized modulus 1247e71b7053SJung-uk Kim adcs $a1,$a1,$ff 1248e71b7053SJung-uk Kim str $a0,[$r_ptr,#0] 1249e71b7053SJung-uk Kim adcs $a2,$a2,$ff 1250e71b7053SJung-uk Kim str $a1,[$r_ptr,#4] 1251e71b7053SJung-uk Kim adcs $a3,$a3,#0 1252e71b7053SJung-uk Kim str $a2,[$r_ptr,#8] 1253e71b7053SJung-uk Kim adcs $a4,$a4,#0 1254e71b7053SJung-uk Kim str $a3,[$r_ptr,#12] 1255e71b7053SJung-uk Kim adcs $a5,$a5,#0 1256e71b7053SJung-uk Kim str $a4,[$r_ptr,#16] 1257e71b7053SJung-uk Kim adcs $a6,$a6,$ff,lsr#31 1258e71b7053SJung-uk Kim str $a5,[$r_ptr,#20] 1259e71b7053SJung-uk Kim adcs $a7,$a7,$ff 1260e71b7053SJung-uk Kim str $a6,[$r_ptr,#24] 1261e71b7053SJung-uk Kim str $a7,[$r_ptr,#28] 1262e71b7053SJung-uk Kim 1263e71b7053SJung-uk Kim mov pc,lr 1264e71b7053SJung-uk Kim.size __ecp_nistz256_add_self,.-__ecp_nistz256_add_self 1265e71b7053SJung-uk Kim 1266e71b7053SJung-uk Kim___ 1267e71b7053SJung-uk Kim 1268e71b7053SJung-uk Kim######################################################################## 1269e71b7053SJung-uk Kim# following subroutines are "literal" implementation of those found in 1270e71b7053SJung-uk Kim# ecp_nistz256.c 1271e71b7053SJung-uk Kim# 1272e71b7053SJung-uk Kim######################################################################## 1273e71b7053SJung-uk Kim# void ecp_nistz256_point_double(P256_POINT *out,const P256_POINT *inp); 1274e71b7053SJung-uk Kim# 1275e71b7053SJung-uk Kim{ 1276e71b7053SJung-uk Kimmy ($S,$M,$Zsqr,$in_x,$tmp0)=map(32*$_,(0..4)); 1277e71b7053SJung-uk Kim# above map() describes stack layout with 5 temporary 1278e71b7053SJung-uk Kim# 256-bit vectors on top. Then note that we push 1279e71b7053SJung-uk Kim# starting from r0, which means that we have copy of 1280e71b7053SJung-uk Kim# input arguments just below these temporary vectors. 
1281e71b7053SJung-uk Kim 1282e71b7053SJung-uk Kim$code.=<<___; 1283e71b7053SJung-uk Kim.globl ecp_nistz256_point_double 1284e71b7053SJung-uk Kim.type ecp_nistz256_point_double,%function 1285e71b7053SJung-uk Kim.align 5 1286e71b7053SJung-uk Kimecp_nistz256_point_double: 1287e71b7053SJung-uk Kim stmdb sp!,{r0-r12,lr} @ push from r0, unusual, but intentional 1288e71b7053SJung-uk Kim sub sp,sp,#32*5 1289e71b7053SJung-uk Kim 1290e71b7053SJung-uk Kim.Lpoint_double_shortcut: 1291e71b7053SJung-uk Kim add r3,sp,#$in_x 1292e71b7053SJung-uk Kim ldmia $a_ptr!,{r4-r11} @ copy in_x 1293e71b7053SJung-uk Kim stmia r3,{r4-r11} 1294e71b7053SJung-uk Kim 1295e71b7053SJung-uk Kim add $r_ptr,sp,#$S 1296e71b7053SJung-uk Kim bl __ecp_nistz256_mul_by_2 @ p256_mul_by_2(S, in_y); 1297e71b7053SJung-uk Kim 1298e71b7053SJung-uk Kim add $b_ptr,$a_ptr,#32 1299e71b7053SJung-uk Kim add $a_ptr,$a_ptr,#32 1300e71b7053SJung-uk Kim add $r_ptr,sp,#$Zsqr 1301e71b7053SJung-uk Kim bl __ecp_nistz256_mul_mont @ p256_sqr_mont(Zsqr, in_z); 1302e71b7053SJung-uk Kim 1303e71b7053SJung-uk Kim add $a_ptr,sp,#$S 1304e71b7053SJung-uk Kim add $b_ptr,sp,#$S 1305e71b7053SJung-uk Kim add $r_ptr,sp,#$S 1306e71b7053SJung-uk Kim bl __ecp_nistz256_mul_mont @ p256_sqr_mont(S, S); 1307e71b7053SJung-uk Kim 1308e71b7053SJung-uk Kim ldr $b_ptr,[sp,#32*5+4] 1309e71b7053SJung-uk Kim add $a_ptr,$b_ptr,#32 1310e71b7053SJung-uk Kim add $b_ptr,$b_ptr,#64 1311e71b7053SJung-uk Kim add $r_ptr,sp,#$tmp0 1312e71b7053SJung-uk Kim bl __ecp_nistz256_mul_mont @ p256_mul_mont(tmp0, in_z, in_y); 1313e71b7053SJung-uk Kim 1314e71b7053SJung-uk Kim ldr $r_ptr,[sp,#32*5] 1315e71b7053SJung-uk Kim add $r_ptr,$r_ptr,#64 1316e71b7053SJung-uk Kim bl __ecp_nistz256_add_self @ p256_mul_by_2(res_z, tmp0); 1317e71b7053SJung-uk Kim 1318e71b7053SJung-uk Kim add $a_ptr,sp,#$in_x 1319e71b7053SJung-uk Kim add $b_ptr,sp,#$Zsqr 1320e71b7053SJung-uk Kim add $r_ptr,sp,#$M 1321e71b7053SJung-uk Kim bl __ecp_nistz256_add @ p256_add(M, in_x, Zsqr); 1322e71b7053SJung-uk Kim 
1323e71b7053SJung-uk Kim add $a_ptr,sp,#$in_x 1324e71b7053SJung-uk Kim add $b_ptr,sp,#$Zsqr 1325e71b7053SJung-uk Kim add $r_ptr,sp,#$Zsqr 1326e71b7053SJung-uk Kim bl __ecp_nistz256_sub @ p256_sub(Zsqr, in_x, Zsqr); 1327e71b7053SJung-uk Kim 1328e71b7053SJung-uk Kim add $a_ptr,sp,#$S 1329e71b7053SJung-uk Kim add $b_ptr,sp,#$S 1330e71b7053SJung-uk Kim add $r_ptr,sp,#$tmp0 1331e71b7053SJung-uk Kim bl __ecp_nistz256_mul_mont @ p256_sqr_mont(tmp0, S); 1332e71b7053SJung-uk Kim 1333e71b7053SJung-uk Kim add $a_ptr,sp,#$Zsqr 1334e71b7053SJung-uk Kim add $b_ptr,sp,#$M 1335e71b7053SJung-uk Kim add $r_ptr,sp,#$M 1336e71b7053SJung-uk Kim bl __ecp_nistz256_mul_mont @ p256_mul_mont(M, M, Zsqr); 1337e71b7053SJung-uk Kim 1338e71b7053SJung-uk Kim ldr $r_ptr,[sp,#32*5] 1339e71b7053SJung-uk Kim add $a_ptr,sp,#$tmp0 1340e71b7053SJung-uk Kim add $r_ptr,$r_ptr,#32 1341e71b7053SJung-uk Kim bl __ecp_nistz256_div_by_2 @ p256_div_by_2(res_y, tmp0); 1342e71b7053SJung-uk Kim 1343e71b7053SJung-uk Kim add $a_ptr,sp,#$M 1344e71b7053SJung-uk Kim add $r_ptr,sp,#$M 1345e71b7053SJung-uk Kim bl __ecp_nistz256_mul_by_3 @ p256_mul_by_3(M, M); 1346e71b7053SJung-uk Kim 1347e71b7053SJung-uk Kim add $a_ptr,sp,#$in_x 1348e71b7053SJung-uk Kim add $b_ptr,sp,#$S 1349e71b7053SJung-uk Kim add $r_ptr,sp,#$S 1350e71b7053SJung-uk Kim bl __ecp_nistz256_mul_mont @ p256_mul_mont(S, S, in_x); 1351e71b7053SJung-uk Kim 1352e71b7053SJung-uk Kim add $r_ptr,sp,#$tmp0 1353e71b7053SJung-uk Kim bl __ecp_nistz256_add_self @ p256_mul_by_2(tmp0, S); 1354e71b7053SJung-uk Kim 1355e71b7053SJung-uk Kim ldr $r_ptr,[sp,#32*5] 1356e71b7053SJung-uk Kim add $a_ptr,sp,#$M 1357e71b7053SJung-uk Kim add $b_ptr,sp,#$M 1358e71b7053SJung-uk Kim bl __ecp_nistz256_mul_mont @ p256_sqr_mont(res_x, M); 1359e71b7053SJung-uk Kim 1360e71b7053SJung-uk Kim add $b_ptr,sp,#$tmp0 1361e71b7053SJung-uk Kim bl __ecp_nistz256_sub_from @ p256_sub(res_x, res_x, tmp0); 1362e71b7053SJung-uk Kim 1363e71b7053SJung-uk Kim add $b_ptr,sp,#$S 1364e71b7053SJung-uk Kim add 
$r_ptr,sp,#$S 1365e71b7053SJung-uk Kim bl __ecp_nistz256_sub_morf @ p256_sub(S, S, res_x); 1366e71b7053SJung-uk Kim 1367e71b7053SJung-uk Kim add $a_ptr,sp,#$M 1368e71b7053SJung-uk Kim add $b_ptr,sp,#$S 1369e71b7053SJung-uk Kim bl __ecp_nistz256_mul_mont @ p256_mul_mont(S, S, M); 1370e71b7053SJung-uk Kim 1371e71b7053SJung-uk Kim ldr $r_ptr,[sp,#32*5] 1372e71b7053SJung-uk Kim add $b_ptr,$r_ptr,#32 1373e71b7053SJung-uk Kim add $r_ptr,$r_ptr,#32 1374e71b7053SJung-uk Kim bl __ecp_nistz256_sub_from @ p256_sub(res_y, S, res_y); 1375e71b7053SJung-uk Kim 1376e71b7053SJung-uk Kim add sp,sp,#32*5+16 @ +16 means "skip even over saved r0-r3" 1377e71b7053SJung-uk Kim#if __ARM_ARCH__>=5 || !defined(__thumb__) 1378e71b7053SJung-uk Kim ldmia sp!,{r4-r12,pc} 1379e71b7053SJung-uk Kim#else 1380e71b7053SJung-uk Kim ldmia sp!,{r4-r12,lr} 1381e71b7053SJung-uk Kim bx lr @ interoperable with Thumb ISA:-) 1382e71b7053SJung-uk Kim#endif 1383e71b7053SJung-uk Kim.size ecp_nistz256_point_double,.-ecp_nistz256_point_double 1384e71b7053SJung-uk Kim___ 1385e71b7053SJung-uk Kim} 1386e71b7053SJung-uk Kim 1387e71b7053SJung-uk Kim######################################################################## 1388e71b7053SJung-uk Kim# void ecp_nistz256_point_add(P256_POINT *out,const P256_POINT *in1, 1389e71b7053SJung-uk Kim# const P256_POINT *in2); 1390e71b7053SJung-uk Kim{ 1391e71b7053SJung-uk Kimmy ($res_x,$res_y,$res_z, 1392e71b7053SJung-uk Kim $in1_x,$in1_y,$in1_z, 1393e71b7053SJung-uk Kim $in2_x,$in2_y,$in2_z, 1394e71b7053SJung-uk Kim $H,$Hsqr,$R,$Rsqr,$Hcub, 1395e71b7053SJung-uk Kim $U1,$U2,$S1,$S2)=map(32*$_,(0..17)); 1396e71b7053SJung-uk Kimmy ($Z1sqr, $Z2sqr) = ($Hsqr, $Rsqr); 1397e71b7053SJung-uk Kim# above map() describes stack layout with 18 temporary 1398e71b7053SJung-uk Kim# 256-bit vectors on top. Then note that we push 1399e71b7053SJung-uk Kim# starting from r0, which means that we have copy of 1400e71b7053SJung-uk Kim# input arguments just below these temporary vectors. 
140117f01e99SJung-uk Kim# We use three of them for ~in1infty, ~in2infty and 1402e71b7053SJung-uk Kim# result of check for zero. 1403e71b7053SJung-uk Kim 1404e71b7053SJung-uk Kim$code.=<<___; 1405e71b7053SJung-uk Kim.globl ecp_nistz256_point_add 1406e71b7053SJung-uk Kim.type ecp_nistz256_point_add,%function 1407e71b7053SJung-uk Kim.align 5 1408e71b7053SJung-uk Kimecp_nistz256_point_add: 1409e71b7053SJung-uk Kim stmdb sp!,{r0-r12,lr} @ push from r0, unusual, but intentional 1410e71b7053SJung-uk Kim sub sp,sp,#32*18+16 1411e71b7053SJung-uk Kim 1412e71b7053SJung-uk Kim ldmia $b_ptr!,{r4-r11} @ copy in2_x 1413e71b7053SJung-uk Kim add r3,sp,#$in2_x 1414e71b7053SJung-uk Kim stmia r3!,{r4-r11} 1415e71b7053SJung-uk Kim ldmia $b_ptr!,{r4-r11} @ copy in2_y 1416e71b7053SJung-uk Kim stmia r3!,{r4-r11} 1417e71b7053SJung-uk Kim ldmia $b_ptr,{r4-r11} @ copy in2_z 1418e71b7053SJung-uk Kim orr r12,r4,r5 1419e71b7053SJung-uk Kim orr r12,r12,r6 1420e71b7053SJung-uk Kim orr r12,r12,r7 1421e71b7053SJung-uk Kim orr r12,r12,r8 1422e71b7053SJung-uk Kim orr r12,r12,r9 1423e71b7053SJung-uk Kim orr r12,r12,r10 1424e71b7053SJung-uk Kim orr r12,r12,r11 1425e71b7053SJung-uk Kim cmp r12,#0 1426e71b7053SJung-uk Kim#ifdef __thumb2__ 1427e71b7053SJung-uk Kim it ne 1428e71b7053SJung-uk Kim#endif 1429e71b7053SJung-uk Kim movne r12,#-1 1430e71b7053SJung-uk Kim stmia r3,{r4-r11} 143117f01e99SJung-uk Kim str r12,[sp,#32*18+8] @ ~in2infty 1432e71b7053SJung-uk Kim 1433e71b7053SJung-uk Kim ldmia $a_ptr!,{r4-r11} @ copy in1_x 1434e71b7053SJung-uk Kim add r3,sp,#$in1_x 1435e71b7053SJung-uk Kim stmia r3!,{r4-r11} 1436e71b7053SJung-uk Kim ldmia $a_ptr!,{r4-r11} @ copy in1_y 1437e71b7053SJung-uk Kim stmia r3!,{r4-r11} 1438e71b7053SJung-uk Kim ldmia $a_ptr,{r4-r11} @ copy in1_z 1439e71b7053SJung-uk Kim orr r12,r4,r5 1440e71b7053SJung-uk Kim orr r12,r12,r6 1441e71b7053SJung-uk Kim orr r12,r12,r7 1442e71b7053SJung-uk Kim orr r12,r12,r8 1443e71b7053SJung-uk Kim orr r12,r12,r9 1444e71b7053SJung-uk Kim orr r12,r12,r10 
1445e71b7053SJung-uk Kim orr r12,r12,r11 1446e71b7053SJung-uk Kim cmp r12,#0 1447e71b7053SJung-uk Kim#ifdef __thumb2__ 1448e71b7053SJung-uk Kim it ne 1449e71b7053SJung-uk Kim#endif 1450e71b7053SJung-uk Kim movne r12,#-1 1451e71b7053SJung-uk Kim stmia r3,{r4-r11} 145217f01e99SJung-uk Kim str r12,[sp,#32*18+4] @ ~in1infty 1453e71b7053SJung-uk Kim 1454e71b7053SJung-uk Kim add $a_ptr,sp,#$in2_z 1455e71b7053SJung-uk Kim add $b_ptr,sp,#$in2_z 1456e71b7053SJung-uk Kim add $r_ptr,sp,#$Z2sqr 1457e71b7053SJung-uk Kim bl __ecp_nistz256_mul_mont @ p256_sqr_mont(Z2sqr, in2_z); 1458e71b7053SJung-uk Kim 1459e71b7053SJung-uk Kim add $a_ptr,sp,#$in1_z 1460e71b7053SJung-uk Kim add $b_ptr,sp,#$in1_z 1461e71b7053SJung-uk Kim add $r_ptr,sp,#$Z1sqr 1462e71b7053SJung-uk Kim bl __ecp_nistz256_mul_mont @ p256_sqr_mont(Z1sqr, in1_z); 1463e71b7053SJung-uk Kim 1464e71b7053SJung-uk Kim add $a_ptr,sp,#$in2_z 1465e71b7053SJung-uk Kim add $b_ptr,sp,#$Z2sqr 1466e71b7053SJung-uk Kim add $r_ptr,sp,#$S1 1467e71b7053SJung-uk Kim bl __ecp_nistz256_mul_mont @ p256_mul_mont(S1, Z2sqr, in2_z); 1468e71b7053SJung-uk Kim 1469e71b7053SJung-uk Kim add $a_ptr,sp,#$in1_z 1470e71b7053SJung-uk Kim add $b_ptr,sp,#$Z1sqr 1471e71b7053SJung-uk Kim add $r_ptr,sp,#$S2 1472e71b7053SJung-uk Kim bl __ecp_nistz256_mul_mont @ p256_mul_mont(S2, Z1sqr, in1_z); 1473e71b7053SJung-uk Kim 1474e71b7053SJung-uk Kim add $a_ptr,sp,#$in1_y 1475e71b7053SJung-uk Kim add $b_ptr,sp,#$S1 1476e71b7053SJung-uk Kim add $r_ptr,sp,#$S1 1477e71b7053SJung-uk Kim bl __ecp_nistz256_mul_mont @ p256_mul_mont(S1, S1, in1_y); 1478e71b7053SJung-uk Kim 1479e71b7053SJung-uk Kim add $a_ptr,sp,#$in2_y 1480e71b7053SJung-uk Kim add $b_ptr,sp,#$S2 1481e71b7053SJung-uk Kim add $r_ptr,sp,#$S2 1482e71b7053SJung-uk Kim bl __ecp_nistz256_mul_mont @ p256_mul_mont(S2, S2, in2_y); 1483e71b7053SJung-uk Kim 1484e71b7053SJung-uk Kim add $b_ptr,sp,#$S1 1485e71b7053SJung-uk Kim add $r_ptr,sp,#$R 1486e71b7053SJung-uk Kim bl __ecp_nistz256_sub_from @ p256_sub(R, S2, S1); 
1487e71b7053SJung-uk Kim 1488e71b7053SJung-uk Kim orr $a0,$a0,$a1 @ see if result is zero 1489e71b7053SJung-uk Kim orr $a2,$a2,$a3 1490e71b7053SJung-uk Kim orr $a4,$a4,$a5 1491e71b7053SJung-uk Kim orr $a0,$a0,$a2 1492e71b7053SJung-uk Kim orr $a4,$a4,$a6 1493e71b7053SJung-uk Kim orr $a0,$a0,$a7 1494e71b7053SJung-uk Kim add $a_ptr,sp,#$in1_x 1495e71b7053SJung-uk Kim orr $a0,$a0,$a4 1496e71b7053SJung-uk Kim add $b_ptr,sp,#$Z2sqr 1497e71b7053SJung-uk Kim str $a0,[sp,#32*18+12] 1498e71b7053SJung-uk Kim 1499e71b7053SJung-uk Kim add $r_ptr,sp,#$U1 1500e71b7053SJung-uk Kim bl __ecp_nistz256_mul_mont @ p256_mul_mont(U1, in1_x, Z2sqr); 1501e71b7053SJung-uk Kim 1502e71b7053SJung-uk Kim add $a_ptr,sp,#$in2_x 1503e71b7053SJung-uk Kim add $b_ptr,sp,#$Z1sqr 1504e71b7053SJung-uk Kim add $r_ptr,sp,#$U2 1505e71b7053SJung-uk Kim bl __ecp_nistz256_mul_mont @ p256_mul_mont(U2, in2_x, Z1sqr); 1506e71b7053SJung-uk Kim 1507e71b7053SJung-uk Kim add $b_ptr,sp,#$U1 1508e71b7053SJung-uk Kim add $r_ptr,sp,#$H 1509e71b7053SJung-uk Kim bl __ecp_nistz256_sub_from @ p256_sub(H, U2, U1); 1510e71b7053SJung-uk Kim 1511e71b7053SJung-uk Kim orr $a0,$a0,$a1 @ see if result is zero 1512e71b7053SJung-uk Kim orr $a2,$a2,$a3 1513e71b7053SJung-uk Kim orr $a4,$a4,$a5 1514e71b7053SJung-uk Kim orr $a0,$a0,$a2 1515e71b7053SJung-uk Kim orr $a4,$a4,$a6 1516e71b7053SJung-uk Kim orr $a0,$a0,$a7 151717f01e99SJung-uk Kim orr $a0,$a0,$a4 @ ~is_equal(U1,U2) 1518e71b7053SJung-uk Kim 151917f01e99SJung-uk Kim ldr $t0,[sp,#32*18+4] @ ~in1infty 152017f01e99SJung-uk Kim ldr $t1,[sp,#32*18+8] @ ~in2infty 152117f01e99SJung-uk Kim ldr $t2,[sp,#32*18+12] @ ~is_equal(S1,S2) 152217f01e99SJung-uk Kim mvn $t0,$t0 @ -1/0 -> 0/-1 152317f01e99SJung-uk Kim mvn $t1,$t1 @ -1/0 -> 0/-1 152458f35182SJung-uk Kim orr $a0,$a0,$t0 152558f35182SJung-uk Kim orr $a0,$a0,$t1 152658f35182SJung-uk Kim orrs $a0,$a0,$t2 @ set flags 1527e71b7053SJung-uk Kim 152817f01e99SJung-uk Kim @ if(~is_equal(U1,U2) | in1infty | in2infty | ~is_equal(S1,S2)) 
152917f01e99SJung-uk Kim bne .Ladd_proceed 1530e71b7053SJung-uk Kim 1531e71b7053SJung-uk Kim.Ladd_double: 1532e71b7053SJung-uk Kim ldr $a_ptr,[sp,#32*18+20] 1533e71b7053SJung-uk Kim add sp,sp,#32*(18-5)+16 @ difference in frame sizes 1534e71b7053SJung-uk Kim b .Lpoint_double_shortcut 1535e71b7053SJung-uk Kim 1536e71b7053SJung-uk Kim.align 4 1537e71b7053SJung-uk Kim.Ladd_proceed: 1538e71b7053SJung-uk Kim add $a_ptr,sp,#$R 1539e71b7053SJung-uk Kim add $b_ptr,sp,#$R 1540e71b7053SJung-uk Kim add $r_ptr,sp,#$Rsqr 1541e71b7053SJung-uk Kim bl __ecp_nistz256_mul_mont @ p256_sqr_mont(Rsqr, R); 1542e71b7053SJung-uk Kim 1543e71b7053SJung-uk Kim add $a_ptr,sp,#$H 1544e71b7053SJung-uk Kim add $b_ptr,sp,#$in1_z 1545e71b7053SJung-uk Kim add $r_ptr,sp,#$res_z 1546e71b7053SJung-uk Kim bl __ecp_nistz256_mul_mont @ p256_mul_mont(res_z, H, in1_z); 1547e71b7053SJung-uk Kim 1548e71b7053SJung-uk Kim add $a_ptr,sp,#$H 1549e71b7053SJung-uk Kim add $b_ptr,sp,#$H 1550e71b7053SJung-uk Kim add $r_ptr,sp,#$Hsqr 1551e71b7053SJung-uk Kim bl __ecp_nistz256_mul_mont @ p256_sqr_mont(Hsqr, H); 1552e71b7053SJung-uk Kim 1553e71b7053SJung-uk Kim add $a_ptr,sp,#$in2_z 1554e71b7053SJung-uk Kim add $b_ptr,sp,#$res_z 1555e71b7053SJung-uk Kim add $r_ptr,sp,#$res_z 1556e71b7053SJung-uk Kim bl __ecp_nistz256_mul_mont @ p256_mul_mont(res_z, res_z, in2_z); 1557e71b7053SJung-uk Kim 1558e71b7053SJung-uk Kim add $a_ptr,sp,#$H 1559e71b7053SJung-uk Kim add $b_ptr,sp,#$Hsqr 1560e71b7053SJung-uk Kim add $r_ptr,sp,#$Hcub 1561e71b7053SJung-uk Kim bl __ecp_nistz256_mul_mont @ p256_mul_mont(Hcub, Hsqr, H); 1562e71b7053SJung-uk Kim 1563e71b7053SJung-uk Kim add $a_ptr,sp,#$Hsqr 1564e71b7053SJung-uk Kim add $b_ptr,sp,#$U1 1565e71b7053SJung-uk Kim add $r_ptr,sp,#$U2 1566e71b7053SJung-uk Kim bl __ecp_nistz256_mul_mont @ p256_mul_mont(U2, U1, Hsqr); 1567e71b7053SJung-uk Kim 1568e71b7053SJung-uk Kim add $r_ptr,sp,#$Hsqr 1569e71b7053SJung-uk Kim bl __ecp_nistz256_add_self @ p256_mul_by_2(Hsqr, U2); 1570e71b7053SJung-uk Kim 
1571e71b7053SJung-uk Kim add $b_ptr,sp,#$Rsqr 1572e71b7053SJung-uk Kim add $r_ptr,sp,#$res_x 1573e71b7053SJung-uk Kim bl __ecp_nistz256_sub_morf @ p256_sub(res_x, Rsqr, Hsqr); 1574e71b7053SJung-uk Kim 1575e71b7053SJung-uk Kim add $b_ptr,sp,#$Hcub 1576e71b7053SJung-uk Kim bl __ecp_nistz256_sub_from @ p256_sub(res_x, res_x, Hcub); 1577e71b7053SJung-uk Kim 1578e71b7053SJung-uk Kim add $b_ptr,sp,#$U2 1579e71b7053SJung-uk Kim add $r_ptr,sp,#$res_y 1580e71b7053SJung-uk Kim bl __ecp_nistz256_sub_morf @ p256_sub(res_y, U2, res_x); 1581e71b7053SJung-uk Kim 1582e71b7053SJung-uk Kim add $a_ptr,sp,#$Hcub 1583e71b7053SJung-uk Kim add $b_ptr,sp,#$S1 1584e71b7053SJung-uk Kim add $r_ptr,sp,#$S2 1585e71b7053SJung-uk Kim bl __ecp_nistz256_mul_mont @ p256_mul_mont(S2, S1, Hcub); 1586e71b7053SJung-uk Kim 1587e71b7053SJung-uk Kim add $a_ptr,sp,#$R 1588e71b7053SJung-uk Kim add $b_ptr,sp,#$res_y 1589e71b7053SJung-uk Kim add $r_ptr,sp,#$res_y 1590e71b7053SJung-uk Kim bl __ecp_nistz256_mul_mont @ p256_mul_mont(res_y, res_y, R); 1591e71b7053SJung-uk Kim 1592e71b7053SJung-uk Kim add $b_ptr,sp,#$S2 1593e71b7053SJung-uk Kim bl __ecp_nistz256_sub_from @ p256_sub(res_y, res_y, S2); 1594e71b7053SJung-uk Kim 159517f01e99SJung-uk Kim ldr r11,[sp,#32*18+4] @ ~in1infty 159617f01e99SJung-uk Kim ldr r12,[sp,#32*18+8] @ ~in2infty 1597e71b7053SJung-uk Kim add r1,sp,#$res_x 1598e71b7053SJung-uk Kim add r2,sp,#$in2_x 159917f01e99SJung-uk Kim and r10,r11,r12 @ ~in1infty & ~in2infty 1600e71b7053SJung-uk Kim mvn r11,r11 1601e71b7053SJung-uk Kim add r3,sp,#$in1_x 160217f01e99SJung-uk Kim and r11,r11,r12 @ in1infty & ~in2infty 160317f01e99SJung-uk Kim mvn r12,r12 @ in2infty 1604e71b7053SJung-uk Kim ldr $r_ptr,[sp,#32*18+16] 1605e71b7053SJung-uk Kim___ 1606e71b7053SJung-uk Kimfor($i=0;$i<96;$i+=8) { # conditional moves 1607e71b7053SJung-uk Kim$code.=<<___; 1608e71b7053SJung-uk Kim ldmia r1!,{r4-r5} @ res_x 1609e71b7053SJung-uk Kim ldmia r2!,{r6-r7} @ in2_x 1610e71b7053SJung-uk Kim ldmia r3!,{r8-r9} @ in1_x 
161117f01e99SJung-uk Kim and r4,r4,r10 @ ~in1infty & ~in2infty 1612e71b7053SJung-uk Kim and r5,r5,r10 161317f01e99SJung-uk Kim and r6,r6,r11 @ in1infty & ~in2infty 1614e71b7053SJung-uk Kim and r7,r7,r11 161517f01e99SJung-uk Kim and r8,r8,r12 @ in2infty 1616e71b7053SJung-uk Kim and r9,r9,r12 1617e71b7053SJung-uk Kim orr r4,r4,r6 1618e71b7053SJung-uk Kim orr r5,r5,r7 1619e71b7053SJung-uk Kim orr r4,r4,r8 1620e71b7053SJung-uk Kim orr r5,r5,r9 1621e71b7053SJung-uk Kim stmia $r_ptr!,{r4-r5} 1622e71b7053SJung-uk Kim___ 1623e71b7053SJung-uk Kim} 1624e71b7053SJung-uk Kim$code.=<<___; 1625e71b7053SJung-uk Kim.Ladd_done: 1626e71b7053SJung-uk Kim add sp,sp,#32*18+16+16 @ +16 means "skip even over saved r0-r3" 1627e71b7053SJung-uk Kim#if __ARM_ARCH__>=5 || !defined(__thumb__) 1628e71b7053SJung-uk Kim ldmia sp!,{r4-r12,pc} 1629e71b7053SJung-uk Kim#else 1630e71b7053SJung-uk Kim ldmia sp!,{r4-r12,lr} 1631e71b7053SJung-uk Kim bx lr @ interoperable with Thumb ISA:-) 1632e71b7053SJung-uk Kim#endif 1633e71b7053SJung-uk Kim.size ecp_nistz256_point_add,.-ecp_nistz256_point_add 1634e71b7053SJung-uk Kim___ 1635e71b7053SJung-uk Kim} 1636e71b7053SJung-uk Kim 1637e71b7053SJung-uk Kim######################################################################## 1638e71b7053SJung-uk Kim# void ecp_nistz256_point_add_affine(P256_POINT *out,const P256_POINT *in1, 1639e71b7053SJung-uk Kim# const P256_POINT_AFFINE *in2); 1640e71b7053SJung-uk Kim{ 1641e71b7053SJung-uk Kimmy ($res_x,$res_y,$res_z, 1642e71b7053SJung-uk Kim $in1_x,$in1_y,$in1_z, 1643e71b7053SJung-uk Kim $in2_x,$in2_y, 1644e71b7053SJung-uk Kim $U2,$S2,$H,$R,$Hsqr,$Hcub,$Rsqr)=map(32*$_,(0..14)); 1645e71b7053SJung-uk Kimmy $Z1sqr = $S2; 1646e71b7053SJung-uk Kim# above map() describes stack layout with 18 temporary 1647e71b7053SJung-uk Kim# 256-bit vectors on top. Then note that we push 1648e71b7053SJung-uk Kim# starting from r0, which means that we have copy of 1649e71b7053SJung-uk Kim# input arguments just below these temporary vectors. 
165017f01e99SJung-uk Kim# We use two of them for ~in1infty, ~in2infty. 1651e71b7053SJung-uk Kim 1652e71b7053SJung-uk Kimmy @ONE_mont=(1,0,0,-1,-1,-1,-2,0); 1653e71b7053SJung-uk Kim 1654e71b7053SJung-uk Kim$code.=<<___; 1655e71b7053SJung-uk Kim.globl ecp_nistz256_point_add_affine 1656e71b7053SJung-uk Kim.type ecp_nistz256_point_add_affine,%function 1657e71b7053SJung-uk Kim.align 5 1658e71b7053SJung-uk Kimecp_nistz256_point_add_affine: 1659e71b7053SJung-uk Kim stmdb sp!,{r0-r12,lr} @ push from r0, unusual, but intentional 1660e71b7053SJung-uk Kim sub sp,sp,#32*15 1661e71b7053SJung-uk Kim 1662e71b7053SJung-uk Kim ldmia $a_ptr!,{r4-r11} @ copy in1_x 1663e71b7053SJung-uk Kim add r3,sp,#$in1_x 1664e71b7053SJung-uk Kim stmia r3!,{r4-r11} 1665e71b7053SJung-uk Kim ldmia $a_ptr!,{r4-r11} @ copy in1_y 1666e71b7053SJung-uk Kim stmia r3!,{r4-r11} 1667e71b7053SJung-uk Kim ldmia $a_ptr,{r4-r11} @ copy in1_z 1668e71b7053SJung-uk Kim orr r12,r4,r5 1669e71b7053SJung-uk Kim orr r12,r12,r6 1670e71b7053SJung-uk Kim orr r12,r12,r7 1671e71b7053SJung-uk Kim orr r12,r12,r8 1672e71b7053SJung-uk Kim orr r12,r12,r9 1673e71b7053SJung-uk Kim orr r12,r12,r10 1674e71b7053SJung-uk Kim orr r12,r12,r11 1675e71b7053SJung-uk Kim cmp r12,#0 1676e71b7053SJung-uk Kim#ifdef __thumb2__ 1677e71b7053SJung-uk Kim it ne 1678e71b7053SJung-uk Kim#endif 1679e71b7053SJung-uk Kim movne r12,#-1 1680e71b7053SJung-uk Kim stmia r3,{r4-r11} 168117f01e99SJung-uk Kim str r12,[sp,#32*15+4] @ ~in1infty 1682e71b7053SJung-uk Kim 1683e71b7053SJung-uk Kim ldmia $b_ptr!,{r4-r11} @ copy in2_x 1684e71b7053SJung-uk Kim add r3,sp,#$in2_x 1685e71b7053SJung-uk Kim orr r12,r4,r5 1686e71b7053SJung-uk Kim orr r12,r12,r6 1687e71b7053SJung-uk Kim orr r12,r12,r7 1688e71b7053SJung-uk Kim orr r12,r12,r8 1689e71b7053SJung-uk Kim orr r12,r12,r9 1690e71b7053SJung-uk Kim orr r12,r12,r10 1691e71b7053SJung-uk Kim orr r12,r12,r11 1692e71b7053SJung-uk Kim stmia r3!,{r4-r11} 1693e71b7053SJung-uk Kim ldmia $b_ptr!,{r4-r11} @ copy in2_y 
1694e71b7053SJung-uk Kim orr r12,r12,r4 1695e71b7053SJung-uk Kim orr r12,r12,r5 1696e71b7053SJung-uk Kim orr r12,r12,r6 1697e71b7053SJung-uk Kim orr r12,r12,r7 1698e71b7053SJung-uk Kim orr r12,r12,r8 1699e71b7053SJung-uk Kim orr r12,r12,r9 1700e71b7053SJung-uk Kim orr r12,r12,r10 1701e71b7053SJung-uk Kim orr r12,r12,r11 1702e71b7053SJung-uk Kim stmia r3!,{r4-r11} 1703e71b7053SJung-uk Kim cmp r12,#0 1704e71b7053SJung-uk Kim#ifdef __thumb2__ 1705e71b7053SJung-uk Kim it ne 1706e71b7053SJung-uk Kim#endif 1707e71b7053SJung-uk Kim movne r12,#-1 170817f01e99SJung-uk Kim str r12,[sp,#32*15+8] @ ~in2infty 1709e71b7053SJung-uk Kim 1710e71b7053SJung-uk Kim add $a_ptr,sp,#$in1_z 1711e71b7053SJung-uk Kim add $b_ptr,sp,#$in1_z 1712e71b7053SJung-uk Kim add $r_ptr,sp,#$Z1sqr 1713e71b7053SJung-uk Kim bl __ecp_nistz256_mul_mont @ p256_sqr_mont(Z1sqr, in1_z); 1714e71b7053SJung-uk Kim 1715e71b7053SJung-uk Kim add $a_ptr,sp,#$Z1sqr 1716e71b7053SJung-uk Kim add $b_ptr,sp,#$in2_x 1717e71b7053SJung-uk Kim add $r_ptr,sp,#$U2 1718e71b7053SJung-uk Kim bl __ecp_nistz256_mul_mont @ p256_mul_mont(U2, Z1sqr, in2_x); 1719e71b7053SJung-uk Kim 1720e71b7053SJung-uk Kim add $b_ptr,sp,#$in1_x 1721e71b7053SJung-uk Kim add $r_ptr,sp,#$H 1722e71b7053SJung-uk Kim bl __ecp_nistz256_sub_from @ p256_sub(H, U2, in1_x); 1723e71b7053SJung-uk Kim 1724e71b7053SJung-uk Kim add $a_ptr,sp,#$Z1sqr 1725e71b7053SJung-uk Kim add $b_ptr,sp,#$in1_z 1726e71b7053SJung-uk Kim add $r_ptr,sp,#$S2 1727e71b7053SJung-uk Kim bl __ecp_nistz256_mul_mont @ p256_mul_mont(S2, Z1sqr, in1_z); 1728e71b7053SJung-uk Kim 1729e71b7053SJung-uk Kim add $a_ptr,sp,#$H 1730e71b7053SJung-uk Kim add $b_ptr,sp,#$in1_z 1731e71b7053SJung-uk Kim add $r_ptr,sp,#$res_z 1732e71b7053SJung-uk Kim bl __ecp_nistz256_mul_mont @ p256_mul_mont(res_z, H, in1_z); 1733e71b7053SJung-uk Kim 1734e71b7053SJung-uk Kim add $a_ptr,sp,#$in2_y 1735e71b7053SJung-uk Kim add $b_ptr,sp,#$S2 1736e71b7053SJung-uk Kim add $r_ptr,sp,#$S2 1737e71b7053SJung-uk Kim bl 
__ecp_nistz256_mul_mont @ p256_mul_mont(S2, S2, in2_y); 1738e71b7053SJung-uk Kim 1739e71b7053SJung-uk Kim add $b_ptr,sp,#$in1_y 1740e71b7053SJung-uk Kim add $r_ptr,sp,#$R 1741e71b7053SJung-uk Kim bl __ecp_nistz256_sub_from @ p256_sub(R, S2, in1_y); 1742e71b7053SJung-uk Kim 1743e71b7053SJung-uk Kim add $a_ptr,sp,#$H 1744e71b7053SJung-uk Kim add $b_ptr,sp,#$H 1745e71b7053SJung-uk Kim add $r_ptr,sp,#$Hsqr 1746e71b7053SJung-uk Kim bl __ecp_nistz256_mul_mont @ p256_sqr_mont(Hsqr, H); 1747e71b7053SJung-uk Kim 1748e71b7053SJung-uk Kim add $a_ptr,sp,#$R 1749e71b7053SJung-uk Kim add $b_ptr,sp,#$R 1750e71b7053SJung-uk Kim add $r_ptr,sp,#$Rsqr 1751e71b7053SJung-uk Kim bl __ecp_nistz256_mul_mont @ p256_sqr_mont(Rsqr, R); 1752e71b7053SJung-uk Kim 1753e71b7053SJung-uk Kim add $a_ptr,sp,#$H 1754e71b7053SJung-uk Kim add $b_ptr,sp,#$Hsqr 1755e71b7053SJung-uk Kim add $r_ptr,sp,#$Hcub 1756e71b7053SJung-uk Kim bl __ecp_nistz256_mul_mont @ p256_mul_mont(Hcub, Hsqr, H); 1757e71b7053SJung-uk Kim 1758e71b7053SJung-uk Kim add $a_ptr,sp,#$Hsqr 1759e71b7053SJung-uk Kim add $b_ptr,sp,#$in1_x 1760e71b7053SJung-uk Kim add $r_ptr,sp,#$U2 1761e71b7053SJung-uk Kim bl __ecp_nistz256_mul_mont @ p256_mul_mont(U2, in1_x, Hsqr); 1762e71b7053SJung-uk Kim 1763e71b7053SJung-uk Kim add $r_ptr,sp,#$Hsqr 1764e71b7053SJung-uk Kim bl __ecp_nistz256_add_self @ p256_mul_by_2(Hsqr, U2); 1765e71b7053SJung-uk Kim 1766e71b7053SJung-uk Kim add $b_ptr,sp,#$Rsqr 1767e71b7053SJung-uk Kim add $r_ptr,sp,#$res_x 1768e71b7053SJung-uk Kim bl __ecp_nistz256_sub_morf @ p256_sub(res_x, Rsqr, Hsqr); 1769e71b7053SJung-uk Kim 1770e71b7053SJung-uk Kim add $b_ptr,sp,#$Hcub 1771e71b7053SJung-uk Kim bl __ecp_nistz256_sub_from @ p256_sub(res_x, res_x, Hcub); 1772e71b7053SJung-uk Kim 1773e71b7053SJung-uk Kim add $b_ptr,sp,#$U2 1774e71b7053SJung-uk Kim add $r_ptr,sp,#$res_y 1775e71b7053SJung-uk Kim bl __ecp_nistz256_sub_morf @ p256_sub(res_y, U2, res_x); 1776e71b7053SJung-uk Kim 1777e71b7053SJung-uk Kim add $a_ptr,sp,#$Hcub 
1778e71b7053SJung-uk Kim add $b_ptr,sp,#$in1_y 1779e71b7053SJung-uk Kim add $r_ptr,sp,#$S2 1780e71b7053SJung-uk Kim bl __ecp_nistz256_mul_mont @ p256_mul_mont(S2, in1_y, Hcub); 1781e71b7053SJung-uk Kim 1782e71b7053SJung-uk Kim add $a_ptr,sp,#$R 1783e71b7053SJung-uk Kim add $b_ptr,sp,#$res_y 1784e71b7053SJung-uk Kim add $r_ptr,sp,#$res_y 1785e71b7053SJung-uk Kim bl __ecp_nistz256_mul_mont @ p256_mul_mont(res_y, res_y, R); 1786e71b7053SJung-uk Kim 1787e71b7053SJung-uk Kim add $b_ptr,sp,#$S2 1788e71b7053SJung-uk Kim bl __ecp_nistz256_sub_from @ p256_sub(res_y, res_y, S2); 1789e71b7053SJung-uk Kim 179017f01e99SJung-uk Kim ldr r11,[sp,#32*15+4] @ ~in1infty 179117f01e99SJung-uk Kim ldr r12,[sp,#32*15+8] @ ~in2infty 1792e71b7053SJung-uk Kim add r1,sp,#$res_x 1793e71b7053SJung-uk Kim add r2,sp,#$in2_x 179417f01e99SJung-uk Kim and r10,r11,r12 @ ~in1infty & ~in2infty 1795e71b7053SJung-uk Kim mvn r11,r11 1796e71b7053SJung-uk Kim add r3,sp,#$in1_x 179717f01e99SJung-uk Kim and r11,r11,r12 @ in1infty & ~in2infty 179817f01e99SJung-uk Kim mvn r12,r12 @ in2infty 1799e71b7053SJung-uk Kim ldr $r_ptr,[sp,#32*15] 1800e71b7053SJung-uk Kim___ 1801e71b7053SJung-uk Kimfor($i=0;$i<64;$i+=8) { # conditional moves 1802e71b7053SJung-uk Kim$code.=<<___; 1803e71b7053SJung-uk Kim ldmia r1!,{r4-r5} @ res_x 1804e71b7053SJung-uk Kim ldmia r2!,{r6-r7} @ in2_x 1805e71b7053SJung-uk Kim ldmia r3!,{r8-r9} @ in1_x 180617f01e99SJung-uk Kim and r4,r4,r10 @ ~in1infty & ~in2infty 1807e71b7053SJung-uk Kim and r5,r5,r10 180817f01e99SJung-uk Kim and r6,r6,r11 @ in1infty & ~in2infty 1809e71b7053SJung-uk Kim and r7,r7,r11 181017f01e99SJung-uk Kim and r8,r8,r12 @ in2infty 1811e71b7053SJung-uk Kim and r9,r9,r12 1812e71b7053SJung-uk Kim orr r4,r4,r6 1813e71b7053SJung-uk Kim orr r5,r5,r7 1814e71b7053SJung-uk Kim orr r4,r4,r8 1815e71b7053SJung-uk Kim orr r5,r5,r9 1816e71b7053SJung-uk Kim stmia $r_ptr!,{r4-r5} 1817e71b7053SJung-uk Kim___ 1818e71b7053SJung-uk Kim} 1819e71b7053SJung-uk Kimfor(;$i<96;$i+=8) { 
1820e71b7053SJung-uk Kimmy $j=($i-64)/4; 1821e71b7053SJung-uk Kim$code.=<<___; 1822e71b7053SJung-uk Kim ldmia r1!,{r4-r5} @ res_z 1823e71b7053SJung-uk Kim ldmia r3!,{r8-r9} @ in1_z 1824e71b7053SJung-uk Kim and r4,r4,r10 1825e71b7053SJung-uk Kim and r5,r5,r10 1826e71b7053SJung-uk Kim and r6,r11,#@ONE_mont[$j] 1827e71b7053SJung-uk Kim and r7,r11,#@ONE_mont[$j+1] 1828e71b7053SJung-uk Kim and r8,r8,r12 1829e71b7053SJung-uk Kim and r9,r9,r12 1830e71b7053SJung-uk Kim orr r4,r4,r6 1831e71b7053SJung-uk Kim orr r5,r5,r7 1832e71b7053SJung-uk Kim orr r4,r4,r8 1833e71b7053SJung-uk Kim orr r5,r5,r9 1834e71b7053SJung-uk Kim stmia $r_ptr!,{r4-r5} 1835e71b7053SJung-uk Kim___ 1836e71b7053SJung-uk Kim} 1837e71b7053SJung-uk Kim$code.=<<___; 1838e71b7053SJung-uk Kim add sp,sp,#32*15+16 @ +16 means "skip even over saved r0-r3" 1839e71b7053SJung-uk Kim#if __ARM_ARCH__>=5 || !defined(__thumb__) 1840e71b7053SJung-uk Kim ldmia sp!,{r4-r12,pc} 1841e71b7053SJung-uk Kim#else 1842e71b7053SJung-uk Kim ldmia sp!,{r4-r12,lr} 1843e71b7053SJung-uk Kim bx lr @ interoperable with Thumb ISA:-) 1844e71b7053SJung-uk Kim#endif 1845e71b7053SJung-uk Kim.size ecp_nistz256_point_add_affine,.-ecp_nistz256_point_add_affine 1846e71b7053SJung-uk Kim___ 1847e71b7053SJung-uk Kim} }}} 1848e71b7053SJung-uk Kim 1849e71b7053SJung-uk Kimforeach (split("\n",$code)) { 1850e71b7053SJung-uk Kim s/\`([^\`]*)\`/eval $1/geo; 1851e71b7053SJung-uk Kim 1852e71b7053SJung-uk Kim s/\bq([0-9]+)#(lo|hi)/sprintf "d%d",2*$1+($2 eq "hi")/geo; 1853e71b7053SJung-uk Kim 1854e71b7053SJung-uk Kim print $_,"\n"; 1855e71b7053SJung-uk Kim} 185617f01e99SJung-uk Kimclose STDOUT or die "error closing STDOUT: $!"; # enforce flush 1857