xref: /freebsd/crypto/openssl/crypto/ec/asm/ecp_nistz256-armv4.pl (revision b077aed33b7b6aefca7b17ddb250cf521f938613)
1e71b7053SJung-uk Kim#! /usr/bin/env perl
217f01e99SJung-uk Kim# Copyright 2015-2020 The OpenSSL Project Authors. All Rights Reserved.
3e71b7053SJung-uk Kim#
4*b077aed3SPierre Pronchery# Licensed under the Apache License 2.0 (the "License").  You may not use
5e71b7053SJung-uk Kim# this file except in compliance with the License.  You can obtain a copy
6e71b7053SJung-uk Kim# in the file LICENSE in the source distribution or at
7e71b7053SJung-uk Kim# https://www.openssl.org/source/license.html
8e71b7053SJung-uk Kim
9e71b7053SJung-uk Kim
10e71b7053SJung-uk Kim# ====================================================================
11e71b7053SJung-uk Kim# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
12e71b7053SJung-uk Kim# project. The module is, however, dual licensed under OpenSSL and
13e71b7053SJung-uk Kim# CRYPTOGAMS licenses depending on where you obtain it. For further
14e71b7053SJung-uk Kim# details see http://www.openssl.org/~appro/cryptogams/.
15e71b7053SJung-uk Kim# ====================================================================
16e71b7053SJung-uk Kim#
17e71b7053SJung-uk Kim# ECP_NISTZ256 module for ARMv4.
18e71b7053SJung-uk Kim#
19e71b7053SJung-uk Kim# October 2014.
20e71b7053SJung-uk Kim#
21e71b7053SJung-uk Kim# Original ECP_NISTZ256 submission targeting x86_64 is detailed in
22e71b7053SJung-uk Kim# http://eprint.iacr.org/2013/816. In the process of adaptation
23e71b7053SJung-uk Kim# original .c module was made 32-bit savvy in order to make this
24e71b7053SJung-uk Kim# implementation possible.
25e71b7053SJung-uk Kim#
26e71b7053SJung-uk Kim#			with/without -DECP_NISTZ256_ASM
27e71b7053SJung-uk Kim# Cortex-A8		+53-170%
28e71b7053SJung-uk Kim# Cortex-A9		+76-205%
29e71b7053SJung-uk Kim# Cortex-A15		+100-316%
30e71b7053SJung-uk Kim# Snapdragon S4		+66-187%
31e71b7053SJung-uk Kim#
32e71b7053SJung-uk Kim# Ranges denote minimum and maximum improvement coefficients depending
33e71b7053SJung-uk Kim# on benchmark. Lower coefficients are for ECDSA sign, server-side
34e71b7053SJung-uk Kim# operation. Keep in mind that +200% means 3x improvement.
35e71b7053SJung-uk Kim
36*b077aed3SPierre Pronchery# $output is the last argument if it looks like a file (it has an extension)
37*b077aed3SPierre Pronchery# $flavour is the first argument if it doesn't look like a file
# Command-line convention used by this generator:
#   $output  - the last argument, taken only if it looks like a file name
#              (it ends in a ".ext" extension);
#   $flavour - the first argument, taken only if it contains no dot.
# Either may be absent, in which case the variable stays undef.
$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;

if ($flavour && $flavour ne "void") {
    # A real flavour was requested: pipe everything printed to STDOUT
    # through arm-xlate.pl, looked up next to this script first and then
    # in ../../perlasm/ relative to it.
    $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
    ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
    ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
    die "can't locate arm-xlate.pl";

    open STDOUT,"| \"$^X\" $xlate $flavour \"$output\""
        or die "can't call  $xlate: $!";
} else {
    # No translation step: write straight to $output when one was given,
    # otherwise keep the inherited STDOUT.  The open is checked so that an
    # unwritable path aborts the run instead of carrying on without a
    # usable output handle.
    if ($output) {
        open STDOUT,'>',$output
            or die "can't open $output: $!";
    }
}
52e71b7053SJung-uk Kim
# Start of the generated source: pull in arm_arch.h and select the
# instruction set - unified-syntax Thumb when __thumb2__ is defined,
# ".code 32" otherwise.
$code.=<<___;
#include "arm_arch.h"

#if defined(__thumb2__)
.syntax	unified
.thumb
#else
.code	32
#endif
___
63e71b7053SJung-uk Kim########################################################################
64e71b7053SJung-uk Kim# Convert ecp_nistz256_table.c to layout expected by ecp_nistz_gather_w7
65e71b7053SJung-uk Kim#
# Locate ecp_nistz256_table.c: first in the current directory, then one
# level above the directory this script was started from.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
my $table_fh;
open($table_fh,'<',"ecp_nistz256_table.c")		or
open($table_fh,'<',"${dir}../ecp_nistz256_table.c")	or
die "failed to open ecp_nistz256_table.c:",$!;

use integer;

# Collect every TOBN(hi,lo) initializer as two values appended to @arr,
# second argument first.  The file is read line by line and each line is
# scanned with a global match, so nothing is rewritten and the whole
# file never has to sit in memory at once.
while (my $line = <$table_fh>) {
	while ($line =~ m/TOBN\(\s*(0x[0-9a-f]+),\s*(0x[0-9a-f]+)\s*\)/g) {
		push @arr,hex($2),hex($1);
	}
}
close $table_fh;

# See ecp_nistz256_table.c for explanation for why it's 64*16*37.
# 64*16*37-1 is because $#arr returns last valid index of @arr, not
# amount of elements.
die "insane number of elements" if ($#arr != 64*16*37-1);
82e71b7053SJung-uk Kim
# Open the ecp_nistz256_precomputed object in .rodata; the .byte rows
# generated below form its contents.
$code.=<<___;
.rodata
.globl	ecp_nistz256_precomputed
.type	ecp_nistz256_precomputed,%object
.align	12
ecp_nistz256_precomputed:
___
########################################################################
# this conversion smashes P256_POINT_AFFINE by individual bytes with
# 64 byte interval, similar to
#	1111222233334444
#	1234123412341234
for(1..37) {
	# One sub-table per iteration: 64 entries of 16 words each, taken
	# off the front of @arr.
	my @tbl = splice(@arr,0,64*16);
	for my $i (0..63) {
		# Output row $i holds byte $i of each of the 64 entries: word
		# $i/4 of the entry ("use integer" is in effect, so the
		# division truncates), shifted down by ($i%4)*8 bits.
		my @line = map { ($tbl[$_*16+$i/4]>>(($i%4)*8))&0xff } (0..63);
		$code.=".byte\t";
		$code.=join(',',map { sprintf "0x%02x",$_} @line);
		$code.="\n";
	}
}
# Close the table object and switch to .text for the constants used by
# the to_mont/from_mont entry points, followed by the identification
# string.
$code.=<<___;
.size	ecp_nistz256_precomputed,.-ecp_nistz256_precomputed

.text
.align	5
.LRR:	@ 2^512 mod P precomputed for NIST P256 polynomial
.long	0x00000003, 0x00000000, 0xffffffff, 0xfffffffb
.long	0xfffffffe, 0xffffffff, 0xfffffffd, 0x00000004
.Lone:
.long	1,0,0,0,0,0,0,0
.asciz	"ECP_NISTZ256 for ARMv4, CRYPTOGAMS by <appro\@openssl.org>"
.align	6
___
120e71b7053SJung-uk Kim
121e71b7053SJung-uk Kim########################################################################
122e71b7053SJung-uk Kim# common register layout, note that $t2 is link register, so that if
123e71b7053SJung-uk Kim# internal subroutine uses $t2, then it has to offload lr...
124e71b7053SJung-uk Kim
# Register names are handed out in order from r0..r12 followed by r14;
# r13 is left out of the list.
($r_ptr,$a_ptr,$b_ptr,$ff,
 $a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7,
 $t1,$t2) = map { "r$_" } (0..12,14);
# Two more temporaries share registers with existing names:
# $t0 is the same register as $ff, $t3 the same as $a_ptr.
$t0 = $ff;
$t3 = $a_ptr;
128e71b7053SJung-uk Kim
129e71b7053SJung-uk Kim$code.=<<___;
@ void	ecp_nistz256_to_mont(BN_ULONG r0[8],const BN_ULONG r1[8]);
@
@ Sets the second-operand pointer to the .LRR constant (2^512 mod P)
@ and branches to the ecp_nistz256_mul_mont entry, i.e. the result is
@ mul_mont(r1, .LRR). The branch is a plain b, so lr is untouched and
@ mul_mont returns straight to our caller.
.globl	ecp_nistz256_to_mont
.type	ecp_nistz256_to_mont,%function
ecp_nistz256_to_mont:
	adr	$b_ptr,.LRR
	b	.Lecp_nistz256_mul_mont
.size	ecp_nistz256_to_mont,.-ecp_nistz256_to_mont
137e71b7053SJung-uk Kim
@ void	ecp_nistz256_from_mont(BN_ULONG r0[8],const BN_ULONG r1[8]);
@
@ Sets the second-operand pointer to the .Lone constant (the value 1
@ as eight 32-bit words) and branches to the ecp_nistz256_mul_mont
@ entry, i.e. the result is mul_mont(r1, 1). The branch is a plain b,
@ so lr is untouched and mul_mont returns straight to our caller.
.globl	ecp_nistz256_from_mont
.type	ecp_nistz256_from_mont,%function
ecp_nistz256_from_mont:
	adr	$b_ptr,.Lone
	b	.Lecp_nistz256_mul_mont
.size	ecp_nistz256_from_mont,.-ecp_nistz256_from_mont
145e71b7053SJung-uk Kim
@ void	ecp_nistz256_mul_by_2(BN_ULONG r0[8],const BN_ULONG r1[8]);
@
@ Public entry point: saves r4-r12 and lr, calls the internal
@ __ecp_nistz256_mul_by_2 with the arguments still in r0 and r1, then
@ restores the saved registers and returns.
.globl	ecp_nistz256_mul_by_2
.type	ecp_nistz256_mul_by_2,%function
.align	4
ecp_nistz256_mul_by_2:
	stmdb	sp!,{r4-r12,lr}
	bl	__ecp_nistz256_mul_by_2
#if __ARM_ARCH__>=5 || !defined(__thumb__)
	@ return by popping the saved lr value directly into pc
	ldmia	sp!,{r4-r12,pc}
#else
	@ otherwise restore lr and return with bx
	ldmia	sp!,{r4-r12,lr}
	bx	lr			@ interoperable with Thumb ISA:-)
#endif
.size	ecp_nistz256_mul_by_2,.-ecp_nistz256_mul_by_2
160e71b7053SJung-uk Kim
.type	__ecp_nistz256_mul_by_2,%function
.align	4
@ Internal: ret = 2*a mod P.
@ In:  r0 = result pointer, r1 = pointer to a[0..7] (32-bit words,
@      least significant first), lr = return address.
@ The doubled value and its carry are handed to .Lreduce_by_sub, which
@ reduces, stores the result and returns through lr.
__ecp_nistz256_mul_by_2:
	ldr	$a0,[$a_ptr,#0]
	ldr	$a1,[$a_ptr,#4]
	ldr	$a2,[$a_ptr,#8]
	adds	$a0,$a0,$a0		@ a[0:7]+=a[0:7], i.e. add with itself
	ldr	$a3,[$a_ptr,#12]
	adcs	$a1,$a1,$a1
	ldr	$a4,[$a_ptr,#16]
	adcs	$a2,$a2,$a2
	ldr	$a5,[$a_ptr,#20]
	adcs	$a3,$a3,$a3
	ldr	$a6,[$a_ptr,#24]
	adcs	$a4,$a4,$a4
	ldr	$a7,[$a_ptr,#28]
	adcs	$a5,$a5,$a5
	adcs	$a6,$a6,$a6
	@ the loads and the mov do not set flags, so the carry chain
	@ runs uninterrupted from the adds down to the final adc
	mov	$ff,#0
	adcs	$a7,$a7,$a7
	@ capture the carry out of the top word as 0 or 1
	adc	$ff,$ff,#0

	@ the shared reduction tail lives in __ecp_nistz256_add
	b	.Lreduce_by_sub
.size	__ecp_nistz256_mul_by_2,.-__ecp_nistz256_mul_by_2
185e71b7053SJung-uk Kim
@ void	ecp_nistz256_add(BN_ULONG r0[8],const BN_ULONG r1[8],
@					const BN_ULONG r2[8]);
@
@ Public entry point: saves r4-r12 and lr, calls the internal
@ __ecp_nistz256_add with the arguments still in r0-r2, then restores
@ the saved registers and returns.
.globl	ecp_nistz256_add
.type	ecp_nistz256_add,%function
.align	4
ecp_nistz256_add:
	stmdb	sp!,{r4-r12,lr}
	bl	__ecp_nistz256_add
#if __ARM_ARCH__>=5 || !defined(__thumb__)
	@ return by popping the saved lr value directly into pc
	ldmia	sp!,{r4-r12,pc}
#else
	@ otherwise restore lr and return with bx
	ldmia	sp!,{r4-r12,lr}
	bx	lr			@ interoperable with Thumb ISA:-)
#endif
.size	ecp_nistz256_add,.-ecp_nistz256_add
201e71b7053SJung-uk Kim
.type	__ecp_nistz256_add,%function
.align	4
@ Internal: ret = a + b mod P.
@ In:  r0 = result pointer, r1 = pointer to a[0..7], r2 = pointer to
@      b[0..7] (32-bit words, least significant first), lr = return
@      address.
@ lr is one of the temporaries used below, hence the push and pop
@ around the addition. The register holding the a pointer is reused
@ as a temporary once a[7] has been loaded.
__ecp_nistz256_add:
	str	lr,[sp,#-4]!		@ push lr

	ldr	$a0,[$a_ptr,#0]
	ldr	$a1,[$a_ptr,#4]
	ldr	$a2,[$a_ptr,#8]
	ldr	$a3,[$a_ptr,#12]
	ldr	$a4,[$a_ptr,#16]
	 ldr	$t0,[$b_ptr,#0]
	ldr	$a5,[$a_ptr,#20]
	 ldr	$t1,[$b_ptr,#4]
	ldr	$a6,[$a_ptr,#24]
	 ldr	$t2,[$b_ptr,#8]
	ldr	$a7,[$a_ptr,#28]
	 ldr	$t3,[$b_ptr,#12]
	adds	$a0,$a0,$t0
	 ldr	$t0,[$b_ptr,#16]
	adcs	$a1,$a1,$t1
	 ldr	$t1,[$b_ptr,#20]
	adcs	$a2,$a2,$t2
	 ldr	$t2,[$b_ptr,#24]
	adcs	$a3,$a3,$t3
	 ldr	$t3,[$b_ptr,#28]
	adcs	$a4,$a4,$t0
	adcs	$a5,$a5,$t1
	adcs	$a6,$a6,$t2
	@ the temporary used for b[4] is free again here and becomes
	@ the carry word
	mov	$ff,#0
	adcs	$a7,$a7,$t3
	adc	$ff,$ff,#0
	ldr	lr,[sp],#4		@ pop lr

.Lreduce_by_sub:
	@ Shared tail, also reached by branch from
	@ __ecp_nistz256_mul_by_2 and __ecp_nistz256_mul_by_3.
	@ Expects the eight result words in the a0-a7 registers, the
	@ carry out of the top word (0 or 1) in the carry word
	@ register, the result pointer in r0 and the return address
	@ in lr.

	@ if a+b >= modulus, subtract modulus.
	@
	@ But since comparison implies subtraction, we subtract
	@ modulus and then add it back if subtraction borrowed.

	@ the immediates below are the eight words of the modulus,
	@ least significant first
	subs	$a0,$a0,#-1
	sbcs	$a1,$a1,#-1
	sbcs	$a2,$a2,#-1
	sbcs	$a3,$a3,#0
	sbcs	$a4,$a4,#0
	sbcs	$a5,$a5,#0
	sbcs	$a6,$a6,#1
	sbcs	$a7,$a7,#-1
	sbc	$ff,$ff,#0

	@ Note that because mod has special form, i.e. consists of
	@ 0xffffffff, 1 and 0s, we can conditionally synthesize it by
	@ using value of borrow as a whole or extracting single bit.
	@ Follow $ff register...

	adds	$a0,$a0,$ff		@ add synthesized modulus
	adcs	$a1,$a1,$ff
	str	$a0,[$r_ptr,#0]
	adcs	$a2,$a2,$ff
	str	$a1,[$r_ptr,#4]
	adcs	$a3,$a3,#0
	str	$a2,[$r_ptr,#8]
	adcs	$a4,$a4,#0
	str	$a3,[$r_ptr,#12]
	adcs	$a5,$a5,#0
	str	$a4,[$r_ptr,#16]
	adcs	$a6,$a6,$ff,lsr#31
	str	$a5,[$r_ptr,#20]
	adcs	$a7,$a7,$ff
	str	$a6,[$r_ptr,#24]
	str	$a7,[$r_ptr,#28]

	mov	pc,lr
.size	__ecp_nistz256_add,.-__ecp_nistz256_add
276e71b7053SJung-uk Kim
@ void	ecp_nistz256_mul_by_3(BN_ULONG r0[8],const BN_ULONG r1[8]);
@
@ Public entry point: saves r4-r12 and lr, calls the internal
@ __ecp_nistz256_mul_by_3 with the arguments still in r0 and r1, then
@ restores the saved registers and returns.
.globl	ecp_nistz256_mul_by_3
.type	ecp_nistz256_mul_by_3,%function
.align	4
ecp_nistz256_mul_by_3:
	stmdb	sp!,{r4-r12,lr}
	bl	__ecp_nistz256_mul_by_3
#if __ARM_ARCH__>=5 || !defined(__thumb__)
	@ return by popping the saved lr value directly into pc
	ldmia	sp!,{r4-r12,pc}
#else
	@ otherwise restore lr and return with bx
	ldmia	sp!,{r4-r12,lr}
	bx	lr			@ interoperable with Thumb ISA:-)
#endif
.size	ecp_nistz256_mul_by_3,.-ecp_nistz256_mul_by_3
291e71b7053SJung-uk Kim
.type	__ecp_nistz256_mul_by_3,%function
.align	4
@ Internal: ret = 3*a mod P, computed as (2*a mod P) + a followed by
@ the shared .Lreduce_by_sub tail.
@ In:  r0 = result pointer, r1 = pointer to a[0..7] (32-bit words,
@      least significant first), lr = return address.
@ lr is one of the temporaries used below, hence the push and pop.
@ The register that carries the second-operand pointer in the
@ two-operand routines is used here as one more temporary.
__ecp_nistz256_mul_by_3:
	str	lr,[sp,#-4]!		@ push lr

	@ As multiplication by 3 is performed as 2*n+n, below are inline
	@ copies of __ecp_nistz256_mul_by_2 and __ecp_nistz256_add, see
	@ corresponding subroutines for details.

	ldr	$a0,[$a_ptr,#0]
	ldr	$a1,[$a_ptr,#4]
	ldr	$a2,[$a_ptr,#8]
	adds	$a0,$a0,$a0		@ a[0:7]+=a[0:7]
	ldr	$a3,[$a_ptr,#12]
	adcs	$a1,$a1,$a1
	ldr	$a4,[$a_ptr,#16]
	adcs	$a2,$a2,$a2
	ldr	$a5,[$a_ptr,#20]
	adcs	$a3,$a3,$a3
	ldr	$a6,[$a_ptr,#24]
	adcs	$a4,$a4,$a4
	ldr	$a7,[$a_ptr,#28]
	adcs	$a5,$a5,$a5
	adcs	$a6,$a6,$a6
	mov	$ff,#0
	adcs	$a7,$a7,$a7
	adc	$ff,$ff,#0

	subs	$a0,$a0,#-1		@ .Lreduce_by_sub but without stores
	sbcs	$a1,$a1,#-1
	sbcs	$a2,$a2,#-1
	sbcs	$a3,$a3,#0
	sbcs	$a4,$a4,#0
	sbcs	$a5,$a5,#0
	sbcs	$a6,$a6,#1
	sbcs	$a7,$a7,#-1
	sbc	$ff,$ff,#0

	@ the loads of a[0..2] for the second addition are interleaved
	@ with the add-back; loads do not disturb the carry chain
	adds	$a0,$a0,$ff		@ add synthesized modulus
	adcs	$a1,$a1,$ff
	adcs	$a2,$a2,$ff
	adcs	$a3,$a3,#0
	adcs	$a4,$a4,#0
	 ldr	$b_ptr,[$a_ptr,#0]
	adcs	$a5,$a5,#0
	 ldr	$t1,[$a_ptr,#4]
	adcs	$a6,$a6,$ff,lsr#31
	 ldr	$t2,[$a_ptr,#8]
	adc	$a7,$a7,$ff

	ldr	$t0,[$a_ptr,#12]
	adds	$a0,$a0,$b_ptr		@ 2*a[0:7]+=a[0:7]
	ldr	$b_ptr,[$a_ptr,#16]
	adcs	$a1,$a1,$t1
	ldr	$t1,[$a_ptr,#20]
	adcs	$a2,$a2,$t2
	ldr	$t2,[$a_ptr,#24]
	adcs	$a3,$a3,$t0
	ldr	$t3,[$a_ptr,#28]
	adcs	$a4,$a4,$b_ptr
	adcs	$a5,$a5,$t1
	adcs	$a6,$a6,$t2
	mov	$ff,#0
	adcs	$a7,$a7,$t3
	adc	$ff,$ff,#0
	ldr	lr,[sp],#4		@ pop lr

	@ final reduction, store and return happen in the shared tail
	@ inside __ecp_nistz256_add
	b	.Lreduce_by_sub
.size	__ecp_nistz256_mul_by_3,.-__ecp_nistz256_mul_by_3
361e71b7053SJung-uk Kim
@ void	ecp_nistz256_div_by_2(BN_ULONG r0[8],const BN_ULONG r1[8]);
@
@ Public entry point: saves r4-r12 and lr, calls the internal
@ __ecp_nistz256_div_by_2 with the arguments still in r0 and r1, then
@ restores the saved registers and returns.
.globl	ecp_nistz256_div_by_2
.type	ecp_nistz256_div_by_2,%function
.align	4
ecp_nistz256_div_by_2:
	stmdb	sp!,{r4-r12,lr}
	bl	__ecp_nistz256_div_by_2
#if __ARM_ARCH__>=5 || !defined(__thumb__)
	@ return by popping the saved lr value directly into pc
	ldmia	sp!,{r4-r12,pc}
#else
	@ otherwise restore lr and return with bx
	ldmia	sp!,{r4-r12,lr}
	bx	lr			@ interoperable with Thumb ISA:-)
#endif
.size	ecp_nistz256_div_by_2,.-ecp_nistz256_div_by_2
376e71b7053SJung-uk Kim
.type	__ecp_nistz256_div_by_2,%function
.align	4
@ Internal: halves a modulo P.
@ In:  r0 = result pointer, r1 = pointer to a[0..7] (32-bit words,
@      least significant first), lr = return address.
@ The modulus is added first when a is odd, then the 257-bit sum is
@ shifted right by one bit. The register that carries the
@ second-operand pointer in the two-operand routines holds the
@ top-most carry bit here. Stores the result and returns through lr.
__ecp_nistz256_div_by_2:
	@ ret = (a is odd ? a+mod : a) >> 1

	ldr	$a0,[$a_ptr,#0]
	ldr	$a1,[$a_ptr,#4]
	ldr	$a2,[$a_ptr,#8]
	mov	$ff,$a0,lsl#31		@ place least significant bit to most
					@ significant position, now arithmetic
					@ right shift by 31 will produce -1 or
					@ 0, while logical right shift 1 or 0,
					@ this is how modulus is conditionally
					@ synthesized in this case...
	ldr	$a3,[$a_ptr,#12]
	adds	$a0,$a0,$ff,asr#31
	ldr	$a4,[$a_ptr,#16]
	adcs	$a1,$a1,$ff,asr#31
	ldr	$a5,[$a_ptr,#20]
	adcs	$a2,$a2,$ff,asr#31
	ldr	$a6,[$a_ptr,#24]
	adcs	$a3,$a3,#0
	ldr	$a7,[$a_ptr,#28]
	adcs	$a4,$a4,#0
	 mov	$a0,$a0,lsr#1		@ a[0:7]>>=1, we can start early
					@ because it doesn't affect flags
	adcs	$a5,$a5,#0
	 orr	$a0,$a0,$a1,lsl#31
	adcs	$a6,$a6,$ff,lsr#31
	mov	$b_ptr,#0
	adcs	$a7,$a7,$ff,asr#31
	 mov	$a1,$a1,lsr#1
	adc	$b_ptr,$b_ptr,#0	@ top-most carry bit from addition

	@ remaining words: each takes its own bits shifted down by one
	@ plus the lowest bit of the next more significant word on top
	orr	$a1,$a1,$a2,lsl#31
	mov	$a2,$a2,lsr#1
	str	$a0,[$r_ptr,#0]
	orr	$a2,$a2,$a3,lsl#31
	mov	$a3,$a3,lsr#1
	str	$a1,[$r_ptr,#4]
	orr	$a3,$a3,$a4,lsl#31
	mov	$a4,$a4,lsr#1
	str	$a2,[$r_ptr,#8]
	orr	$a4,$a4,$a5,lsl#31
	mov	$a5,$a5,lsr#1
	str	$a3,[$r_ptr,#12]
	orr	$a5,$a5,$a6,lsl#31
	mov	$a6,$a6,lsr#1
	str	$a4,[$r_ptr,#16]
	orr	$a6,$a6,$a7,lsl#31
	mov	$a7,$a7,lsr#1
	str	$a5,[$r_ptr,#20]
	orr	$a7,$a7,$b_ptr,lsl#31	@ don't forget the top-most carry bit
	str	$a6,[$r_ptr,#24]
	str	$a7,[$r_ptr,#28]

	mov	pc,lr
.size	__ecp_nistz256_div_by_2,.-__ecp_nistz256_div_by_2
435e71b7053SJung-uk Kim
@ void	ecp_nistz256_sub(BN_ULONG r0[8],const BN_ULONG r1[8],
@				        const BN_ULONG r2[8]);
@
@ Public entry point: saves r4-r12 and lr, calls the internal
@ __ecp_nistz256_sub with the arguments still in r0-r2, then restores
@ the saved registers and returns.
.globl	ecp_nistz256_sub
.type	ecp_nistz256_sub,%function
.align	4
ecp_nistz256_sub:
	stmdb	sp!,{r4-r12,lr}
	bl	__ecp_nistz256_sub
#if __ARM_ARCH__>=5 || !defined(__thumb__)
	@ return by popping the saved lr value directly into pc
	ldmia	sp!,{r4-r12,pc}
#else
	@ otherwise restore lr and return with bx
	ldmia	sp!,{r4-r12,lr}
	bx	lr			@ interoperable with Thumb ISA:-)
#endif
.size	ecp_nistz256_sub,.-ecp_nistz256_sub
451e71b7053SJung-uk Kim
.type	__ecp_nistz256_sub,%function
.align	4
@ Internal: ret = a - b mod P.
@ In:  r0 = result pointer, r1 = pointer to a[0..7], r2 = pointer to
@      b[0..7] (32-bit words, least significant first), lr = return
@      address.
@ lr is one of the temporaries used below, hence the push and pop
@ around the subtraction. The register holding the a pointer is
@ reused as a temporary once a[7] has been loaded.
__ecp_nistz256_sub:
	str	lr,[sp,#-4]!		@ push lr

	ldr	$a0,[$a_ptr,#0]
	ldr	$a1,[$a_ptr,#4]
	ldr	$a2,[$a_ptr,#8]
	ldr	$a3,[$a_ptr,#12]
	ldr	$a4,[$a_ptr,#16]
	 ldr	$t0,[$b_ptr,#0]
	ldr	$a5,[$a_ptr,#20]
	 ldr	$t1,[$b_ptr,#4]
	ldr	$a6,[$a_ptr,#24]
	 ldr	$t2,[$b_ptr,#8]
	ldr	$a7,[$a_ptr,#28]
	 ldr	$t3,[$b_ptr,#12]
	subs	$a0,$a0,$t0
	 ldr	$t0,[$b_ptr,#16]
	sbcs	$a1,$a1,$t1
	 ldr	$t1,[$b_ptr,#20]
	sbcs	$a2,$a2,$t2
	 ldr	$t2,[$b_ptr,#24]
	sbcs	$a3,$a3,$t3
	 ldr	$t3,[$b_ptr,#28]
	sbcs	$a4,$a4,$t0
	sbcs	$a5,$a5,$t1
	sbcs	$a6,$a6,$t2
	sbcs	$a7,$a7,$t3
	@ a register minus itself minus the borrow: all ones if the
	@ subtraction borrowed, zero otherwise
	sbc	$ff,$ff,$ff		@ broadcast borrow bit
	ldr	lr,[sp],#4		@ pop lr

.Lreduce_by_add:
	@ Shared tail, also reached by branch from __ecp_nistz256_neg.
	@ Expects the eight difference words in the a0-a7 registers,
	@ the broadcast borrow (0 or all ones) in the borrow register,
	@ the result pointer in r0 and the return address in lr.

	@ if a-b borrows, add modulus.
	@
	@ Note that because mod has special form, i.e. consists of
	@ 0xffffffff, 1 and 0s, we can conditionally synthesize it by
	@ broadcasting borrow bit to a register, $ff, and using it as
	@ a whole or extracting single bit.

	adds	$a0,$a0,$ff		@ add synthesized modulus
	adcs	$a1,$a1,$ff
	str	$a0,[$r_ptr,#0]
	adcs	$a2,$a2,$ff
	str	$a1,[$r_ptr,#4]
	adcs	$a3,$a3,#0
	str	$a2,[$r_ptr,#8]
	adcs	$a4,$a4,#0
	str	$a3,[$r_ptr,#12]
	adcs	$a5,$a5,#0
	str	$a4,[$r_ptr,#16]
	adcs	$a6,$a6,$ff,lsr#31
	str	$a5,[$r_ptr,#20]
	adcs	$a7,$a7,$ff
	str	$a6,[$r_ptr,#24]
	str	$a7,[$r_ptr,#28]

	mov	pc,lr
.size	__ecp_nistz256_sub,.-__ecp_nistz256_sub
512e71b7053SJung-uk Kim
@ void	ecp_nistz256_neg(BN_ULONG r0[8],const BN_ULONG r1[8]);
@
@ Public entry point: saves r4-r12 and lr, calls the internal
@ __ecp_nistz256_neg with the arguments still in r0 and r1, then
@ restores the saved registers and returns.
.globl	ecp_nistz256_neg
.type	ecp_nistz256_neg,%function
.align	4
ecp_nistz256_neg:
	stmdb	sp!,{r4-r12,lr}
	bl	__ecp_nistz256_neg
#if __ARM_ARCH__>=5 || !defined(__thumb__)
	@ return by popping the saved lr value directly into pc
	ldmia	sp!,{r4-r12,pc}
#else
	@ otherwise restore lr and return with bx
	ldmia	sp!,{r4-r12,lr}
	bx	lr			@ interoperable with Thumb ISA:-)
#endif
.size	ecp_nistz256_neg,.-ecp_nistz256_neg
527e71b7053SJung-uk Kim
.type	__ecp_nistz256_neg,%function
.align	4
@ Internal: ret = 0 - a mod P.
@ In:  r0 = result pointer, r1 = pointer to a[0..7] (32-bit words,
@      least significant first), lr = return address.
@ The difference and the broadcast borrow are handed to
@ .Lreduce_by_add, which adds the modulus back if needed, stores the
@ result and returns through lr.
__ecp_nistz256_neg:
	ldr	$a0,[$a_ptr,#0]
	@ zero register serving as the minuend for every word
	eor	$ff,$ff,$ff
	ldr	$a1,[$a_ptr,#4]
	ldr	$a2,[$a_ptr,#8]
	subs	$a0,$ff,$a0
	ldr	$a3,[$a_ptr,#12]
	sbcs	$a1,$ff,$a1
	ldr	$a4,[$a_ptr,#16]
	sbcs	$a2,$ff,$a2
	ldr	$a5,[$a_ptr,#20]
	sbcs	$a3,$ff,$a3
	ldr	$a6,[$a_ptr,#24]
	sbcs	$a4,$ff,$a4
	ldr	$a7,[$a_ptr,#28]
	sbcs	$a5,$ff,$a5
	sbcs	$a6,$ff,$a6
	sbcs	$a7,$ff,$a7
	@ broadcast the final borrow: all ones if borrowed, else zero
	sbc	$ff,$ff,$ff

	@ the shared add-back tail lives in __ecp_nistz256_sub
	b	.Lreduce_by_add
.size	__ecp_nistz256_neg,.-__ecp_nistz256_neg
552e71b7053SJung-uk Kim___
553e71b7053SJung-uk Kim{
554e71b7053SJung-uk Kimmy @acc=map("r$_",(3..11));
555e71b7053SJung-uk Kimmy ($t0,$t1,$bj,$t2,$t3)=map("r$_",(0,1,2,12,14));
556e71b7053SJung-uk Kim
557e71b7053SJung-uk Kim$code.=<<___;
@ void	ecp_nistz256_sqr_mont(BN_ULONG r0[8],const BN_ULONG r1[8]);
@
@ Squaring is done by the multiplication code: the input pointer is
@ copied into the second-operand register and control is passed to
@ the ecp_nistz256_mul_mont entry. The branch is a plain b, so lr is
@ untouched and mul_mont returns straight to our caller.
.globl	ecp_nistz256_sqr_mont
.type	ecp_nistz256_sqr_mont,%function
.align	4
ecp_nistz256_sqr_mont:
	mov	$b_ptr,$a_ptr
	b	.Lecp_nistz256_mul_mont
.size	ecp_nistz256_sqr_mont,.-ecp_nistz256_sqr_mont
566e71b7053SJung-uk Kim
@ void	ecp_nistz256_mul_mont(BN_ULONG r0[8],const BN_ULONG r1[8],
@					     const BN_ULONG r2[8]);
@
@ Public entry point: saves r4-r12 and lr, calls the internal
@ __ecp_nistz256_mul_mont with the arguments still in r0-r2, then
@ restores the saved registers and returns. The local label
@ .Lecp_nistz256_mul_mont marks the same address and is the branch
@ target used by the to_mont, from_mont and sqr_mont entry points.
.globl	ecp_nistz256_mul_mont
.type	ecp_nistz256_mul_mont,%function
.align	4
ecp_nistz256_mul_mont:
.Lecp_nistz256_mul_mont:
	stmdb	sp!,{r4-r12,lr}
	bl	__ecp_nistz256_mul_mont
#if __ARM_ARCH__>=5 || !defined(__thumb__)
	@ return by popping the saved lr value directly into pc
	ldmia	sp!,{r4-r12,pc}
#else
	@ otherwise restore lr and return with bx
	ldmia	sp!,{r4-r12,lr}
	bx	lr			@ interoperable with Thumb ISA:-)
#endif
.size	ecp_nistz256_mul_mont,.-ecp_nistz256_mul_mont
583e71b7053SJung-uk Kim
584e71b7053SJung-uk Kim.type	__ecp_nistz256_mul_mont,%function
585e71b7053SJung-uk Kim.align	4
586e71b7053SJung-uk Kim__ecp_nistz256_mul_mont:
587e71b7053SJung-uk Kim	stmdb	sp!,{r0-r2,lr}			@ make a copy of arguments too
588e71b7053SJung-uk Kim
589e71b7053SJung-uk Kim	ldr	$bj,[$b_ptr,#0]			@ b[0]
590e71b7053SJung-uk Kim	ldmia	$a_ptr,{@acc[1]-@acc[8]}
591e71b7053SJung-uk Kim
592e71b7053SJung-uk Kim	umull	@acc[0],$t3,@acc[1],$bj		@ r[0]=a[0]*b[0]
593e71b7053SJung-uk Kim	stmdb	sp!,{$acc[1]-@acc[8]}		@ copy a[0-7] to stack, so
594e71b7053SJung-uk Kim						@ that it can be addressed
595e71b7053SJung-uk Kim						@ without spending register
596e71b7053SJung-uk Kim						@ on address
597e71b7053SJung-uk Kim	umull	@acc[1],$t0,@acc[2],$bj		@ r[1]=a[1]*b[0]
598e71b7053SJung-uk Kim	umull	@acc[2],$t1,@acc[3],$bj
599e71b7053SJung-uk Kim	adds	@acc[1],@acc[1],$t3		@ accumulate high part of mult
600e71b7053SJung-uk Kim	umull	@acc[3],$t2,@acc[4],$bj
601e71b7053SJung-uk Kim	adcs	@acc[2],@acc[2],$t0
602e71b7053SJung-uk Kim	umull	@acc[4],$t3,@acc[5],$bj
603e71b7053SJung-uk Kim	adcs	@acc[3],@acc[3],$t1
604e71b7053SJung-uk Kim	umull	@acc[5],$t0,@acc[6],$bj
605e71b7053SJung-uk Kim	adcs	@acc[4],@acc[4],$t2
606e71b7053SJung-uk Kim	umull	@acc[6],$t1,@acc[7],$bj
607e71b7053SJung-uk Kim	adcs	@acc[5],@acc[5],$t3
608e71b7053SJung-uk Kim	umull	@acc[7],$t2,@acc[8],$bj
609e71b7053SJung-uk Kim	adcs	@acc[6],@acc[6],$t0
610e71b7053SJung-uk Kim	adcs	@acc[7],@acc[7],$t1
611e71b7053SJung-uk Kim	eor	$t3,$t3,$t3			@ first overflow bit is zero
612e71b7053SJung-uk Kim	adc	@acc[8],$t2,#0
613e71b7053SJung-uk Kim___
614e71b7053SJung-uk Kimfor(my $i=1;$i<8;$i++) {
615e71b7053SJung-uk Kimmy $t4=@acc[0];
616e71b7053SJung-uk Kim
617e71b7053SJung-uk Kim	# Reduction iteration is normally performed by accumulating
618e71b7053SJung-uk Kim	# result of multiplication of modulus by "magic" digit [and
619e71b7053SJung-uk Kim	# omitting least significant word, which is guaranteed to
620e71b7053SJung-uk Kim	# be 0], but thanks to special form of modulus and "magic"
621e71b7053SJung-uk Kim	# digit being equal to least significant word, it can be
622e71b7053SJung-uk Kim	# performed with additions and subtractions alone. Indeed:
623e71b7053SJung-uk Kim	#
624e71b7053SJung-uk Kim	#        ffff.0001.0000.0000.0000.ffff.ffff.ffff
625e71b7053SJung-uk Kim	# *                                         abcd
626e71b7053SJung-uk Kim	# + xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.abcd
627e71b7053SJung-uk Kim	#
628e71b7053SJung-uk Kim	# Now observing that ff..ff*x = (2^n-1)*x = 2^n*x-x, we
629e71b7053SJung-uk Kim	# rewrite above as:
630e71b7053SJung-uk Kim	#
631e71b7053SJung-uk Kim	#   xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.abcd
632e71b7053SJung-uk Kim	# + abcd.0000.abcd.0000.0000.abcd.0000.0000.0000
633e71b7053SJung-uk Kim	# -      abcd.0000.0000.0000.0000.0000.0000.abcd
634e71b7053SJung-uk Kim	#
635e71b7053SJung-uk Kim	# or marking redundant operations:
636e71b7053SJung-uk Kim	#
637e71b7053SJung-uk Kim	#   xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.----
638e71b7053SJung-uk Kim	# + abcd.0000.abcd.0000.0000.abcd.----.----.----
639e71b7053SJung-uk Kim	# -      abcd.----.----.----.----.----.----.----
640e71b7053SJung-uk Kim
641e71b7053SJung-uk Kim$code.=<<___;
642e71b7053SJung-uk Kim	@ multiplication-less reduction $i
643e71b7053SJung-uk Kim	adds	@acc[3],@acc[3],@acc[0]		@ r[3]+=r[0]
644e71b7053SJung-uk Kim	 ldr	$bj,[sp,#40]			@ restore b_ptr
645e71b7053SJung-uk Kim	adcs	@acc[4],@acc[4],#0		@ r[4]+=0
646e71b7053SJung-uk Kim	adcs	@acc[5],@acc[5],#0		@ r[5]+=0
647e71b7053SJung-uk Kim	adcs	@acc[6],@acc[6],@acc[0]		@ r[6]+=r[0]
648e71b7053SJung-uk Kim	 ldr	$t1,[sp,#0]			@ load a[0]
649e71b7053SJung-uk Kim	adcs	@acc[7],@acc[7],#0		@ r[7]+=0
650e71b7053SJung-uk Kim	 ldr	$bj,[$bj,#4*$i]			@ load b[i]
651e71b7053SJung-uk Kim	adcs	@acc[8],@acc[8],@acc[0]		@ r[8]+=r[0]
652e71b7053SJung-uk Kim	 eor	$t0,$t0,$t0
653e71b7053SJung-uk Kim	adc	$t3,$t3,#0			@ overflow bit
654e71b7053SJung-uk Kim	subs	@acc[7],@acc[7],@acc[0]		@ r[7]-=r[0]
655e71b7053SJung-uk Kim	 ldr	$t2,[sp,#4]			@ a[1]
656e71b7053SJung-uk Kim	sbcs	@acc[8],@acc[8],#0		@ r[8]-=0
657e71b7053SJung-uk Kim	 umlal	@acc[1],$t0,$t1,$bj		@ "r[0]"+=a[0]*b[i]
658e71b7053SJung-uk Kim	 eor	$t1,$t1,$t1
659e71b7053SJung-uk Kim	sbc	@acc[0],$t3,#0			@ overflow bit, keep in mind
660e71b7053SJung-uk Kim						@ that netto result is
661e71b7053SJung-uk Kim						@ addition of a value which
662e71b7053SJung-uk Kim						@ makes underflow impossible
663e71b7053SJung-uk Kim
664e71b7053SJung-uk Kim	ldr	$t3,[sp,#8]			@ a[2]
665e71b7053SJung-uk Kim	umlal	@acc[2],$t1,$t2,$bj		@ "r[1]"+=a[1]*b[i]
666e71b7053SJung-uk Kim	 str	@acc[0],[sp,#36]		@ temporarily offload overflow
667e71b7053SJung-uk Kim	eor	$t2,$t2,$t2
668e71b7053SJung-uk Kim	ldr	$t4,[sp,#12]			@ a[3], $t4 is alias @acc[0]
669e71b7053SJung-uk Kim	umlal	@acc[3],$t2,$t3,$bj		@ "r[2]"+=a[2]*b[i]
670e71b7053SJung-uk Kim	eor	$t3,$t3,$t3
671e71b7053SJung-uk Kim	adds	@acc[2],@acc[2],$t0		@ accumulate high part of mult
672e71b7053SJung-uk Kim	ldr	$t0,[sp,#16]			@ a[4]
673e71b7053SJung-uk Kim	umlal	@acc[4],$t3,$t4,$bj		@ "r[3]"+=a[3]*b[i]
674e71b7053SJung-uk Kim	eor	$t4,$t4,$t4
675e71b7053SJung-uk Kim	adcs	@acc[3],@acc[3],$t1
676e71b7053SJung-uk Kim	ldr	$t1,[sp,#20]			@ a[5]
677e71b7053SJung-uk Kim	umlal	@acc[5],$t4,$t0,$bj		@ "r[4]"+=a[4]*b[i]
678e71b7053SJung-uk Kim	eor	$t0,$t0,$t0
679e71b7053SJung-uk Kim	adcs	@acc[4],@acc[4],$t2
680e71b7053SJung-uk Kim	ldr	$t2,[sp,#24]			@ a[6]
681e71b7053SJung-uk Kim	umlal	@acc[6],$t0,$t1,$bj		@ "r[5]"+=a[5]*b[i]
682e71b7053SJung-uk Kim	eor	$t1,$t1,$t1
683e71b7053SJung-uk Kim	adcs	@acc[5],@acc[5],$t3
684e71b7053SJung-uk Kim	ldr	$t3,[sp,#28]			@ a[7]
685e71b7053SJung-uk Kim	umlal	@acc[7],$t1,$t2,$bj		@ "r[6]"+=a[6]*b[i]
686e71b7053SJung-uk Kim	eor	$t2,$t2,$t2
687e71b7053SJung-uk Kim	adcs	@acc[6],@acc[6],$t4
688e71b7053SJung-uk Kim	 ldr	@acc[0],[sp,#36]		@ restore overflow bit
689e71b7053SJung-uk Kim	umlal	@acc[8],$t2,$t3,$bj		@ "r[7]"+=a[7]*b[i]
690e71b7053SJung-uk Kim	eor	$t3,$t3,$t3
691e71b7053SJung-uk Kim	adcs	@acc[7],@acc[7],$t0
692e71b7053SJung-uk Kim	adcs	@acc[8],@acc[8],$t1
693e71b7053SJung-uk Kim	adcs	@acc[0],$acc[0],$t2
694e71b7053SJung-uk Kim	adc	$t3,$t3,#0			@ new overflow bit
695e71b7053SJung-uk Kim___
696e71b7053SJung-uk Kim	push(@acc,shift(@acc));			# rotate registers, so that
697e71b7053SJung-uk Kim						# "r[i]" becomes r[i]
698e71b7053SJung-uk Kim}
699e71b7053SJung-uk Kim$code.=<<___;
700e71b7053SJung-uk Kim	@ last multiplication-less reduction
701e71b7053SJung-uk Kim	adds	@acc[3],@acc[3],@acc[0]
702e71b7053SJung-uk Kim	ldr	$r_ptr,[sp,#32]			@ restore r_ptr
703e71b7053SJung-uk Kim	adcs	@acc[4],@acc[4],#0
704e71b7053SJung-uk Kim	adcs	@acc[5],@acc[5],#0
705e71b7053SJung-uk Kim	adcs	@acc[6],@acc[6],@acc[0]
706e71b7053SJung-uk Kim	adcs	@acc[7],@acc[7],#0
707e71b7053SJung-uk Kim	adcs	@acc[8],@acc[8],@acc[0]
708e71b7053SJung-uk Kim	adc	$t3,$t3,#0
709e71b7053SJung-uk Kim	subs	@acc[7],@acc[7],@acc[0]
710e71b7053SJung-uk Kim	sbcs	@acc[8],@acc[8],#0
711e71b7053SJung-uk Kim	sbc	@acc[0],$t3,#0			@ overflow bit
712e71b7053SJung-uk Kim
713e71b7053SJung-uk Kim	@ Final step is "if result > mod, subtract mod", but we do it
714e71b7053SJung-uk Kim	@ "other way around", namely subtract modulus from result
715e71b7053SJung-uk Kim	@ and if it borrowed, add modulus back.
716e71b7053SJung-uk Kim
717e71b7053SJung-uk Kim	adds	@acc[1],@acc[1],#1		@ subs	@acc[1],@acc[1],#-1
718e71b7053SJung-uk Kim	adcs	@acc[2],@acc[2],#0		@ sbcs	@acc[2],@acc[2],#-1
719e71b7053SJung-uk Kim	adcs	@acc[3],@acc[3],#0		@ sbcs	@acc[3],@acc[3],#-1
720e71b7053SJung-uk Kim	sbcs	@acc[4],@acc[4],#0
721e71b7053SJung-uk Kim	sbcs	@acc[5],@acc[5],#0
722e71b7053SJung-uk Kim	sbcs	@acc[6],@acc[6],#0
723e71b7053SJung-uk Kim	sbcs	@acc[7],@acc[7],#1
724e71b7053SJung-uk Kim	adcs	@acc[8],@acc[8],#0		@ sbcs	@acc[8],@acc[8],#-1
725e71b7053SJung-uk Kim	ldr	lr,[sp,#44]			@ restore lr
726e71b7053SJung-uk Kim	sbc	@acc[0],@acc[0],#0		@ broadcast borrow bit
727e71b7053SJung-uk Kim	add	sp,sp,#48
728e71b7053SJung-uk Kim
729e71b7053SJung-uk Kim	@ Note that because mod has special form, i.e. consists of
730e71b7053SJung-uk Kim	@ 0xffffffff, 1 and 0s, we can conditionally synthesize it by
731e71b7053SJung-uk Kim	@ broadcasting borrow bit to a register, @acc[0], and using it as
732e71b7053SJung-uk Kim	@ a whole or extracting single bit.
733e71b7053SJung-uk Kim
734e71b7053SJung-uk Kim	adds	@acc[1],@acc[1],@acc[0]		@ add modulus or zero
735e71b7053SJung-uk Kim	adcs	@acc[2],@acc[2],@acc[0]
736e71b7053SJung-uk Kim	str	@acc[1],[$r_ptr,#0]
737e71b7053SJung-uk Kim	adcs	@acc[3],@acc[3],@acc[0]
738e71b7053SJung-uk Kim	str	@acc[2],[$r_ptr,#4]
739e71b7053SJung-uk Kim	adcs	@acc[4],@acc[4],#0
740e71b7053SJung-uk Kim	str	@acc[3],[$r_ptr,#8]
741e71b7053SJung-uk Kim	adcs	@acc[5],@acc[5],#0
742e71b7053SJung-uk Kim	str	@acc[4],[$r_ptr,#12]
743e71b7053SJung-uk Kim	adcs	@acc[6],@acc[6],#0
744e71b7053SJung-uk Kim	str	@acc[5],[$r_ptr,#16]
745e71b7053SJung-uk Kim	adcs	@acc[7],@acc[7],@acc[0],lsr#31
746e71b7053SJung-uk Kim	str	@acc[6],[$r_ptr,#20]
747e71b7053SJung-uk Kim	adc	@acc[8],@acc[8],@acc[0]
748e71b7053SJung-uk Kim	str	@acc[7],[$r_ptr,#24]
749e71b7053SJung-uk Kim	str	@acc[8],[$r_ptr,#28]
750e71b7053SJung-uk Kim
751e71b7053SJung-uk Kim	mov	pc,lr
752e71b7053SJung-uk Kim.size	__ecp_nistz256_mul_mont,.-__ecp_nistz256_mul_mont
753e71b7053SJung-uk Kim___
754e71b7053SJung-uk Kim}
755e71b7053SJung-uk Kim
756e71b7053SJung-uk Kim{
# Table store/load helpers: ecp_nistz256_scatter_w5/gather_w5 and
# ecp_nistz256_scatter_w7/gather_w7.  The Perl variables below only
# name the first four ARM registers: $out=r0, $inp=r1, $index=r2,
# $mask=r3.
#
# scatter_w5 reads three 8-word vectors (X, Y, Z) from $inp and stores
# word k of a vector at $out + 4*$index - 4 + 64*k, i.e. consecutive
# words of one vector are 64 bytes apart and index 1 lands at byte
# offset 0.  $out is advanced by 64*8 bytes between vectors.  Index 0
# is not special-cased here; callers presumably pass index >= 1 --
# TODO confirm against the C caller.
#
# gather_w5 is the inverse: it loads the 3*8 words with the same
# 64-byte stride and writes them contiguously to $out.  Index 0 is
# handled without a branch: $mask is 0 for index 0 and -1 otherwise
# (in which case $index is decremented), and every loaded word is
# ANDed with $mask, so index 0 produces an all-zero result.
#
# scatter_w7/gather_w7 do the same at byte granularity for 64/4 = 16
# words: byte j of word k lives at base + index + 64*(4*k+j).
# scatter_w7 adds $index unmodified, whereas gather_w7 uses the same
# "0 means all zeros, otherwise subtract 1" convention as gather_w5.
#
# scatter_w5/gather_w5 save and restore r4-r11, gather_w7 saves and
# restores r4-r7, and scatter_w7 touches r0-r3 only.  Each function
# returns with "bx lr" when __ARM_ARCH__>=5 or __thumb__ is defined
# and with "mov pc,lr" otherwise.
757e71b7053SJung-uk Kimmy ($out,$inp,$index,$mask)=map("r$_",(0..3));
758e71b7053SJung-uk Kim$code.=<<___;
759e71b7053SJung-uk Kim@ void	ecp_nistz256_scatter_w5(void *r0,const P256_POINT *r1,
760e71b7053SJung-uk Kim@					 int r2);
761e71b7053SJung-uk Kim.globl	ecp_nistz256_scatter_w5
762e71b7053SJung-uk Kim.type	ecp_nistz256_scatter_w5,%function
763e71b7053SJung-uk Kim.align	5
764e71b7053SJung-uk Kimecp_nistz256_scatter_w5:
765e71b7053SJung-uk Kim	stmdb	sp!,{r4-r11}
766e71b7053SJung-uk Kim
767e71b7053SJung-uk Kim	add	$out,$out,$index,lsl#2
768e71b7053SJung-uk Kim
769e71b7053SJung-uk Kim	ldmia	$inp!,{r4-r11}		@ X
770e71b7053SJung-uk Kim	str	r4,[$out,#64*0-4]
771e71b7053SJung-uk Kim	str	r5,[$out,#64*1-4]
772e71b7053SJung-uk Kim	str	r6,[$out,#64*2-4]
773e71b7053SJung-uk Kim	str	r7,[$out,#64*3-4]
774e71b7053SJung-uk Kim	str	r8,[$out,#64*4-4]
775e71b7053SJung-uk Kim	str	r9,[$out,#64*5-4]
776e71b7053SJung-uk Kim	str	r10,[$out,#64*6-4]
777e71b7053SJung-uk Kim	str	r11,[$out,#64*7-4]
778e71b7053SJung-uk Kim	add	$out,$out,#64*8
779e71b7053SJung-uk Kim
780e71b7053SJung-uk Kim	ldmia	$inp!,{r4-r11}		@ Y
781e71b7053SJung-uk Kim	str	r4,[$out,#64*0-4]
782e71b7053SJung-uk Kim	str	r5,[$out,#64*1-4]
783e71b7053SJung-uk Kim	str	r6,[$out,#64*2-4]
784e71b7053SJung-uk Kim	str	r7,[$out,#64*3-4]
785e71b7053SJung-uk Kim	str	r8,[$out,#64*4-4]
786e71b7053SJung-uk Kim	str	r9,[$out,#64*5-4]
787e71b7053SJung-uk Kim	str	r10,[$out,#64*6-4]
788e71b7053SJung-uk Kim	str	r11,[$out,#64*7-4]
789e71b7053SJung-uk Kim	add	$out,$out,#64*8
790e71b7053SJung-uk Kim
791e71b7053SJung-uk Kim	ldmia	$inp,{r4-r11}		@ Z
792e71b7053SJung-uk Kim	str	r4,[$out,#64*0-4]
793e71b7053SJung-uk Kim	str	r5,[$out,#64*1-4]
794e71b7053SJung-uk Kim	str	r6,[$out,#64*2-4]
795e71b7053SJung-uk Kim	str	r7,[$out,#64*3-4]
796e71b7053SJung-uk Kim	str	r8,[$out,#64*4-4]
797e71b7053SJung-uk Kim	str	r9,[$out,#64*5-4]
798e71b7053SJung-uk Kim	str	r10,[$out,#64*6-4]
799e71b7053SJung-uk Kim	str	r11,[$out,#64*7-4]
800e71b7053SJung-uk Kim
801e71b7053SJung-uk Kim	ldmia	sp!,{r4-r11}
802e71b7053SJung-uk Kim#if __ARM_ARCH__>=5 || defined(__thumb__)
803e71b7053SJung-uk Kim	bx	lr
804e71b7053SJung-uk Kim#else
805e71b7053SJung-uk Kim	mov	pc,lr
806e71b7053SJung-uk Kim#endif
807e71b7053SJung-uk Kim.size	ecp_nistz256_scatter_w5,.-ecp_nistz256_scatter_w5
808e71b7053SJung-uk Kim
809e71b7053SJung-uk Kim@ void	ecp_nistz256_gather_w5(P256_POINT *r0,const void *r1,
810e71b7053SJung-uk Kim@					      int r2);
811e71b7053SJung-uk Kim.globl	ecp_nistz256_gather_w5
812e71b7053SJung-uk Kim.type	ecp_nistz256_gather_w5,%function
813e71b7053SJung-uk Kim.align	5
814e71b7053SJung-uk Kimecp_nistz256_gather_w5:
815e71b7053SJung-uk Kim	stmdb	sp!,{r4-r11}
816e71b7053SJung-uk Kim
817e71b7053SJung-uk Kim	cmp	$index,#0
818e71b7053SJung-uk Kim	mov	$mask,#0
819e71b7053SJung-uk Kim#ifdef	__thumb2__
820e71b7053SJung-uk Kim	itt	ne
821e71b7053SJung-uk Kim#endif
822e71b7053SJung-uk Kim	subne	$index,$index,#1
823e71b7053SJung-uk Kim	movne	$mask,#-1
824e71b7053SJung-uk Kim	add	$inp,$inp,$index,lsl#2
825e71b7053SJung-uk Kim
826e71b7053SJung-uk Kim	ldr	r4,[$inp,#64*0]
827e71b7053SJung-uk Kim	ldr	r5,[$inp,#64*1]
828e71b7053SJung-uk Kim	ldr	r6,[$inp,#64*2]
829e71b7053SJung-uk Kim	and	r4,r4,$mask
830e71b7053SJung-uk Kim	ldr	r7,[$inp,#64*3]
831e71b7053SJung-uk Kim	and	r5,r5,$mask
832e71b7053SJung-uk Kim	ldr	r8,[$inp,#64*4]
833e71b7053SJung-uk Kim	and	r6,r6,$mask
834e71b7053SJung-uk Kim	ldr	r9,[$inp,#64*5]
835e71b7053SJung-uk Kim	and	r7,r7,$mask
836e71b7053SJung-uk Kim	ldr	r10,[$inp,#64*6]
837e71b7053SJung-uk Kim	and	r8,r8,$mask
838e71b7053SJung-uk Kim	ldr	r11,[$inp,#64*7]
839e71b7053SJung-uk Kim	add	$inp,$inp,#64*8
840e71b7053SJung-uk Kim	and	r9,r9,$mask
841e71b7053SJung-uk Kim	and	r10,r10,$mask
842e71b7053SJung-uk Kim	and	r11,r11,$mask
843e71b7053SJung-uk Kim	stmia	$out!,{r4-r11}	@ X
844e71b7053SJung-uk Kim
845e71b7053SJung-uk Kim	ldr	r4,[$inp,#64*0]
846e71b7053SJung-uk Kim	ldr	r5,[$inp,#64*1]
847e71b7053SJung-uk Kim	ldr	r6,[$inp,#64*2]
848e71b7053SJung-uk Kim	and	r4,r4,$mask
849e71b7053SJung-uk Kim	ldr	r7,[$inp,#64*3]
850e71b7053SJung-uk Kim	and	r5,r5,$mask
851e71b7053SJung-uk Kim	ldr	r8,[$inp,#64*4]
852e71b7053SJung-uk Kim	and	r6,r6,$mask
853e71b7053SJung-uk Kim	ldr	r9,[$inp,#64*5]
854e71b7053SJung-uk Kim	and	r7,r7,$mask
855e71b7053SJung-uk Kim	ldr	r10,[$inp,#64*6]
856e71b7053SJung-uk Kim	and	r8,r8,$mask
857e71b7053SJung-uk Kim	ldr	r11,[$inp,#64*7]
858e71b7053SJung-uk Kim	add	$inp,$inp,#64*8
859e71b7053SJung-uk Kim	and	r9,r9,$mask
860e71b7053SJung-uk Kim	and	r10,r10,$mask
861e71b7053SJung-uk Kim	and	r11,r11,$mask
862e71b7053SJung-uk Kim	stmia	$out!,{r4-r11}	@ Y
863e71b7053SJung-uk Kim
864e71b7053SJung-uk Kim	ldr	r4,[$inp,#64*0]
865e71b7053SJung-uk Kim	ldr	r5,[$inp,#64*1]
866e71b7053SJung-uk Kim	ldr	r6,[$inp,#64*2]
867e71b7053SJung-uk Kim	and	r4,r4,$mask
868e71b7053SJung-uk Kim	ldr	r7,[$inp,#64*3]
869e71b7053SJung-uk Kim	and	r5,r5,$mask
870e71b7053SJung-uk Kim	ldr	r8,[$inp,#64*4]
871e71b7053SJung-uk Kim	and	r6,r6,$mask
872e71b7053SJung-uk Kim	ldr	r9,[$inp,#64*5]
873e71b7053SJung-uk Kim	and	r7,r7,$mask
874e71b7053SJung-uk Kim	ldr	r10,[$inp,#64*6]
875e71b7053SJung-uk Kim	and	r8,r8,$mask
876e71b7053SJung-uk Kim	ldr	r11,[$inp,#64*7]
877e71b7053SJung-uk Kim	and	r9,r9,$mask
878e71b7053SJung-uk Kim	and	r10,r10,$mask
879e71b7053SJung-uk Kim	and	r11,r11,$mask
880e71b7053SJung-uk Kim	stmia	$out,{r4-r11}		@ Z
881e71b7053SJung-uk Kim
882e71b7053SJung-uk Kim	ldmia	sp!,{r4-r11}
883e71b7053SJung-uk Kim#if __ARM_ARCH__>=5 || defined(__thumb__)
884e71b7053SJung-uk Kim	bx	lr
885e71b7053SJung-uk Kim#else
886e71b7053SJung-uk Kim	mov	pc,lr
887e71b7053SJung-uk Kim#endif
888e71b7053SJung-uk Kim.size	ecp_nistz256_gather_w5,.-ecp_nistz256_gather_w5
889e71b7053SJung-uk Kim
890e71b7053SJung-uk Kim@ void	ecp_nistz256_scatter_w7(void *r0,const P256_POINT_AFFINE *r1,
891e71b7053SJung-uk Kim@					 int r2);
892e71b7053SJung-uk Kim.globl	ecp_nistz256_scatter_w7
893e71b7053SJung-uk Kim.type	ecp_nistz256_scatter_w7,%function
894e71b7053SJung-uk Kim.align	5
895e71b7053SJung-uk Kimecp_nistz256_scatter_w7:
896e71b7053SJung-uk Kim	add	$out,$out,$index
897e71b7053SJung-uk Kim	mov	$index,#64/4
898e71b7053SJung-uk Kim.Loop_scatter_w7:
899e71b7053SJung-uk Kim	ldr	$mask,[$inp],#4
900e71b7053SJung-uk Kim	subs	$index,$index,#1
901e71b7053SJung-uk Kim	strb	$mask,[$out,#64*0]
902e71b7053SJung-uk Kim	mov	$mask,$mask,lsr#8
903e71b7053SJung-uk Kim	strb	$mask,[$out,#64*1]
904e71b7053SJung-uk Kim	mov	$mask,$mask,lsr#8
905e71b7053SJung-uk Kim	strb	$mask,[$out,#64*2]
906e71b7053SJung-uk Kim	mov	$mask,$mask,lsr#8
907e71b7053SJung-uk Kim	strb	$mask,[$out,#64*3]
908e71b7053SJung-uk Kim	add	$out,$out,#64*4
909e71b7053SJung-uk Kim	bne	.Loop_scatter_w7
910e71b7053SJung-uk Kim
911e71b7053SJung-uk Kim#if __ARM_ARCH__>=5 || defined(__thumb__)
912e71b7053SJung-uk Kim	bx	lr
913e71b7053SJung-uk Kim#else
914e71b7053SJung-uk Kim	mov	pc,lr
915e71b7053SJung-uk Kim#endif
916e71b7053SJung-uk Kim.size	ecp_nistz256_scatter_w7,.-ecp_nistz256_scatter_w7
917e71b7053SJung-uk Kim
918e71b7053SJung-uk Kim@ void	ecp_nistz256_gather_w7(P256_POINT_AFFINE *r0,const void *r1,
919e71b7053SJung-uk Kim@						     int r2);
920e71b7053SJung-uk Kim.globl	ecp_nistz256_gather_w7
921e71b7053SJung-uk Kim.type	ecp_nistz256_gather_w7,%function
922e71b7053SJung-uk Kim.align	5
923e71b7053SJung-uk Kimecp_nistz256_gather_w7:
924e71b7053SJung-uk Kim	stmdb	sp!,{r4-r7}
925e71b7053SJung-uk Kim
926e71b7053SJung-uk Kim	cmp	$index,#0
927e71b7053SJung-uk Kim	mov	$mask,#0
928e71b7053SJung-uk Kim#ifdef	__thumb2__
929e71b7053SJung-uk Kim	itt	ne
930e71b7053SJung-uk Kim#endif
931e71b7053SJung-uk Kim	subne	$index,$index,#1
932e71b7053SJung-uk Kim	movne	$mask,#-1
933e71b7053SJung-uk Kim	add	$inp,$inp,$index
934e71b7053SJung-uk Kim	mov	$index,#64/4
935e71b7053SJung-uk Kim	nop
936e71b7053SJung-uk Kim.Loop_gather_w7:
937e71b7053SJung-uk Kim	ldrb	r4,[$inp,#64*0]
938e71b7053SJung-uk Kim	subs	$index,$index,#1
939e71b7053SJung-uk Kim	ldrb	r5,[$inp,#64*1]
940e71b7053SJung-uk Kim	ldrb	r6,[$inp,#64*2]
941e71b7053SJung-uk Kim	ldrb	r7,[$inp,#64*3]
942e71b7053SJung-uk Kim	add	$inp,$inp,#64*4
943e71b7053SJung-uk Kim	orr	r4,r4,r5,lsl#8
944e71b7053SJung-uk Kim	orr	r4,r4,r6,lsl#16
945e71b7053SJung-uk Kim	orr	r4,r4,r7,lsl#24
946e71b7053SJung-uk Kim	and	r4,r4,$mask
947e71b7053SJung-uk Kim	str	r4,[$out],#4
948e71b7053SJung-uk Kim	bne	.Loop_gather_w7
949e71b7053SJung-uk Kim
950e71b7053SJung-uk Kim	ldmia	sp!,{r4-r7}
951e71b7053SJung-uk Kim#if __ARM_ARCH__>=5 || defined(__thumb__)
952e71b7053SJung-uk Kim	bx	lr
953e71b7053SJung-uk Kim#else
954e71b7053SJung-uk Kim	mov	pc,lr
955e71b7053SJung-uk Kim#endif
956e71b7053SJung-uk Kim.size	ecp_nistz256_gather_w7,.-ecp_nistz256_gather_w7
957e71b7053SJung-uk Kim___
958e71b7053SJung-uk Kim}
959e71b7053SJung-uk Kimif (0) {
960e71b7053SJung-uk Kim# In comparison to integer-only equivalent of below subroutine:
961e71b7053SJung-uk Kim#
962e71b7053SJung-uk Kim# Cortex-A8	+10%
963e71b7053SJung-uk Kim# Cortex-A9	-10%
964e71b7053SJung-uk Kim# Snapdragon S4	+5%
965e71b7053SJung-uk Kim#
966e71b7053SJung-uk Kim# As not all time is spent in multiplication, overall impact is deemed
967e71b7053SJung-uk Kim# too low to care about.
968e71b7053SJung-uk Kim
# NEON variant of the Montgomery multiplication,
# ecp_nistz256_mul_mont_neon.  The whole block is guarded by "if (0)",
# so nothing below is ever appended to $code; it is kept for reference.
#
# Register naming: $A0-$A3 are d0-d3 (the eight words of a[]), $Bi is
# d4, $zero is d5 and $temp is d6.  map() also produces "d7", which has
# no variable to land in and is discarded.  $mask is q4, $mult is q5
# (both saved/restored around the body) and @AxB are the eight
# accumulators q8-q15.  $rptr/$aptr/$bptr/$toutptr are r0-r3.
#
# Each 32-bit b[i] is zipped with $zero (vzip.16) so that its two
# 16-bit halves occupy separate 32-bit lanes before being multiplied
# by the words of a[].
#
# @AxB is rotated once per loop iteration (push/shift).  The
# accumulator that held the just-reduced lowest word therefore becomes
# @AxB[7], and that is why the last multiply of every iteration is
# vmull (overwrite) rather than vmlal (accumulate).
#
# Inside the loop the annotation "lower 32 bits of a[0]*b[0]" should be
# read as b[i]: $Bi is reloaded from $bptr at the top of each iteration.
#
# The "#lo"/"#hi" suffixes on q registers are presumably rewritten to
# d-register names by post-processing elsewhere in this file -- TODO
# confirm; that code is not part of this block.
#
# After the last reduction the result is converted back to 32-bit
# words and stored to a scratch area below sp (set up through
# $toutptr).  The integer tail then subtracts the modulus and adds it
# back masked by the borrow held in r9, storing the result to $rptr.
969e71b7053SJung-uk Kimmy ($A0,$A1,$A2,$A3,$Bi,$zero,$temp)=map("d$_",(0..7));
970e71b7053SJung-uk Kimmy $mask="q4";
971e71b7053SJung-uk Kimmy $mult="q5";
972e71b7053SJung-uk Kimmy @AxB=map("q$_",(8..15));
973e71b7053SJung-uk Kim
974e71b7053SJung-uk Kimmy ($rptr,$aptr,$bptr,$toutptr)=map("r$_",(0..3));
975e71b7053SJung-uk Kim
976e71b7053SJung-uk Kim$code.=<<___;
977e71b7053SJung-uk Kim#if __ARM_ARCH__>=7
978e71b7053SJung-uk Kim.fpu	neon
979e71b7053SJung-uk Kim
980e71b7053SJung-uk Kim.globl	ecp_nistz256_mul_mont_neon
981e71b7053SJung-uk Kim.type	ecp_nistz256_mul_mont_neon,%function
982e71b7053SJung-uk Kim.align	5
983e71b7053SJung-uk Kimecp_nistz256_mul_mont_neon:
984e71b7053SJung-uk Kim	mov	ip,sp
985e71b7053SJung-uk Kim	stmdb	sp!,{r4-r9}
986e71b7053SJung-uk Kim	vstmdb	sp!,{q4-q5}		@ ABI specification says so
987e71b7053SJung-uk Kim
988e71b7053SJung-uk Kim	sub		$toutptr,sp,#40
989e71b7053SJung-uk Kim	vld1.32		{${Bi}[0]},[$bptr,:32]!
990e71b7053SJung-uk Kim	veor		$zero,$zero,$zero
991e71b7053SJung-uk Kim	vld1.32		{$A0-$A3}, [$aptr]		@ can't specify :32 :-(
992e71b7053SJung-uk Kim	vzip.16		$Bi,$zero
993e71b7053SJung-uk Kim	mov		sp,$toutptr			@ alloca
994e71b7053SJung-uk Kim	vmov.i64	$mask,#0xffff
995e71b7053SJung-uk Kim
996e71b7053SJung-uk Kim	vmull.u32	@AxB[0],$Bi,${A0}[0]
997e71b7053SJung-uk Kim	vmull.u32	@AxB[1],$Bi,${A0}[1]
998e71b7053SJung-uk Kim	vmull.u32	@AxB[2],$Bi,${A1}[0]
999e71b7053SJung-uk Kim	vmull.u32	@AxB[3],$Bi,${A1}[1]
1000e71b7053SJung-uk Kim	 vshr.u64	$temp,@AxB[0]#lo,#16
1001e71b7053SJung-uk Kim	vmull.u32	@AxB[4],$Bi,${A2}[0]
1002e71b7053SJung-uk Kim	 vadd.u64	@AxB[0]#hi,@AxB[0]#hi,$temp
1003e71b7053SJung-uk Kim	vmull.u32	@AxB[5],$Bi,${A2}[1]
1004e71b7053SJung-uk Kim	 vshr.u64	$temp,@AxB[0]#hi,#16		@ upper 32 bits of a[0]*b[0]
1005e71b7053SJung-uk Kim	vmull.u32	@AxB[6],$Bi,${A3}[0]
1006e71b7053SJung-uk Kim	 vand.u64	@AxB[0],@AxB[0],$mask		@ lower 32 bits of a[0]*b[0]
1007e71b7053SJung-uk Kim	vmull.u32	@AxB[7],$Bi,${A3}[1]
1008e71b7053SJung-uk Kim___
1009e71b7053SJung-uk Kimfor($i=1;$i<8;$i++) {
1010e71b7053SJung-uk Kim$code.=<<___;
1011e71b7053SJung-uk Kim	 vld1.32	{${Bi}[0]},[$bptr,:32]!
1012e71b7053SJung-uk Kim	 veor		$zero,$zero,$zero
1013e71b7053SJung-uk Kim	vadd.u64	@AxB[1]#lo,@AxB[1]#lo,$temp	@ reduction
1014e71b7053SJung-uk Kim	vshl.u64	$mult,@AxB[0],#32
1015e71b7053SJung-uk Kim	vadd.u64	@AxB[3],@AxB[3],@AxB[0]
1016e71b7053SJung-uk Kim	vsub.u64	$mult,$mult,@AxB[0]
1017e71b7053SJung-uk Kim	 vzip.16	$Bi,$zero
1018e71b7053SJung-uk Kim	vadd.u64	@AxB[6],@AxB[6],@AxB[0]
1019e71b7053SJung-uk Kim	vadd.u64	@AxB[7],@AxB[7],$mult
1020e71b7053SJung-uk Kim___
1021e71b7053SJung-uk Kim	push(@AxB,shift(@AxB));
1022e71b7053SJung-uk Kim$code.=<<___;
1023e71b7053SJung-uk Kim	vmlal.u32	@AxB[0],$Bi,${A0}[0]
1024e71b7053SJung-uk Kim	vmlal.u32	@AxB[1],$Bi,${A0}[1]
1025e71b7053SJung-uk Kim	vmlal.u32	@AxB[2],$Bi,${A1}[0]
1026e71b7053SJung-uk Kim	vmlal.u32	@AxB[3],$Bi,${A1}[1]
1027e71b7053SJung-uk Kim	 vshr.u64	$temp,@AxB[0]#lo,#16
1028e71b7053SJung-uk Kim	vmlal.u32	@AxB[4],$Bi,${A2}[0]
1029e71b7053SJung-uk Kim	 vadd.u64	@AxB[0]#hi,@AxB[0]#hi,$temp
1030e71b7053SJung-uk Kim	vmlal.u32	@AxB[5],$Bi,${A2}[1]
1031e71b7053SJung-uk Kim	 vshr.u64	$temp,@AxB[0]#hi,#16		@ upper 33 bits of a[0]*b[i]+t[0]
1032e71b7053SJung-uk Kim	vmlal.u32	@AxB[6],$Bi,${A3}[0]
1033e71b7053SJung-uk Kim	 vand.u64	@AxB[0],@AxB[0],$mask		@ lower 32 bits of a[0]*b[0]
1034e71b7053SJung-uk Kim	vmull.u32	@AxB[7],$Bi,${A3}[1]
1035e71b7053SJung-uk Kim___
1036e71b7053SJung-uk Kim}
1037e71b7053SJung-uk Kim$code.=<<___;
1038e71b7053SJung-uk Kim	vadd.u64	@AxB[1]#lo,@AxB[1]#lo,$temp	@ last reduction
1039e71b7053SJung-uk Kim	vshl.u64	$mult,@AxB[0],#32
1040e71b7053SJung-uk Kim	vadd.u64	@AxB[3],@AxB[3],@AxB[0]
1041e71b7053SJung-uk Kim	vsub.u64	$mult,$mult,@AxB[0]
1042e71b7053SJung-uk Kim	vadd.u64	@AxB[6],@AxB[6],@AxB[0]
1043e71b7053SJung-uk Kim	vadd.u64	@AxB[7],@AxB[7],$mult
1044e71b7053SJung-uk Kim
1045e71b7053SJung-uk Kim	vshr.u64	$temp,@AxB[1]#lo,#16		@ convert
1046e71b7053SJung-uk Kim	vadd.u64	@AxB[1]#hi,@AxB[1]#hi,$temp
1047e71b7053SJung-uk Kim	vshr.u64	$temp,@AxB[1]#hi,#16
1048e71b7053SJung-uk Kim	vzip.16		@AxB[1]#lo,@AxB[1]#hi
1049e71b7053SJung-uk Kim___
1050e71b7053SJung-uk Kimforeach (2..7) {
1051e71b7053SJung-uk Kim$code.=<<___;
1052e71b7053SJung-uk Kim	vadd.u64	@AxB[$_]#lo,@AxB[$_]#lo,$temp
1053e71b7053SJung-uk Kim	vst1.32		{@AxB[$_-1]#lo[0]},[$toutptr,:32]!
1054e71b7053SJung-uk Kim	vshr.u64	$temp,@AxB[$_]#lo,#16
1055e71b7053SJung-uk Kim	vadd.u64	@AxB[$_]#hi,@AxB[$_]#hi,$temp
1056e71b7053SJung-uk Kim	vshr.u64	$temp,@AxB[$_]#hi,#16
1057e71b7053SJung-uk Kim	vzip.16		@AxB[$_]#lo,@AxB[$_]#hi
1058e71b7053SJung-uk Kim___
1059e71b7053SJung-uk Kim}
1060e71b7053SJung-uk Kim$code.=<<___;
1061e71b7053SJung-uk Kim	vst1.32		{@AxB[7]#lo[0]},[$toutptr,:32]!
1062e71b7053SJung-uk Kim	vst1.32		{$temp},[$toutptr]		@ upper 33 bits
1063e71b7053SJung-uk Kim
1064e71b7053SJung-uk Kim	ldr	r1,[sp,#0]
1065e71b7053SJung-uk Kim	ldr	r2,[sp,#4]
1066e71b7053SJung-uk Kim	ldr	r3,[sp,#8]
1067e71b7053SJung-uk Kim	subs	r1,r1,#-1
1068e71b7053SJung-uk Kim	ldr	r4,[sp,#12]
1069e71b7053SJung-uk Kim	sbcs	r2,r2,#-1
1070e71b7053SJung-uk Kim	ldr	r5,[sp,#16]
1071e71b7053SJung-uk Kim	sbcs	r3,r3,#-1
1072e71b7053SJung-uk Kim	ldr	r6,[sp,#20]
1073e71b7053SJung-uk Kim	sbcs	r4,r4,#0
1074e71b7053SJung-uk Kim	ldr	r7,[sp,#24]
1075e71b7053SJung-uk Kim	sbcs	r5,r5,#0
1076e71b7053SJung-uk Kim	ldr	r8,[sp,#28]
1077e71b7053SJung-uk Kim	sbcs	r6,r6,#0
1078e71b7053SJung-uk Kim	ldr	r9,[sp,#32]				@ top-most bit
1079e71b7053SJung-uk Kim	sbcs	r7,r7,#1
1080e71b7053SJung-uk Kim	sub	sp,ip,#40+16
1081e71b7053SJung-uk Kim	sbcs	r8,r8,#-1
1082e71b7053SJung-uk Kim	sbc	r9,r9,#0
1083e71b7053SJung-uk Kim        vldmia  sp!,{q4-q5}
1084e71b7053SJung-uk Kim
1085e71b7053SJung-uk Kim	adds	r1,r1,r9
1086e71b7053SJung-uk Kim	adcs	r2,r2,r9
1087e71b7053SJung-uk Kim	str	r1,[$rptr,#0]
1088e71b7053SJung-uk Kim	adcs	r3,r3,r9
1089e71b7053SJung-uk Kim	str	r2,[$rptr,#4]
1090e71b7053SJung-uk Kim	adcs	r4,r4,#0
1091e71b7053SJung-uk Kim	str	r3,[$rptr,#8]
1092e71b7053SJung-uk Kim	adcs	r5,r5,#0
1093e71b7053SJung-uk Kim	str	r4,[$rptr,#12]
1094e71b7053SJung-uk Kim	adcs	r6,r6,#0
1095e71b7053SJung-uk Kim	str	r5,[$rptr,#16]
1096e71b7053SJung-uk Kim	adcs	r7,r7,r9,lsr#31
1097e71b7053SJung-uk Kim	str	r6,[$rptr,#20]
1098e71b7053SJung-uk Kim	adcs	r8,r8,r9
1099e71b7053SJung-uk Kim	str	r7,[$rptr,#24]
1100e71b7053SJung-uk Kim	str	r8,[$rptr,#28]
1101e71b7053SJung-uk Kim
1102e71b7053SJung-uk Kim        ldmia   sp!,{r4-r9}
1103e71b7053SJung-uk Kim	bx	lr
1104e71b7053SJung-uk Kim.size	ecp_nistz256_mul_mont_neon,.-ecp_nistz256_mul_mont_neon
1105e71b7053SJung-uk Kim#endif
1106e71b7053SJung-uk Kim___
1107e71b7053SJung-uk Kim}
1108e71b7053SJung-uk Kim
1109e71b7053SJung-uk Kim{{{
1110e71b7053SJung-uk Kim########################################################################
1111e71b7053SJung-uk Kim# Below $aN assignment matches order in which 256-bit result appears in
1112e71b7053SJung-uk Kim# register bank at return from __ecp_nistz256_mul_mont, so that we can
1113e71b7053SJung-uk Kim# skip over reloading it from memory. This means that below functions
1114e71b7053SJung-uk Kim# use custom calling sequence accepting 256-bit input in registers,
1115e71b7053SJung-uk Kim# output pointer in r0, $r_ptr, and optional pointer in r2, $b_ptr.
1116e71b7053SJung-uk Kim#
1117e71b7053SJung-uk Kim# See their "normal" counterparts for insights on calculations.
1118e71b7053SJung-uk Kim
# Register map for the in-register helpers emitted below:
#   $a0..$a7 = r11,r3,r4,r5,r6,r7,r8,r9   256-bit operand and result
#   $t0..$t3 = r10,r12,r14,r1             scratch
# $t2 is r14, i.e. lr, which is why __ecp_nistz256_sub_from and
# __ecp_nistz256_sub_morf push lr on entry and pop it once the last
# scratch value has been consumed.  $ff shares its register with
# $b_ptr: after the final load through $b_ptr the register is reused
# for the borrow mask.
#
# __ecp_nistz256_sub_from: a[0:7] -= mem[$b_ptr].
# __ecp_nistz256_sub_morf: a[0:7]  = mem[$b_ptr] - a[0:7].
#   In both, "sbc $ff,$ff,$ff" turns the final borrow into 0 or -1 and
#   the modulus is added back under that mask.
# __ecp_nistz256_add_self: a[0:7] += a[0:7], then the modulus is
#   subtracted and conditionally added back; it reads no memory and
#   does not use $t0-$t3.
#
# The modulus is never loaded from memory.  Its words, least
# significant first, are -1,-1,-1,0,0,0,1,-1 (see the subs/sbcs
# immediates in add_self), so the conditional add-back uses $ff for the
# -1 words, #0 for the zero words and "$ff,lsr#31" for the word that
# equals 1.
#
# All three store the result through $r_ptr, leave it in $a0-$a7 and
# return with "mov pc,lr".
1119e71b7053SJung-uk Kimmy ($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7,
1120e71b7053SJung-uk Kim    $t0,$t1,$t2,$t3)=map("r$_",(11,3..10,12,14,1));
1121e71b7053SJung-uk Kimmy $ff=$b_ptr;
1122e71b7053SJung-uk Kim
1123e71b7053SJung-uk Kim$code.=<<___;
1124e71b7053SJung-uk Kim.type	__ecp_nistz256_sub_from,%function
1125e71b7053SJung-uk Kim.align	5
1126e71b7053SJung-uk Kim__ecp_nistz256_sub_from:
1127e71b7053SJung-uk Kim	str	lr,[sp,#-4]!		@ push lr
1128e71b7053SJung-uk Kim
1129e71b7053SJung-uk Kim	 ldr	$t0,[$b_ptr,#0]
1130e71b7053SJung-uk Kim	 ldr	$t1,[$b_ptr,#4]
1131e71b7053SJung-uk Kim	 ldr	$t2,[$b_ptr,#8]
1132e71b7053SJung-uk Kim	 ldr	$t3,[$b_ptr,#12]
1133e71b7053SJung-uk Kim	subs	$a0,$a0,$t0
1134e71b7053SJung-uk Kim	 ldr	$t0,[$b_ptr,#16]
1135e71b7053SJung-uk Kim	sbcs	$a1,$a1,$t1
1136e71b7053SJung-uk Kim	 ldr	$t1,[$b_ptr,#20]
1137e71b7053SJung-uk Kim	sbcs	$a2,$a2,$t2
1138e71b7053SJung-uk Kim	 ldr	$t2,[$b_ptr,#24]
1139e71b7053SJung-uk Kim	sbcs	$a3,$a3,$t3
1140e71b7053SJung-uk Kim	 ldr	$t3,[$b_ptr,#28]
1141e71b7053SJung-uk Kim	sbcs	$a4,$a4,$t0
1142e71b7053SJung-uk Kim	sbcs	$a5,$a5,$t1
1143e71b7053SJung-uk Kim	sbcs	$a6,$a6,$t2
1144e71b7053SJung-uk Kim	sbcs	$a7,$a7,$t3
1145e71b7053SJung-uk Kim	sbc	$ff,$ff,$ff		@ broadcast borrow bit
1146e71b7053SJung-uk Kim	ldr	lr,[sp],#4		@ pop lr
1147e71b7053SJung-uk Kim
1148e71b7053SJung-uk Kim	adds	$a0,$a0,$ff		@ add synthesized modulus
1149e71b7053SJung-uk Kim	adcs	$a1,$a1,$ff
1150e71b7053SJung-uk Kim	str	$a0,[$r_ptr,#0]
1151e71b7053SJung-uk Kim	adcs	$a2,$a2,$ff
1152e71b7053SJung-uk Kim	str	$a1,[$r_ptr,#4]
1153e71b7053SJung-uk Kim	adcs	$a3,$a3,#0
1154e71b7053SJung-uk Kim	str	$a2,[$r_ptr,#8]
1155e71b7053SJung-uk Kim	adcs	$a4,$a4,#0
1156e71b7053SJung-uk Kim	str	$a3,[$r_ptr,#12]
1157e71b7053SJung-uk Kim	adcs	$a5,$a5,#0
1158e71b7053SJung-uk Kim	str	$a4,[$r_ptr,#16]
1159e71b7053SJung-uk Kim	adcs	$a6,$a6,$ff,lsr#31
1160e71b7053SJung-uk Kim	str	$a5,[$r_ptr,#20]
1161e71b7053SJung-uk Kim	adcs	$a7,$a7,$ff
1162e71b7053SJung-uk Kim	str	$a6,[$r_ptr,#24]
1163e71b7053SJung-uk Kim	str	$a7,[$r_ptr,#28]
1164e71b7053SJung-uk Kim
1165e71b7053SJung-uk Kim	mov	pc,lr
1166e71b7053SJung-uk Kim.size	__ecp_nistz256_sub_from,.-__ecp_nistz256_sub_from
1167e71b7053SJung-uk Kim
1168e71b7053SJung-uk Kim.type	__ecp_nistz256_sub_morf,%function
1169e71b7053SJung-uk Kim.align	5
1170e71b7053SJung-uk Kim__ecp_nistz256_sub_morf:
1171e71b7053SJung-uk Kim	str	lr,[sp,#-4]!		@ push lr
1172e71b7053SJung-uk Kim
1173e71b7053SJung-uk Kim	 ldr	$t0,[$b_ptr,#0]
1174e71b7053SJung-uk Kim	 ldr	$t1,[$b_ptr,#4]
1175e71b7053SJung-uk Kim	 ldr	$t2,[$b_ptr,#8]
1176e71b7053SJung-uk Kim	 ldr	$t3,[$b_ptr,#12]
1177e71b7053SJung-uk Kim	subs	$a0,$t0,$a0
1178e71b7053SJung-uk Kim	 ldr	$t0,[$b_ptr,#16]
1179e71b7053SJung-uk Kim	sbcs	$a1,$t1,$a1
1180e71b7053SJung-uk Kim	 ldr	$t1,[$b_ptr,#20]
1181e71b7053SJung-uk Kim	sbcs	$a2,$t2,$a2
1182e71b7053SJung-uk Kim	 ldr	$t2,[$b_ptr,#24]
1183e71b7053SJung-uk Kim	sbcs	$a3,$t3,$a3
1184e71b7053SJung-uk Kim	 ldr	$t3,[$b_ptr,#28]
1185e71b7053SJung-uk Kim	sbcs	$a4,$t0,$a4
1186e71b7053SJung-uk Kim	sbcs	$a5,$t1,$a5
1187e71b7053SJung-uk Kim	sbcs	$a6,$t2,$a6
1188e71b7053SJung-uk Kim	sbcs	$a7,$t3,$a7
1189e71b7053SJung-uk Kim	sbc	$ff,$ff,$ff		@ broadcast borrow bit
1190e71b7053SJung-uk Kim	ldr	lr,[sp],#4		@ pop lr
1191e71b7053SJung-uk Kim
1192e71b7053SJung-uk Kim	adds	$a0,$a0,$ff		@ add synthesized modulus
1193e71b7053SJung-uk Kim	adcs	$a1,$a1,$ff
1194e71b7053SJung-uk Kim	str	$a0,[$r_ptr,#0]
1195e71b7053SJung-uk Kim	adcs	$a2,$a2,$ff
1196e71b7053SJung-uk Kim	str	$a1,[$r_ptr,#4]
1197e71b7053SJung-uk Kim	adcs	$a3,$a3,#0
1198e71b7053SJung-uk Kim	str	$a2,[$r_ptr,#8]
1199e71b7053SJung-uk Kim	adcs	$a4,$a4,#0
1200e71b7053SJung-uk Kim	str	$a3,[$r_ptr,#12]
1201e71b7053SJung-uk Kim	adcs	$a5,$a5,#0
1202e71b7053SJung-uk Kim	str	$a4,[$r_ptr,#16]
1203e71b7053SJung-uk Kim	adcs	$a6,$a6,$ff,lsr#31
1204e71b7053SJung-uk Kim	str	$a5,[$r_ptr,#20]
1205e71b7053SJung-uk Kim	adcs	$a7,$a7,$ff
1206e71b7053SJung-uk Kim	str	$a6,[$r_ptr,#24]
1207e71b7053SJung-uk Kim	str	$a7,[$r_ptr,#28]
1208e71b7053SJung-uk Kim
1209e71b7053SJung-uk Kim	mov	pc,lr
1210e71b7053SJung-uk Kim.size	__ecp_nistz256_sub_morf,.-__ecp_nistz256_sub_morf
1211e71b7053SJung-uk Kim
1212e71b7053SJung-uk Kim.type	__ecp_nistz256_add_self,%function
1213e71b7053SJung-uk Kim.align	4
1214e71b7053SJung-uk Kim__ecp_nistz256_add_self:
1215e71b7053SJung-uk Kim	adds	$a0,$a0,$a0		@ a[0:7]+=a[0:7]
1216e71b7053SJung-uk Kim	adcs	$a1,$a1,$a1
1217e71b7053SJung-uk Kim	adcs	$a2,$a2,$a2
1218e71b7053SJung-uk Kim	adcs	$a3,$a3,$a3
1219e71b7053SJung-uk Kim	adcs	$a4,$a4,$a4
1220e71b7053SJung-uk Kim	adcs	$a5,$a5,$a5
1221e71b7053SJung-uk Kim	adcs	$a6,$a6,$a6
1222e71b7053SJung-uk Kim	mov	$ff,#0
1223e71b7053SJung-uk Kim	adcs	$a7,$a7,$a7
1224e71b7053SJung-uk Kim	adc	$ff,$ff,#0
1225e71b7053SJung-uk Kim
1226e71b7053SJung-uk Kim	@ if a+b >= modulus, subtract modulus.
1227e71b7053SJung-uk Kim	@
1228e71b7053SJung-uk Kim	@ But since comparison implies subtraction, we subtract
1229e71b7053SJung-uk Kim	@ modulus and then add it back if subtraction borrowed.
1230e71b7053SJung-uk Kim
1231e71b7053SJung-uk Kim	subs	$a0,$a0,#-1
1232e71b7053SJung-uk Kim	sbcs	$a1,$a1,#-1
1233e71b7053SJung-uk Kim	sbcs	$a2,$a2,#-1
1234e71b7053SJung-uk Kim	sbcs	$a3,$a3,#0
1235e71b7053SJung-uk Kim	sbcs	$a4,$a4,#0
1236e71b7053SJung-uk Kim	sbcs	$a5,$a5,#0
1237e71b7053SJung-uk Kim	sbcs	$a6,$a6,#1
1238e71b7053SJung-uk Kim	sbcs	$a7,$a7,#-1
1239e71b7053SJung-uk Kim	sbc	$ff,$ff,#0
1240e71b7053SJung-uk Kim
1241e71b7053SJung-uk Kim	@ Note that because mod has special form, i.e. consists of
1242e71b7053SJung-uk Kim	@ 0xffffffff, 1 and 0s, we can conditionally synthesize it by
1243e71b7053SJung-uk Kim	@ using value of borrow as a whole or extracting single bit.
1244e71b7053SJung-uk Kim	@ Follow $ff register...
1245e71b7053SJung-uk Kim
1246e71b7053SJung-uk Kim	adds	$a0,$a0,$ff		@ add synthesized modulus
1247e71b7053SJung-uk Kim	adcs	$a1,$a1,$ff
1248e71b7053SJung-uk Kim	str	$a0,[$r_ptr,#0]
1249e71b7053SJung-uk Kim	adcs	$a2,$a2,$ff
1250e71b7053SJung-uk Kim	str	$a1,[$r_ptr,#4]
1251e71b7053SJung-uk Kim	adcs	$a3,$a3,#0
1252e71b7053SJung-uk Kim	str	$a2,[$r_ptr,#8]
1253e71b7053SJung-uk Kim	adcs	$a4,$a4,#0
1254e71b7053SJung-uk Kim	str	$a3,[$r_ptr,#12]
1255e71b7053SJung-uk Kim	adcs	$a5,$a5,#0
1256e71b7053SJung-uk Kim	str	$a4,[$r_ptr,#16]
1257e71b7053SJung-uk Kim	adcs	$a6,$a6,$ff,lsr#31
1258e71b7053SJung-uk Kim	str	$a5,[$r_ptr,#20]
1259e71b7053SJung-uk Kim	adcs	$a7,$a7,$ff
1260e71b7053SJung-uk Kim	str	$a6,[$r_ptr,#24]
1261e71b7053SJung-uk Kim	str	$a7,[$r_ptr,#28]
1262e71b7053SJung-uk Kim
1263e71b7053SJung-uk Kim	mov	pc,lr
1264e71b7053SJung-uk Kim.size	__ecp_nistz256_add_self,.-__ecp_nistz256_add_self
1265e71b7053SJung-uk Kim
1266e71b7053SJung-uk Kim___
1267e71b7053SJung-uk Kim
1268e71b7053SJung-uk Kim########################################################################
1269e71b7053SJung-uk Kim# following subroutines are "literal" implementation of those found in
1270e71b7053SJung-uk Kim# ecp_nistz256.c
1271e71b7053SJung-uk Kim#
1272e71b7053SJung-uk Kim########################################################################
1273e71b7053SJung-uk Kim# void ecp_nistz256_point_double(P256_POINT *out,const P256_POINT *inp);
1274e71b7053SJung-uk Kim#
1275e71b7053SJung-uk Kim{
1276e71b7053SJung-uk Kimmy ($S,$M,$Zsqr,$in_x,$tmp0)=map(32*$_,(0..4));
1277e71b7053SJung-uk Kim# above map() describes stack layout with 5 temporary
1278e71b7053SJung-uk Kim# 256-bit vectors on top. Then note that we push
1279e71b7053SJung-uk Kim# starting from r0, which means that we have copy of
1280e71b7053SJung-uk Kim# input arguments just below these temporary vectors.
1281e71b7053SJung-uk Kim
# Emit ecp_nistz256_point_double.
#
# The generated routine pushes r0-r12,lr, so the caller's pointer
# arguments stay readable just past the local frame (they are reloaded
# from sp+32*5 and sp+32*5+4 below), reserves 32*5 bytes for the
# temporaries addressed through $S, $M, $Zsqr, $in_x and $tmp0, and then
# performs the doubling as the sequence of field operations named in the
# trailing "@ p256_...()" comments.
#
# .Lpoint_double_shortcut is not only a local label: the point_add code
# branches to it (after shrinking its own, larger frame) when it detects
# that both of its inputs are the same point.
#
# NOTE(review): the $S/$M/$Zsqr/$in_x/$tmp0 offsets and the
# $r_ptr/$a_ptr/$b_ptr register assignments are declared earlier in the
# file; confirm against those declarations before changing any offset.
1282e71b7053SJung-uk Kim$code.=<<___;
1283e71b7053SJung-uk Kim.globl	ecp_nistz256_point_double
1284e71b7053SJung-uk Kim.type	ecp_nistz256_point_double,%function
1285e71b7053SJung-uk Kim.align	5
1286e71b7053SJung-uk Kimecp_nistz256_point_double:
1287e71b7053SJung-uk Kim	stmdb	sp!,{r0-r12,lr}		@ push from r0, unusual, but intentional
1288e71b7053SJung-uk Kim	sub	sp,sp,#32*5
1289e71b7053SJung-uk Kim
1290e71b7053SJung-uk Kim.Lpoint_double_shortcut:
1291e71b7053SJung-uk Kim	add	r3,sp,#$in_x
1292e71b7053SJung-uk Kim	ldmia	$a_ptr!,{r4-r11}	@ copy in_x
1293e71b7053SJung-uk Kim	stmia	r3,{r4-r11}
1294e71b7053SJung-uk Kim
1295e71b7053SJung-uk Kim	add	$r_ptr,sp,#$S
1296e71b7053SJung-uk Kim	bl	__ecp_nistz256_mul_by_2	@ p256_mul_by_2(S, in_y);
1297e71b7053SJung-uk Kim
1298e71b7053SJung-uk Kim	add	$b_ptr,$a_ptr,#32
1299e71b7053SJung-uk Kim	add	$a_ptr,$a_ptr,#32
1300e71b7053SJung-uk Kim	add	$r_ptr,sp,#$Zsqr
1301e71b7053SJung-uk Kim	bl	__ecp_nistz256_mul_mont	@ p256_sqr_mont(Zsqr, in_z);
1302e71b7053SJung-uk Kim
1303e71b7053SJung-uk Kim	add	$a_ptr,sp,#$S
1304e71b7053SJung-uk Kim	add	$b_ptr,sp,#$S
1305e71b7053SJung-uk Kim	add	$r_ptr,sp,#$S
1306e71b7053SJung-uk Kim	bl	__ecp_nistz256_mul_mont	@ p256_sqr_mont(S, S);
1307e71b7053SJung-uk Kim
1308e71b7053SJung-uk Kim	ldr	$b_ptr,[sp,#32*5+4]
1309e71b7053SJung-uk Kim	add	$a_ptr,$b_ptr,#32
1310e71b7053SJung-uk Kim	add	$b_ptr,$b_ptr,#64
1311e71b7053SJung-uk Kim	add	$r_ptr,sp,#$tmp0
1312e71b7053SJung-uk Kim	bl	__ecp_nistz256_mul_mont	@ p256_mul_mont(tmp0, in_z, in_y);
1313e71b7053SJung-uk Kim
1314e71b7053SJung-uk Kim	ldr	$r_ptr,[sp,#32*5]
1315e71b7053SJung-uk Kim	add	$r_ptr,$r_ptr,#64
1316e71b7053SJung-uk Kim	bl	__ecp_nistz256_add_self	@ p256_mul_by_2(res_z, tmp0);
1317e71b7053SJung-uk Kim
1318e71b7053SJung-uk Kim	add	$a_ptr,sp,#$in_x
1319e71b7053SJung-uk Kim	add	$b_ptr,sp,#$Zsqr
1320e71b7053SJung-uk Kim	add	$r_ptr,sp,#$M
1321e71b7053SJung-uk Kim	bl	__ecp_nistz256_add	@ p256_add(M, in_x, Zsqr);
1322e71b7053SJung-uk Kim
1323e71b7053SJung-uk Kim	add	$a_ptr,sp,#$in_x
1324e71b7053SJung-uk Kim	add	$b_ptr,sp,#$Zsqr
1325e71b7053SJung-uk Kim	add	$r_ptr,sp,#$Zsqr
1326e71b7053SJung-uk Kim	bl	__ecp_nistz256_sub	@ p256_sub(Zsqr, in_x, Zsqr);
1327e71b7053SJung-uk Kim
1328e71b7053SJung-uk Kim	add	$a_ptr,sp,#$S
1329e71b7053SJung-uk Kim	add	$b_ptr,sp,#$S
1330e71b7053SJung-uk Kim	add	$r_ptr,sp,#$tmp0
1331e71b7053SJung-uk Kim	bl	__ecp_nistz256_mul_mont	@ p256_sqr_mont(tmp0, S);
1332e71b7053SJung-uk Kim
1333e71b7053SJung-uk Kim	add	$a_ptr,sp,#$Zsqr
1334e71b7053SJung-uk Kim	add	$b_ptr,sp,#$M
1335e71b7053SJung-uk Kim	add	$r_ptr,sp,#$M
1336e71b7053SJung-uk Kim	bl	__ecp_nistz256_mul_mont	@ p256_mul_mont(M, M, Zsqr);
1337e71b7053SJung-uk Kim
1338e71b7053SJung-uk Kim	ldr	$r_ptr,[sp,#32*5]
1339e71b7053SJung-uk Kim	add	$a_ptr,sp,#$tmp0
1340e71b7053SJung-uk Kim	add	$r_ptr,$r_ptr,#32
1341e71b7053SJung-uk Kim	bl	__ecp_nistz256_div_by_2	@ p256_div_by_2(res_y, tmp0);
1342e71b7053SJung-uk Kim
1343e71b7053SJung-uk Kim	add	$a_ptr,sp,#$M
1344e71b7053SJung-uk Kim	add	$r_ptr,sp,#$M
1345e71b7053SJung-uk Kim	bl	__ecp_nistz256_mul_by_3	@ p256_mul_by_3(M, M);
1346e71b7053SJung-uk Kim
1347e71b7053SJung-uk Kim	add	$a_ptr,sp,#$in_x
1348e71b7053SJung-uk Kim	add	$b_ptr,sp,#$S
1349e71b7053SJung-uk Kim	add	$r_ptr,sp,#$S
1350e71b7053SJung-uk Kim	bl	__ecp_nistz256_mul_mont	@ p256_mul_mont(S, S, in_x);
1351e71b7053SJung-uk Kim
1352e71b7053SJung-uk Kim	add	$r_ptr,sp,#$tmp0
1353e71b7053SJung-uk Kim	bl	__ecp_nistz256_add_self	@ p256_mul_by_2(tmp0, S);
1354e71b7053SJung-uk Kim
1355e71b7053SJung-uk Kim	ldr	$r_ptr,[sp,#32*5]
1356e71b7053SJung-uk Kim	add	$a_ptr,sp,#$M
1357e71b7053SJung-uk Kim	add	$b_ptr,sp,#$M
1358e71b7053SJung-uk Kim	bl	__ecp_nistz256_mul_mont	@ p256_sqr_mont(res_x, M);
1359e71b7053SJung-uk Kim
1360e71b7053SJung-uk Kim	add	$b_ptr,sp,#$tmp0
1361e71b7053SJung-uk Kim	bl	__ecp_nistz256_sub_from	@ p256_sub(res_x, res_x, tmp0);
1362e71b7053SJung-uk Kim
1363e71b7053SJung-uk Kim	add	$b_ptr,sp,#$S
1364e71b7053SJung-uk Kim	add	$r_ptr,sp,#$S
1365e71b7053SJung-uk Kim	bl	__ecp_nistz256_sub_morf	@ p256_sub(S, S, res_x);
1366e71b7053SJung-uk Kim
1367e71b7053SJung-uk Kim	add	$a_ptr,sp,#$M
1368e71b7053SJung-uk Kim	add	$b_ptr,sp,#$S
1369e71b7053SJung-uk Kim	bl	__ecp_nistz256_mul_mont	@ p256_mul_mont(S, S, M);
1370e71b7053SJung-uk Kim
1371e71b7053SJung-uk Kim	ldr	$r_ptr,[sp,#32*5]
1372e71b7053SJung-uk Kim	add	$b_ptr,$r_ptr,#32
1373e71b7053SJung-uk Kim	add	$r_ptr,$r_ptr,#32
1374e71b7053SJung-uk Kim	bl	__ecp_nistz256_sub_from	@ p256_sub(res_y, S, res_y);
1375e71b7053SJung-uk Kim
1376e71b7053SJung-uk Kim	add	sp,sp,#32*5+16		@ +16 means "skip even over saved r0-r3"
1377e71b7053SJung-uk Kim#if __ARM_ARCH__>=5 || !defined(__thumb__)
1378e71b7053SJung-uk Kim	ldmia	sp!,{r4-r12,pc}
1379e71b7053SJung-uk Kim#else
1380e71b7053SJung-uk Kim	ldmia	sp!,{r4-r12,lr}
1381e71b7053SJung-uk Kim	bx	lr			@ interoperable with Thumb ISA:-)
1382e71b7053SJung-uk Kim#endif
1383e71b7053SJung-uk Kim.size	ecp_nistz256_point_double,.-ecp_nistz256_point_double
1384e71b7053SJung-uk Kim___
1385e71b7053SJung-uk Kim}
1386e71b7053SJung-uk Kim
1387e71b7053SJung-uk Kim########################################################################
1388e71b7053SJung-uk Kim# void ecp_nistz256_point_add(P256_POINT *out,const P256_POINT *in1,
1389e71b7053SJung-uk Kim#			      const P256_POINT *in2);
1390e71b7053SJung-uk Kim{
1391e71b7053SJung-uk Kimmy ($res_x,$res_y,$res_z,
1392e71b7053SJung-uk Kim    $in1_x,$in1_y,$in1_z,
1393e71b7053SJung-uk Kim    $in2_x,$in2_y,$in2_z,
1394e71b7053SJung-uk Kim    $H,$Hsqr,$R,$Rsqr,$Hcub,
1395e71b7053SJung-uk Kim    $U1,$U2,$S1,$S2)=map(32*$_,(0..17));
1396e71b7053SJung-uk Kimmy ($Z1sqr, $Z2sqr) = ($Hsqr, $Rsqr);
1397e71b7053SJung-uk Kim# above map() describes stack layout with 18 temporary
1398e71b7053SJung-uk Kim# 256-bit vectors on top, followed by 16 bytes of scratch.
1399e71b7053SJung-uk Kim# We push starting from r0, which means that we have copy of
1400e71b7053SJung-uk Kim# input arguments just below that scratch area. Three
140117f01e99SJung-uk Kim# scratch words hold ~in1infty, ~in2infty and
1402e71b7053SJung-uk Kim# result of check for zero.
1403e71b7053SJung-uk Kim

# Emit ecp_nistz256_point_add. The generated code:
#  - copies in2 and then in1 into the frame; while copying each Z it ORs
#    the eight words together and turns the result into an all-ones/zero
#    mask (~in2infty at sp+32*18+8, ~in1infty at sp+32*18+4);
#  - computes R = S2-S1 and H = U2-U1, keeping the OR of R's words at
#    sp+32*18+12;
#  - ORs together "H != 0", in1infty, in2infty and "R != 0"; if that is
#    zero (same point, neither at infinity) it falls into .Ladd_double,
#    which reloads the second saved argument (in1 per the prototype above)
#    from sp+32*18+20, shrinks the frame to the point_double size and
#    jumps to .Lpoint_double_shortcut; otherwise it continues at
#    .Ladd_proceed with the general addition.
# $Z1sqr/$Z2sqr alias the $Hsqr/$Rsqr slots: the squares of Z are consumed
# before Hsqr and Rsqr are first written in .Ladd_proceed.
1404e71b7053SJung-uk Kim$code.=<<___;
1405e71b7053SJung-uk Kim.globl	ecp_nistz256_point_add
1406e71b7053SJung-uk Kim.type	ecp_nistz256_point_add,%function
1407e71b7053SJung-uk Kim.align	5
1408e71b7053SJung-uk Kimecp_nistz256_point_add:
1409e71b7053SJung-uk Kim	stmdb	sp!,{r0-r12,lr}		@ push from r0, unusual, but intentional
1410e71b7053SJung-uk Kim	sub	sp,sp,#32*18+16
1411e71b7053SJung-uk Kim
1412e71b7053SJung-uk Kim	ldmia	$b_ptr!,{r4-r11}	@ copy in2_x
1413e71b7053SJung-uk Kim	add	r3,sp,#$in2_x
1414e71b7053SJung-uk Kim	stmia	r3!,{r4-r11}
1415e71b7053SJung-uk Kim	ldmia	$b_ptr!,{r4-r11}	@ copy in2_y
1416e71b7053SJung-uk Kim	stmia	r3!,{r4-r11}
1417e71b7053SJung-uk Kim	ldmia	$b_ptr,{r4-r11}		@ copy in2_z
1418e71b7053SJung-uk Kim	orr	r12,r4,r5
1419e71b7053SJung-uk Kim	orr	r12,r12,r6
1420e71b7053SJung-uk Kim	orr	r12,r12,r7
1421e71b7053SJung-uk Kim	orr	r12,r12,r8
1422e71b7053SJung-uk Kim	orr	r12,r12,r9
1423e71b7053SJung-uk Kim	orr	r12,r12,r10
1424e71b7053SJung-uk Kim	orr	r12,r12,r11
1425e71b7053SJung-uk Kim	cmp	r12,#0
1426e71b7053SJung-uk Kim#ifdef	__thumb2__
1427e71b7053SJung-uk Kim	it	ne
1428e71b7053SJung-uk Kim#endif
1429e71b7053SJung-uk Kim	movne	r12,#-1
1430e71b7053SJung-uk Kim	stmia	r3,{r4-r11}
143117f01e99SJung-uk Kim	str	r12,[sp,#32*18+8]	@ ~in2infty
1432e71b7053SJung-uk Kim
1433e71b7053SJung-uk Kim	ldmia	$a_ptr!,{r4-r11}	@ copy in1_x
1434e71b7053SJung-uk Kim	add	r3,sp,#$in1_x
1435e71b7053SJung-uk Kim	stmia	r3!,{r4-r11}
1436e71b7053SJung-uk Kim	ldmia	$a_ptr!,{r4-r11}	@ copy in1_y
1437e71b7053SJung-uk Kim	stmia	r3!,{r4-r11}
1438e71b7053SJung-uk Kim	ldmia	$a_ptr,{r4-r11}		@ copy in1_z
1439e71b7053SJung-uk Kim	orr	r12,r4,r5
1440e71b7053SJung-uk Kim	orr	r12,r12,r6
1441e71b7053SJung-uk Kim	orr	r12,r12,r7
1442e71b7053SJung-uk Kim	orr	r12,r12,r8
1443e71b7053SJung-uk Kim	orr	r12,r12,r9
1444e71b7053SJung-uk Kim	orr	r12,r12,r10
1445e71b7053SJung-uk Kim	orr	r12,r12,r11
1446e71b7053SJung-uk Kim	cmp	r12,#0
1447e71b7053SJung-uk Kim#ifdef	__thumb2__
1448e71b7053SJung-uk Kim	it	ne
1449e71b7053SJung-uk Kim#endif
1450e71b7053SJung-uk Kim	movne	r12,#-1
1451e71b7053SJung-uk Kim	stmia	r3,{r4-r11}
145217f01e99SJung-uk Kim	str	r12,[sp,#32*18+4]	@ ~in1infty
1453e71b7053SJung-uk Kim
1454e71b7053SJung-uk Kim	add	$a_ptr,sp,#$in2_z
1455e71b7053SJung-uk Kim	add	$b_ptr,sp,#$in2_z
1456e71b7053SJung-uk Kim	add	$r_ptr,sp,#$Z2sqr
1457e71b7053SJung-uk Kim	bl	__ecp_nistz256_mul_mont	@ p256_sqr_mont(Z2sqr, in2_z);
1458e71b7053SJung-uk Kim
1459e71b7053SJung-uk Kim	add	$a_ptr,sp,#$in1_z
1460e71b7053SJung-uk Kim	add	$b_ptr,sp,#$in1_z
1461e71b7053SJung-uk Kim	add	$r_ptr,sp,#$Z1sqr
1462e71b7053SJung-uk Kim	bl	__ecp_nistz256_mul_mont	@ p256_sqr_mont(Z1sqr, in1_z);
1463e71b7053SJung-uk Kim
1464e71b7053SJung-uk Kim	add	$a_ptr,sp,#$in2_z
1465e71b7053SJung-uk Kim	add	$b_ptr,sp,#$Z2sqr
1466e71b7053SJung-uk Kim	add	$r_ptr,sp,#$S1
1467e71b7053SJung-uk Kim	bl	__ecp_nistz256_mul_mont	@ p256_mul_mont(S1, Z2sqr, in2_z);
1468e71b7053SJung-uk Kim
1469e71b7053SJung-uk Kim	add	$a_ptr,sp,#$in1_z
1470e71b7053SJung-uk Kim	add	$b_ptr,sp,#$Z1sqr
1471e71b7053SJung-uk Kim	add	$r_ptr,sp,#$S2
1472e71b7053SJung-uk Kim	bl	__ecp_nistz256_mul_mont	@ p256_mul_mont(S2, Z1sqr, in1_z);
1473e71b7053SJung-uk Kim
1474e71b7053SJung-uk Kim	add	$a_ptr,sp,#$in1_y
1475e71b7053SJung-uk Kim	add	$b_ptr,sp,#$S1
1476e71b7053SJung-uk Kim	add	$r_ptr,sp,#$S1
1477e71b7053SJung-uk Kim	bl	__ecp_nistz256_mul_mont	@ p256_mul_mont(S1, S1, in1_y);
1478e71b7053SJung-uk Kim
1479e71b7053SJung-uk Kim	add	$a_ptr,sp,#$in2_y
1480e71b7053SJung-uk Kim	add	$b_ptr,sp,#$S2
1481e71b7053SJung-uk Kim	add	$r_ptr,sp,#$S2
1482e71b7053SJung-uk Kim	bl	__ecp_nistz256_mul_mont	@ p256_mul_mont(S2, S2, in2_y);
1483e71b7053SJung-uk Kim
1484e71b7053SJung-uk Kim	add	$b_ptr,sp,#$S1
1485e71b7053SJung-uk Kim	add	$r_ptr,sp,#$R
1486e71b7053SJung-uk Kim	bl	__ecp_nistz256_sub_from	@ p256_sub(R, S2, S1);
1487e71b7053SJung-uk Kim
1488e71b7053SJung-uk Kim	orr	$a0,$a0,$a1		@ see if result is zero
1489e71b7053SJung-uk Kim	orr	$a2,$a2,$a3
1490e71b7053SJung-uk Kim	orr	$a4,$a4,$a5
1491e71b7053SJung-uk Kim	orr	$a0,$a0,$a2
1492e71b7053SJung-uk Kim	orr	$a4,$a4,$a6
1493e71b7053SJung-uk Kim	orr	$a0,$a0,$a7
1494e71b7053SJung-uk Kim	 add	$a_ptr,sp,#$in1_x
1495e71b7053SJung-uk Kim	orr	$a0,$a0,$a4
1496e71b7053SJung-uk Kim	 add	$b_ptr,sp,#$Z2sqr
1497e71b7053SJung-uk Kim	str	$a0,[sp,#32*18+12]
1498e71b7053SJung-uk Kim
1499e71b7053SJung-uk Kim	add	$r_ptr,sp,#$U1
1500e71b7053SJung-uk Kim	bl	__ecp_nistz256_mul_mont	@ p256_mul_mont(U1, in1_x, Z2sqr);
1501e71b7053SJung-uk Kim
1502e71b7053SJung-uk Kim	add	$a_ptr,sp,#$in2_x
1503e71b7053SJung-uk Kim	add	$b_ptr,sp,#$Z1sqr
1504e71b7053SJung-uk Kim	add	$r_ptr,sp,#$U2
1505e71b7053SJung-uk Kim	bl	__ecp_nistz256_mul_mont	@ p256_mul_mont(U2, in2_x, Z1sqr);
1506e71b7053SJung-uk Kim
1507e71b7053SJung-uk Kim	add	$b_ptr,sp,#$U1
1508e71b7053SJung-uk Kim	add	$r_ptr,sp,#$H
1509e71b7053SJung-uk Kim	bl	__ecp_nistz256_sub_from	@ p256_sub(H, U2, U1);
1510e71b7053SJung-uk Kim
1511e71b7053SJung-uk Kim	orr	$a0,$a0,$a1		@ see if result is zero
1512e71b7053SJung-uk Kim	orr	$a2,$a2,$a3
1513e71b7053SJung-uk Kim	orr	$a4,$a4,$a5
1514e71b7053SJung-uk Kim	orr	$a0,$a0,$a2
1515e71b7053SJung-uk Kim	orr	$a4,$a4,$a6
1516e71b7053SJung-uk Kim	orr	$a0,$a0,$a7
151717f01e99SJung-uk Kim	orr	$a0,$a0,$a4		@ ~is_equal(U1,U2)
1518e71b7053SJung-uk Kim
151917f01e99SJung-uk Kim	ldr	$t0,[sp,#32*18+4]	@ ~in1infty
152017f01e99SJung-uk Kim	ldr	$t1,[sp,#32*18+8]	@ ~in2infty
152117f01e99SJung-uk Kim	ldr	$t2,[sp,#32*18+12]	@ ~is_equal(S1,S2)
152217f01e99SJung-uk Kim	mvn	$t0,$t0			@ -1/0 -> 0/-1
152317f01e99SJung-uk Kim	mvn	$t1,$t1			@ -1/0 -> 0/-1
152458f35182SJung-uk Kim	orr	$a0,$a0,$t0
152558f35182SJung-uk Kim	orr	$a0,$a0,$t1
152658f35182SJung-uk Kim	orrs	$a0,$a0,$t2		@ set flags
1527e71b7053SJung-uk Kim
152817f01e99SJung-uk Kim	@ if(~is_equal(U1,U2) | in1infty | in2infty | ~is_equal(S1,S2))
152917f01e99SJung-uk Kim	bne	.Ladd_proceed
1530e71b7053SJung-uk Kim
1531e71b7053SJung-uk Kim.Ladd_double:
1532e71b7053SJung-uk Kim	ldr	$a_ptr,[sp,#32*18+20]
1533e71b7053SJung-uk Kim	add	sp,sp,#32*(18-5)+16	@ difference in frame sizes
1534e71b7053SJung-uk Kim	b	.Lpoint_double_shortcut
1535e71b7053SJung-uk Kim
1536e71b7053SJung-uk Kim.align	4
1537e71b7053SJung-uk Kim.Ladd_proceed:
1538e71b7053SJung-uk Kim	add	$a_ptr,sp,#$R
1539e71b7053SJung-uk Kim	add	$b_ptr,sp,#$R
1540e71b7053SJung-uk Kim	add	$r_ptr,sp,#$Rsqr
1541e71b7053SJung-uk Kim	bl	__ecp_nistz256_mul_mont	@ p256_sqr_mont(Rsqr, R);
1542e71b7053SJung-uk Kim
1543e71b7053SJung-uk Kim	add	$a_ptr,sp,#$H
1544e71b7053SJung-uk Kim	add	$b_ptr,sp,#$in1_z
1545e71b7053SJung-uk Kim	add	$r_ptr,sp,#$res_z
1546e71b7053SJung-uk Kim	bl	__ecp_nistz256_mul_mont	@ p256_mul_mont(res_z, H, in1_z);
1547e71b7053SJung-uk Kim
1548e71b7053SJung-uk Kim	add	$a_ptr,sp,#$H
1549e71b7053SJung-uk Kim	add	$b_ptr,sp,#$H
1550e71b7053SJung-uk Kim	add	$r_ptr,sp,#$Hsqr
1551e71b7053SJung-uk Kim	bl	__ecp_nistz256_mul_mont	@ p256_sqr_mont(Hsqr, H);
1552e71b7053SJung-uk Kim
1553e71b7053SJung-uk Kim	add	$a_ptr,sp,#$in2_z
1554e71b7053SJung-uk Kim	add	$b_ptr,sp,#$res_z
1555e71b7053SJung-uk Kim	add	$r_ptr,sp,#$res_z
1556e71b7053SJung-uk Kim	bl	__ecp_nistz256_mul_mont	@ p256_mul_mont(res_z, res_z, in2_z);
1557e71b7053SJung-uk Kim
1558e71b7053SJung-uk Kim	add	$a_ptr,sp,#$H
1559e71b7053SJung-uk Kim	add	$b_ptr,sp,#$Hsqr
1560e71b7053SJung-uk Kim	add	$r_ptr,sp,#$Hcub
1561e71b7053SJung-uk Kim	bl	__ecp_nistz256_mul_mont	@ p256_mul_mont(Hcub, Hsqr, H);
1562e71b7053SJung-uk Kim
1563e71b7053SJung-uk Kim	add	$a_ptr,sp,#$Hsqr
1564e71b7053SJung-uk Kim	add	$b_ptr,sp,#$U1
1565e71b7053SJung-uk Kim	add	$r_ptr,sp,#$U2
1566e71b7053SJung-uk Kim	bl	__ecp_nistz256_mul_mont	@ p256_mul_mont(U2, U1, Hsqr);
1567e71b7053SJung-uk Kim
1568e71b7053SJung-uk Kim	add	$r_ptr,sp,#$Hsqr
1569e71b7053SJung-uk Kim	bl	__ecp_nistz256_add_self	@ p256_mul_by_2(Hsqr, U2);
1570e71b7053SJung-uk Kim
1571e71b7053SJung-uk Kim	add	$b_ptr,sp,#$Rsqr
1572e71b7053SJung-uk Kim	add	$r_ptr,sp,#$res_x
1573e71b7053SJung-uk Kim	bl	__ecp_nistz256_sub_morf	@ p256_sub(res_x, Rsqr, Hsqr);
1574e71b7053SJung-uk Kim
1575e71b7053SJung-uk Kim	add	$b_ptr,sp,#$Hcub
1576e71b7053SJung-uk Kim	bl	__ecp_nistz256_sub_from	@  p256_sub(res_x, res_x, Hcub);
1577e71b7053SJung-uk Kim
1578e71b7053SJung-uk Kim	add	$b_ptr,sp,#$U2
1579e71b7053SJung-uk Kim	add	$r_ptr,sp,#$res_y
1580e71b7053SJung-uk Kim	bl	__ecp_nistz256_sub_morf	@ p256_sub(res_y, U2, res_x);
1581e71b7053SJung-uk Kim
1582e71b7053SJung-uk Kim	add	$a_ptr,sp,#$Hcub
1583e71b7053SJung-uk Kim	add	$b_ptr,sp,#$S1
1584e71b7053SJung-uk Kim	add	$r_ptr,sp,#$S2
1585e71b7053SJung-uk Kim	bl	__ecp_nistz256_mul_mont	@ p256_mul_mont(S2, S1, Hcub);
1586e71b7053SJung-uk Kim
1587e71b7053SJung-uk Kim	add	$a_ptr,sp,#$R
1588e71b7053SJung-uk Kim	add	$b_ptr,sp,#$res_y
1589e71b7053SJung-uk Kim	add	$r_ptr,sp,#$res_y
1590e71b7053SJung-uk Kim	bl	__ecp_nistz256_mul_mont	@ p256_mul_mont(res_y, res_y, R);
1591e71b7053SJung-uk Kim
1592e71b7053SJung-uk Kim	add	$b_ptr,sp,#$S2
1593e71b7053SJung-uk Kim	bl	__ecp_nistz256_sub_from	@ p256_sub(res_y, res_y, S2);
1594e71b7053SJung-uk Kim
159517f01e99SJung-uk Kim	ldr	r11,[sp,#32*18+4]	@ ~in1infty
159617f01e99SJung-uk Kim	ldr	r12,[sp,#32*18+8]	@ ~in2infty
1597e71b7053SJung-uk Kim	add	r1,sp,#$res_x
1598e71b7053SJung-uk Kim	add	r2,sp,#$in2_x
159917f01e99SJung-uk Kim	and	r10,r11,r12		@ ~in1infty & ~in2infty
1600e71b7053SJung-uk Kim	mvn	r11,r11
1601e71b7053SJung-uk Kim	add	r3,sp,#$in1_x
160217f01e99SJung-uk Kim	and	r11,r11,r12		@ in1infty & ~in2infty
160317f01e99SJung-uk Kim	mvn	r12,r12			@ in2infty
1604e71b7053SJung-uk Kim	ldr	$r_ptr,[sp,#32*18+16]
1605e71b7053SJung-uk Kim___
# Branch-free select of the final result, emitted 12 times (8 bytes per
# copy, 96 bytes = x, y and z). r1/r2/r3 walk res, in2 and in1 with
# post-increment; each pair of words is masked with r10 (both inputs
# finite -> computed result), r11 (only in1 at infinity -> in2) or r12
# (in2 at infinity -> in1), the three are ORed together and stored through
# $r_ptr, which was reloaded from the saved first argument just above.
# $i only counts iterations; it is not interpolated into the emitted text.
1606e71b7053SJung-uk Kimfor($i=0;$i<96;$i+=8) {			# conditional moves
1607e71b7053SJung-uk Kim$code.=<<___;
1608e71b7053SJung-uk Kim	ldmia	r1!,{r4-r5}		@ res_x
1609e71b7053SJung-uk Kim	ldmia	r2!,{r6-r7}		@ in2_x
1610e71b7053SJung-uk Kim	ldmia	r3!,{r8-r9}		@ in1_x
161117f01e99SJung-uk Kim	and	r4,r4,r10		@ ~in1infty & ~in2infty
1612e71b7053SJung-uk Kim	and	r5,r5,r10
161317f01e99SJung-uk Kim	and	r6,r6,r11		@ in1infty & ~in2infty
1614e71b7053SJung-uk Kim	and	r7,r7,r11
161517f01e99SJung-uk Kim	and	r8,r8,r12		@ in2infty
1616e71b7053SJung-uk Kim	and	r9,r9,r12
1617e71b7053SJung-uk Kim	orr	r4,r4,r6
1618e71b7053SJung-uk Kim	orr	r5,r5,r7
1619e71b7053SJung-uk Kim	orr	r4,r4,r8
1620e71b7053SJung-uk Kim	orr	r5,r5,r9
1621e71b7053SJung-uk Kim	stmia	$r_ptr!,{r4-r5}
1622e71b7053SJung-uk Kim___
1623e71b7053SJung-uk Kim}
1624e71b7053SJung-uk Kim$code.=<<___;
1625e71b7053SJung-uk Kim.Ladd_done:
1626e71b7053SJung-uk Kim	add	sp,sp,#32*18+16+16	@ +16 means "skip even over saved r0-r3"
1627e71b7053SJung-uk Kim#if __ARM_ARCH__>=5 || !defined(__thumb__)
1628e71b7053SJung-uk Kim	ldmia	sp!,{r4-r12,pc}
1629e71b7053SJung-uk Kim#else
1630e71b7053SJung-uk Kim	ldmia	sp!,{r4-r12,lr}
1631e71b7053SJung-uk Kim	bx	lr			@ interoperable with Thumb ISA:-)
1632e71b7053SJung-uk Kim#endif
1633e71b7053SJung-uk Kim.size	ecp_nistz256_point_add,.-ecp_nistz256_point_add
1634e71b7053SJung-uk Kim___
1635e71b7053SJung-uk Kim}
1636e71b7053SJung-uk Kim
1637e71b7053SJung-uk Kim########################################################################
1638e71b7053SJung-uk Kim# void ecp_nistz256_point_add_affine(P256_POINT *out,const P256_POINT *in1,
1639e71b7053SJung-uk Kim#				     const P256_POINT_AFFINE *in2);
1640e71b7053SJung-uk Kim{
1641e71b7053SJung-uk Kimmy ($res_x,$res_y,$res_z,
1642e71b7053SJung-uk Kim    $in1_x,$in1_y,$in1_z,
1643e71b7053SJung-uk Kim    $in2_x,$in2_y,
1644e71b7053SJung-uk Kim    $U2,$S2,$H,$R,$Hsqr,$Hcub,$Rsqr)=map(32*$_,(0..14));
1645e71b7053SJung-uk Kimmy $Z1sqr = $S2;
1646e71b7053SJung-uk Kim# above map() describes stack layout with 15 temporary
1647e71b7053SJung-uk Kim# 256-bit vectors on top. Then note that we push
1648e71b7053SJung-uk Kim# starting from r0, which means that we have copy of
1649e71b7053SJung-uk Kim# input arguments just below these temporary vectors.
165017f01e99SJung-uk Kim# We use two of them for ~in1infty, ~in2infty.
1651e71b7053SJung-uk Kim
# Per-word immediates substituted for the missing in2_z in the final
# select below (in2 is affine, so it carries no Z of its own).
# NOTE(review): presumably the eight 32-bit words, least significant
# first, of 1 in Montgomery form -- confirm against the field constants.
1652e71b7053SJung-uk Kimmy @ONE_mont=(1,0,0,-1,-1,-1,-2,0);
1653e71b7053SJung-uk Kim
# Emit ecp_nistz256_point_add_affine. in1 is copied into the frame and
# ~in1infty (all-ones/zero mask derived from the OR of in1_z's words) is
# written to sp+32*15+4; in2 has only x and y, and ~in2infty is derived
# from the OR of all sixteen x/y words and written to sp+32*15+8. Both
# stores land in saved-argument slots pushed by stmdb (this frame has no
# extra padding), while the output pointer at sp+32*15 is left intact and
# reloaded before the final select. No branch to the doubling code is
# emitted in this routine. $Z1sqr aliases the $S2 slot.
1654e71b7053SJung-uk Kim$code.=<<___;
1655e71b7053SJung-uk Kim.globl	ecp_nistz256_point_add_affine
1656e71b7053SJung-uk Kim.type	ecp_nistz256_point_add_affine,%function
1657e71b7053SJung-uk Kim.align	5
1658e71b7053SJung-uk Kimecp_nistz256_point_add_affine:
1659e71b7053SJung-uk Kim	stmdb	sp!,{r0-r12,lr}		@ push from r0, unusual, but intentional
1660e71b7053SJung-uk Kim	sub	sp,sp,#32*15
1661e71b7053SJung-uk Kim
1662e71b7053SJung-uk Kim	ldmia	$a_ptr!,{r4-r11}	@ copy in1_x
1663e71b7053SJung-uk Kim	add	r3,sp,#$in1_x
1664e71b7053SJung-uk Kim	stmia	r3!,{r4-r11}
1665e71b7053SJung-uk Kim	ldmia	$a_ptr!,{r4-r11}	@ copy in1_y
1666e71b7053SJung-uk Kim	stmia	r3!,{r4-r11}
1667e71b7053SJung-uk Kim	ldmia	$a_ptr,{r4-r11}		@ copy in1_z
1668e71b7053SJung-uk Kim	orr	r12,r4,r5
1669e71b7053SJung-uk Kim	orr	r12,r12,r6
1670e71b7053SJung-uk Kim	orr	r12,r12,r7
1671e71b7053SJung-uk Kim	orr	r12,r12,r8
1672e71b7053SJung-uk Kim	orr	r12,r12,r9
1673e71b7053SJung-uk Kim	orr	r12,r12,r10
1674e71b7053SJung-uk Kim	orr	r12,r12,r11
1675e71b7053SJung-uk Kim	cmp	r12,#0
1676e71b7053SJung-uk Kim#ifdef	__thumb2__
1677e71b7053SJung-uk Kim	it	ne
1678e71b7053SJung-uk Kim#endif
1679e71b7053SJung-uk Kim	movne	r12,#-1
1680e71b7053SJung-uk Kim	stmia	r3,{r4-r11}
168117f01e99SJung-uk Kim	str	r12,[sp,#32*15+4]	@ ~in1infty
1682e71b7053SJung-uk Kim
1683e71b7053SJung-uk Kim	ldmia	$b_ptr!,{r4-r11}	@ copy in2_x
1684e71b7053SJung-uk Kim	add	r3,sp,#$in2_x
1685e71b7053SJung-uk Kim	orr	r12,r4,r5
1686e71b7053SJung-uk Kim	orr	r12,r12,r6
1687e71b7053SJung-uk Kim	orr	r12,r12,r7
1688e71b7053SJung-uk Kim	orr	r12,r12,r8
1689e71b7053SJung-uk Kim	orr	r12,r12,r9
1690e71b7053SJung-uk Kim	orr	r12,r12,r10
1691e71b7053SJung-uk Kim	orr	r12,r12,r11
1692e71b7053SJung-uk Kim	stmia	r3!,{r4-r11}
1693e71b7053SJung-uk Kim	ldmia	$b_ptr!,{r4-r11}	@ copy in2_y
1694e71b7053SJung-uk Kim	orr	r12,r12,r4
1695e71b7053SJung-uk Kim	orr	r12,r12,r5
1696e71b7053SJung-uk Kim	orr	r12,r12,r6
1697e71b7053SJung-uk Kim	orr	r12,r12,r7
1698e71b7053SJung-uk Kim	orr	r12,r12,r8
1699e71b7053SJung-uk Kim	orr	r12,r12,r9
1700e71b7053SJung-uk Kim	orr	r12,r12,r10
1701e71b7053SJung-uk Kim	orr	r12,r12,r11
1702e71b7053SJung-uk Kim	stmia	r3!,{r4-r11}
1703e71b7053SJung-uk Kim	cmp	r12,#0
1704e71b7053SJung-uk Kim#ifdef	__thumb2__
1705e71b7053SJung-uk Kim	it	ne
1706e71b7053SJung-uk Kim#endif
1707e71b7053SJung-uk Kim	movne	r12,#-1
170817f01e99SJung-uk Kim	str	r12,[sp,#32*15+8]	@ ~in2infty
1709e71b7053SJung-uk Kim
1710e71b7053SJung-uk Kim	add	$a_ptr,sp,#$in1_z
1711e71b7053SJung-uk Kim	add	$b_ptr,sp,#$in1_z
1712e71b7053SJung-uk Kim	add	$r_ptr,sp,#$Z1sqr
1713e71b7053SJung-uk Kim	bl	__ecp_nistz256_mul_mont	@ p256_sqr_mont(Z1sqr, in1_z);
1714e71b7053SJung-uk Kim
1715e71b7053SJung-uk Kim	add	$a_ptr,sp,#$Z1sqr
1716e71b7053SJung-uk Kim	add	$b_ptr,sp,#$in2_x
1717e71b7053SJung-uk Kim	add	$r_ptr,sp,#$U2
1718e71b7053SJung-uk Kim	bl	__ecp_nistz256_mul_mont	@ p256_mul_mont(U2, Z1sqr, in2_x);
1719e71b7053SJung-uk Kim
1720e71b7053SJung-uk Kim	add	$b_ptr,sp,#$in1_x
1721e71b7053SJung-uk Kim	add	$r_ptr,sp,#$H
1722e71b7053SJung-uk Kim	bl	__ecp_nistz256_sub_from	@ p256_sub(H, U2, in1_x);
1723e71b7053SJung-uk Kim
1724e71b7053SJung-uk Kim	add	$a_ptr,sp,#$Z1sqr
1725e71b7053SJung-uk Kim	add	$b_ptr,sp,#$in1_z
1726e71b7053SJung-uk Kim	add	$r_ptr,sp,#$S2
1727e71b7053SJung-uk Kim	bl	__ecp_nistz256_mul_mont	@ p256_mul_mont(S2, Z1sqr, in1_z);
1728e71b7053SJung-uk Kim
1729e71b7053SJung-uk Kim	add	$a_ptr,sp,#$H
1730e71b7053SJung-uk Kim	add	$b_ptr,sp,#$in1_z
1731e71b7053SJung-uk Kim	add	$r_ptr,sp,#$res_z
1732e71b7053SJung-uk Kim	bl	__ecp_nistz256_mul_mont	@ p256_mul_mont(res_z, H, in1_z);
1733e71b7053SJung-uk Kim
1734e71b7053SJung-uk Kim	add	$a_ptr,sp,#$in2_y
1735e71b7053SJung-uk Kim	add	$b_ptr,sp,#$S2
1736e71b7053SJung-uk Kim	add	$r_ptr,sp,#$S2
1737e71b7053SJung-uk Kim	bl	__ecp_nistz256_mul_mont	@ p256_mul_mont(S2, S2, in2_y);
1738e71b7053SJung-uk Kim
1739e71b7053SJung-uk Kim	add	$b_ptr,sp,#$in1_y
1740e71b7053SJung-uk Kim	add	$r_ptr,sp,#$R
1741e71b7053SJung-uk Kim	bl	__ecp_nistz256_sub_from	@ p256_sub(R, S2, in1_y);
1742e71b7053SJung-uk Kim
1743e71b7053SJung-uk Kim	add	$a_ptr,sp,#$H
1744e71b7053SJung-uk Kim	add	$b_ptr,sp,#$H
1745e71b7053SJung-uk Kim	add	$r_ptr,sp,#$Hsqr
1746e71b7053SJung-uk Kim	bl	__ecp_nistz256_mul_mont	@ p256_sqr_mont(Hsqr, H);
1747e71b7053SJung-uk Kim
1748e71b7053SJung-uk Kim	add	$a_ptr,sp,#$R
1749e71b7053SJung-uk Kim	add	$b_ptr,sp,#$R
1750e71b7053SJung-uk Kim	add	$r_ptr,sp,#$Rsqr
1751e71b7053SJung-uk Kim	bl	__ecp_nistz256_mul_mont	@ p256_sqr_mont(Rsqr, R);
1752e71b7053SJung-uk Kim
1753e71b7053SJung-uk Kim	add	$a_ptr,sp,#$H
1754e71b7053SJung-uk Kim	add	$b_ptr,sp,#$Hsqr
1755e71b7053SJung-uk Kim	add	$r_ptr,sp,#$Hcub
1756e71b7053SJung-uk Kim	bl	__ecp_nistz256_mul_mont	@ p256_mul_mont(Hcub, Hsqr, H);
1757e71b7053SJung-uk Kim
1758e71b7053SJung-uk Kim	add	$a_ptr,sp,#$Hsqr
1759e71b7053SJung-uk Kim	add	$b_ptr,sp,#$in1_x
1760e71b7053SJung-uk Kim	add	$r_ptr,sp,#$U2
1761e71b7053SJung-uk Kim	bl	__ecp_nistz256_mul_mont	@ p256_mul_mont(U2, in1_x, Hsqr);
1762e71b7053SJung-uk Kim
1763e71b7053SJung-uk Kim	add	$r_ptr,sp,#$Hsqr
1764e71b7053SJung-uk Kim	bl	__ecp_nistz256_add_self	@ p256_mul_by_2(Hsqr, U2);
1765e71b7053SJung-uk Kim
1766e71b7053SJung-uk Kim	add	$b_ptr,sp,#$Rsqr
1767e71b7053SJung-uk Kim	add	$r_ptr,sp,#$res_x
1768e71b7053SJung-uk Kim	bl	__ecp_nistz256_sub_morf	@ p256_sub(res_x, Rsqr, Hsqr);
1769e71b7053SJung-uk Kim
1770e71b7053SJung-uk Kim	add	$b_ptr,sp,#$Hcub
1771e71b7053SJung-uk Kim	bl	__ecp_nistz256_sub_from	@  p256_sub(res_x, res_x, Hcub);
1772e71b7053SJung-uk Kim
1773e71b7053SJung-uk Kim	add	$b_ptr,sp,#$U2
1774e71b7053SJung-uk Kim	add	$r_ptr,sp,#$res_y
1775e71b7053SJung-uk Kim	bl	__ecp_nistz256_sub_morf	@ p256_sub(res_y, U2, res_x);
1776e71b7053SJung-uk Kim
1777e71b7053SJung-uk Kim	add	$a_ptr,sp,#$Hcub
1778e71b7053SJung-uk Kim	add	$b_ptr,sp,#$in1_y
1779e71b7053SJung-uk Kim	add	$r_ptr,sp,#$S2
1780e71b7053SJung-uk Kim	bl	__ecp_nistz256_mul_mont	@ p256_mul_mont(S2, in1_y, Hcub);
1781e71b7053SJung-uk Kim
1782e71b7053SJung-uk Kim	add	$a_ptr,sp,#$R
1783e71b7053SJung-uk Kim	add	$b_ptr,sp,#$res_y
1784e71b7053SJung-uk Kim	add	$r_ptr,sp,#$res_y
1785e71b7053SJung-uk Kim	bl	__ecp_nistz256_mul_mont	@ p256_mul_mont(res_y, res_y, R);
1786e71b7053SJung-uk Kim
1787e71b7053SJung-uk Kim	add	$b_ptr,sp,#$S2
1788e71b7053SJung-uk Kim	bl	__ecp_nistz256_sub_from	@ p256_sub(res_y, res_y, S2);
1789e71b7053SJung-uk Kim
179017f01e99SJung-uk Kim	ldr	r11,[sp,#32*15+4]	@ ~in1infty
179117f01e99SJung-uk Kim	ldr	r12,[sp,#32*15+8]	@ ~in2infty
1792e71b7053SJung-uk Kim	add	r1,sp,#$res_x
1793e71b7053SJung-uk Kim	add	r2,sp,#$in2_x
179417f01e99SJung-uk Kim	and	r10,r11,r12		@ ~in1infty & ~in2infty
1795e71b7053SJung-uk Kim	mvn	r11,r11
1796e71b7053SJung-uk Kim	add	r3,sp,#$in1_x
179717f01e99SJung-uk Kim	and	r11,r11,r12		@ in1infty & ~in2infty
179817f01e99SJung-uk Kim	mvn	r12,r12			@ in2infty
1799e71b7053SJung-uk Kim	ldr	$r_ptr,[sp,#32*15]
1800e71b7053SJung-uk Kim___
# Branch-free select for x and y: 8 copies of 8 bytes each (64 bytes).
# Each pair of words comes from res (mask r10), in2 (mask r11) or in1
# (mask r12) and is stored through $r_ptr. $i is deliberately left at 64
# on exit so that the next loop can carry on from there.
1801e71b7053SJung-uk Kimfor($i=0;$i<64;$i+=8) {			# conditional moves
1802e71b7053SJung-uk Kim$code.=<<___;
1803e71b7053SJung-uk Kim	ldmia	r1!,{r4-r5}		@ res_x
1804e71b7053SJung-uk Kim	ldmia	r2!,{r6-r7}		@ in2_x
1805e71b7053SJung-uk Kim	ldmia	r3!,{r8-r9}		@ in1_x
180617f01e99SJung-uk Kim	and	r4,r4,r10		@ ~in1infty & ~in2infty
1807e71b7053SJung-uk Kim	and	r5,r5,r10
180817f01e99SJung-uk Kim	and	r6,r6,r11		@ in1infty & ~in2infty
1809e71b7053SJung-uk Kim	and	r7,r7,r11
181017f01e99SJung-uk Kim	and	r8,r8,r12		@ in2infty
1811e71b7053SJung-uk Kim	and	r9,r9,r12
1812e71b7053SJung-uk Kim	orr	r4,r4,r6
1813e71b7053SJung-uk Kim	orr	r5,r5,r7
1814e71b7053SJung-uk Kim	orr	r4,r4,r8
1815e71b7053SJung-uk Kim	orr	r5,r5,r9
1816e71b7053SJung-uk Kim	stmia	$r_ptr!,{r4-r5}
1817e71b7053SJung-uk Kim___
1818e71b7053SJung-uk Kim}
# Same select for z, continuing with $i = 64, 72, 80, 88 (4 copies).
# There is no in2_z to load, so the "in2" candidate is built from
# immediates instead: $j = ($i-64)/4 takes the values 0, 2, 4, 6 and picks
# the matching pair of @ONE_mont words, which are ANDed with the r11 mask.
1819e71b7053SJung-uk Kimfor(;$i<96;$i+=8) {
1820e71b7053SJung-uk Kimmy $j=($i-64)/4;
1821e71b7053SJung-uk Kim$code.=<<___;
1822e71b7053SJung-uk Kim	ldmia	r1!,{r4-r5}		@ res_z
1823e71b7053SJung-uk Kim	ldmia	r3!,{r8-r9}		@ in1_z
1824e71b7053SJung-uk Kim	and	r4,r4,r10
1825e71b7053SJung-uk Kim	and	r5,r5,r10
1826e71b7053SJung-uk Kim	and	r6,r11,#@ONE_mont[$j]
1827e71b7053SJung-uk Kim	and	r7,r11,#@ONE_mont[$j+1]
1828e71b7053SJung-uk Kim	and	r8,r8,r12
1829e71b7053SJung-uk Kim	and	r9,r9,r12
1830e71b7053SJung-uk Kim	orr	r4,r4,r6
1831e71b7053SJung-uk Kim	orr	r5,r5,r7
1832e71b7053SJung-uk Kim	orr	r4,r4,r8
1833e71b7053SJung-uk Kim	orr	r5,r5,r9
1834e71b7053SJung-uk Kim	stmia	$r_ptr!,{r4-r5}
1835e71b7053SJung-uk Kim___
1836e71b7053SJung-uk Kim}
1837e71b7053SJung-uk Kim$code.=<<___;
1838e71b7053SJung-uk Kim	add	sp,sp,#32*15+16		@ +16 means "skip even over saved r0-r3"
1839e71b7053SJung-uk Kim#if __ARM_ARCH__>=5 || !defined(__thumb__)
1840e71b7053SJung-uk Kim	ldmia	sp!,{r4-r12,pc}
1841e71b7053SJung-uk Kim#else
1842e71b7053SJung-uk Kim	ldmia	sp!,{r4-r12,lr}
1843e71b7053SJung-uk Kim	bx	lr			@ interoperable with Thumb ISA:-)
1844e71b7053SJung-uk Kim#endif
1845e71b7053SJung-uk Kim.size	ecp_nistz256_point_add_affine,.-ecp_nistz256_point_add_affine
1846e71b7053SJung-uk Kim___
1847e71b7053SJung-uk Kim}					}}}
1848e71b7053SJung-uk Kim
# Post-process the accumulated assembly text line by line and write it to
# STDOUT:
#  1. every `...` (backtick-quoted) span is replaced by the result of
#     eval-ing its contents as Perl, so constant expressions written in the
#     templates are folded at generation time;
#  2. "qN#lo" / "qN#hi" is rewritten to "d<2N>" / "d<2N+1>" (the sprintf
#     adds 1 exactly when the suffix is "hi").
# split("\n", ...) drops the newlines, so each line is printed with an
# explicit "\n" appended.
1849e71b7053SJung-uk Kimforeach (split("\n",$code)) {
1850e71b7053SJung-uk Kim	s/\`([^\`]*)\`/eval $1/geo;
1851e71b7053SJung-uk Kim
1852e71b7053SJung-uk Kim	s/\bq([0-9]+)#(lo|hi)/sprintf "d%d",2*$1+($2 eq "hi")/geo;
1853e71b7053SJung-uk Kim
1854e71b7053SJung-uk Kim	print $_,"\n";
1855e71b7053SJung-uk Kim}
# Closing STDOUT explicitly flushes buffered output; a failure here (e.g.
# a write error that only surfaces at flush time) aborts the generator
# instead of silently leaving a truncated assembly file.
185617f01e99SJung-uk Kimclose STDOUT or die "error closing STDOUT: $!";	# enforce flush
1857