xref: /freebsd/crypto/openssl/crypto/ec/asm/ecp_nistz256-armv4.pl (revision a0409676120c1e558d0ade943019934e0f15118d)
1#! /usr/bin/env perl
2# Copyright 2015-2020 The OpenSSL Project Authors. All Rights Reserved.
3#
4# Licensed under the OpenSSL license (the "License").  You may not use
5# this file except in compliance with the License.  You can obtain a copy
6# in the file LICENSE in the source distribution or at
7# https://www.openssl.org/source/license.html
8
9
10# ====================================================================
11# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
12# project. The module is, however, dual licensed under OpenSSL and
13# CRYPTOGAMS licenses depending on where you obtain it. For further
14# details see http://www.openssl.org/~appro/cryptogams/.
15# ====================================================================
16#
17# ECP_NISTZ256 module for ARMv4.
18#
19# October 2014.
20#
21# Original ECP_NISTZ256 submission targeting x86_64 is detailed in
22# http://eprint.iacr.org/2013/816. In the process of adaptation
23# original .c module was made 32-bit savvy in order to make this
24# implementation possible.
25#
26#			with/without -DECP_NISTZ256_ASM
27# Cortex-A8		+53-170%
28# Cortex-A9		+76-205%
29# Cortex-A15		+100-316%
30# Snapdragon S4		+66-187%
31#
32# Ranges denote minimum and maximum improvement coefficients depending
33# on benchmark. Lower coefficients are for ECDSA sign, server-side
34# operation. Keep in mind that +200% means 3x improvement.
35
36$flavour = shift;
37if ($flavour=~/\w[\w\-]*\.\w+$/) { $output=$flavour; undef $flavour; }
38else { while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {} }
39
40if ($flavour && $flavour ne "void") {
41    $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
42    ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
43    ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
44    die "can't locate arm-xlate.pl";
45
46    open STDOUT,"| \"$^X\" $xlate $flavour $output";
47} else {
48    open STDOUT,">$output";
49}
50
51$code.=<<___;
52#include "arm_arch.h"
53
54.text
55#if defined(__thumb2__)
56.syntax	unified
57.thumb
58#else
59.code	32
60#endif
61___
62########################################################################
63# Convert ecp_nistz256_table.c to layout expected by ecp_nistz256_gather_w7
64#
65$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
66open TABLE,"<ecp_nistz256_table.c"		or
67open TABLE,"<${dir}../ecp_nistz256_table.c"	or
68die "failed to open ecp_nistz256_table.c:",$!;
69
70use integer;
71
72foreach(<TABLE>) {
73	s/TOBN\(\s*(0x[0-9a-f]+),\s*(0x[0-9a-f]+)\s*\)/push @arr,hex($2),hex($1)/geo;
74}
75close TABLE;
76
77# See ecp_nistz256_table.c for an explanation of why it's 64*16*37.
78# 64*16*37-1 is because $#arr returns the last valid index of @arr, not
79# the number of elements.
80die "insane number of elements" if ($#arr != 64*16*37-1);
81
82$code.=<<___;
83.globl	ecp_nistz256_precomputed
84.type	ecp_nistz256_precomputed,%object
85.align	12
86ecp_nistz256_precomputed:
87___
88########################################################################
89# this conversion scatters P256_POINT_AFFINE into individual bytes at
90# a 64-byte interval, similar to
91#	1111222233334444
92#	1234123412341234
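#
# Concretely (an illustrative note, not part of the original comment): within
# each of the 37 blocks, byte b of table entry j is stored at offset b*64+j,
# so one entry occupies a single byte-wide column with a 64-byte stride;
# ecp_nistz256_gather_w7 below reads it back with ldrb at that same stride.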
93for(1..37) {
94	@tbl = splice(@arr,0,64*16);
95	for($i=0;$i<64;$i++) {
96		undef @line;
97		for($j=0;$j<64;$j++) {
98			push @line,(@tbl[$j*16+$i/4]>>(($i%4)*8))&0xff;
99		}
100		$code.=".byte\t";
101		$code.=join(',',map { sprintf "0x%02x",$_} @line);
102		$code.="\n";
103	}
104}
105$code.=<<___;
106.size	ecp_nistz256_precomputed,.-ecp_nistz256_precomputed
107.align	5
108.LRR:	@ 2^512 mod P precomputed for NIST P256 polynomial
109.long	0x00000003, 0x00000000, 0xffffffff, 0xfffffffb
110.long	0xfffffffe, 0xffffffff, 0xfffffffd, 0x00000004
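@ mul_mont computes a*b*2^-256 mod P, so multiplying by .LRR (2^512 mod P)
@ yields x*2^256 mod P, i.e. x in Montgomery form, while multiplying by .Lone
@ strips the 2^256 factor again; this is why ecp_nistz256_to_mont and
@ ecp_nistz256_from_mont below simply load $b_ptr and branch to the common
@ multiplication code.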
111.Lone:
112.long	1,0,0,0,0,0,0,0
113.asciz	"ECP_NISTZ256 for ARMv4, CRYPTOGAMS by <appro\@openssl.org>"
114.align	6
115___
116
117########################################################################
118# common register layout; note that $t2 is the link register, so if an
119# internal subroutine uses $t2, it has to offload lr first...
120
121($r_ptr,$a_ptr,$b_ptr,$ff,$a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7,$t1,$t2)=
122		map("r$_",(0..12,14));
123($t0,$t3)=($ff,$a_ptr);
124
125$code.=<<___;
126@ void	ecp_nistz256_to_mont(BN_ULONG r0[8],const BN_ULONG r1[8]);
127.globl	ecp_nistz256_to_mont
128.type	ecp_nistz256_to_mont,%function
129ecp_nistz256_to_mont:
130	adr	$b_ptr,.LRR
131	b	.Lecp_nistz256_mul_mont
132.size	ecp_nistz256_to_mont,.-ecp_nistz256_to_mont
133
134@ void	ecp_nistz256_from_mont(BN_ULONG r0[8],const BN_ULONG r1[8]);
135.globl	ecp_nistz256_from_mont
136.type	ecp_nistz256_from_mont,%function
137ecp_nistz256_from_mont:
138	adr	$b_ptr,.Lone
139	b	.Lecp_nistz256_mul_mont
140.size	ecp_nistz256_from_mont,.-ecp_nistz256_from_mont
141
142@ void	ecp_nistz256_mul_by_2(BN_ULONG r0[8],const BN_ULONG r1[8]);
143.globl	ecp_nistz256_mul_by_2
144.type	ecp_nistz256_mul_by_2,%function
145.align	4
146ecp_nistz256_mul_by_2:
147	stmdb	sp!,{r4-r12,lr}
148	bl	__ecp_nistz256_mul_by_2
149#if __ARM_ARCH__>=5 || !defined(__thumb__)
150	ldmia	sp!,{r4-r12,pc}
151#else
152	ldmia	sp!,{r4-r12,lr}
153	bx	lr			@ interoperable with Thumb ISA:-)
154#endif
155.size	ecp_nistz256_mul_by_2,.-ecp_nistz256_mul_by_2
156
157.type	__ecp_nistz256_mul_by_2,%function
158.align	4
159__ecp_nistz256_mul_by_2:
160	ldr	$a0,[$a_ptr,#0]
161	ldr	$a1,[$a_ptr,#4]
162	ldr	$a2,[$a_ptr,#8]
163	adds	$a0,$a0,$a0		@ a[0:7]+=a[0:7], i.e. add with itself
164	ldr	$a3,[$a_ptr,#12]
165	adcs	$a1,$a1,$a1
166	ldr	$a4,[$a_ptr,#16]
167	adcs	$a2,$a2,$a2
168	ldr	$a5,[$a_ptr,#20]
169	adcs	$a3,$a3,$a3
170	ldr	$a6,[$a_ptr,#24]
171	adcs	$a4,$a4,$a4
172	ldr	$a7,[$a_ptr,#28]
173	adcs	$a5,$a5,$a5
174	adcs	$a6,$a6,$a6
175	mov	$ff,#0
176	adcs	$a7,$a7,$a7
177	adc	$ff,$ff,#0
178
179	b	.Lreduce_by_sub
180.size	__ecp_nistz256_mul_by_2,.-__ecp_nistz256_mul_by_2
181
182@ void	ecp_nistz256_add(BN_ULONG r0[8],const BN_ULONG r1[8],
183@					const BN_ULONG r2[8]);
184.globl	ecp_nistz256_add
185.type	ecp_nistz256_add,%function
186.align	4
187ecp_nistz256_add:
188	stmdb	sp!,{r4-r12,lr}
189	bl	__ecp_nistz256_add
190#if __ARM_ARCH__>=5 || !defined(__thumb__)
191	ldmia	sp!,{r4-r12,pc}
192#else
193	ldmia	sp!,{r4-r12,lr}
194	bx	lr			@ interoperable with Thumb ISA:-)
195#endif
196.size	ecp_nistz256_add,.-ecp_nistz256_add
197
198.type	__ecp_nistz256_add,%function
199.align	4
200__ecp_nistz256_add:
201	str	lr,[sp,#-4]!		@ push lr
202
203	ldr	$a0,[$a_ptr,#0]
204	ldr	$a1,[$a_ptr,#4]
205	ldr	$a2,[$a_ptr,#8]
206	ldr	$a3,[$a_ptr,#12]
207	ldr	$a4,[$a_ptr,#16]
208	 ldr	$t0,[$b_ptr,#0]
209	ldr	$a5,[$a_ptr,#20]
210	 ldr	$t1,[$b_ptr,#4]
211	ldr	$a6,[$a_ptr,#24]
212	 ldr	$t2,[$b_ptr,#8]
213	ldr	$a7,[$a_ptr,#28]
214	 ldr	$t3,[$b_ptr,#12]
215	adds	$a0,$a0,$t0
216	 ldr	$t0,[$b_ptr,#16]
217	adcs	$a1,$a1,$t1
218	 ldr	$t1,[$b_ptr,#20]
219	adcs	$a2,$a2,$t2
220	 ldr	$t2,[$b_ptr,#24]
221	adcs	$a3,$a3,$t3
222	 ldr	$t3,[$b_ptr,#28]
223	adcs	$a4,$a4,$t0
224	adcs	$a5,$a5,$t1
225	adcs	$a6,$a6,$t2
226	mov	$ff,#0
227	adcs	$a7,$a7,$t3
228	adc	$ff,$ff,#0
229	ldr	lr,[sp],#4		@ pop lr
230
231.Lreduce_by_sub:
232
233	@ if a+b >= modulus, subtract modulus.
234	@
235	@ But since comparison implies subtraction, we subtract
236	@ modulus and then add it back if subtraction borrowed.
237
238	subs	$a0,$a0,#-1
239	sbcs	$a1,$a1,#-1
240	sbcs	$a2,$a2,#-1
241	sbcs	$a3,$a3,#0
242	sbcs	$a4,$a4,#0
243	sbcs	$a5,$a5,#0
244	sbcs	$a6,$a6,#1
245	sbcs	$a7,$a7,#-1
246	sbc	$ff,$ff,#0
247
248	@ Note that because mod has special form, i.e. consists of
249	@ 0xffffffff, 1 and 0s, we can conditionally synthesize it by
250	@ using value of borrow as a whole or extracting single bit.
251	@ Follow $ff register...
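	@ For example (illustrative): if the subtraction above left a net
	@ borrow, $ff is all ones and the words added below,
	@ {$ff,$ff,$ff,0,0,0,$ff>>31,$ff}, are exactly the little-endian
	@ words of the modulus; if it did not borrow, $ff is zero and
	@ nothing is added.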
252
253	adds	$a0,$a0,$ff		@ add synthesized modulus
254	adcs	$a1,$a1,$ff
255	str	$a0,[$r_ptr,#0]
256	adcs	$a2,$a2,$ff
257	str	$a1,[$r_ptr,#4]
258	adcs	$a3,$a3,#0
259	str	$a2,[$r_ptr,#8]
260	adcs	$a4,$a4,#0
261	str	$a3,[$r_ptr,#12]
262	adcs	$a5,$a5,#0
263	str	$a4,[$r_ptr,#16]
264	adcs	$a6,$a6,$ff,lsr#31
265	str	$a5,[$r_ptr,#20]
266	adcs	$a7,$a7,$ff
267	str	$a6,[$r_ptr,#24]
268	str	$a7,[$r_ptr,#28]
269
270	mov	pc,lr
271.size	__ecp_nistz256_add,.-__ecp_nistz256_add
272
273@ void	ecp_nistz256_mul_by_3(BN_ULONG r0[8],const BN_ULONG r1[8]);
274.globl	ecp_nistz256_mul_by_3
275.type	ecp_nistz256_mul_by_3,%function
276.align	4
277ecp_nistz256_mul_by_3:
278	stmdb	sp!,{r4-r12,lr}
279	bl	__ecp_nistz256_mul_by_3
280#if __ARM_ARCH__>=5 || !defined(__thumb__)
281	ldmia	sp!,{r4-r12,pc}
282#else
283	ldmia	sp!,{r4-r12,lr}
284	bx	lr			@ interoperable with Thumb ISA:-)
285#endif
286.size	ecp_nistz256_mul_by_3,.-ecp_nistz256_mul_by_3
287
288.type	__ecp_nistz256_mul_by_3,%function
289.align	4
290__ecp_nistz256_mul_by_3:
291	str	lr,[sp,#-4]!		@ push lr
292
293	@ As multiplication by 3 is performed as 2*n+n, below are inline
294	@ copies of __ecp_nistz256_mul_by_2 and __ecp_nistz256_add, see
295	@ corresponding subroutines for details.
296
297	ldr	$a0,[$a_ptr,#0]
298	ldr	$a1,[$a_ptr,#4]
299	ldr	$a2,[$a_ptr,#8]
300	adds	$a0,$a0,$a0		@ a[0:7]+=a[0:7]
301	ldr	$a3,[$a_ptr,#12]
302	adcs	$a1,$a1,$a1
303	ldr	$a4,[$a_ptr,#16]
304	adcs	$a2,$a2,$a2
305	ldr	$a5,[$a_ptr,#20]
306	adcs	$a3,$a3,$a3
307	ldr	$a6,[$a_ptr,#24]
308	adcs	$a4,$a4,$a4
309	ldr	$a7,[$a_ptr,#28]
310	adcs	$a5,$a5,$a5
311	adcs	$a6,$a6,$a6
312	mov	$ff,#0
313	adcs	$a7,$a7,$a7
314	adc	$ff,$ff,#0
315
316	subs	$a0,$a0,#-1		@ .Lreduce_by_sub but without stores
317	sbcs	$a1,$a1,#-1
318	sbcs	$a2,$a2,#-1
319	sbcs	$a3,$a3,#0
320	sbcs	$a4,$a4,#0
321	sbcs	$a5,$a5,#0
322	sbcs	$a6,$a6,#1
323	sbcs	$a7,$a7,#-1
324	sbc	$ff,$ff,#0
325
326	adds	$a0,$a0,$ff		@ add synthesized modulus
327	adcs	$a1,$a1,$ff
328	adcs	$a2,$a2,$ff
329	adcs	$a3,$a3,#0
330	adcs	$a4,$a4,#0
331	 ldr	$b_ptr,[$a_ptr,#0]
332	adcs	$a5,$a5,#0
333	 ldr	$t1,[$a_ptr,#4]
334	adcs	$a6,$a6,$ff,lsr#31
335	 ldr	$t2,[$a_ptr,#8]
336	adc	$a7,$a7,$ff
337
338	ldr	$t0,[$a_ptr,#12]
339	adds	$a0,$a0,$b_ptr		@ 2*a[0:7]+=a[0:7]
340	ldr	$b_ptr,[$a_ptr,#16]
341	adcs	$a1,$a1,$t1
342	ldr	$t1,[$a_ptr,#20]
343	adcs	$a2,$a2,$t2
344	ldr	$t2,[$a_ptr,#24]
345	adcs	$a3,$a3,$t0
346	ldr	$t3,[$a_ptr,#28]
347	adcs	$a4,$a4,$b_ptr
348	adcs	$a5,$a5,$t1
349	adcs	$a6,$a6,$t2
350	mov	$ff,#0
351	adcs	$a7,$a7,$t3
352	adc	$ff,$ff,#0
353	ldr	lr,[sp],#4		@ pop lr
354
355	b	.Lreduce_by_sub
356.size	__ecp_nistz256_mul_by_3,.-__ecp_nistz256_mul_by_3
357
358@ void	ecp_nistz256_div_by_2(BN_ULONG r0[8],const BN_ULONG r1[8]);
359.globl	ecp_nistz256_div_by_2
360.type	ecp_nistz256_div_by_2,%function
361.align	4
362ecp_nistz256_div_by_2:
363	stmdb	sp!,{r4-r12,lr}
364	bl	__ecp_nistz256_div_by_2
365#if __ARM_ARCH__>=5 || !defined(__thumb__)
366	ldmia	sp!,{r4-r12,pc}
367#else
368	ldmia	sp!,{r4-r12,lr}
369	bx	lr			@ interoperable with Thumb ISA:-)
370#endif
371.size	ecp_nistz256_div_by_2,.-ecp_nistz256_div_by_2
372
373.type	__ecp_nistz256_div_by_2,%function
374.align	4
375__ecp_nistz256_div_by_2:
376	@ ret = (a is odd ? a+mod : a) >> 1
377
378	ldr	$a0,[$a_ptr,#0]
379	ldr	$a1,[$a_ptr,#4]
380	ldr	$a2,[$a_ptr,#8]
381	mov	$ff,$a0,lsl#31		@ place least significant bit in most
382					@ significant position; now arithmetic
383					@ right shift by 31 produces -1 or 0,
384					@ while logical right shift produces 1
385					@ or 0; this is how the modulus is
386					@ conditionally synthesized here...
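					@ (illustrative note: below, a[0:2] and
					@ a[7] get $ff asr#31, a[6] gets
					@ $ff lsr#31, so the value added is
					@ either zero or exactly the modulus,
					@ depending on bit 0 of a[0])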
387	ldr	$a3,[$a_ptr,#12]
388	adds	$a0,$a0,$ff,asr#31
389	ldr	$a4,[$a_ptr,#16]
390	adcs	$a1,$a1,$ff,asr#31
391	ldr	$a5,[$a_ptr,#20]
392	adcs	$a2,$a2,$ff,asr#31
393	ldr	$a6,[$a_ptr,#24]
394	adcs	$a3,$a3,#0
395	ldr	$a7,[$a_ptr,#28]
396	adcs	$a4,$a4,#0
397	 mov	$a0,$a0,lsr#1		@ a[0:7]>>=1, we can start early
398					@ because it doesn't affect flags
399	adcs	$a5,$a5,#0
400	 orr	$a0,$a0,$a1,lsl#31
401	adcs	$a6,$a6,$ff,lsr#31
402	mov	$b_ptr,#0
403	adcs	$a7,$a7,$ff,asr#31
404	 mov	$a1,$a1,lsr#1
405	adc	$b_ptr,$b_ptr,#0	@ top-most carry bit from addition
406
407	orr	$a1,$a1,$a2,lsl#31
408	mov	$a2,$a2,lsr#1
409	str	$a0,[$r_ptr,#0]
410	orr	$a2,$a2,$a3,lsl#31
411	mov	$a3,$a3,lsr#1
412	str	$a1,[$r_ptr,#4]
413	orr	$a3,$a3,$a4,lsl#31
414	mov	$a4,$a4,lsr#1
415	str	$a2,[$r_ptr,#8]
416	orr	$a4,$a4,$a5,lsl#31
417	mov	$a5,$a5,lsr#1
418	str	$a3,[$r_ptr,#12]
419	orr	$a5,$a5,$a6,lsl#31
420	mov	$a6,$a6,lsr#1
421	str	$a4,[$r_ptr,#16]
422	orr	$a6,$a6,$a7,lsl#31
423	mov	$a7,$a7,lsr#1
424	str	$a5,[$r_ptr,#20]
425	orr	$a7,$a7,$b_ptr,lsl#31	@ don't forget the top-most carry bit
426	str	$a6,[$r_ptr,#24]
427	str	$a7,[$r_ptr,#28]
428
429	mov	pc,lr
430.size	__ecp_nistz256_div_by_2,.-__ecp_nistz256_div_by_2
431
432@ void	ecp_nistz256_sub(BN_ULONG r0[8],const BN_ULONG r1[8],
433@				        const BN_ULONG r2[8]);
434.globl	ecp_nistz256_sub
435.type	ecp_nistz256_sub,%function
436.align	4
437ecp_nistz256_sub:
438	stmdb	sp!,{r4-r12,lr}
439	bl	__ecp_nistz256_sub
440#if __ARM_ARCH__>=5 || !defined(__thumb__)
441	ldmia	sp!,{r4-r12,pc}
442#else
443	ldmia	sp!,{r4-r12,lr}
444	bx	lr			@ interoperable with Thumb ISA:-)
445#endif
446.size	ecp_nistz256_sub,.-ecp_nistz256_sub
447
448.type	__ecp_nistz256_sub,%function
449.align	4
450__ecp_nistz256_sub:
451	str	lr,[sp,#-4]!		@ push lr
452
453	ldr	$a0,[$a_ptr,#0]
454	ldr	$a1,[$a_ptr,#4]
455	ldr	$a2,[$a_ptr,#8]
456	ldr	$a3,[$a_ptr,#12]
457	ldr	$a4,[$a_ptr,#16]
458	 ldr	$t0,[$b_ptr,#0]
459	ldr	$a5,[$a_ptr,#20]
460	 ldr	$t1,[$b_ptr,#4]
461	ldr	$a6,[$a_ptr,#24]
462	 ldr	$t2,[$b_ptr,#8]
463	ldr	$a7,[$a_ptr,#28]
464	 ldr	$t3,[$b_ptr,#12]
465	subs	$a0,$a0,$t0
466	 ldr	$t0,[$b_ptr,#16]
467	sbcs	$a1,$a1,$t1
468	 ldr	$t1,[$b_ptr,#20]
469	sbcs	$a2,$a2,$t2
470	 ldr	$t2,[$b_ptr,#24]
471	sbcs	$a3,$a3,$t3
472	 ldr	$t3,[$b_ptr,#28]
473	sbcs	$a4,$a4,$t0
474	sbcs	$a5,$a5,$t1
475	sbcs	$a6,$a6,$t2
476	sbcs	$a7,$a7,$t3
477	sbc	$ff,$ff,$ff		@ broadcast borrow bit
478	ldr	lr,[sp],#4		@ pop lr
479
480.Lreduce_by_add:
481
482	@ if a-b borrows, add modulus.
483	@
484	@ Note that because mod has special form, i.e. consists of
485	@ 0xffffffff, 1 and 0s, we can conditionally synthesize it by
486	@ broadcasting borrow bit to a register, $ff, and using it as
487	@ a whole or extracting single bit.
488
489	adds	$a0,$a0,$ff		@ add synthesized modulus
490	adcs	$a1,$a1,$ff
491	str	$a0,[$r_ptr,#0]
492	adcs	$a2,$a2,$ff
493	str	$a1,[$r_ptr,#4]
494	adcs	$a3,$a3,#0
495	str	$a2,[$r_ptr,#8]
496	adcs	$a4,$a4,#0
497	str	$a3,[$r_ptr,#12]
498	adcs	$a5,$a5,#0
499	str	$a4,[$r_ptr,#16]
500	adcs	$a6,$a6,$ff,lsr#31
501	str	$a5,[$r_ptr,#20]
502	adcs	$a7,$a7,$ff
503	str	$a6,[$r_ptr,#24]
504	str	$a7,[$r_ptr,#28]
505
506	mov	pc,lr
507.size	__ecp_nistz256_sub,.-__ecp_nistz256_sub
508
509@ void	ecp_nistz256_neg(BN_ULONG r0[8],const BN_ULONG r1[8]);
510.globl	ecp_nistz256_neg
511.type	ecp_nistz256_neg,%function
512.align	4
513ecp_nistz256_neg:
514	stmdb	sp!,{r4-r12,lr}
515	bl	__ecp_nistz256_neg
516#if __ARM_ARCH__>=5 || !defined(__thumb__)
517	ldmia	sp!,{r4-r12,pc}
518#else
519	ldmia	sp!,{r4-r12,lr}
520	bx	lr			@ interoperable with Thumb ISA:-)
521#endif
522.size	ecp_nistz256_neg,.-ecp_nistz256_neg
523
524.type	__ecp_nistz256_neg,%function
525.align	4
526__ecp_nistz256_neg:
527	ldr	$a0,[$a_ptr,#0]
528	eor	$ff,$ff,$ff
529	ldr	$a1,[$a_ptr,#4]
530	ldr	$a2,[$a_ptr,#8]
531	subs	$a0,$ff,$a0
532	ldr	$a3,[$a_ptr,#12]
533	sbcs	$a1,$ff,$a1
534	ldr	$a4,[$a_ptr,#16]
535	sbcs	$a2,$ff,$a2
536	ldr	$a5,[$a_ptr,#20]
537	sbcs	$a3,$ff,$a3
538	ldr	$a6,[$a_ptr,#24]
539	sbcs	$a4,$ff,$a4
540	ldr	$a7,[$a_ptr,#28]
541	sbcs	$a5,$ff,$a5
542	sbcs	$a6,$ff,$a6
543	sbcs	$a7,$ff,$a7
544	sbc	$ff,$ff,$ff
545
546	b	.Lreduce_by_add
547.size	__ecp_nistz256_neg,.-__ecp_nistz256_neg
548___
549{
550my @acc=map("r$_",(3..11));
551my ($t0,$t1,$bj,$t2,$t3)=map("r$_",(0,1,2,12,14));
552
553$code.=<<___;
554@ void	ecp_nistz256_sqr_mont(BN_ULONG r0[8],const BN_ULONG r1[8]);
555.globl	ecp_nistz256_sqr_mont
556.type	ecp_nistz256_sqr_mont,%function
557.align	4
558ecp_nistz256_sqr_mont:
559	mov	$b_ptr,$a_ptr
560	b	.Lecp_nistz256_mul_mont
561.size	ecp_nistz256_sqr_mont,.-ecp_nistz256_sqr_mont
562
563@ void	ecp_nistz256_mul_mont(BN_ULONG r0[8],const BN_ULONG r1[8],
564@					     const BN_ULONG r2[8]);
565.globl	ecp_nistz256_mul_mont
566.type	ecp_nistz256_mul_mont,%function
567.align	4
568ecp_nistz256_mul_mont:
569.Lecp_nistz256_mul_mont:
570	stmdb	sp!,{r4-r12,lr}
571	bl	__ecp_nistz256_mul_mont
572#if __ARM_ARCH__>=5 || !defined(__thumb__)
573	ldmia	sp!,{r4-r12,pc}
574#else
575	ldmia	sp!,{r4-r12,lr}
576	bx	lr			@ interoperable with Thumb ISA:-)
577#endif
578.size	ecp_nistz256_mul_mont,.-ecp_nistz256_mul_mont
579
580.type	__ecp_nistz256_mul_mont,%function
581.align	4
582__ecp_nistz256_mul_mont:
583	stmdb	sp!,{r0-r2,lr}			@ make a copy of arguments too
584
585	ldr	$bj,[$b_ptr,#0]			@ b[0]
586	ldmia	$a_ptr,{@acc[1]-@acc[8]}
587
588	umull	@acc[0],$t3,@acc[1],$bj		@ r[0]=a[0]*b[0]
589	stmdb	sp!,{$acc[1]-@acc[8]}		@ copy a[0-7] to stack, so
590						@ that it can be addressed
591						@ without spending register
592						@ on address
593	umull	@acc[1],$t0,@acc[2],$bj		@ r[1]=a[1]*b[0]
594	umull	@acc[2],$t1,@acc[3],$bj
595	adds	@acc[1],@acc[1],$t3		@ accumulate high part of mult
596	umull	@acc[3],$t2,@acc[4],$bj
597	adcs	@acc[2],@acc[2],$t0
598	umull	@acc[4],$t3,@acc[5],$bj
599	adcs	@acc[3],@acc[3],$t1
600	umull	@acc[5],$t0,@acc[6],$bj
601	adcs	@acc[4],@acc[4],$t2
602	umull	@acc[6],$t1,@acc[7],$bj
603	adcs	@acc[5],@acc[5],$t3
604	umull	@acc[7],$t2,@acc[8],$bj
605	adcs	@acc[6],@acc[6],$t0
606	adcs	@acc[7],@acc[7],$t1
607	eor	$t3,$t3,$t3			@ first overflow bit is zero
608	adc	@acc[8],$t2,#0
609___
610for(my $i=1;$i<8;$i++) {
611my $t4=@acc[0];
612
613	# Reduction iteration is normally performed by accumulating
614	# result of multiplication of modulus by "magic" digit [and
615	# omitting least significant word, which is guaranteed to
616	# be 0], but thanks to special form of modulus and "magic"
617	# digit being equal to least significant word, it can be
618	# performed with additions and subtractions alone. Indeed:
619	#
620	#        ffff.0001.0000.0000.0000.ffff.ffff.ffff
621	# *                                         abcd
622	# + xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.abcd
623	#
624	# Now observing that ff..ff*x = (2^n-1)*x = 2^n*x-x, we
625	# rewrite above as:
626	#
627	#   xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.abcd
628	# + abcd.0000.abcd.0000.0000.abcd.0000.0000.0000
629	# -      abcd.0000.0000.0000.0000.0000.0000.abcd
630	#
631	# or marking redundant operations:
632	#
633	#   xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.----
634	# + abcd.0000.abcd.0000.0000.abcd.----.----.----
635	# -      abcd.----.----.----.----.----.----.----
636
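	#
	# For illustration only (a commented-out sketch, not part of the
	# generated code): verifying the identity drawn above with
	# Math::BigInt, using an arbitrary example accumulator value:
	#
	#	use Math::BigInt;
	#	my $p   = Math::BigInt->new("0xffffffff00000001000000000000000000000000ffffffffffffffffffffffff");
	#	my $acc = (Math::BigInt->new(1) << 260) + 0xdeadbeef;	# example accumulator
	#	my $d   = $acc % (Math::BigInt->new(1) << 32);		# "magic" digit = least significant word
	#	my $ref = $acc + $d * $p;				# textbook reduction step, low word becomes 0
	#	my $alt = $acc + ($d << 256) + ($d << 192) + ($d << 96)
	#	               - ($d << 224) - $d;			# the add/sub pattern drawn above
	#	print "same\n" if $ref == $alt;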
637$code.=<<___;
638	@ multiplication-less reduction $i
639	adds	@acc[3],@acc[3],@acc[0]		@ r[3]+=r[0]
640	 ldr	$bj,[sp,#40]			@ restore b_ptr
641	adcs	@acc[4],@acc[4],#0		@ r[4]+=0
642	adcs	@acc[5],@acc[5],#0		@ r[5]+=0
643	adcs	@acc[6],@acc[6],@acc[0]		@ r[6]+=r[0]
644	 ldr	$t1,[sp,#0]			@ load a[0]
645	adcs	@acc[7],@acc[7],#0		@ r[7]+=0
646	 ldr	$bj,[$bj,#4*$i]			@ load b[i]
647	adcs	@acc[8],@acc[8],@acc[0]		@ r[8]+=r[0]
648	 eor	$t0,$t0,$t0
649	adc	$t3,$t3,#0			@ overflow bit
650	subs	@acc[7],@acc[7],@acc[0]		@ r[7]-=r[0]
651	 ldr	$t2,[sp,#4]			@ a[1]
652	sbcs	@acc[8],@acc[8],#0		@ r[8]-=0
653	 umlal	@acc[1],$t0,$t1,$bj		@ "r[0]"+=a[0]*b[i]
654	 eor	$t1,$t1,$t1
655	sbc	@acc[0],$t3,#0			@ overflow bit, keep in mind
656						@ that the net result is
657						@ addition of a value which
658						@ makes underflow impossible
659
660	ldr	$t3,[sp,#8]			@ a[2]
661	umlal	@acc[2],$t1,$t2,$bj		@ "r[1]"+=a[1]*b[i]
662	 str	@acc[0],[sp,#36]		@ temporarily offload overflow
663	eor	$t2,$t2,$t2
664	ldr	$t4,[sp,#12]			@ a[3], $t4 is alias @acc[0]
665	umlal	@acc[3],$t2,$t3,$bj		@ "r[2]"+=a[2]*b[i]
666	eor	$t3,$t3,$t3
667	adds	@acc[2],@acc[2],$t0		@ accumulate high part of mult
668	ldr	$t0,[sp,#16]			@ a[4]
669	umlal	@acc[4],$t3,$t4,$bj		@ "r[3]"+=a[3]*b[i]
670	eor	$t4,$t4,$t4
671	adcs	@acc[3],@acc[3],$t1
672	ldr	$t1,[sp,#20]			@ a[5]
673	umlal	@acc[5],$t4,$t0,$bj		@ "r[4]"+=a[4]*b[i]
674	eor	$t0,$t0,$t0
675	adcs	@acc[4],@acc[4],$t2
676	ldr	$t2,[sp,#24]			@ a[6]
677	umlal	@acc[6],$t0,$t1,$bj		@ "r[5]"+=a[5]*b[i]
678	eor	$t1,$t1,$t1
679	adcs	@acc[5],@acc[5],$t3
680	ldr	$t3,[sp,#28]			@ a[7]
681	umlal	@acc[7],$t1,$t2,$bj		@ "r[6]"+=a[6]*b[i]
682	eor	$t2,$t2,$t2
683	adcs	@acc[6],@acc[6],$t4
684	 ldr	@acc[0],[sp,#36]		@ restore overflow bit
685	umlal	@acc[8],$t2,$t3,$bj		@ "r[7]"+=a[7]*b[i]
686	eor	$t3,$t3,$t3
687	adcs	@acc[7],@acc[7],$t0
688	adcs	@acc[8],@acc[8],$t1
689	adcs	@acc[0],$acc[0],$t2
690	adc	$t3,$t3,#0			@ new overflow bit
691___
692	push(@acc,shift(@acc));			# rotate registers, so that
693						# "r[i]" becomes r[i]
694}
695$code.=<<___;
696	@ last multiplication-less reduction
697	adds	@acc[3],@acc[3],@acc[0]
698	ldr	$r_ptr,[sp,#32]			@ restore r_ptr
699	adcs	@acc[4],@acc[4],#0
700	adcs	@acc[5],@acc[5],#0
701	adcs	@acc[6],@acc[6],@acc[0]
702	adcs	@acc[7],@acc[7],#0
703	adcs	@acc[8],@acc[8],@acc[0]
704	adc	$t3,$t3,#0
705	subs	@acc[7],@acc[7],@acc[0]
706	sbcs	@acc[8],@acc[8],#0
707	sbc	@acc[0],$t3,#0			@ overflow bit
708
709	@ Final step is "if result > mod, subtract mod", but we do it
710	@ "other way around", namely subtract modulus from result
711	@ and if it borrowed, add modulus back.
712
713	adds	@acc[1],@acc[1],#1		@ subs	@acc[1],@acc[1],#-1
714	adcs	@acc[2],@acc[2],#0		@ sbcs	@acc[2],@acc[2],#-1
715	adcs	@acc[3],@acc[3],#0		@ sbcs	@acc[3],@acc[3],#-1
716	sbcs	@acc[4],@acc[4],#0
717	sbcs	@acc[5],@acc[5],#0
718	sbcs	@acc[6],@acc[6],#0
719	sbcs	@acc[7],@acc[7],#1
720	adcs	@acc[8],@acc[8],#0		@ sbcs	@acc[8],@acc[8],#-1
721	ldr	lr,[sp,#44]			@ restore lr
722	sbc	@acc[0],@acc[0],#0		@ broadcast borrow bit
723	add	sp,sp,#48
724
725	@ Note that because mod has special form, i.e. consists of
726	@ 0xffffffff, 1 and 0s, we can conditionally synthesize it by
727	@ broadcasting borrow bit to a register, @acc[0], and using it as
728	@ a whole or extracting single bit.
729
730	adds	@acc[1],@acc[1],@acc[0]		@ add modulus or zero
731	adcs	@acc[2],@acc[2],@acc[0]
732	str	@acc[1],[$r_ptr,#0]
733	adcs	@acc[3],@acc[3],@acc[0]
734	str	@acc[2],[$r_ptr,#4]
735	adcs	@acc[4],@acc[4],#0
736	str	@acc[3],[$r_ptr,#8]
737	adcs	@acc[5],@acc[5],#0
738	str	@acc[4],[$r_ptr,#12]
739	adcs	@acc[6],@acc[6],#0
740	str	@acc[5],[$r_ptr,#16]
741	adcs	@acc[7],@acc[7],@acc[0],lsr#31
742	str	@acc[6],[$r_ptr,#20]
743	adc	@acc[8],@acc[8],@acc[0]
744	str	@acc[7],[$r_ptr,#24]
745	str	@acc[8],[$r_ptr,#28]
746
747	mov	pc,lr
748.size	__ecp_nistz256_mul_mont,.-__ecp_nistz256_mul_mont
749___
750}
751
752{
753my ($out,$inp,$index,$mask)=map("r$_",(0..3));
754$code.=<<___;
755@ void	ecp_nistz256_scatter_w5(void *r0,const P256_POINT *r1,
756@					 int r2);
757.globl	ecp_nistz256_scatter_w5
758.type	ecp_nistz256_scatter_w5,%function
759.align	5
760ecp_nistz256_scatter_w5:
761	stmdb	sp!,{r4-r11}
762
763	add	$out,$out,$index,lsl#2
764
765	ldmia	$inp!,{r4-r11}		@ X
766	str	r4,[$out,#64*0-4]
767	str	r5,[$out,#64*1-4]
768	str	r6,[$out,#64*2-4]
769	str	r7,[$out,#64*3-4]
770	str	r8,[$out,#64*4-4]
771	str	r9,[$out,#64*5-4]
772	str	r10,[$out,#64*6-4]
773	str	r11,[$out,#64*7-4]
774	add	$out,$out,#64*8
775
776	ldmia	$inp!,{r4-r11}		@ Y
777	str	r4,[$out,#64*0-4]
778	str	r5,[$out,#64*1-4]
779	str	r6,[$out,#64*2-4]
780	str	r7,[$out,#64*3-4]
781	str	r8,[$out,#64*4-4]
782	str	r9,[$out,#64*5-4]
783	str	r10,[$out,#64*6-4]
784	str	r11,[$out,#64*7-4]
785	add	$out,$out,#64*8
786
787	ldmia	$inp,{r4-r11}		@ Z
788	str	r4,[$out,#64*0-4]
789	str	r5,[$out,#64*1-4]
790	str	r6,[$out,#64*2-4]
791	str	r7,[$out,#64*3-4]
792	str	r8,[$out,#64*4-4]
793	str	r9,[$out,#64*5-4]
794	str	r10,[$out,#64*6-4]
795	str	r11,[$out,#64*7-4]
796
797	ldmia	sp!,{r4-r11}
798#if __ARM_ARCH__>=5 || defined(__thumb__)
799	bx	lr
800#else
801	mov	pc,lr
802#endif
803.size	ecp_nistz256_scatter_w5,.-ecp_nistz256_scatter_w5
804
805@ void	ecp_nistz256_gather_w5(P256_POINT *r0,const void *r1,
806@					      int r2);
807.globl	ecp_nistz256_gather_w5
808.type	ecp_nistz256_gather_w5,%function
809.align	5
810ecp_nistz256_gather_w5:
811	stmdb	sp!,{r4-r11}
812
813	cmp	$index,#0
814	mov	$mask,#0
815#ifdef	__thumb2__
816	itt	ne
817#endif
818	subne	$index,$index,#1
819	movne	$mask,#-1
820	add	$inp,$inp,$index,lsl#2
821
822	ldr	r4,[$inp,#64*0]
823	ldr	r5,[$inp,#64*1]
824	ldr	r6,[$inp,#64*2]
825	and	r4,r4,$mask
826	ldr	r7,[$inp,#64*3]
827	and	r5,r5,$mask
828	ldr	r8,[$inp,#64*4]
829	and	r6,r6,$mask
830	ldr	r9,[$inp,#64*5]
831	and	r7,r7,$mask
832	ldr	r10,[$inp,#64*6]
833	and	r8,r8,$mask
834	ldr	r11,[$inp,#64*7]
835	add	$inp,$inp,#64*8
836	and	r9,r9,$mask
837	and	r10,r10,$mask
838	and	r11,r11,$mask
839	stmia	$out!,{r4-r11}	@ X
840
841	ldr	r4,[$inp,#64*0]
842	ldr	r5,[$inp,#64*1]
843	ldr	r6,[$inp,#64*2]
844	and	r4,r4,$mask
845	ldr	r7,[$inp,#64*3]
846	and	r5,r5,$mask
847	ldr	r8,[$inp,#64*4]
848	and	r6,r6,$mask
849	ldr	r9,[$inp,#64*5]
850	and	r7,r7,$mask
851	ldr	r10,[$inp,#64*6]
852	and	r8,r8,$mask
853	ldr	r11,[$inp,#64*7]
854	add	$inp,$inp,#64*8
855	and	r9,r9,$mask
856	and	r10,r10,$mask
857	and	r11,r11,$mask
858	stmia	$out!,{r4-r11}	@ Y
859
860	ldr	r4,[$inp,#64*0]
861	ldr	r5,[$inp,#64*1]
862	ldr	r6,[$inp,#64*2]
863	and	r4,r4,$mask
864	ldr	r7,[$inp,#64*3]
865	and	r5,r5,$mask
866	ldr	r8,[$inp,#64*4]
867	and	r6,r6,$mask
868	ldr	r9,[$inp,#64*5]
869	and	r7,r7,$mask
870	ldr	r10,[$inp,#64*6]
871	and	r8,r8,$mask
872	ldr	r11,[$inp,#64*7]
873	and	r9,r9,$mask
874	and	r10,r10,$mask
875	and	r11,r11,$mask
876	stmia	$out,{r4-r11}		@ Z
877
878	ldmia	sp!,{r4-r11}
879#if __ARM_ARCH__>=5 || defined(__thumb__)
880	bx	lr
881#else
882	mov	pc,lr
883#endif
884.size	ecp_nistz256_gather_w5,.-ecp_nistz256_gather_w5
885
886@ void	ecp_nistz256_scatter_w7(void *r0,const P256_POINT_AFFINE *r1,
887@					 int r2);
888.globl	ecp_nistz256_scatter_w7
889.type	ecp_nistz256_scatter_w7,%function
890.align	5
891ecp_nistz256_scatter_w7:
892	add	$out,$out,$index
893	mov	$index,#64/4
894.Loop_scatter_w7:
895	ldr	$mask,[$inp],#4
896	subs	$index,$index,#1
897	strb	$mask,[$out,#64*0]
898	mov	$mask,$mask,lsr#8
899	strb	$mask,[$out,#64*1]
900	mov	$mask,$mask,lsr#8
901	strb	$mask,[$out,#64*2]
902	mov	$mask,$mask,lsr#8
903	strb	$mask,[$out,#64*3]
904	add	$out,$out,#64*4
905	bne	.Loop_scatter_w7
906
907#if __ARM_ARCH__>=5 || defined(__thumb__)
908	bx	lr
909#else
910	mov	pc,lr
911#endif
912.size	ecp_nistz256_scatter_w7,.-ecp_nistz256_scatter_w7
913
914@ void	ecp_nistz256_gather_w7(P256_POINT_AFFINE *r0,const void *r1,
915@						     int r2);
916.globl	ecp_nistz256_gather_w7
917.type	ecp_nistz256_gather_w7,%function
918.align	5
919ecp_nistz256_gather_w7:
920	stmdb	sp!,{r4-r7}
921
922	cmp	$index,#0
923	mov	$mask,#0
924#ifdef	__thumb2__
925	itt	ne
926#endif
927	subne	$index,$index,#1
928	movne	$mask,#-1
929	add	$inp,$inp,$index
930	mov	$index,#64/4
931	nop
932.Loop_gather_w7:
933	ldrb	r4,[$inp,#64*0]
934	subs	$index,$index,#1
935	ldrb	r5,[$inp,#64*1]
936	ldrb	r6,[$inp,#64*2]
937	ldrb	r7,[$inp,#64*3]
938	add	$inp,$inp,#64*4
939	orr	r4,r4,r5,lsl#8
940	orr	r4,r4,r6,lsl#16
941	orr	r4,r4,r7,lsl#24
942	and	r4,r4,$mask
943	str	r4,[$out],#4
944	bne	.Loop_gather_w7
945
946	ldmia	sp!,{r4-r7}
947#if __ARM_ARCH__>=5 || defined(__thumb__)
948	bx	lr
949#else
950	mov	pc,lr
951#endif
952.size	ecp_nistz256_gather_w7,.-ecp_nistz256_gather_w7
953___
954}
955if (0) {
956# In comparison to integer-only equivalent of below subroutine:
957#
958# Cortex-A8	+10%
959# Cortex-A9	-10%
960# Snapdragon S4	+5%
961#
962# As not all time is spent in multiplication, overall impact is deemed
963# too low to care about.
964
965my ($A0,$A1,$A2,$A3,$Bi,$zero,$temp)=map("d$_",(0..7));
966my $mask="q4";
967my $mult="q5";
968my @AxB=map("q$_",(8..15));
969
970my ($rptr,$aptr,$bptr,$toutptr)=map("r$_",(0..3));
971
972$code.=<<___;
973#if __ARM_ARCH__>=7
974.fpu	neon
975
976.globl	ecp_nistz256_mul_mont_neon
977.type	ecp_nistz256_mul_mont_neon,%function
978.align	5
979ecp_nistz256_mul_mont_neon:
980	mov	ip,sp
981	stmdb	sp!,{r4-r9}
982	vstmdb	sp!,{q4-q5}		@ ABI specification says so
983
984	sub		$toutptr,sp,#40
985	vld1.32		{${Bi}[0]},[$bptr,:32]!
986	veor		$zero,$zero,$zero
987	vld1.32		{$A0-$A3}, [$aptr]		@ can't specify :32 :-(
988	vzip.16		$Bi,$zero
989	mov		sp,$toutptr			@ alloca
990	vmov.i64	$mask,#0xffff
991
992	vmull.u32	@AxB[0],$Bi,${A0}[0]
993	vmull.u32	@AxB[1],$Bi,${A0}[1]
994	vmull.u32	@AxB[2],$Bi,${A1}[0]
995	vmull.u32	@AxB[3],$Bi,${A1}[1]
996	 vshr.u64	$temp,@AxB[0]#lo,#16
997	vmull.u32	@AxB[4],$Bi,${A2}[0]
998	 vadd.u64	@AxB[0]#hi,@AxB[0]#hi,$temp
999	vmull.u32	@AxB[5],$Bi,${A2}[1]
1000	 vshr.u64	$temp,@AxB[0]#hi,#16		@ upper 32 bits of a[0]*b[0]
1001	vmull.u32	@AxB[6],$Bi,${A3}[0]
1002	 vand.u64	@AxB[0],@AxB[0],$mask		@ lower 32 bits of a[0]*b[0]
1003	vmull.u32	@AxB[7],$Bi,${A3}[1]
1004___
1005for($i=1;$i<8;$i++) {
1006$code.=<<___;
1007	 vld1.32	{${Bi}[0]},[$bptr,:32]!
1008	 veor		$zero,$zero,$zero
1009	vadd.u64	@AxB[1]#lo,@AxB[1]#lo,$temp	@ reduction
1010	vshl.u64	$mult,@AxB[0],#32
1011	vadd.u64	@AxB[3],@AxB[3],@AxB[0]
1012	vsub.u64	$mult,$mult,@AxB[0]
1013	 vzip.16	$Bi,$zero
1014	vadd.u64	@AxB[6],@AxB[6],@AxB[0]
1015	vadd.u64	@AxB[7],@AxB[7],$mult
1016___
1017	push(@AxB,shift(@AxB));
1018$code.=<<___;
1019	vmlal.u32	@AxB[0],$Bi,${A0}[0]
1020	vmlal.u32	@AxB[1],$Bi,${A0}[1]
1021	vmlal.u32	@AxB[2],$Bi,${A1}[0]
1022	vmlal.u32	@AxB[3],$Bi,${A1}[1]
1023	 vshr.u64	$temp,@AxB[0]#lo,#16
1024	vmlal.u32	@AxB[4],$Bi,${A2}[0]
1025	 vadd.u64	@AxB[0]#hi,@AxB[0]#hi,$temp
1026	vmlal.u32	@AxB[5],$Bi,${A2}[1]
1027	 vshr.u64	$temp,@AxB[0]#hi,#16		@ upper 33 bits of a[0]*b[i]+t[0]
1028	vmlal.u32	@AxB[6],$Bi,${A3}[0]
1029	 vand.u64	@AxB[0],@AxB[0],$mask		@ lower 32 bits of a[0]*b[0]
1030	vmull.u32	@AxB[7],$Bi,${A3}[1]
1031___
1032}
1033$code.=<<___;
1034	vadd.u64	@AxB[1]#lo,@AxB[1]#lo,$temp	@ last reduction
1035	vshl.u64	$mult,@AxB[0],#32
1036	vadd.u64	@AxB[3],@AxB[3],@AxB[0]
1037	vsub.u64	$mult,$mult,@AxB[0]
1038	vadd.u64	@AxB[6],@AxB[6],@AxB[0]
1039	vadd.u64	@AxB[7],@AxB[7],$mult
1040
1041	vshr.u64	$temp,@AxB[1]#lo,#16		@ convert
1042	vadd.u64	@AxB[1]#hi,@AxB[1]#hi,$temp
1043	vshr.u64	$temp,@AxB[1]#hi,#16
1044	vzip.16		@AxB[1]#lo,@AxB[1]#hi
1045___
1046foreach (2..7) {
1047$code.=<<___;
1048	vadd.u64	@AxB[$_]#lo,@AxB[$_]#lo,$temp
1049	vst1.32		{@AxB[$_-1]#lo[0]},[$toutptr,:32]!
1050	vshr.u64	$temp,@AxB[$_]#lo,#16
1051	vadd.u64	@AxB[$_]#hi,@AxB[$_]#hi,$temp
1052	vshr.u64	$temp,@AxB[$_]#hi,#16
1053	vzip.16		@AxB[$_]#lo,@AxB[$_]#hi
1054___
1055}
1056$code.=<<___;
1057	vst1.32		{@AxB[7]#lo[0]},[$toutptr,:32]!
1058	vst1.32		{$temp},[$toutptr]		@ upper 33 bits
1059
1060	ldr	r1,[sp,#0]
1061	ldr	r2,[sp,#4]
1062	ldr	r3,[sp,#8]
1063	subs	r1,r1,#-1
1064	ldr	r4,[sp,#12]
1065	sbcs	r2,r2,#-1
1066	ldr	r5,[sp,#16]
1067	sbcs	r3,r3,#-1
1068	ldr	r6,[sp,#20]
1069	sbcs	r4,r4,#0
1070	ldr	r7,[sp,#24]
1071	sbcs	r5,r5,#0
1072	ldr	r8,[sp,#28]
1073	sbcs	r6,r6,#0
1074	ldr	r9,[sp,#32]				@ top-most bit
1075	sbcs	r7,r7,#1
1076	sub	sp,ip,#40+16
1077	sbcs	r8,r8,#-1
1078	sbc	r9,r9,#0
1079        vldmia  sp!,{q4-q5}
1080
1081	adds	r1,r1,r9
1082	adcs	r2,r2,r9
1083	str	r1,[$rptr,#0]
1084	adcs	r3,r3,r9
1085	str	r2,[$rptr,#4]
1086	adcs	r4,r4,#0
1087	str	r3,[$rptr,#8]
1088	adcs	r5,r5,#0
1089	str	r4,[$rptr,#12]
1090	adcs	r6,r6,#0
1091	str	r5,[$rptr,#16]
1092	adcs	r7,r7,r9,lsr#31
1093	str	r6,[$rptr,#20]
1094	adcs	r8,r8,r9
1095	str	r7,[$rptr,#24]
1096	str	r8,[$rptr,#28]
1097
1098        ldmia   sp!,{r4-r9}
1099	bx	lr
1100.size	ecp_nistz256_mul_mont_neon,.-ecp_nistz256_mul_mont_neon
1101#endif
1102___
1103}
1104
1105{{{
1106########################################################################
1107# Below $aN assignment matches order in which 256-bit result appears in
1108# register bank at return from __ecp_nistz256_mul_mont, so that we can
1109# skip over reloading it from memory. This means that below functions
1110# use custom calling sequence accepting 256-bit input in registers,
1111# output pointer in r0, $r_ptr, and optional pointer in r2, $b_ptr.
1112#
1113# See their "normal" counterparts for insights on calculations.
1114
1115my ($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7,
1116    $t0,$t1,$t2,$t3)=map("r$_",(11,3..10,12,14,1));
1117my $ff=$b_ptr;
1118
1119$code.=<<___;
1120.type	__ecp_nistz256_sub_from,%function
1121.align	5
1122__ecp_nistz256_sub_from:
1123	str	lr,[sp,#-4]!		@ push lr
1124
1125	 ldr	$t0,[$b_ptr,#0]
1126	 ldr	$t1,[$b_ptr,#4]
1127	 ldr	$t2,[$b_ptr,#8]
1128	 ldr	$t3,[$b_ptr,#12]
1129	subs	$a0,$a0,$t0
1130	 ldr	$t0,[$b_ptr,#16]
1131	sbcs	$a1,$a1,$t1
1132	 ldr	$t1,[$b_ptr,#20]
1133	sbcs	$a2,$a2,$t2
1134	 ldr	$t2,[$b_ptr,#24]
1135	sbcs	$a3,$a3,$t3
1136	 ldr	$t3,[$b_ptr,#28]
1137	sbcs	$a4,$a4,$t0
1138	sbcs	$a5,$a5,$t1
1139	sbcs	$a6,$a6,$t2
1140	sbcs	$a7,$a7,$t3
1141	sbc	$ff,$ff,$ff		@ broadcast borrow bit
1142	ldr	lr,[sp],#4		@ pop lr
1143
1144	adds	$a0,$a0,$ff		@ add synthesized modulus
1145	adcs	$a1,$a1,$ff
1146	str	$a0,[$r_ptr,#0]
1147	adcs	$a2,$a2,$ff
1148	str	$a1,[$r_ptr,#4]
1149	adcs	$a3,$a3,#0
1150	str	$a2,[$r_ptr,#8]
1151	adcs	$a4,$a4,#0
1152	str	$a3,[$r_ptr,#12]
1153	adcs	$a5,$a5,#0
1154	str	$a4,[$r_ptr,#16]
1155	adcs	$a6,$a6,$ff,lsr#31
1156	str	$a5,[$r_ptr,#20]
1157	adcs	$a7,$a7,$ff
1158	str	$a6,[$r_ptr,#24]
1159	str	$a7,[$r_ptr,#28]
1160
1161	mov	pc,lr
1162.size	__ecp_nistz256_sub_from,.-__ecp_nistz256_sub_from
1163
1164.type	__ecp_nistz256_sub_morf,%function
1165.align	5
1166__ecp_nistz256_sub_morf:
1167	str	lr,[sp,#-4]!		@ push lr
1168
1169	 ldr	$t0,[$b_ptr,#0]
1170	 ldr	$t1,[$b_ptr,#4]
1171	 ldr	$t2,[$b_ptr,#8]
1172	 ldr	$t3,[$b_ptr,#12]
1173	subs	$a0,$t0,$a0
1174	 ldr	$t0,[$b_ptr,#16]
1175	sbcs	$a1,$t1,$a1
1176	 ldr	$t1,[$b_ptr,#20]
1177	sbcs	$a2,$t2,$a2
1178	 ldr	$t2,[$b_ptr,#24]
1179	sbcs	$a3,$t3,$a3
1180	 ldr	$t3,[$b_ptr,#28]
1181	sbcs	$a4,$t0,$a4
1182	sbcs	$a5,$t1,$a5
1183	sbcs	$a6,$t2,$a6
1184	sbcs	$a7,$t3,$a7
1185	sbc	$ff,$ff,$ff		@ broadcast borrow bit
1186	ldr	lr,[sp],#4		@ pop lr
1187
1188	adds	$a0,$a0,$ff		@ add synthesized modulus
1189	adcs	$a1,$a1,$ff
1190	str	$a0,[$r_ptr,#0]
1191	adcs	$a2,$a2,$ff
1192	str	$a1,[$r_ptr,#4]
1193	adcs	$a3,$a3,#0
1194	str	$a2,[$r_ptr,#8]
1195	adcs	$a4,$a4,#0
1196	str	$a3,[$r_ptr,#12]
1197	adcs	$a5,$a5,#0
1198	str	$a4,[$r_ptr,#16]
1199	adcs	$a6,$a6,$ff,lsr#31
1200	str	$a5,[$r_ptr,#20]
1201	adcs	$a7,$a7,$ff
1202	str	$a6,[$r_ptr,#24]
1203	str	$a7,[$r_ptr,#28]
1204
1205	mov	pc,lr
1206.size	__ecp_nistz256_sub_morf,.-__ecp_nistz256_sub_morf
1207
1208.type	__ecp_nistz256_add_self,%function
1209.align	4
1210__ecp_nistz256_add_self:
1211	adds	$a0,$a0,$a0		@ a[0:7]+=a[0:7]
1212	adcs	$a1,$a1,$a1
1213	adcs	$a2,$a2,$a2
1214	adcs	$a3,$a3,$a3
1215	adcs	$a4,$a4,$a4
1216	adcs	$a5,$a5,$a5
1217	adcs	$a6,$a6,$a6
1218	mov	$ff,#0
1219	adcs	$a7,$a7,$a7
1220	adc	$ff,$ff,#0
1221
1222	@ if a+b >= modulus, subtract modulus.
1223	@
1224	@ But since comparison implies subtraction, we subtract
1225	@ modulus and then add it back if subtraction borrowed.
1226
1227	subs	$a0,$a0,#-1
1228	sbcs	$a1,$a1,#-1
1229	sbcs	$a2,$a2,#-1
1230	sbcs	$a3,$a3,#0
1231	sbcs	$a4,$a4,#0
1232	sbcs	$a5,$a5,#0
1233	sbcs	$a6,$a6,#1
1234	sbcs	$a7,$a7,#-1
1235	sbc	$ff,$ff,#0
1236
1237	@ Note that because mod has special form, i.e. consists of
1238	@ 0xffffffff, 1 and 0s, we can conditionally synthesize it by
1239	@ using value of borrow as a whole or extracting single bit.
1240	@ Follow $ff register...
1241
1242	adds	$a0,$a0,$ff		@ add synthesized modulus
1243	adcs	$a1,$a1,$ff
1244	str	$a0,[$r_ptr,#0]
1245	adcs	$a2,$a2,$ff
1246	str	$a1,[$r_ptr,#4]
1247	adcs	$a3,$a3,#0
1248	str	$a2,[$r_ptr,#8]
1249	adcs	$a4,$a4,#0
1250	str	$a3,[$r_ptr,#12]
1251	adcs	$a5,$a5,#0
1252	str	$a4,[$r_ptr,#16]
1253	adcs	$a6,$a6,$ff,lsr#31
1254	str	$a5,[$r_ptr,#20]
1255	adcs	$a7,$a7,$ff
1256	str	$a6,[$r_ptr,#24]
1257	str	$a7,[$r_ptr,#28]
1258
1259	mov	pc,lr
1260.size	__ecp_nistz256_add_self,.-__ecp_nistz256_add_self
1261
1262___
1263
1264########################################################################
1265# following subroutines are "literal" implementation of those found in
1266# ecp_nistz256.c
1267#
1268########################################################################
1269# void ecp_nistz256_point_double(P256_POINT *out,const P256_POINT *inp);
1270#
1271{
1272my ($S,$M,$Zsqr,$in_x,$tmp0)=map(32*$_,(0..4));
1273# above map() describes stack layout with 5 temporary
1274# 256-bit vectors on top. Then note that we push
1275# starting from r0, which means that we have copy of
1276# input arguments just below these temporary vectors.
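# For orientation (illustrative): after the "sub sp,sp,#32*5" below the frame
# holds S at sp+0, M at sp+32, Zsqr at sp+64, in_x at sp+96 and tmp0 at
# sp+128, with the saved r0 (out) at sp+32*5 and r1 (inp) at sp+32*5+4,
# which is what the [sp,#32*5] and [sp,#32*5+4] loads below rely on.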
1277
1278$code.=<<___;
1279.globl	ecp_nistz256_point_double
1280.type	ecp_nistz256_point_double,%function
1281.align	5
1282ecp_nistz256_point_double:
1283	stmdb	sp!,{r0-r12,lr}		@ push from r0, unusual, but intentional
1284	sub	sp,sp,#32*5
1285
1286.Lpoint_double_shortcut:
1287	add	r3,sp,#$in_x
1288	ldmia	$a_ptr!,{r4-r11}	@ copy in_x
1289	stmia	r3,{r4-r11}
1290
1291	add	$r_ptr,sp,#$S
1292	bl	__ecp_nistz256_mul_by_2	@ p256_mul_by_2(S, in_y);
1293
1294	add	$b_ptr,$a_ptr,#32
1295	add	$a_ptr,$a_ptr,#32
1296	add	$r_ptr,sp,#$Zsqr
1297	bl	__ecp_nistz256_mul_mont	@ p256_sqr_mont(Zsqr, in_z);
1298
1299	add	$a_ptr,sp,#$S
1300	add	$b_ptr,sp,#$S
1301	add	$r_ptr,sp,#$S
1302	bl	__ecp_nistz256_mul_mont	@ p256_sqr_mont(S, S);
1303
1304	ldr	$b_ptr,[sp,#32*5+4]
1305	add	$a_ptr,$b_ptr,#32
1306	add	$b_ptr,$b_ptr,#64
1307	add	$r_ptr,sp,#$tmp0
1308	bl	__ecp_nistz256_mul_mont	@ p256_mul_mont(tmp0, in_z, in_y);
1309
1310	ldr	$r_ptr,[sp,#32*5]
1311	add	$r_ptr,$r_ptr,#64
1312	bl	__ecp_nistz256_add_self	@ p256_mul_by_2(res_z, tmp0);
1313
1314	add	$a_ptr,sp,#$in_x
1315	add	$b_ptr,sp,#$Zsqr
1316	add	$r_ptr,sp,#$M
1317	bl	__ecp_nistz256_add	@ p256_add(M, in_x, Zsqr);
1318
1319	add	$a_ptr,sp,#$in_x
1320	add	$b_ptr,sp,#$Zsqr
1321	add	$r_ptr,sp,#$Zsqr
1322	bl	__ecp_nistz256_sub	@ p256_sub(Zsqr, in_x, Zsqr);
1323
1324	add	$a_ptr,sp,#$S
1325	add	$b_ptr,sp,#$S
1326	add	$r_ptr,sp,#$tmp0
1327	bl	__ecp_nistz256_mul_mont	@ p256_sqr_mont(tmp0, S);
1328
1329	add	$a_ptr,sp,#$Zsqr
1330	add	$b_ptr,sp,#$M
1331	add	$r_ptr,sp,#$M
1332	bl	__ecp_nistz256_mul_mont	@ p256_mul_mont(M, M, Zsqr);
1333
1334	ldr	$r_ptr,[sp,#32*5]
1335	add	$a_ptr,sp,#$tmp0
1336	add	$r_ptr,$r_ptr,#32
1337	bl	__ecp_nistz256_div_by_2	@ p256_div_by_2(res_y, tmp0);
1338
1339	add	$a_ptr,sp,#$M
1340	add	$r_ptr,sp,#$M
1341	bl	__ecp_nistz256_mul_by_3	@ p256_mul_by_3(M, M);
1342
1343	add	$a_ptr,sp,#$in_x
1344	add	$b_ptr,sp,#$S
1345	add	$r_ptr,sp,#$S
1346	bl	__ecp_nistz256_mul_mont	@ p256_mul_mont(S, S, in_x);
1347
1348	add	$r_ptr,sp,#$tmp0
1349	bl	__ecp_nistz256_add_self	@ p256_mul_by_2(tmp0, S);
1350
1351	ldr	$r_ptr,[sp,#32*5]
1352	add	$a_ptr,sp,#$M
1353	add	$b_ptr,sp,#$M
1354	bl	__ecp_nistz256_mul_mont	@ p256_sqr_mont(res_x, M);
1355
1356	add	$b_ptr,sp,#$tmp0
1357	bl	__ecp_nistz256_sub_from	@ p256_sub(res_x, res_x, tmp0);
1358
1359	add	$b_ptr,sp,#$S
1360	add	$r_ptr,sp,#$S
1361	bl	__ecp_nistz256_sub_morf	@ p256_sub(S, S, res_x);
1362
1363	add	$a_ptr,sp,#$M
1364	add	$b_ptr,sp,#$S
1365	bl	__ecp_nistz256_mul_mont	@ p256_mul_mont(S, S, M);
1366
1367	ldr	$r_ptr,[sp,#32*5]
1368	add	$b_ptr,$r_ptr,#32
1369	add	$r_ptr,$r_ptr,#32
1370	bl	__ecp_nistz256_sub_from	@ p256_sub(res_y, S, res_y);
1371
1372	add	sp,sp,#32*5+16		@ +16 means "skip over the saved r0-r3 as well"
1373#if __ARM_ARCH__>=5 || !defined(__thumb__)
1374	ldmia	sp!,{r4-r12,pc}
1375#else
1376	ldmia	sp!,{r4-r12,lr}
1377	bx	lr			@ interoperable with Thumb ISA:-)
1378#endif
1379.size	ecp_nistz256_point_double,.-ecp_nistz256_point_double
1380___
1381}
1382
1383########################################################################
1384# void ecp_nistz256_point_add(P256_POINT *out,const P256_POINT *in1,
1385#			      const P256_POINT *in2);
1386{
1387my ($res_x,$res_y,$res_z,
1388    $in1_x,$in1_y,$in1_z,
1389    $in2_x,$in2_y,$in2_z,
1390    $H,$Hsqr,$R,$Rsqr,$Hcub,
1391    $U1,$U2,$S1,$S2)=map(32*$_,(0..17));
1392my ($Z1sqr, $Z2sqr) = ($Hsqr, $Rsqr);
1393# above map() describes stack layout with 18 temporary
1394# 256-bit vectors on top. Then note that we push
1395# starting from r0, which means that we have copy of
1396# input arguments just below these temporary vectors.
1397# We use three of them for ~in1infty, ~in2infty and
1398# result of check for zero.
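# For orientation (illustrative): the temporaries occupy sp+0 .. sp+32*18-1,
# the three flag words live in the extra 16 bytes at [sp,#32*18+4] (~in1infty),
# [sp,#32*18+8] (~in2infty) and [sp,#32*18+12] (~is_equal(S1,S2)), and the
# saved r0 (out) and r1 (in1) are read back from [sp,#32*18+16] and
# [sp,#32*18+20].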
1399
1400$code.=<<___;
1401.globl	ecp_nistz256_point_add
1402.type	ecp_nistz256_point_add,%function
1403.align	5
1404ecp_nistz256_point_add:
1405	stmdb	sp!,{r0-r12,lr}		@ push from r0, unusual, but intentional
1406	sub	sp,sp,#32*18+16
1407
1408	ldmia	$b_ptr!,{r4-r11}	@ copy in2_x
1409	add	r3,sp,#$in2_x
1410	stmia	r3!,{r4-r11}
1411	ldmia	$b_ptr!,{r4-r11}	@ copy in2_y
1412	stmia	r3!,{r4-r11}
1413	ldmia	$b_ptr,{r4-r11}		@ copy in2_z
1414	orr	r12,r4,r5
1415	orr	r12,r12,r6
1416	orr	r12,r12,r7
1417	orr	r12,r12,r8
1418	orr	r12,r12,r9
1419	orr	r12,r12,r10
1420	orr	r12,r12,r11
1421	cmp	r12,#0
1422#ifdef	__thumb2__
1423	it	ne
1424#endif
1425	movne	r12,#-1
1426	stmia	r3,{r4-r11}
1427	str	r12,[sp,#32*18+8]	@ ~in2infty
1428
1429	ldmia	$a_ptr!,{r4-r11}	@ copy in1_x
1430	add	r3,sp,#$in1_x
1431	stmia	r3!,{r4-r11}
1432	ldmia	$a_ptr!,{r4-r11}	@ copy in1_y
1433	stmia	r3!,{r4-r11}
1434	ldmia	$a_ptr,{r4-r11}		@ copy in1_z
1435	orr	r12,r4,r5
1436	orr	r12,r12,r6
1437	orr	r12,r12,r7
1438	orr	r12,r12,r8
1439	orr	r12,r12,r9
1440	orr	r12,r12,r10
1441	orr	r12,r12,r11
1442	cmp	r12,#0
1443#ifdef	__thumb2__
1444	it	ne
1445#endif
1446	movne	r12,#-1
1447	stmia	r3,{r4-r11}
1448	str	r12,[sp,#32*18+4]	@ ~in1infty
1449
1450	add	$a_ptr,sp,#$in2_z
1451	add	$b_ptr,sp,#$in2_z
1452	add	$r_ptr,sp,#$Z2sqr
1453	bl	__ecp_nistz256_mul_mont	@ p256_sqr_mont(Z2sqr, in2_z);
1454
1455	add	$a_ptr,sp,#$in1_z
1456	add	$b_ptr,sp,#$in1_z
1457	add	$r_ptr,sp,#$Z1sqr
1458	bl	__ecp_nistz256_mul_mont	@ p256_sqr_mont(Z1sqr, in1_z);
1459
1460	add	$a_ptr,sp,#$in2_z
1461	add	$b_ptr,sp,#$Z2sqr
1462	add	$r_ptr,sp,#$S1
1463	bl	__ecp_nistz256_mul_mont	@ p256_mul_mont(S1, Z2sqr, in2_z);
1464
1465	add	$a_ptr,sp,#$in1_z
1466	add	$b_ptr,sp,#$Z1sqr
1467	add	$r_ptr,sp,#$S2
1468	bl	__ecp_nistz256_mul_mont	@ p256_mul_mont(S2, Z1sqr, in1_z);
1469
1470	add	$a_ptr,sp,#$in1_y
1471	add	$b_ptr,sp,#$S1
1472	add	$r_ptr,sp,#$S1
1473	bl	__ecp_nistz256_mul_mont	@ p256_mul_mont(S1, S1, in1_y);
1474
1475	add	$a_ptr,sp,#$in2_y
1476	add	$b_ptr,sp,#$S2
1477	add	$r_ptr,sp,#$S2
1478	bl	__ecp_nistz256_mul_mont	@ p256_mul_mont(S2, S2, in2_y);
1479
1480	add	$b_ptr,sp,#$S1
1481	add	$r_ptr,sp,#$R
1482	bl	__ecp_nistz256_sub_from	@ p256_sub(R, S2, S1);
1483
1484	orr	$a0,$a0,$a1		@ see if result is zero
1485	orr	$a2,$a2,$a3
1486	orr	$a4,$a4,$a5
1487	orr	$a0,$a0,$a2
1488	orr	$a4,$a4,$a6
1489	orr	$a0,$a0,$a7
1490	 add	$a_ptr,sp,#$in1_x
1491	orr	$a0,$a0,$a4
1492	 add	$b_ptr,sp,#$Z2sqr
1493	str	$a0,[sp,#32*18+12]
1494
1495	add	$r_ptr,sp,#$U1
1496	bl	__ecp_nistz256_mul_mont	@ p256_mul_mont(U1, in1_x, Z2sqr);
1497
1498	add	$a_ptr,sp,#$in2_x
1499	add	$b_ptr,sp,#$Z1sqr
1500	add	$r_ptr,sp,#$U2
1501	bl	__ecp_nistz256_mul_mont	@ p256_mul_mont(U2, in2_x, Z1sqr);
1502
1503	add	$b_ptr,sp,#$U1
1504	add	$r_ptr,sp,#$H
1505	bl	__ecp_nistz256_sub_from	@ p256_sub(H, U2, U1);
1506
1507	orr	$a0,$a0,$a1		@ see if result is zero
1508	orr	$a2,$a2,$a3
1509	orr	$a4,$a4,$a5
1510	orr	$a0,$a0,$a2
1511	orr	$a4,$a4,$a6
1512	orr	$a0,$a0,$a7
1513	orr	$a0,$a0,$a4		@ ~is_equal(U1,U2)
1514
1515	ldr	$t0,[sp,#32*18+4]	@ ~in1infty
1516	ldr	$t1,[sp,#32*18+8]	@ ~in2infty
1517	ldr	$t2,[sp,#32*18+12]	@ ~is_equal(S1,S2)
1518	mvn	$t0,$t0			@ -1/0 -> 0/-1
1519	mvn	$t1,$t1			@ -1/0 -> 0/-1
1520	orr	$a0,$a0,$t0
1521	orr	$a0,$a0,$t1
1522	orrs	$a0,$a0,$t2		@ set flags
1523
1524	@ if(~is_equal(U1,U2) | in1infty | in2infty | ~is_equal(S1,S2))
1525	bne	.Ladd_proceed
1526
1527.Ladd_double:
1528	ldr	$a_ptr,[sp,#32*18+20]
1529	add	sp,sp,#32*(18-5)+16	@ difference in frame sizes
1530	b	.Lpoint_double_shortcut
1531
1532.align	4
1533.Ladd_proceed:
1534	add	$a_ptr,sp,#$R
1535	add	$b_ptr,sp,#$R
1536	add	$r_ptr,sp,#$Rsqr
1537	bl	__ecp_nistz256_mul_mont	@ p256_sqr_mont(Rsqr, R);
1538
1539	add	$a_ptr,sp,#$H
1540	add	$b_ptr,sp,#$in1_z
1541	add	$r_ptr,sp,#$res_z
1542	bl	__ecp_nistz256_mul_mont	@ p256_mul_mont(res_z, H, in1_z);
1543
1544	add	$a_ptr,sp,#$H
1545	add	$b_ptr,sp,#$H
1546	add	$r_ptr,sp,#$Hsqr
1547	bl	__ecp_nistz256_mul_mont	@ p256_sqr_mont(Hsqr, H);
1548
1549	add	$a_ptr,sp,#$in2_z
1550	add	$b_ptr,sp,#$res_z
1551	add	$r_ptr,sp,#$res_z
1552	bl	__ecp_nistz256_mul_mont	@ p256_mul_mont(res_z, res_z, in2_z);
1553
1554	add	$a_ptr,sp,#$H
1555	add	$b_ptr,sp,#$Hsqr
1556	add	$r_ptr,sp,#$Hcub
1557	bl	__ecp_nistz256_mul_mont	@ p256_mul_mont(Hcub, Hsqr, H);
1558
1559	add	$a_ptr,sp,#$Hsqr
1560	add	$b_ptr,sp,#$U1
1561	add	$r_ptr,sp,#$U2
1562	bl	__ecp_nistz256_mul_mont	@ p256_mul_mont(U2, U1, Hsqr);
1563
1564	add	$r_ptr,sp,#$Hsqr
1565	bl	__ecp_nistz256_add_self	@ p256_mul_by_2(Hsqr, U2);
1566
1567	add	$b_ptr,sp,#$Rsqr
1568	add	$r_ptr,sp,#$res_x
1569	bl	__ecp_nistz256_sub_morf	@ p256_sub(res_x, Rsqr, Hsqr);
1570
1571	add	$b_ptr,sp,#$Hcub
1572	bl	__ecp_nistz256_sub_from	@  p256_sub(res_x, res_x, Hcub);
1573
1574	add	$b_ptr,sp,#$U2
1575	add	$r_ptr,sp,#$res_y
1576	bl	__ecp_nistz256_sub_morf	@ p256_sub(res_y, U2, res_x);
1577
1578	add	$a_ptr,sp,#$Hcub
1579	add	$b_ptr,sp,#$S1
1580	add	$r_ptr,sp,#$S2
1581	bl	__ecp_nistz256_mul_mont	@ p256_mul_mont(S2, S1, Hcub);
1582
1583	add	$a_ptr,sp,#$R
1584	add	$b_ptr,sp,#$res_y
1585	add	$r_ptr,sp,#$res_y
1586	bl	__ecp_nistz256_mul_mont	@ p256_mul_mont(res_y, res_y, R);
1587
1588	add	$b_ptr,sp,#$S2
1589	bl	__ecp_nistz256_sub_from	@ p256_sub(res_y, res_y, S2);
1590
1591	ldr	r11,[sp,#32*18+4]	@ ~in1infty
1592	ldr	r12,[sp,#32*18+8]	@ ~in2infty
1593	add	r1,sp,#$res_x
1594	add	r2,sp,#$in2_x
1595	and	r10,r11,r12		@ ~in1infty & ~in2infty
1596	mvn	r11,r11
1597	add	r3,sp,#$in1_x
1598	and	r11,r11,r12		@ in1infty & ~in2infty
1599	mvn	r12,r12			@ in2infty
1600	ldr	$r_ptr,[sp,#32*18+16]
1601___
1602for($i=0;$i<96;$i+=8) {			# conditional moves
1603$code.=<<___;
1604	ldmia	r1!,{r4-r5}		@ res_x
1605	ldmia	r2!,{r6-r7}		@ in2_x
1606	ldmia	r3!,{r8-r9}		@ in1_x
1607	and	r4,r4,r10		@ ~in1infty & ~in2infty
1608	and	r5,r5,r10
1609	and	r6,r6,r11		@ in1infty & ~in2infty
1610	and	r7,r7,r11
1611	and	r8,r8,r12		@ in2infty
1612	and	r9,r9,r12
1613	orr	r4,r4,r6
1614	orr	r5,r5,r7
1615	orr	r4,r4,r8
1616	orr	r5,r5,r9
1617	stmia	$r_ptr!,{r4-r5}
1618___
1619}
1620$code.=<<___;
1621.Ladd_done:
1622	add	sp,sp,#32*18+16+16	@ +16 means "skip over the saved r0-r3 as well"
1623#if __ARM_ARCH__>=5 || !defined(__thumb__)
1624	ldmia	sp!,{r4-r12,pc}
1625#else
1626	ldmia	sp!,{r4-r12,lr}
1627	bx	lr			@ interoperable with Thumb ISA:-)
1628#endif
1629.size	ecp_nistz256_point_add,.-ecp_nistz256_point_add
1630___
1631}
1632
1633########################################################################
1634# void ecp_nistz256_point_add_affine(P256_POINT *out,const P256_POINT *in1,
1635#				     const P256_POINT_AFFINE *in2);
1636{
1637my ($res_x,$res_y,$res_z,
1638    $in1_x,$in1_y,$in1_z,
1639    $in2_x,$in2_y,
1640    $U2,$S2,$H,$R,$Hsqr,$Hcub,$Rsqr)=map(32*$_,(0..14));
1641my $Z1sqr = $S2;
1642# above map() describes stack layout with 15 temporary
1643# 256-bit vectors on top. Then note that we push
1644# starting from r0, which means that we have copy of
1645# input arguments just below these temporary vectors.
1646# We use two of them for ~in1infty, ~in2infty.
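# For orientation (illustrative): here there is no extra scratch area; the
# flags overwrite the already-consumed saved argument pointers, ~in1infty at
# [sp,#32*15+4] (the old r1 slot) and ~in2infty at [sp,#32*15+8] (the old r2
# slot), while the output pointer is still read back from [sp,#32*15].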
1647
1648my @ONE_mont=(1,0,0,-1,-1,-1,-2,0);
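# i.e. 1 in Montgomery representation: 2^256 mod P = 2^224 - 2^192 - 2^96 + 1,
# whose little-endian 32-bit words are exactly (1,0,0,-1,-1,-1,-2,0).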
1649
1650$code.=<<___;
1651.globl	ecp_nistz256_point_add_affine
1652.type	ecp_nistz256_point_add_affine,%function
1653.align	5
1654ecp_nistz256_point_add_affine:
1655	stmdb	sp!,{r0-r12,lr}		@ push from r0, unusual, but intentional
1656	sub	sp,sp,#32*15
1657
1658	ldmia	$a_ptr!,{r4-r11}	@ copy in1_x
1659	add	r3,sp,#$in1_x
1660	stmia	r3!,{r4-r11}
1661	ldmia	$a_ptr!,{r4-r11}	@ copy in1_y
1662	stmia	r3!,{r4-r11}
1663	ldmia	$a_ptr,{r4-r11}		@ copy in1_z
1664	orr	r12,r4,r5
1665	orr	r12,r12,r6
1666	orr	r12,r12,r7
1667	orr	r12,r12,r8
1668	orr	r12,r12,r9
1669	orr	r12,r12,r10
1670	orr	r12,r12,r11
1671	cmp	r12,#0
1672#ifdef	__thumb2__
1673	it	ne
1674#endif
1675	movne	r12,#-1
1676	stmia	r3,{r4-r11}
1677	str	r12,[sp,#32*15+4]	@ ~in1infty
1678
1679	ldmia	$b_ptr!,{r4-r11}	@ copy in2_x
1680	add	r3,sp,#$in2_x
1681	orr	r12,r4,r5
1682	orr	r12,r12,r6
1683	orr	r12,r12,r7
1684	orr	r12,r12,r8
1685	orr	r12,r12,r9
1686	orr	r12,r12,r10
1687	orr	r12,r12,r11
1688	stmia	r3!,{r4-r11}
1689	ldmia	$b_ptr!,{r4-r11}	@ copy in2_y
1690	orr	r12,r12,r4
1691	orr	r12,r12,r5
1692	orr	r12,r12,r6
1693	orr	r12,r12,r7
1694	orr	r12,r12,r8
1695	orr	r12,r12,r9
1696	orr	r12,r12,r10
1697	orr	r12,r12,r11
1698	stmia	r3!,{r4-r11}
1699	cmp	r12,#0
1700#ifdef	__thumb2__
1701	it	ne
1702#endif
1703	movne	r12,#-1
1704	str	r12,[sp,#32*15+8]	@ ~in2infty
1705
1706	add	$a_ptr,sp,#$in1_z
1707	add	$b_ptr,sp,#$in1_z
1708	add	$r_ptr,sp,#$Z1sqr
1709	bl	__ecp_nistz256_mul_mont	@ p256_sqr_mont(Z1sqr, in1_z);
1710
1711	add	$a_ptr,sp,#$Z1sqr
1712	add	$b_ptr,sp,#$in2_x
1713	add	$r_ptr,sp,#$U2
1714	bl	__ecp_nistz256_mul_mont	@ p256_mul_mont(U2, Z1sqr, in2_x);
1715
1716	add	$b_ptr,sp,#$in1_x
1717	add	$r_ptr,sp,#$H
1718	bl	__ecp_nistz256_sub_from	@ p256_sub(H, U2, in1_x);
1719
1720	add	$a_ptr,sp,#$Z1sqr
1721	add	$b_ptr,sp,#$in1_z
1722	add	$r_ptr,sp,#$S2
1723	bl	__ecp_nistz256_mul_mont	@ p256_mul_mont(S2, Z1sqr, in1_z);
1724
1725	add	$a_ptr,sp,#$H
1726	add	$b_ptr,sp,#$in1_z
1727	add	$r_ptr,sp,#$res_z
1728	bl	__ecp_nistz256_mul_mont	@ p256_mul_mont(res_z, H, in1_z);
1729
1730	add	$a_ptr,sp,#$in2_y
1731	add	$b_ptr,sp,#$S2
1732	add	$r_ptr,sp,#$S2
1733	bl	__ecp_nistz256_mul_mont	@ p256_mul_mont(S2, S2, in2_y);
1734
1735	add	$b_ptr,sp,#$in1_y
1736	add	$r_ptr,sp,#$R
1737	bl	__ecp_nistz256_sub_from	@ p256_sub(R, S2, in1_y);
1738
1739	add	$a_ptr,sp,#$H
1740	add	$b_ptr,sp,#$H
1741	add	$r_ptr,sp,#$Hsqr
1742	bl	__ecp_nistz256_mul_mont	@ p256_sqr_mont(Hsqr, H);
1743
1744	add	$a_ptr,sp,#$R
1745	add	$b_ptr,sp,#$R
1746	add	$r_ptr,sp,#$Rsqr
1747	bl	__ecp_nistz256_mul_mont	@ p256_sqr_mont(Rsqr, R);
1748
1749	add	$a_ptr,sp,#$H
1750	add	$b_ptr,sp,#$Hsqr
1751	add	$r_ptr,sp,#$Hcub
1752	bl	__ecp_nistz256_mul_mont	@ p256_mul_mont(Hcub, Hsqr, H);
1753
1754	add	$a_ptr,sp,#$Hsqr
1755	add	$b_ptr,sp,#$in1_x
1756	add	$r_ptr,sp,#$U2
1757	bl	__ecp_nistz256_mul_mont	@ p256_mul_mont(U2, in1_x, Hsqr);
1758
1759	add	$r_ptr,sp,#$Hsqr
1760	bl	__ecp_nistz256_add_self	@ p256_mul_by_2(Hsqr, U2);
1761
1762	add	$b_ptr,sp,#$Rsqr
1763	add	$r_ptr,sp,#$res_x
1764	bl	__ecp_nistz256_sub_morf	@ p256_sub(res_x, Rsqr, Hsqr);
1765
1766	add	$b_ptr,sp,#$Hcub
1767	bl	__ecp_nistz256_sub_from	@  p256_sub(res_x, res_x, Hcub);
1768
1769	add	$b_ptr,sp,#$U2
1770	add	$r_ptr,sp,#$res_y
1771	bl	__ecp_nistz256_sub_morf	@ p256_sub(res_y, U2, res_x);
1772
1773	add	$a_ptr,sp,#$Hcub
1774	add	$b_ptr,sp,#$in1_y
1775	add	$r_ptr,sp,#$S2
1776	bl	__ecp_nistz256_mul_mont	@ p256_mul_mont(S2, in1_y, Hcub);
1777
1778	add	$a_ptr,sp,#$R
1779	add	$b_ptr,sp,#$res_y
1780	add	$r_ptr,sp,#$res_y
1781	bl	__ecp_nistz256_mul_mont	@ p256_mul_mont(res_y, res_y, R);
1782
1783	add	$b_ptr,sp,#$S2
1784	bl	__ecp_nistz256_sub_from	@ p256_sub(res_y, res_y, S2);
1785
1786	ldr	r11,[sp,#32*15+4]	@ ~in1infty
1787	ldr	r12,[sp,#32*15+8]	@ ~in2infty
1788	add	r1,sp,#$res_x
1789	add	r2,sp,#$in2_x
1790	and	r10,r11,r12		@ ~in1infty & ~in2infty
1791	mvn	r11,r11
1792	add	r3,sp,#$in1_x
1793	and	r11,r11,r12		@ in1infty & ~in2infty
1794	mvn	r12,r12			@ in2infty
1795	ldr	$r_ptr,[sp,#32*15]
1796___
1797for($i=0;$i<64;$i+=8) {			# conditional moves
1798$code.=<<___;
1799	ldmia	r1!,{r4-r5}		@ res_x
1800	ldmia	r2!,{r6-r7}		@ in2_x
1801	ldmia	r3!,{r8-r9}		@ in1_x
1802	and	r4,r4,r10		@ ~in1infty & ~in2infty
1803	and	r5,r5,r10
1804	and	r6,r6,r11		@ in1infty & ~in2infty
1805	and	r7,r7,r11
1806	and	r8,r8,r12		@ in2infty
1807	and	r9,r9,r12
1808	orr	r4,r4,r6
1809	orr	r5,r5,r7
1810	orr	r4,r4,r8
1811	orr	r5,r5,r9
1812	stmia	$r_ptr!,{r4-r5}
1813___
1814}
1815for(;$i<96;$i+=8) {
1816my $j=($i-64)/4;
1817$code.=<<___;
1818	ldmia	r1!,{r4-r5}		@ res_z
1819	ldmia	r3!,{r8-r9}		@ in1_z
1820	and	r4,r4,r10
1821	and	r5,r5,r10
1822	and	r6,r11,#@ONE_mont[$j]
1823	and	r7,r11,#@ONE_mont[$j+1]
1824	and	r8,r8,r12
1825	and	r9,r9,r12
1826	orr	r4,r4,r6
1827	orr	r5,r5,r7
1828	orr	r4,r4,r8
1829	orr	r5,r5,r9
1830	stmia	$r_ptr!,{r4-r5}
1831___
1832}
1833$code.=<<___;
1834	add	sp,sp,#32*15+16		@ +16 means "skip over the saved r0-r3 as well"
1835#if __ARM_ARCH__>=5 || !defined(__thumb__)
1836	ldmia	sp!,{r4-r12,pc}
1837#else
1838	ldmia	sp!,{r4-r12,lr}
1839	bx	lr			@ interoperable with Thumb ISA:-)
1840#endif
1841.size	ecp_nistz256_point_add_affine,.-ecp_nistz256_point_add_affine
1842___
1843}					}}}
1844
1845foreach (split("\n",$code)) {
1846	s/\`([^\`]*)\`/eval $1/geo;
1847
1848	s/\bq([0-9]+)#(lo|hi)/sprintf "d%d",2*$1+($2 eq "hi")/geo;
1849
1850	print $_,"\n";
1851}
1852close STDOUT or die "error closing STDOUT: $!";	# enforce flush
1853