xref: /freebsd/sys/crypto/openssl/aarch64/ecp_sm2p256-armv8.S (revision 4757b351ea9d59d71d4a38b82506d2d16fcd560d)
1/* Do not modify. This file is auto-generated from ecp_sm2p256-armv8.pl. */
2#include "arm_arch.h"
3.arch	armv8-a
4.section	.rodata
5
6.align	5
// SM2 (GB/T 32918) curve constants.  Each 256-bit value is stored as
// four 64-bit limbs, least significant limb first.
7// The field prime p
8.Lpoly:
9.quad	0xffffffffffffffff,0xffffffff00000000,0xffffffffffffffff,0xfffffffeffffffff
10// The group order n (order of the base point)
11.Lord:
12.quad	0x53bbf40939d54123,0x7203df6b21c6052b,0xffffffffffffffff,0xfffffffeffffffff
13// (p + 1) / 2 — added after a right shift to divide an odd value by 2 mod p
14.Lpoly_div_2:
15.quad	0x8000000000000000,0xffffffff80000000,0xffffffffffffffff,0x7fffffff7fffffff
16// (n + 1) / 2 — added after a right shift to divide an odd value by 2 mod n
17.Lord_div_2:
18.quad	0xa9ddfa049ceaa092,0xb901efb590e30295,0xffffffffffffffff,0x7fffffff7fffffff
19
20.text
21
// void bn_rshift1(BN_ULONG *a);
// In-place logical right shift by one bit of a 256-bit value held as
// four little-endian 64-bit limbs a[0..3].  The vacated top bit is
// zero-filled; the bit shifted out of a[0] is discarded.
// In:    x0 = a   Out: memory at a updated   Clobbers: x1-x4, no flags kept
.globl	bn_rshift1
.type	bn_rshift1,%function
.align	5
bn_rshift1:
	AARCH64_VALID_CALL_TARGET
	// x1..x4 = a[0..3], least significant limb first
	ldp	x1,x2,[x0]
	ldp	x3,x4,[x0,#16]

	// Shift each limb right one bit, pulling the low bit of the next
	// higher limb in as the new top bit; the top limb gets a zero.
	extr	x1,x2,x1,#1
	extr	x2,x3,x2,#1
	extr	x3,x4,x3,#1
	lsr	x4,x4,#1

	// Write the shifted value back over the input
	stp	x1,x2,[x0]
	stp	x3,x4,[x0,#16]

	ret
.size	bn_rshift1,.-bn_rshift1
44
// void bn_sub(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b);
// 256-bit limb-wise subtraction r = a - b (mod 2^256).  Operands are
// four little-endian 64-bit limbs each; the final borrow is dropped.
// In:    x0 = r, x1 = a, x2 = b   Clobbers: x3-x6, x11-x14, flags
.globl	bn_sub
.type	bn_sub,%function
.align	5
bn_sub:
	AARCH64_VALID_CALL_TARGET
	// x3..x6 = a[0..3], x11..x14 = b[0..3]
	ldp	x3,x4,[x1]
	ldp	x5,x6,[x1,#16]
	ldp	x11,x12,[x2]
	ldp	x13,x14,[x2,#16]

	// Borrow-propagating subtraction across all four limbs
	subs	x3,x3,x11
	sbcs	x4,x4,x12
	sbcs	x5,x5,x13
	sbc	x6,x6,x14

	// r[0..3] = a - b
	stp	x3,x4,[x0]
	stp	x5,x6,[x0,#16]

	ret
.size	bn_sub,.-bn_sub
69
70// void ecp_sm2p256_div_by_2(BN_ULONG *r,const BN_ULONG *a);
// r = a / 2 mod p, computed branch-free as
//   (a >> 1) + lsb(a) * (p + 1)/2
// i.e. the dropped low bit is compensated by adding (p + 1)/2.
// NOTE(review): assumes a < p so the result is fully reduced — confirm
// at call sites; this routine performs no final conditional subtraction.
71.globl	ecp_sm2p256_div_by_2
72.type	ecp_sm2p256_div_by_2,%function
73.align	5
74ecp_sm2p256_div_by_2:
75	AARCH64_VALID_CALL_TARGET
76	// Load inputs
77	ldp	x7,x8,[x1]
78	ldp	x9,x10,[x1,#16]
79
80	// Save the least significant bit
81	mov	x3,x7
82
83	// Right shift 1
84	extr	x7,x8,x7,#1
85	extr	x8,x9,x8,#1
86	extr	x9,x10,x9,#1
87	lsr	x10,x10,#1
88
89	// Load mod
90	adrp	x2,.Lpoly_div_2
91	add	x2,x2,#:lo12:.Lpoly_div_2
92	ldp	x11,x12,[x2]
93	ldp	x13,x14,[x2,#16]
94
95	// Parity check
	// If a was even, zero the addend so the add below is a no-op;
	// csel keeps the sequence branch-free (constant-time).
96	tst	x3,#1
97	csel	x11,xzr,x11,eq
98	csel	x12,xzr,x12,eq
99	csel	x13,xzr,x13,eq
100	csel	x14,xzr,x14,eq
101
102	// Add
103	adds	x7,x7,x11
104	adcs	x8,x8,x12
105	adcs	x9,x9,x13
106	adc	x10,x10,x14
107
108	// Store results
109	stp	x7,x8,[x0]
110	stp	x9,x10,[x0,#16]
111	ret
112.size	ecp_sm2p256_div_by_2,.-ecp_sm2p256_div_by_2
113
114// void ecp_sm2p256_div_by_2_mod_ord(BN_ULONG *r,const BN_ULONG *a);
// r = a / 2 mod n (the group order), computed branch-free as
//   (a >> 1) + lsb(a) * (n + 1)/2
// — identical structure to ecp_sm2p256_div_by_2, with n in place of p.
// NOTE(review): assumes a < n so the result is fully reduced — confirm
// at call sites; no final conditional subtraction is performed.
115.globl	ecp_sm2p256_div_by_2_mod_ord
116.type	ecp_sm2p256_div_by_2_mod_ord,%function
117.align	5
118ecp_sm2p256_div_by_2_mod_ord:
119	AARCH64_VALID_CALL_TARGET
120	// Load inputs
121	ldp	x7,x8,[x1]
122	ldp	x9,x10,[x1,#16]
123
124	// Save the least significant bit
125	mov	x3,x7
126
127	// Right shift 1
128	extr	x7,x8,x7,#1
129	extr	x8,x9,x8,#1
130	extr	x9,x10,x9,#1
131	lsr	x10,x10,#1
132
133	// Load mod
134	adrp	x2,.Lord_div_2
135	add	x2,x2,#:lo12:.Lord_div_2
136	ldp	x11,x12,[x2]
137	ldp	x13,x14,[x2,#16]
138
139	// Parity check
	// If a was even, zero the addend; csel keeps this constant-time.
140	tst	x3,#1
141	csel	x11,xzr,x11,eq
142	csel	x12,xzr,x12,eq
143	csel	x13,xzr,x13,eq
144	csel	x14,xzr,x14,eq
145
146	// Add
147	adds	x7,x7,x11
148	adcs	x8,x8,x12
149	adcs	x9,x9,x13
150	adc	x10,x10,x14
151
152	// Store results
153	stp	x7,x8,[x0]
154	stp	x9,x10,[x0,#16]
155	ret
156.size	ecp_sm2p256_div_by_2_mod_ord,.-ecp_sm2p256_div_by_2_mod_ord
157
158// void ecp_sm2p256_mul_by_3(BN_ULONG *r,const BN_ULONG *a);
// r = 3 * a mod p, computed in two rounds: t = 2*a reduced once
// against p, then t + a reduced again.  x15 carries bit 256 of each
// intermediate sum so the conditional subtraction of p is exact.
// NOTE(review): a single subtraction per round suffices only if a < p
// (so each sum is < 2p) — confirm at call sites.
159.globl	ecp_sm2p256_mul_by_3
160.type	ecp_sm2p256_mul_by_3,%function
161.align	5
162ecp_sm2p256_mul_by_3:
163	AARCH64_VALID_CALL_TARGET
164	// Load inputs
165	ldp	x7,x8,[x1]
166	ldp	x9,x10,[x1,#16]
167
168	// 2*a
169	adds	x7,x7,x7
170	adcs	x8,x8,x8
171	adcs	x9,x9,x9
172	adcs	x10,x10,x10
173	adcs	x15,xzr,xzr	// x15 = bit 256 of the doubled value
174
	// Back up the unreduced sum before the trial subtraction
175	mov	x3,x7
176	mov	x4,x8
177	mov	x5,x9
178	mov	x6,x10
179
180	// Sub polynomial
181	adrp	x2,.Lpoly
182	add	x2,x2,#:lo12:.Lpoly
183	ldp	x11,x12,[x2]
184	ldp	x13,x14,[x2,#16]
185	subs	x7,x7,x11
186	sbcs	x8,x8,x12
187	sbcs	x9,x9,x13
188	sbcs	x10,x10,x14
189	sbcs	x15,x15,xzr	// fold bit 256 into the borrow chain
190
	// cs (no borrow) => 2*a >= p: keep the reduced value, else the backup
191	csel	x7,x7,x3,cs
192	csel	x8,x8,x4,cs
193	csel	x9,x9,x5,cs
194	csel	x10,x10,x6,cs
195	eor	x15,x15,x15	// clear the carry word for the second round
196
197	// 3*a
	// Add a back in, again tracking bit 256 in x15
198	ldp	x11,x12,[x1]
199	ldp	x13,x14,[x1,#16]
200	adds	x7,x7,x11
201	adcs	x8,x8,x12
202	adcs	x9,x9,x13
203	adcs	x10,x10,x14
204	adcs	x15,xzr,xzr
205
206	mov	x3,x7
207	mov	x4,x8
208	mov	x5,x9
209	mov	x6,x10
210
211	// Sub polynomial
212	adrp	x2,.Lpoly
213	add	x2,x2,#:lo12:.Lpoly
214	ldp	x11,x12,[x2]
215	ldp	x13,x14,[x2,#16]
216	subs	x7,x7,x11
217	sbcs	x8,x8,x12
218	sbcs	x9,x9,x13
219	sbcs	x10,x10,x14
220	sbcs	x15,x15,xzr
221
	// Select the reduced value when the subtraction did not borrow
222	csel	x7,x7,x3,cs
223	csel	x8,x8,x4,cs
224	csel	x9,x9,x5,cs
225	csel	x10,x10,x6,cs
226
227	// Store results
228	stp	x7,x8,[x0]
229	stp	x9,x10,[x0,#16]
230
231	ret
232.size	ecp_sm2p256_mul_by_3,.-ecp_sm2p256_mul_by_3
233
234// void ecp_sm2p256_add(BN_ULONG *r,const BN_ULONG *a,const BN_ULONG *b);
// r = a + b mod p.  Adds the 4-limb operands keeping bit 256 in x15,
// then conditionally subtracts p, selecting the result branch-free.
// NOTE(review): a single subtraction suffices only if a, b < p (sum
// < 2p) — confirm at call sites.
235.globl	ecp_sm2p256_add
236.type	ecp_sm2p256_add,%function
237.align	5
238ecp_sm2p256_add:
239	AARCH64_VALID_CALL_TARGET
240	// Load inputs
241	ldp	x7,x8,[x1]
242	ldp	x9,x10,[x1,#16]
243	ldp	x11,x12,[x2]
244	ldp	x13,x14,[x2,#16]
245
246	// Addition
247	adds	x7,x7,x11
248	adcs	x8,x8,x12
249	adcs	x9,x9,x13
250	adcs	x10,x10,x14
251	adc	x15,xzr,xzr	// x15 = bit 256 of the sum
252
253	// Load polynomial
254	adrp	x2,.Lpoly
255	add	x2,x2,#:lo12:.Lpoly
256	ldp	x11,x12,[x2]
257	ldp	x13,x14,[x2,#16]
258
259	// Backup Addition
260	mov	x3,x7
261	mov	x4,x8
262	mov	x5,x9
263	mov	x6,x10
264
265	// Sub polynomial
266	subs	x3,x3,x11
267	sbcs	x4,x4,x12
268	sbcs	x5,x5,x13
269	sbcs	x6,x6,x14
270	sbcs	x15,x15,xzr	// include bit 256 in the borrow chain
271
272	// Select based on carry
	// cc (borrow) => sum < p: keep the raw sum, else keep sum - p
273	csel	x7,x7,x3,cc
274	csel	x8,x8,x4,cc
275	csel	x9,x9,x5,cc
276	csel	x10,x10,x6,cc
277
278	// Store results
279	stp	x7,x8,[x0]
280	stp	x9,x10,[x0,#16]
281	ret
282.size	ecp_sm2p256_add,.-ecp_sm2p256_add
283
284// void ecp_sm2p256_sub(BN_ULONG *r,const BN_ULONG *a,const BN_ULONG *b);
// r = a - b mod p.  Subtracts the 4-limb operands; if the subtraction
// borrowed (a < b), p is added back in.  Selection is branch-free.
// NOTE(review): assumes a, b < p so one correction suffices — confirm
// at call sites.
285.globl	ecp_sm2p256_sub
286.type	ecp_sm2p256_sub,%function
287.align	5
288ecp_sm2p256_sub:
289	AARCH64_VALID_CALL_TARGET
290	// Load inputs
291	ldp	x7,x8,[x1]
292	ldp	x9,x10,[x1,#16]
293	ldp	x11,x12,[x2]
294	ldp	x13,x14,[x2,#16]
295
296	// Subtraction
297	subs	x7,x7,x11
298	sbcs	x8,x8,x12
299	sbcs	x9,x9,x13
300	sbcs	x10,x10,x14
301	sbc	x15,xzr,xzr	// x15 = 0 if a >= b, all-ones if it borrowed
302
303	// Load polynomial
304	adrp	x2,.Lpoly
305	add	x2,x2,#:lo12:.Lpoly
306	ldp	x11,x12,[x2]
307	ldp	x13,x14,[x2,#16]
308
309	// Backup subtraction
310	mov	x3,x7
311	mov	x4,x8
312	mov	x5,x9
313	mov	x6,x10
314
315	// Add polynomial
316	adds	x3,x3,x11
317	adcs	x4,x4,x12
318	adcs	x5,x5,x13
319	adcs	x6,x6,x14
320	tst	x15,x15	// eq <=> no borrow occurred above
321
322	// Select based on carry
	// eq (a >= b) => keep the raw difference, else keep difference + p
323	csel	x7,x7,x3,eq
324	csel	x8,x8,x4,eq
325	csel	x9,x9,x5,eq
326	csel	x10,x10,x6,eq
327
328	// Store results
329	stp	x7,x8,[x0]
330	stp	x9,x10,[x0,#16]
331	ret
332.size	ecp_sm2p256_sub,.-ecp_sm2p256_sub
333
334// void ecp_sm2p256_sub_mod_ord(BN_ULONG *r,const BN_ULONG *a,const BN_ULONG *b);
// r = a - b mod n (the group order).  Same structure as
// ecp_sm2p256_sub, adding n back when the subtraction borrows.
// NOTE(review): assumes a, b < n — confirm at call sites.
335.globl	ecp_sm2p256_sub_mod_ord
336.type	ecp_sm2p256_sub_mod_ord,%function
337.align	5
338ecp_sm2p256_sub_mod_ord:
339	AARCH64_VALID_CALL_TARGET
340	// Load inputs
341	ldp	x7,x8,[x1]
342	ldp	x9,x10,[x1,#16]
343	ldp	x11,x12,[x2]
344	ldp	x13,x14,[x2,#16]
345
346	// Subtraction
347	subs	x7,x7,x11
348	sbcs	x8,x8,x12
349	sbcs	x9,x9,x13
350	sbcs	x10,x10,x14
351	sbc	x15,xzr,xzr	// x15 = 0 if a >= b, all-ones if it borrowed
352
353	// Load polynomial
354	adrp	x2,.Lord
355	add	x2,x2,#:lo12:.Lord
356	ldp	x11,x12,[x2]
357	ldp	x13,x14,[x2,#16]
358
359	// Backup subtraction
360	mov	x3,x7
361	mov	x4,x8
362	mov	x5,x9
363	mov	x6,x10
364
365	// Add polynomial
366	adds	x3,x3,x11
367	adcs	x4,x4,x12
368	adcs	x5,x5,x13
369	adcs	x6,x6,x14
370	tst	x15,x15	// eq <=> no borrow occurred above
371
372	// Select based on carry
	// eq (a >= b) => keep the raw difference, else keep difference + n
373	csel	x7,x7,x3,eq
374	csel	x8,x8,x4,eq
375	csel	x9,x9,x5,eq
376	csel	x10,x10,x6,eq
377
378	// Store results
379	stp	x7,x8,[x0]
380	stp	x9,x10,[x0,#16]
381	ret
382.size	ecp_sm2p256_sub_mod_ord,.-ecp_sm2p256_sub_mod_ord
383
384.macro	RDC
	// SM2 fast reduction: reduce the 512-bit product held in
	//   s0..s7 = x7,x8,x9,x10,x11,x12,x13,x14  (least significant first)
	// to a 256-bit result < p left in x7,x8,x9,x10.
	// Clobbers x3,x4,x5,x6,x15,x16 and flags, and spills s0..s3 to the
	// caller's frame at [sp,#32..#63] (both users of this macro reserve
	// an 80-byte frame for exactly this purpose).
385	// a = |  s7   | ... | s0  |, where si are 64-bit quantities
386	//   = |a15|a14| ... |a1|a0|, where ai are 32-bit quantities
387	// |    s7     |    s6     |    s5     |    s4     |
388	// | a15 | a14 | a13 | a12 | a11 | a10 | a9  | a8  |
389	// |    s3     |    s2     |    s1     |    s0     |
390	// | a7  | a6  | a5  | a4  | a3  | a2  | a1  | a0  |
391	// =================================================
392	// | a8  | a11 | a10 | a9  | a8  |   0 |    s4     | (+)
393	// | a9  | a15 |    s6     | a11 |   0 | a10 | a9  | (+)
394	// | a10 |   0 | a14 | a13 | a12 |   0 |    s5     | (+)
395	// | a11 |   0 |    s7     | a13 |   0 | a12 | a11 | (+)
396	// | a12 |   0 |    s7     | a13 |   0 |    s6     | (+)
397	// | a12 |   0 |   0 | a15 | a14 |   0 | a14 | a13 | (+)
398	// | a13 |   0 |   0 |   0 | a15 |   0 | a14 | a13 | (+)
399	// | a13 |   0 |   0 |   0 |   0 |   0 |    s7     | (+)
400	// | a14 |   0 |   0 |   0 |   0 |   0 |    s7     | (+)
401	// | a14 |   0 |   0 |   0 |   0 |   0 |   0 | a15 | (+)
402	// | a15 |   0 |   0 |   0 |   0 |   0 |   0 | a15 | (+)
403	// | a15 |   0 |   0 |   0 |   0 |   0 |   0 |   0 | (+)
404	// |    s7     |   0 |   0 |   0 |   0 |   0 |   0 | (+)
405	// |   0 |   0 |   0 |   0 |   0 | a8  |   0 |   0 | (-)
406	// |   0 |   0 |   0 |   0 |   0 | a9  |   0 |   0 | (-)
407	// |   0 |   0 |   0 |   0 |   0 | a13 |   0 |   0 | (-)
408	// |   0 |   0 |   0 |   0 |   0 | a14 |   0 |   0 | (-)
409	// | U[7]| U[6]| U[5]| U[4]| U[3]| U[2]| U[1]| U[0]|
410	// |    V[3]   |    V[2]   |    V[1]   |    V[0]   |
411
412	// 1. 64-bit addition
413	// t2=s6+s7+s7
414	adds	x5,x13,x14
415	adcs	x4,xzr,xzr
416	adds	x5,x5,x14
417	adcs	x4,x4,xzr
418	// t3=s4+s5+t2
419	adds	x6,x11,x5
420	adcs	x15,x4,xzr
421	adds	x6,x6,x12
422	adcs	x15,x15,xzr
423	// sum
424	adds	x7,x7,x6
425	adcs	x8,x8,x15
426	adcs	x9,x9,x5
427	adcs	x10,x10,x14
428	adcs	x3,xzr,xzr	// x3 accumulates the running carry (t0)
429	adds	x10,x10,x4
430	adcs	x3,x3,xzr
431
	// Spill the partially-accumulated low half; x7..x10 are reused below
432	stp	x7,x8,[sp,#32]
433	stp	x9,x10,[sp,#48]
434
435	// 2. 64-bit to 32-bit spread
	// Split s4..s7 into their 32-bit halves a8..a15
436	mov	x4,#0xffffffff
437	mov	x7,x11
438	mov	x8,x12
439	mov	x9,x13
440	mov	x10,x14
441	and	x7,x7,x4 // a8
442	and	x8,x8,x4 // a10
443	and	x9,x9,x4 // a12
444	and	x10,x10,x4 // a14
445	lsr	x11,x11,#32 // a9
446	lsr	x12,x12,#32 // a11
447	lsr	x13,x13,#32 // a13
448	lsr	x14,x14,#32 // a15
449
450	// 3. 32-bit addition
	// Each ai is < 2^32, so these sums cannot overflow 64 bits
451	add	x4,x10,x9  // t1 <- a12 + a14
452	add	x5,x14,x13  // t2 <- a13 + a15
453	add	x6,x7,x11    // t3 <- a8 + a9
454	add	x15,x10,x8  // t4 <- a10 + a14
455	add	x14,x14,x12 // a15 <- a11 + a15
456	add	x9,x5,x4   // a12 <- a12 + a13 + a14 + a15
457	add	x8,x8,x9 // a10 <- a10 + a12 + a13 + a14 + a15
458	add	x8,x8,x9 // a10 <- a10 + 2*(a12 + a13 + a14 + a15)
459	add	x8,x8,x6  // a10 <- a8 + a9 + a10 + 2*(a12 + a13 + a14 + a15)
460	add	x8,x8,x12 // a10 <- a8 + a9 + a10 + a11 + 2*(a12 + a13 + a14 + a15)
461	add	x9,x9,x13 // a12 <- a12 + 2*a13 + a14 + a15
462	add	x9,x9,x12 // a12 <- a11 + a12 + 2*a13 + a14 + a15
463	add	x9,x9,x7  // a12 <- a8 + a11 + a12 + 2*a13 + a14 + a15
464	add	x6,x6,x10   // t3 <- a8 + a9 + a14
465	add	x6,x6,x13   // t3 <- a8 + a9 + a13 + a14
466	add	x11,x11,x5    // a9 <- a9 + a13 + a15
467	add	x12,x12,x11  // a11 <- a9 + a11 + a13 + a15
468	add	x12,x12,x5  // a11 <- a9 + a11 + 2*(a13 + a15)
469	add	x4,x4,x15    // t1 <- a10 + a12 + 2*a14
470
471	// U[0]  s5	a9 + a11 + 2*(a13 + a15)
472	// U[1]  t1	a10 + a12 + 2*a14
473	// U[2] -t3	a8 + a9 + a13 + a14
474	// U[3]  s2	a8 + a11 + a12 + 2*a13 + a14 + a15
475	// U[4]  s4	a9 + a13 + a15
476	// U[5]  t4	a10 + a14
477	// U[6]  s7	a11 + a15
478	// U[7]  s1	a8 + a9 + a10 + a11 + 2*(a12 + a13 + a14 + a15)
479
480	// 4. 32-bit to 64-bit
	// Repack the odd-position U words into 64-bit addends V[0..3]
481	lsl	x7,x4,#32
482	extr	x4,x9,x4,#32
483	extr	x9,x15,x9,#32
484	extr	x15,x8,x15,#32
485	lsr	x8,x8,#32
486
487	// 5. 64-bit addition
488	adds	x12,x12,x7
489	adcs	x4,x4,xzr
490	adcs	x11,x11,x9
491	adcs	x14,x14,x15
492	adcs	x3,x3,x8
493
494	// V[0]	s5
495	// V[1]	t1
496	// V[2]	s4
497	// V[3]	s7
498	// carry	t0
499	// sub	t3
500
501	// 5. Process s0-s3
	// Reload the spilled low half and fold in V[0..3] and the t3 term
502	ldp	x7,x8,[sp,#32]
503	ldp	x9,x10,[sp,#48]
504	// add with V0-V3
505	adds	x7,x7,x12
506	adcs	x8,x8,x4
507	adcs	x9,x9,x11
508	adcs	x10,x10,x14
509	adcs	x3,x3,xzr
510	// sub with t3
511	subs	x8,x8,x6
512	sbcs	x9,x9,xzr
513	sbcs	x10,x10,xzr
514	sbcs	x3,x3,xzr
515
516	// 6. MOD
517	// First Mod
	// Fold the carry word c = x3 back in using
	//   2^256 ≡ 2^224 + 2^96 - 2^64 + 1 (mod p)
	// x4 = c<<32 contributes c*2^224 at limb 3; x5 = (c<<32) - c
	// contributes c*(2^96 - 2^64) at limb 1; the c added to limb 0 is c*1.
518	lsl	x4,x3,#32
519	subs	x5,x4,x3
520
521	adds	x7,x7,x3
522	adcs	x8,x8,x5
523	adcs	x9,x9,xzr
524	adcs	x10,x10,x4
525
526	// Last Mod
527	// return y - p if y > p else y
528	mov	x11,x7
529	mov	x12,x8
530	mov	x13,x9
531	mov	x14,x10
532
533	adrp	x3,.Lpoly
534	add	x3,x3,#:lo12:.Lpoly
535	ldp	x4,x5,[x3]
536	ldp	x6,x15,[x3,#16]
537
	// Capture the carry out of the fold above: the intervening mov,
	// adrp/add and ldp instructions do not modify the flags.
538	adcs	x16,xzr,xzr
539
540	subs	x7,x7,x4
541	sbcs	x8,x8,x5
542	sbcs	x9,x9,x6
543	sbcs	x10,x10,x15
544	sbcs	x16,x16,xzr
545
	// cs (no borrow) => y >= p: keep y - p, else keep the backup y
546	csel	x7,x7,x11,cs
547	csel	x8,x8,x12,cs
548	csel	x9,x9,x13,cs
549	csel	x10,x10,x14,cs
550
551.endm
552
553// void ecp_sm2p256_mul(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b);
// r = a * b mod p.  Schoolbook 4x4-limb multiplication producing the
// eight-limb product s0..s7, followed by the RDC fast-reduction macro.
// Frame layout (80 bytes): [sp,#0] x29/x30, [sp,#16] x16/x17,
// [sp,#32..#63] scratch used by RDC, [sp,#64] x19/x20.
// x19/x20 are callee-saved per AAPCS64 and must be preserved; x16/x17
// are caller-saved — saving them too is conservative (generator style).
554.globl	ecp_sm2p256_mul
555.type	ecp_sm2p256_mul,%function
556.align	5
557ecp_sm2p256_mul:
558	AARCH64_SIGN_LINK_REGISTER
559	// Store scalar registers
560	stp	x29,x30,[sp,#-80]!
561	add	x29,sp,#0
562	stp	x16,x17,[sp,#16]
563	stp	x19,x20,[sp,#64]
564
565	// Load inputs
	// x7..x10 = a (s0..s3), x11..x14 = b (s4..s7)
566	ldp	x7,x8,[x1]
567	ldp	x9,x10,[x1,#16]
568	ldp	x11,x12,[x2]
569	ldp	x13,x14,[x2,#16]
570
571// ### multiplication ###
572	// ========================
573	//             s3 s2 s1 s0
574	// *           s7 s6 s5 s4
575	// ------------------------
576	// +           s0 s0 s0 s0
577	//              *  *  *  *
578	//             s7 s6 s5 s4
579	//          s1 s1 s1 s1
580	//           *  *  *  *
581	//          s7 s6 s5 s4
582	//       s2 s2 s2 s2
583	//        *  *  *  *
584	//       s7 s6 s5 s4
585	//    s3 s3 s3 s3
586	//     *  *  *  *
587	//    s7 s6 s5 s4
588	// ------------------------
589	// s7 s6 s5 s4 s3 s2 s1 s0
590	// ========================
591
592// ### s0*s4 ###
593	mul	x16,x7,x11
594	umulh	x5,x7,x11
595
596// ### s1*s4 + s0*s5 ###
597	mul	x3,x8,x11
598	umulh	x4,x8,x11
599	adds	x5,x5,x3
600	adcs	x6,x4,xzr
601
602	mul	x3,x7,x12
603	umulh	x4,x7,x12
604	adds	x5,x5,x3
605	adcs	x6,x6,x4
606	adcs	x15,xzr,xzr
607
608// ### s2*s4 + s1*s5 + s0*s6 ###
609	mul	x3,x9,x11
610	umulh	x4,x9,x11
611	adds	x6,x6,x3
612	adcs	x15,x15,x4
613
614	mul	x3,x8,x12
615	umulh	x4,x8,x12
616	adds	x6,x6,x3
617	adcs	x15,x15,x4
618	adcs	x17,xzr,xzr
619
620	mul	x3,x7,x13
621	umulh	x4,x7,x13
622	adds	x6,x6,x3
623	adcs	x15,x15,x4
624	adcs	x17,x17,xzr
625
626// ### s3*s4 + s2*s5 + s1*s6 + s0*s7 ###
627	mul	x3,x10,x11
628	umulh	x4,x10,x11
629	adds	x15,x15,x3
630	adcs	x17,x17,x4
631	adcs	x19,xzr,xzr
632
633	mul	x3,x9,x12
634	umulh	x4,x9,x12
635	adds	x15,x15,x3
636	adcs	x17,x17,x4
637	adcs	x19,x19,xzr
638
639	mul	x3,x8,x13
640	umulh	x4,x8,x13
641	adds	x15,x15,x3
642	adcs	x17,x17,x4
643	adcs	x19,x19,xzr
644
645	mul	x3,x7,x14
646	umulh	x4,x7,x14
647	adds	x15,x15,x3
648	adcs	x17,x17,x4
649	adcs	x19,x19,xzr
650
651// ### s3*s5 + s2*s6 + s1*s7 ###
652	mul	x3,x10,x12
653	umulh	x4,x10,x12
654	adds	x17,x17,x3
655	adcs	x19,x19,x4
656	adcs	x20,xzr,xzr
657
658	mul	x3,x9,x13
659	umulh	x4,x9,x13
660	adds	x17,x17,x3
661	adcs	x19,x19,x4
662	adcs	x20,x20,xzr
663
	// From here product limbs are steered into x11..x14 (s4..s7 for RDC)
664	mul	x3,x8,x14
665	umulh	x4,x8,x14
666	adds	x11,x17,x3
667	adcs	x19,x19,x4
668	adcs	x20,x20,xzr
669
670// ### s3*s6 + s2*s7 ###
671	mul	x3,x10,x13
672	umulh	x4,x10,x13
673	adds	x19,x19,x3
674	adcs	x20,x20,x4
675	adcs	x17,xzr,xzr
676
677	mul	x3,x9,x14
678	umulh	x4,x9,x14
679	adds	x12,x19,x3
680	adcs	x20,x20,x4
681	adcs	x17,x17,xzr
682
683// ### s3*s7 ###
684	mul	x3,x10,x14
685	umulh	x4,x10,x14
686	adds	x13,x20,x3
687	adcs	x14,x17,x4
688
	// Move the low product limbs into x7..x10 (s0..s3 for RDC)
689	mov	x7,x16
690	mov	x8,x5
691	mov	x9,x6
692	mov	x10,x15
693
694	// result of mul: s7 s6 s5 s4 s3 s2 s1 s0
695
696// ### Reduction ###
697	RDC
698
699	stp	x7,x8,[x0]
700	stp	x9,x10,[x0,#16]
701
702	// Restore scalar registers
703	ldp	x16,x17,[sp,#16]
704	ldp	x19,x20,[sp,#64]
705	ldp	x29,x30,[sp],#80
706
707	AARCH64_VALIDATE_LINK_REGISTER
708	ret
709.size	ecp_sm2p256_mul,.-ecp_sm2p256_mul
710
711// void ecp_sm2p256_sqr(BN_ULONG *r, const BN_ULONG *a);
// r = a^2 mod p.  Computes the six cross products si*sj (i<j), doubles
// them, adds the four square terms si*si, then reduces with RDC.
// Frame layout (80 bytes): [sp,#0] x29/x30, [sp,#16] x16/x17,
// [sp,#32..#63] scratch used by RDC, [sp,#64] x19/x20.
// x19/x20 are callee-saved per AAPCS64 and must be preserved; x16/x17
// are caller-saved — saving them too is conservative (generator style).
712.globl	ecp_sm2p256_sqr
713.type	ecp_sm2p256_sqr,%function
714.align	5
715
716ecp_sm2p256_sqr:
717	AARCH64_SIGN_LINK_REGISTER
718	// Store scalar registers
719	stp	x29,x30,[sp,#-80]!
720	add	x29,sp,#0
721	stp	x16,x17,[sp,#16]
722	stp	x19,x20,[sp,#64]
723
724	// Load inputs
	// x11..x14 = a[0..3], named s4..s7 in the diagram below
725	ldp	x11,x12,[x1]
726	ldp	x13,x14,[x1,#16]
727
728// ### square ###
729	// ========================
730	//             s7 s6 s5 s4
731	// *           s7 s6 s5 s4
732	// ------------------------
733	// +           s4 s4 s4 s4
734	//              *  *  *  *
735	//             s7 s6 s5 s4
736	//          s5 s5 s5 s5
737	//           *  *  *  *
738	//          s7 s6 s5 s4
739	//       s6 s6 s6 s6
740	//        *  *  *  *
741	//       s7 s6 s5 s4
742	//    s7 s7 s7 s7
743	//     *  *  *  *
744	//    s7 s6 s5 s4
745	// ------------------------
746	// s7 s6 s5 s4 s3 s2 s1 s0
747	// ========================
748
749// ### s4*s5 ###
750	mul	x8,x11,x12
751	umulh	x9,x11,x12
752
753// ### s4*s6 ###
754	mul	x3,x13,x11
755	umulh	x10,x13,x11
756	adds	x9,x9,x3
757	adcs	x10,x10,xzr
758
759// ### s4*s7 + s5*s6 ###
760	mul	x3,x14,x11
761	umulh	x4,x14,x11
762	adds	x10,x10,x3
763	adcs	x7,x4,xzr
764
765	mul	x3,x13,x12
766	umulh	x4,x13,x12
767	adds	x10,x10,x3
768	adcs	x7,x7,x4
769	adcs	x5,xzr,xzr
770
771// ### s5*s7 ###
772	mul	x3,x14,x12
773	umulh	x4,x14,x12
774	adds	x7,x7,x3
775	adcs	x5,x5,x4
776
777// ### s6*s7 ###
778	mul	x3,x14,x13
779	umulh	x4,x14,x13
780	adds	x5,x5,x3
781	adcs	x6,x4,xzr
782
783// ### 2*(t3,t2,s0,s3,s2,s1) ###
	// Double all cross products; x15 catches the bit shifted out
784	adds	x8,x8,x8
785	adcs	x9,x9,x9
786	adcs	x10,x10,x10
787	adcs	x7,x7,x7
788	adcs	x5,x5,x5
789	adcs	x6,x6,x6
790	adcs	x15,xzr,xzr
791
792// ### s4*s4 ###
793	mul	x16,x11,x11
794	umulh	x17,x11,x11
795
796// ### s5*s5 ###
797	mul	x11,x12,x12
798	umulh	x12,x12,x12
799
800// ### s6*s6 ###
801	mul	x3,x13,x13
802	umulh	x4,x13,x13
803
804// ### s7*s7 ###
805	mul	x19,x14,x14
806	umulh	x20,x14,x14
807
	// Merge the square terms into the doubled cross-product limbs
808	adds	x8,x8,x17
809	adcs	x9,x9,x11
810	adcs	x10,x10,x12
811	adcs	x7,x7,x3
812	adcs	x5,x5,x4
813	adcs	x6,x6,x19
814	adcs	x15,x15,x20
815
	// Arrange the eight limbs as s0..s3 = x7..x10, s4..s7 = x11..x14 for RDC
816	mov	x11,x7
817	mov	x7,x16
818	mov	x12,x5
819	mov	x13,x6
820	mov	x14,x15
821
822	// result of mul: s7 s6 s5 s4 s3 s2 s1 s0
823
824// ### Reduction ###
825	RDC
826
827	stp	x7,x8,[x0]
828	stp	x9,x10,[x0,#16]
829
830	// Restore scalar registers
831	ldp	x16,x17,[sp,#16]
832	ldp	x19,x20,[sp,#64]
833	ldp	x29,x30,[sp],#80
834
835	AARCH64_VALIDATE_LINK_REGISTER
836	ret
837.size	ecp_sm2p256_sqr,.-ecp_sm2p256_sqr
838