xref: /freebsd/crypto/openssl/crypto/ec/asm/ecp_nistp384-ppc64.pl (revision e7be843b4a162e68651d3911f0357ed464915629)
1#! /usr/bin/env perl
2# Copyright 2023-2025 The OpenSSL Project Authors. All Rights Reserved.
3#
4# Licensed under the Apache License 2.0 (the "License").  You may not use
5# this file except in compliance with the License.  You can obtain a copy
6# in the file LICENSE in the source distribution or at
7# https://www.openssl.org/source/license.html
8#
9# ====================================================================
10# Written by Danny Tsen <dtsen@us.ibm.com> # for the OpenSSL project.
11#
12# Copyright 2025- IBM Corp.
13# ====================================================================
14#
15# p384 lower-level primitives for PPC64.
16#
17
18
19use strict;
20use warnings;
21
# Command line: <flavour> [other args ...] [<output-file>]
# The first argument is handed to ppc-xlate.pl as the flavour.  Of the
# remaining arguments, the first one that looks like a file name
# ("name.ext") is taken as the output file; if there is none, "-" is
# passed on instead.
my $flavour = shift;
my $output = "";
while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {}
if (!$output) {
        $output = "-";
}

# Locate ppc-xlate.pl, first next to this script and then in
# ../../perlasm/.  When $0 carries no directory part the search is relative
# to the current directory; $dir is set to "" in that case so that it is
# never interpolated while undefined.
my ($xlate, $dir);
$dir = ($0 =~ m/(.*[\/\\])[^\/\\]+$/) ? $1 : "";
( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
die "can't locate ppc-xlate.pl";

# Everything printed to STDOUT from here on is piped through the translator.
# Fail immediately if the pipe cannot be started instead of silently
# generating nothing, and quote the output name for the shell.
open OUT,"| \"$^X\" $xlate $flavour \"$output\""
    or die "can't call $xlate: $!";
*STDOUT=*OUT;

# Accumulates the assembly text that is emitted at the end of the script.
my $code = "";
39
$code.=<<___;
.machine "any"
.text

#
# NOTE: all of this text sits in an interpolating Perl heredoc.  Comments
# added here must not contain dollar signs, at-signs, backslashes or
# backquotes.
#
# p384_felem_mul - exported wrapper around _p384_felem_mul_core.
#
# Creates a 176-byte stack frame, saves r14-r22 (the registers the core
# routine overwrites besides r6-r12), calls the core with r3, r4 and r5
# passed through unchanged, then restores the registers and pops the frame.
#
.globl  p384_felem_mul
.type   p384_felem_mul,\@function
.align	4
p384_felem_mul:

	stdu	1, -176(1)
	mflr	0		# return address is kept in r0; the core does not write r0
	std	14, 56(1)
	std	15, 64(1)
	std	16, 72(1)
	std	17, 80(1)
	std	18, 88(1)
	std	19, 96(1)
	std	20, 104(1)
	std	21, 112(1)
	std	22, 120(1)

	bl	_p384_felem_mul_core

	mtlr	0		# restore the return address saved above
	ld	14, 56(1)
	ld	15, 64(1)
	ld	16, 72(1)
	ld	17, 80(1)
	ld	18, 88(1)
	ld	19, 96(1)
	ld	20, 104(1)
	ld	21, 112(1)
	ld	22, 120(1)
	addi	1, 1, 176
	blr
.size   p384_felem_mul,.-p384_felem_mul
76
#
# p384_felem_square - exported wrapper around _p384_felem_square_core.
#
# Creates a 176-byte stack frame, saves r14-r17 (the only registers above
# r12 that the square core overwrites), calls the core with r3 and r4
# passed through unchanged, then restores the registers and pops the frame.
#
.globl  p384_felem_square
.type   p384_felem_square,\@function
.align	4
p384_felem_square:

	stdu	1, -176(1)
	mflr	0		# return address is kept in r0; the core does not write r0
	std	14, 56(1)
	std	15, 64(1)
	std	16, 72(1)
	std	17, 80(1)

	bl	_p384_felem_square_core

	mtlr	0		# restore the return address saved above
	ld	14, 56(1)
	ld	15, 64(1)
	ld	16, 72(1)
	ld	17, 80(1)
	addi	1, 1, 176
	blr
.size   p384_felem_square,.-p384_felem_square
99
#
# Felem mul core function -
# r3, r4 and r5 need to be pre-loaded.
#
#   r3 = output: thirteen 128-bit values, stored at 0(r3) .. 200(r3)
#   r4 = first input:  seven 64-bit limbs a0..a6, loaded into r6-r12
#   r5 = second input: seven 64-bit limbs b0..b6, loaded into r14-r20
#
# Computes the schoolbook product out[k] = sum of a[i] * b[k-i].
# out0 and out12 are single products formed with mulld/mulhdu in r21/r22.
# The other outputs use vmsumudm: mtvsrdd packs two 64-bit GPRs into one
# vector register (VSR 32+n is VR n), and each vmsumudm adds the two
# 64x64-bit products of the matching halves to a 128-bit accumulator.
# v0 is kept at zero and v19 carries partial sums between steps.
#
# Overwrites r6-r12, r14-r22, v0-v11 and v13-v19.  The wrappers in this
# file save r14-r22 (or more) before calling.
#
.type   _p384_felem_mul_core,\@function
.align	4
_p384_felem_mul_core:

	ld	6,0(4)
	ld	14,0(5)
	ld	7,8(4)
	ld	15,8(5)
	ld	8,16(4)
	ld	16,16(5)
	ld	9,24(4)
	ld	17,24(5)
	ld	10,32(4)
	ld	18,32(5)
	ld	11,40(4)
	ld	19,40(5)
	ld	12,48(4)
	ld	20,48(5)

	# out0 = a0*b0
	mulld	21, 14, 6
	mulhdu	22, 14, 6
	std	21, 0(3)
	std	22, 8(3)

	# v0 = 0, the zero accumulator for the first vmsumudm of each output
	vxor	0, 0, 0

	# out1 = a1*b0 + a0*b1
	mtvsrdd	32+13, 14, 6
	mtvsrdd	32+14, 7, 15
	vmsumudm 1, 13, 14, 0

	# out2 = a1*b1 + a0*b2 + a2*b0
	mtvsrdd	32+15, 15, 6
	mtvsrdd	32+16, 7, 16
	mtvsrdd	32+17, 0, 8
	mtvsrdd	32+18, 0, 14
	vmsumudm 19, 15, 16, 0
	vmsumudm 2, 17, 18, 19

	# out3 = a1*b2 + a0*b3 + a3*b0 + a2*b1
	mtvsrdd	32+13, 16, 6
	mtvsrdd	32+14, 7, 17
	mtvsrdd	32+15, 14, 8
	mtvsrdd	32+16, 9, 15
	vmsumudm 19, 13, 14, 0
	vmsumudm 3, 15, 16, 19

	# out4 = a1*b3 + a0*b4 + a3*b1 + a2*b2 + a4*b0
	mtvsrdd	32+13, 17, 6
	mtvsrdd	32+14, 7, 18
	mtvsrdd	32+15, 15, 8
	mtvsrdd	32+16, 9, 16
	mtvsrdd	32+17, 0, 10
	mtvsrdd	32+18, 0, 14
	vmsumudm 19, 13, 14, 0
	vmsumudm 4, 15, 16, 19
	vmsumudm 4, 17, 18, 4

	# out5 = a1*b4 + a0*b5 + a3*b2 + a2*b3 + a5*b0 + a4*b1
	mtvsrdd	32+13, 18, 6
	mtvsrdd	32+14, 7, 19
	mtvsrdd	32+15, 16, 8
	mtvsrdd	32+16, 9, 17
	mtvsrdd	32+17, 14, 10
	mtvsrdd	32+18, 11, 15
	vmsumudm 19, 13, 14, 0
	vmsumudm 5, 15, 16, 19
	vmsumudm 5, 17, 18, 5

	# store out1 .. out5
	stxv	32+1, 16(3)
	stxv	32+2, 32(3)
	stxv	32+3, 48(3)
	stxv	32+4, 64(3)
	stxv	32+5, 80(3)

	# out6 = a1*b5 + a0*b6 + a3*b3 + a2*b4 + a5*b1 + a4*b2 + a6*b0
	mtvsrdd	32+13, 19, 6
	mtvsrdd	32+14, 7, 20
	mtvsrdd	32+15, 17, 8
	mtvsrdd	32+16, 9, 18
	mtvsrdd	32+17, 15, 10
	mtvsrdd	32+18, 11, 16
	vmsumudm 19, 13, 14, 0
	vmsumudm 6, 15, 16, 19
	mtvsrdd	32+13, 0, 12
	mtvsrdd	32+14, 0, 14
	vmsumudm 19, 17, 18, 6
	vmsumudm 6, 13, 14, 19

	# out7 = a2*b5 + a1*b6 + a4*b3 + a3*b4 + a6*b1 + a5*b2
	mtvsrdd	32+13, 19, 7
	mtvsrdd	32+14, 8, 20
	mtvsrdd	32+15, 17, 9
	mtvsrdd	32+16, 10, 18
	mtvsrdd	32+17, 15, 11
	mtvsrdd	32+18, 12, 16
	vmsumudm 19, 13, 14, 0
	vmsumudm 7, 15, 16, 19
	vmsumudm 7, 17, 18, 7

	# out8 = a3*b5 + a2*b6 + a5*b3 + a4*b4 + a6*b2
	mtvsrdd	32+13, 19, 8
	mtvsrdd	32+14, 9, 20
	mtvsrdd	32+15, 17, 10
	mtvsrdd	32+16, 11, 18
	mtvsrdd	32+17, 0, 12
	mtvsrdd	32+18, 0, 16
	vmsumudm 19, 13, 14, 0
	vmsumudm 8, 15, 16, 19
	vmsumudm 8, 17, 18, 8

	# out9 = a4*b5 + a3*b6 + a6*b3 + a5*b4
	mtvsrdd	32+13, 19, 9
	mtvsrdd	32+14, 10, 20
	mtvsrdd	32+15, 17, 11
	mtvsrdd	32+16, 12, 18
	vmsumudm 19, 13, 14, 0
	vmsumudm 9, 15, 16, 19

	# out10 = a5*b5 + a4*b6 + a6*b4
	mtvsrdd	32+13, 19, 10
	mtvsrdd	32+14, 11, 20
	mtvsrdd	32+15, 0, 12
	mtvsrdd	32+16, 0, 18
	vmsumudm 19, 13, 14, 0
	vmsumudm 10, 15, 16, 19

	# out11 = a6*b5 + a5*b6
	mtvsrdd	32+17, 19, 11
	mtvsrdd	32+18, 12, 20
	vmsumudm 11, 17, 18, 0

	# store out6 .. out11
	stxv	32+6, 96(3)
	stxv	32+7, 112(3)
	stxv	32+8, 128(3)
	stxv	32+9, 144(3)
	stxv	32+10, 160(3)
	stxv	32+11, 176(3)

	# out12 = a6*b6
	mulld	21, 20, 12
	mulhdu	22, 20, 12	# out12

	std	21, 192(3)
	std	22, 200(3)

	blr
.size   _p384_felem_mul_core,.-_p384_felem_mul_core
253
#
# Felem square core function -
# r3 and r4 need to be pre-loaded.
#
#   r3 = output: thirteen 128-bit values, stored at 0(r3) .. 200(r3)
#   r4 = input:  seven 64-bit limbs a0..a6, loaded into r6-r12
#
# Computes out[k] = sum of a[i] * a[k-i].  Cross terms are formed once
# against a doubled limb: r14 = 2*a0 (later reused for 2*a4), r15 = 2*a1,
# r16 = 2*a2, r17 = 2*a3.  out0, out11 and out12 are done with
# mulld/mulhdu; the rest use mtvsrdd/vmsumudm with v0 as the zero
# accumulator and v19 as a partial sum.
#
# Overwrites r6-r12, r14-r17, v0-v10, v13-v16 and v19.
#
.type   _p384_felem_square_core,\@function
.align	4
_p384_felem_square_core:

	ld	6, 0(4)
	ld	7, 8(4)
	ld	8, 16(4)
	ld	9, 24(4)
	ld	10, 32(4)
	ld	11, 40(4)
	ld	12, 48(4)

	# v0 = 0, the zero accumulator
	vxor	0, 0, 0

	# out0 = a0*a0
	mulld	14, 6, 6
	mulhdu	15, 6, 6
	std	14, 0(3)
	std	15, 8(3)

	# out1 = 2*a0*a1
	add	14, 6, 6
	mtvsrdd	32+13, 0, 14
	mtvsrdd	32+14, 0, 7
	vmsumudm 1, 13, 14, 0

	# out2 = a1*a1 + 2*a0*a2
	mtvsrdd	32+15, 7, 14
	mtvsrdd	32+16, 7, 8
	vmsumudm 2, 15, 16, 0

	# out3 = 2*a1*a2 + 2*a0*a3
	add	15, 7, 7
	mtvsrdd	32+13, 8, 14
	mtvsrdd	32+14, 15, 9
	vmsumudm 3, 13, 14, 0

	# out4 = 2*a1*a3 + 2*a0*a4 + a2*a2
	mtvsrdd	32+13, 9, 14
	mtvsrdd	32+14, 15, 10
	mtvsrdd	32+15, 0, 8
	vmsumudm 4, 13, 14, 0
	vmsumudm 4, 15, 15, 4

	# out5 = 2*a1*a4 + 2*a0*a5 + 2*a2*a3
	mtvsrdd	32+13, 10, 14
	mtvsrdd	32+14, 15, 11
	add	16, 8, 8
	mtvsrdd	32+15, 0, 16
	mtvsrdd	32+16, 0, 9
	vmsumudm 5, 13, 14, 0
	vmsumudm 5, 15, 16, 5

	# store out1 .. out4 (out5 is stored a few instructions further down)
	stxv	32+1, 16(3)
	stxv	32+2, 32(3)
	stxv	32+3, 48(3)
	stxv	32+4, 64(3)

	# out6 = 2*a1*a5 + 2*a0*a6 + a3*a3 + 2*a2*a4
	mtvsrdd	32+13, 11, 14
	mtvsrdd	32+14, 15, 12
	mtvsrdd	32+15, 9, 16
	mtvsrdd	32+16, 9, 10
	stxv	32+5, 80(3)
	vmsumudm 19, 13, 14, 0
	vmsumudm 6, 15, 16, 19

	# out7 = 2*a2*a5 + 2*a1*a6 + 2*a3*a4
	add	17, 9, 9
	mtvsrdd	32+13, 11, 15
	mtvsrdd	32+14, 16, 12
	mtvsrdd	32+15, 0, 17
	mtvsrdd	32+16, 0, 10
	vmsumudm 19, 13, 14, 0
	vmsumudm 7, 15, 16, 19

	# out8 = 2*a3*a5 + 2*a2*a6 + a4*a4
	mtvsrdd	32+13, 11, 16
	mtvsrdd	32+14, 17, 12
	mtvsrdd	32+15, 0, 10
	vmsumudm 19, 13, 14, 0
	vmsumudm 8, 15, 15, 19

	# out9 = 2*a4*a5 + 2*a3*a6   (r14 now holds 2*a4)
	add	14, 10, 10
	mtvsrdd	32+13, 11, 17
	mtvsrdd	32+14, 14, 12
	vmsumudm 9, 13, 14, 0

	# out10 = a5*a5 + 2*a4*a6
	mtvsrdd	32+13, 11, 14
	mtvsrdd	32+14, 11, 12
	vmsumudm 10, 13, 14, 0

	stxv	32+6, 96(3)
	stxv	32+7, 112(3)

	# out11 = 2*a5*a6
	# (vector variant left disabled; the scalar sequence below is used)
	#add	14, 11, 11
	#mtvsrdd	32+13, 0, 14
	#mtvsrdd	32+14, 0, 12
	#vmsumudm 11, 13, 14, 0

	# r7:r6 = a6*a5, then doubled with carry into r9:r8
	# (r6-r9 are reused as scratch; a0..a3 are no longer needed)
	mulld	6, 12, 11
	mulhdu	7, 12, 11
	addc	8, 6, 6
	adde	9, 7, 7

	stxv	32+8, 128(3)
	stxv	32+9, 144(3)
	stxv	32+10, 160(3)
	#stxv	32+11, 176(3)

	# out12 = a6*a6
	mulld	14, 12, 12
	mulhdu	15, 12, 12

	# store out11 (r8 low, r9 high) and out12 (r14 low, r15 high)
	std	8, 176(3)
	std	9, 184(3)
	std	14, 192(3)
	std	15, 200(3)

	blr
.size   _p384_felem_square_core,.-_p384_felem_square_core
382
#
# widefelem (128 bits) * 8
#
# Shifts one 128-bit value in memory left by 3 bits, in place.
#   _off1 = offset from r3 of the lower 64 bits
#   _off2 = offset from r3 of the upper 64 bits
# Overwrites r8, r9 and r10.  Bits shifted out of the upper half are lost.
#
.macro F128_X_8 _off1 _off2
	ld	9,\\_off1(3)
	ld	8,\\_off2(3)
	srdi	10,9,61		# r10 = top 3 bits of the lower half
	rldimi	10,8,3,0	# r10 = (upper << 3) | those 3 bits
	sldi	9,9,3		# r9  = lower << 3
	std	9,\\_off1(3)
	std	10,\\_off2(3)
.endm
395
#
# p384_felem128_mul_by_8 - multiply each of the thirteen 128-bit values at
# r3 by 8, in place (one F128_X_8 expansion per value, offsets 0 .. 192).
# Leaf routine without a stack frame; overwrites r8, r9 and r10.
#
.globl p384_felem128_mul_by_8
.type	p384_felem128_mul_by_8, \@function
.align 4
p384_felem128_mul_by_8:

	F128_X_8 0, 8

	F128_X_8 16, 24

	F128_X_8 32, 40

	F128_X_8 48, 56

	F128_X_8 64, 72

	F128_X_8 80, 88

	F128_X_8 96, 104

	F128_X_8 112, 120

	F128_X_8 128, 136

	F128_X_8 144, 152

	F128_X_8 160, 168

	F128_X_8 176, 184

	F128_X_8 192, 200

	blr
.size	p384_felem128_mul_by_8,.-p384_felem128_mul_by_8
429
#
# widefelem (128 bits) * 2
#
# Shifts one 128-bit value in memory left by 1 bit, in place.
#   _off1 = offset from r3 of the lower 64 bits
#   _off2 = offset from r3 of the upper 64 bits
# Overwrites r8, r9 and r10.  The bit shifted out of the upper half is lost.
#
.macro F128_X_2 _off1 _off2
	ld	9,\\_off1(3)
	ld	8,\\_off2(3)
	srdi	10,9,63		# r10 = top bit of the lower half
	rldimi	10,8,1,0	# r10 = (upper << 1) | that bit
	sldi	9,9,1		# r9  = lower << 1
	std	9,\\_off1(3)
	std	10,\\_off2(3)
.endm
442
#
# p384_felem128_mul_by_2 - multiply each of the thirteen 128-bit values at
# r3 by 2, in place (one F128_X_2 expansion per value, offsets 0 .. 192).
# Leaf routine without a stack frame; overwrites r8, r9 and r10.
#
.globl p384_felem128_mul_by_2
.type	p384_felem128_mul_by_2, \@function
.align 4
p384_felem128_mul_by_2:

	F128_X_2 0, 8

	F128_X_2 16, 24

	F128_X_2 32, 40

	F128_X_2 48, 56

	F128_X_2 64, 72

	F128_X_2 80, 88

	F128_X_2 96, 104

	F128_X_2 112, 120

	F128_X_2 128, 136

	F128_X_2 144, 152

	F128_X_2 160, 168

	F128_X_2 176, 184

	F128_X_2 192, 200

	blr
.size	p384_felem128_mul_by_2,.-p384_felem128_mul_by_2
476
#
# p384_felem_diff128 - out[i] = out[i] + K[i] - in[i] for thirteen 128-bit
# values.
#
#   r3 = out, read and updated in place (offsets 0 .. 200)
#   r4 = in,  thirteen 128-bit values (offsets 0 .. 200)
#
# K[i] is two127m71 for every element except
#   K[0] = two127, K[6] = two127p111m79m71,
#   K[7] = two127m119m71, K[8] = two127m95m71
# (names as used in the comments of the .LConst_two127 table).  Only the
# upper 64 bits of each constant are fetched; r9 = 0 is added to the lower
# half.  two127 itself is built in r7 instead of being loaded.
#
# Leaf routine without a stack frame.  Overwrites r5-r12 and the carry bit.
#
.globl p384_felem_diff128
.type	p384_felem_diff128, \@function
.align 4
p384_felem_diff128:

	# r5 = address of the constant table, formed relative to r2
	addis   5, 2, .LConst_two127\@toc\@ha
	addi    5, 5, .LConst_two127\@toc\@l

	ld	10, 0(3)
	ld	8, 8(3)
	li	9, 0		# r9 = 0, lower half of every constant
	addc	10, 10, 9
	li	7, -1
	rldicr	7, 7, 0, 0	# two127: keep only the top bit of r7
	adde	8, 8, 7
	ld	11, 0(4)
	ld	12, 8(4)
	subfc	11, 11, 10
	subfe	12, 12, 8
	std	11, 0(3)	# out0
	std	12, 8(3)

	# two127m71 = (r10, r9): r10 = upper 64 bits, r9 = 0 lower 64 bits
	ld	8, 16(3)
	ld	7, 24(3)
	ld	10, 24(5)	# two127m71
	addc	8, 8, 9
	adde	7, 7, 10
	ld	11, 16(4)
	ld	12, 24(4)
	subfc	11, 11, 8
	subfe	12, 12, 7
	std	11, 16(3)	# out1
	std	12, 24(3)

	ld	8, 32(3)
	ld	7, 40(3)
	addc	8, 8, 9
	adde	7, 7, 10
	ld	11, 32(4)
	ld	12, 40(4)
	subfc	11, 11, 8
	subfe	12, 12, 7
	std	11, 32(3)	# out2
	std	12, 40(3)

	ld	8, 48(3)
	ld	7, 56(3)
	addc	8, 8, 9
	adde	7, 7, 10
	ld	11, 48(4)
	ld	12, 56(4)
	subfc	11, 11, 8
	subfe	12, 12, 7
	std	11, 48(3)	# out3
	std	12, 56(3)

	ld	8, 64(3)
	ld	7, 72(3)
	addc	8, 8, 9
	adde	7, 7, 10
	ld	11, 64(4)
	ld	12, 72(4)
	subfc	11, 11, 8
	subfe	12, 12, 7
	std	11, 64(3)	# out4
	std	12, 72(3)

	ld	8, 80(3)
	ld	7, 88(3)
	addc	8, 8, 9
	adde	7, 7, 10
	ld	11, 80(4)
	ld	12, 88(4)
	subfc	11, 11, 8
	subfe	12, 12, 7
	std	11, 80(3)	# out5
	std	12, 88(3)

	ld	8, 96(3)
	ld	7, 104(3)
	ld	6, 40(5)	# two127p111m79m71
	addc	8, 8, 9
	adde	7, 7, 6
	ld	11, 96(4)
	ld	12, 104(4)
	subfc	11, 11, 8
	subfe	12, 12, 7
	std	11, 96(3)	# out6
	std	12, 104(3)

	ld	8, 112(3)
	ld	7, 120(3)
	ld	6, 56(5)	# two127m119m71
	addc	8, 8, 9
	adde	7, 7, 6
	ld	11, 112(4)
	ld	12, 120(4)
	subfc	11, 11, 8
	subfe	12, 12, 7
	std	11, 112(3)	# out7
	std	12, 120(3)

	ld	8, 128(3)
	ld	7, 136(3)
	ld	6, 72(5)	# two127m95m71
	addc	8, 8, 9
	adde	7, 7, 6
	ld	11, 128(4)
	ld	12, 136(4)
	subfc	11, 11, 8
	subfe	12, 12, 7
	std	11, 128(3)	# out8
	std	12, 136(3)

	# out9 .. out12 use two127m71 again (still in r10)
	ld	8, 144(3)
	ld	7, 152(3)
	addc	8, 8, 9
	adde	7, 7, 10
	ld	11, 144(4)
	ld	12, 152(4)
	subfc	11, 11, 8
	subfe	12, 12, 7
	std	11, 144(3)	# out9
	std	12, 152(3)

	ld	8, 160(3)
	ld	7, 168(3)
	addc	8, 8, 9
	adde	7, 7, 10
	ld	11, 160(4)
	ld	12, 168(4)
	subfc	11, 11, 8
	subfe	12, 12, 7
	std	11, 160(3)	# out10
	std	12, 168(3)

	ld	8, 176(3)
	ld	7, 184(3)
	addc	8, 8, 9
	adde	7, 7, 10
	ld	11, 176(4)
	ld	12, 184(4)
	subfc	11, 11, 8
	subfe	12, 12, 7
	std	11, 176(3)	# out11
	std	12, 184(3)

	ld	8, 192(3)
	ld	7, 200(3)
	addc	8, 8, 9
	adde	7, 7, 10
	ld	11, 192(4)
	ld	12, 200(4)
	subfc	11, 11, 8
	subfe	12, 12, 7
	std	11, 192(3)	# out12
	std	12, 200(3)

	blr
.size	p384_felem_diff128,.-p384_felem_diff128
638
.data
.align 4
#
# Constants for p384_felem_diff128.  Each entry is 16 bytes written as four
# 32-bit words; the routine reads only the doubleword at offset +8 of an
# entry (24, 40, 56 and 72 from the label) and uses it as the upper 64 bits
# of the constant.  The first entry (two127) is not loaded at all; the
# routine builds that value in a register.
# NOTE(review): the word order shown matches that use on a little-endian
# target - confirm before enabling on big-endian.
#
.LConst_two127:
#two127
.long 0x00000000, 0x00000000, 0x00000000, 0x80000000
#two127m71
.long 0x00000000, 0x00000000, 0xffffff80, 0x7fffffff
#two127p111m79m71
.long 0x00000000, 0x00000000, 0xffff7f80, 0x80007fff
#two127m119m71
.long 0x00000000, 0x00000000, 0xffffff80, 0x7f7fffff
#two127m95m71
.long 0x00000000, 0x00000000, 0x7fffff80, 0x7fffffff
652
.text

#
# p384_felem_diff_128_64 - out[i] = out[i] + K[i] - in[i] for i = 0 .. 6.
#
#   r3 = out, seven 128-bit values, read and updated in place (0 .. 104)
#   r4 = in,  seven 64-bit limbs (0 .. 48); each is subtracted with a zero
#        upper half (r7 = 0 is the upper subtrahend)
#
# K[0] = two64p48m16 (lower doubleword from the table, upper doubleword 1
# supplied in r6), K[1] = two64m56m8, K[2] = two64m32m8 and
# K[3..6] = two64m8, which is produced with li instead of a load.
#
# Leaf routine without a stack frame.  Overwrites r5-r12 and the carry bit.
#
.globl p384_felem_diff_128_64
.type	p384_felem_diff_128_64, \@function
.align 4
p384_felem_diff_128_64:
	# r5 = address of the constant table, formed relative to r2
	addis   5, 2, .LConst_128_two64\@toc\@ha
	addi    5, 5, .LConst_128_two64\@toc\@l

	ld	9, 0(3)
	ld	10, 8(3)
	ld	8, 48(5)	# two64p48m16
	li	7, 0		# r7 = 0, upper half of every in[i]
	addc	9, 9, 8
	li	6, 1		# upper doubleword of two64p48m16
	adde	10, 10, 6
	ld	11, 0(4)
	subfc	8, 11, 9
	subfe	12, 7, 10
	std	8, 0(3)		# out0
	std	12, 8(3)

	ld	9, 16(3)
	ld	10, 24(3)
	ld	8, 0(5)		# two64m56m8
	addc	9, 9, 8
	addze	10, 10
	ld	11, 8(4)
	subfc	11, 11, 9
	subfe	12, 7, 10
	std	11, 16(3)	# out1
	std	12, 24(3)

	ld	9, 32(3)
	ld	10, 40(3)
	ld	8, 16(5)	# two64m32m8
	addc	9, 9, 8
	addze	10, 10
	ld	11, 16(4)
	subfc	11, 11, 9
	subfe	12, 7, 10
	std	11, 32(3)	# out2
	std	12, 40(3)

	ld	10, 48(3)
	ld	8, 56(3)
	#ld	9, 32(5)	# two64m8
	li	9, -256		# two64m8
	addc	10, 10, 9
	addze	8, 8
	ld	11, 24(4)
	subfc	11, 11, 10
	subfe	12, 7, 8
	std	11, 48(3)	# out3
	std	12, 56(3)

	ld	10, 64(3)
	ld	8, 72(3)
	addc	10, 10, 9
	addze	8, 8
	ld	11, 32(4)
	subfc	11, 11, 10
	subfe	12, 7, 8
	std	11, 64(3)	# out4
	std	12, 72(3)

	ld	10, 80(3)
	ld	8, 88(3)
	addc	10, 10, 9
	addze	8, 8
	ld	11, 40(4)
	subfc	11, 11, 10
	subfe	12, 7, 8
	std	11, 80(3)	# out5
	std	12, 88(3)

	ld	10, 96(3)
	ld	8, 104(3)
	addc	10, 10, 9
	addze	9, 8		# upper half goes to r9 here; two64m8 is no longer needed
	ld	11, 48(4)
	subfc	11, 11, 10
	subfe	12, 7, 9
	std	11, 96(3)	# out6
	std	12, 104(3)

	blr
.size	p384_felem_diff_128_64,.-p384_felem_diff_128_64
741
.data
.align 4
#
# Constants for p384_felem_diff_128_64.  Each entry is 16 bytes written as
# four 32-bit words; the routine loads the doubleword at offset +0 of an
# entry (0, 16 and 48 from the label).  The two64m8 entry is not loaded:
# the routine forms that value with li.
# NOTE(review): the word order shown matches that use on a little-endian
# target - confirm before enabling on big-endian.
#
.LConst_128_two64:
#two64m56m8
.long 0xffffff00, 0xfeffffff, 0x00000000, 0x00000000
#two64m32m8
.long 0xffffff00, 0xfffffffe, 0x00000000, 0x00000000
#two64m8
.long 0xffffff00, 0xffffffff, 0x00000000, 0x00000000
#two64p48m16
.long 0xffff0000, 0x0000ffff, 0x00000001, 0x00000000

#
# Constants for p384_felem_diff64, same 16-byte entry layout; the routine
# loads the doubleword at offsets 0, 16, 32 and 48 from the label.
#
.LConst_two60:
#two60m52m4
.long 0xfffffff0, 0x0fefffff, 0x0, 0x0
#two60p44m12
.long 0xfffff000, 0x10000fff, 0x0, 0x0
#two60m28m4
.long 0xeffffff0, 0x0fffffff, 0x0, 0x0
#two60m4
.long 0xfffffff0, 0x0fffffff, 0x0, 0x0
763
.text
#
# static void felem_diff64(felem out, const felem in)
#
# out[i] = out[i] + K[i] - in[i] for the seven 64-bit limbs i = 0 .. 6,
# using plain 64-bit add/subf (no carry between limbs).
#
#   r3 = out, read and updated in place (offsets 0 .. 48)
#   r4 = in (offsets 0 .. 48)
#
# K[0] = two60p44m12, K[1] = two60m52m4, K[2] = two60m28m4 and
# K[3..6] = two60m4, all loaded from the .LConst_two60 table.
#
# Leaf routine without a stack frame.  Overwrites r5 and r7-r12.
#
.globl p384_felem_diff64
.type	p384_felem_diff64, \@function
.align 4
p384_felem_diff64:
	# r5 = address of the constant table, formed relative to r2
	addis   5, 2, .LConst_two60\@toc\@ha
	addi    5, 5, .LConst_two60\@toc\@l

	ld	9, 0(3)
	ld	8, 16(5)	# two60p44m12
	li	7, 0		# r7 is zeroed here but not read again in this routine
	add	9, 9, 8
	ld	11, 0(4)
	subf	8, 11, 9
	std	8, 0(3)		# out0

	ld	9, 8(3)
	ld	8, 0(5)		# two60m52m4
	add	9, 9, 8
	ld	11, 8(4)
	subf	11, 11, 9
	std	11, 8(3)	# out1

	ld	9, 16(3)
	ld	8, 32(5)	# two60m28m4
	add	9, 9, 8
	ld	11, 16(4)
	subf	11, 11, 9
	std	11, 16(3)	# out2

	ld	10, 24(3)
	ld	9, 48(5)	# two60m4, kept in r9 for out3 .. out6
	add	10, 10, 9
	ld	12, 24(4)
	subf	12, 12, 10
	std	12, 24(3)	# out3

	ld	10, 32(3)
	add	10, 10, 9
	ld	11, 32(4)
	subf	11, 11, 10
	std	11, 32(3)	# out4

	ld	10, 40(3)
	add	10, 10, 9
	ld	12, 40(4)
	subf	12, 12, 10
	std	12, 40(3)	# out5

	ld	10, 48(3)
	add	10, 10, 9
	ld	11, 48(4)
	subf	11, 11, 10
	std	11, 48(3)	# out6

	blr
.size	p384_felem_diff64,.-p384_felem_diff64
824
.text
#
# Shift 128 bits right <nbits>
#
#   (o_h, o_l) = (in_h, in_l) >> nbits, all arguments are GPR numbers.
#
# o_l is written before in_h is read, so o_l must not name the same
# register as in_h.  Every use in this file passes nbits between 8 and 56.
#
.macro SHR o_h o_l in_h in_l nbits
	srdi	\\o_l, \\in_l, \\nbits		# shift lower right <nbits>
	rldimi	\\o_l, \\in_h, 64-\\nbits, 0	# insert <64-nbits> from hi
	srdi	\\o_h, \\in_h, \\nbits		# shift higher right <nbits>
.endm
834
#
# static void felem_reduce(felem out, const widefelem in)
#
# Exported wrapper around _p384_felem_reduce_core.  Creates a 208-byte
# stack frame, saves r14-r31 (the core keeps its accumulators in r14-r31),
# calls the core with r3 and r4 passed through unchanged, then restores
# the registers and pops the frame.
#
# The symbol is exported with .globl, the same directive used for the
# other entry points in this file.
#
.globl p384_felem_reduce
.type   p384_felem_reduce,\@function
.align 4
p384_felem_reduce:

	stdu    1, -208(1)
	mflr	0		# return address is kept in r0; the core does not write r0
	std     14, 56(1)
	std     15, 64(1)
	std     16, 72(1)
	std     17, 80(1)
	std     18, 88(1)
	std     19, 96(1)
	std     20, 104(1)
	std     21, 112(1)
	std     22, 120(1)
	std     23, 128(1)
	std     24, 136(1)
	std     25, 144(1)
	std     26, 152(1)
	std     27, 160(1)
	std     28, 168(1)
	std     29, 176(1)
	std     30, 184(1)
	std     31, 192(1)

	bl	_p384_felem_reduce_core

	mtlr	0		# restore the return address saved above
	ld     14, 56(1)
	ld     15, 64(1)
	ld     16, 72(1)
	ld     17, 80(1)
	ld     18, 88(1)
	ld     19, 96(1)
	ld     20, 104(1)
	ld     21, 112(1)
	ld     22, 120(1)
	ld     23, 128(1)
	ld     24, 136(1)
	ld     25, 144(1)
	ld     26, 152(1)
	ld     27, 160(1)
	ld     28, 168(1)
	ld     29, 176(1)
	ld     30, 184(1)
	ld     31, 192(1)
	addi	1, 1, 208
	blr
.size   p384_felem_reduce,.-p384_felem_reduce
888
#
# Felem reduction core function -
# r3 and r4 need to be pre-loaded.
#
#   r3 = output: seven 64-bit limbs, stored at 0(r3) .. 48(r3)
#   r4 = input:  thirteen 128-bit values in[0] .. in[12], lower doubleword
#        at offset 16*i and upper doubleword at 16*i+8
#
# Register use (each accumulator is a lower/upper doubleword pair):
#   acc[0] = r14/r15   acc[1] = r16/r17   acc[2] = r18/r19
#   acc[3] = r20/r21   acc[4] = r22/r23   acc[5] = r24/r25
#   acc[6] = r26/r27   acc[7] = r28/r29   acc[8] = r30/r31
#   r5/r6   = value currently being eliminated (lower/upper)
#   r9/r10  = shifted copy produced by SHR (upper/lower)
#   r7      = all ones, shifted right to build the masks
#   r11     = mask, masked term, or upper doubleword of a constant
#   r12     = address of the .LConst table
#   r30/r31 are reused for temp once acc[8] has been eliminated.
#
# The comments in front of each group of instructions give the
# operation being carried out in C-like notation.
#
# Overwrites r5-r7, r9-r12, r14-r31, CR0 (andi.) and the carry bit; the
# wrappers in this file save r14-r31 before calling.
#
.type   _p384_felem_reduce_core,\@function
.align 4
_p384_felem_reduce_core:
	# r12 = address of the constant table, formed relative to r2
	addis   12, 2, .LConst\@toc\@ha
	addi    12, 12, .LConst\@toc\@l

	# load constant p
	ld	11, 8(12)	# hi - two124m68

	# acc[6] = in[6] + two124m68;
	ld	26, 96(4)	# in[6].l
	ld	27, 96+8(4)	# in[6].h
	add	27, 27, 11

	# acc[5] = in[5] + two124m68;
	ld	24, 80(4)	# in[5].l
	ld	25, 80+8(4)	# in[5].h
	add	25, 25, 11

	# acc[4] = in[4] + two124m68;
	ld	22, 64(4)	# in[4].l
	ld	23, 64+8(4)	# in[4].h
	add	23, 23, 11

	# acc[3] = in[3] + two124m68;
	ld	20, 48(4)	# in[3].l
	ld	21, 48+8(4)	# in[3].h
	add	21, 21, 11

	ld	11, 48+8(12)	# hi - two124m92m68

	# acc[2] = in[2] + two124m92m68;
	ld	18, 32(4)	# in[2].l
	ld	19, 32+8(4)	# in[2].h
	add	19, 19, 11

	ld	11, 16+8(12)	# high - two124m116m68

	# acc[1] = in[1] + two124m116m68;
	ld	16, 16(4)	# in[1].l
	ld	17, 16+8(4)	# in[1].h
	add	17, 17, 11

	ld	11, 32+8(12)	# high - two124p108m76

	# acc[0] = in[0] + two124p108m76;
	ld	14, 0(4)	# in[0].l
	ld	15, 0+8(4)	# in[0].h
	add	15, 15, 11

	# compute mask
	li	7, -1

	# Eliminate in[12]

	# acc[8] += in[12] >> 32;
	ld	5, 192(4)	# in[12].l
	ld	6, 192+8(4)	# in[12].h
	SHR 9, 10, 6, 5, 32
	ld	30, 128(4)	# in[8].l
	ld	31, 136(4)	# in[8].h
	addc	30, 30, 10
	adde	31, 31, 9

	# acc[7] += (in[12] & 0xffffffff) << 24;
	srdi	11, 7, 32	# 0xffffffff
	and	11, 11, 5
	sldi	11, 11, 24	# << 24
	ld	28, 112(4)	# in[7].l
	ld	29, 120(4)	# in[7].h
	addc	28, 28, 11
	addze	29, 29

	# acc[7] += in[12] >> 8;
	SHR 9, 10, 6, 5, 8
	addc	28, 28, 10
	adde	29, 29, 9

	# acc[6] += (in[12] & 0xff) << 48;
	andi.	11, 5, 0xff
	sldi	11, 11, 48
	addc	26, 26, 11
	addze	27, 27

	# acc[6] -= in[12] >> 16;
	SHR 9, 10, 6, 5, 16
	subfc	26, 10, 26
	subfe	27, 9, 27

	# acc[5] -= (in[12] & 0xffff) << 40;
	srdi	11, 7, 48	# 0xffff
	and	11, 11, 5
	sldi	11, 11, 40	# << 40
	li	9, 0
	subfc	24, 11, 24
	subfe	25, 9, 25

	# acc[6] += in[12] >> 48;
	SHR 9, 10, 6, 5, 48
	addc	26, 26, 10
	adde	27, 27, 9

	# acc[5] += (in[12] & 0xffffffffffff) << 8;
	srdi	11, 7, 16	# 0xffffffffffff
	and	11, 11, 5
	sldi	11, 11, 8	# << 8
	addc	24, 24, 11
	addze	25, 25

	# Eliminate in[11]

	# acc[7] += in[11] >> 32;
	ld	5, 176(4)	# in[11].l
	ld	6, 176+8(4)	# in[11].h
	SHR 9, 10, 6, 5, 32
	addc	28, 28, 10
	adde	29, 29, 9

	# acc[6] += (in[11] & 0xffffffff) << 24;
	srdi	11, 7, 32	# 0xffffffff
	and	11, 11, 5
	sldi	11, 11, 24	# << 24
	addc	26, 26, 11
	addze	27, 27

	# acc[6] += in[11] >> 8;
	SHR 9, 10, 6, 5, 8
	addc	26, 26, 10
	adde	27, 27, 9

	# acc[5] += (in[11] & 0xff) << 48;
	andi.	11, 5, 0xff
	sldi	11, 11, 48
	addc	24, 24, 11
	addze	25, 25

	# acc[5] -= in[11] >> 16;
	SHR 9, 10, 6, 5, 16
	subfc	24, 10, 24
	subfe	25, 9, 25

	# acc[4] -= (in[11] & 0xffff) << 40;
	srdi	11, 7, 48	# 0xffff
	and	11, 11, 5
	sldi	11, 11, 40	# << 40
	li	9, 0
	subfc	22, 11, 22
	subfe	23, 9, 23

	# acc[5] += in[11] >> 48;
	SHR 9, 10, 6, 5, 48
	addc	24, 24, 10
	adde	25, 25, 9

	# acc[4] += (in[11] & 0xffffffffffff) << 8;
	srdi	11, 7, 16	# 0xffffffffffff
	and	11, 11, 5
	sldi	11, 11, 8	# << 8
	addc	22, 22, 11
	addze	23, 23

	# Eliminate in[10]

	# acc[6] += in[10] >> 32;
	ld	5, 160(4)	# in[10].l
	ld	6, 160+8(4)	# in[10].h
	SHR 9, 10, 6, 5, 32
	addc	26, 26, 10
	adde	27, 27, 9

	# acc[5] += (in[10] & 0xffffffff) << 24;
	srdi	11, 7, 32	# 0xffffffff
	and	11, 11, 5
	sldi	11, 11, 24	# << 24
	addc	24, 24, 11
	addze	25, 25

	# acc[5] += in[10] >> 8;
	SHR 9, 10, 6, 5, 8
	addc	24, 24, 10
	adde	25, 25, 9

	# acc[4] += (in[10] & 0xff) << 48;
	andi.	11, 5, 0xff
	sldi	11, 11, 48
	addc	22, 22, 11
	addze	23, 23

	# acc[4] -= in[10] >> 16;
	SHR 9, 10, 6, 5, 16
	subfc	22, 10, 22
	subfe	23, 9, 23

	# acc[3] -= (in[10] & 0xffff) << 40;
	srdi	11, 7, 48	# 0xffff
	and	11, 11, 5
	sldi	11, 11, 40	# << 40
	li	9, 0
	subfc	20, 11, 20
	subfe	21, 9, 21

	# acc[4] += in[10] >> 48;
	SHR 9, 10, 6, 5, 48
	addc	22, 22, 10
	adde	23, 23, 9

	# acc[3] += (in[10] & 0xffffffffffff) << 8;
	srdi	11, 7, 16	# 0xffffffffffff
	and	11, 11, 5
	sldi	11, 11, 8	# << 8
	addc	20, 20, 11
	addze	21, 21

	# Eliminate in[9]

	# acc[5] += in[9] >> 32;
	ld	5, 144(4)	# in[9].l
	ld	6, 144+8(4)	# in[9].h
	SHR 9, 10, 6, 5, 32
	addc	24, 24, 10
	adde	25, 25, 9

	# acc[4] += (in[9] & 0xffffffff) << 24;
	srdi	11, 7, 32	# 0xffffffff
	and	11, 11, 5
	sldi	11, 11, 24	# << 24
	addc	22, 22, 11
	addze	23, 23

	# acc[4] += in[9] >> 8;
	SHR 9, 10, 6, 5, 8
	addc	22, 22, 10
	adde	23, 23, 9

	# acc[3] += (in[9] & 0xff) << 48;
	andi.	11, 5, 0xff
	sldi	11, 11, 48
	addc	20, 20, 11
	addze	21, 21

	# acc[3] -= in[9] >> 16;
	SHR 9, 10, 6, 5, 16
	subfc	20, 10, 20
	subfe	21, 9, 21

	# acc[2] -= (in[9] & 0xffff) << 40;
	srdi	11, 7, 48	# 0xffff
	and	11, 11, 5
	sldi	11, 11, 40	# << 40
	li	9, 0
	subfc	18, 11, 18
	subfe	19, 9, 19

	# acc[3] += in[9] >> 48;
	SHR 9, 10, 6, 5, 48
	addc	20, 20, 10
	adde	21, 21, 9

	# acc[2] += (in[9] & 0xffffffffffff) << 8;
	srdi	11, 7, 16	# 0xffffffffffff
	and	11, 11, 5
	sldi	11, 11, 8	# << 8
	addc	18, 18, 11
	addze	19, 19

	# Eliminate acc[8]

	# acc[4] += acc[8] >> 32;
	mr	5, 30		# acc[8].l
	mr	6, 31		# acc[8].h
	SHR 9, 10, 6, 5, 32
	addc	22, 22, 10
	adde	23, 23, 9

	# acc[3] += (acc[8] & 0xffffffff) << 24;
	srdi	11, 7, 32	# 0xffffffff
	and	11, 11, 5
	sldi	11, 11, 24	# << 24
	addc	20, 20, 11
	addze	21, 21

	# acc[3] += acc[8] >> 8;
	SHR 9, 10, 6, 5, 8
	addc	20, 20, 10
	adde	21, 21, 9

	# acc[2] += (acc[8] & 0xff) << 48;
	andi.	11, 5, 0xff
	sldi	11, 11, 48
	addc	18, 18, 11
	addze	19, 19

	# acc[2] -= acc[8] >> 16;
	SHR 9, 10, 6, 5, 16
	subfc	18, 10, 18
	subfe	19, 9, 19

	# acc[1] -= (acc[8] & 0xffff) << 40;
	srdi	11, 7, 48	# 0xffff
	and	11, 11, 5
	sldi	11, 11, 40	# << 40
	li	9, 0
	subfc	16, 11, 16
	subfe	17, 9, 17

	# acc[2] += acc[8] >> 48;
	SHR 9, 10, 6, 5, 48
	addc	18, 18, 10
	adde	19, 19, 9

	# acc[1] += (acc[8] & 0xffffffffffff) << 8;
	srdi	11, 7, 16	# 0xffffffffffff
	and	11, 11, 5
	sldi	11, 11, 8	# << 8
	addc	16, 16, 11
	addze	17, 17

	# Eliminate acc[7]

	# acc[3] += acc[7] >> 32;
	mr	5, 28		# acc[7].l
	mr	6, 29		# acc[7].h
	SHR 9, 10, 6, 5, 32
	addc	20, 20, 10
	adde	21, 21, 9

	# acc[2] += (acc[7] & 0xffffffff) << 24;
	srdi	11, 7, 32	# 0xffffffff
	and	11, 11, 5
	sldi	11, 11, 24	# << 24
	addc	18, 18, 11
	addze	19, 19

	# acc[2] += acc[7] >> 8;
	SHR 9, 10, 6, 5, 8
	addc	18, 18, 10
	adde	19, 19, 9

	# acc[1] += (acc[7] & 0xff) << 48;
	andi.	11, 5, 0xff
	sldi	11, 11, 48
	addc	16, 16, 11
	addze	17, 17

	# acc[1] -= acc[7] >> 16;
	SHR 9, 10, 6, 5, 16
	subfc	16, 10, 16
	subfe	17, 9, 17

	# acc[0] -= (acc[7] & 0xffff) << 40;
	srdi	11, 7, 48	# 0xffff
	and	11, 11, 5
	sldi	11, 11, 40	# << 40
	li	9, 0
	subfc	14, 11, 14
	subfe	15, 9, 15

	# acc[1] += acc[7] >> 48;
	SHR 9, 10, 6, 5, 48
	addc	16, 16, 10
	adde	17, 17, 9

	# acc[0] += (acc[7] & 0xffffffffffff) << 8;
	srdi	11, 7, 16	# 0xffffffffffff
	and	11, 11, 5
	sldi	11, 11, 8	# << 8
	addc	14, 14, 11
	addze	15, 15

	#
	# Carry 4 -> 5 -> 6
	#
	# acc[5] += acc[4] >> 56;
	# acc[4] &= 0x00ffffffffffffff;
	SHR 9, 10, 23, 22, 56
	addc	24, 24, 10
	adde	25, 25, 9
	srdi	11, 7, 8	# 0x00ffffffffffffff
	and	22, 22, 11
	li	23, 0

	# acc[6] += acc[5] >> 56;
	# acc[5] &= 0x00ffffffffffffff;
	SHR 9, 10, 25, 24, 56
	addc	26, 26, 10
	adde	27, 27, 9
	and	24, 24, 11
	li	25, 0

	# [3]: Eliminate high bits of acc[6]
	# temp = acc[6] >> 48;
	# acc[6] &= 0x0000ffffffffffff;
	SHR 31, 30, 27, 26, 48	# temp = acc[6] >> 48
	srdi	11, 7, 16	# 0x0000ffffffffffff
	and	26, 26, 11
	li	27, 0

	# temp < 2^80
	# acc[3] += temp >> 40;
	SHR 9, 10, 31, 30, 40
	addc	20, 20, 10
	adde	21, 21, 9

	# acc[2] += (temp & 0xffffffffff) << 16;
	srdi	11, 7, 24	# 0xffffffffff
	and	10, 30, 11
	sldi	10, 10, 16
	addc	18, 18, 10
	addze	19, 19

	# acc[2] += temp >> 16;
	SHR 9, 10, 31, 30, 16
	addc	18, 18, 10
	adde	19, 19, 9

	# acc[1] += (temp & 0xffff) << 40;
	srdi	11, 7, 48	# 0xffff
	and	10, 30, 11
	sldi	10, 10, 40
	addc	16, 16, 10
	addze	17, 17

	# acc[1] -= temp >> 24;
	SHR 9, 10, 31, 30, 24
	subfc	16, 10, 16
	subfe	17, 9, 17

	# acc[0] -= (temp & 0xffffff) << 32;
	srdi	11, 7, 40	# 0xffffff
	and	10, 30, 11
	sldi	10, 10, 32
	li	9, 0
	subfc	14, 10, 14
	subfe	15, 9, 15

	# acc[0] += temp;
	addc	14, 14, 30
	adde	15, 15, 31

	# Carry 0 -> 1 -> 2 -> 3 -> 4 -> 5 -> 6
	#
	# acc[1] += acc[0] >> 56;   /* acc[1] < acc_old[1] + 2^72 */
	SHR 9, 10, 15, 14, 56
	addc	16, 16, 10
	adde	17, 17, 9

	# acc[0] &= 0x00ffffffffffffff;
	srdi	11, 7, 8	# 0x00ffffffffffffff
	and	14, 14, 11
	li	15, 0

	# acc[2] += acc[1] >> 56;   /* acc[2] < acc_old[2] + 2^72 + 2^16 */
	SHR 9, 10, 17, 16, 56
	addc	18, 18, 10
	adde	19, 19, 9

	# acc[1] &= 0x00ffffffffffffff;
	and	16, 16, 11
	li	17, 0

	# acc[3] += acc[2] >> 56;   /* acc[3] < acc_old[3] + 2^72 + 2^16 */
	SHR 9, 10, 19, 18, 56
	addc	20, 20, 10
	adde	21, 21, 9

	# acc[2] &= 0x00ffffffffffffff;
	and	18, 18, 11
	li	19, 0

	# acc[4] += acc[3] >> 56;
	SHR 9, 10, 21, 20, 56
	addc	22, 22, 10
	adde	23, 23, 9

	# acc[3] &= 0x00ffffffffffffff;
	and	20, 20, 11
	li	21, 0

	# acc[5] += acc[4] >> 56;
	SHR 9, 10, 23, 22, 56
	addc	24, 24, 10
	adde	25, 25, 9

	# acc[4] &= 0x00ffffffffffffff;
	and	22, 22, 11

	# acc[6] += acc[5] >> 56;
	SHR 9, 10, 25, 24, 56
	addc	26, 26, 10
	adde	27, 27, 9

	# acc[5] &= 0x00ffffffffffffff;
	and	24, 24, 11

	# store the lower doubleword of acc[0] .. acc[6] as the seven output limbs
	std	14, 0(3)
	std	16, 8(3)
	std	18, 16(3)
	std	20, 24(3)
	std	22, 32(3)
	std	24, 40(3)
	std	26, 48(3)
	blr
.size   _p384_felem_reduce_core,.-_p384_felem_reduce_core
1396
.data
.align 4
#
# Constants for _p384_felem_reduce_core.  Each entry is 16 bytes written as
# four 32-bit words; the routine loads only the doubleword at offset +8 of
# an entry (8, 24, 40 and 56 from the label) and adds it to the upper half
# of an accumulator.
# NOTE(review): the word order shown matches that use on a little-endian
# target - confirm before enabling on big-endian.
#
.LConst:
# two124m68:
.long 0x0, 0x0, 0xfffffff0, 0xfffffff
# two124m116m68:
.long 0x0, 0x0, 0xfffffff0, 0xfefffff
#two124p108m76:
.long 0x0, 0x0, 0xfffff000, 0x10000fff
#two124m92m68:
.long 0x0, 0x0, 0xeffffff0, 0xfffffff
1408
1409.text
1410
#
# void p384_felem_square_reduce(felem out, const felem in)
#
# Squares in (r4) with _p384_felem_square_core into a temporary on the
# stack, then reduces that temporary into out (r3) with
# _p384_felem_reduce_core.
#
# Stack frame (512 bytes):
#    56(r1) .. 192(r1)  saved r14-r31
#   208(r1) .. 415(r1)  thirteen 128-bit values written by the square core
#   496(r1)             caller's out pointer
#
# The symbol is exported with .globl, the same directive used for the
# other entry points in this file.
#
.globl p384_felem_square_reduce
.type   p384_felem_square_reduce,\@function
.align 4
p384_felem_square_reduce:
	stdu    1, -512(1)
	mflr	0		# return address stays in r0; neither core writes r0
	std     14, 56(1)
	std     15, 64(1)
	std     16, 72(1)
	std     17, 80(1)
	std     18, 88(1)
	std     19, 96(1)
	std     20, 104(1)
	std     21, 112(1)
	std     22, 120(1)
	std     23, 128(1)
	std     24, 136(1)
	std     25, 144(1)
	std     26, 152(1)
	std     27, 160(1)
	std     28, 168(1)
	std     29, 176(1)
	std     30, 184(1)
	std     31, 192(1)

	std	3, 496(1)	# remember out
	addi	3, 1, 208	# r3 = temporary wide result
	bl _p384_felem_square_core

	mr	4, 3		# reduce input = temporary wide result
	ld	3, 496(1)	# r3 = out
	bl _p384_felem_reduce_core

	ld     14, 56(1)
	ld     15, 64(1)
	ld     16, 72(1)
	ld     17, 80(1)
	ld     18, 88(1)
	ld     19, 96(1)
	ld     20, 104(1)
	ld     21, 112(1)
	ld     22, 120(1)
	ld     23, 128(1)
	ld     24, 136(1)
	ld     25, 144(1)
	ld     26, 152(1)
	ld     27, 160(1)
	ld     28, 168(1)
	ld     29, 176(1)
	ld     30, 184(1)
	ld     31, 192(1)
	addi	1, 1, 512
	mtlr	0		# restore the return address saved above
	blr
.size   p384_felem_square_reduce,.-p384_felem_square_reduce
1469
#
# void p384_felem_mul_reduce(felem out, const felem in1, const felem in2)
#
# Multiplies in1 (r4) by in2 (r5) with _p384_felem_mul_core into a
# temporary on the stack, then reduces that temporary into out (r3) with
# _p384_felem_reduce_core.
#
# Stack frame (512 bytes):
#    56(r1) .. 192(r1)  saved r14-r31
#   208(r1) .. 415(r1)  thirteen 128-bit values written by the mul core
#   496(r1)             caller's out pointer
#
# The symbol is exported with .globl, the same directive used for the
# other entry points in this file.
#
.globl p384_felem_mul_reduce
.type   p384_felem_mul_reduce,\@function
.align 5
p384_felem_mul_reduce:
	stdu    1, -512(1)
	mflr	0		# return address stays in r0; neither core writes r0
	std     14, 56(1)
	std     15, 64(1)
	std     16, 72(1)
	std     17, 80(1)
	std     18, 88(1)
	std     19, 96(1)
	std     20, 104(1)
	std     21, 112(1)
	std     22, 120(1)
	std     23, 128(1)
	std     24, 136(1)
	std     25, 144(1)
	std     26, 152(1)
	std     27, 160(1)
	std     28, 168(1)
	std     29, 176(1)
	std     30, 184(1)
	std     31, 192(1)

	std	3, 496(1)	# remember out
	addi	3, 1, 208	# r3 = temporary wide result
	bl _p384_felem_mul_core

	mr	4, 3		# reduce input = temporary wide result
	ld	3, 496(1)	# r3 = out
	bl _p384_felem_reduce_core

	ld     14, 56(1)
	ld     15, 64(1)
	ld     16, 72(1)
	ld     17, 80(1)
	ld     18, 88(1)
	ld     19, 96(1)
	ld     20, 104(1)
	ld     21, 112(1)
	ld     22, 120(1)
	ld     23, 128(1)
	ld     24, 136(1)
	ld     25, 144(1)
	ld     26, 152(1)
	ld     27, 160(1)
	ld     28, 168(1)
	ld     29, 176(1)
	ld     30, 184(1)
	ld     31, 192(1)
	addi	1, 1, 512
	mtlr	0		# restore the return address saved above
	blr
.size   p384_felem_mul_reduce,.-p384_felem_mul_reduce
1528___
1529
# Replace every backquoted span in the generated text with the result of
# evaluating it as Perl, then write the text to STDOUT (the translator
# pipe) and make sure the pipe closed cleanly.
$code =~ s{\`([^\`]*)\`}{eval $1}gem;
print STDOUT $code;
close(STDOUT) or die "error closing STDOUT: $!";
1533