xref: /freebsd/crypto/openssl/crypto/bn/asm/ppc.pl (revision b4af4f93c682e445bf159f0d1ec90b636296c946)
1#! /usr/bin/env perl
2# Copyright 2004-2020 The OpenSSL Project Authors. All Rights Reserved.
3#
4# Licensed under the OpenSSL license (the "License").  You may not use
5# this file except in compliance with the License.  You can obtain a copy
6# in the file LICENSE in the source distribution or at
7# https://www.openssl.org/source/license.html
8
9# Implemented as a Perl wrapper as we want to support several different
10# architectures with a single file. We pick up the target based on the
11# file name we are asked to generate.
12#
13# It should be noted though that this perl code is nothing like
14# <openssl>/crypto/perlasm/x86*. In this case perl is used pretty much
15# as pre-processor to cover for platform differences in name decoration,
16# linker tables, 32-/64-bit instruction sets...
17#
18# As you might know, there are several PowerPC ABIs in use. Most notably
19# Linux and AIX use different 32-bit ABIs. The good news is that these ABIs
20# are similar enough to implement leaf(!) functions, which would be ABI
21# neutral. And that's what you find here: ABI neutral leaf functions.
22# In case you wonder what that is...
23#
24#       AIX performance
25#
26#	MEASUREMENTS WITH cc ON a 200 MHz PowerPC 604e.
27#
28#	The following is the performance of 32-bit compiler
29#	generated code:
30#
31#	OpenSSL 0.9.6c 21 dec 2001
32#	built on: Tue Jun 11 11:06:51 EDT 2002
33#	options:bn(64,32) ...
34#compiler: cc -DTHREADS  -DAIX -DB_ENDIAN -DBN_LLONG -O3
35#                  sign    verify    sign/s verify/s
36#rsa  512 bits   0.0098s   0.0009s    102.0   1170.6
37#rsa 1024 bits   0.0507s   0.0026s     19.7    387.5
38#rsa 2048 bits   0.3036s   0.0085s      3.3    117.1
39#rsa 4096 bits   2.0040s   0.0299s      0.5     33.4
40#dsa  512 bits   0.0087s   0.0106s    114.3     94.5
41#dsa 1024 bits   0.0256s   0.0313s     39.0     32.0
42#
43#	Same benchmark with this assembler code:
44#
45#rsa  512 bits   0.0056s   0.0005s    178.6   2049.2
46#rsa 1024 bits   0.0283s   0.0015s     35.3    674.1
47#rsa 2048 bits   0.1744s   0.0050s      5.7    201.2
48#rsa 4096 bits   1.1644s   0.0179s      0.9     55.7
49#dsa  512 bits   0.0052s   0.0062s    191.6    162.0
50#dsa 1024 bits   0.0149s   0.0180s     67.0     55.5
51#
52#	Number of operations increases by almost 75%
53#
54#	Here are performance numbers for 64-bit compiler
55#	generated code:
56#
57#	OpenSSL 0.9.6g [engine] 9 Aug 2002
58#	built on: Fri Apr 18 16:59:20 EDT 2003
59#	options:bn(64,64) ...
60#	compiler: cc -DTHREADS -D_REENTRANT -q64 -DB_ENDIAN -O3
61#                  sign    verify    sign/s verify/s
62#rsa  512 bits   0.0028s   0.0003s    357.1   3844.4
63#rsa 1024 bits   0.0148s   0.0008s     67.5   1239.7
64#rsa 2048 bits   0.0963s   0.0028s     10.4    353.0
65#rsa 4096 bits   0.6538s   0.0102s      1.5     98.1
66#dsa  512 bits   0.0026s   0.0032s    382.5    313.7
67#dsa 1024 bits   0.0081s   0.0099s    122.8    100.6
68#
69#	Same benchmark with this assembler code:
70#
71#rsa  512 bits   0.0020s   0.0002s    510.4   6273.7
72#rsa 1024 bits   0.0088s   0.0005s    114.1   2128.3
73#rsa 2048 bits   0.0540s   0.0016s     18.5    622.5
74#rsa 4096 bits   0.3700s   0.0058s      2.7    171.0
75#dsa  512 bits   0.0016s   0.0020s    610.7    507.1
76#dsa 1024 bits   0.0047s   0.0058s    212.5    173.2
77#
78#	Again, performance increases by about 75%
79#
80#       Mac OS X, Apple G5 1.8GHz (Note this is 32 bit code)
81#       OpenSSL 0.9.7c 30 Sep 2003
82#
83#       Original code.
84#
85#rsa  512 bits   0.0011s   0.0001s    906.1  11012.5
86#rsa 1024 bits   0.0060s   0.0003s    166.6   3363.1
87#rsa 2048 bits   0.0370s   0.0010s     27.1    982.4
88#rsa 4096 bits   0.2426s   0.0036s      4.1    280.4
89#dsa  512 bits   0.0010s   0.0012s   1038.1    841.5
90#dsa 1024 bits   0.0030s   0.0037s    329.6    269.7
91#dsa 2048 bits   0.0101s   0.0127s     98.9     78.6
92#
93#       Same benchmark with this assembler code:
94#
95#rsa  512 bits   0.0007s   0.0001s   1416.2  16645.9
96#rsa 1024 bits   0.0036s   0.0002s    274.4   5380.6
97#rsa 2048 bits   0.0222s   0.0006s     45.1   1589.5
98#rsa 4096 bits   0.1469s   0.0022s      6.8    449.6
99#dsa  512 bits   0.0006s   0.0007s   1664.2   1376.2
100#dsa 1024 bits   0.0018s   0.0023s    545.0    442.2
101#dsa 2048 bits   0.0061s   0.0075s    163.5    132.8
102#
103#        Performance increase of ~60%
104#        Based on submission from Suresh N. Chari of IBM
105
106$flavour = shift;	# target flavour: first remaining command-line argument
107
108if ($flavour =~ /32/) {	# tested first: any flavour containing "32" selects the 32-bit mnemonics
109	$BITS=	32;		# word size in bits
110	$BNSZ=	$BITS/8;	# word size in bytes; element stride in the `N*$BNSZ` load/store offsets
111	$ISA=	"\"ppc\"";	# ISA name with literal double quotes (its use is not visible in this chunk)
112
113	$LD=	"lwz";		# load
114	$LDU=	"lwzu";		# load and update
115	$ST=	"stw";		# store
116	$STU=	"stwu";		# store and update
117	$UMULL=	"mullw";	# unsigned multiply low
118	$UMULH=	"mulhwu";	# unsigned multiply high
119	$UDIV=	"divwu";	# unsigned divide
120	$UCMPI=	"cmplwi";	# unsigned compare with immediate
121	$UCMP=	"cmplw";	# unsigned compare
122	$CNTLZ=	"cntlzw";	# count leading zeros
123	$SHL=	"slw";		# shift left
124	$SHR=	"srw";		# unsigned shift right
125	$SHRI=	"srwi";		# unsigned shift right by immediate
126	$SHLI=	"slwi";		# shift left by immediate
127	$CLRU=	"clrlwi";	# clear upper bits
128	$INSR=	"insrwi";	# insert right
129	$ROTL=	"rotlwi";	# rotate left by immediate
130	$TR=	"tw";		# conditional trap
131} elsif ($flavour =~ /64/) {	# reached only when "32" did not match
132	$BITS=	64;		# word size in bits
133	$BNSZ=	$BITS/8;	# word size in bytes; element stride in the `N*$BNSZ` load/store offsets
134	$ISA=	"\"ppc64\"";	# ISA name with literal double quotes (its use is not visible in this chunk)
135
136	# same variables as in the 32-bit branch, but set to the 64-bit mnemonics...
137	$LD=	"ld";		# load
138	$LDU=	"ldu";		# load and update
139	$ST=	"std";		# store
140	$STU=	"stdu";		# store and update
141	$UMULL=	"mulld";	# unsigned multiply low
142	$UMULH=	"mulhdu";	# unsigned multiply high
143	$UDIV=	"divdu";	# unsigned divide
144	$UCMPI=	"cmpldi";	# unsigned compare with immediate
145	$UCMP=	"cmpld";	# unsigned compare
146	$CNTLZ=	"cntlzd";	# count leading zeros
147	$SHL=	"sld";		# shift left
148	$SHR=	"srd";		# unsigned shift right
149	$SHRI=	"srdi";		# unsigned shift right by immediate
150	$SHLI=	"sldi";		# shift left by immediate
151	$CLRU=	"clrldi";	# clear upper bits
152	$INSR=	"insrdi";	# insert right
153	$ROTL=	"rotldi";	# rotate left by immediate
154	$TR=	"td";		# conditional trap
155} else { die "nonsense $flavour"; }	# flavour contains neither "32" nor "64": abort
156
157$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;	# $dir = this script's path up to and including the last / or \ ; $1 is not set here if $0 has no separator
158( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or	# first choice: ppc-xlate.pl next to this script
159( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or	# fallback: ../../perlasm/ relative to this script
160die "can't locate ppc-xlate.pl";	# neither candidate is a plain file
161# Pipe everything printed to STDOUT through ppc-xlate.pl; the next command-line argument is appended to its command line.
162open STDOUT,"| $^X $xlate $flavour ".shift or die "can't call $xlate: $!";	# 'or' binds to open(); '||' would bind to the always-true command string and never die
163
164$data=<<EOF;
165#--------------------------------------------------------------------
166#
167#
168#
169#
170#	File:		ppc32.s
171#
172#	Created by:	Suresh Chari
173#			IBM Thomas J. Watson Research Library
174#			Hawthorne, NY
175#
176#
177#	Description:	Optimized assembly routines for OpenSSL crypto
178#			on the 32 bitPowerPC platform.
179#
180#
181#	Version History
182#
183#	2. Fixed bn_add,bn_sub and bn_div_words, added comments,
184#	   cleaned up code. Also made a single version which can
185#	   be used for both the AIX and Linux compilers. See NOTE
186#	   below.
187#				12/05/03		Suresh Chari
188#			(with lots of help from)        Andy Polyakov
189##
190#	1. Initial version	10/20/02		Suresh Chari
191#
192#
193#	The following file works for the xlc,cc
194#	and gcc compilers.
195#
196#	NOTE:	To get the file to link correctly with the gcc compiler
197#	        you have to change the names of the routines and remove
198#		the first .(dot) character. This should automatically
199#		be done in the build process.
200#
201#	Hand optimized assembly code for the following routines
202#
203#	bn_sqr_comba4
204#	bn_sqr_comba8
205#	bn_mul_comba4
206#	bn_mul_comba8
207#	bn_sub_words
208#	bn_add_words
209#	bn_div_words
210#	bn_sqr_words
211#	bn_mul_words
212#	bn_mul_add_words
213#
214#	NOTE:	It is possible to optimize this code more for
215#	specific PowerPC or Power architectures. On the Northstar
216#	architecture the optimizations in this file do
217#	 NOT provide much improvement.
218#
219#	If you have comments or suggestions to improve code send
220#	me a note at schari\@us.ibm.com
221#
222#--------------------------------------------------------------------------
223#
224#	Defines to be used in the assembly code.
225#
226#.set r0,0	# we use it as storage for value of 0
227#.set SP,1	# preserved
228#.set RTOC,2	# preserved
229#.set r3,3	# 1st argument/return value
230#.set r4,4	# 2nd argument/volatile register
231#.set r5,5	# 3rd argument/volatile register
232#.set r6,6	# ...
233#.set r7,7
234#.set r8,8
235#.set r9,9
236#.set r10,10
237#.set r11,11
238#.set r12,12
239#.set r13,13	# not used, nor any other "below" it...
240
241#	Declare function names to be global
242#	NOTE:	For gcc these names MUST be changed to remove
243#	        the first . i.e. for example change ".bn_sqr_comba4"
244#		to "bn_sqr_comba4". This should be automatically done
245#		in the build.
246
247	.globl	.bn_sqr_comba4
248	.globl	.bn_sqr_comba8
249	.globl	.bn_mul_comba4
250	.globl	.bn_mul_comba8
251	.globl	.bn_sub_words
252	.globl	.bn_add_words
253	.globl	.bn_div_words
254	.globl	.bn_sqr_words
255	.globl	.bn_mul_words
256	.globl	.bn_mul_add_words
257
258# .text section
259
260	.machine	"any"
261	.text
262
263#
264#	NOTE:	The following label name should be changed to
265#		"bn_sqr_comba4" i.e. remove the first dot
266#		for the gcc compiler. This should be automatically
267#		done in the build
268#
269
270.align	4
271.bn_sqr_comba4:
272#
273# Optimized version of bn_sqr_comba4.
274#
275# void bn_sqr_comba4(BN_ULONG *r, BN_ULONG *a)
276# r3 contains r
277# r4 contains a
278#
279# Freely use registers r5,r6,r7,r8,r9,r10,r11 as follows:
280#
281# r5,r6 are the two BN_ULONGs being multiplied.
282# r7,r8 are the results of the 32x32 giving 64 bit multiply.
283# r9,r10, r11 are the equivalents of c1,c2, c3.
284# Here's the assembly
285#
286#
287	xor		r0,r0,r0		# set r0 = 0. Used in the addze
288						# instructions below
289
290						#sqr_add_c(a,0,c1,c2,c3)
291	$LD		r5,`0*$BNSZ`(r4)
292	$UMULL		r9,r5,r5
293	$UMULH		r10,r5,r5		#in first iteration. No need
294						#to add since c1=c2=c3=0.
295						# Note c3(r11) is NOT set to 0
296						# but will be.
297
298	$ST		r9,`0*$BNSZ`(r3)	# r[0]=c1;
299						# sqr_add_c2(a,1,0,c2,c3,c1);
300	$LD		r6,`1*$BNSZ`(r4)
301	$UMULL		r7,r5,r6
302	$UMULH		r8,r5,r6
303
304	addc		r7,r7,r7		# compute (r7,r8)=2*(r7,r8)
305	adde		r8,r8,r8
306	addze		r9,r0			# catch carry if any.
307						# r9= r0(=0) and carry
308
309	addc		r10,r7,r10		# now add to temp result.
310	addze		r11,r8                  # r8 added to r11 which is 0
311	addze		r9,r9
312
313	$ST		r10,`1*$BNSZ`(r3)	#r[1]=c2;
314						#sqr_add_c(a,1,c3,c1,c2)
315	$UMULL		r7,r6,r6
316	$UMULH		r8,r6,r6
317	addc		r11,r7,r11
318	adde		r9,r8,r9
319	addze		r10,r0
320						#sqr_add_c2(a,2,0,c3,c1,c2)
321	$LD		r6,`2*$BNSZ`(r4)
322	$UMULL		r7,r5,r6
323	$UMULH		r8,r5,r6
324
325	addc		r7,r7,r7
326	adde		r8,r8,r8
327	addze		r10,r10
328
329	addc		r11,r7,r11
330	adde		r9,r8,r9
331	addze		r10,r10
332	$ST		r11,`2*$BNSZ`(r3)	#r[2]=c3
333						#sqr_add_c2(a,3,0,c1,c2,c3);
334	$LD		r6,`3*$BNSZ`(r4)
335	$UMULL		r7,r5,r6
336	$UMULH		r8,r5,r6
337	addc		r7,r7,r7
338	adde		r8,r8,r8
339	addze		r11,r0
340
341	addc		r9,r7,r9
342	adde		r10,r8,r10
343	addze		r11,r11
344						#sqr_add_c2(a,2,1,c1,c2,c3);
345	$LD		r5,`1*$BNSZ`(r4)
346	$LD		r6,`2*$BNSZ`(r4)
347	$UMULL		r7,r5,r6
348	$UMULH		r8,r5,r6
349
350	addc		r7,r7,r7
351	adde		r8,r8,r8
352	addze		r11,r11
353	addc		r9,r7,r9
354	adde		r10,r8,r10
355	addze		r11,r11
356	$ST		r9,`3*$BNSZ`(r3)	#r[3]=c1
357						#sqr_add_c(a,2,c2,c3,c1);
358	$UMULL		r7,r6,r6
359	$UMULH		r8,r6,r6
360	addc		r10,r7,r10
361	adde		r11,r8,r11
362	addze		r9,r0
363						#sqr_add_c2(a,3,1,c2,c3,c1);
364	$LD		r6,`3*$BNSZ`(r4)
365	$UMULL		r7,r5,r6
366	$UMULH		r8,r5,r6
367	addc		r7,r7,r7
368	adde		r8,r8,r8
369	addze		r9,r9
370
371	addc		r10,r7,r10
372	adde		r11,r8,r11
373	addze		r9,r9
374	$ST		r10,`4*$BNSZ`(r3)	#r[4]=c2
375						#sqr_add_c2(a,3,2,c3,c1,c2);
376	$LD		r5,`2*$BNSZ`(r4)
377	$UMULL		r7,r5,r6
378	$UMULH		r8,r5,r6
379	addc		r7,r7,r7
380	adde		r8,r8,r8
381	addze		r10,r0
382
383	addc		r11,r7,r11
384	adde		r9,r8,r9
385	addze		r10,r10
386	$ST		r11,`5*$BNSZ`(r3)	#r[5] = c3
387						#sqr_add_c(a,3,c1,c2,c3);
388	$UMULL		r7,r6,r6
389	$UMULH		r8,r6,r6
390	addc		r9,r7,r9
391	adde		r10,r8,r10
392
393	$ST		r9,`6*$BNSZ`(r3)	#r[6]=c1
394	$ST		r10,`7*$BNSZ`(r3)	#r[7]=c2
395	blr
396	.long	0
397	.byte	0,12,0x14,0,0,0,2,0
398	.long	0
399.size	.bn_sqr_comba4,.-.bn_sqr_comba4
400
401#
402#	NOTE:	The following label name should be changed to
403#		"bn_sqr_comba8" i.e. remove the first dot
404#		for the gcc compiler. This should be automatically
405#		done in the build
406#
407
408.align	4
409.bn_sqr_comba8:
410#
411# This is an optimized version of the bn_sqr_comba8 routine.
412# Tightly uses the adde instruction
413#
414#
415# void bn_sqr_comba8(BN_ULONG *r, BN_ULONG *a)
416# r3 contains r
417# r4 contains a
418#
419# Freely use registers r5,r6,r7,r8,r9,r10,r11 as follows:
420#
421# r5,r6 are the two BN_ULONGs being multiplied.
422# r7,r8 are the results of the 32x32 giving 64 bit multiply.
423# r9,r10, r11 are the equivalents of c1,c2, c3.
424#
425# Possible optimization of loading all 8 longs of a into registers
426# doesn't provide any speedup
427#
428
429	xor		r0,r0,r0		#set r0 = 0.Used in addze
430						#instructions below.
431
432						#sqr_add_c(a,0,c1,c2,c3);
433	$LD		r5,`0*$BNSZ`(r4)
434	$UMULL		r9,r5,r5		#1st iteration:	no carries.
435	$UMULH		r10,r5,r5
436	$ST		r9,`0*$BNSZ`(r3)	# r[0]=c1;
437						#sqr_add_c2(a,1,0,c2,c3,c1);
438	$LD		r6,`1*$BNSZ`(r4)
439	$UMULL		r7,r5,r6
440	$UMULH		r8,r5,r6
441
442	addc		r10,r7,r10		#add the two register number
443	adde		r11,r8,r0 		# (r8,r7) to the three register
444	addze		r9,r0			# number (r9,r11,r10).NOTE:r0=0
445
446	addc		r10,r7,r10		#add the two register number
447	adde		r11,r8,r11 		# (r8,r7) to the three register
448	addze		r9,r9			# number (r9,r11,r10).
449
450	$ST		r10,`1*$BNSZ`(r3)	# r[1]=c2
451
452						#sqr_add_c(a,1,c3,c1,c2);
453	$UMULL		r7,r6,r6
454	$UMULH		r8,r6,r6
455	addc		r11,r7,r11
456	adde		r9,r8,r9
457	addze		r10,r0
458						#sqr_add_c2(a,2,0,c3,c1,c2);
459	$LD		r6,`2*$BNSZ`(r4)
460	$UMULL		r7,r5,r6
461	$UMULH		r8,r5,r6
462
463	addc		r11,r7,r11
464	adde		r9,r8,r9
465	addze		r10,r10
466
467	addc		r11,r7,r11
468	adde		r9,r8,r9
469	addze		r10,r10
470
471	$ST		r11,`2*$BNSZ`(r3)	#r[2]=c3
472						#sqr_add_c2(a,3,0,c1,c2,c3);
473	$LD		r6,`3*$BNSZ`(r4)	#r6 = a[3]. r5 is already a[0].
474	$UMULL		r7,r5,r6
475	$UMULH		r8,r5,r6
476
477	addc		r9,r7,r9
478	adde		r10,r8,r10
479	addze		r11,r0
480
481	addc		r9,r7,r9
482	adde		r10,r8,r10
483	addze		r11,r11
484						#sqr_add_c2(a,2,1,c1,c2,c3);
485	$LD		r5,`1*$BNSZ`(r4)
486	$LD		r6,`2*$BNSZ`(r4)
487	$UMULL		r7,r5,r6
488	$UMULH		r8,r5,r6
489
490	addc		r9,r7,r9
491	adde		r10,r8,r10
492	addze		r11,r11
493
494	addc		r9,r7,r9
495	adde		r10,r8,r10
496	addze		r11,r11
497
498	$ST		r9,`3*$BNSZ`(r3)	#r[3]=c1;
499						#sqr_add_c(a,2,c2,c3,c1);
500	$UMULL		r7,r6,r6
501	$UMULH		r8,r6,r6
502
503	addc		r10,r7,r10
504	adde		r11,r8,r11
505	addze		r9,r0
506						#sqr_add_c2(a,3,1,c2,c3,c1);
507	$LD		r6,`3*$BNSZ`(r4)
508	$UMULL		r7,r5,r6
509	$UMULH		r8,r5,r6
510
511	addc		r10,r7,r10
512	adde		r11,r8,r11
513	addze		r9,r9
514
515	addc		r10,r7,r10
516	adde		r11,r8,r11
517	addze		r9,r9
518						#sqr_add_c2(a,4,0,c2,c3,c1);
519	$LD		r5,`0*$BNSZ`(r4)
520	$LD		r6,`4*$BNSZ`(r4)
521	$UMULL		r7,r5,r6
522	$UMULH		r8,r5,r6
523
524	addc		r10,r7,r10
525	adde		r11,r8,r11
526	addze		r9,r9
527
528	addc		r10,r7,r10
529	adde		r11,r8,r11
530	addze		r9,r9
531	$ST		r10,`4*$BNSZ`(r3)	#r[4]=c2;
532						#sqr_add_c2(a,5,0,c3,c1,c2);
533	$LD		r6,`5*$BNSZ`(r4)
534	$UMULL		r7,r5,r6
535	$UMULH		r8,r5,r6
536
537	addc		r11,r7,r11
538	adde		r9,r8,r9
539	addze		r10,r0
540
541	addc		r11,r7,r11
542	adde		r9,r8,r9
543	addze		r10,r10
544						#sqr_add_c2(a,4,1,c3,c1,c2);
545	$LD		r5,`1*$BNSZ`(r4)
546	$LD		r6,`4*$BNSZ`(r4)
547	$UMULL		r7,r5,r6
548	$UMULH		r8,r5,r6
549
550	addc		r11,r7,r11
551	adde		r9,r8,r9
552	addze		r10,r10
553
554	addc		r11,r7,r11
555	adde		r9,r8,r9
556	addze		r10,r10
557						#sqr_add_c2(a,3,2,c3,c1,c2);
558	$LD		r5,`2*$BNSZ`(r4)
559	$LD		r6,`3*$BNSZ`(r4)
560	$UMULL		r7,r5,r6
561	$UMULH		r8,r5,r6
562
563	addc		r11,r7,r11
564	adde		r9,r8,r9
565	addze		r10,r10
566
567	addc		r11,r7,r11
568	adde		r9,r8,r9
569	addze		r10,r10
570	$ST		r11,`5*$BNSZ`(r3)	#r[5]=c3;
571						#sqr_add_c(a,3,c1,c2,c3);
572	$UMULL		r7,r6,r6
573	$UMULH		r8,r6,r6
574	addc		r9,r7,r9
575	adde		r10,r8,r10
576	addze		r11,r0
577						#sqr_add_c2(a,4,2,c1,c2,c3);
578	$LD		r6,`4*$BNSZ`(r4)
579	$UMULL		r7,r5,r6
580	$UMULH		r8,r5,r6
581
582	addc		r9,r7,r9
583	adde		r10,r8,r10
584	addze		r11,r11
585
586	addc		r9,r7,r9
587	adde		r10,r8,r10
588	addze		r11,r11
589						#sqr_add_c2(a,5,1,c1,c2,c3);
590	$LD		r5,`1*$BNSZ`(r4)
591	$LD		r6,`5*$BNSZ`(r4)
592	$UMULL		r7,r5,r6
593	$UMULH		r8,r5,r6
594
595	addc		r9,r7,r9
596	adde		r10,r8,r10
597	addze		r11,r11
598
599	addc		r9,r7,r9
600	adde		r10,r8,r10
601	addze		r11,r11
602						#sqr_add_c2(a,6,0,c1,c2,c3);
603	$LD		r5,`0*$BNSZ`(r4)
604	$LD		r6,`6*$BNSZ`(r4)
605	$UMULL		r7,r5,r6
606	$UMULH		r8,r5,r6
607	addc		r9,r7,r9
608	adde		r10,r8,r10
609	addze		r11,r11
610	addc		r9,r7,r9
611	adde		r10,r8,r10
612	addze		r11,r11
613	$ST		r9,`6*$BNSZ`(r3)	#r[6]=c1;
614						#sqr_add_c2(a,7,0,c2,c3,c1);
615	$LD		r6,`7*$BNSZ`(r4)
616	$UMULL		r7,r5,r6
617	$UMULH		r8,r5,r6
618
619	addc		r10,r7,r10
620	adde		r11,r8,r11
621	addze		r9,r0
622	addc		r10,r7,r10
623	adde		r11,r8,r11
624	addze		r9,r9
625						#sqr_add_c2(a,6,1,c2,c3,c1);
626	$LD		r5,`1*$BNSZ`(r4)
627	$LD		r6,`6*$BNSZ`(r4)
628	$UMULL		r7,r5,r6
629	$UMULH		r8,r5,r6
630
631	addc		r10,r7,r10
632	adde		r11,r8,r11
633	addze		r9,r9
634	addc		r10,r7,r10
635	adde		r11,r8,r11
636	addze		r9,r9
637						#sqr_add_c2(a,5,2,c2,c3,c1);
638	$LD		r5,`2*$BNSZ`(r4)
639	$LD		r6,`5*$BNSZ`(r4)
640	$UMULL		r7,r5,r6
641	$UMULH		r8,r5,r6
642	addc		r10,r7,r10
643	adde		r11,r8,r11
644	addze		r9,r9
645	addc		r10,r7,r10
646	adde		r11,r8,r11
647	addze		r9,r9
648						#sqr_add_c2(a,4,3,c2,c3,c1);
649	$LD		r5,`3*$BNSZ`(r4)
650	$LD		r6,`4*$BNSZ`(r4)
651	$UMULL		r7,r5,r6
652	$UMULH		r8,r5,r6
653
654	addc		r10,r7,r10
655	adde		r11,r8,r11
656	addze		r9,r9
657	addc		r10,r7,r10
658	adde		r11,r8,r11
659	addze		r9,r9
660	$ST		r10,`7*$BNSZ`(r3)	#r[7]=c2;
661						#sqr_add_c(a,4,c3,c1,c2);
662	$UMULL		r7,r6,r6
663	$UMULH		r8,r6,r6
664	addc		r11,r7,r11
665	adde		r9,r8,r9
666	addze		r10,r0
667						#sqr_add_c2(a,5,3,c3,c1,c2);
668	$LD		r6,`5*$BNSZ`(r4)
669	$UMULL		r7,r5,r6
670	$UMULH		r8,r5,r6
671	addc		r11,r7,r11
672	adde		r9,r8,r9
673	addze		r10,r10
674	addc		r11,r7,r11
675	adde		r9,r8,r9
676	addze		r10,r10
677						#sqr_add_c2(a,6,2,c3,c1,c2);
678	$LD		r5,`2*$BNSZ`(r4)
679	$LD		r6,`6*$BNSZ`(r4)
680	$UMULL		r7,r5,r6
681	$UMULH		r8,r5,r6
682	addc		r11,r7,r11
683	adde		r9,r8,r9
684	addze		r10,r10
685
686	addc		r11,r7,r11
687	adde		r9,r8,r9
688	addze		r10,r10
689						#sqr_add_c2(a,7,1,c3,c1,c2);
690	$LD		r5,`1*$BNSZ`(r4)
691	$LD		r6,`7*$BNSZ`(r4)
692	$UMULL		r7,r5,r6
693	$UMULH		r8,r5,r6
694	addc		r11,r7,r11
695	adde		r9,r8,r9
696	addze		r10,r10
697	addc		r11,r7,r11
698	adde		r9,r8,r9
699	addze		r10,r10
700	$ST		r11,`8*$BNSZ`(r3)	#r[8]=c3;
701						#sqr_add_c2(a,7,2,c1,c2,c3);
702	$LD		r5,`2*$BNSZ`(r4)
703	$UMULL		r7,r5,r6
704	$UMULH		r8,r5,r6
705
706	addc		r9,r7,r9
707	adde		r10,r8,r10
708	addze		r11,r0
709	addc		r9,r7,r9
710	adde		r10,r8,r10
711	addze		r11,r11
712						#sqr_add_c2(a,6,3,c1,c2,c3);
713	$LD		r5,`3*$BNSZ`(r4)
714	$LD		r6,`6*$BNSZ`(r4)
715	$UMULL		r7,r5,r6
716	$UMULH		r8,r5,r6
717	addc		r9,r7,r9
718	adde		r10,r8,r10
719	addze		r11,r11
720	addc		r9,r7,r9
721	adde		r10,r8,r10
722	addze		r11,r11
723						#sqr_add_c2(a,5,4,c1,c2,c3);
724	$LD		r5,`4*$BNSZ`(r4)
725	$LD		r6,`5*$BNSZ`(r4)
726	$UMULL		r7,r5,r6
727	$UMULH		r8,r5,r6
728	addc		r9,r7,r9
729	adde		r10,r8,r10
730	addze		r11,r11
731	addc		r9,r7,r9
732	adde		r10,r8,r10
733	addze		r11,r11
734	$ST		r9,`9*$BNSZ`(r3)	#r[9]=c1;
735						#sqr_add_c(a,5,c2,c3,c1);
736	$UMULL		r7,r6,r6
737	$UMULH		r8,r6,r6
738	addc		r10,r7,r10
739	adde		r11,r8,r11
740	addze		r9,r0
741						#sqr_add_c2(a,6,4,c2,c3,c1);
742	$LD		r6,`6*$BNSZ`(r4)
743	$UMULL		r7,r5,r6
744	$UMULH		r8,r5,r6
745	addc		r10,r7,r10
746	adde		r11,r8,r11
747	addze		r9,r9
748	addc		r10,r7,r10
749	adde		r11,r8,r11
750	addze		r9,r9
751						#sqr_add_c2(a,7,3,c2,c3,c1);
752	$LD		r5,`3*$BNSZ`(r4)
753	$LD		r6,`7*$BNSZ`(r4)
754	$UMULL		r7,r5,r6
755	$UMULH		r8,r5,r6
756	addc		r10,r7,r10
757	adde		r11,r8,r11
758	addze		r9,r9
759	addc		r10,r7,r10
760	adde		r11,r8,r11
761	addze		r9,r9
762	$ST		r10,`10*$BNSZ`(r3)	#r[10]=c2;
763						#sqr_add_c2(a,7,4,c3,c1,c2);
764	$LD		r5,`4*$BNSZ`(r4)
765	$UMULL		r7,r5,r6
766	$UMULH		r8,r5,r6
767	addc		r11,r7,r11
768	adde		r9,r8,r9
769	addze		r10,r0
770	addc		r11,r7,r11
771	adde		r9,r8,r9
772	addze		r10,r10
773						#sqr_add_c2(a,6,5,c3,c1,c2);
774	$LD		r5,`5*$BNSZ`(r4)
775	$LD		r6,`6*$BNSZ`(r4)
776	$UMULL		r7,r5,r6
777	$UMULH		r8,r5,r6
778	addc		r11,r7,r11
779	adde		r9,r8,r9
780	addze		r10,r10
781	addc		r11,r7,r11
782	adde		r9,r8,r9
783	addze		r10,r10
784	$ST		r11,`11*$BNSZ`(r3)	#r[11]=c3;
785						#sqr_add_c(a,6,c1,c2,c3);
786	$UMULL		r7,r6,r6
787	$UMULH		r8,r6,r6
788	addc		r9,r7,r9
789	adde		r10,r8,r10
790	addze		r11,r0
791						#sqr_add_c2(a,7,5,c1,c2,c3)
792	$LD		r6,`7*$BNSZ`(r4)
793	$UMULL		r7,r5,r6
794	$UMULH		r8,r5,r6
795	addc		r9,r7,r9
796	adde		r10,r8,r10
797	addze		r11,r11
798	addc		r9,r7,r9
799	adde		r10,r8,r10
800	addze		r11,r11
801	$ST		r9,`12*$BNSZ`(r3)	#r[12]=c1;
802
803						#sqr_add_c2(a,7,6,c2,c3,c1)
804	$LD		r5,`6*$BNSZ`(r4)
805	$UMULL		r7,r5,r6
806	$UMULH		r8,r5,r6
807	addc		r10,r7,r10
808	adde		r11,r8,r11
809	addze		r9,r0
810	addc		r10,r7,r10
811	adde		r11,r8,r11
812	addze		r9,r9
813	$ST		r10,`13*$BNSZ`(r3)	#r[13]=c2;
814						#sqr_add_c(a,7,c3,c1,c2);
815	$UMULL		r7,r6,r6
816	$UMULH		r8,r6,r6
817	addc		r11,r7,r11
818	adde		r9,r8,r9
819	$ST		r11,`14*$BNSZ`(r3)	#r[14]=c3;
820	$ST		r9, `15*$BNSZ`(r3)	#r[15]=c1;
821
822
823	blr
824	.long	0
825	.byte	0,12,0x14,0,0,0,2,0
826	.long	0
827.size	.bn_sqr_comba8,.-.bn_sqr_comba8
828
829#
830#	NOTE:	The following label name should be changed to
831#		"bn_mul_comba4" i.e. remove the first dot
832#		for the gcc compiler. This should be automatically
833#		done in the build
834#
835
836.align	4
837.bn_mul_comba4:
838#
839# This is an optimized version of the bn_mul_comba4 routine.
840#
841# void bn_mul_comba4(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
842# r3 contains r
843# r4 contains a
844# r5 contains b
845# r6, r7 are the 2 BN_ULONGs being multiplied.
846# r8, r9 are the results of the 32x32 giving 64 multiply.
847# r10, r11, r12 are the equivalents of c1, c2, and c3.
848#
849	xor	r0,r0,r0		#r0=0. Used in addze below.
850					#mul_add_c(a[0],b[0],c1,c2,c3);
851	$LD	r6,`0*$BNSZ`(r4)
852	$LD	r7,`0*$BNSZ`(r5)
853	$UMULL	r10,r6,r7
854	$UMULH	r11,r6,r7
855	$ST	r10,`0*$BNSZ`(r3)	#r[0]=c1
856					#mul_add_c(a[0],b[1],c2,c3,c1);
857	$LD	r7,`1*$BNSZ`(r5)
858	$UMULL	r8,r6,r7
859	$UMULH	r9,r6,r7
860	addc	r11,r8,r11
861	adde	r12,r9,r0
862	addze	r10,r0
863					#mul_add_c(a[1],b[0],c2,c3,c1);
864	$LD	r6, `1*$BNSZ`(r4)
865	$LD	r7, `0*$BNSZ`(r5)
866	$UMULL	r8,r6,r7
867	$UMULH	r9,r6,r7
868	addc	r11,r8,r11
869	adde	r12,r9,r12
870	addze	r10,r10
871	$ST	r11,`1*$BNSZ`(r3)	#r[1]=c2
872					#mul_add_c(a[2],b[0],c3,c1,c2);
873	$LD	r6,`2*$BNSZ`(r4)
874	$UMULL	r8,r6,r7
875	$UMULH	r9,r6,r7
876	addc	r12,r8,r12
877	adde	r10,r9,r10
878	addze	r11,r0
879					#mul_add_c(a[1],b[1],c3,c1,c2);
880	$LD	r6,`1*$BNSZ`(r4)
881	$LD	r7,`1*$BNSZ`(r5)
882	$UMULL	r8,r6,r7
883	$UMULH	r9,r6,r7
884	addc	r12,r8,r12
885	adde	r10,r9,r10
886	addze	r11,r11
887					#mul_add_c(a[0],b[2],c3,c1,c2);
888	$LD	r6,`0*$BNSZ`(r4)
889	$LD	r7,`2*$BNSZ`(r5)
890	$UMULL	r8,r6,r7
891	$UMULH	r9,r6,r7
892	addc	r12,r8,r12
893	adde	r10,r9,r10
894	addze	r11,r11
895	$ST	r12,`2*$BNSZ`(r3)	#r[2]=c3
896					#mul_add_c(a[0],b[3],c1,c2,c3);
897	$LD	r7,`3*$BNSZ`(r5)
898	$UMULL	r8,r6,r7
899	$UMULH	r9,r6,r7
900	addc	r10,r8,r10
901	adde	r11,r9,r11
902	addze	r12,r0
903					#mul_add_c(a[1],b[2],c1,c2,c3);
904	$LD	r6,`1*$BNSZ`(r4)
905	$LD	r7,`2*$BNSZ`(r5)
906	$UMULL	r8,r6,r7
907	$UMULH	r9,r6,r7
908	addc	r10,r8,r10
909	adde	r11,r9,r11
910	addze	r12,r12
911					#mul_add_c(a[2],b[1],c1,c2,c3);
912	$LD	r6,`2*$BNSZ`(r4)
913	$LD	r7,`1*$BNSZ`(r5)
914	$UMULL	r8,r6,r7
915	$UMULH	r9,r6,r7
916	addc	r10,r8,r10
917	adde	r11,r9,r11
918	addze	r12,r12
919					#mul_add_c(a[3],b[0],c1,c2,c3);
920	$LD	r6,`3*$BNSZ`(r4)
921	$LD	r7,`0*$BNSZ`(r5)
922	$UMULL	r8,r6,r7
923	$UMULH	r9,r6,r7
924	addc	r10,r8,r10
925	adde	r11,r9,r11
926	addze	r12,r12
927	$ST	r10,`3*$BNSZ`(r3)	#r[3]=c1
928					#mul_add_c(a[3],b[1],c2,c3,c1);
929	$LD	r7,`1*$BNSZ`(r5)
930	$UMULL	r8,r6,r7
931	$UMULH	r9,r6,r7
932	addc	r11,r8,r11
933	adde	r12,r9,r12
934	addze	r10,r0
935					#mul_add_c(a[2],b[2],c2,c3,c1);
936	$LD	r6,`2*$BNSZ`(r4)
937	$LD	r7,`2*$BNSZ`(r5)
938	$UMULL	r8,r6,r7
939	$UMULH	r9,r6,r7
940	addc	r11,r8,r11
941	adde	r12,r9,r12
942	addze	r10,r10
943					#mul_add_c(a[1],b[3],c2,c3,c1);
944	$LD	r6,`1*$BNSZ`(r4)
945	$LD	r7,`3*$BNSZ`(r5)
946	$UMULL	r8,r6,r7
947	$UMULH	r9,r6,r7
948	addc	r11,r8,r11
949	adde	r12,r9,r12
950	addze	r10,r10
951	$ST	r11,`4*$BNSZ`(r3)	#r[4]=c2
952					#mul_add_c(a[2],b[3],c3,c1,c2);
953	$LD	r6,`2*$BNSZ`(r4)
954	$UMULL	r8,r6,r7
955	$UMULH	r9,r6,r7
956	addc	r12,r8,r12
957	adde	r10,r9,r10
958	addze	r11,r0
959					#mul_add_c(a[3],b[2],c3,c1,c2);
960	$LD	r6,`3*$BNSZ`(r4)
961	$LD	r7,`2*$BNSZ`(r5)
962	$UMULL	r8,r6,r7
963	$UMULH	r9,r6,r7
964	addc	r12,r8,r12
965	adde	r10,r9,r10
966	addze	r11,r11
967	$ST	r12,`5*$BNSZ`(r3)	#r[5]=c3
968					#mul_add_c(a[3],b[3],c1,c2,c3);
969	$LD	r7,`3*$BNSZ`(r5)
970	$UMULL	r8,r6,r7
971	$UMULH	r9,r6,r7
972	addc	r10,r8,r10
973	adde	r11,r9,r11
974
975	$ST	r10,`6*$BNSZ`(r3)	#r[6]=c1
976	$ST	r11,`7*$BNSZ`(r3)	#r[7]=c2
977	blr
978	.long	0
979	.byte	0,12,0x14,0,0,0,3,0
980	.long	0
981.size	.bn_mul_comba4,.-.bn_mul_comba4
982
983#
984#	NOTE:	The following label name should be changed to
985#		"bn_mul_comba8" i.e. remove the first dot
986#		for the gcc compiler. This should be automatically
987#		done in the build
988#
989
990.align	4
991.bn_mul_comba8:
992#
993# Optimized version of the bn_mul_comba8 routine.
994#
995# void bn_mul_comba8(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
996# r3 contains r
997# r4 contains a
998# r5 contains b
999# r6, r7 are the 2 BN_ULONGs being multiplied.
1000# r8, r9 are the results of the 32x32 giving 64 multiply.
1001# r10, r11, r12 are the equivalents of c1, c2, and c3.
1002#
1003	xor	r0,r0,r0		#r0=0. Used in addze below.
1004
1005					#mul_add_c(a[0],b[0],c1,c2,c3);
1006	$LD	r6,`0*$BNSZ`(r4)	#a[0]
1007	$LD	r7,`0*$BNSZ`(r5)	#b[0]
1008	$UMULL	r10,r6,r7
1009	$UMULH	r11,r6,r7
1010	$ST	r10,`0*$BNSZ`(r3)	#r[0]=c1;
1011					#mul_add_c(a[0],b[1],c2,c3,c1);
1012	$LD	r7,`1*$BNSZ`(r5)
1013	$UMULL	r8,r6,r7
1014	$UMULH	r9,r6,r7
1015	addc	r11,r11,r8
1016	addze	r12,r9			# since we didn't set r12 to zero before.
1017	addze	r10,r0
1018					#mul_add_c(a[1],b[0],c2,c3,c1);
1019	$LD	r6,`1*$BNSZ`(r4)
1020	$LD	r7,`0*$BNSZ`(r5)
1021	$UMULL	r8,r6,r7
1022	$UMULH	r9,r6,r7
1023	addc	r11,r11,r8
1024	adde	r12,r12,r9
1025	addze	r10,r10
1026	$ST	r11,`1*$BNSZ`(r3)	#r[1]=c2;
1027					#mul_add_c(a[2],b[0],c3,c1,c2);
1028	$LD	r6,`2*$BNSZ`(r4)
1029	$UMULL	r8,r6,r7
1030	$UMULH	r9,r6,r7
1031	addc	r12,r12,r8
1032	adde	r10,r10,r9
1033	addze	r11,r0
1034					#mul_add_c(a[1],b[1],c3,c1,c2);
1035	$LD	r6,`1*$BNSZ`(r4)
1036	$LD	r7,`1*$BNSZ`(r5)
1037	$UMULL	r8,r6,r7
1038	$UMULH	r9,r6,r7
1039	addc	r12,r12,r8
1040	adde	r10,r10,r9
1041	addze	r11,r11
1042					#mul_add_c(a[0],b[2],c3,c1,c2);
1043	$LD	r6,`0*$BNSZ`(r4)
1044	$LD	r7,`2*$BNSZ`(r5)
1045	$UMULL	r8,r6,r7
1046	$UMULH	r9,r6,r7
1047	addc	r12,r12,r8
1048	adde	r10,r10,r9
1049	addze	r11,r11
1050	$ST	r12,`2*$BNSZ`(r3)	#r[2]=c3;
1051					#mul_add_c(a[0],b[3],c1,c2,c3);
1052	$LD	r7,`3*$BNSZ`(r5)
1053	$UMULL	r8,r6,r7
1054	$UMULH	r9,r6,r7
1055	addc	r10,r10,r8
1056	adde	r11,r11,r9
1057	addze	r12,r0
1058					#mul_add_c(a[1],b[2],c1,c2,c3);
1059	$LD	r6,`1*$BNSZ`(r4)
1060	$LD	r7,`2*$BNSZ`(r5)
1061	$UMULL	r8,r6,r7
1062	$UMULH	r9,r6,r7
1063	addc	r10,r10,r8
1064	adde	r11,r11,r9
1065	addze	r12,r12
1066
1067					#mul_add_c(a[2],b[1],c1,c2,c3);
1068	$LD	r6,`2*$BNSZ`(r4)
1069	$LD	r7,`1*$BNSZ`(r5)
1070	$UMULL	r8,r6,r7
1071	$UMULH	r9,r6,r7
1072	addc	r10,r10,r8
1073	adde	r11,r11,r9
1074	addze	r12,r12
1075					#mul_add_c(a[3],b[0],c1,c2,c3);
1076	$LD	r6,`3*$BNSZ`(r4)
1077	$LD	r7,`0*$BNSZ`(r5)
1078	$UMULL	r8,r6,r7
1079	$UMULH	r9,r6,r7
1080	addc	r10,r10,r8
1081	adde	r11,r11,r9
1082	addze	r12,r12
1083	$ST	r10,`3*$BNSZ`(r3)	#r[3]=c1;
1084					#mul_add_c(a[4],b[0],c2,c3,c1);
1085	$LD	r6,`4*$BNSZ`(r4)
1086	$UMULL	r8,r6,r7
1087	$UMULH	r9,r6,r7
1088	addc	r11,r11,r8
1089	adde	r12,r12,r9
1090	addze	r10,r0
1091					#mul_add_c(a[3],b[1],c2,c3,c1);
1092	$LD	r6,`3*$BNSZ`(r4)
1093	$LD	r7,`1*$BNSZ`(r5)
1094	$UMULL	r8,r6,r7
1095	$UMULH	r9,r6,r7
1096	addc	r11,r11,r8
1097	adde	r12,r12,r9
1098	addze	r10,r10
1099					#mul_add_c(a[2],b[2],c2,c3,c1);
1100	$LD	r6,`2*$BNSZ`(r4)
1101	$LD	r7,`2*$BNSZ`(r5)
1102	$UMULL	r8,r6,r7
1103	$UMULH	r9,r6,r7
1104	addc	r11,r11,r8
1105	adde	r12,r12,r9
1106	addze	r10,r10
1107					#mul_add_c(a[1],b[3],c2,c3,c1);
1108	$LD	r6,`1*$BNSZ`(r4)
1109	$LD	r7,`3*$BNSZ`(r5)
1110	$UMULL	r8,r6,r7
1111	$UMULH	r9,r6,r7
1112	addc	r11,r11,r8
1113	adde	r12,r12,r9
1114	addze	r10,r10
1115					#mul_add_c(a[0],b[4],c2,c3,c1);
1116	$LD	r6,`0*$BNSZ`(r4)
1117	$LD	r7,`4*$BNSZ`(r5)
1118	$UMULL	r8,r6,r7
1119	$UMULH	r9,r6,r7
1120	addc	r11,r11,r8
1121	adde	r12,r12,r9
1122	addze	r10,r10
1123	$ST	r11,`4*$BNSZ`(r3)	#r[4]=c2;
1124					#mul_add_c(a[0],b[5],c3,c1,c2);
1125	$LD	r7,`5*$BNSZ`(r5)
1126	$UMULL	r8,r6,r7
1127	$UMULH	r9,r6,r7
1128	addc	r12,r12,r8
1129	adde	r10,r10,r9
1130	addze	r11,r0
1131					#mul_add_c(a[1],b[4],c3,c1,c2);
1132	$LD	r6,`1*$BNSZ`(r4)
1133	$LD	r7,`4*$BNSZ`(r5)
1134	$UMULL	r8,r6,r7
1135	$UMULH	r9,r6,r7
1136	addc	r12,r12,r8
1137	adde	r10,r10,r9
1138	addze	r11,r11
1139					#mul_add_c(a[2],b[3],c3,c1,c2);
1140	$LD	r6,`2*$BNSZ`(r4)
1141	$LD	r7,`3*$BNSZ`(r5)
1142	$UMULL	r8,r6,r7
1143	$UMULH	r9,r6,r7
1144	addc	r12,r12,r8
1145	adde	r10,r10,r9
1146	addze	r11,r11
1147					#mul_add_c(a[3],b[2],c3,c1,c2);
1148	$LD	r6,`3*$BNSZ`(r4)
1149	$LD	r7,`2*$BNSZ`(r5)
1150	$UMULL	r8,r6,r7
1151	$UMULH	r9,r6,r7
1152	addc	r12,r12,r8
1153	adde	r10,r10,r9
1154	addze	r11,r11
1155					#mul_add_c(a[4],b[1],c3,c1,c2);
1156	$LD	r6,`4*$BNSZ`(r4)
1157	$LD	r7,`1*$BNSZ`(r5)
1158	$UMULL	r8,r6,r7
1159	$UMULH	r9,r6,r7
1160	addc	r12,r12,r8
1161	adde	r10,r10,r9
1162	addze	r11,r11
1163					#mul_add_c(a[5],b[0],c3,c1,c2);
1164	$LD	r6,`5*$BNSZ`(r4)
1165	$LD	r7,`0*$BNSZ`(r5)
1166	$UMULL	r8,r6,r7
1167	$UMULH	r9,r6,r7
1168	addc	r12,r12,r8
1169	adde	r10,r10,r9
1170	addze	r11,r11
1171	$ST	r12,`5*$BNSZ`(r3)	#r[5]=c3;
1172					#mul_add_c(a[6],b[0],c1,c2,c3);
1173	$LD	r6,`6*$BNSZ`(r4)
1174	$UMULL	r8,r6,r7
1175	$UMULH	r9,r6,r7
1176	addc	r10,r10,r8
1177	adde	r11,r11,r9
1178	addze	r12,r0
1179					#mul_add_c(a[5],b[1],c1,c2,c3);
1180	$LD	r6,`5*$BNSZ`(r4)
1181	$LD	r7,`1*$BNSZ`(r5)
1182	$UMULL	r8,r6,r7
1183	$UMULH	r9,r6,r7
1184	addc	r10,r10,r8
1185	adde	r11,r11,r9
1186	addze	r12,r12
1187					#mul_add_c(a[4],b[2],c1,c2,c3);
1188	$LD	r6,`4*$BNSZ`(r4)
1189	$LD	r7,`2*$BNSZ`(r5)
1190	$UMULL	r8,r6,r7
1191	$UMULH	r9,r6,r7
1192	addc	r10,r10,r8
1193	adde	r11,r11,r9
1194	addze	r12,r12
1195					#mul_add_c(a[3],b[3],c1,c2,c3);
1196	$LD	r6,`3*$BNSZ`(r4)
1197	$LD	r7,`3*$BNSZ`(r5)
1198	$UMULL	r8,r6,r7
1199	$UMULH	r9,r6,r7
1200	addc	r10,r10,r8
1201	adde	r11,r11,r9
1202	addze	r12,r12
1203					#mul_add_c(a[2],b[4],c1,c2,c3);
1204	$LD	r6,`2*$BNSZ`(r4)
1205	$LD	r7,`4*$BNSZ`(r5)
1206	$UMULL	r8,r6,r7
1207	$UMULH	r9,r6,r7
1208	addc	r10,r10,r8
1209	adde	r11,r11,r9
1210	addze	r12,r12
1211					#mul_add_c(a[1],b[5],c1,c2,c3);
1212	$LD	r6,`1*$BNSZ`(r4)
1213	$LD	r7,`5*$BNSZ`(r5)
1214	$UMULL	r8,r6,r7
1215	$UMULH	r9,r6,r7
1216	addc	r10,r10,r8
1217	adde	r11,r11,r9
1218	addze	r12,r12
1219					#mul_add_c(a[0],b[6],c1,c2,c3);
1220	$LD	r6,`0*$BNSZ`(r4)
1221	$LD	r7,`6*$BNSZ`(r5)
1222	$UMULL	r8,r6,r7
1223	$UMULH	r9,r6,r7
1224	addc	r10,r10,r8
1225	adde	r11,r11,r9
1226	addze	r12,r12
1227	$ST	r10,`6*$BNSZ`(r3)	#r[6]=c1;
1228					#mul_add_c(a[0],b[7],c2,c3,c1);
1229	$LD	r7,`7*$BNSZ`(r5)
1230	$UMULL	r8,r6,r7
1231	$UMULH	r9,r6,r7
1232	addc	r11,r11,r8
1233	adde	r12,r12,r9
1234	addze	r10,r0
1235					#mul_add_c(a[1],b[6],c2,c3,c1);
1236	$LD	r6,`1*$BNSZ`(r4)
1237	$LD	r7,`6*$BNSZ`(r5)
1238	$UMULL	r8,r6,r7
1239	$UMULH	r9,r6,r7
1240	addc	r11,r11,r8
1241	adde	r12,r12,r9
1242	addze	r10,r10
1243					#mul_add_c(a[2],b[5],c2,c3,c1);
1244	$LD	r6,`2*$BNSZ`(r4)
1245	$LD	r7,`5*$BNSZ`(r5)
1246	$UMULL	r8,r6,r7
1247	$UMULH	r9,r6,r7
1248	addc	r11,r11,r8
1249	adde	r12,r12,r9
1250	addze	r10,r10
1251					#mul_add_c(a[3],b[4],c2,c3,c1);
1252	$LD	r6,`3*$BNSZ`(r4)
1253	$LD	r7,`4*$BNSZ`(r5)
1254	$UMULL	r8,r6,r7
1255	$UMULH	r9,r6,r7
1256	addc	r11,r11,r8
1257	adde	r12,r12,r9
1258	addze	r10,r10
1259					#mul_add_c(a[4],b[3],c2,c3,c1);
1260	$LD	r6,`4*$BNSZ`(r4)
1261	$LD	r7,`3*$BNSZ`(r5)
1262	$UMULL	r8,r6,r7
1263	$UMULH	r9,r6,r7
1264	addc	r11,r11,r8
1265	adde	r12,r12,r9
1266	addze	r10,r10
1267					#mul_add_c(a[5],b[2],c2,c3,c1);
1268	$LD	r6,`5*$BNSZ`(r4)
1269	$LD	r7,`2*$BNSZ`(r5)
1270	$UMULL	r8,r6,r7
1271	$UMULH	r9,r6,r7
1272	addc	r11,r11,r8
1273	adde	r12,r12,r9
1274	addze	r10,r10
1275					#mul_add_c(a[6],b[1],c2,c3,c1);
1276	$LD	r6,`6*$BNSZ`(r4)
1277	$LD	r7,`1*$BNSZ`(r5)
1278	$UMULL	r8,r6,r7
1279	$UMULH	r9,r6,r7
1280	addc	r11,r11,r8
1281	adde	r12,r12,r9
1282	addze	r10,r10
1283					#mul_add_c(a[7],b[0],c2,c3,c1);
1284	$LD	r6,`7*$BNSZ`(r4)
1285	$LD	r7,`0*$BNSZ`(r5)
1286	$UMULL	r8,r6,r7
1287	$UMULH	r9,r6,r7
1288	addc	r11,r11,r8
1289	adde	r12,r12,r9
1290	addze	r10,r10
1291	$ST	r11,`7*$BNSZ`(r3)	#r[7]=c2;
1292					#mul_add_c(a[7],b[1],c3,c1,c2);
1293	$LD	r7,`1*$BNSZ`(r5)
1294	$UMULL	r8,r6,r7
1295	$UMULH	r9,r6,r7
1296	addc	r12,r12,r8
1297	adde	r10,r10,r9
1298	addze	r11,r0
1299					#mul_add_c(a[6],b[2],c3,c1,c2);
1300	$LD	r6,`6*$BNSZ`(r4)
1301	$LD	r7,`2*$BNSZ`(r5)
1302	$UMULL	r8,r6,r7
1303	$UMULH	r9,r6,r7
1304	addc	r12,r12,r8
1305	adde	r10,r10,r9
1306	addze	r11,r11
1307					#mul_add_c(a[5],b[3],c3,c1,c2);
1308	$LD	r6,`5*$BNSZ`(r4)
1309	$LD	r7,`3*$BNSZ`(r5)
1310	$UMULL	r8,r6,r7
1311	$UMULH	r9,r6,r7
1312	addc	r12,r12,r8
1313	adde	r10,r10,r9
1314	addze	r11,r11
1315					#mul_add_c(a[4],b[4],c3,c1,c2);
1316	$LD	r6,`4*$BNSZ`(r4)
1317	$LD	r7,`4*$BNSZ`(r5)
1318	$UMULL	r8,r6,r7
1319	$UMULH	r9,r6,r7
1320	addc	r12,r12,r8
1321	adde	r10,r10,r9
1322	addze	r11,r11
1323					#mul_add_c(a[3],b[5],c3,c1,c2);
1324	$LD	r6,`3*$BNSZ`(r4)
1325	$LD	r7,`5*$BNSZ`(r5)
1326	$UMULL	r8,r6,r7
1327	$UMULH	r9,r6,r7
1328	addc	r12,r12,r8
1329	adde	r10,r10,r9
1330	addze	r11,r11
1331					#mul_add_c(a[2],b[6],c3,c1,c2);
1332	$LD	r6,`2*$BNSZ`(r4)
1333	$LD	r7,`6*$BNSZ`(r5)
1334	$UMULL	r8,r6,r7
1335	$UMULH	r9,r6,r7
1336	addc	r12,r12,r8
1337	adde	r10,r10,r9
1338	addze	r11,r11
1339					#mul_add_c(a[1],b[7],c3,c1,c2);
1340	$LD	r6,`1*$BNSZ`(r4)
1341	$LD	r7,`7*$BNSZ`(r5)
1342	$UMULL	r8,r6,r7
1343	$UMULH	r9,r6,r7
1344	addc	r12,r12,r8
1345	adde	r10,r10,r9
1346	addze	r11,r11
1347	$ST	r12,`8*$BNSZ`(r3)	#r[8]=c3;
1348					#mul_add_c(a[2],b[7],c1,c2,c3);
1349	$LD	r6,`2*$BNSZ`(r4)
1350	$UMULL	r8,r6,r7
1351	$UMULH	r9,r6,r7
1352	addc	r10,r10,r8
1353	adde	r11,r11,r9
1354	addze	r12,r0
1355					#mul_add_c(a[3],b[6],c1,c2,c3);
1356	$LD	r6,`3*$BNSZ`(r4)
1357	$LD	r7,`6*$BNSZ`(r5)
1358	$UMULL	r8,r6,r7
1359	$UMULH	r9,r6,r7
1360	addc	r10,r10,r8
1361	adde	r11,r11,r9
1362	addze	r12,r12
1363					#mul_add_c(a[4],b[5],c1,c2,c3);
1364	$LD	r6,`4*$BNSZ`(r4)
1365	$LD	r7,`5*$BNSZ`(r5)
1366	$UMULL	r8,r6,r7
1367	$UMULH	r9,r6,r7
1368	addc	r10,r10,r8
1369	adde	r11,r11,r9
1370	addze	r12,r12
1371					#mul_add_c(a[5],b[4],c1,c2,c3);
1372	$LD	r6,`5*$BNSZ`(r4)
1373	$LD	r7,`4*$BNSZ`(r5)
1374	$UMULL	r8,r6,r7
1375	$UMULH	r9,r6,r7
1376	addc	r10,r10,r8
1377	adde	r11,r11,r9
1378	addze	r12,r12
1379					#mul_add_c(a[6],b[3],c1,c2,c3);
1380	$LD	r6,`6*$BNSZ`(r4)
1381	$LD	r7,`3*$BNSZ`(r5)
1382	$UMULL	r8,r6,r7
1383	$UMULH	r9,r6,r7
1384	addc	r10,r10,r8
1385	adde	r11,r11,r9
1386	addze	r12,r12
1387					#mul_add_c(a[7],b[2],c1,c2,c3);
1388	$LD	r6,`7*$BNSZ`(r4)
1389	$LD	r7,`2*$BNSZ`(r5)
1390	$UMULL	r8,r6,r7
1391	$UMULH	r9,r6,r7
1392	addc	r10,r10,r8
1393	adde	r11,r11,r9
1394	addze	r12,r12
1395	$ST	r10,`9*$BNSZ`(r3)	#r[9]=c1;
1396					#mul_add_c(a[7],b[3],c2,c3,c1);
1397	$LD	r7,`3*$BNSZ`(r5)
1398	$UMULL	r8,r6,r7
1399	$UMULH	r9,r6,r7
1400	addc	r11,r11,r8
1401	adde	r12,r12,r9
1402	addze	r10,r0
1403					#mul_add_c(a[6],b[4],c2,c3,c1);
1404	$LD	r6,`6*$BNSZ`(r4)
1405	$LD	r7,`4*$BNSZ`(r5)
1406	$UMULL	r8,r6,r7
1407	$UMULH	r9,r6,r7
1408	addc	r11,r11,r8
1409	adde	r12,r12,r9
1410	addze	r10,r10
1411					#mul_add_c(a[5],b[5],c2,c3,c1);
1412	$LD	r6,`5*$BNSZ`(r4)
1413	$LD	r7,`5*$BNSZ`(r5)
1414	$UMULL	r8,r6,r7
1415	$UMULH	r9,r6,r7
1416	addc	r11,r11,r8
1417	adde	r12,r12,r9
1418	addze	r10,r10
1419					#mul_add_c(a[4],b[6],c2,c3,c1);
1420	$LD	r6,`4*$BNSZ`(r4)
1421	$LD	r7,`6*$BNSZ`(r5)
1422	$UMULL	r8,r6,r7
1423	$UMULH	r9,r6,r7
1424	addc	r11,r11,r8
1425	adde	r12,r12,r9
1426	addze	r10,r10
1427					#mul_add_c(a[3],b[7],c2,c3,c1);
1428	$LD	r6,`3*$BNSZ`(r4)
1429	$LD	r7,`7*$BNSZ`(r5)
1430	$UMULL	r8,r6,r7
1431	$UMULH	r9,r6,r7
1432	addc	r11,r11,r8
1433	adde	r12,r12,r9
1434	addze	r10,r10
1435	$ST	r11,`10*$BNSZ`(r3)	#r[10]=c2;
1436					#mul_add_c(a[4],b[7],c3,c1,c2);
1437	$LD	r6,`4*$BNSZ`(r4)
1438	$UMULL	r8,r6,r7
1439	$UMULH	r9,r6,r7
1440	addc	r12,r12,r8
1441	adde	r10,r10,r9
1442	addze	r11,r0
1443					#mul_add_c(a[5],b[6],c3,c1,c2);
1444	$LD	r6,`5*$BNSZ`(r4)
1445	$LD	r7,`6*$BNSZ`(r5)
1446	$UMULL	r8,r6,r7
1447	$UMULH	r9,r6,r7
1448	addc	r12,r12,r8
1449	adde	r10,r10,r9
1450	addze	r11,r11
1451					#mul_add_c(a[6],b[5],c3,c1,c2);
1452	$LD	r6,`6*$BNSZ`(r4)
1453	$LD	r7,`5*$BNSZ`(r5)
1454	$UMULL	r8,r6,r7
1455	$UMULH	r9,r6,r7
1456	addc	r12,r12,r8
1457	adde	r10,r10,r9
1458	addze	r11,r11
1459					#mul_add_c(a[7],b[4],c3,c1,c2);
1460	$LD	r6,`7*$BNSZ`(r4)
1461	$LD	r7,`4*$BNSZ`(r5)
1462	$UMULL	r8,r6,r7
1463	$UMULH	r9,r6,r7
1464	addc	r12,r12,r8
1465	adde	r10,r10,r9
1466	addze	r11,r11
1467	$ST	r12,`11*$BNSZ`(r3)	#r[11]=c3;
1468					#mul_add_c(a[7],b[5],c1,c2,c3);
1469	$LD	r7,`5*$BNSZ`(r5)
1470	$UMULL	r8,r6,r7
1471	$UMULH	r9,r6,r7
1472	addc	r10,r10,r8
1473	adde	r11,r11,r9
1474	addze	r12,r0
1475					#mul_add_c(a[6],b[6],c1,c2,c3);
1476	$LD	r6,`6*$BNSZ`(r4)
1477	$LD	r7,`6*$BNSZ`(r5)
1478	$UMULL	r8,r6,r7
1479	$UMULH	r9,r6,r7
1480	addc	r10,r10,r8
1481	adde	r11,r11,r9
1482	addze	r12,r12
1483					#mul_add_c(a[5],b[7],c1,c2,c3);
1484	$LD	r6,`5*$BNSZ`(r4)
1485	$LD	r7,`7*$BNSZ`(r5)
1486	$UMULL	r8,r6,r7
1487	$UMULH	r9,r6,r7
1488	addc	r10,r10,r8
1489	adde	r11,r11,r9
1490	addze	r12,r12
1491	$ST	r10,`12*$BNSZ`(r3)	#r[12]=c1;
1492					#mul_add_c(a[6],b[7],c2,c3,c1);
1493	$LD	r6,`6*$BNSZ`(r4)
1494	$UMULL	r8,r6,r7
1495	$UMULH	r9,r6,r7
1496	addc	r11,r11,r8
1497	adde	r12,r12,r9
1498	addze	r10,r0
1499					#mul_add_c(a[7],b[6],c2,c3,c1);
1500	$LD	r6,`7*$BNSZ`(r4)
1501	$LD	r7,`6*$BNSZ`(r5)
1502	$UMULL	r8,r6,r7
1503	$UMULH	r9,r6,r7
1504	addc	r11,r11,r8
1505	adde	r12,r12,r9
1506	addze	r10,r10
1507	$ST	r11,`13*$BNSZ`(r3)	#r[13]=c2;
1508					#mul_add_c(a[7],b[7],c3,c1,c2);
1509	$LD	r7,`7*$BNSZ`(r5)
1510	$UMULL	r8,r6,r7
1511	$UMULH	r9,r6,r7
1512	addc	r12,r12,r8
1513	adde	r10,r10,r9
1514	$ST	r12,`14*$BNSZ`(r3)	#r[14]=c3;
1515	$ST	r10,`15*$BNSZ`(r3)	#r[15]=c1;
1516	blr
1517	.long	0
1518	.byte	0,12,0x14,0,0,0,3,0
1519	.long	0
1520.size	.bn_mul_comba8,.-.bn_mul_comba8
1521
1522#
1523#	NOTE:	The following label name should be changed to
1524#		"bn_sub_words" i.e. remove the first dot
1525#		for the gcc compiler. This should be automatically
1526#		done in the build
1527#
1528#
1529.align	4
1530.bn_sub_words:
1531#
1532#	Handcoded version of bn_sub_words
1533#
1534#BN_ULONG bn_sub_words(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b, int n)
1535#
#	Computes r[i] = a[i] - b[i] - borrow for i = 0..n-1, one word
#	per loop iteration, and returns the final borrow in r3:
#	1 if the last subtraction borrowed, 0 otherwise (also 0 for n == 0).
#	The borrow travels between iterations in the carry bit (CA),
#	where CA = 1 means "no borrow".
#
1536#	r3 = r
1537#	r4 = a
1538#	r5 = b
1539#	r6 = n
1540#
1541#       Note:	No loop unrolling done since this is not a performance
1542#               critical loop.
1543
1544	xor	r0,r0,r0	#set r0 = 0
1545#
1546#	check for r6 = 0 AND set carry bit.
1547#
1548	subfc.	r7,r0,r6        # r7 = r6 - 0. If r6 is 0 then result is 0.
1549				# if r6 > 0 then result !=0
1550				# In either case carry bit is set.
1551	beq	Lppcasm_sub_adios	# n == 0: skip the loop, result is 0
#	Bias all three pointers one word down: the load/store macros in
#	the loop are given an offset of one word, so the first pass then
#	reaches element 0.
#	NOTE(review): the macros are defined outside this section; that
#	they are update forms (pointer advances on each access) is
#	inferred from this bias - confirm.
1552	addi	r4,r4,-$BNSZ
1553	addi	r3,r3,-$BNSZ
1554	addi	r5,r5,-$BNSZ
1555	mtctr	r6			# CTR = n, consumed by bdnz below
1556Lppcasm_sub_mainloop:
1557	$LDU	r7,$BNSZ(r4)		# r7 = next word of a
1558	$LDU	r8,$BNSZ(r5)		# r8 = next word of b
1559	subfe	r6,r8,r7	# r6 = r7+carry bit + onescomplement(r8)
1560				# if carry = 1 this is r7-r8. Else it
1561				# is r7-r8 -1 as we need.
				# (r6 is free for reuse: n already sits in CTR)
1562	$STU	r6,$BNSZ(r3)		# store next word of r
1563	bdnz	Lppcasm_sub_mainloop	# decrement CTR, loop while CTR != 0
1564Lppcasm_sub_adios:
1565	subfze	r3,r0		# if carry bit is set then r3 = 0 else -1
1566	andi.	r3,r3,1         # keep only last bit: return value is 0 or 1
1567	blr
#	NOTE(review): the three data items below look like a traceback
#	table; the 4 presumably mirrors the argument count - confirm.
1568	.long	0
1569	.byte	0,12,0x14,0,0,0,4,0
1570	.long	0
1571.size	.bn_sub_words,.-.bn_sub_words
1572
1573#
1574#	NOTE:	The following label name should be changed to
1575#		"bn_add_words" i.e. remove the first dot
1576#		for the gcc compiler. This should be automatically
1577#		done in the build
1578#
1579
1580.align	4
1581.bn_add_words:
1582#
1583#	Handcoded version of bn_add_words
1584#
1585#BN_ULONG bn_add_words(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b, int n)
1586#
#	Computes r[i] = a[i] + b[i] + carry for i = 0..n-1, one word per
#	loop iteration, and returns the final carry (0 or 1) in r3.
#	The carry travels between iterations in the carry bit (CA).
#
1587#	r3 = r
1588#	r4 = a
1589#	r5 = b
1590#	r6 = n
1591#
1592#       Note:	No loop unrolling done since this is not a performance
1593#               critical loop.
1594
1595	xor	r0,r0,r0
1596#
1597#	check for r6 = 0. Is this needed?
#	It is: bdnz decrements CTR before testing it, so entering the
#	loop with CTR = 0 would not terminate after zero iterations.
#	The same instruction also clears the carry bit for the first adde.
1598#
1599	addic.	r6,r6,0		#test r6 and clear carry bit.
1600	beq	Lppcasm_add_adios	# n == 0: skip the loop, return 0
#	Bias all three pointers one word down: the load/store macros in
#	the loop are given an offset of one word, so the first pass then
#	reaches element 0.
#	NOTE(review): the macros are defined outside this section; that
#	they are update forms (pointer advances on each access) is
#	inferred from this bias - confirm.
1601	addi	r4,r4,-$BNSZ
1602	addi	r3,r3,-$BNSZ
1603	addi	r5,r5,-$BNSZ
1604	mtctr	r6			# CTR = n, consumed by bdnz below
1605Lppcasm_add_mainloop:
1606	$LDU	r7,$BNSZ(r4)		# r7 = next word of a
1607	$LDU	r8,$BNSZ(r5)		# r8 = next word of b
1608	adde	r8,r7,r8		# r8 = r7 + r8 + carry bit
1609	$STU	r8,$BNSZ(r3)		# store next word of r
1610	bdnz	Lppcasm_add_mainloop	# decrement CTR, loop while CTR != 0
1611Lppcasm_add_adios:
1612	addze	r3,r0			#return carry bit.
1613	blr
#	NOTE(review): the three data items below look like a traceback
#	table; the 4 presumably mirrors the argument count - confirm.
1614	.long	0
1615	.byte	0,12,0x14,0,0,0,4,0
1616	.long	0
1617.size	.bn_add_words,.-.bn_add_words
1618
1619#
1620#	NOTE:	The following label name should be changed to
1621#		"bn_div_words" i.e. remove the first dot
1622#		for the gcc compiler. This should be automatically
1623#		done in the build
1624#
1625
1626.align	4
1627.bn_div_words:
1628#
1629#	This is a cleaned up version of code generated by
1630#	the AIX compiler. The only optimization is to use
1631#	the PPC instruction to count leading zeros instead
1632#	of call to num_bits_word. Since this was compiled
1633#	only at level -O2 we can possibly squeeze it more?
1634#
#	Divides the two-word value h:l by d and returns the one-word
#	quotient in r3. When d is 0 the routine returns -1 at once.
#	d is first shifted left until its top bit is set (h:l is shifted
#	by the same amount). The quotient is then built half a word at
#	a time: the outer loop runs twice (CTR = 2), the first pass
#	yields the upper half of the result, the second the lower half.
#	The names h, l, d, q, t, th, tl, dh, dl in the comments
#	presumably follow the C code this was compiled from.
#
1635#	r3 = h
1636#	r4 = l
1637#	r5 = d
1638
1639	$UCMPI	0,r5,0			# compare r5 and 0
1640	bne	Lppcasm_div1		# proceed if d!=0
1641	li	r3,-1			# d=0 return -1
1642	blr
1643Lppcasm_div1:
1644	xor	r0,r0,r0		#r0=0
1645	li	r8,$BITS
1646	$CNTLZ.	r7,r5			#r7 = num leading 0s in d.
1647	beq	Lppcasm_div2		#proceed if no leading zeros
1648	subf	r8,r7,r8		#r8 = BN_num_bits_word(d)
1649	$SHR.	r9,r3,r8		#are there any bits above r8'th?
1650	$TR	16,r9,r0		#if there're, signal to dump core...
1651Lppcasm_div2:
1652	$UCMP	0,r3,r5			#h>=d?
1653	blt	Lppcasm_div3		#goto Lppcasm_div3 if not
1654	subf	r3,r5,r3		#h-=d ;
1655Lppcasm_div3:				#r7 = i, the shift count (leading 0s in d)
					#r8 = BN_BITS2-i
1656	cmpi	0,0,r7,0		# is (i == 0)?
1657	beq	Lppcasm_div4
1658	$SHL	r3,r3,r7		# h = (h<< i)
1659	$SHR	r8,r4,r8		# r8 = (l >> BN_BITS2 -i)
1660	$SHL	r5,r5,r7		# d<<=i
1661	or	r3,r3,r8		# h = (h<<i)|(l>>(BN_BITS2-i))
1662	$SHL	r4,r4,r7		# l <<=i
1663Lppcasm_div4:
1664	$SHRI	r9,r5,`$BITS/2`		# r9 = dh
1665					# dl will be computed when needed
1666					# as it saves registers.
1667	li	r6,2			#r6=2
1668	mtctr	r6			#counter will be in count.
1669Lppcasm_divouterloop:
1670	$SHRI	r8,r3,`$BITS/2`		#r8 = (h>>BN_BITS4)
1671	$SHRI	r11,r4,`$BITS/2`	#r11= (l&BN_MASK2h)>>BN_BITS4
1672					# compute here for innerloop.
1673	$UCMP	0,r8,r9			# is (h>>BN_BITS4)==dh
1674	bne	Lppcasm_div5		# goto Lppcasm_div5 if not
1675
1676	li	r8,-1
1677	$CLRU	r8,r8,`$BITS/2`		#q = BN_MASK2l
1678	b	Lppcasm_div6
1679Lppcasm_div5:
1680	$UDIV	r8,r3,r9		#q = h/dh
1681Lppcasm_div6:
1682	$UMULL	r12,r9,r8		#th = q*dh
1683	$CLRU	r10,r5,`$BITS/2`	#r10=dl
1684	$UMULL	r6,r8,r10		#tl = q*dl
1685
#	Inner loop: lower the quotient estimate q (adjusting th and tl
#	with it) until either t = h-th has bits in its upper half, or
#	tl <= (t<<BN_BITS4)|(upper half of l).
1686Lppcasm_divinnerloop:
1687	subf	r10,r12,r3		#t = h -th
1688	$SHRI	r7,r10,`$BITS/2`	#r7= (t &BN_MASK2H), sort of...
1689	addic.	r7,r7,0			#test if r7 == 0. used below.
1690					# now want to compute
1691					# r7 = (t<<BN_BITS4)|((l&BN_MASK2h)>>BN_BITS4)
1692					# the following 2 instructions do that
1693	$SHLI	r7,r10,`$BITS/2`	# r7 = (t<<BN_BITS4)
1694	or	r7,r7,r11		# r7|=((l&BN_MASK2h)>>BN_BITS4)
1695	$UCMP	cr1,r6,r7		# compare (tl <= r7)
1696	bne	Lppcasm_divinnerexit	# tests the addic. result above:
					# exit if upper half of t is non-zero
1697	ble	cr1,Lppcasm_divinnerexit	# exit if tl <= r7
1698	addi	r8,r8,-1		#q--
1699	subf	r12,r9,r12		#th -=dh
1700	$CLRU	r10,r5,`$BITS/2`	#r10=dl. t is no longer needed in loop.
1701	subf	r6,r10,r6		#tl -=dl
1702	b	Lppcasm_divinnerloop
1703Lppcasm_divinnerexit:
1704	$SHRI	r10,r6,`$BITS/2`	#t=(tl>>BN_BITS4)
1705	$SHLI	r11,r6,`$BITS/2`	#tl=(tl<<BN_BITS4)&BN_MASK2h;
1706	$UCMP	cr1,r4,r11		# compare l and tl
1707	add	r12,r12,r10		# th+=t
1708	bge	cr1,Lppcasm_div7	# if (l>=tl) goto Lppcasm_div7
1709	addi	r12,r12,1		# th++
1710Lppcasm_div7:
1711	subf	r11,r11,r4		#r11=l-tl
1712	$UCMP	cr1,r3,r12		#compare h and th
1713	bge	cr1,Lppcasm_div8	#if (h>=th) goto Lppcasm_div8
1714	addi	r8,r8,-1		# q--
1715	add	r3,r5,r3		# h+=d
1716Lppcasm_div8:
1717	subf	r12,r12,r3		#r12 = h-th
1718	$SHLI	r4,r11,`$BITS/2`	#l=(l&BN_MASK2l)<<BN_BITS4
1719					# want to compute
1720					# h = ((h<<BN_BITS4)|(l>>BN_BITS4))&BN_MASK2
1721					# the following 2 instructions will do this.
1722	$INSR	r11,r12,`$BITS/2`,`$BITS/2`	# r11 is the value we want rotated $BITS/2.
1723	$ROTL	r3,r11,`$BITS/2`	# rotate by $BITS/2 and store in r3
1724	bdz	Lppcasm_div9		#if (count==0) break ;
1725	$SHLI	r0,r8,`$BITS/2`		#ret =q<<BN_BITS4
1726	b	Lppcasm_divouterloop
1727Lppcasm_div9:
1728	or	r3,r8,r0		#ret |= q (q of the second pass)
1729	blr
#	NOTE(review): the three data items below look like a traceback
#	table; the 3 presumably mirrors the argument count - confirm.
1730	.long	0
1731	.byte	0,12,0x14,0,0,0,3,0
1732	.long	0
1733.size	.bn_div_words,.-.bn_div_words
1734
1735#
1736#	NOTE:	The following label name should be changed to
1737#		"bn_sqr_words" i.e. remove the first dot
1738#		for the gcc compiler. This should be automatically
1739#		done in the build
1740#
1741.align	4
1742.bn_sqr_words:
1743#
1744#	Optimized version of bn_sqr_words
1745#
1746#	void bn_sqr_words(BN_ULONG *r, BN_ULONG *a, int n)
1747#
#	For each of the n input words a[i] two result words are stored:
#	the product word from the first multiply macro, then the one
#	from the second (by the macro names, low half then high half of
#	a[i]*a[i] - the macros are defined outside this section).
#	Nothing is returned.
#
1748#	r3 = r
1749#	r4 = a
1750#	r5 = n
1751#
1752#	r6 = a[i].
1753#	r7,r8 = product.
1754#
1755#	No unrolling done here. Not performance critical.
1756
1757	addic.	r5,r5,0			#test r5.
1758	beq	Lppcasm_sqr_adios	# n == 0: nothing to do
#	Bias both pointers one word down: the load/store macros in the
#	loop are given an offset of one word, so the first pass then
#	reaches element 0.
#	NOTE(review): that these macros are update forms (pointer
#	advances on each access) is inferred from this bias - confirm.
1759	addi	r4,r4,-$BNSZ
1760	addi	r3,r3,-$BNSZ
1761	mtctr	r5			# CTR = n, consumed by bdnz below
1762Lppcasm_sqr_mainloop:
1763					#sqr(r[0],r[1],a[0]);
1764	$LDU	r6,$BNSZ(r4)
1765	$UMULL	r7,r6,r6
1766	$UMULH  r8,r6,r6
1767	$STU	r7,$BNSZ(r3)		# first result word of the pair
1768	$STU	r8,$BNSZ(r3)		# second result word of the pair
1769	bdnz	Lppcasm_sqr_mainloop	# decrement CTR, loop while CTR != 0
1770Lppcasm_sqr_adios:
1771	blr
#	NOTE(review): the three data items below look like a traceback
#	table; the 3 presumably mirrors the argument count - confirm.
1772	.long	0
1773	.byte	0,12,0x14,0,0,0,3,0
1774	.long	0
1775.size	.bn_sqr_words,.-.bn_sqr_words
1776
1777#
1778#	NOTE:	The following label name should be changed to
1779#		"bn_mul_words" i.e. remove the first dot
1780#		for the gcc compiler. This should be automatically
1781#		done in the build
1782#
1783
1784.align	4
1785.bn_mul_words:
1786#
1787# BN_ULONG bn_mul_words(BN_ULONG *rp, BN_ULONG *ap, int num, BN_ULONG w)
1788#
# For i = 0..num-1 stores into rp[i] the low word of ap[i]*w plus the
# carry word from the previous step; the high word (plus any carry out
# of that addition) becomes the next carry. The last carry word is
# returned in r3. rp is only written, never read.
#
# The main loop handles four words per pass and runs num>>2 times; the
# remaining num&3 words are handled by the straight-line code after it.
#
1789# r3 = rp
1790# r4 = ap
1791# r5 = num
1792# r6 = w
1793	xor	r0,r0,r0
1794	xor	r12,r12,r12		# used for carry
1795	rlwinm.	r7,r5,30,2,31		# num >> 2
1796	beq	Lppcasm_mw_REM
1797	mtctr	r7
1798Lppcasm_mw_LOOP:
1799					#mul(rp[0],ap[0],w,c1);
1800	$LD	r8,`0*$BNSZ`(r4)
1801	$UMULL	r9,r6,r8
1802	$UMULH  r10,r6,r8
1803	addc	r9,r9,r12
1804	#addze	r10,r10			#carry is NOT ignored.
1805					#will be taken care of
1806					#in second spin below
1807					#using adde.
1808	$ST	r9,`0*$BNSZ`(r3)
1809					#mul(rp[1],ap[1],w,c1);
1810	$LD	r8,`1*$BNSZ`(r4)
1811	$UMULL	r11,r6,r8
1812	$UMULH  r12,r6,r8
1813	adde	r11,r11,r10
1814	#addze	r12,r12
1815	$ST	r11,`1*$BNSZ`(r3)
1816					#mul(rp[2],ap[2],w,c1);
1817	$LD	r8,`2*$BNSZ`(r4)
1818	$UMULL	r9,r6,r8
1819	$UMULH  r10,r6,r8
1820	adde	r9,r9,r12
1821	#addze	r10,r10
1822	$ST	r9,`2*$BNSZ`(r3)
1823					#mul(rp[3],ap[3],w,c1);
1824	$LD	r8,`3*$BNSZ`(r4)
1825	$UMULL	r11,r6,r8
1826	$UMULH  r12,r6,r8
1827	adde	r11,r11,r10
1828	addze	r12,r12			#this spin we collect carry into
1829					#r12
1830	$ST	r11,`3*$BNSZ`(r3)
1831
1832	addi	r3,r3,`4*$BNSZ`
1833	addi	r4,r4,`4*$BNSZ`
1834	bdnz	Lppcasm_mw_LOOP
1835
1836Lppcasm_mw_REM:
1837	andi.	r5,r5,0x3		# r5 = num & 3, words still to do
1838	beq	Lppcasm_mw_OVER
1839					#mul(rp[0],ap[0],w,c1);
1840	$LD	r8,`0*$BNSZ`(r4)
1841	$UMULL	r9,r6,r8
1842	$UMULH  r10,r6,r8
1843	addc	r9,r9,r12
1844	addze	r10,r10
1845	$ST	r9,`0*$BNSZ`(r3)
1846	addi	r12,r10,0		# carry word for the next step
1847
1848	addi	r5,r5,-1
1849	cmpli	0,0,r5,0
1850	beq	Lppcasm_mw_OVER
1851
1852
1853					#mul(rp[1],ap[1],w,c1);
1854	$LD	r8,`1*$BNSZ`(r4)
1855	$UMULL	r9,r6,r8
1856	$UMULH  r10,r6,r8
1857	addc	r9,r9,r12
1858	addze	r10,r10
1859	$ST	r9,`1*$BNSZ`(r3)
1860	addi	r12,r10,0		# carry word for the next step
1861
1862	addi	r5,r5,-1
1863	cmpli	0,0,r5,0
1864	beq	Lppcasm_mw_OVER
1865
1866					#mul(rp[2],ap[2],w,c1);
1867	$LD	r8,`2*$BNSZ`(r4)
1868	$UMULL	r9,r6,r8
1869	$UMULH  r10,r6,r8
1870	addc	r9,r9,r12
1871	addze	r10,r10
1872	$ST	r9,`2*$BNSZ`(r3)
1873	addi	r12,r10,0		# final carry word
1874
1875Lppcasm_mw_OVER:
1876	addi	r3,r12,0		# return the carry word
1877	blr
#	NOTE(review): the three data items below look like a traceback
#	table; the 4 presumably mirrors the argument count - confirm.
1878	.long	0
1879	.byte	0,12,0x14,0,0,0,4,0
1880	.long	0
1881.size	.bn_mul_words,.-.bn_mul_words
1882
1883#
1884#	NOTE:	The following label name should be changed to
1885#		"bn_mul_add_words" i.e. remove the first dot
1886#		for the gcc compiler. This should be automatically
1887#		done in the build
1888#
1889
1890.align	4
1891.bn_mul_add_words:
1892#
1893# BN_ULONG bn_mul_add_words(BN_ULONG *rp, BN_ULONG *ap, int num, BN_ULONG w)
1894#
# For i = 0..num-1 replaces rp[i] with the low word of
# ap[i]*w + rp[i] + carry; everything above the low word becomes the
# carry into the next step. The last carry word is returned in r3.
#
# The main loop handles four words per pass and runs num>>2 times; the
# remaining num&3 words are handled by the leftover code after it.
#
1895# r3 = rp
1896# r4 = ap
1897# r5 = num
1898# r6 = w
1899#
1900# empirical evidence suggests that unrolled version performs best!!
1901#
1902	xor	r0,r0,r0		#r0 = 0
1903	xor	r12,r12,r12  		#r12 = 0 . used for carry
1904	rlwinm.	r7,r5,30,2,31		# num >> 2
1905	beq	Lppcasm_maw_leftover	# if (num < 4) go LPPCASM_maw_leftover
1906	mtctr	r7
1907Lppcasm_maw_mainloop:
1908					#mul_add(rp[0],ap[0],w,c1);
1909	$LD	r8,`0*$BNSZ`(r4)
1910	$LD	r11,`0*$BNSZ`(r3)
1911	$UMULL	r9,r6,r8
1912	$UMULH  r10,r6,r8
1913	addc	r9,r9,r12		#r12 is carry.
1914	addze	r10,r10
1915	addc	r9,r9,r11
1916	#addze	r10,r10
1917					#the above instruction addze
1918					#is NOT needed. Carry will NOT
1919					#be ignored. It's not affected
1920					#by multiply and will be collected
1921					#in the next spin
1922	$ST	r9,`0*$BNSZ`(r3)
1923
1924					#mul_add(rp[1],ap[1],w,c1);
1925	$LD	r8,`1*$BNSZ`(r4)
1926	$LD	r9,`1*$BNSZ`(r3)
1927	$UMULL	r11,r6,r8
1928	$UMULH  r12,r6,r8
1929	adde	r11,r11,r10		#r10 is carry.
1930	addze	r12,r12
1931	addc	r11,r11,r9
1932	#addze	r12,r12
1933	$ST	r11,`1*$BNSZ`(r3)
1934
1935					#mul_add(rp[2],ap[2],w,c1);
1936	$LD	r8,`2*$BNSZ`(r4)
1937	$UMULL	r9,r6,r8
1938	$LD	r11,`2*$BNSZ`(r3)
1939	$UMULH  r10,r6,r8
1940	adde	r9,r9,r12
1941	addze	r10,r10
1942	addc	r9,r9,r11
1943	#addze	r10,r10
1944	$ST	r9,`2*$BNSZ`(r3)
1945
1946					#mul_add(rp[3],ap[3],w,c1);
1947	$LD	r8,`3*$BNSZ`(r4)
1948	$UMULL	r11,r6,r8
1949	$LD	r9,`3*$BNSZ`(r3)
1950	$UMULH  r12,r6,r8
1951	adde	r11,r11,r10
1952	addze	r12,r12
1953	addc	r11,r11,r9
1954	addze	r12,r12			# last step of the pass: fold the
					# pending carry bit into r12
1955	$ST	r11,`3*$BNSZ`(r3)
1956	addi	r3,r3,`4*$BNSZ`
1957	addi	r4,r4,`4*$BNSZ`
1958	bdnz	Lppcasm_maw_mainloop
1959
1960Lppcasm_maw_leftover:
1961	andi.	r5,r5,0x3		# r5 = num & 3, words still to do
1962	beq	Lppcasm_maw_adios
#	Bias both pointers one word down: the load macros below are
#	given an offset of one word, and each result is stored at
#	offset 0 from r3 right after the load from rp.
#	NOTE(review): that these load macros are update forms (pointer
#	advances on each access) is inferred from this - confirm.
1963	addi	r3,r3,-$BNSZ
1964	addi	r4,r4,-$BNSZ
1965					#mul_add(rp[0],ap[0],w,c1);
1966	mtctr	r5			# CTR = leftover count, for bdz
1967	$LDU	r8,$BNSZ(r4)
1968	$UMULL	r9,r6,r8
1969	$UMULH  r10,r6,r8
1970	$LDU	r11,$BNSZ(r3)
1971	addc	r9,r9,r11
1972	addze	r10,r10
1973	addc	r9,r9,r12
1974	addze	r12,r10			# r12 = carry word for the next step
1975	$ST	r9,0(r3)
1976
1977	bdz	Lppcasm_maw_adios	# decrement CTR, done when it hits 0
1978					#mul_add(rp[1],ap[1],w,c1);
1979	$LDU	r8,$BNSZ(r4)
1980	$UMULL	r9,r6,r8
1981	$UMULH  r10,r6,r8
1982	$LDU	r11,$BNSZ(r3)
1983	addc	r9,r9,r11
1984	addze	r10,r10
1985	addc	r9,r9,r12
1986	addze	r12,r10
1987	$ST	r9,0(r3)
1988
1989	bdz	Lppcasm_maw_adios
1990					#mul_add(rp[2],ap[2],w,c1);
1991	$LDU	r8,$BNSZ(r4)
1992	$UMULL	r9,r6,r8
1993	$UMULH  r10,r6,r8
1994	$LDU	r11,$BNSZ(r3)
1995	addc	r9,r9,r11
1996	addze	r10,r10
1997	addc	r9,r9,r12
1998	addze	r12,r10
1999	$ST	r9,0(r3)
2000
2001Lppcasm_maw_adios:
2002	addi	r3,r12,0		# return the carry word
2003	blr
#	NOTE(review): the three data items below look like a traceback
#	table; the 4 presumably mirrors the argument count - confirm.
2004	.long	0
2005	.byte	0,12,0x14,0,0,0,4,0
2006	.long	0
2007.size	.bn_mul_add_words,.-.bn_mul_add_words
2008	.align	4
2009EOF
# Expand the arithmetic embedded in the template text: every back-quoted
# span in $data is replaced by the value of its contents evaluated as
# Perl code (/e runs "eval $1" for each match, /g covers all matches).
# NOTE(review): $data is built outside this section; presumably the
# assembler variables were already interpolated into it - confirm.
2010$data =~ s/\`([^\`]*)\`/eval $1/gem;
# Write the finished text to the currently selected output handle.
2011print $data;
# Close explicitly and die on failure so that a buffered write error
# is reported instead of being lost at exit.
2012close STDOUT or die "error closing STDOUT: $!";
2013