#!/usr/bin/env perl
#
# Implemented as a Perl wrapper as we want to support several different
# architectures with a single file. We pick up the target based on the
# file name we are asked to generate.
#
# It should be noted though that this perl code is nothing like
# <openssl>/crypto/perlasm/x86*. In this case perl is used pretty much
# as a pre-processor to cover for platform differences in name decoration,
# linker tables, 32-/64-bit instruction sets...
#
# As you might know there are several PowerPC ABIs in use. Most notably
# Linux and AIX use different 32-bit ABIs. The good news is that these
# ABIs are similar enough to implement leaf(!) functions, which would be
# ABI-neutral. And that's what you find here: ABI-neutral leaf functions.
# In case you wonder what that is...
17#
18#       AIX performance
19#
#	MEASUREMENTS WITH cc ON a 200 MHz PowerPC 604e.
21#
22#	The following is the performance of 32-bit compiler
23#	generated code:
24#
25#	OpenSSL 0.9.6c 21 dec 2001
26#	built on: Tue Jun 11 11:06:51 EDT 2002
27#	options:bn(64,32) ...
28#compiler: cc -DTHREADS  -DAIX -DB_ENDIAN -DBN_LLONG -O3
29#                  sign    verify    sign/s verify/s
30#rsa  512 bits   0.0098s   0.0009s    102.0   1170.6
31#rsa 1024 bits   0.0507s   0.0026s     19.7    387.5
32#rsa 2048 bits   0.3036s   0.0085s      3.3    117.1
33#rsa 4096 bits   2.0040s   0.0299s      0.5     33.4
34#dsa  512 bits   0.0087s   0.0106s    114.3     94.5
35#dsa 1024 bits   0.0256s   0.0313s     39.0     32.0
36#
#	Same benchmark with this assembler code:
38#
39#rsa  512 bits   0.0056s   0.0005s    178.6   2049.2
40#rsa 1024 bits   0.0283s   0.0015s     35.3    674.1
41#rsa 2048 bits   0.1744s   0.0050s      5.7    201.2
42#rsa 4096 bits   1.1644s   0.0179s      0.9     55.7
43#dsa  512 bits   0.0052s   0.0062s    191.6    162.0
44#dsa 1024 bits   0.0149s   0.0180s     67.0     55.5
45#
#	Number of operations increases by almost 75%
47#
48#	Here are performance numbers for 64-bit compiler
49#	generated code:
50#
51#	OpenSSL 0.9.6g [engine] 9 Aug 2002
52#	built on: Fri Apr 18 16:59:20 EDT 2003
53#	options:bn(64,64) ...
54#	compiler: cc -DTHREADS -D_REENTRANT -q64 -DB_ENDIAN -O3
55#                  sign    verify    sign/s verify/s
56#rsa  512 bits   0.0028s   0.0003s    357.1   3844.4
57#rsa 1024 bits   0.0148s   0.0008s     67.5   1239.7
58#rsa 2048 bits   0.0963s   0.0028s     10.4    353.0
59#rsa 4096 bits   0.6538s   0.0102s      1.5     98.1
60#dsa  512 bits   0.0026s   0.0032s    382.5    313.7
61#dsa 1024 bits   0.0081s   0.0099s    122.8    100.6
62#
63#	Same benchmark with this assembler code:
64#
65#rsa  512 bits   0.0020s   0.0002s    510.4   6273.7
66#rsa 1024 bits   0.0088s   0.0005s    114.1   2128.3
67#rsa 2048 bits   0.0540s   0.0016s     18.5    622.5
68#rsa 4096 bits   0.3700s   0.0058s      2.7    171.0
69#dsa  512 bits   0.0016s   0.0020s    610.7    507.1
70#dsa 1024 bits   0.0047s   0.0058s    212.5    173.2
71#
#	Again, performance increases by about 75%
73#
#       Mac OS X, Apple G5 1.8GHz (Note this is 32-bit code)
75#       OpenSSL 0.9.7c 30 Sep 2003
76#
77#       Original code.
78#
79#rsa  512 bits   0.0011s   0.0001s    906.1  11012.5
80#rsa 1024 bits   0.0060s   0.0003s    166.6   3363.1
81#rsa 2048 bits   0.0370s   0.0010s     27.1    982.4
82#rsa 4096 bits   0.2426s   0.0036s      4.1    280.4
83#dsa  512 bits   0.0010s   0.0012s   1038.1    841.5
84#dsa 1024 bits   0.0030s   0.0037s    329.6    269.7
85#dsa 2048 bits   0.0101s   0.0127s     98.9     78.6
86#
87#       Same benchmark with this assembler code:
88#
89#rsa  512 bits   0.0007s   0.0001s   1416.2  16645.9
90#rsa 1024 bits   0.0036s   0.0002s    274.4   5380.6
91#rsa 2048 bits   0.0222s   0.0006s     45.1   1589.5
92#rsa 4096 bits   0.1469s   0.0022s      6.8    449.6
93#dsa  512 bits   0.0006s   0.0007s   1664.2   1376.2
94#dsa 1024 bits   0.0018s   0.0023s    545.0    442.2
95#dsa 2048 bits   0.0061s   0.0075s    163.5    132.8
96#
97#        Performance increase of ~60%
98#
#	If you have comments or suggestions to improve the code, send
#	me a note at schari@us.ibm.com
101#
102
103$flavour = shift;
104
105if ($flavour =~ /32/) {
106	$BITS=	32;
107	$BNSZ=	$BITS/8;
108	$ISA=	"\"ppc\"";
109
110	$LD=	"lwz";		# load
111	$LDU=	"lwzu";		# load and update
112	$ST=	"stw";		# store
113	$STU=	"stwu";		# store and update
114	$UMULL=	"mullw";	# unsigned multiply low
115	$UMULH=	"mulhwu";	# unsigned multiply high
116	$UDIV=	"divwu";	# unsigned divide
117	$UCMPI=	"cmplwi";	# unsigned compare with immediate
118	$UCMP=	"cmplw";	# unsigned compare
119	$CNTLZ=	"cntlzw";	# count leading zeros
120	$SHL=	"slw";		# shift left
121	$SHR=	"srw";		# unsigned shift right
122	$SHRI=	"srwi";		# unsigned shift right by immediate
123	$SHLI=	"slwi";		# shift left by immediate
124	$CLRU=	"clrlwi";	# clear upper bits
125	$INSR=	"insrwi";	# insert right
126	$ROTL=	"rotlwi";	# rotate left by immediate
127	$TR=	"tw";		# conditional trap
128} elsif ($flavour =~ /64/) {
129	$BITS=	64;
130	$BNSZ=	$BITS/8;
131	$ISA=	"\"ppc64\"";
132
133	# same as above, but 64-bit mnemonics...
134	$LD=	"ld";		# load
135	$LDU=	"ldu";		# load and update
136	$ST=	"std";		# store
137	$STU=	"stdu";		# store and update
138	$UMULL=	"mulld";	# unsigned multiply low
139	$UMULH=	"mulhdu";	# unsigned multiply high
140	$UDIV=	"divdu";	# unsigned divide
141	$UCMPI=	"cmpldi";	# unsigned compare with immediate
142	$UCMP=	"cmpld";	# unsigned compare
143	$CNTLZ=	"cntlzd";	# count leading zeros
144	$SHL=	"sld";		# shift left
145	$SHR=	"srd";		# unsigned shift right
146	$SHRI=	"srdi";		# unsigned shift right by immediate
147	$SHLI=	"sldi";		# shift left by immediate
148	$CLRU=	"clrldi";	# clear upper bits
149	$INSR=	"insrdi";	# insert right
150	$ROTL=	"rotldi";	# rotate left by immediate
151	$TR=	"td";		# conditional trap
152} else { die "nonsense $flavour"; }
153
154$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
155( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
156( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
157die "can't locate ppc-xlate.pl";
158
open STDOUT,"| $^X $xlate $flavour ".shift or die "can't call $xlate: $!";
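# The flavour argument (first on the command line) selects the 32- or 64-bit
# mnemonic set above; the second argument names the output file handed on to
# ppc-xlate.pl. An assumed typical invocation would look like:
#	perl ppc.pl linux32 ppc32.s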
160
161$data=<<EOF;
162#--------------------------------------------------------------------
163#
164#
165#
166#
167#	File:		ppc32.s
168#
169#	Created by:	Suresh Chari
170#			IBM Thomas J. Watson Research Library
171#			Hawthorne, NY
172#
173#
174#	Description:	Optimized assembly routines for OpenSSL crypto
#			on the 32-bit PowerPC platform.
176#
177#
178#	Version History
179#
180#	2. Fixed bn_add,bn_sub and bn_div_words, added comments,
181#	   cleaned up code. Also made a single version which can
182#	   be used for both the AIX and Linux compilers. See NOTE
183#	   below.
184#				12/05/03		Suresh Chari
185#			(with lots of help from)        Andy Polyakov
#
187#	1. Initial version	10/20/02		Suresh Chari
188#
189#
#	The following file works for the xlc, cc,
#	and gcc compilers.
192#
193#	NOTE:	To get the file to link correctly with the gcc compiler
194#	        you have to change the names of the routines and remove
195#		the first .(dot) character. This should automatically
196#		be done in the build process.
197#
198#	Hand optimized assembly code for the following routines
199#
200#	bn_sqr_comba4
201#	bn_sqr_comba8
202#	bn_mul_comba4
203#	bn_mul_comba8
204#	bn_sub_words
205#	bn_add_words
206#	bn_div_words
207#	bn_sqr_words
208#	bn_mul_words
209#	bn_mul_add_words
210#
#	NOTE:	It is possible to optimize this code more for
#	specific PowerPC or Power architectures. On the Northstar
#	architecture the optimizations in this file do NOT
#	provide much improvement.
215#
#	If you have comments or suggestions to improve the code, send
#	me a note at schari\@us.ibm.com
218#
219#--------------------------------------------------------------------------
220#
221#	Defines to be used in the assembly code.
222#
223#.set r0,0	# we use it as storage for value of 0
224#.set SP,1	# preserved
225#.set RTOC,2	# preserved
226#.set r3,3	# 1st argument/return value
227#.set r4,4	# 2nd argument/volatile register
228#.set r5,5	# 3rd argument/volatile register
229#.set r6,6	# ...
230#.set r7,7
231#.set r8,8
232#.set r9,9
233#.set r10,10
234#.set r11,11
235#.set r12,12
236#.set r13,13	# not used, nor any other "below" it...
237
238#	Declare function names to be global
239#	NOTE:	For gcc these names MUST be changed to remove
240#	        the first . i.e. for example change ".bn_sqr_comba4"
241#		to "bn_sqr_comba4". This should be automatically done
242#		in the build.
243
244	.globl	.bn_sqr_comba4
245	.globl	.bn_sqr_comba8
246	.globl	.bn_mul_comba4
247	.globl	.bn_mul_comba8
248	.globl	.bn_sub_words
249	.globl	.bn_add_words
250	.globl	.bn_div_words
251	.globl	.bn_sqr_words
252	.globl	.bn_mul_words
253	.globl	.bn_mul_add_words
254
255# .text section
256
257	.machine	"any"
258
259#
260#	NOTE:	The following label name should be changed to
261#		"bn_sqr_comba4" i.e. remove the first dot
262#		for the gcc compiler. This should be automatically
263#		done in the build
264#
265
266.align	4
267.bn_sqr_comba4:
268#
269# Optimized version of bn_sqr_comba4.
270#
271# void bn_sqr_comba4(BN_ULONG *r, BN_ULONG *a)
272# r3 contains r
273# r4 contains a
274#
275# Freely use registers r5,r6,r7,r8,r9,r10,r11 as follows:
276#
277# r5,r6 are the two BN_ULONGs being multiplied.
278# r7,r8 are the results of the 32x32 giving 64 bit multiply.
279# r9,r10, r11 are the equivalents of c1,c2, c3.
280# Here's the assembly
281#
282#
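# For reference, a rough C-level sketch of the helpers named in the comments
# below, where lo()/hi() denote the low/high words of the double-wide product
# and carries ripple into the next column via addze:
#	sqr_add_c(a,i,c1,c2,c3):    c1 += lo(a[i]*a[i]); c2 += hi(a[i]*a[i]) + carry; c3 += carry;
#	sqr_add_c2(a,i,j,c1,c2,c3): as above, but adding 2*a[i]*a[j]
#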
283	xor		r0,r0,r0		# set r0 = 0. Used in the addze
284						# instructions below
285
286						#sqr_add_c(a,0,c1,c2,c3)
287	$LD		r5,`0*$BNSZ`(r4)
288	$UMULL		r9,r5,r5
289	$UMULH		r10,r5,r5		#in first iteration. No need
290						#to add since c1=c2=c3=0.
291						# Note c3(r11) is NOT set to 0
						# but will be set before it is used.
293
294	$ST		r9,`0*$BNSZ`(r3)	# r[0]=c1;
295						# sqr_add_c2(a,1,0,c2,c3,c1);
296	$LD		r6,`1*$BNSZ`(r4)
297	$UMULL		r7,r5,r6
298	$UMULH		r8,r5,r6
299
300	addc		r7,r7,r7		# compute (r7,r8)=2*(r7,r8)
301	adde		r8,r8,r8
302	addze		r9,r0			# catch carry if any.
303						# r9= r0(=0) and carry
304
305	addc		r10,r7,r10		# now add to temp result.
	addze		r11,r8			# r11 = r8 + carry (c3 not yet set)
307	addze		r9,r9
308
309	$ST		r10,`1*$BNSZ`(r3)	#r[1]=c2;
310						#sqr_add_c(a,1,c3,c1,c2)
311	$UMULL		r7,r6,r6
312	$UMULH		r8,r6,r6
313	addc		r11,r7,r11
314	adde		r9,r8,r9
315	addze		r10,r0
316						#sqr_add_c2(a,2,0,c3,c1,c2)
317	$LD		r6,`2*$BNSZ`(r4)
318	$UMULL		r7,r5,r6
319	$UMULH		r8,r5,r6
320
321	addc		r7,r7,r7
322	adde		r8,r8,r8
323	addze		r10,r10
324
325	addc		r11,r7,r11
326	adde		r9,r8,r9
327	addze		r10,r10
328	$ST		r11,`2*$BNSZ`(r3)	#r[2]=c3
329						#sqr_add_c2(a,3,0,c1,c2,c3);
330	$LD		r6,`3*$BNSZ`(r4)
331	$UMULL		r7,r5,r6
332	$UMULH		r8,r5,r6
333	addc		r7,r7,r7
334	adde		r8,r8,r8
335	addze		r11,r0
336
337	addc		r9,r7,r9
338	adde		r10,r8,r10
339	addze		r11,r11
340						#sqr_add_c2(a,2,1,c1,c2,c3);
341	$LD		r5,`1*$BNSZ`(r4)
342	$LD		r6,`2*$BNSZ`(r4)
343	$UMULL		r7,r5,r6
344	$UMULH		r8,r5,r6
345
346	addc		r7,r7,r7
347	adde		r8,r8,r8
348	addze		r11,r11
349	addc		r9,r7,r9
350	adde		r10,r8,r10
351	addze		r11,r11
352	$ST		r9,`3*$BNSZ`(r3)	#r[3]=c1
353						#sqr_add_c(a,2,c2,c3,c1);
354	$UMULL		r7,r6,r6
355	$UMULH		r8,r6,r6
356	addc		r10,r7,r10
357	adde		r11,r8,r11
358	addze		r9,r0
359						#sqr_add_c2(a,3,1,c2,c3,c1);
360	$LD		r6,`3*$BNSZ`(r4)
361	$UMULL		r7,r5,r6
362	$UMULH		r8,r5,r6
363	addc		r7,r7,r7
364	adde		r8,r8,r8
365	addze		r9,r9
366
367	addc		r10,r7,r10
368	adde		r11,r8,r11
369	addze		r9,r9
370	$ST		r10,`4*$BNSZ`(r3)	#r[4]=c2
371						#sqr_add_c2(a,3,2,c3,c1,c2);
372	$LD		r5,`2*$BNSZ`(r4)
373	$UMULL		r7,r5,r6
374	$UMULH		r8,r5,r6
375	addc		r7,r7,r7
376	adde		r8,r8,r8
377	addze		r10,r0
378
379	addc		r11,r7,r11
380	adde		r9,r8,r9
381	addze		r10,r10
382	$ST		r11,`5*$BNSZ`(r3)	#r[5] = c3
383						#sqr_add_c(a,3,c1,c2,c3);
384	$UMULL		r7,r6,r6
385	$UMULH		r8,r6,r6
386	addc		r9,r7,r9
387	adde		r10,r8,r10
388
389	$ST		r9,`6*$BNSZ`(r3)	#r[6]=c1
390	$ST		r10,`7*$BNSZ`(r3)	#r[7]=c2
391	blr
392	.long	0
393	.byte	0,12,0x14,0,0,0,2,0
394	.long	0
395.size	.bn_sqr_comba4,.-.bn_sqr_comba4
396
397#
398#	NOTE:	The following label name should be changed to
399#		"bn_sqr_comba8" i.e. remove the first dot
400#		for the gcc compiler. This should be automatically
401#		done in the build
402#
403
404.align	4
405.bn_sqr_comba8:
406#
407# This is an optimized version of the bn_sqr_comba8 routine.
# It makes heavy use of the adde instruction.
409#
410#
411# void bn_sqr_comba8(BN_ULONG *r, BN_ULONG *a)
412# r3 contains r
413# r4 contains a
414#
415# Freely use registers r5,r6,r7,r8,r9,r10,r11 as follows:
416#
417# r5,r6 are the two BN_ULONGs being multiplied.
418# r7,r8 are the results of the 32x32 giving 64 bit multiply.
419# r9,r10, r11 are the equivalents of c1,c2, c3.
420#
# A possible optimization of loading all 8 longs of a into registers
# doesn't provide any speedup
423#
424
425	xor		r0,r0,r0		#set r0 = 0.Used in addze
426						#instructions below.
427
428						#sqr_add_c(a,0,c1,c2,c3);
429	$LD		r5,`0*$BNSZ`(r4)
430	$UMULL		r9,r5,r5		#1st iteration:	no carries.
431	$UMULH		r10,r5,r5
432	$ST		r9,`0*$BNSZ`(r3)	# r[0]=c1;
433						#sqr_add_c2(a,1,0,c2,c3,c1);
434	$LD		r6,`1*$BNSZ`(r4)
435	$UMULL		r7,r5,r6
436	$UMULH		r8,r5,r6
437
	addc		r10,r7,r10		#add the two-register number
	adde		r11,r8,r0 		# (r8,r7) to the three-register
	addze		r9,r0			# number (r9,r11,r10). NOTE: r0=0
	addc		r10,r7,r10		#add the two-register number
	adde		r11,r8,r11 		# (r8,r7) to the three-register
	addze		r9,r9			# number (r9,r11,r10).
444	addze		r9,r9			# number (r9,r11,r10).
445
446	$ST		r10,`1*$BNSZ`(r3)	# r[1]=c2
447
448						#sqr_add_c(a,1,c3,c1,c2);
449	$UMULL		r7,r6,r6
450	$UMULH		r8,r6,r6
451	addc		r11,r7,r11
452	adde		r9,r8,r9
453	addze		r10,r0
454						#sqr_add_c2(a,2,0,c3,c1,c2);
455	$LD		r6,`2*$BNSZ`(r4)
456	$UMULL		r7,r5,r6
457	$UMULH		r8,r5,r6
458
459	addc		r11,r7,r11
460	adde		r9,r8,r9
461	addze		r10,r10
462
463	addc		r11,r7,r11
464	adde		r9,r8,r9
465	addze		r10,r10
466
467	$ST		r11,`2*$BNSZ`(r3)	#r[2]=c3
468						#sqr_add_c2(a,3,0,c1,c2,c3);
469	$LD		r6,`3*$BNSZ`(r4)	#r6 = a[3]. r5 is already a[0].
470	$UMULL		r7,r5,r6
471	$UMULH		r8,r5,r6
472
473	addc		r9,r7,r9
474	adde		r10,r8,r10
475	addze		r11,r0
476
477	addc		r9,r7,r9
478	adde		r10,r8,r10
479	addze		r11,r11
480						#sqr_add_c2(a,2,1,c1,c2,c3);
481	$LD		r5,`1*$BNSZ`(r4)
482	$LD		r6,`2*$BNSZ`(r4)
483	$UMULL		r7,r5,r6
484	$UMULH		r8,r5,r6
485
486	addc		r9,r7,r9
487	adde		r10,r8,r10
488	addze		r11,r11
489
490	addc		r9,r7,r9
491	adde		r10,r8,r10
492	addze		r11,r11
493
494	$ST		r9,`3*$BNSZ`(r3)	#r[3]=c1;
495						#sqr_add_c(a,2,c2,c3,c1);
496	$UMULL		r7,r6,r6
497	$UMULH		r8,r6,r6
498
499	addc		r10,r7,r10
500	adde		r11,r8,r11
501	addze		r9,r0
502						#sqr_add_c2(a,3,1,c2,c3,c1);
503	$LD		r6,`3*$BNSZ`(r4)
504	$UMULL		r7,r5,r6
505	$UMULH		r8,r5,r6
506
507	addc		r10,r7,r10
508	adde		r11,r8,r11
509	addze		r9,r9
510
511	addc		r10,r7,r10
512	adde		r11,r8,r11
513	addze		r9,r9
514						#sqr_add_c2(a,4,0,c2,c3,c1);
515	$LD		r5,`0*$BNSZ`(r4)
516	$LD		r6,`4*$BNSZ`(r4)
517	$UMULL		r7,r5,r6
518	$UMULH		r8,r5,r6
519
520	addc		r10,r7,r10
521	adde		r11,r8,r11
522	addze		r9,r9
523
524	addc		r10,r7,r10
525	adde		r11,r8,r11
526	addze		r9,r9
527	$ST		r10,`4*$BNSZ`(r3)	#r[4]=c2;
528						#sqr_add_c2(a,5,0,c3,c1,c2);
529	$LD		r6,`5*$BNSZ`(r4)
530	$UMULL		r7,r5,r6
531	$UMULH		r8,r5,r6
532
533	addc		r11,r7,r11
534	adde		r9,r8,r9
535	addze		r10,r0
536
537	addc		r11,r7,r11
538	adde		r9,r8,r9
539	addze		r10,r10
540						#sqr_add_c2(a,4,1,c3,c1,c2);
541	$LD		r5,`1*$BNSZ`(r4)
542	$LD		r6,`4*$BNSZ`(r4)
543	$UMULL		r7,r5,r6
544	$UMULH		r8,r5,r6
545
546	addc		r11,r7,r11
547	adde		r9,r8,r9
548	addze		r10,r10
549
550	addc		r11,r7,r11
551	adde		r9,r8,r9
552	addze		r10,r10
553						#sqr_add_c2(a,3,2,c3,c1,c2);
554	$LD		r5,`2*$BNSZ`(r4)
555	$LD		r6,`3*$BNSZ`(r4)
556	$UMULL		r7,r5,r6
557	$UMULH		r8,r5,r6
558
559	addc		r11,r7,r11
560	adde		r9,r8,r9
561	addze		r10,r10
562
563	addc		r11,r7,r11
564	adde		r9,r8,r9
565	addze		r10,r10
566	$ST		r11,`5*$BNSZ`(r3)	#r[5]=c3;
567						#sqr_add_c(a,3,c1,c2,c3);
568	$UMULL		r7,r6,r6
569	$UMULH		r8,r6,r6
570	addc		r9,r7,r9
571	adde		r10,r8,r10
572	addze		r11,r0
573						#sqr_add_c2(a,4,2,c1,c2,c3);
574	$LD		r6,`4*$BNSZ`(r4)
575	$UMULL		r7,r5,r6
576	$UMULH		r8,r5,r6
577
578	addc		r9,r7,r9
579	adde		r10,r8,r10
580	addze		r11,r11
581
582	addc		r9,r7,r9
583	adde		r10,r8,r10
584	addze		r11,r11
585						#sqr_add_c2(a,5,1,c1,c2,c3);
586	$LD		r5,`1*$BNSZ`(r4)
587	$LD		r6,`5*$BNSZ`(r4)
588	$UMULL		r7,r5,r6
589	$UMULH		r8,r5,r6
590
591	addc		r9,r7,r9
592	adde		r10,r8,r10
593	addze		r11,r11
594
595	addc		r9,r7,r9
596	adde		r10,r8,r10
597	addze		r11,r11
598						#sqr_add_c2(a,6,0,c1,c2,c3);
599	$LD		r5,`0*$BNSZ`(r4)
600	$LD		r6,`6*$BNSZ`(r4)
601	$UMULL		r7,r5,r6
602	$UMULH		r8,r5,r6
603	addc		r9,r7,r9
604	adde		r10,r8,r10
605	addze		r11,r11
606	addc		r9,r7,r9
607	adde		r10,r8,r10
608	addze		r11,r11
609	$ST		r9,`6*$BNSZ`(r3)	#r[6]=c1;
610						#sqr_add_c2(a,7,0,c2,c3,c1);
611	$LD		r6,`7*$BNSZ`(r4)
612	$UMULL		r7,r5,r6
613	$UMULH		r8,r5,r6
614
615	addc		r10,r7,r10
616	adde		r11,r8,r11
617	addze		r9,r0
618	addc		r10,r7,r10
619	adde		r11,r8,r11
620	addze		r9,r9
621						#sqr_add_c2(a,6,1,c2,c3,c1);
622	$LD		r5,`1*$BNSZ`(r4)
623	$LD		r6,`6*$BNSZ`(r4)
624	$UMULL		r7,r5,r6
625	$UMULH		r8,r5,r6
626
627	addc		r10,r7,r10
628	adde		r11,r8,r11
629	addze		r9,r9
630	addc		r10,r7,r10
631	adde		r11,r8,r11
632	addze		r9,r9
633						#sqr_add_c2(a,5,2,c2,c3,c1);
634	$LD		r5,`2*$BNSZ`(r4)
635	$LD		r6,`5*$BNSZ`(r4)
636	$UMULL		r7,r5,r6
637	$UMULH		r8,r5,r6
638	addc		r10,r7,r10
639	adde		r11,r8,r11
640	addze		r9,r9
641	addc		r10,r7,r10
642	adde		r11,r8,r11
643	addze		r9,r9
644						#sqr_add_c2(a,4,3,c2,c3,c1);
645	$LD		r5,`3*$BNSZ`(r4)
646	$LD		r6,`4*$BNSZ`(r4)
647	$UMULL		r7,r5,r6
648	$UMULH		r8,r5,r6
649
650	addc		r10,r7,r10
651	adde		r11,r8,r11
652	addze		r9,r9
653	addc		r10,r7,r10
654	adde		r11,r8,r11
655	addze		r9,r9
656	$ST		r10,`7*$BNSZ`(r3)	#r[7]=c2;
657						#sqr_add_c(a,4,c3,c1,c2);
658	$UMULL		r7,r6,r6
659	$UMULH		r8,r6,r6
660	addc		r11,r7,r11
661	adde		r9,r8,r9
662	addze		r10,r0
663						#sqr_add_c2(a,5,3,c3,c1,c2);
664	$LD		r6,`5*$BNSZ`(r4)
665	$UMULL		r7,r5,r6
666	$UMULH		r8,r5,r6
667	addc		r11,r7,r11
668	adde		r9,r8,r9
669	addze		r10,r10
670	addc		r11,r7,r11
671	adde		r9,r8,r9
672	addze		r10,r10
673						#sqr_add_c2(a,6,2,c3,c1,c2);
674	$LD		r5,`2*$BNSZ`(r4)
675	$LD		r6,`6*$BNSZ`(r4)
676	$UMULL		r7,r5,r6
677	$UMULH		r8,r5,r6
678	addc		r11,r7,r11
679	adde		r9,r8,r9
680	addze		r10,r10
681
682	addc		r11,r7,r11
683	adde		r9,r8,r9
684	addze		r10,r10
685						#sqr_add_c2(a,7,1,c3,c1,c2);
686	$LD		r5,`1*$BNSZ`(r4)
687	$LD		r6,`7*$BNSZ`(r4)
688	$UMULL		r7,r5,r6
689	$UMULH		r8,r5,r6
690	addc		r11,r7,r11
691	adde		r9,r8,r9
692	addze		r10,r10
693	addc		r11,r7,r11
694	adde		r9,r8,r9
695	addze		r10,r10
696	$ST		r11,`8*$BNSZ`(r3)	#r[8]=c3;
697						#sqr_add_c2(a,7,2,c1,c2,c3);
698	$LD		r5,`2*$BNSZ`(r4)
699	$UMULL		r7,r5,r6
700	$UMULH		r8,r5,r6
701
702	addc		r9,r7,r9
703	adde		r10,r8,r10
704	addze		r11,r0
705	addc		r9,r7,r9
706	adde		r10,r8,r10
707	addze		r11,r11
708						#sqr_add_c2(a,6,3,c1,c2,c3);
709	$LD		r5,`3*$BNSZ`(r4)
710	$LD		r6,`6*$BNSZ`(r4)
711	$UMULL		r7,r5,r6
712	$UMULH		r8,r5,r6
713	addc		r9,r7,r9
714	adde		r10,r8,r10
715	addze		r11,r11
716	addc		r9,r7,r9
717	adde		r10,r8,r10
718	addze		r11,r11
719						#sqr_add_c2(a,5,4,c1,c2,c3);
720	$LD		r5,`4*$BNSZ`(r4)
721	$LD		r6,`5*$BNSZ`(r4)
722	$UMULL		r7,r5,r6
723	$UMULH		r8,r5,r6
724	addc		r9,r7,r9
725	adde		r10,r8,r10
726	addze		r11,r11
727	addc		r9,r7,r9
728	adde		r10,r8,r10
729	addze		r11,r11
730	$ST		r9,`9*$BNSZ`(r3)	#r[9]=c1;
731						#sqr_add_c(a,5,c2,c3,c1);
732	$UMULL		r7,r6,r6
733	$UMULH		r8,r6,r6
734	addc		r10,r7,r10
735	adde		r11,r8,r11
736	addze		r9,r0
737						#sqr_add_c2(a,6,4,c2,c3,c1);
738	$LD		r6,`6*$BNSZ`(r4)
739	$UMULL		r7,r5,r6
740	$UMULH		r8,r5,r6
741	addc		r10,r7,r10
742	adde		r11,r8,r11
743	addze		r9,r9
744	addc		r10,r7,r10
745	adde		r11,r8,r11
746	addze		r9,r9
747						#sqr_add_c2(a,7,3,c2,c3,c1);
748	$LD		r5,`3*$BNSZ`(r4)
749	$LD		r6,`7*$BNSZ`(r4)
750	$UMULL		r7,r5,r6
751	$UMULH		r8,r5,r6
752	addc		r10,r7,r10
753	adde		r11,r8,r11
754	addze		r9,r9
755	addc		r10,r7,r10
756	adde		r11,r8,r11
757	addze		r9,r9
758	$ST		r10,`10*$BNSZ`(r3)	#r[10]=c2;
759						#sqr_add_c2(a,7,4,c3,c1,c2);
760	$LD		r5,`4*$BNSZ`(r4)
761	$UMULL		r7,r5,r6
762	$UMULH		r8,r5,r6
763	addc		r11,r7,r11
764	adde		r9,r8,r9
765	addze		r10,r0
766	addc		r11,r7,r11
767	adde		r9,r8,r9
768	addze		r10,r10
769						#sqr_add_c2(a,6,5,c3,c1,c2);
770	$LD		r5,`5*$BNSZ`(r4)
771	$LD		r6,`6*$BNSZ`(r4)
772	$UMULL		r7,r5,r6
773	$UMULH		r8,r5,r6
774	addc		r11,r7,r11
775	adde		r9,r8,r9
776	addze		r10,r10
777	addc		r11,r7,r11
778	adde		r9,r8,r9
779	addze		r10,r10
780	$ST		r11,`11*$BNSZ`(r3)	#r[11]=c3;
781						#sqr_add_c(a,6,c1,c2,c3);
782	$UMULL		r7,r6,r6
783	$UMULH		r8,r6,r6
784	addc		r9,r7,r9
785	adde		r10,r8,r10
786	addze		r11,r0
787						#sqr_add_c2(a,7,5,c1,c2,c3)
788	$LD		r6,`7*$BNSZ`(r4)
789	$UMULL		r7,r5,r6
790	$UMULH		r8,r5,r6
791	addc		r9,r7,r9
792	adde		r10,r8,r10
793	addze		r11,r11
794	addc		r9,r7,r9
795	adde		r10,r8,r10
796	addze		r11,r11
797	$ST		r9,`12*$BNSZ`(r3)	#r[12]=c1;
798
799						#sqr_add_c2(a,7,6,c2,c3,c1)
800	$LD		r5,`6*$BNSZ`(r4)
801	$UMULL		r7,r5,r6
802	$UMULH		r8,r5,r6
803	addc		r10,r7,r10
804	adde		r11,r8,r11
805	addze		r9,r0
806	addc		r10,r7,r10
807	adde		r11,r8,r11
808	addze		r9,r9
809	$ST		r10,`13*$BNSZ`(r3)	#r[13]=c2;
810						#sqr_add_c(a,7,c3,c1,c2);
811	$UMULL		r7,r6,r6
812	$UMULH		r8,r6,r6
813	addc		r11,r7,r11
814	adde		r9,r8,r9
815	$ST		r11,`14*$BNSZ`(r3)	#r[14]=c3;
816	$ST		r9, `15*$BNSZ`(r3)	#r[15]=c1;
817
818
819	blr
820	.long	0
821	.byte	0,12,0x14,0,0,0,2,0
822	.long	0
823.size	.bn_sqr_comba8,.-.bn_sqr_comba8
824
825#
826#	NOTE:	The following label name should be changed to
827#		"bn_mul_comba4" i.e. remove the first dot
828#		for the gcc compiler. This should be automatically
829#		done in the build
830#
831
832.align	4
833.bn_mul_comba4:
834#
835# This is an optimized version of the bn_mul_comba4 routine.
836#
837# void bn_mul_comba4(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
838# r3 contains r
839# r4 contains a
840# r5 contains b
841# r6, r7 are the 2 BN_ULONGs being multiplied.
842# r8, r9 are the results of the 32x32 giving 64 multiply.
843# r10, r11, r12 are the equivalents of c1, c2, and c3.
844#
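# For reference, a rough C-level sketch of mul_add_c(a[i],b[j],c1,c2,c3) as
# used in the comments below (lo()/hi() are the low/high words of the
# double-wide product, carries ripple into the next column):
#	c1 += lo(a[i]*b[j]); c2 += hi(a[i]*b[j]) + carry; c3 += carry;
#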
845	xor	r0,r0,r0		#r0=0. Used in addze below.
846					#mul_add_c(a[0],b[0],c1,c2,c3);
847	$LD	r6,`0*$BNSZ`(r4)
848	$LD	r7,`0*$BNSZ`(r5)
849	$UMULL	r10,r6,r7
850	$UMULH	r11,r6,r7
851	$ST	r10,`0*$BNSZ`(r3)	#r[0]=c1
852					#mul_add_c(a[0],b[1],c2,c3,c1);
853	$LD	r7,`1*$BNSZ`(r5)
854	$UMULL	r8,r6,r7
855	$UMULH	r9,r6,r7
856	addc	r11,r8,r11
857	adde	r12,r9,r0
858	addze	r10,r0
859					#mul_add_c(a[1],b[0],c2,c3,c1);
860	$LD	r6, `1*$BNSZ`(r4)
861	$LD	r7, `0*$BNSZ`(r5)
862	$UMULL	r8,r6,r7
863	$UMULH	r9,r6,r7
864	addc	r11,r8,r11
865	adde	r12,r9,r12
866	addze	r10,r10
867	$ST	r11,`1*$BNSZ`(r3)	#r[1]=c2
868					#mul_add_c(a[2],b[0],c3,c1,c2);
869	$LD	r6,`2*$BNSZ`(r4)
870	$UMULL	r8,r6,r7
871	$UMULH	r9,r6,r7
872	addc	r12,r8,r12
873	adde	r10,r9,r10
874	addze	r11,r0
875					#mul_add_c(a[1],b[1],c3,c1,c2);
876	$LD	r6,`1*$BNSZ`(r4)
877	$LD	r7,`1*$BNSZ`(r5)
878	$UMULL	r8,r6,r7
879	$UMULH	r9,r6,r7
880	addc	r12,r8,r12
881	adde	r10,r9,r10
882	addze	r11,r11
883					#mul_add_c(a[0],b[2],c3,c1,c2);
884	$LD	r6,`0*$BNSZ`(r4)
885	$LD	r7,`2*$BNSZ`(r5)
886	$UMULL	r8,r6,r7
887	$UMULH	r9,r6,r7
888	addc	r12,r8,r12
889	adde	r10,r9,r10
890	addze	r11,r11
891	$ST	r12,`2*$BNSZ`(r3)	#r[2]=c3
892					#mul_add_c(a[0],b[3],c1,c2,c3);
893	$LD	r7,`3*$BNSZ`(r5)
894	$UMULL	r8,r6,r7
895	$UMULH	r9,r6,r7
896	addc	r10,r8,r10
897	adde	r11,r9,r11
898	addze	r12,r0
899					#mul_add_c(a[1],b[2],c1,c2,c3);
900	$LD	r6,`1*$BNSZ`(r4)
901	$LD	r7,`2*$BNSZ`(r5)
902	$UMULL	r8,r6,r7
903	$UMULH	r9,r6,r7
904	addc	r10,r8,r10
905	adde	r11,r9,r11
906	addze	r12,r12
907					#mul_add_c(a[2],b[1],c1,c2,c3);
908	$LD	r6,`2*$BNSZ`(r4)
909	$LD	r7,`1*$BNSZ`(r5)
910	$UMULL	r8,r6,r7
911	$UMULH	r9,r6,r7
912	addc	r10,r8,r10
913	adde	r11,r9,r11
914	addze	r12,r12
915					#mul_add_c(a[3],b[0],c1,c2,c3);
916	$LD	r6,`3*$BNSZ`(r4)
917	$LD	r7,`0*$BNSZ`(r5)
918	$UMULL	r8,r6,r7
919	$UMULH	r9,r6,r7
920	addc	r10,r8,r10
921	adde	r11,r9,r11
922	addze	r12,r12
923	$ST	r10,`3*$BNSZ`(r3)	#r[3]=c1
924					#mul_add_c(a[3],b[1],c2,c3,c1);
925	$LD	r7,`1*$BNSZ`(r5)
926	$UMULL	r8,r6,r7
927	$UMULH	r9,r6,r7
928	addc	r11,r8,r11
929	adde	r12,r9,r12
930	addze	r10,r0
931					#mul_add_c(a[2],b[2],c2,c3,c1);
932	$LD	r6,`2*$BNSZ`(r4)
933	$LD	r7,`2*$BNSZ`(r5)
934	$UMULL	r8,r6,r7
935	$UMULH	r9,r6,r7
936	addc	r11,r8,r11
937	adde	r12,r9,r12
938	addze	r10,r10
939					#mul_add_c(a[1],b[3],c2,c3,c1);
940	$LD	r6,`1*$BNSZ`(r4)
941	$LD	r7,`3*$BNSZ`(r5)
942	$UMULL	r8,r6,r7
943	$UMULH	r9,r6,r7
944	addc	r11,r8,r11
945	adde	r12,r9,r12
946	addze	r10,r10
947	$ST	r11,`4*$BNSZ`(r3)	#r[4]=c2
948					#mul_add_c(a[2],b[3],c3,c1,c2);
949	$LD	r6,`2*$BNSZ`(r4)
950	$UMULL	r8,r6,r7
951	$UMULH	r9,r6,r7
952	addc	r12,r8,r12
953	adde	r10,r9,r10
954	addze	r11,r0
955					#mul_add_c(a[3],b[2],c3,c1,c2);
956	$LD	r6,`3*$BNSZ`(r4)
957	$LD	r7,`2*$BNSZ`(r5)
958	$UMULL	r8,r6,r7
959	$UMULH	r9,r6,r7
960	addc	r12,r8,r12
961	adde	r10,r9,r10
962	addze	r11,r11
963	$ST	r12,`5*$BNSZ`(r3)	#r[5]=c3
964					#mul_add_c(a[3],b[3],c1,c2,c3);
965	$LD	r7,`3*$BNSZ`(r5)
966	$UMULL	r8,r6,r7
967	$UMULH	r9,r6,r7
968	addc	r10,r8,r10
969	adde	r11,r9,r11
970
971	$ST	r10,`6*$BNSZ`(r3)	#r[6]=c1
972	$ST	r11,`7*$BNSZ`(r3)	#r[7]=c2
973	blr
974	.long	0
975	.byte	0,12,0x14,0,0,0,3,0
976	.long	0
977.size	.bn_mul_comba4,.-.bn_mul_comba4
978
979#
980#	NOTE:	The following label name should be changed to
981#		"bn_mul_comba8" i.e. remove the first dot
982#		for the gcc compiler. This should be automatically
983#		done in the build
984#
985
986.align	4
987.bn_mul_comba8:
988#
989# Optimized version of the bn_mul_comba8 routine.
990#
991# void bn_mul_comba8(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
992# r3 contains r
993# r4 contains a
994# r5 contains b
995# r6, r7 are the 2 BN_ULONGs being multiplied.
996# r8, r9 are the results of the 32x32 giving 64 multiply.
997# r10, r11, r12 are the equivalents of c1, c2, and c3.
998#
999	xor	r0,r0,r0		#r0=0. Used in addze below.
1000
1001					#mul_add_c(a[0],b[0],c1,c2,c3);
1002	$LD	r6,`0*$BNSZ`(r4)	#a[0]
1003	$LD	r7,`0*$BNSZ`(r5)	#b[0]
1004	$UMULL	r10,r6,r7
1005	$UMULH	r11,r6,r7
1006	$ST	r10,`0*$BNSZ`(r3)	#r[0]=c1;
1007					#mul_add_c(a[0],b[1],c2,c3,c1);
1008	$LD	r7,`1*$BNSZ`(r5)
1009	$UMULL	r8,r6,r7
1010	$UMULH	r9,r6,r7
1011	addc	r11,r11,r8
	addze	r12,r9			# since we didn't set r12 to zero before.
1013	addze	r10,r0
1014					#mul_add_c(a[1],b[0],c2,c3,c1);
1015	$LD	r6,`1*$BNSZ`(r4)
1016	$LD	r7,`0*$BNSZ`(r5)
1017	$UMULL	r8,r6,r7
1018	$UMULH	r9,r6,r7
1019	addc	r11,r11,r8
1020	adde	r12,r12,r9
1021	addze	r10,r10
1022	$ST	r11,`1*$BNSZ`(r3)	#r[1]=c2;
1023					#mul_add_c(a[2],b[0],c3,c1,c2);
1024	$LD	r6,`2*$BNSZ`(r4)
1025	$UMULL	r8,r6,r7
1026	$UMULH	r9,r6,r7
1027	addc	r12,r12,r8
1028	adde	r10,r10,r9
1029	addze	r11,r0
1030					#mul_add_c(a[1],b[1],c3,c1,c2);
1031	$LD	r6,`1*$BNSZ`(r4)
1032	$LD	r7,`1*$BNSZ`(r5)
1033	$UMULL	r8,r6,r7
1034	$UMULH	r9,r6,r7
1035	addc	r12,r12,r8
1036	adde	r10,r10,r9
1037	addze	r11,r11
1038					#mul_add_c(a[0],b[2],c3,c1,c2);
1039	$LD	r6,`0*$BNSZ`(r4)
1040	$LD	r7,`2*$BNSZ`(r5)
1041	$UMULL	r8,r6,r7
1042	$UMULH	r9,r6,r7
1043	addc	r12,r12,r8
1044	adde	r10,r10,r9
1045	addze	r11,r11
1046	$ST	r12,`2*$BNSZ`(r3)	#r[2]=c3;
1047					#mul_add_c(a[0],b[3],c1,c2,c3);
1048	$LD	r7,`3*$BNSZ`(r5)
1049	$UMULL	r8,r6,r7
1050	$UMULH	r9,r6,r7
1051	addc	r10,r10,r8
1052	adde	r11,r11,r9
1053	addze	r12,r0
1054					#mul_add_c(a[1],b[2],c1,c2,c3);
1055	$LD	r6,`1*$BNSZ`(r4)
1056	$LD	r7,`2*$BNSZ`(r5)
1057	$UMULL	r8,r6,r7
1058	$UMULH	r9,r6,r7
1059	addc	r10,r10,r8
1060	adde	r11,r11,r9
1061	addze	r12,r12
1062
1063					#mul_add_c(a[2],b[1],c1,c2,c3);
1064	$LD	r6,`2*$BNSZ`(r4)
1065	$LD	r7,`1*$BNSZ`(r5)
1066	$UMULL	r8,r6,r7
1067	$UMULH	r9,r6,r7
1068	addc	r10,r10,r8
1069	adde	r11,r11,r9
1070	addze	r12,r12
1071					#mul_add_c(a[3],b[0],c1,c2,c3);
1072	$LD	r6,`3*$BNSZ`(r4)
1073	$LD	r7,`0*$BNSZ`(r5)
1074	$UMULL	r8,r6,r7
1075	$UMULH	r9,r6,r7
1076	addc	r10,r10,r8
1077	adde	r11,r11,r9
1078	addze	r12,r12
1079	$ST	r10,`3*$BNSZ`(r3)	#r[3]=c1;
1080					#mul_add_c(a[4],b[0],c2,c3,c1);
1081	$LD	r6,`4*$BNSZ`(r4)
1082	$UMULL	r8,r6,r7
1083	$UMULH	r9,r6,r7
1084	addc	r11,r11,r8
1085	adde	r12,r12,r9
1086	addze	r10,r0
1087					#mul_add_c(a[3],b[1],c2,c3,c1);
1088	$LD	r6,`3*$BNSZ`(r4)
1089	$LD	r7,`1*$BNSZ`(r5)
1090	$UMULL	r8,r6,r7
1091	$UMULH	r9,r6,r7
1092	addc	r11,r11,r8
1093	adde	r12,r12,r9
1094	addze	r10,r10
1095					#mul_add_c(a[2],b[2],c2,c3,c1);
1096	$LD	r6,`2*$BNSZ`(r4)
1097	$LD	r7,`2*$BNSZ`(r5)
1098	$UMULL	r8,r6,r7
1099	$UMULH	r9,r6,r7
1100	addc	r11,r11,r8
1101	adde	r12,r12,r9
1102	addze	r10,r10
1103					#mul_add_c(a[1],b[3],c2,c3,c1);
1104	$LD	r6,`1*$BNSZ`(r4)
1105	$LD	r7,`3*$BNSZ`(r5)
1106	$UMULL	r8,r6,r7
1107	$UMULH	r9,r6,r7
1108	addc	r11,r11,r8
1109	adde	r12,r12,r9
1110	addze	r10,r10
1111					#mul_add_c(a[0],b[4],c2,c3,c1);
1112	$LD	r6,`0*$BNSZ`(r4)
1113	$LD	r7,`4*$BNSZ`(r5)
1114	$UMULL	r8,r6,r7
1115	$UMULH	r9,r6,r7
1116	addc	r11,r11,r8
1117	adde	r12,r12,r9
1118	addze	r10,r10
1119	$ST	r11,`4*$BNSZ`(r3)	#r[4]=c2;
1120					#mul_add_c(a[0],b[5],c3,c1,c2);
1121	$LD	r7,`5*$BNSZ`(r5)
1122	$UMULL	r8,r6,r7
1123	$UMULH	r9,r6,r7
1124	addc	r12,r12,r8
1125	adde	r10,r10,r9
1126	addze	r11,r0
1127					#mul_add_c(a[1],b[4],c3,c1,c2);
1128	$LD	r6,`1*$BNSZ`(r4)
1129	$LD	r7,`4*$BNSZ`(r5)
1130	$UMULL	r8,r6,r7
1131	$UMULH	r9,r6,r7
1132	addc	r12,r12,r8
1133	adde	r10,r10,r9
1134	addze	r11,r11
1135					#mul_add_c(a[2],b[3],c3,c1,c2);
1136	$LD	r6,`2*$BNSZ`(r4)
1137	$LD	r7,`3*$BNSZ`(r5)
1138	$UMULL	r8,r6,r7
1139	$UMULH	r9,r6,r7
1140	addc	r12,r12,r8
1141	adde	r10,r10,r9
1142	addze	r11,r11
1143					#mul_add_c(a[3],b[2],c3,c1,c2);
1144	$LD	r6,`3*$BNSZ`(r4)
1145	$LD	r7,`2*$BNSZ`(r5)
1146	$UMULL	r8,r6,r7
1147	$UMULH	r9,r6,r7
1148	addc	r12,r12,r8
1149	adde	r10,r10,r9
1150	addze	r11,r11
1151					#mul_add_c(a[4],b[1],c3,c1,c2);
1152	$LD	r6,`4*$BNSZ`(r4)
1153	$LD	r7,`1*$BNSZ`(r5)
1154	$UMULL	r8,r6,r7
1155	$UMULH	r9,r6,r7
1156	addc	r12,r12,r8
1157	adde	r10,r10,r9
1158	addze	r11,r11
1159					#mul_add_c(a[5],b[0],c3,c1,c2);
1160	$LD	r6,`5*$BNSZ`(r4)
1161	$LD	r7,`0*$BNSZ`(r5)
1162	$UMULL	r8,r6,r7
1163	$UMULH	r9,r6,r7
1164	addc	r12,r12,r8
1165	adde	r10,r10,r9
1166	addze	r11,r11
1167	$ST	r12,`5*$BNSZ`(r3)	#r[5]=c3;
1168					#mul_add_c(a[6],b[0],c1,c2,c3);
1169	$LD	r6,`6*$BNSZ`(r4)
1170	$UMULL	r8,r6,r7
1171	$UMULH	r9,r6,r7
1172	addc	r10,r10,r8
1173	adde	r11,r11,r9
1174	addze	r12,r0
1175					#mul_add_c(a[5],b[1],c1,c2,c3);
1176	$LD	r6,`5*$BNSZ`(r4)
1177	$LD	r7,`1*$BNSZ`(r5)
1178	$UMULL	r8,r6,r7
1179	$UMULH	r9,r6,r7
1180	addc	r10,r10,r8
1181	adde	r11,r11,r9
1182	addze	r12,r12
1183					#mul_add_c(a[4],b[2],c1,c2,c3);
1184	$LD	r6,`4*$BNSZ`(r4)
1185	$LD	r7,`2*$BNSZ`(r5)
1186	$UMULL	r8,r6,r7
1187	$UMULH	r9,r6,r7
1188	addc	r10,r10,r8
1189	adde	r11,r11,r9
1190	addze	r12,r12
1191					#mul_add_c(a[3],b[3],c1,c2,c3);
1192	$LD	r6,`3*$BNSZ`(r4)
1193	$LD	r7,`3*$BNSZ`(r5)
1194	$UMULL	r8,r6,r7
1195	$UMULH	r9,r6,r7
1196	addc	r10,r10,r8
1197	adde	r11,r11,r9
1198	addze	r12,r12
1199					#mul_add_c(a[2],b[4],c1,c2,c3);
1200	$LD	r6,`2*$BNSZ`(r4)
1201	$LD	r7,`4*$BNSZ`(r5)
1202	$UMULL	r8,r6,r7
1203	$UMULH	r9,r6,r7
1204	addc	r10,r10,r8
1205	adde	r11,r11,r9
1206	addze	r12,r12
1207					#mul_add_c(a[1],b[5],c1,c2,c3);
1208	$LD	r6,`1*$BNSZ`(r4)
1209	$LD	r7,`5*$BNSZ`(r5)
1210	$UMULL	r8,r6,r7
1211	$UMULH	r9,r6,r7
1212	addc	r10,r10,r8
1213	adde	r11,r11,r9
1214	addze	r12,r12
1215					#mul_add_c(a[0],b[6],c1,c2,c3);
1216	$LD	r6,`0*$BNSZ`(r4)
1217	$LD	r7,`6*$BNSZ`(r5)
1218	$UMULL	r8,r6,r7
1219	$UMULH	r9,r6,r7
1220	addc	r10,r10,r8
1221	adde	r11,r11,r9
1222	addze	r12,r12
1223	$ST	r10,`6*$BNSZ`(r3)	#r[6]=c1;
1224					#mul_add_c(a[0],b[7],c2,c3,c1);
1225	$LD	r7,`7*$BNSZ`(r5)
1226	$UMULL	r8,r6,r7
1227	$UMULH	r9,r6,r7
1228	addc	r11,r11,r8
1229	adde	r12,r12,r9
1230	addze	r10,r0
1231					#mul_add_c(a[1],b[6],c2,c3,c1);
1232	$LD	r6,`1*$BNSZ`(r4)
1233	$LD	r7,`6*$BNSZ`(r5)
1234	$UMULL	r8,r6,r7
1235	$UMULH	r9,r6,r7
1236	addc	r11,r11,r8
1237	adde	r12,r12,r9
1238	addze	r10,r10
1239					#mul_add_c(a[2],b[5],c2,c3,c1);
1240	$LD	r6,`2*$BNSZ`(r4)
1241	$LD	r7,`5*$BNSZ`(r5)
1242	$UMULL	r8,r6,r7
1243	$UMULH	r9,r6,r7
1244	addc	r11,r11,r8
1245	adde	r12,r12,r9
1246	addze	r10,r10
1247					#mul_add_c(a[3],b[4],c2,c3,c1);
1248	$LD	r6,`3*$BNSZ`(r4)
1249	$LD	r7,`4*$BNSZ`(r5)
1250	$UMULL	r8,r6,r7
1251	$UMULH	r9,r6,r7
1252	addc	r11,r11,r8
1253	adde	r12,r12,r9
1254	addze	r10,r10
1255					#mul_add_c(a[4],b[3],c2,c3,c1);
1256	$LD	r6,`4*$BNSZ`(r4)
1257	$LD	r7,`3*$BNSZ`(r5)
1258	$UMULL	r8,r6,r7
1259	$UMULH	r9,r6,r7
1260	addc	r11,r11,r8
1261	adde	r12,r12,r9
1262	addze	r10,r10
1263					#mul_add_c(a[5],b[2],c2,c3,c1);
1264	$LD	r6,`5*$BNSZ`(r4)
1265	$LD	r7,`2*$BNSZ`(r5)
1266	$UMULL	r8,r6,r7
1267	$UMULH	r9,r6,r7
1268	addc	r11,r11,r8
1269	adde	r12,r12,r9
1270	addze	r10,r10
1271					#mul_add_c(a[6],b[1],c2,c3,c1);
1272	$LD	r6,`6*$BNSZ`(r4)
1273	$LD	r7,`1*$BNSZ`(r5)
1274	$UMULL	r8,r6,r7
1275	$UMULH	r9,r6,r7
1276	addc	r11,r11,r8
1277	adde	r12,r12,r9
1278	addze	r10,r10
1279					#mul_add_c(a[7],b[0],c2,c3,c1);
1280	$LD	r6,`7*$BNSZ`(r4)
1281	$LD	r7,`0*$BNSZ`(r5)
1282	$UMULL	r8,r6,r7
1283	$UMULH	r9,r6,r7
1284	addc	r11,r11,r8
1285	adde	r12,r12,r9
1286	addze	r10,r10
1287	$ST	r11,`7*$BNSZ`(r3)	#r[7]=c2;
1288					#mul_add_c(a[7],b[1],c3,c1,c2);
1289	$LD	r7,`1*$BNSZ`(r5)
1290	$UMULL	r8,r6,r7
1291	$UMULH	r9,r6,r7
1292	addc	r12,r12,r8
1293	adde	r10,r10,r9
1294	addze	r11,r0
1295					#mul_add_c(a[6],b[2],c3,c1,c2);
1296	$LD	r6,`6*$BNSZ`(r4)
1297	$LD	r7,`2*$BNSZ`(r5)
1298	$UMULL	r8,r6,r7
1299	$UMULH	r9,r6,r7
1300	addc	r12,r12,r8
1301	adde	r10,r10,r9
1302	addze	r11,r11
1303					#mul_add_c(a[5],b[3],c3,c1,c2);
1304	$LD	r6,`5*$BNSZ`(r4)
1305	$LD	r7,`3*$BNSZ`(r5)
1306	$UMULL	r8,r6,r7
1307	$UMULH	r9,r6,r7
1308	addc	r12,r12,r8
1309	adde	r10,r10,r9
1310	addze	r11,r11
1311					#mul_add_c(a[4],b[4],c3,c1,c2);
1312	$LD	r6,`4*$BNSZ`(r4)
1313	$LD	r7,`4*$BNSZ`(r5)
1314	$UMULL	r8,r6,r7
1315	$UMULH	r9,r6,r7
1316	addc	r12,r12,r8
1317	adde	r10,r10,r9
1318	addze	r11,r11
1319					#mul_add_c(a[3],b[5],c3,c1,c2);
1320	$LD	r6,`3*$BNSZ`(r4)
1321	$LD	r7,`5*$BNSZ`(r5)
1322	$UMULL	r8,r6,r7
1323	$UMULH	r9,r6,r7
1324	addc	r12,r12,r8
1325	adde	r10,r10,r9
1326	addze	r11,r11
1327					#mul_add_c(a[2],b[6],c3,c1,c2);
1328	$LD	r6,`2*$BNSZ`(r4)
1329	$LD	r7,`6*$BNSZ`(r5)
1330	$UMULL	r8,r6,r7
1331	$UMULH	r9,r6,r7
1332	addc	r12,r12,r8
1333	adde	r10,r10,r9
1334	addze	r11,r11
1335					#mul_add_c(a[1],b[7],c3,c1,c2);
1336	$LD	r6,`1*$BNSZ`(r4)
1337	$LD	r7,`7*$BNSZ`(r5)
1338	$UMULL	r8,r6,r7
1339	$UMULH	r9,r6,r7
1340	addc	r12,r12,r8
1341	adde	r10,r10,r9
1342	addze	r11,r11
1343	$ST	r12,`8*$BNSZ`(r3)	#r[8]=c3;
1344					#mul_add_c(a[2],b[7],c1,c2,c3);
1345	$LD	r6,`2*$BNSZ`(r4)
1346	$UMULL	r8,r6,r7
1347	$UMULH	r9,r6,r7
1348	addc	r10,r10,r8
1349	adde	r11,r11,r9
1350	addze	r12,r0
1351					#mul_add_c(a[3],b[6],c1,c2,c3);
1352	$LD	r6,`3*$BNSZ`(r4)
1353	$LD	r7,`6*$BNSZ`(r5)
1354	$UMULL	r8,r6,r7
1355	$UMULH	r9,r6,r7
1356	addc	r10,r10,r8
1357	adde	r11,r11,r9
1358	addze	r12,r12
1359					#mul_add_c(a[4],b[5],c1,c2,c3);
1360	$LD	r6,`4*$BNSZ`(r4)
1361	$LD	r7,`5*$BNSZ`(r5)
1362	$UMULL	r8,r6,r7
1363	$UMULH	r9,r6,r7
1364	addc	r10,r10,r8
1365	adde	r11,r11,r9
1366	addze	r12,r12
1367					#mul_add_c(a[5],b[4],c1,c2,c3);
1368	$LD	r6,`5*$BNSZ`(r4)
1369	$LD	r7,`4*$BNSZ`(r5)
1370	$UMULL	r8,r6,r7
1371	$UMULH	r9,r6,r7
1372	addc	r10,r10,r8
1373	adde	r11,r11,r9
1374	addze	r12,r12
1375					#mul_add_c(a[6],b[3],c1,c2,c3);
1376	$LD	r6,`6*$BNSZ`(r4)
1377	$LD	r7,`3*$BNSZ`(r5)
1378	$UMULL	r8,r6,r7
1379	$UMULH	r9,r6,r7
1380	addc	r10,r10,r8
1381	adde	r11,r11,r9
1382	addze	r12,r12
1383					#mul_add_c(a[7],b[2],c1,c2,c3);
1384	$LD	r6,`7*$BNSZ`(r4)
1385	$LD	r7,`2*$BNSZ`(r5)
1386	$UMULL	r8,r6,r7
1387	$UMULH	r9,r6,r7
1388	addc	r10,r10,r8
1389	adde	r11,r11,r9
1390	addze	r12,r12
1391	$ST	r10,`9*$BNSZ`(r3)	#r[9]=c1;
1392					#mul_add_c(a[7],b[3],c2,c3,c1);
1393	$LD	r7,`3*$BNSZ`(r5)
1394	$UMULL	r8,r6,r7
1395	$UMULH	r9,r6,r7
1396	addc	r11,r11,r8
1397	adde	r12,r12,r9
1398	addze	r10,r0
1399					#mul_add_c(a[6],b[4],c2,c3,c1);
1400	$LD	r6,`6*$BNSZ`(r4)
1401	$LD	r7,`4*$BNSZ`(r5)
1402	$UMULL	r8,r6,r7
1403	$UMULH	r9,r6,r7
1404	addc	r11,r11,r8
1405	adde	r12,r12,r9
1406	addze	r10,r10
1407					#mul_add_c(a[5],b[5],c2,c3,c1);
1408	$LD	r6,`5*$BNSZ`(r4)
1409	$LD	r7,`5*$BNSZ`(r5)
1410	$UMULL	r8,r6,r7
1411	$UMULH	r9,r6,r7
1412	addc	r11,r11,r8
1413	adde	r12,r12,r9
1414	addze	r10,r10
1415					#mul_add_c(a[4],b[6],c2,c3,c1);
1416	$LD	r6,`4*$BNSZ`(r4)
1417	$LD	r7,`6*$BNSZ`(r5)
1418	$UMULL	r8,r6,r7
1419	$UMULH	r9,r6,r7
1420	addc	r11,r11,r8
1421	adde	r12,r12,r9
1422	addze	r10,r10
1423					#mul_add_c(a[3],b[7],c2,c3,c1);
1424	$LD	r6,`3*$BNSZ`(r4)
1425	$LD	r7,`7*$BNSZ`(r5)
1426	$UMULL	r8,r6,r7
1427	$UMULH	r9,r6,r7
1428	addc	r11,r11,r8
1429	adde	r12,r12,r9
1430	addze	r10,r10
1431	$ST	r11,`10*$BNSZ`(r3)	#r[10]=c2;
1432					#mul_add_c(a[4],b[7],c3,c1,c2);
1433	$LD	r6,`4*$BNSZ`(r4)
1434	$UMULL	r8,r6,r7
1435	$UMULH	r9,r6,r7
1436	addc	r12,r12,r8
1437	adde	r10,r10,r9
1438	addze	r11,r0
1439					#mul_add_c(a[5],b[6],c3,c1,c2);
1440	$LD	r6,`5*$BNSZ`(r4)
1441	$LD	r7,`6*$BNSZ`(r5)
1442	$UMULL	r8,r6,r7
1443	$UMULH	r9,r6,r7
1444	addc	r12,r12,r8
1445	adde	r10,r10,r9
1446	addze	r11,r11
1447					#mul_add_c(a[6],b[5],c3,c1,c2);
1448	$LD	r6,`6*$BNSZ`(r4)
1449	$LD	r7,`5*$BNSZ`(r5)
1450	$UMULL	r8,r6,r7
1451	$UMULH	r9,r6,r7
1452	addc	r12,r12,r8
1453	adde	r10,r10,r9
1454	addze	r11,r11
1455					#mul_add_c(a[7],b[4],c3,c1,c2);
1456	$LD	r6,`7*$BNSZ`(r4)
1457	$LD	r7,`4*$BNSZ`(r5)
1458	$UMULL	r8,r6,r7
1459	$UMULH	r9,r6,r7
1460	addc	r12,r12,r8
1461	adde	r10,r10,r9
1462	addze	r11,r11
1463	$ST	r12,`11*$BNSZ`(r3)	#r[11]=c3;
1464					#mul_add_c(a[7],b[5],c1,c2,c3);
1465	$LD	r7,`5*$BNSZ`(r5)
1466	$UMULL	r8,r6,r7
1467	$UMULH	r9,r6,r7
1468	addc	r10,r10,r8
1469	adde	r11,r11,r9
1470	addze	r12,r0
1471					#mul_add_c(a[6],b[6],c1,c2,c3);
1472	$LD	r6,`6*$BNSZ`(r4)
1473	$LD	r7,`6*$BNSZ`(r5)
1474	$UMULL	r8,r6,r7
1475	$UMULH	r9,r6,r7
1476	addc	r10,r10,r8
1477	adde	r11,r11,r9
1478	addze	r12,r12
1479					#mul_add_c(a[5],b[7],c1,c2,c3);
1480	$LD	r6,`5*$BNSZ`(r4)
1481	$LD	r7,`7*$BNSZ`(r5)
1482	$UMULL	r8,r6,r7
1483	$UMULH	r9,r6,r7
1484	addc	r10,r10,r8
1485	adde	r11,r11,r9
1486	addze	r12,r12
1487	$ST	r10,`12*$BNSZ`(r3)	#r[12]=c1;
1488					#mul_add_c(a[6],b[7],c2,c3,c1);
1489	$LD	r6,`6*$BNSZ`(r4)
1490	$UMULL	r8,r6,r7
1491	$UMULH	r9,r6,r7
1492	addc	r11,r11,r8
1493	adde	r12,r12,r9
1494	addze	r10,r0
1495					#mul_add_c(a[7],b[6],c2,c3,c1);
1496	$LD	r6,`7*$BNSZ`(r4)
1497	$LD	r7,`6*$BNSZ`(r5)
1498	$UMULL	r8,r6,r7
1499	$UMULH	r9,r6,r7
1500	addc	r11,r11,r8
1501	adde	r12,r12,r9
1502	addze	r10,r10
1503	$ST	r11,`13*$BNSZ`(r3)	#r[13]=c2;
1504					#mul_add_c(a[7],b[7],c3,c1,c2);
1505	$LD	r7,`7*$BNSZ`(r5)
1506	$UMULL	r8,r6,r7
1507	$UMULH	r9,r6,r7
1508	addc	r12,r12,r8
1509	adde	r10,r10,r9
1510	$ST	r12,`14*$BNSZ`(r3)	#r[14]=c3;
1511	$ST	r10,`15*$BNSZ`(r3)	#r[15]=c1;
1512	blr
1513	.long	0
1514	.byte	0,12,0x14,0,0,0,3,0
1515	.long	0
1516.size	.bn_mul_comba8,.-.bn_mul_comba8
1517
1518#
1519#	NOTE:	The following label name should be changed to
1520#		"bn_sub_words" i.e. remove the first dot
1521#		for the gcc compiler. This should be automatically
1522#		done in the build
1523#
1524#
1525.align	4
1526.bn_sub_words:
1527#
1528#	Handcoded version of bn_sub_words
1529#
1530#BN_ULONG bn_sub_words(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b, int n)
1531#
1532#	r3 = r
1533#	r4 = a
1534#	r5 = b
1535#	r6 = n
1536#
1537#       Note:	No loop unrolling done since this is not a performance
1538#               critical loop.
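#	A rough C-level sketch of the word-wise subtract with borrow done
#	here (the final borrow, 0 or 1, is returned in r3):
#		c = 0;
#		for (i = 0; i < n; i++) {
#			t1 = a[i]; t2 = b[i];
#			r[i] = t1 - t2 - c;
#			if (t1 != t2) c = (t1 < t2);
#		}
#		return c;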
1539
1540	xor	r0,r0,r0	#set r0 = 0
1541#
1542#	check for r6 = 0 AND set carry bit.
1543#
1544	subfc.	r7,r0,r6        # If r6 is 0 then result is 0.
1545				# if r6 > 0 then result !=0
1546				# In either case carry bit is set.
1547	beq	Lppcasm_sub_adios
1548	addi	r4,r4,-$BNSZ
1549	addi	r3,r3,-$BNSZ
1550	addi	r5,r5,-$BNSZ
1551	mtctr	r6
1552Lppcasm_sub_mainloop:
1553	$LDU	r7,$BNSZ(r4)
1554	$LDU	r8,$BNSZ(r5)
1555	subfe	r6,r8,r7	# r6 = r7+carry bit + onescomplement(r8)
1556				# if carry = 1 this is r7-r8. Else it
1557				# is r7-r8 -1 as we need.
1558	$STU	r6,$BNSZ(r3)
1559	bdnz-	Lppcasm_sub_mainloop
1560Lppcasm_sub_adios:
1561	subfze	r3,r0		# if carry bit is set then r3 = 0 else -1
1562	andi.	r3,r3,1         # keep only last bit.
1563	blr
1564	.long	0
1565	.byte	0,12,0x14,0,0,0,4,0
1566	.long	0
1567.size	.bn_sub_words,.-.bn_sub_words
1568
1569#
1570#	NOTE:	The following label name should be changed to
1571#		"bn_add_words" i.e. remove the first dot
1572#		for the gcc compiler. This should be automatically
1573#		done in the build
1574#
1575
1576.align	4
1577.bn_add_words:
1578#
1579#	Handcoded version of bn_add_words
1580#
1581#BN_ULONG bn_add_words(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b, int n)
1582#
1583#	r3 = r
1584#	r4 = a
1585#	r5 = b
1586#	r6 = n
1587#
1588#       Note:	No loop unrolling done since this is not a performance
1589#               critical loop.
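#	A rough C-level sketch of the word-wise add with carry done here
#	(the final carry, 0 or 1, is returned in r3):
#		c = 0;
#		for (i = 0; i < n; i++) {
#			t = a[i] + b[i] + c;
#			c = (t < a[i]) || (c && (t == a[i]));
#			r[i] = t;
#		}
#		return c;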
1590
1591	xor	r0,r0,r0
1592#
1593#	check for r6 = 0. Is this needed?
1594#
1595	addic.	r6,r6,0		#test r6 and clear carry bit.
1596	beq	Lppcasm_add_adios
1597	addi	r4,r4,-$BNSZ
1598	addi	r3,r3,-$BNSZ
1599	addi	r5,r5,-$BNSZ
1600	mtctr	r6
1601Lppcasm_add_mainloop:
1602	$LDU	r7,$BNSZ(r4)
1603	$LDU	r8,$BNSZ(r5)
1604	adde	r8,r7,r8
1605	$STU	r8,$BNSZ(r3)
1606	bdnz-	Lppcasm_add_mainloop
1607Lppcasm_add_adios:
1608	addze	r3,r0			#return carry bit.
1609	blr
1610	.long	0
1611	.byte	0,12,0x14,0,0,0,4,0
1612	.long	0
1613.size	.bn_add_words,.-.bn_add_words
1614
1615#
1616#	NOTE:	The following label name should be changed to
1617#		"bn_div_words" i.e. remove the first dot
1618#		for the gcc compiler. This should be automatically
1619#		done in the build
1620#
1621
1622.align	4
1623.bn_div_words:
1624#
#	This is a cleaned-up version of code generated by
#	the AIX compiler. The only optimization is to use
#	the PPC instruction to count leading zeros instead
#	of a call to num_bits_word. Since this was compiled
#	only at level -O2 we could possibly squeeze more out of it.
1630#
1631#	r3 = h
1632#	r4 = l
1633#	r5 = d
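#	Roughly: returns the quotient of the double word (h,l) divided by d
#	as a single word, computed in two BN_BITS4-bit steps. Returns -1 if
#	d is zero, and traps (the conditional trap below) if h has more
#	significant bits than d allows.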
1634
1635	$UCMPI	0,r5,0			# compare r5 and 0
1636	bne	Lppcasm_div1		# proceed if d!=0
1637	li	r3,-1			# d=0 return -1
1638	blr
1639Lppcasm_div1:
1640	xor	r0,r0,r0		#r0=0
1641	li	r8,$BITS
1642	$CNTLZ.	r7,r5			#r7 = num leading 0s in d.
1643	beq	Lppcasm_div2		#proceed if no leading zeros
1644	subf	r8,r7,r8		#r8 = BN_num_bits_word(d)
1645	$SHR.	r9,r3,r8		#are there any bits above r8'th?
1646	$TR	16,r9,r0		#if there're, signal to dump core...
1647Lppcasm_div2:
1648	$UCMP	0,r3,r5			#h>=d?
1649	blt	Lppcasm_div3		#goto Lppcasm_div3 if not
1650	subf	r3,r5,r3		#h-=d ;
1651Lppcasm_div3:				#r7 = BN_BITS2-i. so r7=i
1652	cmpi	0,0,r7,0		# is (i == 0)?
1653	beq	Lppcasm_div4
1654	$SHL	r3,r3,r7		# h = (h<< i)
1655	$SHR	r8,r4,r8		# r8 = (l >> BN_BITS2 -i)
1656	$SHL	r5,r5,r7		# d<<=i
1657	or	r3,r3,r8		# h = (h<<i)|(l>>(BN_BITS2-i))
1658	$SHL	r4,r4,r7		# l <<=i
1659Lppcasm_div4:
1660	$SHRI	r9,r5,`$BITS/2`		# r9 = dh
1661					# dl will be computed when needed
1662					# as it saves registers.
1663	li	r6,2			#r6=2
1664	mtctr	r6			#counter will be in count.
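					# Two passes through the outer loop:
					# each pass computes one BN_BITS4-bit
					# quotient digit (high half first); the
					# halves are combined at Lppcasm_div9.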
1665Lppcasm_divouterloop:
1666	$SHRI	r8,r3,`$BITS/2`		#r8 = (h>>BN_BITS4)
1667	$SHRI	r11,r4,`$BITS/2`	#r11= (l&BN_MASK2h)>>BN_BITS4
1668					# compute here for innerloop.
1669	$UCMP	0,r8,r9			# is (h>>BN_BITS4)==dh
1670	bne	Lppcasm_div5		# goto Lppcasm_div5 if not
1671
1672	li	r8,-1
1673	$CLRU	r8,r8,`$BITS/2`		#q = BN_MASK2l
1674	b	Lppcasm_div6
1675Lppcasm_div5:
1676	$UDIV	r8,r3,r9		#q = h/dh
1677Lppcasm_div6:
1678	$UMULL	r12,r9,r8		#th = q*dh
1679	$CLRU	r10,r5,`$BITS/2`	#r10=dl
1680	$UMULL	r6,r8,r10		#tl = q*dl
1681
1682Lppcasm_divinnerloop:
1683	subf	r10,r12,r3		#t = h -th
1684	$SHRI	r7,r10,`$BITS/2`	#r7= (t &BN_MASK2H), sort of...
1685	addic.	r7,r7,0			#test if r7 == 0. used below.
1686					# now want to compute
1687					# r7 = (t<<BN_BITS4)|((l&BN_MASK2h)>>BN_BITS4)
1688					# the following 2 instructions do that
1689	$SHLI	r7,r10,`$BITS/2`	# r7 = (t<<BN_BITS4)
1690	or	r7,r7,r11		# r7|=((l&BN_MASK2h)>>BN_BITS4)
1691	$UCMP	cr1,r6,r7		# compare (tl <= r7)
1692	bne	Lppcasm_divinnerexit
1693	ble	cr1,Lppcasm_divinnerexit
1694	addi	r8,r8,-1		#q--
1695	subf	r12,r9,r12		#th -=dh
1696	$CLRU	r10,r5,`$BITS/2`	#r10=dl. t is no longer needed in loop.
1697	subf	r6,r10,r6		#tl -=dl
1698	b	Lppcasm_divinnerloop
1699Lppcasm_divinnerexit:
1700	$SHRI	r10,r6,`$BITS/2`	#t=(tl>>BN_BITS4)
1701	$SHLI	r11,r6,`$BITS/2`	#tl=(tl<<BN_BITS4)&BN_MASK2h;
1702	$UCMP	cr1,r4,r11		# compare l and tl
1703	add	r12,r12,r10		# th+=t
1704	bge	cr1,Lppcasm_div7	# if (l>=tl) goto Lppcasm_div7
1705	addi	r12,r12,1		# th++
1706Lppcasm_div7:
1707	subf	r11,r11,r4		#r11=l-tl
1708	$UCMP	cr1,r3,r12		#compare h and th
1709	bge	cr1,Lppcasm_div8	#if (h>=th) goto Lppcasm_div8
1710	addi	r8,r8,-1		# q--
1711	add	r3,r5,r3		# h+=d
1712Lppcasm_div8:
1713	subf	r12,r12,r3		#r12 = h-th
1714	$SHLI	r4,r11,`$BITS/2`	#l=(l&BN_MASK2l)<<BN_BITS4
1715					# want to compute
1716					# h = ((h<<BN_BITS4)|(l>>BN_BITS4))&BN_MASK2
1717					# the following 2 instructions will do this.
1718	$INSR	r11,r12,`$BITS/2`,`$BITS/2`	# r11 is the value we want rotated $BITS/2.
1719	$ROTL	r3,r11,`$BITS/2`	# rotate by $BITS/2 and store in r3
1720	bdz	Lppcasm_div9		#if (count==0) break ;
1721	$SHLI	r0,r8,`$BITS/2`		#ret =q<<BN_BITS4
1722	b	Lppcasm_divouterloop
1723Lppcasm_div9:
1724	or	r3,r8,r0
1725	blr
1726	.long	0
1727	.byte	0,12,0x14,0,0,0,3,0
1728	.long	0
1729.size	.bn_div_words,.-.bn_div_words
1730
1731#
1732#	NOTE:	The following label name should be changed to
1733#		"bn_sqr_words" i.e. remove the first dot
1734#		for the gcc compiler. This should be automatically
1735#		done in the build
1736#
1737.align	4
1738.bn_sqr_words:
1739#
1740#	Optimized version of bn_sqr_words
1741#
1742#	void bn_sqr_words(BN_ULONG *r, BN_ULONG *a, int n)
1743#
1744#	r3 = r
1745#	r4 = a
1746#	r5 = n
1747#
1748#	r6 = a[i].
1749#	r7,r8 = product.
1750#
1751#	No unrolling done here. Not performance critical.
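#	A rough C-level sketch: the double-wide square of each word is
#	stored as two result words,
#		r[2*i]   = lo(a[i]*a[i]);
#		r[2*i+1] = hi(a[i]*a[i]);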
1752
1753	addic.	r5,r5,0			#test r5.
1754	beq	Lppcasm_sqr_adios
1755	addi	r4,r4,-$BNSZ
1756	addi	r3,r3,-$BNSZ
1757	mtctr	r5
1758Lppcasm_sqr_mainloop:
1759					#sqr(r[0],r[1],a[0]);
1760	$LDU	r6,$BNSZ(r4)
1761	$UMULL	r7,r6,r6
1762	$UMULH  r8,r6,r6
1763	$STU	r7,$BNSZ(r3)
1764	$STU	r8,$BNSZ(r3)
1765	bdnz-	Lppcasm_sqr_mainloop
1766Lppcasm_sqr_adios:
1767	blr
1768	.long	0
1769	.byte	0,12,0x14,0,0,0,3,0
1770	.long	0
1771.size	.bn_sqr_words,.-.bn_sqr_words
1772
1773#
1774#	NOTE:	The following label name should be changed to
1775#		"bn_mul_words" i.e. remove the first dot
1776#		for the gcc compiler. This should be automatically
1777#		done in the build
1778#
1779
1780.align	4
1781.bn_mul_words:
1782#
1783# BN_ULONG bn_mul_words(BN_ULONG *rp, BN_ULONG *ap, int num, BN_ULONG w)
1784#
1785# r3 = rp
1786# r4 = ap
1787# r5 = num
1788# r6 = w
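# A rough C-level sketch of mul(r, a, w, c) as used in the comments below
# (lo()/hi() are the low/high words of the double-wide product):
#	t = a*w + c;  r = lo(t);  c = hi(t);
# The carry is chained through r10/r12 across the unrolled iterations and
# the final carry is returned in r3.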
1789	xor	r0,r0,r0
1790	xor	r12,r12,r12		# used for carry
1791	rlwinm.	r7,r5,30,2,31		# num >> 2
1792	beq	Lppcasm_mw_REM
1793	mtctr	r7
1794Lppcasm_mw_LOOP:
1795					#mul(rp[0],ap[0],w,c1);
1796	$LD	r8,`0*$BNSZ`(r4)
1797	$UMULL	r9,r6,r8
1798	$UMULH  r10,r6,r8
1799	addc	r9,r9,r12
1800	#addze	r10,r10			#carry is NOT ignored.
1801					#will be taken care of
1802					#in second spin below
1803					#using adde.
1804	$ST	r9,`0*$BNSZ`(r3)
1805					#mul(rp[1],ap[1],w,c1);
1806	$LD	r8,`1*$BNSZ`(r4)
1807	$UMULL	r11,r6,r8
1808	$UMULH  r12,r6,r8
1809	adde	r11,r11,r10
1810	#addze	r12,r12
1811	$ST	r11,`1*$BNSZ`(r3)
1812					#mul(rp[2],ap[2],w,c1);
1813	$LD	r8,`2*$BNSZ`(r4)
1814	$UMULL	r9,r6,r8
1815	$UMULH  r10,r6,r8
1816	adde	r9,r9,r12
1817	#addze	r10,r10
1818	$ST	r9,`2*$BNSZ`(r3)
					#mul(rp[3],ap[3],w,c1);
1820	$LD	r8,`3*$BNSZ`(r4)
1821	$UMULL	r11,r6,r8
1822	$UMULH  r12,r6,r8
1823	adde	r11,r11,r10
1824	addze	r12,r12			#this spin we collect carry into
1825					#r12
1826	$ST	r11,`3*$BNSZ`(r3)
1827
1828	addi	r3,r3,`4*$BNSZ`
1829	addi	r4,r4,`4*$BNSZ`
1830	bdnz-	Lppcasm_mw_LOOP
1831
1832Lppcasm_mw_REM:
1833	andi.	r5,r5,0x3
1834	beq	Lppcasm_mw_OVER
1835					#mul(rp[0],ap[0],w,c1);
1836	$LD	r8,`0*$BNSZ`(r4)
1837	$UMULL	r9,r6,r8
1838	$UMULH  r10,r6,r8
1839	addc	r9,r9,r12
1840	addze	r10,r10
1841	$ST	r9,`0*$BNSZ`(r3)
1842	addi	r12,r10,0
1843
1844	addi	r5,r5,-1
1845	cmpli	0,0,r5,0
1846	beq	Lppcasm_mw_OVER
1847
1848
1849					#mul(rp[1],ap[1],w,c1);
1850	$LD	r8,`1*$BNSZ`(r4)
1851	$UMULL	r9,r6,r8
1852	$UMULH  r10,r6,r8
1853	addc	r9,r9,r12
1854	addze	r10,r10
1855	$ST	r9,`1*$BNSZ`(r3)
1856	addi	r12,r10,0
1857
1858	addi	r5,r5,-1
1859	cmpli	0,0,r5,0
1860	beq	Lppcasm_mw_OVER
1861
					#mul(rp[2],ap[2],w,c1);
1863	$LD	r8,`2*$BNSZ`(r4)
1864	$UMULL	r9,r6,r8
1865	$UMULH  r10,r6,r8
1866	addc	r9,r9,r12
1867	addze	r10,r10
1868	$ST	r9,`2*$BNSZ`(r3)
1869	addi	r12,r10,0
1870
1871Lppcasm_mw_OVER:
1872	addi	r3,r12,0
1873	blr
1874	.long	0
1875	.byte	0,12,0x14,0,0,0,4,0
1876	.long	0
1877.size	bn_mul_words,.-bn_mul_words
1878
1879#
1880#	NOTE:	The following label name should be changed to
1881#		"bn_mul_add_words" i.e. remove the first dot
1882#		for the gcc compiler. This should be automatically
1883#		done in the build
1884#
1885
1886.align	4
1887.bn_mul_add_words:
1888#
1889# BN_ULONG bn_mul_add_words(BN_ULONG *rp, BN_ULONG *ap, int num, BN_ULONG w)
1890#
1891# r3 = rp
1892# r4 = ap
1893# r5 = num
1894# r6 = w
1895#
# empirical evidence suggests that the unrolled version performs best!!
1897#
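# A rough C-level sketch of mul_add(r, a, w, c) as used in the comments below:
#	t = a*w + c + r;  r = lo(t);  c = hi(t);
# The final carry is returned in r3.
#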
1898	xor	r0,r0,r0		#r0 = 0
1899	xor	r12,r12,r12  		#r12 = 0 . used for carry
1900	rlwinm.	r7,r5,30,2,31		# num >> 2
	beq	Lppcasm_maw_leftover	# if (num < 4) goto Lppcasm_maw_leftover
1902	mtctr	r7
1903Lppcasm_maw_mainloop:
1904					#mul_add(rp[0],ap[0],w,c1);
1905	$LD	r8,`0*$BNSZ`(r4)
1906	$LD	r11,`0*$BNSZ`(r3)
1907	$UMULL	r9,r6,r8
1908	$UMULH  r10,r6,r8
1909	addc	r9,r9,r12		#r12 is carry.
1910	addze	r10,r10
1911	addc	r9,r9,r11
1912	#addze	r10,r10
1913					#the above instruction addze
1914					#is NOT needed. Carry will NOT
1915					#be ignored. It's not affected
1916					#by multiply and will be collected
1917					#in the next spin
1918	$ST	r9,`0*$BNSZ`(r3)
1919
1920					#mul_add(rp[1],ap[1],w,c1);
1921	$LD	r8,`1*$BNSZ`(r4)
1922	$LD	r9,`1*$BNSZ`(r3)
1923	$UMULL	r11,r6,r8
1924	$UMULH  r12,r6,r8
1925	adde	r11,r11,r10		#r10 is carry.
1926	addze	r12,r12
1927	addc	r11,r11,r9
1928	#addze	r12,r12
1929	$ST	r11,`1*$BNSZ`(r3)
1930
1931					#mul_add(rp[2],ap[2],w,c1);
1932	$LD	r8,`2*$BNSZ`(r4)
1933	$UMULL	r9,r6,r8
1934	$LD	r11,`2*$BNSZ`(r3)
1935	$UMULH  r10,r6,r8
1936	adde	r9,r9,r12
1937	addze	r10,r10
1938	addc	r9,r9,r11
1939	#addze	r10,r10
1940	$ST	r9,`2*$BNSZ`(r3)
1941
1942					#mul_add(rp[3],ap[3],w,c1);
1943	$LD	r8,`3*$BNSZ`(r4)
1944	$UMULL	r11,r6,r8
1945	$LD	r9,`3*$BNSZ`(r3)
1946	$UMULH  r12,r6,r8
1947	adde	r11,r11,r10
1948	addze	r12,r12
1949	addc	r11,r11,r9
1950	addze	r12,r12
1951	$ST	r11,`3*$BNSZ`(r3)
1952	addi	r3,r3,`4*$BNSZ`
1953	addi	r4,r4,`4*$BNSZ`
1954	bdnz-	Lppcasm_maw_mainloop
1955
1956Lppcasm_maw_leftover:
1957	andi.	r5,r5,0x3
1958	beq	Lppcasm_maw_adios
1959	addi	r3,r3,-$BNSZ
1960	addi	r4,r4,-$BNSZ
1961					#mul_add(rp[0],ap[0],w,c1);
1962	mtctr	r5
1963	$LDU	r8,$BNSZ(r4)
1964	$UMULL	r9,r6,r8
1965	$UMULH  r10,r6,r8
1966	$LDU	r11,$BNSZ(r3)
1967	addc	r9,r9,r11
1968	addze	r10,r10
1969	addc	r9,r9,r12
1970	addze	r12,r10
1971	$ST	r9,0(r3)
1972
1973	bdz	Lppcasm_maw_adios
1974					#mul_add(rp[1],ap[1],w,c1);
1975	$LDU	r8,$BNSZ(r4)
1976	$UMULL	r9,r6,r8
1977	$UMULH  r10,r6,r8
1978	$LDU	r11,$BNSZ(r3)
1979	addc	r9,r9,r11
1980	addze	r10,r10
1981	addc	r9,r9,r12
1982	addze	r12,r10
1983	$ST	r9,0(r3)
1984
1985	bdz	Lppcasm_maw_adios
1986					#mul_add(rp[2],ap[2],w,c1);
1987	$LDU	r8,$BNSZ(r4)
1988	$UMULL	r9,r6,r8
1989	$UMULH  r10,r6,r8
1990	$LDU	r11,$BNSZ(r3)
1991	addc	r9,r9,r11
1992	addze	r10,r10
1993	addc	r9,r9,r12
1994	addze	r12,r10
1995	$ST	r9,0(r3)
1996
1997Lppcasm_maw_adios:
1998	addi	r3,r12,0
1999	blr
2000	.long	0
2001	.byte	0,12,0x14,0,0,0,4,0
2002	.long	0
2003.size	.bn_mul_add_words,.-.bn_mul_add_words
2004	.align	4
2005EOF
2006$data =~ s/\`([^\`]*)\`/eval $1/gem;
2007print $data;
2008close STDOUT;
2009