xref: /freebsd/crypto/openssl/crypto/bn/asm/ppc.pl (revision 39beb93c3f8bdbf72a61fda42300b5ebed7390c8)
1#!/usr/bin/env perl
2#
3# Implemented as a Perl wrapper as we want to support several different
4# architectures with single file. We pick up the target based on the
5# file name we are asked to generate.
6#
7# It should be noted though that this perl code is nothing like
8# <openssl>/crypto/perlasm/x86*. In this case perl is used pretty much
9# as pre-processor to cover for platform differences in name decoration,
10# linker tables, 32-/64-bit instruction sets...
11#
# As you might know there are several PowerPC ABIs in use. Most notably
# Linux and AIX use different 32-bit ABIs. The good news is that these ABIs
14# are similar enough to implement leaf(!) functions, which would be ABI
15# neutral. And that's what you find here: ABI neutral leaf functions.
16# In case you wonder what that is...
17#
18#       AIX performance
19#
#	MEASUREMENTS WITH cc ON a 200 MHz PowerPC 604e.
21#
22#	The following is the performance of 32-bit compiler
23#	generated code:
24#
25#	OpenSSL 0.9.6c 21 dec 2001
26#	built on: Tue Jun 11 11:06:51 EDT 2002
27#	options:bn(64,32) ...
28#compiler: cc -DTHREADS  -DAIX -DB_ENDIAN -DBN_LLONG -O3
29#                  sign    verify    sign/s verify/s
30#rsa  512 bits   0.0098s   0.0009s    102.0   1170.6
31#rsa 1024 bits   0.0507s   0.0026s     19.7    387.5
32#rsa 2048 bits   0.3036s   0.0085s      3.3    117.1
33#rsa 4096 bits   2.0040s   0.0299s      0.5     33.4
34#dsa  512 bits   0.0087s   0.0106s    114.3     94.5
35#dsa 1024 bits   0.0256s   0.0313s     39.0     32.0
36#
#	Same benchmark with this assembler code:
38#
39#rsa  512 bits   0.0056s   0.0005s    178.6   2049.2
40#rsa 1024 bits   0.0283s   0.0015s     35.3    674.1
41#rsa 2048 bits   0.1744s   0.0050s      5.7    201.2
42#rsa 4096 bits   1.1644s   0.0179s      0.9     55.7
43#dsa  512 bits   0.0052s   0.0062s    191.6    162.0
44#dsa 1024 bits   0.0149s   0.0180s     67.0     55.5
45#
#	Number of operations increases by almost 75%
47#
48#	Here are performance numbers for 64-bit compiler
49#	generated code:
50#
51#	OpenSSL 0.9.6g [engine] 9 Aug 2002
52#	built on: Fri Apr 18 16:59:20 EDT 2003
53#	options:bn(64,64) ...
54#	compiler: cc -DTHREADS -D_REENTRANT -q64 -DB_ENDIAN -O3
55#                  sign    verify    sign/s verify/s
56#rsa  512 bits   0.0028s   0.0003s    357.1   3844.4
57#rsa 1024 bits   0.0148s   0.0008s     67.5   1239.7
58#rsa 2048 bits   0.0963s   0.0028s     10.4    353.0
59#rsa 4096 bits   0.6538s   0.0102s      1.5     98.1
60#dsa  512 bits   0.0026s   0.0032s    382.5    313.7
61#dsa 1024 bits   0.0081s   0.0099s    122.8    100.6
62#
63#	Same benchmark with this assembler code:
64#
65#rsa  512 bits   0.0020s   0.0002s    510.4   6273.7
66#rsa 1024 bits   0.0088s   0.0005s    114.1   2128.3
67#rsa 2048 bits   0.0540s   0.0016s     18.5    622.5
68#rsa 4096 bits   0.3700s   0.0058s      2.7    171.0
69#dsa  512 bits   0.0016s   0.0020s    610.7    507.1
70#dsa 1024 bits   0.0047s   0.0058s    212.5    173.2
71#
#	Again, performance increases by about 75%
73#
74#       Mac OS X, Apple G5 1.8GHz (Note this is 32 bit code)
75#       OpenSSL 0.9.7c 30 Sep 2003
76#
77#       Original code.
78#
79#rsa  512 bits   0.0011s   0.0001s    906.1  11012.5
80#rsa 1024 bits   0.0060s   0.0003s    166.6   3363.1
81#rsa 2048 bits   0.0370s   0.0010s     27.1    982.4
82#rsa 4096 bits   0.2426s   0.0036s      4.1    280.4
83#dsa  512 bits   0.0010s   0.0012s   1038.1    841.5
84#dsa 1024 bits   0.0030s   0.0037s    329.6    269.7
85#dsa 2048 bits   0.0101s   0.0127s     98.9     78.6
86#
87#       Same benchmark with this assembler code:
88#
89#rsa  512 bits   0.0007s   0.0001s   1416.2  16645.9
90#rsa 1024 bits   0.0036s   0.0002s    274.4   5380.6
91#rsa 2048 bits   0.0222s   0.0006s     45.1   1589.5
92#rsa 4096 bits   0.1469s   0.0022s      6.8    449.6
93#dsa  512 bits   0.0006s   0.0007s   1664.2   1376.2
94#dsa 1024 bits   0.0018s   0.0023s    545.0    442.2
95#dsa 2048 bits   0.0061s   0.0075s    163.5    132.8
96#
97#        Performance increase of ~60%
98#
99#	If you have comments or suggestions to improve code send
100#	me a note at schari@us.ibm.com
101#
102
$opf = shift;

# The name of the file we are asked to generate selects the word size.
# Everything that differs between the 32- and 64-bit instruction sets is
# captured in the variables assigned below.
if ($opf =~ /32\.s/) {
	$BITS = 32;
	$ISA  = "\"ppc\"";

	($LD,    $LDU)   = ("lwz",    "lwzu");	# load, load and update
	($ST,    $STU)   = ("stw",    "stwu");	# store, store and update
	($UMULL, $UMULH) = ("mullw",  "mulhwu");	# unsigned multiply low, high
	$UDIV            = "divwu";		# unsigned divide
	($UCMPI, $UCMP)  = ("cmplwi", "cmplw");	# unsigned compare with immediate, unsigned compare
	$CNTLZ           = "cntlzw";		# count leading zeros
	($SHL,   $SHR)   = ("slw",    "srw");	# shift left, unsigned shift right
	($SHLI,  $SHRI)  = ("slwi",   "srwi");	# shift left / unsigned shift right by immediate
	$CLRU            = "clrlwi";		# clear upper bits
	$INSR            = "insrwi";		# insert right
	$ROTL            = "rotlwi";		# rotate left by immediate
	$TR              = "tw";		# conditional trap
}
elsif ($opf =~ /64\.s/) {
	$BITS = 64;
	$ISA  = "\"ppc64\"";

	# same roles as above, but 64-bit mnemonics...
	($LD,    $LDU)   = ("ld",     "ldu");	# load, load and update
	($ST,    $STU)   = ("std",    "stdu");	# store, store and update
	($UMULL, $UMULH) = ("mulld",  "mulhdu");	# unsigned multiply low, high
	$UDIV            = "divdu";		# unsigned divide
	($UCMPI, $UCMP)  = ("cmpldi", "cmpld");	# unsigned compare with immediate, unsigned compare
	$CNTLZ           = "cntlzd";		# count leading zeros
	($SHL,   $SHR)   = ("sld",    "srd");	# shift left, unsigned shift right
	($SHLI,  $SHRI)  = ("sldi",   "srdi");	# shift left / unsigned shift right by immediate
	$CLRU            = "clrldi";		# clear upper bits
	$INSR            = "insrdi";		# insert right
	$ROTL            = "rotldi";		# rotate left by immediate
	$TR              = "td";		# conditional trap
}
else {
	die "nonsense $opf";
}

# Number of bytes in one $BITS-bit word.
$BNSZ = $BITS/8;
153
# Unless a second command-line argument is present, redirect STDOUT to the
# requested file so that everything printed below ends up in it.  The
# three-argument form of open keeps the file name from being interpreted
# as part of the open mode.
( defined shift || open STDOUT, '>', $opf ) || die "can't open $opf: $!";
155
# Function entry points, taken from the AIX code.
#
# There are other, more elegant, ways to handle this. We (IBM) chose to
# spell the names out because it plays well with the scripts we run to
# 'namespace' OpenSSL, i.e. to add a prefix to all the public symbols so
# that we can co-exist in the same process with other implementations of
# OpenSSL. 'Cleverer' ways of doing these substitutions tend to hide data
# we need to be obvious.
#
my @items = qw(
	bn_sqr_comba4
	bn_sqr_comba8
	bn_mul_comba4
	bn_mul_comba8
	bn_sub_words
	bn_add_words
	bn_div_words
	bn_sqr_words
	bn_mul_words
	bn_mul_add_words
);
175
# Choose the output flavour from the target file name.  The patterns are
# tried in this order and anything unrecognised gets the BSD flavour.
my $emit = $opf =~ /linux/ ? \&do_linux
         : $opf =~ /aix/   ? \&do_aix
         : $opf =~ /osx/   ? \&do_osx
         :                   \&do_bsd;
$emit->();
180
# Print the assembly text adapted for Linux.
#
# Reads the file-level settings $BITS and @items and the text returned by
# data(); writes the result to STDOUT.
sub do_linux {
    my $asm = data();

    if ($BITS == 64) {
        # For every entry point, replace the ".name:" label with an entry
        # in the ".opd" section (a "name" symbol holding the address of
        # ".name" and the TOC base) followed by the original ".name:"
        # code label, now declared global and typed as a function.
        for my $name (@items) {
            my $label = join "\n",
                "",
                "\t.section\t\".opd\",\"aw\"",
                "\t.align\t3",
                "\t.globl\t${name}",
                "${name}:",
                "\t.quad\t.${name},.TOC.\@tocbase,0",
                "\t.size\t${name},24",
                "\t.previous",
                "",
                "\t.type\t.${name},\@function",
                "\t.globl\t.${name}",
                ".${name}:";
            $asm =~ s/\.\Q$name\E:/$label/g;
        }
    }
    else {
        # 32-bit: simply drop the leading dot from every entry point.
        for my $name (@items) {
            $asm =~ s/\.\Q$name\E/$name/g;
        }
    }

    # hide internal labels to avoid pollution of name table...
    $asm =~ s/Lppcasm_/.Lppcasm_/gm;
    print $asm;
}
208
sub do_aix {
    # The text produced by data() is printed untouched: the AIX assembler
    # is smart enough to please the linker without any special treatment.
    my @text = data();
    print @text;
}
214
# MacOSX 32 bit
#
# Print the text returned by data() adapted for the OS X assembler.
# Reads the file-level @items list; writes the result to STDOUT.
sub do_osx {
    my $asm = data();

    # Change the bn symbol prefix from '.' to '_'
    for my $name (@items) {
        $asm =~ s/\.\Q$name\E/_$name/g;
    }

    # Change .machine to something OS X asm will accept
    $asm =~ s/\.machine.*/.text/g;

    # change comment from '#' to ';'
    $asm =~ s/\#/;/g;

    print $asm;
}
227
# BSD (Untested)
#
# Print the text returned by data() with the '.' prefix of every entry
# point listed in @items changed to '_'.
sub do_bsd {
    my $asm = data();

    for my $name (@items) {
        $asm =~ s/\.\Q$name\E/_$name/g;
    }

    print $asm;
}
236
237sub data {
238	local($data)=<<EOF;
239#--------------------------------------------------------------------
240#
241#
242#
243#
244#	File:		ppc32.s
245#
246#	Created by:	Suresh Chari
247#			IBM Thomas J. Watson Research Library
248#			Hawthorne, NY
249#
250#
251#	Description:	Optimized assembly routines for OpenSSL crypto
252#			on the 32 bitPowerPC platform.
253#
254#
255#	Version History
256#
257#	2. Fixed bn_add,bn_sub and bn_div_words, added comments,
258#	   cleaned up code. Also made a single version which can
259#	   be used for both the AIX and Linux compilers. See NOTE
260#	   below.
261#				12/05/03		Suresh Chari
262#			(with lots of help from)        Andy Polyakov
263##
264#	1. Initial version	10/20/02		Suresh Chari
265#
266#
267#	The following file works for the xlc,cc
268#	and gcc compilers.
269#
270#	NOTE:	To get the file to link correctly with the gcc compiler
271#	        you have to change the names of the routines and remove
272#		the first .(dot) character. This should automatically
273#		be done in the build process.
274#
275#	Hand optimized assembly code for the following routines
276#
277#	bn_sqr_comba4
278#	bn_sqr_comba8
279#	bn_mul_comba4
280#	bn_mul_comba8
281#	bn_sub_words
282#	bn_add_words
283#	bn_div_words
284#	bn_sqr_words
285#	bn_mul_words
286#	bn_mul_add_words
287#
288#	NOTE:	It is possible to optimize this code more for
289#	specific PowerPC or Power architectures. On the Northstar
290#	architecture the optimizations in this file do
291#	 NOT provide much improvement.
292#
293#	If you have comments or suggestions to improve code send
294#	me a note at schari\@us.ibm.com
295#
296#--------------------------------------------------------------------------
297#
298#	Defines to be used in the assembly code.
299#
300.set r0,0	# we use it as storage for value of 0
301.set SP,1	# preserved
302.set RTOC,2	# preserved
303.set r3,3	# 1st argument/return value
304.set r4,4	# 2nd argument/volatile register
305.set r5,5	# 3rd argument/volatile register
306.set r6,6	# ...
307.set r7,7
308.set r8,8
309.set r9,9
310.set r10,10
311.set r11,11
312.set r12,12
313.set r13,13	# not used, nor any other "below" it...
314
315.set BO_IF_NOT,4
316.set BO_IF,12
317.set BO_dCTR_NZERO,16
318.set BO_dCTR_ZERO,18
319.set BO_ALWAYS,20
320.set CR0_LT,0;
321.set CR0_GT,1;
322.set CR0_EQ,2
323.set CR1_FX,4;
324.set CR1_FEX,5;
325.set CR1_VX,6
326.set LR,8
327
328#	Declare function names to be global
329#	NOTE:	For gcc these names MUST be changed to remove
330#	        the first . i.e. for example change ".bn_sqr_comba4"
331#		to "bn_sqr_comba4". This should be automatically done
332#		in the build.
333
334	.globl	.bn_sqr_comba4
335	.globl	.bn_sqr_comba8
336	.globl	.bn_mul_comba4
337	.globl	.bn_mul_comba8
338	.globl	.bn_sub_words
339	.globl	.bn_add_words
340	.globl	.bn_div_words
341	.globl	.bn_sqr_words
342	.globl	.bn_mul_words
343	.globl	.bn_mul_add_words
344
345# .text section
346
347	.machine	$ISA
348
349#
350#	NOTE:	The following label name should be changed to
351#		"bn_sqr_comba4" i.e. remove the first dot
352#		for the gcc compiler. This should be automatically
353#		done in the build
354#
355
356.align	4
357.bn_sqr_comba4:
358#
359# Optimized version of bn_sqr_comba4.
360#
361# void bn_sqr_comba4(BN_ULONG *r, BN_ULONG *a)
362# r3 contains r
363# r4 contains a
364#
365# Freely use registers r5,r6,r7,r8,r9,r10,r11 as follows:
366#
367# r5,r6 are the two BN_ULONGs being multiplied.
368# r7,r8 are the results of the 32x32 giving 64 bit multiply.
369# r9,r10, r11 are the equivalents of c1,c2, c3.
370# Here's the assembly
371#
372#
373	xor		r0,r0,r0		# set r0 = 0. Used in the addze
374						# instructions below
375
376						#sqr_add_c(a,0,c1,c2,c3)
377	$LD		r5,`0*$BNSZ`(r4)
378	$UMULL		r9,r5,r5
379	$UMULH		r10,r5,r5		#in first iteration. No need
380						#to add since c1=c2=c3=0.
381						# Note c3(r11) is NOT set to 0
382						# but will be.
383
384	$ST		r9,`0*$BNSZ`(r3)	# r[0]=c1;
385						# sqr_add_c2(a,1,0,c2,c3,c1);
386	$LD		r6,`1*$BNSZ`(r4)
387	$UMULL		r7,r5,r6
388	$UMULH		r8,r5,r6
389
390	addc		r7,r7,r7		# compute (r7,r8)=2*(r7,r8)
391	adde		r8,r8,r8
392	addze		r9,r0			# catch carry if any.
393						# r9= r0(=0) and carry
394
395	addc		r10,r7,r10		# now add to temp result.
396	addze		r11,r8                  # r8 added to r11 which is 0
397	addze		r9,r9
398
399	$ST		r10,`1*$BNSZ`(r3)	#r[1]=c2;
400						#sqr_add_c(a,1,c3,c1,c2)
401	$UMULL		r7,r6,r6
402	$UMULH		r8,r6,r6
403	addc		r11,r7,r11
404	adde		r9,r8,r9
405	addze		r10,r0
406						#sqr_add_c2(a,2,0,c3,c1,c2)
407	$LD		r6,`2*$BNSZ`(r4)
408	$UMULL		r7,r5,r6
409	$UMULH		r8,r5,r6
410
411	addc		r7,r7,r7
412	adde		r8,r8,r8
413	addze		r10,r10
414
415	addc		r11,r7,r11
416	adde		r9,r8,r9
417	addze		r10,r10
418	$ST		r11,`2*$BNSZ`(r3)	#r[2]=c3
419						#sqr_add_c2(a,3,0,c1,c2,c3);
420	$LD		r6,`3*$BNSZ`(r4)
421	$UMULL		r7,r5,r6
422	$UMULH		r8,r5,r6
423	addc		r7,r7,r7
424	adde		r8,r8,r8
425	addze		r11,r0
426
427	addc		r9,r7,r9
428	adde		r10,r8,r10
429	addze		r11,r11
430						#sqr_add_c2(a,2,1,c1,c2,c3);
431	$LD		r5,`1*$BNSZ`(r4)
432	$LD		r6,`2*$BNSZ`(r4)
433	$UMULL		r7,r5,r6
434	$UMULH		r8,r5,r6
435
436	addc		r7,r7,r7
437	adde		r8,r8,r8
438	addze		r11,r11
439	addc		r9,r7,r9
440	adde		r10,r8,r10
441	addze		r11,r11
442	$ST		r9,`3*$BNSZ`(r3)	#r[3]=c1
443						#sqr_add_c(a,2,c2,c3,c1);
444	$UMULL		r7,r6,r6
445	$UMULH		r8,r6,r6
446	addc		r10,r7,r10
447	adde		r11,r8,r11
448	addze		r9,r0
449						#sqr_add_c2(a,3,1,c2,c3,c1);
450	$LD		r6,`3*$BNSZ`(r4)
451	$UMULL		r7,r5,r6
452	$UMULH		r8,r5,r6
453	addc		r7,r7,r7
454	adde		r8,r8,r8
455	addze		r9,r9
456
457	addc		r10,r7,r10
458	adde		r11,r8,r11
459	addze		r9,r9
460	$ST		r10,`4*$BNSZ`(r3)	#r[4]=c2
461						#sqr_add_c2(a,3,2,c3,c1,c2);
462	$LD		r5,`2*$BNSZ`(r4)
463	$UMULL		r7,r5,r6
464	$UMULH		r8,r5,r6
465	addc		r7,r7,r7
466	adde		r8,r8,r8
467	addze		r10,r0
468
469	addc		r11,r7,r11
470	adde		r9,r8,r9
471	addze		r10,r10
472	$ST		r11,`5*$BNSZ`(r3)	#r[5] = c3
473						#sqr_add_c(a,3,c1,c2,c3);
474	$UMULL		r7,r6,r6
475	$UMULH		r8,r6,r6
476	addc		r9,r7,r9
477	adde		r10,r8,r10
478
479	$ST		r9,`6*$BNSZ`(r3)	#r[6]=c1
480	$ST		r10,`7*$BNSZ`(r3)	#r[7]=c2
481	bclr	BO_ALWAYS,CR0_LT
482	.long	0x00000000
483
484#
485#	NOTE:	The following label name should be changed to
486#		"bn_sqr_comba8" i.e. remove the first dot
487#		for the gcc compiler. This should be automatically
488#		done in the build
489#
490
491.align	4
492.bn_sqr_comba8:
493#
494# This is an optimized version of the bn_sqr_comba8 routine.
495# Tightly uses the adde instruction
496#
497#
498# void bn_sqr_comba8(BN_ULONG *r, BN_ULONG *a)
499# r3 contains r
500# r4 contains a
501#
502# Freely use registers r5,r6,r7,r8,r9,r10,r11 as follows:
503#
504# r5,r6 are the two BN_ULONGs being multiplied.
505# r7,r8 are the results of the 32x32 giving 64 bit multiply.
506# r9,r10, r11 are the equivalents of c1,c2, c3.
507#
508# Possible optimization of loading all 8 longs of a into registers
509# doesnt provide any speedup
510#
511
512	xor		r0,r0,r0		#set r0 = 0.Used in addze
513						#instructions below.
514
515						#sqr_add_c(a,0,c1,c2,c3);
516	$LD		r5,`0*$BNSZ`(r4)
517	$UMULL		r9,r5,r5		#1st iteration:	no carries.
518	$UMULH		r10,r5,r5
519	$ST		r9,`0*$BNSZ`(r3)	# r[0]=c1;
520						#sqr_add_c2(a,1,0,c2,c3,c1);
521	$LD		r6,`1*$BNSZ`(r4)
522	$UMULL		r7,r5,r6
523	$UMULH		r8,r5,r6
524
525	addc		r10,r7,r10		#add the two register number
526	adde		r11,r8,r0 		# (r8,r7) to the three register
527	addze		r9,r0			# number (r9,r11,r10).NOTE:r0=0
528
529	addc		r10,r7,r10		#add the two register number
530	adde		r11,r8,r11 		# (r8,r7) to the three register
531	addze		r9,r9			# number (r9,r11,r10).
532
533	$ST		r10,`1*$BNSZ`(r3)	# r[1]=c2
534
535						#sqr_add_c(a,1,c3,c1,c2);
536	$UMULL		r7,r6,r6
537	$UMULH		r8,r6,r6
538	addc		r11,r7,r11
539	adde		r9,r8,r9
540	addze		r10,r0
541						#sqr_add_c2(a,2,0,c3,c1,c2);
542	$LD		r6,`2*$BNSZ`(r4)
543	$UMULL		r7,r5,r6
544	$UMULH		r8,r5,r6
545
546	addc		r11,r7,r11
547	adde		r9,r8,r9
548	addze		r10,r10
549
550	addc		r11,r7,r11
551	adde		r9,r8,r9
552	addze		r10,r10
553
554	$ST		r11,`2*$BNSZ`(r3)	#r[2]=c3
555						#sqr_add_c2(a,3,0,c1,c2,c3);
556	$LD		r6,`3*$BNSZ`(r4)	#r6 = a[3]. r5 is already a[0].
557	$UMULL		r7,r5,r6
558	$UMULH		r8,r5,r6
559
560	addc		r9,r7,r9
561	adde		r10,r8,r10
562	addze		r11,r0
563
564	addc		r9,r7,r9
565	adde		r10,r8,r10
566	addze		r11,r11
567						#sqr_add_c2(a,2,1,c1,c2,c3);
568	$LD		r5,`1*$BNSZ`(r4)
569	$LD		r6,`2*$BNSZ`(r4)
570	$UMULL		r7,r5,r6
571	$UMULH		r8,r5,r6
572
573	addc		r9,r7,r9
574	adde		r10,r8,r10
575	addze		r11,r11
576
577	addc		r9,r7,r9
578	adde		r10,r8,r10
579	addze		r11,r11
580
581	$ST		r9,`3*$BNSZ`(r3)	#r[3]=c1;
582						#sqr_add_c(a,2,c2,c3,c1);
583	$UMULL		r7,r6,r6
584	$UMULH		r8,r6,r6
585
586	addc		r10,r7,r10
587	adde		r11,r8,r11
588	addze		r9,r0
589						#sqr_add_c2(a,3,1,c2,c3,c1);
590	$LD		r6,`3*$BNSZ`(r4)
591	$UMULL		r7,r5,r6
592	$UMULH		r8,r5,r6
593
594	addc		r10,r7,r10
595	adde		r11,r8,r11
596	addze		r9,r9
597
598	addc		r10,r7,r10
599	adde		r11,r8,r11
600	addze		r9,r9
601						#sqr_add_c2(a,4,0,c2,c3,c1);
602	$LD		r5,`0*$BNSZ`(r4)
603	$LD		r6,`4*$BNSZ`(r4)
604	$UMULL		r7,r5,r6
605	$UMULH		r8,r5,r6
606
607	addc		r10,r7,r10
608	adde		r11,r8,r11
609	addze		r9,r9
610
611	addc		r10,r7,r10
612	adde		r11,r8,r11
613	addze		r9,r9
614	$ST		r10,`4*$BNSZ`(r3)	#r[4]=c2;
615						#sqr_add_c2(a,5,0,c3,c1,c2);
616	$LD		r6,`5*$BNSZ`(r4)
617	$UMULL		r7,r5,r6
618	$UMULH		r8,r5,r6
619
620	addc		r11,r7,r11
621	adde		r9,r8,r9
622	addze		r10,r0
623
624	addc		r11,r7,r11
625	adde		r9,r8,r9
626	addze		r10,r10
627						#sqr_add_c2(a,4,1,c3,c1,c2);
628	$LD		r5,`1*$BNSZ`(r4)
629	$LD		r6,`4*$BNSZ`(r4)
630	$UMULL		r7,r5,r6
631	$UMULH		r8,r5,r6
632
633	addc		r11,r7,r11
634	adde		r9,r8,r9
635	addze		r10,r10
636
637	addc		r11,r7,r11
638	adde		r9,r8,r9
639	addze		r10,r10
640						#sqr_add_c2(a,3,2,c3,c1,c2);
641	$LD		r5,`2*$BNSZ`(r4)
642	$LD		r6,`3*$BNSZ`(r4)
643	$UMULL		r7,r5,r6
644	$UMULH		r8,r5,r6
645
646	addc		r11,r7,r11
647	adde		r9,r8,r9
648	addze		r10,r10
649
650	addc		r11,r7,r11
651	adde		r9,r8,r9
652	addze		r10,r10
653	$ST		r11,`5*$BNSZ`(r3)	#r[5]=c3;
654						#sqr_add_c(a,3,c1,c2,c3);
655	$UMULL		r7,r6,r6
656	$UMULH		r8,r6,r6
657	addc		r9,r7,r9
658	adde		r10,r8,r10
659	addze		r11,r0
660						#sqr_add_c2(a,4,2,c1,c2,c3);
661	$LD		r6,`4*$BNSZ`(r4)
662	$UMULL		r7,r5,r6
663	$UMULH		r8,r5,r6
664
665	addc		r9,r7,r9
666	adde		r10,r8,r10
667	addze		r11,r11
668
669	addc		r9,r7,r9
670	adde		r10,r8,r10
671	addze		r11,r11
672						#sqr_add_c2(a,5,1,c1,c2,c3);
673	$LD		r5,`1*$BNSZ`(r4)
674	$LD		r6,`5*$BNSZ`(r4)
675	$UMULL		r7,r5,r6
676	$UMULH		r8,r5,r6
677
678	addc		r9,r7,r9
679	adde		r10,r8,r10
680	addze		r11,r11
681
682	addc		r9,r7,r9
683	adde		r10,r8,r10
684	addze		r11,r11
685						#sqr_add_c2(a,6,0,c1,c2,c3);
686	$LD		r5,`0*$BNSZ`(r4)
687	$LD		r6,`6*$BNSZ`(r4)
688	$UMULL		r7,r5,r6
689	$UMULH		r8,r5,r6
690	addc		r9,r7,r9
691	adde		r10,r8,r10
692	addze		r11,r11
693	addc		r9,r7,r9
694	adde		r10,r8,r10
695	addze		r11,r11
696	$ST		r9,`6*$BNSZ`(r3)	#r[6]=c1;
697						#sqr_add_c2(a,7,0,c2,c3,c1);
698	$LD		r6,`7*$BNSZ`(r4)
699	$UMULL		r7,r5,r6
700	$UMULH		r8,r5,r6
701
702	addc		r10,r7,r10
703	adde		r11,r8,r11
704	addze		r9,r0
705	addc		r10,r7,r10
706	adde		r11,r8,r11
707	addze		r9,r9
708						#sqr_add_c2(a,6,1,c2,c3,c1);
709	$LD		r5,`1*$BNSZ`(r4)
710	$LD		r6,`6*$BNSZ`(r4)
711	$UMULL		r7,r5,r6
712	$UMULH		r8,r5,r6
713
714	addc		r10,r7,r10
715	adde		r11,r8,r11
716	addze		r9,r9
717	addc		r10,r7,r10
718	adde		r11,r8,r11
719	addze		r9,r9
720						#sqr_add_c2(a,5,2,c2,c3,c1);
721	$LD		r5,`2*$BNSZ`(r4)
722	$LD		r6,`5*$BNSZ`(r4)
723	$UMULL		r7,r5,r6
724	$UMULH		r8,r5,r6
725	addc		r10,r7,r10
726	adde		r11,r8,r11
727	addze		r9,r9
728	addc		r10,r7,r10
729	adde		r11,r8,r11
730	addze		r9,r9
731						#sqr_add_c2(a,4,3,c2,c3,c1);
732	$LD		r5,`3*$BNSZ`(r4)
733	$LD		r6,`4*$BNSZ`(r4)
734	$UMULL		r7,r5,r6
735	$UMULH		r8,r5,r6
736
737	addc		r10,r7,r10
738	adde		r11,r8,r11
739	addze		r9,r9
740	addc		r10,r7,r10
741	adde		r11,r8,r11
742	addze		r9,r9
743	$ST		r10,`7*$BNSZ`(r3)	#r[7]=c2;
744						#sqr_add_c(a,4,c3,c1,c2);
745	$UMULL		r7,r6,r6
746	$UMULH		r8,r6,r6
747	addc		r11,r7,r11
748	adde		r9,r8,r9
749	addze		r10,r0
750						#sqr_add_c2(a,5,3,c3,c1,c2);
751	$LD		r6,`5*$BNSZ`(r4)
752	$UMULL		r7,r5,r6
753	$UMULH		r8,r5,r6
754	addc		r11,r7,r11
755	adde		r9,r8,r9
756	addze		r10,r10
757	addc		r11,r7,r11
758	adde		r9,r8,r9
759	addze		r10,r10
760						#sqr_add_c2(a,6,2,c3,c1,c2);
761	$LD		r5,`2*$BNSZ`(r4)
762	$LD		r6,`6*$BNSZ`(r4)
763	$UMULL		r7,r5,r6
764	$UMULH		r8,r5,r6
765	addc		r11,r7,r11
766	adde		r9,r8,r9
767	addze		r10,r10
768
769	addc		r11,r7,r11
770	adde		r9,r8,r9
771	addze		r10,r10
772						#sqr_add_c2(a,7,1,c3,c1,c2);
773	$LD		r5,`1*$BNSZ`(r4)
774	$LD		r6,`7*$BNSZ`(r4)
775	$UMULL		r7,r5,r6
776	$UMULH		r8,r5,r6
777	addc		r11,r7,r11
778	adde		r9,r8,r9
779	addze		r10,r10
780	addc		r11,r7,r11
781	adde		r9,r8,r9
782	addze		r10,r10
783	$ST		r11,`8*$BNSZ`(r3)	#r[8]=c3;
784						#sqr_add_c2(a,7,2,c1,c2,c3);
785	$LD		r5,`2*$BNSZ`(r4)
786	$UMULL		r7,r5,r6
787	$UMULH		r8,r5,r6
788
789	addc		r9,r7,r9
790	adde		r10,r8,r10
791	addze		r11,r0
792	addc		r9,r7,r9
793	adde		r10,r8,r10
794	addze		r11,r11
795						#sqr_add_c2(a,6,3,c1,c2,c3);
796	$LD		r5,`3*$BNSZ`(r4)
797	$LD		r6,`6*$BNSZ`(r4)
798	$UMULL		r7,r5,r6
799	$UMULH		r8,r5,r6
800	addc		r9,r7,r9
801	adde		r10,r8,r10
802	addze		r11,r11
803	addc		r9,r7,r9
804	adde		r10,r8,r10
805	addze		r11,r11
806						#sqr_add_c2(a,5,4,c1,c2,c3);
807	$LD		r5,`4*$BNSZ`(r4)
808	$LD		r6,`5*$BNSZ`(r4)
809	$UMULL		r7,r5,r6
810	$UMULH		r8,r5,r6
811	addc		r9,r7,r9
812	adde		r10,r8,r10
813	addze		r11,r11
814	addc		r9,r7,r9
815	adde		r10,r8,r10
816	addze		r11,r11
817	$ST		r9,`9*$BNSZ`(r3)	#r[9]=c1;
818						#sqr_add_c(a,5,c2,c3,c1);
819	$UMULL		r7,r6,r6
820	$UMULH		r8,r6,r6
821	addc		r10,r7,r10
822	adde		r11,r8,r11
823	addze		r9,r0
824						#sqr_add_c2(a,6,4,c2,c3,c1);
825	$LD		r6,`6*$BNSZ`(r4)
826	$UMULL		r7,r5,r6
827	$UMULH		r8,r5,r6
828	addc		r10,r7,r10
829	adde		r11,r8,r11
830	addze		r9,r9
831	addc		r10,r7,r10
832	adde		r11,r8,r11
833	addze		r9,r9
834						#sqr_add_c2(a,7,3,c2,c3,c1);
835	$LD		r5,`3*$BNSZ`(r4)
836	$LD		r6,`7*$BNSZ`(r4)
837	$UMULL		r7,r5,r6
838	$UMULH		r8,r5,r6
839	addc		r10,r7,r10
840	adde		r11,r8,r11
841	addze		r9,r9
842	addc		r10,r7,r10
843	adde		r11,r8,r11
844	addze		r9,r9
845	$ST		r10,`10*$BNSZ`(r3)	#r[10]=c2;
846						#sqr_add_c2(a,7,4,c3,c1,c2);
847	$LD		r5,`4*$BNSZ`(r4)
848	$UMULL		r7,r5,r6
849	$UMULH		r8,r5,r6
850	addc		r11,r7,r11
851	adde		r9,r8,r9
852	addze		r10,r0
853	addc		r11,r7,r11
854	adde		r9,r8,r9
855	addze		r10,r10
856						#sqr_add_c2(a,6,5,c3,c1,c2);
857	$LD		r5,`5*$BNSZ`(r4)
858	$LD		r6,`6*$BNSZ`(r4)
859	$UMULL		r7,r5,r6
860	$UMULH		r8,r5,r6
861	addc		r11,r7,r11
862	adde		r9,r8,r9
863	addze		r10,r10
864	addc		r11,r7,r11
865	adde		r9,r8,r9
866	addze		r10,r10
867	$ST		r11,`11*$BNSZ`(r3)	#r[11]=c3;
868						#sqr_add_c(a,6,c1,c2,c3);
869	$UMULL		r7,r6,r6
870	$UMULH		r8,r6,r6
871	addc		r9,r7,r9
872	adde		r10,r8,r10
873	addze		r11,r0
874						#sqr_add_c2(a,7,5,c1,c2,c3)
875	$LD		r6,`7*$BNSZ`(r4)
876	$UMULL		r7,r5,r6
877	$UMULH		r8,r5,r6
878	addc		r9,r7,r9
879	adde		r10,r8,r10
880	addze		r11,r11
881	addc		r9,r7,r9
882	adde		r10,r8,r10
883	addze		r11,r11
884	$ST		r9,`12*$BNSZ`(r3)	#r[12]=c1;
885
886						#sqr_add_c2(a,7,6,c2,c3,c1)
887	$LD		r5,`6*$BNSZ`(r4)
888	$UMULL		r7,r5,r6
889	$UMULH		r8,r5,r6
890	addc		r10,r7,r10
891	adde		r11,r8,r11
892	addze		r9,r0
893	addc		r10,r7,r10
894	adde		r11,r8,r11
895	addze		r9,r9
896	$ST		r10,`13*$BNSZ`(r3)	#r[13]=c2;
897						#sqr_add_c(a,7,c3,c1,c2);
898	$UMULL		r7,r6,r6
899	$UMULH		r8,r6,r6
900	addc		r11,r7,r11
901	adde		r9,r8,r9
902	$ST		r11,`14*$BNSZ`(r3)	#r[14]=c3;
903	$ST		r9, `15*$BNSZ`(r3)	#r[15]=c1;
904
905
906	bclr	BO_ALWAYS,CR0_LT
907
908	.long	0x00000000
909
910#
911#	NOTE:	The following label name should be changed to
912#		"bn_mul_comba4" i.e. remove the first dot
913#		for the gcc compiler. This should be automatically
914#		done in the build
915#
916
917.align	4
918.bn_mul_comba4:
919#
920# This is an optimized version of the bn_mul_comba4 routine.
921#
922# void bn_mul_comba4(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
923# r3 contains r
924# r4 contains a
925# r5 contains b
926# r6, r7 are the 2 BN_ULONGs being multiplied.
927# r8, r9 are the results of the 32x32 giving 64 multiply.
928# r10, r11, r12 are the equivalents of c1, c2, and c3.
929#
930	xor	r0,r0,r0		#r0=0. Used in addze below.
931					#mul_add_c(a[0],b[0],c1,c2,c3);
932	$LD	r6,`0*$BNSZ`(r4)
933	$LD	r7,`0*$BNSZ`(r5)
934	$UMULL	r10,r6,r7
935	$UMULH	r11,r6,r7
936	$ST	r10,`0*$BNSZ`(r3)	#r[0]=c1
937					#mul_add_c(a[0],b[1],c2,c3,c1);
938	$LD	r7,`1*$BNSZ`(r5)
939	$UMULL	r8,r6,r7
940	$UMULH	r9,r6,r7
941	addc	r11,r8,r11
942	adde	r12,r9,r0
943	addze	r10,r0
944					#mul_add_c(a[1],b[0],c2,c3,c1);
945	$LD	r6, `1*$BNSZ`(r4)
946	$LD	r7, `0*$BNSZ`(r5)
947	$UMULL	r8,r6,r7
948	$UMULH	r9,r6,r7
949	addc	r11,r8,r11
950	adde	r12,r9,r12
951	addze	r10,r10
952	$ST	r11,`1*$BNSZ`(r3)	#r[1]=c2
953					#mul_add_c(a[2],b[0],c3,c1,c2);
954	$LD	r6,`2*$BNSZ`(r4)
955	$UMULL	r8,r6,r7
956	$UMULH	r9,r6,r7
957	addc	r12,r8,r12
958	adde	r10,r9,r10
959	addze	r11,r0
960					#mul_add_c(a[1],b[1],c3,c1,c2);
961	$LD	r6,`1*$BNSZ`(r4)
962	$LD	r7,`1*$BNSZ`(r5)
963	$UMULL	r8,r6,r7
964	$UMULH	r9,r6,r7
965	addc	r12,r8,r12
966	adde	r10,r9,r10
967	addze	r11,r11
968					#mul_add_c(a[0],b[2],c3,c1,c2);
969	$LD	r6,`0*$BNSZ`(r4)
970	$LD	r7,`2*$BNSZ`(r5)
971	$UMULL	r8,r6,r7
972	$UMULH	r9,r6,r7
973	addc	r12,r8,r12
974	adde	r10,r9,r10
975	addze	r11,r11
976	$ST	r12,`2*$BNSZ`(r3)	#r[2]=c3
977					#mul_add_c(a[0],b[3],c1,c2,c3);
978	$LD	r7,`3*$BNSZ`(r5)
979	$UMULL	r8,r6,r7
980	$UMULH	r9,r6,r7
981	addc	r10,r8,r10
982	adde	r11,r9,r11
983	addze	r12,r0
984					#mul_add_c(a[1],b[2],c1,c2,c3);
985	$LD	r6,`1*$BNSZ`(r4)
986	$LD	r7,`2*$BNSZ`(r5)
987	$UMULL	r8,r6,r7
988	$UMULH	r9,r6,r7
989	addc	r10,r8,r10
990	adde	r11,r9,r11
991	addze	r12,r12
992					#mul_add_c(a[2],b[1],c1,c2,c3);
993	$LD	r6,`2*$BNSZ`(r4)
994	$LD	r7,`1*$BNSZ`(r5)
995	$UMULL	r8,r6,r7
996	$UMULH	r9,r6,r7
997	addc	r10,r8,r10
998	adde	r11,r9,r11
999	addze	r12,r12
1000					#mul_add_c(a[3],b[0],c1,c2,c3);
1001	$LD	r6,`3*$BNSZ`(r4)
1002	$LD	r7,`0*$BNSZ`(r5)
1003	$UMULL	r8,r6,r7
1004	$UMULH	r9,r6,r7
1005	addc	r10,r8,r10
1006	adde	r11,r9,r11
1007	addze	r12,r12
1008	$ST	r10,`3*$BNSZ`(r3)	#r[3]=c1
1009					#mul_add_c(a[3],b[1],c2,c3,c1);
1010	$LD	r7,`1*$BNSZ`(r5)
1011	$UMULL	r8,r6,r7
1012	$UMULH	r9,r6,r7
1013	addc	r11,r8,r11
1014	adde	r12,r9,r12
1015	addze	r10,r0
1016					#mul_add_c(a[2],b[2],c2,c3,c1);
1017	$LD	r6,`2*$BNSZ`(r4)
1018	$LD	r7,`2*$BNSZ`(r5)
1019	$UMULL	r8,r6,r7
1020	$UMULH	r9,r6,r7
1021	addc	r11,r8,r11
1022	adde	r12,r9,r12
1023	addze	r10,r10
1024					#mul_add_c(a[1],b[3],c2,c3,c1);
1025	$LD	r6,`1*$BNSZ`(r4)
1026	$LD	r7,`3*$BNSZ`(r5)
1027	$UMULL	r8,r6,r7
1028	$UMULH	r9,r6,r7
1029	addc	r11,r8,r11
1030	adde	r12,r9,r12
1031	addze	r10,r10
1032	$ST	r11,`4*$BNSZ`(r3)	#r[4]=c2
1033					#mul_add_c(a[2],b[3],c3,c1,c2);
1034	$LD	r6,`2*$BNSZ`(r4)
1035	$UMULL	r8,r6,r7
1036	$UMULH	r9,r6,r7
1037	addc	r12,r8,r12
1038	adde	r10,r9,r10
1039	addze	r11,r0
1040					#mul_add_c(a[3],b[2],c3,c1,c2);
1041	$LD	r6,`3*$BNSZ`(r4)
1042	$LD	r7,`2*$BNSZ`(r4)
1043	$UMULL	r8,r6,r7
1044	$UMULH	r9,r6,r7
1045	addc	r12,r8,r12
1046	adde	r10,r9,r10
1047	addze	r11,r11
1048	$ST	r12,`5*$BNSZ`(r3)	#r[5]=c3
1049					#mul_add_c(a[3],b[3],c1,c2,c3);
1050	$LD	r7,`3*$BNSZ`(r5)
1051	$UMULL	r8,r6,r7
1052	$UMULH	r9,r6,r7
1053	addc	r10,r8,r10
1054	adde	r11,r9,r11
1055
1056	$ST	r10,`6*$BNSZ`(r3)	#r[6]=c1
1057	$ST	r11,`7*$BNSZ`(r3)	#r[7]=c2
1058	bclr	BO_ALWAYS,CR0_LT
1059	.long	0x00000000
1060
1061#
1062#	NOTE:	The following label name should be changed to
1063#		"bn_mul_comba8" i.e. remove the first dot
1064#		for the gcc compiler. This should be automatically
1065#		done in the build
1066#
1067
1068.align	4
1069.bn_mul_comba8:
1070#
1071# Optimized version of the bn_mul_comba8 routine.
1072#
1073# void bn_mul_comba8(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
1074# r3 contains r
1075# r4 contains a
1076# r5 contains b
1077# r6, r7 are the 2 BN_ULONGs being multiplied.
1078# r8, r9 are the results of the 32x32 giving 64 multiply.
1079# r10, r11, r12 are the equivalents of c1, c2, and c3.
1080#
1081	xor	r0,r0,r0		#r0=0. Used in addze below.
1082
1083					#mul_add_c(a[0],b[0],c1,c2,c3);
1084	$LD	r6,`0*$BNSZ`(r4)	#a[0]
1085	$LD	r7,`0*$BNSZ`(r5)	#b[0]
1086	$UMULL	r10,r6,r7
1087	$UMULH	r11,r6,r7
1088	$ST	r10,`0*$BNSZ`(r3)	#r[0]=c1;
1089					#mul_add_c(a[0],b[1],c2,c3,c1);
1090	$LD	r7,`1*$BNSZ`(r5)
1091	$UMULL	r8,r6,r7
1092	$UMULH	r9,r6,r7
1093	addc	r11,r11,r8
1094	addze	r12,r9			# since we didnt set r12 to zero before.
1095	addze	r10,r0
1096					#mul_add_c(a[1],b[0],c2,c3,c1);
1097	$LD	r6,`1*$BNSZ`(r4)
1098	$LD	r7,`0*$BNSZ`(r5)
1099	$UMULL	r8,r6,r7
1100	$UMULH	r9,r6,r7
1101	addc	r11,r11,r8
1102	adde	r12,r12,r9
1103	addze	r10,r10
1104	$ST	r11,`1*$BNSZ`(r3)	#r[1]=c2;
1105					#mul_add_c(a[2],b[0],c3,c1,c2);
1106	$LD	r6,`2*$BNSZ`(r4)
1107	$UMULL	r8,r6,r7
1108	$UMULH	r9,r6,r7
1109	addc	r12,r12,r8
1110	adde	r10,r10,r9
1111	addze	r11,r0
1112					#mul_add_c(a[1],b[1],c3,c1,c2);
1113	$LD	r6,`1*$BNSZ`(r4)
1114	$LD	r7,`1*$BNSZ`(r5)
1115	$UMULL	r8,r6,r7
1116	$UMULH	r9,r6,r7
1117	addc	r12,r12,r8
1118	adde	r10,r10,r9
1119	addze	r11,r11
1120					#mul_add_c(a[0],b[2],c3,c1,c2);
1121	$LD	r6,`0*$BNSZ`(r4)
1122	$LD	r7,`2*$BNSZ`(r5)
1123	$UMULL	r8,r6,r7
1124	$UMULH	r9,r6,r7
1125	addc	r12,r12,r8
1126	adde	r10,r10,r9
1127	addze	r11,r11
1128	$ST	r12,`2*$BNSZ`(r3)	#r[2]=c3;
1129					#mul_add_c(a[0],b[3],c1,c2,c3);
1130	$LD	r7,`3*$BNSZ`(r5)
1131	$UMULL	r8,r6,r7
1132	$UMULH	r9,r6,r7
1133	addc	r10,r10,r8
1134	adde	r11,r11,r9
1135	addze	r12,r0
1136					#mul_add_c(a[1],b[2],c1,c2,c3);
1137	$LD	r6,`1*$BNSZ`(r4)
1138	$LD	r7,`2*$BNSZ`(r5)
1139	$UMULL	r8,r6,r7
1140	$UMULH	r9,r6,r7
1141	addc	r10,r10,r8
1142	adde	r11,r11,r9
1143	addze	r12,r12
1144
1145					#mul_add_c(a[2],b[1],c1,c2,c3);
1146	$LD	r6,`2*$BNSZ`(r4)
1147	$LD	r7,`1*$BNSZ`(r5)
1148	$UMULL	r8,r6,r7
1149	$UMULH	r9,r6,r7
1150	addc	r10,r10,r8
1151	adde	r11,r11,r9
1152	addze	r12,r12
1153					#mul_add_c(a[3],b[0],c1,c2,c3);
1154	$LD	r6,`3*$BNSZ`(r4)
1155	$LD	r7,`0*$BNSZ`(r5)
1156	$UMULL	r8,r6,r7
1157	$UMULH	r9,r6,r7
1158	addc	r10,r10,r8
1159	adde	r11,r11,r9
1160	addze	r12,r12
1161	$ST	r10,`3*$BNSZ`(r3)	#r[3]=c1;
1162					#mul_add_c(a[4],b[0],c2,c3,c1);
1163	$LD	r6,`4*$BNSZ`(r4)
1164	$UMULL	r8,r6,r7
1165	$UMULH	r9,r6,r7
1166	addc	r11,r11,r8
1167	adde	r12,r12,r9
1168	addze	r10,r0
1169					#mul_add_c(a[3],b[1],c2,c3,c1);
1170	$LD	r6,`3*$BNSZ`(r4)
1171	$LD	r7,`1*$BNSZ`(r5)
1172	$UMULL	r8,r6,r7
1173	$UMULH	r9,r6,r7
1174	addc	r11,r11,r8
1175	adde	r12,r12,r9
1176	addze	r10,r10
1177					#mul_add_c(a[2],b[2],c2,c3,c1);
1178	$LD	r6,`2*$BNSZ`(r4)
1179	$LD	r7,`2*$BNSZ`(r5)
1180	$UMULL	r8,r6,r7
1181	$UMULH	r9,r6,r7
1182	addc	r11,r11,r8
1183	adde	r12,r12,r9
1184	addze	r10,r10
1185					#mul_add_c(a[1],b[3],c2,c3,c1);
1186	$LD	r6,`1*$BNSZ`(r4)
1187	$LD	r7,`3*$BNSZ`(r5)
1188	$UMULL	r8,r6,r7
1189	$UMULH	r9,r6,r7
1190	addc	r11,r11,r8
1191	adde	r12,r12,r9
1192	addze	r10,r10
1193					#mul_add_c(a[0],b[4],c2,c3,c1);
1194	$LD	r6,`0*$BNSZ`(r4)
1195	$LD	r7,`4*$BNSZ`(r5)
1196	$UMULL	r8,r6,r7
1197	$UMULH	r9,r6,r7
1198	addc	r11,r11,r8
1199	adde	r12,r12,r9
1200	addze	r10,r10
1201	$ST	r11,`4*$BNSZ`(r3)	#r[4]=c2;
1202					#mul_add_c(a[0],b[5],c3,c1,c2);
1203	$LD	r7,`5*$BNSZ`(r5)
1204	$UMULL	r8,r6,r7
1205	$UMULH	r9,r6,r7
1206	addc	r12,r12,r8
1207	adde	r10,r10,r9
1208	addze	r11,r0
1209					#mul_add_c(a[1],b[4],c3,c1,c2);
1210	$LD	r6,`1*$BNSZ`(r4)
1211	$LD	r7,`4*$BNSZ`(r5)
1212	$UMULL	r8,r6,r7
1213	$UMULH	r9,r6,r7
1214	addc	r12,r12,r8
1215	adde	r10,r10,r9
1216	addze	r11,r11
1217					#mul_add_c(a[2],b[3],c3,c1,c2);
1218	$LD	r6,`2*$BNSZ`(r4)
1219	$LD	r7,`3*$BNSZ`(r5)
1220	$UMULL	r8,r6,r7
1221	$UMULH	r9,r6,r7
1222	addc	r12,r12,r8
1223	adde	r10,r10,r9
1224	addze	r11,r11
1225					#mul_add_c(a[3],b[2],c3,c1,c2);
1226	$LD	r6,`3*$BNSZ`(r4)
1227	$LD	r7,`2*$BNSZ`(r5)
1228	$UMULL	r8,r6,r7
1229	$UMULH	r9,r6,r7
1230	addc	r12,r12,r8
1231	adde	r10,r10,r9
1232	addze	r11,r11
1233					#mul_add_c(a[4],b[1],c3,c1,c2);
1234	$LD	r6,`4*$BNSZ`(r4)
1235	$LD	r7,`1*$BNSZ`(r5)
1236	$UMULL	r8,r6,r7
1237	$UMULH	r9,r6,r7
1238	addc	r12,r12,r8
1239	adde	r10,r10,r9
1240	addze	r11,r11
1241					#mul_add_c(a[5],b[0],c3,c1,c2);
1242	$LD	r6,`5*$BNSZ`(r4)
1243	$LD	r7,`0*$BNSZ`(r5)
1244	$UMULL	r8,r6,r7
1245	$UMULH	r9,r6,r7
1246	addc	r12,r12,r8
1247	adde	r10,r10,r9
1248	addze	r11,r11
1249	$ST	r12,`5*$BNSZ`(r3)	#r[5]=c3;
1250					#mul_add_c(a[6],b[0],c1,c2,c3);
1251	$LD	r6,`6*$BNSZ`(r4)
1252	$UMULL	r8,r6,r7
1253	$UMULH	r9,r6,r7
1254	addc	r10,r10,r8
1255	adde	r11,r11,r9
1256	addze	r12,r0
1257					#mul_add_c(a[5],b[1],c1,c2,c3);
1258	$LD	r6,`5*$BNSZ`(r4)
1259	$LD	r7,`1*$BNSZ`(r5)
1260	$UMULL	r8,r6,r7
1261	$UMULH	r9,r6,r7
1262	addc	r10,r10,r8
1263	adde	r11,r11,r9
1264	addze	r12,r12
1265					#mul_add_c(a[4],b[2],c1,c2,c3);
1266	$LD	r6,`4*$BNSZ`(r4)
1267	$LD	r7,`2*$BNSZ`(r5)
1268	$UMULL	r8,r6,r7
1269	$UMULH	r9,r6,r7
1270	addc	r10,r10,r8
1271	adde	r11,r11,r9
1272	addze	r12,r12
1273					#mul_add_c(a[3],b[3],c1,c2,c3);
1274	$LD	r6,`3*$BNSZ`(r4)
1275	$LD	r7,`3*$BNSZ`(r5)
1276	$UMULL	r8,r6,r7
1277	$UMULH	r9,r6,r7
1278	addc	r10,r10,r8
1279	adde	r11,r11,r9
1280	addze	r12,r12
1281					#mul_add_c(a[2],b[4],c1,c2,c3);
1282	$LD	r6,`2*$BNSZ`(r4)
1283	$LD	r7,`4*$BNSZ`(r5)
1284	$UMULL	r8,r6,r7
1285	$UMULH	r9,r6,r7
1286	addc	r10,r10,r8
1287	adde	r11,r11,r9
1288	addze	r12,r12
1289					#mul_add_c(a[1],b[5],c1,c2,c3);
1290	$LD	r6,`1*$BNSZ`(r4)
1291	$LD	r7,`5*$BNSZ`(r5)
1292	$UMULL	r8,r6,r7
1293	$UMULH	r9,r6,r7
1294	addc	r10,r10,r8
1295	adde	r11,r11,r9
1296	addze	r12,r12
1297					#mul_add_c(a[0],b[6],c1,c2,c3);
1298	$LD	r6,`0*$BNSZ`(r4)
1299	$LD	r7,`6*$BNSZ`(r5)
1300	$UMULL	r8,r6,r7
1301	$UMULH	r9,r6,r7
1302	addc	r10,r10,r8
1303	adde	r11,r11,r9
1304	addze	r12,r12
1305	$ST	r10,`6*$BNSZ`(r3)	#r[6]=c1;
1306					#mul_add_c(a[0],b[7],c2,c3,c1);
1307	$LD	r7,`7*$BNSZ`(r5)
1308	$UMULL	r8,r6,r7
1309	$UMULH	r9,r6,r7
1310	addc	r11,r11,r8
1311	adde	r12,r12,r9
1312	addze	r10,r0
1313					#mul_add_c(a[1],b[6],c2,c3,c1);
1314	$LD	r6,`1*$BNSZ`(r4)
1315	$LD	r7,`6*$BNSZ`(r5)
1316	$UMULL	r8,r6,r7
1317	$UMULH	r9,r6,r7
1318	addc	r11,r11,r8
1319	adde	r12,r12,r9
1320	addze	r10,r10
1321					#mul_add_c(a[2],b[5],c2,c3,c1);
1322	$LD	r6,`2*$BNSZ`(r4)
1323	$LD	r7,`5*$BNSZ`(r5)
1324	$UMULL	r8,r6,r7
1325	$UMULH	r9,r6,r7
1326	addc	r11,r11,r8
1327	adde	r12,r12,r9
1328	addze	r10,r10
1329					#mul_add_c(a[3],b[4],c2,c3,c1);
1330	$LD	r6,`3*$BNSZ`(r4)
1331	$LD	r7,`4*$BNSZ`(r5)
1332	$UMULL	r8,r6,r7
1333	$UMULH	r9,r6,r7
1334	addc	r11,r11,r8
1335	adde	r12,r12,r9
1336	addze	r10,r10
1337					#mul_add_c(a[4],b[3],c2,c3,c1);
1338	$LD	r6,`4*$BNSZ`(r4)
1339	$LD	r7,`3*$BNSZ`(r5)
1340	$UMULL	r8,r6,r7
1341	$UMULH	r9,r6,r7
1342	addc	r11,r11,r8
1343	adde	r12,r12,r9
1344	addze	r10,r10
1345					#mul_add_c(a[5],b[2],c2,c3,c1);
1346	$LD	r6,`5*$BNSZ`(r4)
1347	$LD	r7,`2*$BNSZ`(r5)
1348	$UMULL	r8,r6,r7
1349	$UMULH	r9,r6,r7
1350	addc	r11,r11,r8
1351	adde	r12,r12,r9
1352	addze	r10,r10
1353					#mul_add_c(a[6],b[1],c2,c3,c1);
1354	$LD	r6,`6*$BNSZ`(r4)
1355	$LD	r7,`1*$BNSZ`(r5)
1356	$UMULL	r8,r6,r7
1357	$UMULH	r9,r6,r7
1358	addc	r11,r11,r8
1359	adde	r12,r12,r9
1360	addze	r10,r10
1361					#mul_add_c(a[7],b[0],c2,c3,c1);
1362	$LD	r6,`7*$BNSZ`(r4)
1363	$LD	r7,`0*$BNSZ`(r5)
1364	$UMULL	r8,r6,r7
1365	$UMULH	r9,r6,r7
1366	addc	r11,r11,r8
1367	adde	r12,r12,r9
1368	addze	r10,r10
1369	$ST	r11,`7*$BNSZ`(r3)	#r[7]=c2;
1370					#mul_add_c(a[7],b[1],c3,c1,c2);
1371	$LD	r7,`1*$BNSZ`(r5)
1372	$UMULL	r8,r6,r7
1373	$UMULH	r9,r6,r7
1374	addc	r12,r12,r8
1375	adde	r10,r10,r9
1376	addze	r11,r0
1377					#mul_add_c(a[6],b[2],c3,c1,c2);
1378	$LD	r6,`6*$BNSZ`(r4)
1379	$LD	r7,`2*$BNSZ`(r5)
1380	$UMULL	r8,r6,r7
1381	$UMULH	r9,r6,r7
1382	addc	r12,r12,r8
1383	adde	r10,r10,r9
1384	addze	r11,r11
1385					#mul_add_c(a[5],b[3],c3,c1,c2);
1386	$LD	r6,`5*$BNSZ`(r4)
1387	$LD	r7,`3*$BNSZ`(r5)
1388	$UMULL	r8,r6,r7
1389	$UMULH	r9,r6,r7
1390	addc	r12,r12,r8
1391	adde	r10,r10,r9
1392	addze	r11,r11
1393					#mul_add_c(a[4],b[4],c3,c1,c2);
1394	$LD	r6,`4*$BNSZ`(r4)
1395	$LD	r7,`4*$BNSZ`(r5)
1396	$UMULL	r8,r6,r7
1397	$UMULH	r9,r6,r7
1398	addc	r12,r12,r8
1399	adde	r10,r10,r9
1400	addze	r11,r11
1401					#mul_add_c(a[3],b[5],c3,c1,c2);
1402	$LD	r6,`3*$BNSZ`(r4)
1403	$LD	r7,`5*$BNSZ`(r5)
1404	$UMULL	r8,r6,r7
1405	$UMULH	r9,r6,r7
1406	addc	r12,r12,r8
1407	adde	r10,r10,r9
1408	addze	r11,r11
1409					#mul_add_c(a[2],b[6],c3,c1,c2);
1410	$LD	r6,`2*$BNSZ`(r4)
1411	$LD	r7,`6*$BNSZ`(r5)
1412	$UMULL	r8,r6,r7
1413	$UMULH	r9,r6,r7
1414	addc	r12,r12,r8
1415	adde	r10,r10,r9
1416	addze	r11,r11
1417					#mul_add_c(a[1],b[7],c3,c1,c2);
1418	$LD	r6,`1*$BNSZ`(r4)
1419	$LD	r7,`7*$BNSZ`(r5)
1420	$UMULL	r8,r6,r7
1421	$UMULH	r9,r6,r7
1422	addc	r12,r12,r8
1423	adde	r10,r10,r9
1424	addze	r11,r11
1425	$ST	r12,`8*$BNSZ`(r3)	#r[8]=c3;
1426					#mul_add_c(a[2],b[7],c1,c2,c3);
1427	$LD	r6,`2*$BNSZ`(r4)
1428	$UMULL	r8,r6,r7
1429	$UMULH	r9,r6,r7
1430	addc	r10,r10,r8
1431	adde	r11,r11,r9
1432	addze	r12,r0
1433					#mul_add_c(a[3],b[6],c1,c2,c3);
1434	$LD	r6,`3*$BNSZ`(r4)
1435	$LD	r7,`6*$BNSZ`(r5)
1436	$UMULL	r8,r6,r7
1437	$UMULH	r9,r6,r7
1438	addc	r10,r10,r8
1439	adde	r11,r11,r9
1440	addze	r12,r12
1441					#mul_add_c(a[4],b[5],c1,c2,c3);
1442	$LD	r6,`4*$BNSZ`(r4)
1443	$LD	r7,`5*$BNSZ`(r5)
1444	$UMULL	r8,r6,r7
1445	$UMULH	r9,r6,r7
1446	addc	r10,r10,r8
1447	adde	r11,r11,r9
1448	addze	r12,r12
1449					#mul_add_c(a[5],b[4],c1,c2,c3);
1450	$LD	r6,`5*$BNSZ`(r4)
1451	$LD	r7,`4*$BNSZ`(r5)
1452	$UMULL	r8,r6,r7
1453	$UMULH	r9,r6,r7
1454	addc	r10,r10,r8
1455	adde	r11,r11,r9
1456	addze	r12,r12
1457					#mul_add_c(a[6],b[3],c1,c2,c3);
1458	$LD	r6,`6*$BNSZ`(r4)
1459	$LD	r7,`3*$BNSZ`(r5)
1460	$UMULL	r8,r6,r7
1461	$UMULH	r9,r6,r7
1462	addc	r10,r10,r8
1463	adde	r11,r11,r9
1464	addze	r12,r12
1465					#mul_add_c(a[7],b[2],c1,c2,c3);
1466	$LD	r6,`7*$BNSZ`(r4)
1467	$LD	r7,`2*$BNSZ`(r5)
1468	$UMULL	r8,r6,r7
1469	$UMULH	r9,r6,r7
1470	addc	r10,r10,r8
1471	adde	r11,r11,r9
1472	addze	r12,r12
1473	$ST	r10,`9*$BNSZ`(r3)	#r[9]=c1;
1474					#mul_add_c(a[7],b[3],c2,c3,c1);
1475	$LD	r7,`3*$BNSZ`(r5)
1476	$UMULL	r8,r6,r7
1477	$UMULH	r9,r6,r7
1478	addc	r11,r11,r8
1479	adde	r12,r12,r9
1480	addze	r10,r0
1481					#mul_add_c(a[6],b[4],c2,c3,c1);
1482	$LD	r6,`6*$BNSZ`(r4)
1483	$LD	r7,`4*$BNSZ`(r5)
1484	$UMULL	r8,r6,r7
1485	$UMULH	r9,r6,r7
1486	addc	r11,r11,r8
1487	adde	r12,r12,r9
1488	addze	r10,r10
1489					#mul_add_c(a[5],b[5],c2,c3,c1);
1490	$LD	r6,`5*$BNSZ`(r4)
1491	$LD	r7,`5*$BNSZ`(r5)
1492	$UMULL	r8,r6,r7
1493	$UMULH	r9,r6,r7
1494	addc	r11,r11,r8
1495	adde	r12,r12,r9
1496	addze	r10,r10
1497					#mul_add_c(a[4],b[6],c2,c3,c1);
1498	$LD	r6,`4*$BNSZ`(r4)
1499	$LD	r7,`6*$BNSZ`(r5)
1500	$UMULL	r8,r6,r7
1501	$UMULH	r9,r6,r7
1502	addc	r11,r11,r8
1503	adde	r12,r12,r9
1504	addze	r10,r10
1505					#mul_add_c(a[3],b[7],c2,c3,c1);
1506	$LD	r6,`3*$BNSZ`(r4)
1507	$LD	r7,`7*$BNSZ`(r5)
1508	$UMULL	r8,r6,r7
1509	$UMULH	r9,r6,r7
1510	addc	r11,r11,r8
1511	adde	r12,r12,r9
1512	addze	r10,r10
1513	$ST	r11,`10*$BNSZ`(r3)	#r[10]=c2;
1514					#mul_add_c(a[4],b[7],c3,c1,c2);
1515	$LD	r6,`4*$BNSZ`(r4)
1516	$UMULL	r8,r6,r7
1517	$UMULH	r9,r6,r7
1518	addc	r12,r12,r8
1519	adde	r10,r10,r9
1520	addze	r11,r0
1521					#mul_add_c(a[5],b[6],c3,c1,c2);
1522	$LD	r6,`5*$BNSZ`(r4)
1523	$LD	r7,`6*$BNSZ`(r5)
1524	$UMULL	r8,r6,r7
1525	$UMULH	r9,r6,r7
1526	addc	r12,r12,r8
1527	adde	r10,r10,r9
1528	addze	r11,r11
1529					#mul_add_c(a[6],b[5],c3,c1,c2);
1530	$LD	r6,`6*$BNSZ`(r4)
1531	$LD	r7,`5*$BNSZ`(r5)
1532	$UMULL	r8,r6,r7
1533	$UMULH	r9,r6,r7
1534	addc	r12,r12,r8
1535	adde	r10,r10,r9
1536	addze	r11,r11
1537					#mul_add_c(a[7],b[4],c3,c1,c2);
1538	$LD	r6,`7*$BNSZ`(r4)
1539	$LD	r7,`4*$BNSZ`(r5)
1540	$UMULL	r8,r6,r7
1541	$UMULH	r9,r6,r7
1542	addc	r12,r12,r8
1543	adde	r10,r10,r9
1544	addze	r11,r11
1545	$ST	r12,`11*$BNSZ`(r3)	#r[11]=c3;
1546					#mul_add_c(a[7],b[5],c1,c2,c3);
1547	$LD	r7,`5*$BNSZ`(r5)
1548	$UMULL	r8,r6,r7
1549	$UMULH	r9,r6,r7
1550	addc	r10,r10,r8
1551	adde	r11,r11,r9
1552	addze	r12,r0
1553					#mul_add_c(a[6],b[6],c1,c2,c3);
1554	$LD	r6,`6*$BNSZ`(r4)
1555	$LD	r7,`6*$BNSZ`(r5)
1556	$UMULL	r8,r6,r7
1557	$UMULH	r9,r6,r7
1558	addc	r10,r10,r8
1559	adde	r11,r11,r9
1560	addze	r12,r12
1561					#mul_add_c(a[5],b[7],c1,c2,c3);
1562	$LD	r6,`5*$BNSZ`(r4)
1563	$LD	r7,`7*$BNSZ`(r5)
1564	$UMULL	r8,r6,r7
1565	$UMULH	r9,r6,r7
1566	addc	r10,r10,r8
1567	adde	r11,r11,r9
1568	addze	r12,r12
1569	$ST	r10,`12*$BNSZ`(r3)	#r[12]=c1;
1570					#mul_add_c(a[6],b[7],c2,c3,c1);
1571	$LD	r6,`6*$BNSZ`(r4)
1572	$UMULL	r8,r6,r7
1573	$UMULH	r9,r6,r7
1574	addc	r11,r11,r8
1575	adde	r12,r12,r9
1576	addze	r10,r0
1577					#mul_add_c(a[7],b[6],c2,c3,c1);
1578	$LD	r6,`7*$BNSZ`(r4)
1579	$LD	r7,`6*$BNSZ`(r5)
1580	$UMULL	r8,r6,r7
1581	$UMULH	r9,r6,r7
1582	addc	r11,r11,r8
1583	adde	r12,r12,r9
1584	addze	r10,r10
1585	$ST	r11,`13*$BNSZ`(r3)	#r[13]=c2;
1586					#mul_add_c(a[7],b[7],c3,c1,c2);
1587	$LD	r7,`7*$BNSZ`(r5)
1588	$UMULL	r8,r6,r7
1589	$UMULH	r9,r6,r7
1590	addc	r12,r12,r8
1591	adde	r10,r10,r9
1592	$ST	r12,`14*$BNSZ`(r3)	#r[14]=c3;
1593	$ST	r10,`15*$BNSZ`(r3)	#r[15]=c1;
1594	bclr	BO_ALWAYS,CR0_LT
1595	.long	0x00000000
1596
1597#
1598#	NOTE:	The following label name should be changed to
1599#		"bn_sub_words" i.e. remove the first dot
1600#		for the gcc compiler. This should be automatically
1601#		done in the build
1602#
1603#
1604.align	4
1605.bn_sub_words:
1606#
1607#	Handcoded version of bn_sub_words: computes r[i] = a[i] - b[i]
1608#	for i = 0 .. n-1, chaining the borrow through the carry bit.
1609#BN_ULONG bn_sub_words(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b, int n)
1610#	Returns (in r3) 1 if a borrow is left after the last word, else 0.
1611#	r3 = r
1612#	r4 = a
1613#	r5 = b
1614#	r6 = n (reused as the per-word difference once n is in CTR)
1615#	r7,r8 = a[i],b[i]; r0 = constant zero
1616#       Note:	No loop unrolling done since this is not a performance
1617#               critical loop.
1618

1619	xor	r0,r0,r0	#set r0 = 0
1620#
1621#	check for r6 = 0 AND set carry bit.
1622#
1623	subfc.	r7,r0,r6        # If r6 is 0 then result is 0.
1624				# if r6 > 0 then result !=0
1625				# In either case carry bit is set.
1626	bc	BO_IF,CR0_EQ,Lppcasm_sub_adios	# n == 0: skip the loop, result is 0
1627	addi	r4,r4,-$BNSZ	# bias a, r and b one word back; the LDU/STU
1628	addi	r3,r3,-$BNSZ	# accesses below address offset BNSZ and are
1629	addi	r5,r5,-$BNSZ	# presumably update-form (defined outside this view)
1630	mtctr	r6	# CTR = n, the loop trip count
1631Lppcasm_sub_mainloop:
1632	$LDU	r7,$BNSZ(r4)	# r7 = next a word
1633	$LDU	r8,$BNSZ(r5)	# r8 = next b word
1634	subfe	r6,r8,r7	# r6 = r7+carry bit + onescomplement(r8)
1635				# if carry = 1 this is r7-r8. Else it
1636				# is r7-r8 -1 as we need.
1637	$STU	r6,$BNSZ(r3)	# store the difference as the next r word
1638	bc	BO_dCTR_NZERO,CR0_EQ,Lppcasm_sub_mainloop	# loop while words remain
1639Lppcasm_sub_adios:
1640	subfze	r3,r0		# if carry bit is set then r3 = 0 else -1
1641	andi.	r3,r3,1         # keep only last bit.
1642	bclr	BO_ALWAYS,CR0_LT	# return to caller
1643	.long	0x00000000
1644
1645
1646#
1647#	NOTE:	The following label name should be changed to
1648#		"bn_add_words" i.e. remove the first dot
1649#		for the gcc compiler. This should be automatically
1650#		done in the build
1651#
1652
1653.align	4
1654.bn_add_words:
1655#
1656#	Handcoded version of bn_add_words: computes r[i] = a[i] + b[i]
1657#	for i = 0 .. n-1, chaining the carry bit from word to word.
1658#BN_ULONG bn_add_words(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b, int n)
1659#	Returns (in r3) the carry out of the last word: 0 or 1.
1660#	r3 = r
1661#	r4 = a
1662#	r5 = b
1663#	r6 = n
1664#	r7,r8 = a[i],b[i] (r8 also receives the sum); r0 = constant zero
1665#       Note:	No loop unrolling done since this is not a performance
1666#               critical loop.
1667

1668	xor	r0,r0,r0	# r0 = 0, used to read back the carry at the end
1669#
1670#	check for r6 = 0 so that an empty input skips the loop. The same
1671#	instruction also clears the carry bit ahead of the first adde.
1672	addic.	r6,r6,0		#test r6 and clear carry bit.
1673	bc	BO_IF,CR0_EQ,Lppcasm_add_adios	# n == 0: skip the loop, result is 0
1674	addi	r4,r4,-$BNSZ	# bias a, r and b one word back; the LDU/STU
1675	addi	r3,r3,-$BNSZ	# accesses below address offset BNSZ and are
1676	addi	r5,r5,-$BNSZ	# presumably update-form (defined outside this view)
1677	mtctr	r6	# CTR = n, the loop trip count
1678Lppcasm_add_mainloop:
1679	$LDU	r7,$BNSZ(r4)	# r7 = next a word
1680	$LDU	r8,$BNSZ(r5)	# r8 = next b word
1681	adde	r8,r7,r8	# r8 = r7 + r8 + carry in; carry out feeds the next word
1682	$STU	r8,$BNSZ(r3)	# store the sum as the next r word
1683	bc	BO_dCTR_NZERO,CR0_EQ,Lppcasm_add_mainloop	# loop while words remain
1684Lppcasm_add_adios:
1685	addze	r3,r0			#return carry bit.
1686	bclr	BO_ALWAYS,CR0_LT	# return to caller
1687	.long	0x00000000
1688
1689#
1690#	NOTE:	The following label name should be changed to
1691#		"bn_div_words" i.e. remove the first dot
1692#		for the gcc compiler. This should be automatically
1693#		done in the build
1694#
1695
1696.align	4
1697.bn_div_words:
1698#	Divides the two-word value h:l by d; the quotient comes back in r3.
1699#	This is a cleaned up version of code generated by
1700#	the AIX compiler. The only optimization is to use
1701#	the PPC instruction to count leading zeros instead
1702#	of call to num_bits_word. Since this was compiled
1703#	only at level -O2 we can possibly squeeze it more?
1704#	A zero divisor returns -1 (all ones) without dividing.
1705#	r3 = h
1706#	r4 = l
1707#	r5 = d
1708

1709	$UCMPI	0,r5,0			# compare r5 and 0
1710	bc	BO_IF_NOT,CR0_EQ,Lppcasm_div1	# proceed if d!=0
1711	li	r3,-1			# d=0 return -1
1712	bclr	BO_ALWAYS,CR0_LT	# early return for d == 0
1713Lppcasm_div1:
1714	xor	r0,r0,r0		#r0=0
1715	li	r8,$BITS
1716	$CNTLZ.	r7,r5			#r7 = num leading 0s in d.
1717	bc	BO_IF,CR0_EQ,Lppcasm_div2	#proceed if no leading zeros
1718	subf	r8,r7,r8		#r8 = BN_num_bits_word(d)
1719	$SHR.	r9,r3,r8		#are there any bits above r8'th?
1720	$TR	16,r9,r0		#if there're, signal to dump core...
					# NOTE(review): a trap TO value of 16 selects the
					# signed less-than condition, and r9 is a value
					# shifted right by 1 or more compared with zero,
					# so this check may never fire -- verify against
					# the TR macro definition (outside this view).
1721Lppcasm_div2:
1722	$UCMP	0,r3,r5			#h>=d?
1723	bc	BO_IF,CR0_LT,Lppcasm_div3	#goto Lppcasm_div3 if not
1724	subf	r3,r5,r3		#h-=d ;
1725Lppcasm_div3:				#r7 = BN_BITS2 - num_bits(d), i.e. the shift count i
1726	cmpi	0,0,r7,0		# is (i == 0)?
1727	bc	BO_IF,CR0_EQ,Lppcasm_div4	# d already normalized: skip the shifts
1728	$SHL	r3,r3,r7		# h = (h<< i)
1729	$SHR	r8,r4,r8		# r8 = (l >> BN_BITS2 -i)
1730	$SHL	r5,r5,r7		# d<<=i
1731	or	r3,r3,r8		# h = (h<<i)|(l>>(BN_BITS2-i))
1732	$SHL	r4,r4,r7		# l <<=i
1733Lppcasm_div4:
1734	$SHRI	r9,r5,`$BITS/2`		# r9 = dh
1735					# dl will be computed when needed
1736					# as it saves registers.
1737	li	r6,2			#r6=2
1738	mtctr	r6			#counter will be in count.
1739Lppcasm_divouterloop:
1740	$SHRI	r8,r3,`$BITS/2`		#r8 = (h>>BN_BITS4)
1741	$SHRI	r11,r4,`$BITS/2`	#r11= (l&BN_MASK2h)>>BN_BITS4
1742					# compute here for innerloop.
1743	$UCMP	0,r8,r9			# is (h>>BN_BITS4)==dh
1744	bc	BO_IF_NOT,CR0_EQ,Lppcasm_div5	# goto Lppcasm_div5 if not
1745

1746	li	r8,-1
1747	$CLRU	r8,r8,`$BITS/2`		#q = BN_MASK2l
1748	b	Lppcasm_div6
1749Lppcasm_div5:
1750	$UDIV	r8,r3,r9		#q = h/dh
1751Lppcasm_div6:
1752	$UMULL	r12,r9,r8		#th = q*dh
1753	$CLRU	r10,r5,`$BITS/2`	#r10=dl
1754	$UMULL	r6,r8,r10		#tl = q*dl
1755

1756Lppcasm_divinnerloop:
1757	subf	r10,r12,r3		#t = h -th
1758	$SHRI	r7,r10,`$BITS/2`	#r7= (t &BN_MASK2H), sort of...
1759	addic.	r7,r7,0			#test if r7 == 0. used below.
1760					# now want to compute
1761					# r7 = (t<<BN_BITS4)|((l&BN_MASK2h)>>BN_BITS4)
1762					# the following 2 instructions do that
1763	$SHLI	r7,r10,`$BITS/2`	# r7 = (t<<BN_BITS4)
1764	or	r7,r7,r11		# r7|=((l&BN_MASK2h)>>BN_BITS4)
1765	$UCMP	1,r6,r7			# compare (tl <= r7)
1766	bc	BO_IF_NOT,CR0_EQ,Lppcasm_divinnerexit	# exit if the upper half of t is nonzero (CR0 from addic. above)
1767	bc	BO_IF_NOT,CR1_FEX,Lppcasm_divinnerexit	# exit if tl <= r7 (CR1 from the compare above)
1768	addi	r8,r8,-1		#q--
1769	subf	r12,r9,r12		#th -=dh
1770	$CLRU	r10,r5,`$BITS/2`	#r10=dl. t is no longer needed in loop.
1771	subf	r6,r10,r6		#tl -=dl
1772	b	Lppcasm_divinnerloop
1773Lppcasm_divinnerexit:
1774	$SHRI	r10,r6,`$BITS/2`	#t=(tl>>BN_BITS4)
1775	$SHLI	r11,r6,`$BITS/2`	#tl=(tl<<BN_BITS4)&BN_MASK2h;
1776	$UCMP	1,r4,r11		# compare l and tl
1777	add	r12,r12,r10		# th+=t
1778	bc	BO_IF_NOT,CR1_FX,Lppcasm_div7  # if (l>=tl) goto Lppcasm_div7
1779	addi	r12,r12,1		# th++
1780Lppcasm_div7:
1781	subf	r11,r11,r4		#r11=l-tl
1782	$UCMP	1,r3,r12		#compare h and th
1783	bc	BO_IF_NOT,CR1_FX,Lppcasm_div8	#if (h>=th) goto Lppcasm_div8
1784	addi	r8,r8,-1		# q--
1785	add	r3,r5,r3		# h+=d
1786Lppcasm_div8:
1787	subf	r12,r12,r3		#r12 = h-th
1788	$SHLI	r4,r11,`$BITS/2`	#l=(l&BN_MASK2l)<<BN_BITS4
1789					# want to compute
1790					# h = ((h<<BN_BITS4)|(l>>BN_BITS4))&BN_MASK2
1791					# the following 2 instructions will do this.
1792	$INSR	r11,r12,`$BITS/2`,`$BITS/2`	# r11 is the value we want rotated $BITS/2.
1793	$ROTL	r3,r11,`$BITS/2`	# rotate by $BITS/2 and store in r3
1794	bc	BO_dCTR_ZERO,CR0_EQ,Lppcasm_div9#if (count==0) break ;
1795	$SHLI	r0,r8,`$BITS/2`		#ret =q<<BN_BITS4
1796	b	Lppcasm_divouterloop
1797Lppcasm_div9:
1798	or	r3,r8,r0		# result = upper-half q from pass 1 (r0) | lower-half q from pass 2 (r8)
1799	bclr	BO_ALWAYS,CR0_LT	# return to caller
1800	.long	0x00000000
1801
1802#
1803#	NOTE:	The following label name should be changed to
1804#		"bn_sqr_words" i.e. remove the first dot
1805#		for the gcc compiler. This should be automatically
1806#		done in the build
1807#
1808.align	4
1809.bn_sqr_words:
1810#
1811#	Optimized version of bn_sqr_words: for each i in 0 .. n-1 stores
1812#	the two-word square of a[i] into r[2*i] and r[2*i+1].
1813#	void bn_sqr_words(BN_ULONG *r, BN_ULONG *a, int n)
1814#	Nothing is returned; n == 0 returns without touching r.
1815#	r3 = r
1816#	r4 = a
1817#	r5 = n
1818#
1819#	r6 = a[i].
1820#	r7,r8 = product (r7 from UMULL is stored first, r8 from UMULH second).
1821#
1822#	No unrolling done here. Not performance critical.
1823

1824	addic.	r5,r5,0			#test r5.
1825	bc	BO_IF,CR0_EQ,Lppcasm_sqr_adios	# n == 0: nothing to do
1826	addi	r4,r4,-$BNSZ	# bias a and r one word back; the LDU/STU accesses
1827	addi	r3,r3,-$BNSZ	# below are presumably update-form (defined outside this view)
1828	mtctr	r5	# CTR = n, the loop trip count
1829Lppcasm_sqr_mainloop:
1830					#sqr(r[0],r[1],a[0]);
1831	$LDU	r6,$BNSZ(r4)
1832	$UMULL	r7,r6,r6
1833	$UMULH  r8,r6,r6
1834	$STU	r7,$BNSZ(r3)
1835	$STU	r8,$BNSZ(r3)
1836	bc	BO_dCTR_NZERO,CR0_EQ,Lppcasm_sqr_mainloop	# loop while words remain
1837Lppcasm_sqr_adios:
1838	bclr	BO_ALWAYS,CR0_LT	# return to caller
1839	.long	0x00000000
1840
1841
1842#
1843#	NOTE:	The following label name should be changed to
1844#		"bn_mul_words" i.e. remove the first dot
1845#		for the gcc compiler. This should be automatically
1846#		done in the build
1847#
1848
1849.align	4
1850.bn_mul_words:
1851# Stores the low word of (ap[i]*w + carry) into rp[i] for i = 0 .. num-1.
1852# BN_ULONG bn_mul_words(BN_ULONG *rp, BN_ULONG *ap, int num, BN_ULONG w)
1853# Returns the final carry word in r3. Four words per main-loop pass.
1854# r3 = rp
1855# r4 = ap
1856# r5 = num
1857# r6 = w
1858	xor	r0,r0,r0		# r0 = 0 (not referenced again in this routine)
1859	xor	r12,r12,r12		# used for carry
1860	rlwinm.	r7,r5,30,2,31		# num >> 2
1861	bc	BO_IF,CR0_EQ,Lppcasm_mw_REM	# fewer than 4 words: leftover path only
1862	mtctr	r7	# CTR = number of 4-word passes
1863Lppcasm_mw_LOOP:
1864					#mul(rp[0],ap[0],w,c1);
1865	$LD	r8,`0*$BNSZ`(r4)
1866	$UMULL	r9,r6,r8
1867	$UMULH  r10,r6,r8
1868	addc	r9,r9,r12
1869	#addze	r10,r10			#carry is NOT ignored.
1870					#will be taken care of
1871					#in second spin below
1872					#using adde.
1873	$ST	r9,`0*$BNSZ`(r3)
1874					#mul(rp[1],ap[1],w,c1);
1875	$LD	r8,`1*$BNSZ`(r4)
1876	$UMULL	r11,r6,r8
1877	$UMULH  r12,r6,r8
1878	adde	r11,r11,r10	# adds the previous high word plus the pending carry bit
1879	#addze	r12,r12
1880	$ST	r11,`1*$BNSZ`(r3)
1881					#mul(rp[2],ap[2],w,c1);
1882	$LD	r8,`2*$BNSZ`(r4)
1883	$UMULL	r9,r6,r8
1884	$UMULH  r10,r6,r8
1885	adde	r9,r9,r12
1886	#addze	r10,r10
1887	$ST	r9,`2*$BNSZ`(r3)
1888					#mul(rp[3],ap[3],w,c1);
1889	$LD	r8,`3*$BNSZ`(r4)
1890	$UMULL	r11,r6,r8
1891	$UMULH  r12,r6,r8
1892	adde	r11,r11,r10
1893	addze	r12,r12			#this spin we collect carry into
1894					#r12
1895	$ST	r11,`3*$BNSZ`(r3)
1896

1897	addi	r3,r3,`4*$BNSZ`	# advance rp by four words
1898	addi	r4,r4,`4*$BNSZ`	# advance ap by four words
1899	bc	BO_dCTR_NZERO,CR0_EQ,Lppcasm_mw_LOOP
1900

1901Lppcasm_mw_REM:
1902	andi.	r5,r5,0x3	# r5 = num mod 4, the leftover word count
1903	bc	BO_IF,CR0_EQ,Lppcasm_mw_OVER
1904					#mul(rp[0],ap[0],w,c1);
1905	$LD	r8,`0*$BNSZ`(r4)
1906	$UMULL	r9,r6,r8
1907	$UMULH  r10,r6,r8
1908	addc	r9,r9,r12
1909	addze	r10,r10
1910	$ST	r9,`0*$BNSZ`(r3)
1911	addi	r12,r10,0	# carry for the next word = high word
1912

1913	addi	r5,r5,-1
1914	cmpli	0,0,r5,0
1915	bc	BO_IF,CR0_EQ,Lppcasm_mw_OVER	# done if that was the last leftover word
1916

1917

1918					#mul(rp[1],ap[1],w,c1);
1919	$LD	r8,`1*$BNSZ`(r4)
1920	$UMULL	r9,r6,r8
1921	$UMULH  r10,r6,r8
1922	addc	r9,r9,r12
1923	addze	r10,r10
1924	$ST	r9,`1*$BNSZ`(r3)
1925	addi	r12,r10,0	# carry for the next word = high word
1926

1927	addi	r5,r5,-1
1928	cmpli	0,0,r5,0
1929	bc	BO_IF,CR0_EQ,Lppcasm_mw_OVER	# done if that was the last leftover word
1930

1931					#mul(rp[2],ap[2],w,c1);
1932	$LD	r8,`2*$BNSZ`(r4)
1933	$UMULL	r9,r6,r8
1934	$UMULH  r10,r6,r8
1935	addc	r9,r9,r12
1936	addze	r10,r10
1937	$ST	r9,`2*$BNSZ`(r3)
1938	addi	r12,r10,0	# final carry word
1939

1940Lppcasm_mw_OVER:
1941	addi	r3,r12,0	# return value = carry word
1942	bclr	BO_ALWAYS,CR0_LT	# return to caller
1943	.long	0x00000000
1944
1945#
1946#	NOTE:	The following label name should be changed to
1947#		"bn_mul_add_words" i.e. remove the first dot
1948#		for the gcc compiler. This should be automatically
1949#		done in the build
1950#
1951
1952.align	4
1953.bn_mul_add_words:
1954# Stores the low word of (rp[i] + ap[i]*w + carry) into rp[i], i = 0 .. num-1.
1955# BN_ULONG bn_mul_add_words(BN_ULONG *rp, BN_ULONG *ap, int num, BN_ULONG w)
1956# Returns the final carry word in r3.
1957# r3 = rp
1958# r4 = ap
1959# r5 = num
1960# r6 = w
1961#
1962# empirical evidence suggests that unrolled version performs best!!
1963#
1964	xor	r0,r0,r0		#r0 = 0 (not referenced again in this routine)
1965	xor	r12,r12,r12  		#r12 = 0 . used for carry
1966	rlwinm.	r7,r5,30,2,31		# num >> 2
1967	bc	BO_IF,CR0_EQ,Lppcasm_maw_leftover	# if (num < 4) go LPPCASM_maw_leftover
1968	mtctr	r7	# CTR = number of 4-word passes
1969Lppcasm_maw_mainloop:
1970					#mul_add(rp[0],ap[0],w,c1);
1971	$LD	r8,`0*$BNSZ`(r4)
1972	$LD	r11,`0*$BNSZ`(r3)
1973	$UMULL	r9,r6,r8
1974	$UMULH  r10,r6,r8
1975	addc	r9,r9,r12		#r12 is carry.
1976	addze	r10,r10
1977	addc	r9,r9,r11
1978	#addze	r10,r10
1979					#the above instruction addze
1980					#is NOT needed. Carry will NOT
1981					#be ignored. It's not affected
1982					#by multiply and will be collected
1983					#in the next spin
1984	$ST	r9,`0*$BNSZ`(r3)
1985

1986					#mul_add(rp[1],ap[1],w,c1);
1987	$LD	r8,`1*$BNSZ`(r4)
1988	$LD	r9,`1*$BNSZ`(r3)
1989	$UMULL	r11,r6,r8
1990	$UMULH  r12,r6,r8
1991	adde	r11,r11,r10		#r10 is carry.
1992	addze	r12,r12
1993	addc	r11,r11,r9
1994	#addze	r12,r12
1995	$ST	r11,`1*$BNSZ`(r3)
1996

1997					#mul_add(rp[2],ap[2],w,c1);
1998	$LD	r8,`2*$BNSZ`(r4)
1999	$UMULL	r9,r6,r8
2000	$LD	r11,`2*$BNSZ`(r3)
2001	$UMULH  r10,r6,r8
2002	adde	r9,r9,r12
2003	addze	r10,r10
2004	addc	r9,r9,r11
2005	#addze	r10,r10
2006	$ST	r9,`2*$BNSZ`(r3)
2007

2008					#mul_add(rp[3],ap[3],w,c1);
2009	$LD	r8,`3*$BNSZ`(r4)
2010	$UMULL	r11,r6,r8
2011	$LD	r9,`3*$BNSZ`(r3)
2012	$UMULH  r12,r6,r8
2013	adde	r11,r11,r10
2014	addze	r12,r12
2015	addc	r11,r11,r9
2016	addze	r12,r12	# last word of the pass: fold the pending carry bit into r12
2017	$ST	r11,`3*$BNSZ`(r3)
2018	addi	r3,r3,`4*$BNSZ`	# advance rp by four words
2019	addi	r4,r4,`4*$BNSZ`	# advance ap by four words
2020	bc	BO_dCTR_NZERO,CR0_EQ,Lppcasm_maw_mainloop
2021

2022Lppcasm_maw_leftover:
2023	andi.	r5,r5,0x3	# r5 = num mod 4, the leftover word count
2024	bc	BO_IF,CR0_EQ,Lppcasm_maw_adios
2025	addi	r3,r3,-$BNSZ	# bias rp and ap one word back for the LDU accesses
2026	addi	r4,r4,-$BNSZ	# below (presumably update-form, defined outside this view)
2027					#mul_add(rp[0],ap[0],w,c1);
2028	mtctr	r5	# CTR = leftover word count
2029	$LDU	r8,$BNSZ(r4)
2030	$UMULL	r9,r6,r8
2031	$UMULH  r10,r6,r8
2032	$LDU	r11,$BNSZ(r3)
2033	addc	r9,r9,r11
2034	addze	r10,r10
2035	addc	r9,r9,r12
2036	addze	r12,r10	# r12 = carry word for the next step
2037	$ST	r9,0(r3)
2038

2039	bc	BO_dCTR_ZERO,CR0_EQ,Lppcasm_maw_adios	# done if that was the last leftover word
2040					#mul_add(rp[1],ap[1],w,c1);
2041	$LDU	r8,$BNSZ(r4)
2042	$UMULL	r9,r6,r8
2043	$UMULH  r10,r6,r8
2044	$LDU	r11,$BNSZ(r3)
2045	addc	r9,r9,r11
2046	addze	r10,r10
2047	addc	r9,r9,r12
2048	addze	r12,r10	# r12 = carry word for the next step
2049	$ST	r9,0(r3)
2050

2051	bc	BO_dCTR_ZERO,CR0_EQ,Lppcasm_maw_adios	# done if that was the last leftover word
2052					#mul_add(rp[2],ap[2],w,c1);
2053	$LDU	r8,$BNSZ(r4)
2054	$UMULL	r9,r6,r8
2055	$UMULH  r10,r6,r8
2056	$LDU	r11,$BNSZ(r3)
2057	addc	r9,r9,r11
2058	addze	r10,r10
2059	addc	r9,r9,r12
2060	addze	r12,r10	# r12 = final carry word
2061	$ST	r9,0(r3)
2062

2063Lppcasm_maw_adios:
2064	addi	r3,r12,0	# return value = carry word
2065	bclr	BO_ALWAYS,CR0_LT	# return to caller
2066	.long	0x00000000
2067	.align	4
2068EOF
2069	$data =~ s/\`([^\`]*)\`/eval $1/gem;
2070
2071	# if some assembler chokes on some simplified mnemonic,
2072	# this is the spot to fix it up, e.g.:
2073	# GNU as doesn't seem to accept cmplw, 32-bit unsigned compare
2074	$data =~ s/^(\s*)cmplw(\s+)([^,]+),(.*)/$1cmpl$2$3,0,$4/gm;
2075	# assembler X doesn't accept li, load immediate value
2076	#$data =~ s/^(\s*)li(\s+)([^,]+),(.*)/$1addi$2$3,0,$4/gm;
2077	return($data);
2078}
2079