xref: /freebsd/crypto/openssl/crypto/poly1305/asm/poly1305-ppc.pl (revision b64c5a0ace59af62eff52bfe110a521dc73c937b)
1#! /usr/bin/env perl
2# Copyright 2016-2024 The OpenSSL Project Authors. All Rights Reserved.
3#
4# Licensed under the Apache License 2.0 (the "License").  You may not use
5# this file except in compliance with the License.  You can obtain a copy
6# in the file LICENSE in the source distribution or at
7# https://www.openssl.org/source/license.html
8
9#
10# ====================================================================
11# Written by Andy Polyakov, @dot-asm, initially for use in the OpenSSL
12# project. The module is dual licensed under OpenSSL and CRYPTOGAMS
13# licenses depending on where you obtain it. For further details see
14# https://github.com/dot-asm/cryptogams/.
15# ====================================================================
16#
17# This module implements Poly1305 hash for PowerPC.
18#
19# June 2015
20#
21# Numbers are cycles per processed byte with poly1305_blocks alone,
22# and improvement coefficients relative to gcc-generated code.
23#
24#			-m32		-m64
25#
26# Freescale e300	14.8/+80%	-
27# PPC74x0		7.60/+60%	-
28# PPC970		7.00/+114%	3.51/+205%
29# POWER7		3.75/+260%	1.93/+100%
30# POWER8		-		2.03/+200%
31# POWER9		-		2.00/+150%
32#
33# Do we need floating-point implementation for PPC? Results presented
34# in poly1305_ieee754.c are tricky to compare to, because they are for
35# compiler-generated code. On the other hand it's known that floating-
36# point performance can be dominated by FPU latency, which means that
37# there is a limit even for ideally optimized (and even vectorized) code.
38# And this limit is estimated to be higher than above -m64 results. Or
39# in other words floating-point implementation can be meaningful to
40# consider only in 32-bit application context. We probably have to
41# recognize that 32-bit builds are getting less popular on high-end
42# systems and therefore tend to target embedded ones, which might not
43# even have FPU...
44#
45# On a side note, Power ISA 2.07 enables vector base 2^26 implementation,
46# and POWER8 might have capacity to break 1.0 cycle per byte barrier...
47#
48# January 2019
49#
50# ... Unfortunately not:-( Estimate was a projection of ARM result,
51# but ARM has vector multiply-n-add instruction, while PowerISA does
52# not, not one usable in the context. Improvement is ~40% over -m64
53# result above and is ~1.43 on little-endian systems.
54
55# $output is the last argument if it looks like a file (it has an extension)
56# $flavour is the first argument if it doesn't look like a file
# Command-line handling and ABI selection.
# At file scope pop/shift operate on @ARGV: the last argument becomes
# $output when it ends in ".<word chars>", and the first argument becomes
# $flavour when it contains no dot. A flavour matching /64/ (tested first)
# selects an 8-byte $SIZE_T and the doubleword mnemonics cmpld/stdu/ld/std;
# one matching /32/ selects a 4-byte $SIZE_T and cmplw/stwu/lwz/stw.
# $LRSAVE is the offset, added to the frame size, at which the prologues
# below store the link register. Any other flavour aborts generation.
57$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
58$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;
59
60if ($flavour =~ /64/) {
61	$SIZE_T	=8;
62	$LRSAVE	=2*$SIZE_T;
63	$UCMP	="cmpld";
64	$STU	="stdu";
65	$POP	="ld";
66	$PUSH	="std";
67} elsif ($flavour =~ /32/) {
68	$SIZE_T	=4;
69	$LRSAVE	=$SIZE_T;
70	$UCMP	="cmplw";
71	$STU	="stwu";
72	$POP	="lwz";
73	$PUSH	="stw";
74} else { die "nonsense $flavour"; }
75
76# Define endianness based on flavour
77# i.e.: linux64le
# $LITTLE_ENDIAN is non-zero (equal to $SIZE_T) only when the flavour ends
# in "le"; it gates the endian-specific heredocs further down.
# ppc-xlate.pl is then looked up in the directory of this script ($0) and,
# failing that, in ../../perlasm/ relative to it. STDOUT is re-opened as a
# pipe into that translator, invoked with the same perl binary ($^X), the
# flavour and the quoted output name.
78$LITTLE_ENDIAN = ($flavour=~/le$/) ? $SIZE_T : 0;
79
80$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
81( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
82( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
83die "can't locate ppc-xlate.pl";
84
85open STDOUT,"| $^X $xlate $flavour \"$output\""
86    or die "can't call $xlate: $!";
87
# Definitions shared by the scalar code paths: the stack frame is 24 slots
# of $SIZE_T bytes, r1 is the stack pointer and r3..r6 are named
# ctx/inp/len/padbit. $mac and $nonce are second names for r4/r5 (used by
# poly1305_emit), and $mask is r0. $code is started with the assembler
# header (.machine "any", .text).
88$FRAME=24*$SIZE_T;
89
90$sp="r1";
91my ($ctx,$inp,$len,$padbit) = map("r$_",(3..6));
92my ($mac,$nonce)=($inp,$len);
93my $mask = "r0";
94
95$code=<<___;
96.machine	"any"
97.text
98___
99							if ($flavour =~ /64/) {
100###############################################################################
101# base 2^64 implementation
102
# base 2^64 flavour: first part of poly1305_init_int.
# Register names: h0,h1,h2,d0,d1,d2 are r7-r12; r0,r1,s1,t0,t1 are r27-r31.
# The emitted code zeroes the three 64-bit hash words at ctx+0/8/16 and the
# is_base2_26 word at ctx+24, branches to Lno_key when the second argument
# (the key pointer) is zero, and otherwise loads 16 key bytes into $d0/$d1:
# two plain ld on little-endian targets, or four byte-reversed lwbrx word
# loads merged pairwise with insrdi on big-endian targets ($h0/$h1 serve as
# offset and scratch registers there).
103my ($h0,$h1,$h2,$d0,$d1,$d2, $r0,$r1,$s1, $t0,$t1) = map("r$_",(7..12,27..31));
104
105$code.=<<___;
106.globl	.poly1305_init_int
107.align	4
108.poly1305_init_int:
109	xor	r0,r0,r0
110	std	r0,0($ctx)		# zero hash value
111	std	r0,8($ctx)
112	std	r0,16($ctx)
113	stw	r0,24($ctx)		# clear is_base2_26
114
115	$UCMP	$inp,r0
116	beq-	Lno_key
117___
118$code.=<<___	if ($LITTLE_ENDIAN);
119	ld	$d0,0($inp)		# load key material
120	ld	$d1,8($inp)
121___
122$code.=<<___	if (!$LITTLE_ENDIAN);
123	li	$h0,4
124	lwbrx	$d0,0,$inp		# load key material
125	li	$d1,8
126	lwbrx	$h0,$h0,$inp
127	li	$h1,12
128	lwbrx	$d1,$d1,$inp
129	lwbrx	$h1,$h1,$inp
130	insrdi	$d0,$h0,32,0
131	insrdi	$d1,$h1,32,0
132___
# Tail of poly1305_init_int followed by the head of poly1305_blocks (64-bit).
# init: builds the masks 0x0ffffffc0fffffff and 0x0ffffffc0ffffffc, ANDs
#       them into the two key halves, stores the result at ctx+32/ctx+40
#       and returns with r3 = 0 (also on the Lno_key path).
# blocks: shifts the byte length right by 4 to get a 16-byte block count and
#       returns via Labort when that is zero; otherwise allocates $FRAME,
#       saves r27-r31 and the link register, loads key and hash from the
#       context, computes s1 = r1 + (r1>>2), moves the block count to CTR
#       and sets $mask to 3 before entering Loop.
133$code.=<<___;
134	lis	$h1,0xfff		# 0x0fff0000
135	ori	$h1,$h1,0xfffc		# 0x0ffffffc
136	insrdi	$h1,$h1,32,0		# 0x0ffffffc0ffffffc
137	ori	$h0,$h1,3		# 0x0ffffffc0fffffff
138
139	and	$d0,$d0,$h0
140	and	$d1,$d1,$h1
141
142	std	$d0,32($ctx)		# store key
143	std	$d1,40($ctx)
144
145Lno_key:
146	xor	r3,r3,r3
147	blr
148	.long	0
149	.byte	0,12,0x14,0,0,0,2,0
150.size	.poly1305_init_int,.-.poly1305_init_int
151
152.globl	.poly1305_blocks
153.align	4
154.poly1305_blocks:
155Lpoly1305_blocks:
156	srdi.	$len,$len,4
157	beq-	Labort
158
159	$STU	$sp,-$FRAME($sp)
160	mflr	r0
161	$PUSH	r27,`$FRAME-$SIZE_T*5`($sp)
162	$PUSH	r28,`$FRAME-$SIZE_T*4`($sp)
163	$PUSH	r29,`$FRAME-$SIZE_T*3`($sp)
164	$PUSH	r30,`$FRAME-$SIZE_T*2`($sp)
165	$PUSH	r31,`$FRAME-$SIZE_T*1`($sp)
166	$PUSH	r0,`$FRAME+$LRSAVE`($sp)
167
168	ld	$r0,32($ctx)		# load key
169	ld	$r1,40($ctx)
170
171	ld	$h0,0($ctx)		# load hash value
172	ld	$h1,8($ctx)
173	ld	$h2,16($ctx)
174
175	srdi	$s1,$r1,2
176	mtctr	$len
177	add	$s1,$s1,$r1		# s1 = r1 + r1>>2
178	li	$mask,3
179	b	Loop
180
181.align	4
182Loop:
183___
# Loop body, step 1 (64-bit): fetch the next 16 input bytes into $t0/$t1.
# Little-endian targets use two ld; big-endian targets use four lwbrx
# byte-reversed word loads and merge the upper words in with insrdi, with
# $d0/$d1 (and briefly $t1) doubling as the 4/12/8 byte offsets.
184$code.=<<___	if ($LITTLE_ENDIAN);
185	ld	$t0,0($inp)		# load input
186	ld	$t1,8($inp)
187___
188$code.=<<___	if (!$LITTLE_ENDIAN);
189	li	$d0,4
190	lwbrx	$t0,0,$inp		# load input
191	li	$t1,8
192	lwbrx	$d0,$d0,$inp
193	li	$d1,12
194	lwbrx	$t1,$t1,$inp
195	lwbrx	$d1,$d1,$inp
196	insrdi	$t0,$d0,32,0
197	insrdi	$t1,$d1,32,0
198___
# Loop body, step 2, and epilogue of the 64-bit poly1305_blocks.
# Per iteration: $inp advances by 16; the block in $t0/$t1 is added to
# h0/h1 and $padbit (plus carry) to h2; h is multiplied by the key with
# mulld/mulhdu pairs, using s1 for the h1 and h2 products that are added
# into the lower columns; finally d2 is split into its low two bits (new
# h2) and the remainder x, and x + (x>>2) is added back into h0 with the
# carry rippling through h1/h2. bdnz repeats this CTR times. Afterwards the
# hash is written back to ctx+0/8/16, r27-r31 are reloaded and the frame is
# released; Labort is the shared return point.
199$code.=<<___;
200	addi	$inp,$inp,16
201
202	addc	$h0,$h0,$t0		# accumulate input
203	adde	$h1,$h1,$t1
204
205	mulld	$d0,$h0,$r0		# h0*r0
206	mulhdu	$d1,$h0,$r0
207	adde	$h2,$h2,$padbit
208
209	mulld	$t0,$h1,$s1		# h1*5*r1
210	mulhdu	$t1,$h1,$s1
211	addc	$d0,$d0,$t0
212	adde	$d1,$d1,$t1
213
214	mulld	$t0,$h0,$r1		# h0*r1
215	mulhdu	$d2,$h0,$r1
216	addc	$d1,$d1,$t0
217	addze	$d2,$d2
218
219	mulld	$t0,$h1,$r0		# h1*r0
220	mulhdu	$t1,$h1,$r0
221	addc	$d1,$d1,$t0
222	adde	$d2,$d2,$t1
223
224	mulld	$t0,$h2,$s1		# h2*5*r1
225	mulld	$t1,$h2,$r0		# h2*r0
226	addc	$d1,$d1,$t0
227	adde	$d2,$d2,$t1
228
229	andc	$t0,$d2,$mask		# final reduction step
230	and	$h2,$d2,$mask
231	srdi	$t1,$t0,2
232	add	$t0,$t0,$t1
233	addc	$h0,$d0,$t0
234	addze	$h1,$d1
235	addze	$h2,$h2
236
237	bdnz	Loop
238
239	std	$h0,0($ctx)		# store hash value
240	std	$h1,8($ctx)
241	std	$h2,16($ctx)
242
243	$POP	r27,`$FRAME-$SIZE_T*5`($sp)
244	$POP	r28,`$FRAME-$SIZE_T*4`($sp)
245	$POP	r29,`$FRAME-$SIZE_T*3`($sp)
246	$POP	r30,`$FRAME-$SIZE_T*2`($sp)
247	$POP	r31,`$FRAME-$SIZE_T*1`($sp)
248	addi	$sp,$sp,$FRAME
249Labort:
250	blr
251	.long	0
252	.byte	0,12,4,1,0x80,5,4,0
253.size	.poly1305_blocks,.-.poly1305_blocks
254___
255{
# poly1305_emit, base 2^64 flavour. Register names h0-h4,t0 are r7-r12.
# The hash is read twice: as five 32-bit base 2^26 limbs, which are shifted
# and added into three 64-bit words, and directly as three 64-bit words.
# The is_base2_26 word, negated into a mask, picks one of the two results
# with an xor/and/xor sequence (no branch). h+5 is then computed; bit 2 of
# its top word (after the carries) is turned into an all-ones/all-zeros
# mask that selects h+5 or h for the low 128 bits. The nonce is read as
# four lwz words paired up with insrdi and added with carry. The 16 result
# bytes are stored least-significant first with stbu: $ctx is re-pointed at
# mac-1 and walks bytes 0-7, $mac is advanced by 7 and walks bytes 8-15.
256my ($h0,$h1,$h2,$h3,$h4,$t0) = map("r$_",(7..12));
257
258$code.=<<___;
259.globl	.poly1305_emit
260.align	5
261.poly1305_emit:
262	lwz	$h0,0($ctx)	# load hash value base 2^26
263	lwz	$h1,4($ctx)
264	lwz	$h2,8($ctx)
265	lwz	$h3,12($ctx)
266	lwz	$h4,16($ctx)
267	lwz	r0,24($ctx)	# is_base2_26
268
269	sldi	$h1,$h1,26	# base 2^26 -> base 2^64
270	sldi	$t0,$h2,52
271	srdi	$h2,$h2,12
272	sldi	$h3,$h3,14
273	add	$h0,$h0,$h1
274	addc	$h0,$h0,$t0
275	sldi	$t0,$h4,40
276	srdi	$h4,$h4,24
277	adde	$h1,$h2,$h3
278	addc	$h1,$h1,$t0
279	addze	$h2,$h4
280
281	ld	$h3,0($ctx)	# load hash value base 2^64
282	ld	$h4,8($ctx)
283	ld	$t0,16($ctx)
284
285	neg	r0,r0
286	xor	$h0,$h0,$h3	# choose between radixes
287	xor	$h1,$h1,$h4
288	xor	$h2,$h2,$t0
289	and	$h0,$h0,r0
290	and	$h1,$h1,r0
291	and	$h2,$h2,r0
292	xor	$h0,$h0,$h3
293	xor	$h1,$h1,$h4
294	xor	$h2,$h2,$t0
295
296	addic	$h3,$h0,5	# compare to modulus
297	addze	$h4,$h1
298	addze	$t0,$h2
299
300	srdi	$t0,$t0,2	# see if it carried/borrowed
301	neg	$t0,$t0
302
303	andc	$h0,$h0,$t0
304	and	$h3,$h3,$t0
305	andc	$h1,$h1,$t0
306	and	$h4,$h4,$t0
307	or	$h0,$h0,$h3
308	or	$h1,$h1,$h4
309
310	lwz	$t0,4($nonce)
311	lwz	$h2,12($nonce)
312	lwz	$h3,0($nonce)
313	lwz	$h4,8($nonce)
314
315	insrdi	$h3,$t0,32,0
316	insrdi	$h4,$h2,32,0
317
318	addc	$h0,$h0,$h3	# accumulate nonce
319	adde	$h1,$h1,$h4
320
321	addi	$ctx,$mac,-1
322	addi	$mac,$mac,7
323
324	stbu	$h0,1($ctx)	# write [little-endian] result
325	srdi	$h0,$h0,8
326	stbu	$h1,1($mac)
327	srdi	$h1,$h1,8
328
329	stbu	$h0,1($ctx)
330	srdi	$h0,$h0,8
331	stbu	$h1,1($mac)
332	srdi	$h1,$h1,8
333
334	stbu	$h0,1($ctx)
335	srdi	$h0,$h0,8
336	stbu	$h1,1($mac)
337	srdi	$h1,$h1,8
338
339	stbu	$h0,1($ctx)
340	srdi	$h0,$h0,8
341	stbu	$h1,1($mac)
342	srdi	$h1,$h1,8
343
344	stbu	$h0,1($ctx)
345	srdi	$h0,$h0,8
346	stbu	$h1,1($mac)
347	srdi	$h1,$h1,8
348
349	stbu	$h0,1($ctx)
350	srdi	$h0,$h0,8
351	stbu	$h1,1($mac)
352	srdi	$h1,$h1,8
353
354	stbu	$h0,1($ctx)
355	srdi	$h0,$h0,8
356	stbu	$h1,1($mac)
357	srdi	$h1,$h1,8
358
359	stbu	$h0,1($ctx)
360	stbu	$h1,1($mac)
361
362	blr
363	.long	0
364	.byte	0,12,0x14,0,0,0,3,0
365.size	.poly1305_emit,.-.poly1305_emit
366___
367}							} else {
368###############################################################################
369# base 2^32 implementation
370
# base 2^32 flavour: first part of poly1305_init_int.
# Register names: h0-h4 and $r0 are r7-r12; $r1-$r3, s1-s3, t0-t3, D0-D3
# and d0-d3 are r14-r31 in that order (r13 is not used).
# The emitted code zeroes the five 32-bit hash words at ctx+0..16 and the
# is_base2_26 word at ctx+24, then branches to Lno_key when the second
# argument (the key pointer) is zero.
371my ($h0,$h1,$h2,$h3,$h4, $r0,$r1,$r2,$r3, $s1,$s2,$s3,
372    $t0,$t1,$t2,$t3, $D0,$D1,$D2,$D3, $d0,$d1,$d2,$d3
373   ) = map("r$_",(7..12,14..31));
374
375$code.=<<___;
376.globl	.poly1305_init_int
377.align	4
378.poly1305_init_int:
379	xor	r0,r0,r0
380	stw	r0,0($ctx)		# zero hash value
381	stw	r0,4($ctx)
382	stw	r0,8($ctx)
383	stw	r0,12($ctx)
384	stw	r0,16($ctx)
385	stw	r0,24($ctx)		# clear is_base2_26
386
387	$UCMP	$inp,r0
388	beq-	Lno_key
389___
# Little-endian targets: read the 16 key bytes as four native words into
# $h0-$h3 (the big-endian variant that follows obtains the same values with
# byte-reversed lwbrx loads).
# Fix: the mnemonic used here was "lw", which is not a PowerPC instruction
# and would not assemble; the zero-extending word load is "lwz", as already
# used by the little-endian input loads in poly1305_blocks.
390$code.=<<___	if ($LITTLE_ENDIAN);
391	lwz	$h0,0($inp)		# load key material
392	lwz	$h1,4($inp)
393	lwz	$h2,8($inp)
394	lwz	$h3,12($inp)
395___
# Big-endian targets: read the 16 key bytes as four byte-reversed words
# with lwbrx. $h1-$h3 are first loaded with the byte offsets 4/8/12 and are
# then overwritten by the word fetched from that offset.
396$code.=<<___	if (!$LITTLE_ENDIAN);
397	li	$h1,4
398	lwbrx	$h0,0,$inp		# load key material
399	li	$h2,8
400	lwbrx	$h1,$h1,$inp
401	li	$h3,12
402	lwbrx	$h2,$h2,$inp
403	lwbrx	$h3,$h3,$inp
404___
# Tail of poly1305_init_int followed by the head of poly1305_blocks (32-bit).
# init: clears the top four bits of key word 0 (mask 0xf0000000) and ANDs
#       words 1-3 with 0x0ffffffc, stores the four words at ctx+32..44 and
#       returns with r3 = 0 (also on the Lno_key path).
# blocks: shifts the byte length right by 4 to get a 16-byte block count and
#       returns via Labort when that is zero; otherwise allocates $FRAME,
#       saves r14-r31 and the link register, loads key and hash from the
#       context, computes si = ri + (ri>>2) for i = 1..3, moves the block
#       count to CTR and sets $mask to 3 before entering Loop.
405$code.=<<___;
406	lis	$mask,0xf000		# 0xf0000000
407	li	$r0,-4
408	andc	$r0,$r0,$mask		# 0x0ffffffc
409
410	andc	$h0,$h0,$mask
411	and	$h1,$h1,$r0
412	and	$h2,$h2,$r0
413	and	$h3,$h3,$r0
414
415	stw	$h0,32($ctx)		# store key
416	stw	$h1,36($ctx)
417	stw	$h2,40($ctx)
418	stw	$h3,44($ctx)
419
420Lno_key:
421	xor	r3,r3,r3
422	blr
423	.long	0
424	.byte	0,12,0x14,0,0,0,2,0
425.size	.poly1305_init_int,.-.poly1305_init_int
426
427.globl	.poly1305_blocks
428.align	4
429.poly1305_blocks:
430Lpoly1305_blocks:
431	srwi.	$len,$len,4
432	beq-	Labort
433
434	$STU	$sp,-$FRAME($sp)
435	mflr	r0
436	$PUSH	r14,`$FRAME-$SIZE_T*18`($sp)
437	$PUSH	r15,`$FRAME-$SIZE_T*17`($sp)
438	$PUSH	r16,`$FRAME-$SIZE_T*16`($sp)
439	$PUSH	r17,`$FRAME-$SIZE_T*15`($sp)
440	$PUSH	r18,`$FRAME-$SIZE_T*14`($sp)
441	$PUSH	r19,`$FRAME-$SIZE_T*13`($sp)
442	$PUSH	r20,`$FRAME-$SIZE_T*12`($sp)
443	$PUSH	r21,`$FRAME-$SIZE_T*11`($sp)
444	$PUSH	r22,`$FRAME-$SIZE_T*10`($sp)
445	$PUSH	r23,`$FRAME-$SIZE_T*9`($sp)
446	$PUSH	r24,`$FRAME-$SIZE_T*8`($sp)
447	$PUSH	r25,`$FRAME-$SIZE_T*7`($sp)
448	$PUSH	r26,`$FRAME-$SIZE_T*6`($sp)
449	$PUSH	r27,`$FRAME-$SIZE_T*5`($sp)
450	$PUSH	r28,`$FRAME-$SIZE_T*4`($sp)
451	$PUSH	r29,`$FRAME-$SIZE_T*3`($sp)
452	$PUSH	r30,`$FRAME-$SIZE_T*2`($sp)
453	$PUSH	r31,`$FRAME-$SIZE_T*1`($sp)
454	$PUSH	r0,`$FRAME+$LRSAVE`($sp)
455
456	lwz	$r0,32($ctx)		# load key
457	lwz	$r1,36($ctx)
458	lwz	$r2,40($ctx)
459	lwz	$r3,44($ctx)
460
461	lwz	$h0,0($ctx)		# load hash value
462	lwz	$h1,4($ctx)
463	lwz	$h2,8($ctx)
464	lwz	$h3,12($ctx)
465	lwz	$h4,16($ctx)
466
467	srwi	$s1,$r1,2
468	srwi	$s2,$r2,2
469	srwi	$s3,$r3,2
470	add	$s1,$s1,$r1		# si = ri + ri>>2
471	add	$s2,$s2,$r2
472	add	$s3,$s3,$r3
473	mtctr	$len
474	li	$mask,3
475	b	Loop
476
477.align	4
478Loop:
479___
# Loop body, step 1 (32-bit): fetch the next 16 input bytes into $d0-$d3.
# Little-endian targets use four lwz; big-endian targets use four lwbrx
# byte-reversed loads, with $d1-$d3 first holding the offsets 4/8/12.
480$code.=<<___	if ($LITTLE_ENDIAN);
481	lwz	$d0,0($inp)		# load input
482	lwz	$d1,4($inp)
483	lwz	$d2,8($inp)
484	lwz	$d3,12($inp)
485___
486$code.=<<___	if (!$LITTLE_ENDIAN);
487	li	$d1,4
488	lwbrx	$d0,0,$inp		# load input
489	li	$d2,8
490	lwbrx	$d1,$d1,$inp
491	li	$d3,12
492	lwbrx	$d2,$d2,$inp
493	lwbrx	$d3,$d3,$inp
494___
# Loop body, step 2, and epilogue of the 32-bit poly1305_blocks.
# Per iteration: $inp advances by 16 and the block in $d0-$d3 is added to
# h0-h3 with $padbit going into h4; the adde steps for h3/h4 are issued a
# few instructions later, in between the first mullw/mulhwu pairs. Each
# product is formed as a low word (mullw) and a high word (mulhwu) and
# accumulated into the column pairs d0:D0 .. d3:D3, using s1-s3 instead of
# r1-r3 for the products that are added into lower columns. The columns are
# then chained into h1-h4 (h4 also receives h4*r0), and h4 is split into
# its low two bits and the remainder x, with x + (x>>2) added back into h0
# and the carry rippling upward. bdnz repeats this CTR times. Afterwards
# the five hash words are written back, r14-r31 are reloaded and the frame
# is released; Labort is the shared return point.
495$code.=<<___;
496	addi	$inp,$inp,16
497
498	addc	$h0,$h0,$d0		# accumulate input
499	adde	$h1,$h1,$d1
500	adde	$h2,$h2,$d2
501
502	mullw	$d0,$h0,$r0		# h0*r0
503	mulhwu	$D0,$h0,$r0
504
505	mullw	$d1,$h0,$r1		# h0*r1
506	mulhwu	$D1,$h0,$r1
507
508	mullw	$d2,$h0,$r2		# h0*r2
509	mulhwu	$D2,$h0,$r2
510
511	 adde	$h3,$h3,$d3
512	 adde	$h4,$h4,$padbit
513
514	mullw	$d3,$h0,$r3		# h0*r3
515	mulhwu	$D3,$h0,$r3
516
517	mullw	$t0,$h1,$s3		# h1*s3
518	mulhwu	$t1,$h1,$s3
519
520	mullw	$t2,$h1,$r0		# h1*r0
521	mulhwu	$t3,$h1,$r0
522	 addc	$d0,$d0,$t0
523	 adde	$D0,$D0,$t1
524
525	mullw	$t0,$h1,$r1		# h1*r1
526	mulhwu	$t1,$h1,$r1
527	 addc	$d1,$d1,$t2
528	 adde	$D1,$D1,$t3
529
530	mullw	$t2,$h1,$r2		# h1*r2
531	mulhwu	$t3,$h1,$r2
532	 addc	$d2,$d2,$t0
533	 adde	$D2,$D2,$t1
534
535	mullw	$t0,$h2,$s2		# h2*s2
536	mulhwu	$t1,$h2,$s2
537	 addc	$d3,$d3,$t2
538	 adde	$D3,$D3,$t3
539
540	mullw	$t2,$h2,$s3		# h2*s3
541	mulhwu	$t3,$h2,$s3
542	 addc	$d0,$d0,$t0
543	 adde	$D0,$D0,$t1
544
545	mullw	$t0,$h2,$r0		# h2*r0
546	mulhwu	$t1,$h2,$r0
547	 addc	$d1,$d1,$t2
548	 adde	$D1,$D1,$t3
549
550	mullw	$t2,$h2,$r1		# h2*r1
551	mulhwu	$t3,$h2,$r1
552	 addc	$d2,$d2,$t0
553	 adde	$D2,$D2,$t1
554
555	mullw	$t0,$h3,$s1		# h3*s1
556	mulhwu	$t1,$h3,$s1
557	 addc	$d3,$d3,$t2
558	 adde	$D3,$D3,$t3
559
560	mullw	$t2,$h3,$s2		# h3*s2
561	mulhwu	$t3,$h3,$s2
562	 addc	$d0,$d0,$t0
563	 adde	$D0,$D0,$t1
564
565	mullw	$t0,$h3,$s3		# h3*s3
566	mulhwu	$t1,$h3,$s3
567	 addc	$d1,$d1,$t2
568	 adde	$D1,$D1,$t3
569
570	mullw	$t2,$h3,$r0		# h3*r0
571	mulhwu	$t3,$h3,$r0
572	 addc	$d2,$d2,$t0
573	 adde	$D2,$D2,$t1
574
575	mullw	$t0,$h4,$s1		# h4*s1
576	 addc	$d3,$d3,$t2
577	 adde	$D3,$D3,$t3
578	addc	$d1,$d1,$t0
579
580	mullw	$t1,$h4,$s2		# h4*s2
581	 addze	$D1,$D1
582	addc	$d2,$d2,$t1
583	addze	$D2,$D2
584
585	mullw	$t2,$h4,$s3		# h4*s3
586	addc	$d3,$d3,$t2
587	addze	$D3,$D3
588
589	mullw	$h4,$h4,$r0		# h4*r0
590
591	addc	$h1,$d1,$D0
592	adde	$h2,$d2,$D1
593	adde	$h3,$d3,$D2
594	adde	$h4,$h4,$D3
595
596	andc	$D0,$h4,$mask		# final reduction step
597	and	$h4,$h4,$mask
598	srwi	$D1,$D0,2
599	add	$D0,$D0,$D1
600	addc	$h0,$d0,$D0
601	addze	$h1,$h1
602	addze	$h2,$h2
603	addze	$h3,$h3
604	addze	$h4,$h4
605
606	bdnz	Loop
607
608	stw	$h0,0($ctx)		# store hash value
609	stw	$h1,4($ctx)
610	stw	$h2,8($ctx)
611	stw	$h3,12($ctx)
612	stw	$h4,16($ctx)
613
614	$POP	r14,`$FRAME-$SIZE_T*18`($sp)
615	$POP	r15,`$FRAME-$SIZE_T*17`($sp)
616	$POP	r16,`$FRAME-$SIZE_T*16`($sp)
617	$POP	r17,`$FRAME-$SIZE_T*15`($sp)
618	$POP	r18,`$FRAME-$SIZE_T*14`($sp)
619	$POP	r19,`$FRAME-$SIZE_T*13`($sp)
620	$POP	r20,`$FRAME-$SIZE_T*12`($sp)
621	$POP	r21,`$FRAME-$SIZE_T*11`($sp)
622	$POP	r22,`$FRAME-$SIZE_T*10`($sp)
623	$POP	r23,`$FRAME-$SIZE_T*9`($sp)
624	$POP	r24,`$FRAME-$SIZE_T*8`($sp)
625	$POP	r25,`$FRAME-$SIZE_T*7`($sp)
626	$POP	r26,`$FRAME-$SIZE_T*6`($sp)
627	$POP	r27,`$FRAME-$SIZE_T*5`($sp)
628	$POP	r28,`$FRAME-$SIZE_T*4`($sp)
629	$POP	r29,`$FRAME-$SIZE_T*3`($sp)
630	$POP	r30,`$FRAME-$SIZE_T*2`($sp)
631	$POP	r31,`$FRAME-$SIZE_T*1`($sp)
632	addi	$sp,$sp,$FRAME
633Labort:
634	blr
635	.long	0
636	.byte	0,12,4,1,0x80,18,4,0
637.size	.poly1305_blocks,.-.poly1305_blocks
638___
639{
# poly1305_emit, base 2^32 flavour. Register names h0-h4,t0,t1 are r6-r12.
# Loads is_base2_26 and the five hash words. When the flag is zero the code
# branches straight to Lemit_base2_32; otherwise the words are treated as
# base 2^26 limbs and shifted/added into five 32-bit words first.
# h+5 is then evaluated in r0 purely for its carry chain: the final word is
# shifted right by 2, negated and masked with 5, and that value is added
# to h0 with the carry rippling through h1-h3. The four nonce words are
# added with carry (loads interleaved with the previous additions). The 16
# result bytes are stored least-significant first with stbu: $ctx is
# re-pointed at mac-1 and receives h0 then h1 (bytes 0-7), $mac is advanced
# by 7 and receives h2 then h3 (bytes 8-15).
640my ($h0,$h1,$h2,$h3,$h4,$t0,$t1) = map("r$_",(6..12));
641
642$code.=<<___;
643.globl	.poly1305_emit
644.align	5
645.poly1305_emit:
646	lwz	r0,24($ctx)	# is_base2_26
647	lwz	$h0,0($ctx)	# load hash value
648	lwz	$h1,4($ctx)
649	lwz	$h2,8($ctx)
650	lwz	$h3,12($ctx)
651	lwz	$h4,16($ctx)
652	cmplwi	r0,0
653	beq	Lemit_base2_32
654
655	slwi	$t0,$h1,26	# base 2^26 -> base 2^32
656	srwi	$h1,$h1,6
657	slwi	$t1,$h2,20
658	srwi	$h2,$h2,12
659	addc	$h0,$h0,$t0
660	slwi	$t0,$h3,14
661	srwi	$h3,$h3,18
662	adde	$h1,$h1,$t1
663	slwi	$t1,$h4,8
664	srwi	$h4,$h4,24
665	adde	$h2,$h2,$t0
666	adde	$h3,$h3,$t1
667	addze	$h4,$h4
668
669Lemit_base2_32:
670	addic	r0,$h0,5	# compare to modulus
671	addze	r0,$h1
672	addze	r0,$h2
673	addze	r0,$h3
674	addze	r0,$h4
675
676	srwi	r0,r0,2		# see if it carried/borrowed
677	neg	r0,r0
678	andi.	r0,r0,5
679
680	addc	$h0,$h0,r0
681	lwz	r0,0($nonce)
682	addze	$h1,$h1
683	lwz	$t0,4($nonce)
684	addze	$h2,$h2
685	lwz	$t1,8($nonce)
686	addze	$h3,$h3
687	lwz	$h4,12($nonce)
688
689	addc	$h0,$h0,r0	# accumulate nonce
690	adde	$h1,$h1,$t0
691	adde	$h2,$h2,$t1
692	adde	$h3,$h3,$h4
693
694	addi	$ctx,$mac,-1
695	addi	$mac,$mac,7
696
697	stbu	$h0,1($ctx)	# write [little-endian] result
698	srwi	$h0,$h0,8
699	stbu	$h2,1($mac)
700	srwi	$h2,$h2,8
701
702	stbu	$h0,1($ctx)
703	srwi	$h0,$h0,8
704	stbu	$h2,1($mac)
705	srwi	$h2,$h2,8
706
707	stbu	$h0,1($ctx)
708	srwi	$h0,$h0,8
709	stbu	$h2,1($mac)
710	srwi	$h2,$h2,8
711
712	stbu	$h0,1($ctx)
713	stbu	$h2,1($mac)
714
715	stbu	$h1,1($ctx)
716	srwi	$h1,$h1,8
717	stbu	$h3,1($mac)
718	srwi	$h3,$h3,8
719
720	stbu	$h1,1($ctx)
721	srwi	$h1,$h1,8
722	stbu	$h3,1($mac)
723	srwi	$h3,$h3,8
724
725	stbu	$h1,1($ctx)
726	srwi	$h1,$h1,8
727	stbu	$h3,1($mac)
728	srwi	$h3,$h3,8
729
730	stbu	$h1,1($ctx)
731	stbu	$h3,1($mac)
732
733	blr
734	.long	0
735	.byte	0,12,0x14,0,0,0,3,0
736.size	.poly1305_emit,.-.poly1305_emit
737___
738}							}
739{{{
740########################################################################
741# PowerISA 2.07/VSX section                                            #
742########################################################################
743
# Definitions shared by both poly1305_blocks_vsx variants.
# $VSXFRAME adds up to 12*$SIZE_T (the first 6*$SIZE_T being $LOCALS) plus
# 128 bytes of local variables plus 12*16 bytes for saving v20-v31.
# $BIG_ENDIAN is 4 unless the flavour contains "le"; note that this test is
# unanchored, whereas $LITTLE_ENDIAN earlier in the file uses /le$/.
# Vector names: H0-H4 = v0-v4, I0-I4 = v5-v9, R0,R1,S1,R2,S2 = v10-v14 with
# R3,S3,R4,S4 aliasing R1,S1,R2,S2, ACC0-ACC4 = v15-v19, T0-T4 = v20-v24,
# and _26,_4,_40,_14,mask26,padbits,I2perm = v25-v31.
# Index names: $x00 is the literal 0; $x60,$x70 are r7,r8 and $x10-$x50 are
# r27-r31. $ctx_,$_ctx,$const are r10-r12.
744my $LOCALS= 6*$SIZE_T;
745my $VSXFRAME = $LOCALS + 6*$SIZE_T;
746   $VSXFRAME += 128;	# local variables
747   $VSXFRAME += 12*16;	# v20-v31 offload
748
749my $BIG_ENDIAN = ($flavour !~ /le/) ? 4 : 0;
750
751########################################################################
752# Layout of opaque area is following:
753#
754#	unsigned __int32 h[5];		# current hash value base 2^26
755#	unsigned __int32 pad;
756#	unsigned __int32 is_base2_26, pad;
757#	unsigned __int64 r[2];		# key value base 2^64
758#	struct { unsigned __int32 r^2, r^4, r^1, r^3; } r[9];
759#
760# where r^n are base 2^26 digits of powers of multiplier key. There are
761# 5 digits, but last four are interleaved with multiples of 5, totalling
762# in 9 elements: r0, r1, 5*r1, r2, 5*r2, r3, 5*r3, r4, 5*r4. Order of
763# powers is as they appear in register, not memory.
764
765my ($H0, $H1, $H2, $H3, $H4) = map("v$_",(0..4));
766my ($I0, $I1, $I2, $I3, $I4) = map("v$_",(5..9));
767my ($R0, $R1, $S1, $R2, $S2) = map("v$_",(10..14));
768my      ($R3, $S3, $R4, $S4) = ($R1, $S1, $R2, $S2);
769my ($ACC0, $ACC1, $ACC2, $ACC3, $ACC4) = map("v$_",(15..19));
770my ($T0, $T1, $T2, $T3, $T4) = map("v$_",(20..24));
771my ($_26,$_4,$_40,$_14,$mask26,$padbits,$I2perm) = map("v$_",(25..31));
772my ($x00,$x60,$x70,$x10,$x20,$x30,$x40,$x50) = (0, map("r$_",(7,8,27..31)));
773my ($ctx_,$_ctx,$const) = map("r$_",(10..12));
774
775							if ($flavour =~ /64/) {
776###############################################################################
777# setup phase of poly1305_blocks_vsx is different on 32- and 64-bit platforms,
778# but the base 2^26 computational part is same...
779
# 64-bit setup for poly1305_blocks_vsx; a single heredoc emitting four
# routines. Scalar names in this scope: h0,h1,h2,d0,d1,d2 are r6-r11 and
# r0,r1,s1,t0,t1 are r27-r31; $mask is r0.
#   .poly1305_blocks_vsx  - loads is_base2_26 into r7; for len >= 128 it
#       branches to __poly1305_blocks_vsx. Otherwise it recombines the
#       base 2^26 limbs into three 64-bit words, selects between that and
#       the stored base 2^64 words using the negated flag as a mask, stores
#       the result, clears the flag and branches to Lpoly1305_blocks.
#   __poly1305_mul        - multiplies h0:h1:h2 by the key in $r0/$r1/$s1
#       and reduces, using the same instruction sequence as the scalar
#       poly1305_blocks loop minus the input accumulation.
#   __poly1305_splat      - cuts h0:h1:h2 into five 26-bit pieces and stores
#       nine words (piece 0, then each further piece followed by 5x that
#       piece) at $t1+0x00, +0x10, ... +0x80.
#   __poly1305_blocks_vsx - allocates $VSXFRAME, saves v20-v31, vrsave,
#       r27-r31 and the link register, sets vrsave to -1, calls LPICmeup
#       (defined outside this view; presumably it leaves the constant table
#       address in $const -- confirm) and loads five constant vectors.
#       If r7 (is_base2_26) is non-zero it branches to Lskip_init_vsx.
#       Otherwise it loads the base 2^64 key, and splats r^1, r^2, r^3, r^4
#       (obtained by repeated __poly1305_mul) at ctx+48 plus word offsets
#       12, 4, 8, 0 respectively, each offset XORed with $BIG_ENDIAN; it
#       then loads the hash and moves its five 26-bit pieces into $H0-$H4.
780my ($h0,$h1,$h2,$d0,$d1,$d2, $r0,$r1,$s1, $t0,$t1) = map("r$_",(6..11,27..31));
781my $mask = "r0";
782
783$code.=<<___;
784.globl	.poly1305_blocks_vsx
785.align	5
786.poly1305_blocks_vsx:
787	lwz	r7,24($ctx)		# is_base2_26
788	cmpldi	$len,128
789	bge	__poly1305_blocks_vsx
790
791	neg	r0,r7			# is_base2_26 as mask
792	lwz	r7,0($ctx)		# load hash base 2^26
793	lwz	r8,4($ctx)
794	lwz	r9,8($ctx)
795	lwz	r10,12($ctx)
796	lwz	r11,16($ctx)
797
798	sldi	r8,r8,26		# base 2^26 -> base 2^64
799	sldi	r12,r9,52
800	add	r7,r7,r8
801	srdi	r9,r9,12
802	sldi	r10,r10,14
803	addc	r7,r7,r12
804	sldi	r8,r11,40
805	adde	r9,r9,r10
806	srdi	r11,r11,24
807	addc	r9,r9,r8
808	addze	r11,r11
809
810	ld	r8,0($ctx)		# load hash base 2^64
811	ld	r10,8($ctx)
812	ld	r12,16($ctx)
813
814	xor	r7,r7,r8		# select between radixes
815	xor	r9,r9,r10
816	xor	r11,r11,r12
817	and	r7,r7,r0
818	and	r9,r9,r0
819	and	r11,r11,r0
820	xor	r7,r7,r8
821	xor	r9,r9,r10
822	xor	r11,r11,r12
823
824	li	r0,0
825	std	r7,0($ctx)		# store hash base 2^64
826	std	r9,8($ctx)
827	std	r11,16($ctx)
828	stw	r0,24($ctx)		# clear is_base2_26
829
830	b	Lpoly1305_blocks
831	.long	0
832	.byte	0,12,0x14,0,0,0,4,0
833.size	.poly1305_blocks_vsx,.-.poly1305_blocks_vsx
834
835.align	5
836__poly1305_mul:
837	mulld	$d0,$h0,$r0		# h0*r0
838	mulhdu	$d1,$h0,$r0
839
840	mulld	$t0,$h1,$s1		# h1*5*r1
841	mulhdu	$t1,$h1,$s1
842	addc	$d0,$d0,$t0
843	adde	$d1,$d1,$t1
844
845	mulld	$t0,$h0,$r1		# h0*r1
846	mulhdu	$d2,$h0,$r1
847	addc	$d1,$d1,$t0
848	addze	$d2,$d2
849
850	mulld	$t0,$h1,$r0		# h1*r0
851	mulhdu	$t1,$h1,$r0
852	addc	$d1,$d1,$t0
853	adde	$d2,$d2,$t1
854
855	mulld	$t0,$h2,$s1		# h2*5*r1
856	mulld	$t1,$h2,$r0		# h2*r0
857	addc	$d1,$d1,$t0
858	adde	$d2,$d2,$t1
859
860	andc	$t0,$d2,$mask		# final reduction step
861	and	$h2,$d2,$mask
862	srdi	$t1,$t0,2
863	add	$t0,$t0,$t1
864	addc	$h0,$d0,$t0
865	addze	$h1,$d1
866	addze	$h2,$h2
867
868	blr
869	.long	0
870	.byte	0,12,0x14,0,0,0,0,0
871.size	__poly1305_mul,.-__poly1305_mul
872
873.align	5
874__poly1305_splat:
875	extrdi	$d0,$h0,26,38
876	extrdi	$d1,$h0,26,12
877	stw	$d0,0x00($t1)
878
879	extrdi	$d2,$h0,12,0
880	slwi	$d0,$d1,2
881	stw	$d1,0x10($t1)
882	add	$d0,$d0,$d1		# * 5
883	stw	$d0,0x20($t1)
884
885	insrdi	$d2,$h1,14,38
886	slwi	$d0,$d2,2
887	stw	$d2,0x30($t1)
888	add	$d0,$d0,$d2		# * 5
889	stw	$d0,0x40($t1)
890
891	extrdi	$d1,$h1,26,24
892	extrdi	$d2,$h1,24,0
893	slwi	$d0,$d1,2
894	stw	$d1,0x50($t1)
895	add	$d0,$d0,$d1		# * 5
896	stw	$d0,0x60($t1)
897
898	insrdi	$d2,$h2,3,37
899	slwi	$d0,$d2,2
900	stw	$d2,0x70($t1)
901	add	$d0,$d0,$d2		# * 5
902	stw	$d0,0x80($t1)
903
904	blr
905	.long	0
906	.byte	0,12,0x14,0,0,0,0,0
907.size	__poly1305_splat,.-__poly1305_splat
908
909.align	5
910__poly1305_blocks_vsx:
911	$STU	$sp,-$VSXFRAME($sp)
912	mflr	r0
913	li	r10,`15+$LOCALS+128`
914	li	r11,`31+$LOCALS+128`
915	mfspr	r12,256
916	stvx	v20,r10,$sp
917	addi	r10,r10,32
918	stvx	v21,r11,$sp
919	addi	r11,r11,32
920	stvx	v22,r10,$sp
921	addi	r10,r10,32
922	stvx	v23,r11,$sp
923	addi	r11,r11,32
924	stvx	v24,r10,$sp
925	addi	r10,r10,32
926	stvx	v25,r11,$sp
927	addi	r11,r11,32
928	stvx	v26,r10,$sp
929	addi	r10,r10,32
930	stvx	v27,r11,$sp
931	addi	r11,r11,32
932	stvx	v28,r10,$sp
933	addi	r10,r10,32
934	stvx	v29,r11,$sp
935	addi	r11,r11,32
936	stvx	v30,r10,$sp
937	stvx	v31,r11,$sp
938	stw	r12,`$VSXFRAME-$SIZE_T*5-4`($sp)# save vrsave
939	li	r12,-1
940	mtspr	256,r12			# preserve all AltiVec registers
941	$PUSH	r27,`$VSXFRAME-$SIZE_T*5`($sp)
942	$PUSH	r28,`$VSXFRAME-$SIZE_T*4`($sp)
943	$PUSH	r29,`$VSXFRAME-$SIZE_T*3`($sp)
944	$PUSH	r30,`$VSXFRAME-$SIZE_T*2`($sp)
945	$PUSH	r31,`$VSXFRAME-$SIZE_T*1`($sp)
946	$PUSH	r0,`$VSXFRAME+$LRSAVE`($sp)
947
948	bl	LPICmeup
949
950	li	$x10,0x10
951	li	$x20,0x20
952	li	$x30,0x30
953	li	$x40,0x40
954	li	$x50,0x50
955	lvx_u	$mask26,$x00,$const
956	lvx_u	$_26,$x10,$const
957	lvx_u	$_40,$x20,$const
958	lvx_u	$I2perm,$x30,$const
959	lvx_u	$padbits,$x40,$const
960
961	cmplwi	r7,0			# is_base2_26?
962	bne	Lskip_init_vsx
963
964	ld	$r0,32($ctx)		# load key base 2^64
965	ld	$r1,40($ctx)
966	srdi	$s1,$r1,2
967	li	$mask,3
968	add	$s1,$s1,$r1		# s1 = r1 + r1>>2
969
970	mr	$h0,$r0			# "calculate" r^1
971	mr	$h1,$r1
972	li	$h2,0
973	addi	$t1,$ctx,`48+(12^$BIG_ENDIAN)`
974	bl	__poly1305_splat
975
976	bl	__poly1305_mul		# calculate r^2
977	addi	$t1,$ctx,`48+(4^$BIG_ENDIAN)`
978	bl	__poly1305_splat
979
980	bl	__poly1305_mul		# calculate r^3
981	addi	$t1,$ctx,`48+(8^$BIG_ENDIAN)`
982	bl	__poly1305_splat
983
984	bl	__poly1305_mul		# calculate r^4
985	addi	$t1,$ctx,`48+(0^$BIG_ENDIAN)`
986	bl	__poly1305_splat
987
988	ld	$h0,0($ctx)		# load hash
989	ld	$h1,8($ctx)
990	ld	$h2,16($ctx)
991
992	extrdi	$d0,$h0,26,38		# base 2^64 -> base 2^26
993	extrdi	$d1,$h0,26,12
994	extrdi	$d2,$h0,12,0
995	mtvrwz	$H0,$d0
996	insrdi	$d2,$h1,14,38
997	mtvrwz	$H1,$d1
998	extrdi	$d1,$h1,26,24
999	mtvrwz	$H2,$d2
1000	extrdi	$d2,$h1,24,0
1001	mtvrwz	$H3,$d1
1002	insrdi	$d2,$h2,3,37
1003	mtvrwz	$H4,$d2
1004___
1005							} else {
1006###############################################################################
1007# 32-bit initialization
1008
# 32-bit setup for poly1305_blocks_vsx; a single heredoc emitting three
# routines. Scalar names in this scope: h0-h4 are r7-r11, t0 is r0 and t1
# is r12. R3,S3,R4,S4 are re-declared here as I1,I2,I3,I4 instead of
# aliasing R1,S1,R2,S2.
#   .poly1305_blocks_vsx  - loads is_base2_26 into r7; for len >= 128 it
#       branches to __poly1305_blocks_vsx, and with the flag clear it
#       branches directly to Lpoly1305_blocks. Otherwise it converts the
#       five base 2^26 limbs to base 2^32 words, stores them, clears the
#       flag and then branches to Lpoly1305_blocks.
#   __poly1305_mul        - vector multiply of $H0-$H4 by $R0-$R4/$S1-$S4
#       built from vmulouw/vaddudm, followed by the "lazy reduction" carry
#       pass. It leaves the splatted constant 2 in $T0, which the caller
#       below reuses as the shift count for its "<<2" steps.
#   __poly1305_blocks_vsx - allocates $VSXFRAME, saves v20-v31, vrsave,
#       r27-r31 and the link register, sets vrsave to -1, calls LPICmeup
#       (defined outside this view; presumably it leaves the constant table
#       address in $const -- confirm) and loads five constant vectors.
#       If r7 (is_base2_26) is non-zero it branches to Lskip_init_vsx.
#       Otherwise it converts the base 2^32 key to base 2^26 with the
#       5x multiples, calls __poly1305_mul twice (r^1*r^1, then
#       r^2:r^2 * r^2:r^1), merges the results into r^2:r^4:r^1:r^3 order,
#       stores the nine R/S vectors from ctx+0x30 upward, and finally
#       converts the stored hash to base 2^26 in $H0-$H4.
1009my ($h0,$h1,$h2,$h3,$h4,$t0,$t1) = map("r$_",(7..11,0,12));
1010my ($R3,$S3,$R4,$S4)=($I1,$I2,$I3,$I4);
1011
1012$code.=<<___;
1013.globl	.poly1305_blocks_vsx
1014.align	5
1015.poly1305_blocks_vsx:
1016	lwz	r7,24($ctx)		# is_base2_26
1017	cmplwi	$len,128
1018	bge	__poly1305_blocks_vsx
1019	cmplwi	r7,0
1020	beq	Lpoly1305_blocks
1021
1022	lwz	$h0,0($ctx)		# load hash
1023	lwz	$h1,4($ctx)
1024	lwz	$h2,8($ctx)
1025	lwz	$h3,12($ctx)
1026	lwz	$h4,16($ctx)
1027
1028	slwi	$t0,$h1,26		# base 2^26 -> base 2^32
1029	srwi	$h1,$h1,6
1030	slwi	$t1,$h2,20
1031	srwi	$h2,$h2,12
1032	addc	$h0,$h0,$t0
1033	slwi	$t0,$h3,14
1034	srwi	$h3,$h3,18
1035	adde	$h1,$h1,$t1
1036	slwi	$t1,$h4,8
1037	srwi	$h4,$h4,24
1038	adde	$h2,$h2,$t0
1039	li	$t0,0
1040	adde	$h3,$h3,$t1
1041	addze	$h4,$h4
1042
1043	stw	$h0,0($ctx)		# store hash base 2^32
1044	stw	$h1,4($ctx)
1045	stw	$h2,8($ctx)
1046	stw	$h3,12($ctx)
1047	stw	$h4,16($ctx)
1048	stw	$t0,24($ctx)		# clear is_base2_26
1049
1050	b	Lpoly1305_blocks
1051	.long	0
1052	.byte	0,12,0x14,0,0,0,4,0
1053.size	.poly1305_blocks_vsx,.-.poly1305_blocks_vsx
1054
1055.align	5
1056__poly1305_mul:
1057	vmulouw		$ACC0,$H0,$R0
1058	vmulouw		$ACC1,$H1,$R0
1059	vmulouw		$ACC2,$H2,$R0
1060	vmulouw		$ACC3,$H3,$R0
1061	vmulouw		$ACC4,$H4,$R0
1062
1063	vmulouw		$T0,$H4,$S1
1064	vaddudm		$ACC0,$ACC0,$T0
1065	vmulouw		$T0,$H0,$R1
1066	vaddudm		$ACC1,$ACC1,$T0
1067	vmulouw		$T0,$H1,$R1
1068	vaddudm		$ACC2,$ACC2,$T0
1069	vmulouw		$T0,$H2,$R1
1070	vaddudm		$ACC3,$ACC3,$T0
1071	vmulouw		$T0,$H3,$R1
1072	vaddudm		$ACC4,$ACC4,$T0
1073
1074	vmulouw		$T0,$H3,$S2
1075	vaddudm		$ACC0,$ACC0,$T0
1076	vmulouw		$T0,$H4,$S2
1077	vaddudm		$ACC1,$ACC1,$T0
1078	vmulouw		$T0,$H0,$R2
1079	vaddudm		$ACC2,$ACC2,$T0
1080	vmulouw		$T0,$H1,$R2
1081	vaddudm		$ACC3,$ACC3,$T0
1082	vmulouw		$T0,$H2,$R2
1083	vaddudm		$ACC4,$ACC4,$T0
1084
1085	vmulouw		$T0,$H2,$S3
1086	vaddudm		$ACC0,$ACC0,$T0
1087	vmulouw		$T0,$H3,$S3
1088	vaddudm		$ACC1,$ACC1,$T0
1089	vmulouw		$T0,$H4,$S3
1090	vaddudm		$ACC2,$ACC2,$T0
1091	vmulouw		$T0,$H0,$R3
1092	vaddudm		$ACC3,$ACC3,$T0
1093	vmulouw		$T0,$H1,$R3
1094	vaddudm		$ACC4,$ACC4,$T0
1095
1096	vmulouw		$T0,$H1,$S4
1097	vaddudm		$ACC0,$ACC0,$T0
1098	vmulouw		$T0,$H2,$S4
1099	vaddudm		$ACC1,$ACC1,$T0
1100	vmulouw		$T0,$H3,$S4
1101	vaddudm		$ACC2,$ACC2,$T0
1102	vmulouw		$T0,$H4,$S4
1103	vaddudm		$ACC3,$ACC3,$T0
1104	vmulouw		$T0,$H0,$R4
1105	vaddudm		$ACC4,$ACC4,$T0
1106
1107	################################################################
1108	# lazy reduction
1109
1110	vspltisb	$T0,2
1111	vsrd		$H4,$ACC3,$_26
1112	vsrd		$H1,$ACC0,$_26
1113	vand		$H3,$ACC3,$mask26
1114	vand		$H0,$ACC0,$mask26
1115	vaddudm		$H4,$H4,$ACC4		# h3 -> h4
1116	vaddudm		$H1,$H1,$ACC1		# h0 -> h1
1117
1118	vsrd		$ACC4,$H4,$_26
1119	vsrd		$ACC1,$H1,$_26
1120	vand		$H4,$H4,$mask26
1121	vand		$H1,$H1,$mask26
1122	vaddudm		$H0,$H0,$ACC4
1123	vaddudm		$H2,$ACC2,$ACC1		# h1 -> h2
1124
1125	vsld		$ACC4,$ACC4,$T0		# <<2
1126	vsrd		$ACC2,$H2,$_26
1127	vand		$H2,$H2,$mask26
1128	vaddudm		$H0,$H0,$ACC4		# h4 -> h0
1129	vaddudm		$H3,$H3,$ACC2		# h2 -> h3
1130
1131	vsrd		$ACC0,$H0,$_26
1132	vsrd		$ACC3,$H3,$_26
1133	vand		$H0,$H0,$mask26
1134	vand		$H3,$H3,$mask26
1135	vaddudm		$H1,$H1,$ACC0		# h0 -> h1
1136	vaddudm		$H4,$H4,$ACC3		# h3 -> h4
1137
1138	blr
1139	.long	0
1140	.byte	0,12,0x14,0,0,0,0,0
1141.size	__poly1305_mul,.-__poly1305_mul
1142
1143.align	5
1144__poly1305_blocks_vsx:
1145	$STU	$sp,-$VSXFRAME($sp)
1146	mflr	r0
1147	li	r10,`15+$LOCALS+128`
1148	li	r11,`31+$LOCALS+128`
1149	mfspr	r12,256
1150	stvx	v20,r10,$sp
1151	addi	r10,r10,32
1152	stvx	v21,r11,$sp
1153	addi	r11,r11,32
1154	stvx	v22,r10,$sp
1155	addi	r10,r10,32
1156	stvx	v23,r11,$sp
1157	addi	r11,r11,32
1158	stvx	v24,r10,$sp
1159	addi	r10,r10,32
1160	stvx	v25,r11,$sp
1161	addi	r11,r11,32
1162	stvx	v26,r10,$sp
1163	addi	r10,r10,32
1164	stvx	v27,r11,$sp
1165	addi	r11,r11,32
1166	stvx	v28,r10,$sp
1167	addi	r10,r10,32
1168	stvx	v29,r11,$sp
1169	addi	r11,r11,32
1170	stvx	v30,r10,$sp
1171	stvx	v31,r11,$sp
1172	stw	r12,`$VSXFRAME-$SIZE_T*5-4`($sp)# save vrsave
1173	li	r12,-1
1174	mtspr	256,r12			# preserve all AltiVec registers
1175	$PUSH	r27,`$VSXFRAME-$SIZE_T*5`($sp)
1176	$PUSH	r28,`$VSXFRAME-$SIZE_T*4`($sp)
1177	$PUSH	r29,`$VSXFRAME-$SIZE_T*3`($sp)
1178	$PUSH	r30,`$VSXFRAME-$SIZE_T*2`($sp)
1179	$PUSH	r31,`$VSXFRAME-$SIZE_T*1`($sp)
1180	$PUSH	r0,`$VSXFRAME+$LRSAVE`($sp)
1181
1182	bl	LPICmeup
1183
1184	li	$x10,0x10
1185	li	$x20,0x20
1186	li	$x30,0x30
1187	li	$x40,0x40
1188	li	$x50,0x50
1189	lvx_u	$mask26,$x00,$const
1190	lvx_u	$_26,$x10,$const
1191	lvx_u	$_40,$x20,$const
1192	lvx_u	$I2perm,$x30,$const
1193	lvx_u	$padbits,$x40,$const
1194
1195	cmplwi	r7,0			# is_base2_26?
1196	bne	Lskip_init_vsx
1197
1198	lwz	$h1,32($ctx)		# load key base 2^32
1199	lwz	$h2,36($ctx)
1200	lwz	$h3,40($ctx)
1201	lwz	$h4,44($ctx)
1202
1203	extrwi	$h0,$h1,26,6		# base 2^32 -> base 2^26
1204	extrwi	$h1,$h1,6,0
1205	insrwi	$h1,$h2,20,6
1206	extrwi	$h2,$h2,12,0
1207	insrwi	$h2,$h3,14,6
1208	extrwi	$h3,$h3,18,0
1209	insrwi	$h3,$h4,8,6
1210	extrwi	$h4,$h4,24,0
1211
1212	mtvrwz	$R0,$h0
1213	slwi	$h0,$h1,2
1214	mtvrwz	$R1,$h1
1215	add	$h1,$h1,$h0
1216	mtvrwz	$S1,$h1
1217	slwi	$h1,$h2,2
1218	mtvrwz	$R2,$h2
1219	add	$h2,$h2,$h1
1220	mtvrwz	$S2,$h2
1221	slwi	$h2,$h3,2
1222	mtvrwz	$R3,$h3
1223	add	$h3,$h3,$h2
1224	mtvrwz	$S3,$h3
1225	slwi	$h3,$h4,2
1226	mtvrwz	$R4,$h4
1227	add	$h4,$h4,$h3
1228	mtvrwz	$S4,$h4
1229
1230	vmr	$H0,$R0
1231	vmr	$H1,$R1
1232	vmr	$H2,$R2
1233	vmr	$H3,$R3
1234	vmr	$H4,$R4
1235
1236	bl	__poly1305_mul		# r^1:- * r^1:-
1237
1238	vpermdi	$R0,$H0,$R0,0b00
1239	vpermdi	$R1,$H1,$R1,0b00
1240	vpermdi	$R2,$H2,$R2,0b00
1241	vpermdi	$R3,$H3,$R3,0b00
1242	vpermdi	$R4,$H4,$R4,0b00
1243	vpermdi	$H0,$H0,$H0,0b00
1244	vpermdi	$H1,$H1,$H1,0b00
1245	vpermdi	$H2,$H2,$H2,0b00
1246	vpermdi	$H3,$H3,$H3,0b00
1247	vpermdi	$H4,$H4,$H4,0b00
1248	vsld	$S1,$R1,$T0		# <<2
1249	vsld	$S2,$R2,$T0
1250	vsld	$S3,$R3,$T0
1251	vsld	$S4,$R4,$T0
1252	vaddudm	$S1,$S1,$R1
1253	vaddudm	$S2,$S2,$R2
1254	vaddudm	$S3,$S3,$R3
1255	vaddudm	$S4,$S4,$R4
1256
1257	bl	__poly1305_mul		# r^2:r^2 * r^2:r^1
1258
1259	addi	$h0,$ctx,0x60
1260	lwz	$h1,0($ctx)		# load hash
1261	lwz	$h2,4($ctx)
1262	lwz	$h3,8($ctx)
1263	lwz	$h4,12($ctx)
1264	lwz	$t0,16($ctx)
1265
1266	vmrgow	$R0,$R0,$H0		# r^2:r^4:r^1:r^3
1267	vmrgow	$R1,$R1,$H1
1268	vmrgow	$R2,$R2,$H2
1269	vmrgow	$R3,$R3,$H3
1270	vmrgow	$R4,$R4,$H4
1271	vslw	$S1,$R1,$T0		# <<2
1272	vslw	$S2,$R2,$T0
1273	vslw	$S3,$R3,$T0
1274	vslw	$S4,$R4,$T0
1275	vadduwm	$S1,$S1,$R1
1276	vadduwm	$S2,$S2,$R2
1277	vadduwm	$S3,$S3,$R3
1278	vadduwm	$S4,$S4,$R4
1279
1280	stvx_u	$R0,$x30,$ctx
1281	stvx_u	$R1,$x40,$ctx
1282	stvx_u	$S1,$x50,$ctx
1283	stvx_u	$R2,$x00,$h0
1284	stvx_u	$S2,$x10,$h0
1285	stvx_u	$R3,$x20,$h0
1286	stvx_u	$S3,$x30,$h0
1287	stvx_u	$R4,$x40,$h0
1288	stvx_u	$S4,$x50,$h0
1289
1290	extrwi	$h0,$h1,26,6		# base 2^32 -> base 2^26
1291	extrwi	$h1,$h1,6,0
1292	mtvrwz	$H0,$h0
1293	insrwi	$h1,$h2,20,6
1294	extrwi	$h2,$h2,12,0
1295	mtvrwz	$H1,$h1
1296	insrwi	$h2,$h3,14,6
1297	extrwi	$h3,$h3,18,0
1298	mtvrwz	$H2,$h2
1299	insrwi	$h3,$h4,8,6
1300	extrwi	$h4,$h4,24,0
1301	mtvrwz	$H3,$h3
1302	insrwi	$h4,$t0,3,5
1303	mtvrwz	$H4,$h4
1304___
1305							}
1306$code.=<<___;
1307	li	r0,1
1308	stw	r0,24($ctx)		# set is_base2_26
1309	b	Loaded_vsx
1310
1311.align	4
1312Lskip_init_vsx:
1313	li		$x10,4
1314	li		$x20,8
1315	li		$x30,12
1316	li		$x40,16
1317	lvwzx_u		$H0,$x00,$ctx
1318	lvwzx_u		$H1,$x10,$ctx
1319	lvwzx_u		$H2,$x20,$ctx
1320	lvwzx_u		$H3,$x30,$ctx
1321	lvwzx_u		$H4,$x40,$ctx
1322
1323Loaded_vsx:
1324	li		$x10,0x10
1325	li		$x20,0x20
1326	li		$x30,0x30
1327	li		$x40,0x40
1328	li		$x50,0x50
1329	li		$x60,0x60
1330	li		$x70,0x70
1331	addi		$ctx_,$ctx,64		# &ctx->r[1]
1332	addi		$_ctx,$sp,`$LOCALS+15`	# &ctx->r[1], r^2:r^4 shadow
1333
1334	vxor		$T0,$T0,$T0		# ensure second half is zero
1335	vpermdi		$H0,$H0,$T0,0b00
1336	vpermdi		$H1,$H1,$T0,0b00
1337	vpermdi		$H2,$H2,$T0,0b00
1338	vpermdi		$H3,$H3,$T0,0b00
1339	vpermdi		$H4,$H4,$T0,0b00
1340
1341	be?lvx_u	$_4,$x50,$const		# byte swap mask
1342	lvx_u		$T1,$x00,$inp		# load first input block
1343	lvx_u		$T2,$x10,$inp
1344	lvx_u		$T3,$x20,$inp
1345	lvx_u		$T4,$x30,$inp
1346	be?vperm	$T1,$T1,$T1,$_4
1347	be?vperm	$T2,$T2,$T2,$_4
1348	be?vperm	$T3,$T3,$T3,$_4
1349	be?vperm	$T4,$T4,$T4,$_4
1350
1351	vpermdi		$I0,$T1,$T2,0b00	# smash input to base 2^26
1352	vspltisb	$_4,4
1353	vperm		$I2,$T1,$T2,$I2perm	# 0x...0e0f0001...1e1f1011
1354	vspltisb	$_14,14
1355	vpermdi		$I3,$T1,$T2,0b11
1356
1357	vsrd		$I1,$I0,$_26
1358	vsrd		$I2,$I2,$_4
1359	vsrd		$I4,$I3,$_40
1360	vsrd		$I3,$I3,$_14
1361	vand		$I0,$I0,$mask26
1362	vand		$I1,$I1,$mask26
1363	vand		$I2,$I2,$mask26
1364	vand		$I3,$I3,$mask26
1365
1366	vpermdi		$T1,$T3,$T4,0b00
1367	vperm		$T2,$T3,$T4,$I2perm	# 0x...0e0f0001...1e1f1011
1368	vpermdi		$T3,$T3,$T4,0b11
1369
1370	vsrd		$T0,$T1,$_26
1371	vsrd		$T2,$T2,$_4
1372	vsrd		$T4,$T3,$_40
1373	vsrd		$T3,$T3,$_14
1374	vand		$T1,$T1,$mask26
1375	vand		$T0,$T0,$mask26
1376	vand		$T2,$T2,$mask26
1377	vand		$T3,$T3,$mask26
1378
1379	# inp[2]:inp[0]:inp[3]:inp[1]
1380	vmrgow		$I4,$T4,$I4
1381	vmrgow		$I0,$T1,$I0
1382	vmrgow		$I1,$T0,$I1
1383	vmrgow		$I2,$T2,$I2
1384	vmrgow		$I3,$T3,$I3
1385	vor		$I4,$I4,$padbits
1386
1387	lvx_splt	$R0,$x30,$ctx		# taking lvx_vsplt out of loop
1388	lvx_splt	$R1,$x00,$ctx_		# gives ~8% improvement
1389	lvx_splt	$S1,$x10,$ctx_
1390	lvx_splt	$R2,$x20,$ctx_
1391	lvx_splt	$S2,$x30,$ctx_
1392	lvx_splt	$T1,$x40,$ctx_
1393	lvx_splt	$T2,$x50,$ctx_
1394	lvx_splt	$T3,$x60,$ctx_
1395	lvx_splt	$T4,$x70,$ctx_
1396	stvx		$R1,$x00,$_ctx
1397	stvx		$S1,$x10,$_ctx
1398	stvx		$R2,$x20,$_ctx
1399	stvx		$S2,$x30,$_ctx
1400	stvx		$T1,$x40,$_ctx
1401	stvx		$T2,$x50,$_ctx
1402	stvx		$T3,$x60,$_ctx
1403	stvx		$T4,$x70,$_ctx
1404
1405	addi		$inp,$inp,0x40
1406	addi		$const,$const,0x50
1407	addi		r0,$len,-64
1408	srdi		r0,r0,6
1409	mtctr		r0
1410	b		Loop_vsx
1411
1412.align	4
1413Loop_vsx:
1414	################################################################
1415	## ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2
1416	## ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^3+inp[7]*r
1417	##   \___________________/
1418	##
1419	## Note that we start with inp[2:3]*r^2. This is because it
1420	## doesn't depend on reduction in previous iteration.
1421	################################################################
1422	## d4 = h4*r0 + h3*r1   + h2*r2   + h1*r3   + h0*r4
1423	## d3 = h3*r0 + h2*r1   + h1*r2   + h0*r3   + h4*5*r4
1424	## d2 = h2*r0 + h1*r1   + h0*r2   + h4*5*r3 + h3*5*r4
1425	## d1 = h1*r0 + h0*r1   + h4*5*r2 + h3*5*r3 + h2*5*r4
1426	## d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4
1427
1428	vmuleuw		$ACC0,$I0,$R0
1429	vmuleuw		$ACC1,$I0,$R1
1430	vmuleuw		$ACC2,$I0,$R2
1431	vmuleuw		$ACC3,$I1,$R2
1432
1433	vmuleuw		$T0,$I1,$R0
1434	vaddudm		$ACC1,$ACC1,$T0
1435	vmuleuw		$T0,$I1,$R1
1436	vaddudm		$ACC2,$ACC2,$T0
1437	 vmuleuw	$ACC4,$I2,$R2
1438	vmuleuw		$T0,$I4,$S1
1439	vaddudm		$ACC0,$ACC0,$T0
1440	vmuleuw		$T0,$I2,$R1
1441	vaddudm		$ACC3,$ACC3,$T0
1442	lvx		$S3,$x50,$_ctx
1443	vmuleuw		$T0,$I3,$R1
1444	vaddudm		$ACC4,$ACC4,$T0
1445	lvx		$R3,$x40,$_ctx
1446
1447	 vaddudm	$H2,$H2,$I2
1448	 vaddudm	$H0,$H0,$I0
1449	 vaddudm	$H3,$H3,$I3
1450	 vaddudm	$H1,$H1,$I1
1451	 vaddudm	$H4,$H4,$I4
1452
1453	vmuleuw		$T0,$I3,$S2
1454	vaddudm		$ACC0,$ACC0,$T0
1455	vmuleuw		$T0,$I4,$S2
1456	vaddudm		$ACC1,$ACC1,$T0
1457	vmuleuw		$T0,$I2,$R0
1458	vaddudm		$ACC2,$ACC2,$T0
1459	vmuleuw		$T0,$I3,$R0
1460	vaddudm		$ACC3,$ACC3,$T0
1461	lvx		$S4,$x70,$_ctx
1462	vmuleuw		$T0,$I4,$R0
1463	vaddudm		$ACC4,$ACC4,$T0
1464	lvx		$R4,$x60,$_ctx
1465
1466	vmuleuw		$T0,$I2,$S3
1467	vaddudm		$ACC0,$ACC0,$T0
1468	vmuleuw		$T0,$I3,$S3
1469	vaddudm		$ACC1,$ACC1,$T0
1470	vmuleuw		$T0,$I4,$S3
1471	vaddudm		$ACC2,$ACC2,$T0
1472	vmuleuw		$T0,$I0,$R3
1473	vaddudm		$ACC3,$ACC3,$T0
1474	vmuleuw		$T0,$I1,$R3
1475	vaddudm		$ACC4,$ACC4,$T0
1476
1477	 be?lvx_u	$_4,$x00,$const		# byte swap mask
1478	 lvx_u		$T1,$x00,$inp		# load next input block
1479	 lvx_u		$T2,$x10,$inp
1480	 lvx_u		$T3,$x20,$inp
1481	 lvx_u		$T4,$x30,$inp
1482	 be?vperm	$T1,$T1,$T1,$_4
1483	 be?vperm	$T2,$T2,$T2,$_4
1484	 be?vperm	$T3,$T3,$T3,$_4
1485	 be?vperm	$T4,$T4,$T4,$_4
1486
1487	vmuleuw		$T0,$I1,$S4
1488	vaddudm		$ACC0,$ACC0,$T0
1489	vmuleuw		$T0,$I2,$S4
1490	vaddudm		$ACC1,$ACC1,$T0
1491	vmuleuw		$T0,$I3,$S4
1492	vaddudm		$ACC2,$ACC2,$T0
1493	vmuleuw		$T0,$I4,$S4
1494	vaddudm		$ACC3,$ACC3,$T0
1495	vmuleuw		$T0,$I0,$R4
1496	vaddudm		$ACC4,$ACC4,$T0
1497
1498	 vpermdi	$I0,$T1,$T2,0b00	# smash input to base 2^26
1499	 vspltisb	$_4,4
1500	 vperm		$I2,$T1,$T2,$I2perm	# 0x...0e0f0001...1e1f1011
1501	 vpermdi	$I3,$T1,$T2,0b11
1502
1503	# (hash + inp[0:1]) * r^4
1504	vmulouw		$T0,$H0,$R0
1505	vaddudm		$ACC0,$ACC0,$T0
1506	vmulouw		$T0,$H1,$R0
1507	vaddudm		$ACC1,$ACC1,$T0
1508	vmulouw		$T0,$H2,$R0
1509	vaddudm		$ACC2,$ACC2,$T0
1510	vmulouw		$T0,$H3,$R0
1511	vaddudm		$ACC3,$ACC3,$T0
1512	vmulouw		$T0,$H4,$R0
1513	vaddudm		$ACC4,$ACC4,$T0
1514
1515	 vpermdi	$T1,$T3,$T4,0b00
1516	 vperm		$T2,$T3,$T4,$I2perm	# 0x...0e0f0001...1e1f1011
1517	 vpermdi	$T3,$T3,$T4,0b11
1518
1519	vmulouw		$T0,$H2,$S3
1520	vaddudm		$ACC0,$ACC0,$T0
1521	vmulouw		$T0,$H3,$S3
1522	vaddudm		$ACC1,$ACC1,$T0
1523	vmulouw		$T0,$H4,$S3
1524	vaddudm		$ACC2,$ACC2,$T0
1525	vmulouw		$T0,$H0,$R3
1526	vaddudm		$ACC3,$ACC3,$T0
1527	lvx		$S1,$x10,$_ctx
1528	vmulouw		$T0,$H1,$R3
1529	vaddudm		$ACC4,$ACC4,$T0
1530	lvx		$R1,$x00,$_ctx
1531
1532	 vsrd		$I1,$I0,$_26
1533	 vsrd		$I2,$I2,$_4
1534	 vsrd		$I4,$I3,$_40
1535	 vsrd		$I3,$I3,$_14
1536
1537	vmulouw		$T0,$H1,$S4
1538	vaddudm		$ACC0,$ACC0,$T0
1539	vmulouw		$T0,$H2,$S4
1540	vaddudm		$ACC1,$ACC1,$T0
1541	vmulouw		$T0,$H3,$S4
1542	vaddudm		$ACC2,$ACC2,$T0
1543	vmulouw		$T0,$H4,$S4
1544	vaddudm		$ACC3,$ACC3,$T0
1545	lvx		$S2,$x30,$_ctx
1546	vmulouw		$T0,$H0,$R4
1547	vaddudm		$ACC4,$ACC4,$T0
1548	lvx		$R2,$x20,$_ctx
1549
1550	 vand		$I0,$I0,$mask26
1551	 vand		$I1,$I1,$mask26
1552	 vand		$I2,$I2,$mask26
1553	 vand		$I3,$I3,$mask26
1554
1555	vmulouw		$T0,$H4,$S1
1556	vaddudm		$ACC0,$ACC0,$T0
1557	vmulouw		$T0,$H0,$R1
1558	vaddudm		$ACC1,$ACC1,$T0
1559	vmulouw		$T0,$H1,$R1
1560	vaddudm		$ACC2,$ACC2,$T0
1561	vmulouw		$T0,$H2,$R1
1562	vaddudm		$ACC3,$ACC3,$T0
1563	vmulouw		$T0,$H3,$R1
1564	vaddudm		$ACC4,$ACC4,$T0
1565
1566	 vsrd		$T2,$T2,$_4
1567	 vsrd		$_4,$T1,$_26
1568	 vsrd		$T4,$T3,$_40
1569	 vsrd		$T3,$T3,$_14
1570
1571	vmulouw		$T0,$H3,$S2
1572	vaddudm		$ACC0,$ACC0,$T0
1573	vmulouw		$T0,$H4,$S2
1574	vaddudm		$ACC1,$ACC1,$T0
1575	vmulouw		$T0,$H0,$R2
1576	vaddudm		$ACC2,$ACC2,$T0
1577	vmulouw		$T0,$H1,$R2
1578	vaddudm		$ACC3,$ACC3,$T0
1579	vmulouw		$T0,$H2,$R2
1580	vaddudm		$ACC4,$ACC4,$T0
1581
1582	 vand		$T1,$T1,$mask26
1583	 vand		$_4,$_4,$mask26
1584	 vand		$T2,$T2,$mask26
1585	 vand		$T3,$T3,$mask26
1586
1587	################################################################
1588	# lazy reduction as discussed in "NEON crypto" by D.J. Bernstein
1589	# and P. Schwabe
1590
1591	vspltisb	$T0,2
1592	vsrd		$H4,$ACC3,$_26
1593	vsrd		$H1,$ACC0,$_26
1594	vand		$H3,$ACC3,$mask26
1595	vand		$H0,$ACC0,$mask26
1596	vaddudm		$H4,$H4,$ACC4		# h3 -> h4
1597	vaddudm		$H1,$H1,$ACC1		# h0 -> h1
1598
1599	 vmrgow		$I4,$T4,$I4
1600	 vmrgow		$I0,$T1,$I0
1601	 vmrgow		$I1,$_4,$I1
1602	 vmrgow		$I2,$T2,$I2
1603	 vmrgow		$I3,$T3,$I3
1604	 vor		$I4,$I4,$padbits
1605
1606	vsrd		$ACC4,$H4,$_26
1607	vsrd		$ACC1,$H1,$_26
1608	vand		$H4,$H4,$mask26
1609	vand		$H1,$H1,$mask26
1610	vaddudm		$H0,$H0,$ACC4
1611	vaddudm		$H2,$ACC2,$ACC1		# h1 -> h2
1612
1613	vsld		$ACC4,$ACC4,$T0		# <<2
1614	vsrd		$ACC2,$H2,$_26
1615	vand		$H2,$H2,$mask26
1616	vaddudm		$H0,$H0,$ACC4		# h4 -> h0
1617	vaddudm		$H3,$H3,$ACC2		# h2 -> h3
1618
1619	vsrd		$ACC0,$H0,$_26
1620	vsrd		$ACC3,$H3,$_26
1621	vand		$H0,$H0,$mask26
1622	vand		$H3,$H3,$mask26
1623	vaddudm		$H1,$H1,$ACC0		# h0 -> h1
1624	vaddudm		$H4,$H4,$ACC3		# h3 -> h4
1625
1626	addi		$inp,$inp,0x40
1627	bdnz		Loop_vsx
1628
1629	neg		$len,$len
1630	andi.		$len,$len,0x30
1631	sub		$inp,$inp,$len
1632
1633	lvx_u		$R0,$x30,$ctx		# load all powers
1634	lvx_u		$R1,$x00,$ctx_
1635	lvx_u		$S1,$x10,$ctx_
1636	lvx_u		$R2,$x20,$ctx_
1637	lvx_u		$S2,$x30,$ctx_
1638
1639Last_vsx:
1640	vmuleuw		$ACC0,$I0,$R0
1641	vmuleuw		$ACC1,$I1,$R0
1642	vmuleuw		$ACC2,$I2,$R0
1643	vmuleuw		$ACC3,$I3,$R0
1644	vmuleuw		$ACC4,$I4,$R0
1645
1646	vmuleuw		$T0,$I4,$S1
1647	vaddudm		$ACC0,$ACC0,$T0
1648	vmuleuw		$T0,$I0,$R1
1649	vaddudm		$ACC1,$ACC1,$T0
1650	vmuleuw		$T0,$I1,$R1
1651	vaddudm		$ACC2,$ACC2,$T0
1652	vmuleuw		$T0,$I2,$R1
1653	vaddudm		$ACC3,$ACC3,$T0
1654	lvx_u		$S3,$x50,$ctx_
1655	vmuleuw		$T0,$I3,$R1
1656	vaddudm		$ACC4,$ACC4,$T0
1657	lvx_u		$R3,$x40,$ctx_
1658
1659	 vaddudm	$H2,$H2,$I2
1660	 vaddudm	$H0,$H0,$I0
1661	 vaddudm	$H3,$H3,$I3
1662	 vaddudm	$H1,$H1,$I1
1663	 vaddudm	$H4,$H4,$I4
1664
1665	vmuleuw		$T0,$I3,$S2
1666	vaddudm		$ACC0,$ACC0,$T0
1667	vmuleuw		$T0,$I4,$S2
1668	vaddudm		$ACC1,$ACC1,$T0
1669	vmuleuw		$T0,$I0,$R2
1670	vaddudm		$ACC2,$ACC2,$T0
1671	vmuleuw		$T0,$I1,$R2
1672	vaddudm		$ACC3,$ACC3,$T0
1673	lvx_u		$S4,$x70,$ctx_
1674	vmuleuw		$T0,$I2,$R2
1675	vaddudm		$ACC4,$ACC4,$T0
1676	lvx_u		$R4,$x60,$ctx_
1677
1678	vmuleuw		$T0,$I2,$S3
1679	vaddudm		$ACC0,$ACC0,$T0
1680	vmuleuw		$T0,$I3,$S3
1681	vaddudm		$ACC1,$ACC1,$T0
1682	vmuleuw		$T0,$I4,$S3
1683	vaddudm		$ACC2,$ACC2,$T0
1684	vmuleuw		$T0,$I0,$R3
1685	vaddudm		$ACC3,$ACC3,$T0
1686	vmuleuw		$T0,$I1,$R3
1687	vaddudm		$ACC4,$ACC4,$T0
1688
1689	vmuleuw		$T0,$I1,$S4
1690	vaddudm		$ACC0,$ACC0,$T0
1691	vmuleuw		$T0,$I2,$S4
1692	vaddudm		$ACC1,$ACC1,$T0
1693	vmuleuw		$T0,$I3,$S4
1694	vaddudm		$ACC2,$ACC2,$T0
1695	vmuleuw		$T0,$I4,$S4
1696	vaddudm		$ACC3,$ACC3,$T0
1697	vmuleuw		$T0,$I0,$R4
1698	vaddudm		$ACC4,$ACC4,$T0
1699
1700	# (hash + inp[0:1]) * r^4
1701	vmulouw		$T0,$H0,$R0
1702	vaddudm		$ACC0,$ACC0,$T0
1703	vmulouw		$T0,$H1,$R0
1704	vaddudm		$ACC1,$ACC1,$T0
1705	vmulouw		$T0,$H2,$R0
1706	vaddudm		$ACC2,$ACC2,$T0
1707	vmulouw		$T0,$H3,$R0
1708	vaddudm		$ACC3,$ACC3,$T0
1709	vmulouw		$T0,$H4,$R0
1710	vaddudm		$ACC4,$ACC4,$T0
1711
1712	vmulouw		$T0,$H2,$S3
1713	vaddudm		$ACC0,$ACC0,$T0
1714	vmulouw		$T0,$H3,$S3
1715	vaddudm		$ACC1,$ACC1,$T0
1716	vmulouw		$T0,$H4,$S3
1717	vaddudm		$ACC2,$ACC2,$T0
1718	vmulouw		$T0,$H0,$R3
1719	vaddudm		$ACC3,$ACC3,$T0
1720	lvx_u		$S1,$x10,$ctx_
1721	vmulouw		$T0,$H1,$R3
1722	vaddudm		$ACC4,$ACC4,$T0
1723	lvx_u		$R1,$x00,$ctx_
1724
1725	vmulouw		$T0,$H1,$S4
1726	vaddudm		$ACC0,$ACC0,$T0
1727	vmulouw		$T0,$H2,$S4
1728	vaddudm		$ACC1,$ACC1,$T0
1729	vmulouw		$T0,$H3,$S4
1730	vaddudm		$ACC2,$ACC2,$T0
1731	vmulouw		$T0,$H4,$S4
1732	vaddudm		$ACC3,$ACC3,$T0
1733	lvx_u		$S2,$x30,$ctx_
1734	vmulouw		$T0,$H0,$R4
1735	vaddudm		$ACC4,$ACC4,$T0
1736	lvx_u		$R2,$x20,$ctx_
1737
1738	vmulouw		$T0,$H4,$S1
1739	vaddudm		$ACC0,$ACC0,$T0
1740	vmulouw		$T0,$H0,$R1
1741	vaddudm		$ACC1,$ACC1,$T0
1742	vmulouw		$T0,$H1,$R1
1743	vaddudm		$ACC2,$ACC2,$T0
1744	vmulouw		$T0,$H2,$R1
1745	vaddudm		$ACC3,$ACC3,$T0
1746	vmulouw		$T0,$H3,$R1
1747	vaddudm		$ACC4,$ACC4,$T0
1748
1749	vmulouw		$T0,$H3,$S2
1750	vaddudm		$ACC0,$ACC0,$T0
1751	vmulouw		$T0,$H4,$S2
1752	vaddudm		$ACC1,$ACC1,$T0
1753	vmulouw		$T0,$H0,$R2
1754	vaddudm		$ACC2,$ACC2,$T0
1755	vmulouw		$T0,$H1,$R2
1756	vaddudm		$ACC3,$ACC3,$T0
1757	vmulouw		$T0,$H2,$R2
1758	vaddudm		$ACC4,$ACC4,$T0
1759
1760	################################################################
1761	# horizontal addition
1762
1763	vpermdi		$H0,$ACC0,$ACC0,0b10
1764	vpermdi		$H1,$ACC1,$ACC1,0b10
1765	vpermdi		$H2,$ACC2,$ACC2,0b10
1766	vpermdi		$H3,$ACC3,$ACC3,0b10
1767	vpermdi		$H4,$ACC4,$ACC4,0b10
1768	vaddudm		$ACC0,$ACC0,$H0
1769	vaddudm		$ACC1,$ACC1,$H1
1770	vaddudm		$ACC2,$ACC2,$H2
1771	vaddudm		$ACC3,$ACC3,$H3
1772	vaddudm		$ACC4,$ACC4,$H4
1773
1774	################################################################
1775	# lazy reduction
1776
1777	vspltisb	$T0,2
1778	vsrd		$H4,$ACC3,$_26
1779	vsrd		$H1,$ACC0,$_26
1780	vand		$H3,$ACC3,$mask26
1781	vand		$H0,$ACC0,$mask26
1782	vaddudm		$H4,$H4,$ACC4		# h3 -> h4
1783	vaddudm		$H1,$H1,$ACC1		# h0 -> h1
1784
1785	vsrd		$ACC4,$H4,$_26
1786	vsrd		$ACC1,$H1,$_26
1787	vand		$H4,$H4,$mask26
1788	vand		$H1,$H1,$mask26
1789	vaddudm		$H0,$H0,$ACC4
1790	vaddudm		$H2,$ACC2,$ACC1		# h1 -> h2
1791
1792	vsld		$ACC4,$ACC4,$T0		# <<2
1793	vsrd		$ACC2,$H2,$_26
1794	vand		$H2,$H2,$mask26
1795	vaddudm		$H0,$H0,$ACC4		# h4 -> h0
1796	vaddudm		$H3,$H3,$ACC2		# h2 -> h3
1797
1798	vsrd		$ACC0,$H0,$_26
1799	vsrd		$ACC3,$H3,$_26
1800	vand		$H0,$H0,$mask26
1801	vand		$H3,$H3,$mask26
1802	vaddudm		$H1,$H1,$ACC0		# h0 -> h1
1803	vaddudm		$H4,$H4,$ACC3		# h3 -> h4
1804
1805	beq		Ldone_vsx
1806
1807	add		r6,$const,$len
1808
1809	be?lvx_u	$_4,$x00,$const		# byte swap mask
1810	lvx_u		$T1,$x00,$inp		# load last partial input block
1811	lvx_u		$T2,$x10,$inp
1812	lvx_u		$T3,$x20,$inp
1813	lvx_u		$T4,$x30,$inp
1814	be?vperm	$T1,$T1,$T1,$_4
1815	be?vperm	$T2,$T2,$T2,$_4
1816	be?vperm	$T3,$T3,$T3,$_4
1817	be?vperm	$T4,$T4,$T4,$_4
1818
1819	vpermdi		$I0,$T1,$T2,0b00	# smash input to base 2^26
1820	vspltisb	$_4,4
1821	vperm		$I2,$T1,$T2,$I2perm	# 0x...0e0f0001...1e1f1011
1822	vpermdi		$I3,$T1,$T2,0b11
1823
1824	vsrd		$I1,$I0,$_26
1825	vsrd		$I2,$I2,$_4
1826	vsrd		$I4,$I3,$_40
1827	vsrd		$I3,$I3,$_14
1828	vand		$I0,$I0,$mask26
1829	vand		$I1,$I1,$mask26
1830	vand		$I2,$I2,$mask26
1831	vand		$I3,$I3,$mask26
1832
1833	vpermdi		$T0,$T3,$T4,0b00
1834	vperm		$T1,$T3,$T4,$I2perm	# 0x...0e0f0001...1e1f1011
1835	vpermdi		$T2,$T3,$T4,0b11
1836
1837	lvx_u		$ACC0,$x00,r6
1838	lvx_u		$ACC1,$x30,r6
1839
1840	vsrd		$T3,$T0,$_26
1841	vsrd		$T1,$T1,$_4
1842	vsrd		$T4,$T2,$_40
1843	vsrd		$T2,$T2,$_14
1844	vand		$T0,$T0,$mask26
1845	vand		$T3,$T3,$mask26
1846	vand		$T1,$T1,$mask26
1847	vand		$T2,$T2,$mask26
1848
1849	# inp[2]:inp[0]:inp[3]:inp[1]
1850	vmrgow		$I4,$T4,$I4
1851	vmrgow		$I0,$T0,$I0
1852	vmrgow		$I1,$T3,$I1
1853	vmrgow		$I2,$T1,$I2
1854	vmrgow		$I3,$T2,$I3
1855	vor		$I4,$I4,$padbits
1856
1857	vperm		$H0,$H0,$H0,$ACC0	# move hash to right lane
1858	vand		$I0,$I0,    $ACC1	# mask redundant input lane[s]
1859	vperm		$H1,$H1,$H1,$ACC0
1860	vand		$I1,$I1,    $ACC1
1861	vperm		$H2,$H2,$H2,$ACC0
1862	vand		$I2,$I2,    $ACC1
1863	vperm		$H3,$H3,$H3,$ACC0
1864	vand		$I3,$I3,    $ACC1
1865	vperm		$H4,$H4,$H4,$ACC0
1866	vand		$I4,$I4,    $ACC1
1867
1868	vaddudm		$I0,$I0,$H0		# accumulate hash
1869	vxor		$H0,$H0,$H0		# wipe hash value
1870	vaddudm		$I1,$I1,$H1
1871	vxor		$H1,$H1,$H1
1872	vaddudm		$I2,$I2,$H2
1873	vxor		$H2,$H2,$H2
1874	vaddudm		$I3,$I3,$H3
1875	vxor		$H3,$H3,$H3
1876	vaddudm		$I4,$I4,$H4
1877	vxor		$H4,$H4,$H4
1878
1879	xor.		$len,$len,$len
1880	b		Last_vsx
1881
1882.align	4
1883Ldone_vsx:
1884	$POP	r0,`$VSXFRAME+$LRSAVE`($sp)
1885	li	$x10,4
1886	li	$x20,8
1887	li	$x30,12
1888	li	$x40,16
1889	stvwx_u	$H0,$x00,$ctx			# store hash
1890	stvwx_u	$H1,$x10,$ctx
1891	stvwx_u	$H2,$x20,$ctx
1892	stvwx_u	$H3,$x30,$ctx
1893	stvwx_u	$H4,$x40,$ctx
1894
1895	lwz	r12,`$VSXFRAME-$SIZE_T*5-4`($sp)# pull vrsave
1896	mtlr	r0
1897	li	r10,`15+$LOCALS+128`
1898	li	r11,`31+$LOCALS+128`
1899	mtspr	256,r12				# restore vrsave
1900	lvx	v20,r10,$sp
1901	addi	r10,r10,32
1902	lvx	v21,r11,$sp
1903	addi	r11,r11,32
1904	lvx	v22,r10,$sp
1905	addi	r10,r10,32
1906	lvx	v23,r11,$sp
1907	addi	r11,r11,32
1908	lvx	v24,r10,$sp
1909	addi	r10,r10,32
1910	lvx	v25,r11,$sp
1911	addi	r11,r11,32
1912	lvx	v26,r10,$sp
1913	addi	r10,r10,32
1914	lvx	v27,r11,$sp
1915	addi	r11,r11,32
1916	lvx	v28,r10,$sp
1917	addi	r10,r10,32
1918	lvx	v29,r11,$sp
1919	addi	r11,r11,32
1920	lvx	v30,r10,$sp
1921	lvx	v31,r11,$sp
1922	$POP	r27,`$VSXFRAME-$SIZE_T*5`($sp)
1923	$POP	r28,`$VSXFRAME-$SIZE_T*4`($sp)
1924	$POP	r29,`$VSXFRAME-$SIZE_T*3`($sp)
1925	$POP	r30,`$VSXFRAME-$SIZE_T*2`($sp)
1926	$POP	r31,`$VSXFRAME-$SIZE_T*1`($sp)
1927	addi	$sp,$sp,$VSXFRAME
1928	blr
1929	.long	0
1930	.byte	0,12,0x04,1,0x80,5,4,0
1931	.long	0
1932.size	__poly1305_blocks_vsx,.-__poly1305_blocks_vsx
1933
################################################################
# LPICmeup - position-independent lookup of the constant table.
#
# Hands back, in the const pointer register, the address of the
# first table entry below. The caller's link register is parked in
# r0 and put back before returning, so only r0 and the const pointer
# register are modified here.
#
# The routine starts on a 64-byte boundary and is padded to exactly
# 64 bytes, which places the table at LPICmeup+64:
#   6 instructions + 12 trailer bytes = 9*4 bytes, hence the .space.
# The branch-and-link targets the very next instruction, so the link
# register then holds LPICmeup+8; adding 64-8 yields LPICmeup+64.
#
# Table layout, as offsets from the returned pointer:
#   0x00 mask26   0x10 _26   0x20 _40   0x30 I2perm   0x40 padbits
#   0x50 byte swap mask (only loaded by the big-endian tagged lines)
#   0x60..0x80 permute masks that move the hash to the right lane
#   0x90..0xb0 masks that clear the redundant input lanes
# The last two groups are picked by the tail code of
# __poly1305_blocks_vsx, which indexes them by the remaining length
# (0x10, 0x20 or 0x30) after the pointer has been advanced by 0x50.
################################################################
.align	6
LPICmeup:
	mflr	r0			# preserve caller's return address
	bcl	20,31,\$+4		# link register = address of next insn
	mflr	$const      # vvvvvv "distance" between . and 1st data entry
	addi	$const,$const,`64-8`
	mtlr	r0			# restore caller's return address
	blr
	# 12 trailer bytes counted in the padding arithmetic above
	# (NOTE(review): looks like a traceback tag - confirm)
	.long	0
	.byte	0,12,0x14,0,0,0,0,0
	.space	`64-9*4`

.quad	0x0000000003ffffff,0x0000000003ffffff	# mask26
.quad	0x000000000000001a,0x000000000000001a	# _26
.quad	0x0000000000000028,0x0000000000000028	# _40
.quad	0x000000000e0f0001,0x000000001e1f1011	# I2perm
.quad	0x0100000001000000,0x0100000001000000	# padbits
.quad	0x0706050403020100,0x0f0e0d0c0b0a0908	# byte swap for big-endian

.quad	0x0000000000000000,0x0000000004050607	# magic tail masks
.quad	0x0405060700000000,0x0000000000000000
.quad	0x0000000000000000,0x0405060700000000

# lane masks paired with the tail masks above (same order)
.quad	0xffffffff00000000,0xffffffffffffffff
.quad	0xffffffff00000000,0xffffffff00000000
.quad	0x0000000000000000,0xffffffff00000000
1960___
1961}}}
# Identification string embedded in the generated object. The "\@"
# stops Perl from interpolating an array inside the heredoc.
$code.=<<___;
.asciz	"Poly1305 for PPC, CRYPTOGAMS by \@dot-asm"
___

# Post-process the accumulated assembly text and write it to STDOUT,
# one line at a time.
foreach (split("\n",$code)) {
	# Anything between backticks is a Perl expression (frame offsets
	# and similar arithmetic); replace it with its evaluated result.
	# The text comes from this script's own heredocs, not from
	# external input.
	s/\`([^\`]*)\`/eval($1)/ge;

	# Lines tagged "be?" or "le?" are endian-specific. The tag that
	# matches the target endianness is simply removed; the other one
	# is rewritten to "#be#" / "#le#", which turns the rest of the
	# line into an assembler comment.
	if ($flavour !~ /le$/) {	# big-endian
	    s/be\?//		or
	    s/le\?/#le#/
	} else {			# little-endian
	    s/le\?//		or
	    s/be\?/#be#/
	}

	print $_,"\n";
}
# Buffered write errors only surface at close time, so check it.
close STDOUT or die "error closing STDOUT: $!";
1982