xref: /freebsd/crypto/openssl/crypto/poly1305/asm/poly1305-ppc.pl (revision e6bfd18d21b225af6a0ed67ceeaf1293b7b9eba5)
1#! /usr/bin/env perl
2# Copyright 2016-2020 The OpenSSL Project Authors. All Rights Reserved.
3#
4# Licensed under the OpenSSL license (the "License").  You may not use
5# this file except in compliance with the License.  You can obtain a copy
6# in the file LICENSE in the source distribution or at
7# https://www.openssl.org/source/license.html
8
9#
10# ====================================================================
11# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
12# project. The module is, however, dual licensed under OpenSSL and
13# CRYPTOGAMS licenses depending on where you obtain it. For further
14# details see http://www.openssl.org/~appro/cryptogams/.
15# ====================================================================
16#
17# This module implements Poly1305 hash for PowerPC.
18#
19# June 2015
20#
21# Numbers are cycles per processed byte with poly1305_blocks alone,
22# and improvement coefficients relative to gcc-generated code.
23#
24#			-m32		-m64
25#
26# Freescale e300	14.8/+80%	-
27# PPC74x0		7.60/+60%	-
28# PPC970		7.00/+114%	3.51/+205%
29# POWER7		3.75/+260%	1.93/+100%
30# POWER8		-		2.03/+200%
31# POWER9		-		2.00/+150%
32#
33# Do we need floating-point implementation for PPC? Results presented
34# in poly1305_ieee754.c are tricky to compare to, because they are for
35# compiler-generated code. On the other hand it's known that floating-
36# point performance can be dominated by FPU latency, which means that
37# there is limit even for ideally optimized (and even vectorized) code.
38# And this limit is estimated to be higher than above -m64 results. Or
39# in other words floating-point implementation can be meaningful to
40# consider only in 32-bit application context. We probably have to
41# recognize that 32-bit builds are getting less popular on high-end
42# systems and therefore tend to target embedded ones, which might not
43# even have FPU...
44#
45# On side note, Power ISA 2.07 enables vector base 2^26 implementation,
46# and POWER8 might have capacity to break 1.0 cycle per byte barrier...
47
# The first command-line argument names the target flavour (e.g. linux64le).
$flavour = shift;

# Only flavours that mention a word size are accepted; "64" is looked for
# before "32".
die "nonsense $flavour" unless ($flavour =~ /64|32/);

# Machine word size in bytes.
$SIZE_T = ($flavour =~ /64/) ? 8 : 4;

# Word-size dependent values: the offset used when saving the link
# register, and the mnemonics for unsigned compare, store-with-update,
# load (pop) and store (push) of one machine word.
($LRSAVE,$UCMP,$STU,$POP,$PUSH) = ($SIZE_T == 8)
	? (2*$SIZE_T,"cmpld","stdu","ld", "std")
	: (  $SIZE_T,"cmplw","stwu","lwz","stw");

# A flavour ending in "le" (i.e.: linux64le) is little-endian; the flag
# is then set to the (non-zero) word size, otherwise it is 0.
$LITTLE_ENDIAN = 0;
$LITTLE_ENDIAN = $SIZE_T if ($flavour =~ /le$/);
69
# Locate the perlasm translator relative to this script: first in the
# script's own directory, then in ../../perlasm/ below it.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
die "can't locate ppc-xlate.pl";

# Everything printed to STDOUT from here on is piped through the
# translator.  The next command-line argument is appended to the
# translator's command line.
#
# The failure check must use parentheses and low-precedence "or": written
# as `open STDOUT,"...".shift || die ...` the "||" applies to the command
# string, which is always true, so the die could never be reached and a
# failed open would go unnoticed.
open(STDOUT,"| $^X $xlate $flavour ".shift) or die "can't call $xlate: $!";
76
# Size of the stack frame allocated by the generated functions:
# 24 machine words.
$FRAME = $SIZE_T*24;

# Fixed register names used throughout the generated code.
$sp = "r1";
my $mask = "r0";
my ($ctx,$inp,$len,$padbit) = ("r3","r4","r5","r6");
# poly1305_emit refers to the same two registers as $inp/$len under
# different names.
my $mac   = $inp;
my $nonce = $len;

# Start of the generated assembly text.
$code  = ".machine\t\"any\"\n";
$code .= ".text\n";
88							if ($flavour =~ /64/) {
89###############################################################################
90# base 2^64 implementation
91
# Register names for the base 2^64 code: r7..r12 followed by r27..r31.
my ($h0,$h1,$h2,$d0,$d1,$d2) = map { "r$_" } 7..12;
my ($r0,$r1,$s1,$t0,$t1)     = map { "r$_" } 27..31;

# poly1305_init_int: zero the three 64-bit hash words at the start of the
# context, then skip the key setup when the key pointer is zero.
$code.=<<___;
.globl	.poly1305_init_int
.align	4
.poly1305_init_int:
	xor	r0,r0,r0
	std	r0,0($ctx)		# zero hash value
	std	r0,8($ctx)
	std	r0,16($ctx)

	$UCMP	$inp,r0
	beq-	Lno_key
___

# Load the first 16 bytes of key material into $d0/$d1.  Little-endian
# targets use two plain 64-bit loads; otherwise each 64-bit value is put
# together from two byte-reversed 32-bit loads.
if ($LITTLE_ENDIAN) {
$code.=<<___;
	ld	$d0,0($inp)		# load key material
	ld	$d1,8($inp)
___
} else {
$code.=<<___;
	li	$h0,4
	lwbrx	$d0,0,$inp		# load key material
	li	$d1,8
	lwbrx	$h0,$h0,$inp
	li	$h1,12
	lwbrx	$d1,$d1,$inp
	lwbrx	$h1,$h1,$inp
	insrdi	$d0,$h0,32,0
	insrdi	$d1,$h1,32,0
___
}
# Rest of poly1305_init_int: mask the two key words with
# 0x0ffffffc0fffffff and 0x0ffffffc0ffffffc, store them at offsets 32 and
# 40 of the context, and return 0 in r3.
$code.=<<___;
	lis	$h1,0xfff		# 0x0fff0000
	ori	$h1,$h1,0xfffc		# 0x0ffffffc
	insrdi	$h1,$h1,32,0		# 0x0ffffffc0ffffffc
	ori	$h0,$h1,3		# 0x0ffffffc0fffffff

	and	$d0,$d0,$h0
	and	$d1,$d1,$h1

	std	$d0,32($ctx)		# store key
	std	$d1,40($ctx)

Lno_key:
	xor	r3,r3,r3
	blr
	.long	0
	.byte	0,12,0x14,0,0,0,2,0
.size	.poly1305_init_int,.-.poly1305_init_int

___

# poly1305_blocks entry: turn the byte length into a count of 16-byte
# blocks (returning at once when it is zero), allocate the frame and save
# r27..r31 and the link register.
$code.=<<___;
.globl	.poly1305_blocks
.align	4
.poly1305_blocks:
	srdi.	$len,$len,4
	beq-	Labort

	$STU	$sp,-$FRAME($sp)
	mflr	r0
	$PUSH	r27,`$FRAME-$SIZE_T*5`($sp)
	$PUSH	r28,`$FRAME-$SIZE_T*4`($sp)
	$PUSH	r29,`$FRAME-$SIZE_T*3`($sp)
	$PUSH	r30,`$FRAME-$SIZE_T*2`($sp)
	$PUSH	r31,`$FRAME-$SIZE_T*1`($sp)
	$PUSH	r0,`$FRAME+$LRSAVE`($sp)

___

# Load key and hash words from the context, precompute
# s1 = r1 + (r1>>2), put the block count in the count register and set
# the mask register to 3 before entering the loop.
$code.=<<___;
	ld	$r0,32($ctx)		# load key
	ld	$r1,40($ctx)

	ld	$h0,0($ctx)		# load hash value
	ld	$h1,8($ctx)
	ld	$h2,16($ctx)

	srdi	$s1,$r1,2
	mtctr	$len
	add	$s1,$s1,$r1		# s1 = r1 + r1>>2
	li	$mask,3
	b	Loop

.align	4
Loop:
___
# Fetch the next 16 input bytes into $t0/$t1.  Little-endian targets use
# two plain 64-bit loads; otherwise each 64-bit value is put together
# from two byte-reversed 32-bit loads.
if ($LITTLE_ENDIAN) {
$code.=<<___;
	ld	$t0,0($inp)		# load input
	ld	$t1,8($inp)
___
} else {
$code.=<<___;
	li	$d0,4
	lwbrx	$t0,0,$inp		# load input
	li	$t1,8
	lwbrx	$d0,$d0,$inp
	li	$d1,12
	lwbrx	$t1,$t1,$inp
	lwbrx	$d1,$d1,$inp
	insrdi	$t0,$d0,32,0
	insrdi	$t1,$d1,32,0
___
}
# Loop body: advance the input pointer, add the block (and the pad bit
# into the top word) to the hash, multiply by the key using the
# precomputed s1, and fold everything above the low two bits of the top
# word back into the low words.  Repeats until the count register runs
# out.
$code.=<<___;
	addi	$inp,$inp,16

	addc	$h0,$h0,$t0		# accumulate input
	adde	$h1,$h1,$t1

	mulld	$d0,$h0,$r0		# h0*r0
	mulhdu	$d1,$h0,$r0
	adde	$h2,$h2,$padbit

	mulld	$t0,$h1,$s1		# h1*5*r1
	mulhdu	$t1,$h1,$s1
	addc	$d0,$d0,$t0
	adde	$d1,$d1,$t1

	mulld	$t0,$h0,$r1		# h0*r1
	mulhdu	$d2,$h0,$r1
	addc	$d1,$d1,$t0
	addze	$d2,$d2

	mulld	$t0,$h1,$r0		# h1*r0
	mulhdu	$t1,$h1,$r0
	addc	$d1,$d1,$t0
	adde	$d2,$d2,$t1

	mulld	$t0,$h2,$s1		# h2*5*r1
	mulld	$t1,$h2,$r0		# h2*r0
	addc	$d1,$d1,$t0
	adde	$d2,$d2,$t1

	andc	$t0,$d2,$mask		# final reduction step
	and	$h2,$d2,$mask
	srdi	$t1,$t0,2
	add	$t0,$t0,$t1
	addc	$h0,$d0,$t0
	addze	$h1,$d1
	addze	$h2,$h2

	bdnz	Loop

___

# poly1305_blocks exit: write the hash words back to the context, restore
# r27..r31 and release the frame.
$code.=<<___;
	std	$h0,0($ctx)		# store hash value
	std	$h1,8($ctx)
	std	$h2,16($ctx)

	$POP	r27,`$FRAME-$SIZE_T*5`($sp)
	$POP	r28,`$FRAME-$SIZE_T*4`($sp)
	$POP	r29,`$FRAME-$SIZE_T*3`($sp)
	$POP	r30,`$FRAME-$SIZE_T*2`($sp)
	$POP	r31,`$FRAME-$SIZE_T*1`($sp)
	addi	$sp,$sp,$FRAME
Labort:
	blr
	.long	0
	.byte	0,12,4,1,0x80,5,4,0
.size	.poly1305_blocks,.-.poly1305_blocks

___

# poly1305_emit, first part: load hash and nonce, compute hash+5 with
# carry propagation, and use the bit above the low two bits of the top
# word to select between the original and the incremented low two words.
$code.=<<___;
.globl	.poly1305_emit
.align	4
.poly1305_emit:
	ld	$h0,0($ctx)		# load hash
	ld	$h1,8($ctx)
	ld	$h2,16($ctx)
	ld	$padbit,0($nonce)	# load nonce
	ld	$nonce,8($nonce)

	addic	$d0,$h0,5		# compare to modulus
	addze	$d1,$h1
	addze	$d2,$h2

	srdi	$mask,$d2,2		# did it carry/borrow?
	neg	$mask,$mask

	andc	$h0,$h0,$mask
	and	$d0,$d0,$mask
	andc	$h1,$h1,$mask
	and	$d1,$d1,$mask
	or	$h0,$h0,$d0
	or	$h1,$h1,$d1
___
# On targets that are not little-endian the two 32-bit halves of each
# loaded nonce word are swapped first.
unless ($LITTLE_ENDIAN) {
$code.=<<___;
	rotldi	$padbit,$padbit,32	# flip nonce words
	rotldi	$nonce,$nonce,32
___
}

# Add the nonce to the selected hash value with carry between the words.
$code.=<<___;
	addc	$h0,$h0,$padbit		# accumulate nonce
	adde	$h1,$h1,$nonce
___

# Write the 16-byte result.  Little-endian targets use two 64-bit stores;
# otherwise it is written as four byte-reversed 32-bit stores.
if ($LITTLE_ENDIAN) {
$code.=<<___;
	std	$h0,0($mac)		# write result
	std	$h1,8($mac)
___
} else {
$code.=<<___;
	extrdi	r0,$h0,32,0
	li	$d0,4
	stwbrx	$h0,0,$mac		# write result
	extrdi	$h0,$h1,32,0
	li	$d1,8
	stwbrx	r0,$d0,$mac
	li	$d2,12
	stwbrx	$h1,$d1,$mac
	stwbrx	$h0,$d2,$mac
___
}

# Return and close the function.
$code.=<<___;
	blr
	.long	0
	.byte	0,12,0x14,0,0,0,3,0
.size	.poly1305_emit,.-.poly1305_emit
___
294							} else {
295###############################################################################
296# base 2^32 implementation
297
# Register names for the base 2^32 code: r7..r12 and r14..r31 (r13 is
# left out), handed out group by group.
my ($h0,$h1,$h2,$h3,$h4,$r0) = map { "r$_" } 7..12;
my ($r1,$r2,$r3)             = map { "r$_" } 14..16;
my ($s1,$s2,$s3)             = map { "r$_" } 17..19;
my ($t0,$t1,$t2,$t3)         = map { "r$_" } 20..23;
my ($D0,$D1,$D2,$D3)         = map { "r$_" } 24..27;
my ($d0,$d1,$d2,$d3)         = map { "r$_" } 28..31;

# poly1305_init_int: zero the five 32-bit hash words at the start of the
# context, then skip the key setup when the key pointer is zero.
$code.=<<___;
.globl	.poly1305_init_int
.align	4
.poly1305_init_int:
	xor	r0,r0,r0
	stw	r0,0($ctx)		# zero hash value
	stw	r0,4($ctx)
	stw	r0,8($ctx)
	stw	r0,12($ctx)
	stw	r0,16($ctx)

	$UCMP	$inp,r0
	beq-	Lno_key
___
# Load the first 16 bytes of key material into $h0..$h3 as four 32-bit
# words.
#
# Little-endian targets use plain word loads.  The mnemonic must be
# "lwz" (load word and zero), as in every other 32-bit load emitted by
# this file; "lw" is not a PowerPC instruction.
$code.=<<___	if ($LITTLE_ENDIAN);
	lwz	$h0,0($inp)		# load key material
	lwz	$h1,4($inp)
	lwz	$h2,8($inp)
	lwz	$h3,12($inp)
___
# Otherwise each word is fetched with a byte-reversed load; the index
# registers are first set to the byte offsets 4, 8 and 12.
$code.=<<___	if (!$LITTLE_ENDIAN);
	li	$h1,4
	lwbrx	$h0,0,$inp		# load key material
	li	$h2,8
	lwbrx	$h1,$h1,$inp
	li	$h3,12
	lwbrx	$h2,$h2,$inp
	lwbrx	$h3,$h3,$inp
___
# Rest of poly1305_init_int: clear the top four bits of the first key
# word, mask the other three with 0x0ffffffc, store the four words at
# offsets 32..44 of the context, and return 0 in r3.
$code.=<<___;
	lis	$mask,0xf000		# 0xf0000000
	li	$r0,-4
	andc	$r0,$r0,$mask		# 0x0ffffffc

	andc	$h0,$h0,$mask
	and	$h1,$h1,$r0
	and	$h2,$h2,$r0
	and	$h3,$h3,$r0

	stw	$h0,32($ctx)		# store key
	stw	$h1,36($ctx)
	stw	$h2,40($ctx)
	stw	$h3,44($ctx)

Lno_key:
	xor	r3,r3,r3
	blr
	.long	0
	.byte	0,12,0x14,0,0,0,2,0
.size	.poly1305_init_int,.-.poly1305_init_int

___

# poly1305_blocks entry: turn the byte length into a count of 16-byte
# blocks (returning at once when it is zero), allocate the frame and save
# r14..r31 and the link register.
$code.=<<___;
.globl	.poly1305_blocks
.align	4
.poly1305_blocks:
	srwi.	$len,$len,4
	beq-	Labort

	$STU	$sp,-$FRAME($sp)
	mflr	r0
	$PUSH	r14,`$FRAME-$SIZE_T*18`($sp)
	$PUSH	r15,`$FRAME-$SIZE_T*17`($sp)
	$PUSH	r16,`$FRAME-$SIZE_T*16`($sp)
	$PUSH	r17,`$FRAME-$SIZE_T*15`($sp)
	$PUSH	r18,`$FRAME-$SIZE_T*14`($sp)
	$PUSH	r19,`$FRAME-$SIZE_T*13`($sp)
	$PUSH	r20,`$FRAME-$SIZE_T*12`($sp)
	$PUSH	r21,`$FRAME-$SIZE_T*11`($sp)
	$PUSH	r22,`$FRAME-$SIZE_T*10`($sp)
	$PUSH	r23,`$FRAME-$SIZE_T*9`($sp)
	$PUSH	r24,`$FRAME-$SIZE_T*8`($sp)
	$PUSH	r25,`$FRAME-$SIZE_T*7`($sp)
	$PUSH	r26,`$FRAME-$SIZE_T*6`($sp)
	$PUSH	r27,`$FRAME-$SIZE_T*5`($sp)
	$PUSH	r28,`$FRAME-$SIZE_T*4`($sp)
	$PUSH	r29,`$FRAME-$SIZE_T*3`($sp)
	$PUSH	r30,`$FRAME-$SIZE_T*2`($sp)
	$PUSH	r31,`$FRAME-$SIZE_T*1`($sp)
	$PUSH	r0,`$FRAME+$LRSAVE`($sp)

___

# Load key and hash words from the context, precompute
# si = ri + (ri>>2) for i = 1..3, put the block count in the count
# register and set the mask register to 3 before entering the loop.
$code.=<<___;
	lwz	$r0,32($ctx)		# load key
	lwz	$r1,36($ctx)
	lwz	$r2,40($ctx)
	lwz	$r3,44($ctx)

	lwz	$h0,0($ctx)		# load hash value
	lwz	$h1,4($ctx)
	lwz	$h2,8($ctx)
	lwz	$h3,12($ctx)
	lwz	$h4,16($ctx)

	srwi	$s1,$r1,2
	srwi	$s2,$r2,2
	srwi	$s3,$r3,2
	add	$s1,$s1,$r1		# si = ri + ri>>2
	add	$s2,$s2,$r2
	add	$s3,$s3,$r3
	mtctr	$len
	li	$mask,3
	b	Loop

.align	4
Loop:
___
# Fetch the next 16 input bytes into $d0..$d3 as four 32-bit words:
# plain word loads on little-endian targets, byte-reversed loads
# otherwise.
if ($LITTLE_ENDIAN) {
$code.=<<___;
	lwz	$d0,0($inp)		# load input
	lwz	$d1,4($inp)
	lwz	$d2,8($inp)
	lwz	$d3,12($inp)
___
} else {
$code.=<<___;
	li	$d1,4
	lwbrx	$d0,0,$inp		# load input
	li	$d2,8
	lwbrx	$d1,$d1,$inp
	li	$d3,12
	lwbrx	$d2,$d2,$inp
	lwbrx	$d3,$d3,$inp
___
}
# Loop body, part 1: advance the input pointer, add the block (and the
# pad bit into the top word) to the hash, and form the four products of
# h0 with r0..r3.  The carry chain of the accumulation is continued
# between the multiplies.
$code.=<<___;
	addi	$inp,$inp,16

	addc	$h0,$h0,$d0		# accumulate input
	adde	$h1,$h1,$d1
	adde	$h2,$h2,$d2

	mullw	$d0,$h0,$r0		# h0*r0
	mulhwu	$D0,$h0,$r0

	mullw	$d1,$h0,$r1		# h0*r1
	mulhwu	$D1,$h0,$r1

	mullw	$d2,$h0,$r2		# h0*r2
	mulhwu	$D2,$h0,$r2

	 adde	$h3,$h3,$d3
	 adde	$h4,$h4,$padbit

	mullw	$d3,$h0,$r3		# h0*r3
	mulhwu	$D3,$h0,$r3

___

# Loop body, part 2: products of h1, h2 and h3 with the key words and the
# precomputed s1..s3.  Each 64-bit product is added into one of the
# (d,D) low/high pairs while the next multiply is issued.
$code.=<<___;
	mullw	$t0,$h1,$s3		# h1*s3
	mulhwu	$t1,$h1,$s3

	mullw	$t2,$h1,$r0		# h1*r0
	mulhwu	$t3,$h1,$r0
	 addc	$d0,$d0,$t0
	 adde	$D0,$D0,$t1

	mullw	$t0,$h1,$r1		# h1*r1
	mulhwu	$t1,$h1,$r1
	 addc	$d1,$d1,$t2
	 adde	$D1,$D1,$t3

	mullw	$t2,$h1,$r2		# h1*r2
	mulhwu	$t3,$h1,$r2
	 addc	$d2,$d2,$t0
	 adde	$D2,$D2,$t1

	mullw	$t0,$h2,$s2		# h2*s2
	mulhwu	$t1,$h2,$s2
	 addc	$d3,$d3,$t2
	 adde	$D3,$D3,$t3

	mullw	$t2,$h2,$s3		# h2*s3
	mulhwu	$t3,$h2,$s3
	 addc	$d0,$d0,$t0
	 adde	$D0,$D0,$t1

	mullw	$t0,$h2,$r0		# h2*r0
	mulhwu	$t1,$h2,$r0
	 addc	$d1,$d1,$t2
	 adde	$D1,$D1,$t3

	mullw	$t2,$h2,$r1		# h2*r1
	mulhwu	$t3,$h2,$r1
	 addc	$d2,$d2,$t0
	 adde	$D2,$D2,$t1

	mullw	$t0,$h3,$s1		# h3*s1
	mulhwu	$t1,$h3,$s1
	 addc	$d3,$d3,$t2
	 adde	$D3,$D3,$t3

	mullw	$t2,$h3,$s2		# h3*s2
	mulhwu	$t3,$h3,$s2
	 addc	$d0,$d0,$t0
	 adde	$D0,$D0,$t1

	mullw	$t0,$h3,$s3		# h3*s3
	mulhwu	$t1,$h3,$s3
	 addc	$d1,$d1,$t2
	 adde	$D1,$D1,$t3

	mullw	$t2,$h3,$r0		# h3*r0
	mulhwu	$t3,$h3,$r0
	 addc	$d2,$d2,$t0
	 adde	$D2,$D2,$t1

___

# Loop body, part 3: low-word-only products of h4, combination of the
# low/high pairs into the new hash words, and the reduction that folds
# everything above the low two bits of the top word back into the low
# words.  Repeats until the count register runs out.
$code.=<<___;
	mullw	$t0,$h4,$s1		# h4*s1
	 addc	$d3,$d3,$t2
	 adde	$D3,$D3,$t3
	addc	$d1,$d1,$t0

	mullw	$t1,$h4,$s2		# h4*s2
	 addze	$D1,$D1
	addc	$d2,$d2,$t1
	addze	$D2,$D2

	mullw	$t2,$h4,$s3		# h4*s3
	addc	$d3,$d3,$t2
	addze	$D3,$D3

	mullw	$h4,$h4,$r0		# h4*r0

	addc	$h1,$d1,$D0
	adde	$h2,$d2,$D1
	adde	$h3,$d3,$D2
	adde	$h4,$h4,$D3

	andc	$D0,$h4,$mask		# final reduction step
	and	$h4,$h4,$mask
	srwi	$D1,$D0,2
	add	$D0,$D0,$D1
	addc	$h0,$d0,$D0
	addze	$h1,$h1
	addze	$h2,$h2
	addze	$h3,$h3
	addze	$h4,$h4

	bdnz	Loop

___

# poly1305_blocks exit: write the hash words back to the context, restore
# r14..r31 and release the frame.
$code.=<<___;
	stw	$h0,0($ctx)		# store hash value
	stw	$h1,4($ctx)
	stw	$h2,8($ctx)
	stw	$h3,12($ctx)
	stw	$h4,16($ctx)

	$POP	r14,`$FRAME-$SIZE_T*18`($sp)
	$POP	r15,`$FRAME-$SIZE_T*17`($sp)
	$POP	r16,`$FRAME-$SIZE_T*16`($sp)
	$POP	r17,`$FRAME-$SIZE_T*15`($sp)
	$POP	r18,`$FRAME-$SIZE_T*14`($sp)
	$POP	r19,`$FRAME-$SIZE_T*13`($sp)
	$POP	r20,`$FRAME-$SIZE_T*12`($sp)
	$POP	r21,`$FRAME-$SIZE_T*11`($sp)
	$POP	r22,`$FRAME-$SIZE_T*10`($sp)
	$POP	r23,`$FRAME-$SIZE_T*9`($sp)
	$POP	r24,`$FRAME-$SIZE_T*8`($sp)
	$POP	r25,`$FRAME-$SIZE_T*7`($sp)
	$POP	r26,`$FRAME-$SIZE_T*6`($sp)
	$POP	r27,`$FRAME-$SIZE_T*5`($sp)
	$POP	r28,`$FRAME-$SIZE_T*4`($sp)
	$POP	r29,`$FRAME-$SIZE_T*3`($sp)
	$POP	r30,`$FRAME-$SIZE_T*2`($sp)
	$POP	r31,`$FRAME-$SIZE_T*1`($sp)
	addi	$sp,$sp,$FRAME
Labort:
	blr
	.long	0
	.byte	0,12,4,1,0x80,18,4,0
.size	.poly1305_blocks,.-.poly1305_blocks

___

# poly1305_emit: allocate a frame and save r28..r31 and the link
# register, load the hash, compute hash+5 with carry propagation, use the
# bit above the low two bits of the top word to select between the
# original and the incremented low four words, then add the nonce.  The
# nonce loads are interleaved with the selection.
$code.=<<___;
.globl	.poly1305_emit
.align	4
.poly1305_emit:
	$STU	$sp,-$FRAME($sp)
	mflr	r0
	$PUSH	r28,`$FRAME-$SIZE_T*4`($sp)
	$PUSH	r29,`$FRAME-$SIZE_T*3`($sp)
	$PUSH	r30,`$FRAME-$SIZE_T*2`($sp)
	$PUSH	r31,`$FRAME-$SIZE_T*1`($sp)
	$PUSH	r0,`$FRAME+$LRSAVE`($sp)

	lwz	$h0,0($ctx)		# load hash
	lwz	$h1,4($ctx)
	lwz	$h2,8($ctx)
	lwz	$h3,12($ctx)
	lwz	$h4,16($ctx)

	addic	$d0,$h0,5		# compare to modulus
	addze	$d1,$h1
	addze	$d2,$h2
	addze	$d3,$h3
	addze	$mask,$h4

	srwi	$mask,$mask,2		# did it carry/borrow?
	neg	$mask,$mask

	andc	$h0,$h0,$mask
	and	$d0,$d0,$mask
	andc	$h1,$h1,$mask
	and	$d1,$d1,$mask
	or	$h0,$h0,$d0
	lwz	$d0,0($nonce)		# load nonce
	andc	$h2,$h2,$mask
	and	$d2,$d2,$mask
	or	$h1,$h1,$d1
	lwz	$d1,4($nonce)
	andc	$h3,$h3,$mask
	and	$d3,$d3,$mask
	or	$h2,$h2,$d2
	lwz	$d2,8($nonce)
	or	$h3,$h3,$d3
	lwz	$d3,12($nonce)

	addc	$h0,$h0,$d0		# accumulate nonce
	adde	$h1,$h1,$d1
	adde	$h2,$h2,$d2
	adde	$h3,$h3,$d3
___
# Write the 16-byte result as four 32-bit words: plain word stores on
# little-endian targets, byte-reversed stores otherwise.
if ($LITTLE_ENDIAN) {
$code.=<<___;
	stw	$h0,0($mac)		# write result
	stw	$h1,4($mac)
	stw	$h2,8($mac)
	stw	$h3,12($mac)
___
} else {
$code.=<<___;
	li	$d1,4
	stwbrx	$h0,0,$mac		# write result
	li	$d2,8
	stwbrx	$h1,$d1,$mac
	li	$d3,12
	stwbrx	$h2,$d2,$mac
	stwbrx	$h3,$d3,$mac
___
}

# Restore r28..r31, release the frame and return.
$code.=<<___;
	$POP	r28,`$FRAME-$SIZE_T*4`($sp)
	$POP	r29,`$FRAME-$SIZE_T*3`($sp)
	$POP	r30,`$FRAME-$SIZE_T*2`($sp)
	$POP	r31,`$FRAME-$SIZE_T*1`($sp)
	addi	$sp,$sp,$FRAME
	blr
	.long	0
	.byte	0,12,4,1,0x80,4,3,0
.size	.poly1305_emit,.-.poly1305_emit
___
638							}
# Identification string appended after whichever implementation was
# selected above.
$code .= ".asciz\t\"Poly1305 for PPC, CRYPTOGAMS by <appro\@openssl.org>\"\n";

# Each `...` span in the generated text holds a Perl expression; replace
# the span with the expression's value.
$code =~ s/\`([^\`]*)\`/eval $1/gem;

# Emit the finished text and close STDOUT explicitly so that a failure is
# reported instead of being lost.
print $code;
close(STDOUT) || die "error closing STDOUT: $!";
646