#!/usr/bin/env perl
# SPDX-License-Identifier: GPL-2.0

# This code is taken from the OpenSSL project but the author (Andy Polyakov)
# has relicensed it under the GPLv2. Therefore this program is free software;
# you can redistribute it and/or modify it under the terms of the GNU General
# Public License version 2 as published by the Free Software Foundation.
#
# The original headers, including the original license headers, are
# included below for completeness.

# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see https://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# GHASH for PowerISA v2.07.
#
# July 2014
#
# Accurate performance measurements are problematic, because it's
# always a virtualized setup with a possibly throttled processor.
# Relative comparison is therefore more informative. This initial
# version is ~2.1x slower than hardware-assisted AES-128-CTR, ~12x
# faster than "4-bit" integer-only compiler-generated 64-bit code.
# "Initial version" means that there is room for further improvement.

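# Background: GHASH is the authentication component of GCM.  It works in
# GF(2^128) with the reduction polynomial x^128 + x^7 + x^2 + x + 1, using
# the bit-reflected representation from the GCM specification.  For every
# 16-byte block the running digest is updated as Xi = (Xi xor block)·H,
# where H is the hash key.  The routines below implement the multiplication
# by H with the vpmsumd carry-less multiply instruction added in PowerISA
# v2.07.
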
$flavour=shift;
$output =shift;

if ($flavour =~ /64/) {
	$SIZE_T=8;
	$LRSAVE=2*$SIZE_T;
	$STU="stdu";
	$POP="ld";
	$PUSH="std";
} elsif ($flavour =~ /32/) {
	$SIZE_T=4;
	$LRSAVE=$SIZE_T;
	$STU="stwu";
	$POP="lwz";
	$PUSH="stw";
} else { die "nonsense $flavour"; }
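
# Usage sketch (the exact command line comes from the build system and the
# flavour strings are whatever the local ppc-xlate.pl accepts), e.g.:
#
#	perl ghashp10-ppc.pl linux-ppc64le ghashp10-ppc.S
#
# A flavour matching /64/ selects 64-bit code, /32/ selects 32-bit code, and
# a trailing "le" selects the little-endian instruction variants below.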

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
die "can't locate ppc-xlate.pl";

open STDOUT,"| $^X $xlate $flavour $output" or die "can't call $xlate: $!";

my ($Xip,$Htbl,$inp,$len)=map("r$_",(3..6));	# argument block
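
# Informal view of how these registers map onto the entry points below
# (see the C-side declarations for the authoritative prototypes):
#
#	gcm_init_p10(Htable, H)              r3=Htable, r4=H
#	gcm_init_htable(Htable, H)           r3=Htable, r4=H
#	gcm_gmult_p10(Xi, Htable)            r3=Xi, r4=Htable
#	gcm_ghash_p10(Xi, Htable, inp, len)  r3=Xi, r4=Htable, r5=inp, r6=len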

my ($Xl,$Xm,$Xh,$IN)=map("v$_",(0..3));
my ($zero,$t0,$t1,$t2,$xC2,$H,$Hh,$Hl,$lemask)=map("v$_",(4..12));
my ($Xl1,$Xm1,$Xh1,$IN1,$H2,$H2h,$H2l)=map("v$_",(13..19));
my $vrsave="r12";
my ($t4,$t5,$t6) = ($Hl,$H,$Hh);

$code=<<___;
.machine	"any"

.text

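# gcm_init_p10(Htable, H): convert the raw hash key H into the "twisted"
# form used by the vpmsumd code below and store it at Htable, split into
# low/high doubleword halves, together with the 0xc2... reduction constant.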
.globl	.gcm_init_p10
	lis		r0,0xfff0
	li		r8,0x10
	mfspr		$vrsave,256
	li		r9,0x20
	mtspr		256,r0
	li		r10,0x30
	lvx_u		$H,0,r4			# load H
	le?xor		r7,r7,r7
	le?addi		r7,r7,0x8		# need a vperm starting at 0x08
	le?lvsr		5,0,r7
	le?vspltisb	6,0x0f
	le?vxor		5,5,6			# build a big-endian permute mask
	le?vperm	$H,$H,$H,5

	vspltisb	$xC2,-16		# 0xf0
	vspltisb	$t0,1			# one
	vaddubm		$xC2,$xC2,$xC2		# 0xe0
	vxor		$zero,$zero,$zero
	vor		$xC2,$xC2,$t0		# 0xe1
	vsldoi		$xC2,$xC2,$zero,15	# 0xe1...
	vsldoi		$t1,$zero,$t0,1		# ...1
	vaddubm		$xC2,$xC2,$xC2		# 0xc2...
	vspltisb	$t2,7
	vor		$xC2,$xC2,$t1		# 0xc2....01
	vspltb		$t1,$H,0		# most significant byte
	vsl		$H,$H,$t0		# H<<=1
	vsrab		$t1,$t1,$t2		# broadcast carry bit
	vand		$t1,$t1,$xC2
	vxor		$H,$H,$t1		# twisted H
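	# The five instructions above multiply H by x in GF(2^128): H is
	# shifted left one bit and, when the bit shifted out is set, the
	# 0xc2....01 field constant is xored back in.  This "twisted" key
	# compensates for the one-bit offset that carry-less multiplication
	# of bit-reflected operands leaves, which is what allows the
	# reductions below to use plain vpmsumd multiplications by the same
	# 0xc2... constant.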

	vsldoi		$H,$H,$H,8		# twist even more ...
	vsldoi		$xC2,$zero,$xC2,8	# 0xc2.0
	vsldoi		$Hl,$zero,$H,8		# ... and split
	vsldoi		$Hh,$H,$zero,8

	stvx_u		$xC2,0,r3		# save pre-computed table
	stvx_u		$Hl,r8,r3
	stvx_u		$H, r9,r3
	stvx_u		$Hh,r10,r3

	mtspr		256,$vrsave
	blr
	.long		0
	.byte		0,12,0x14,0,0,0,2,0
	.long		0
.size	.gcm_init_p10,.-.gcm_init_p10

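# gcm_init_htable(Htable, H): like gcm_init_p10, but additionally computes
# H^2, H^3 and H^4 (all in twisted form) and stores them as well, for
# callers that aggregate several blocks per reduction, such as a stitched
# AES/GCM implementation.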
.globl	.gcm_init_htable
	lis		r0,0xfff0
	li		r8,0x10
	mfspr		$vrsave,256
	li		r9,0x20
	mtspr		256,r0
	li		r10,0x30
	lvx_u		$H,0,r4			# load H

	vspltisb	$xC2,-16		# 0xf0
	vspltisb	$t0,1			# one
	vaddubm		$xC2,$xC2,$xC2		# 0xe0
	vxor		$zero,$zero,$zero
	vor		$xC2,$xC2,$t0		# 0xe1
	vsldoi		$xC2,$xC2,$zero,15	# 0xe1...
	vsldoi		$t1,$zero,$t0,1		# ...1
	vaddubm		$xC2,$xC2,$xC2		# 0xc2...
	vspltisb	$t2,7
	vor		$xC2,$xC2,$t1		# 0xc2....01
	vspltb		$t1,$H,0		# most significant byte
	vsl		$H,$H,$t0		# H<<=1
	vsrab		$t1,$t1,$t2		# broadcast carry bit
	vand		$t1,$t1,$xC2
	vxor		$IN,$H,$t1		# twisted H

	vsldoi		$H,$IN,$IN,8		# twist even more ...
	vsldoi		$xC2,$zero,$xC2,8	# 0xc2.0
	vsldoi		$Hl,$zero,$H,8		# ... and split
	vsldoi		$Hh,$H,$zero,8

	stvx_u		$xC2,0,r3		# save pre-computed table
	stvx_u		$Hl,r8,r3
	li		r8,0x40
	stvx_u		$H, r9,r3
	li		r9,0x50
	stvx_u		$Hh,r10,r3
	li		r10,0x60

	vpmsumd		$Xl,$IN,$Hl		# H.lo·H.lo
	vpmsumd		$Xm,$IN,$H		# H.hi·H.lo+H.lo·H.hi
	vpmsumd		$Xh,$IN,$Hh		# H.hi·H.hi

	vpmsumd		$t2,$Xl,$xC2		# 1st reduction phase

	vsldoi		$t0,$Xm,$zero,8
	vsldoi		$t1,$zero,$Xm,8
	vxor		$Xl,$Xl,$t0
	vxor		$Xh,$Xh,$t1

	vsldoi		$Xl,$Xl,$Xl,8
	vxor		$Xl,$Xl,$t2

	vsldoi		$t1,$Xl,$Xl,8		# 2nd reduction phase
	vpmsumd		$Xl,$Xl,$xC2
	vxor		$t1,$t1,$Xh
	vxor		$IN1,$Xl,$t1
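	# IN1 now holds H^2 in twisted form: the three vpmsumd products above
	# are the 256-bit carry-less square of H, and the two reduction phases
	# (each a vpmsumd by the 0xc2... constant plus shifts and xors) fold it
	# back to 128 bits modulo the field polynomial.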

	vsldoi		$H2,$IN1,$IN1,8
	vsldoi		$H2l,$zero,$H2,8
	vsldoi		$H2h,$H2,$zero,8

	stvx_u		$H2l,r8,r3		# save H^2
	li		r8,0x70
	stvx_u		$H2,r9,r3
	li		r9,0x80
	stvx_u		$H2h,r10,r3
	li		r10,0x90

	vpmsumd		$Xl,$IN,$H2l		# H.lo·H^2.lo
	 vpmsumd	$Xl1,$IN1,$H2l		# H^2.lo·H^2.lo
	vpmsumd		$Xm,$IN,$H2		# H.hi·H^2.lo+H.lo·H^2.hi
	 vpmsumd	$Xm1,$IN1,$H2		# H^2.hi·H^2.lo+H^2.lo·H^2.hi
	vpmsumd		$Xh,$IN,$H2h		# H.hi·H^2.hi
	 vpmsumd	$Xh1,$IN1,$H2h		# H^2.hi·H^2.hi

	vpmsumd		$t2,$Xl,$xC2		# 1st reduction phase
	 vpmsumd	$t6,$Xl1,$xC2		# 1st reduction phase

	vsldoi		$t0,$Xm,$zero,8
	vsldoi		$t1,$zero,$Xm,8
	 vsldoi		$t4,$Xm1,$zero,8
	 vsldoi		$t5,$zero,$Xm1,8
	vxor		$Xl,$Xl,$t0
	vxor		$Xh,$Xh,$t1
	 vxor		$Xl1,$Xl1,$t4
	 vxor		$Xh1,$Xh1,$t5

	vsldoi		$Xl,$Xl,$Xl,8
	 vsldoi		$Xl1,$Xl1,$Xl1,8
	vxor		$Xl,$Xl,$t2
	 vxor		$Xl1,$Xl1,$t6

	vsldoi		$t1,$Xl,$Xl,8		# 2nd reduction phase
	 vsldoi		$t5,$Xl1,$Xl1,8		# 2nd reduction phase
	vpmsumd		$Xl,$Xl,$xC2
	 vpmsumd	$Xl1,$Xl1,$xC2
	vxor		$t1,$t1,$Xh
	 vxor		$t5,$t5,$Xh1
	vxor		$Xl,$Xl,$t1
	 vxor		$Xl1,$Xl1,$t5

	vsldoi		$H,$Xl,$Xl,8
	 vsldoi		$H2,$Xl1,$Xl1,8
	vsldoi		$Hl,$zero,$H,8
	vsldoi		$Hh,$H,$zero,8
	 vsldoi		$H2l,$zero,$H2,8
	 vsldoi		$H2h,$H2,$zero,8

	stvx_u		$Hl,r8,r3		# save H^3
	li		r8,0xa0
	stvx_u		$H,r9,r3
	li		r9,0xb0
	stvx_u		$Hh,r10,r3
	li		r10,0xc0
	 stvx_u		$H2l,r8,r3		# save H^4
	 stvx_u		$H2,r9,r3
	 stvx_u		$H2h,r10,r3
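
	# Resulting Htable layout (16-byte slots, byte offsets), all powers in
	# the same twisted representation as H:
	#	0x00 0xc2... constant
	#	0x10 H.lo    0x20 H      0x30 H.hi
	#	0x40 H^2.lo  0x50 H^2    0x60 H^2.hi
	#	0x70 H^3.lo  0x80 H^3    0x90 H^3.hi
	#	0xa0 H^4.lo  0xb0 H^4    0xc0 H^4.hi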

	mtspr		256,$vrsave
	blr
	.long		0
	.byte		0,12,0x14,0,0,0,2,0
	.long		0
.size	.gcm_init_htable,.-.gcm_init_htable

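# gcm_gmult_p10(Xi, Htable): one GHASH step with no data xored in, i.e.
# Xi = Xi·H, using only the H entries of the table (offsets 0x00-0x30);
# the result is byte-swapped back as needed and written to Xi.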
.globl	.gcm_gmult_p10
	lis		r0,0xfff8
	li		r8,0x10
	mfspr		$vrsave,256
	li		r9,0x20
	mtspr		256,r0
	li		r10,0x30
	lvx_u		$IN,0,$Xip		# load Xi

	lvx_u		$Hl,r8,$Htbl		# load pre-computed table
	 le?lvsl	$lemask,r0,r0
	lvx_u		$H, r9,$Htbl
	 le?vspltisb	$t0,0x07
	lvx_u		$Hh,r10,$Htbl
	 le?vxor	$lemask,$lemask,$t0
	lvx_u		$xC2,0,$Htbl
	 le?vperm	$IN,$IN,$IN,$lemask
	vxor		$zero,$zero,$zero

	vpmsumd		$Xl,$IN,$Hl		# H.lo·Xi.lo
	vpmsumd		$Xm,$IN,$H		# H.hi·Xi.lo+H.lo·Xi.hi
	vpmsumd		$Xh,$IN,$Hh		# H.hi·Xi.hi

	vpmsumd		$t2,$Xl,$xC2		# 1st phase

	vsldoi		$t0,$Xm,$zero,8
	vsldoi		$t1,$zero,$Xm,8
	vxor		$Xl,$Xl,$t0
	vxor		$Xh,$Xh,$t1

	vsldoi		$Xl,$Xl,$Xl,8
	vxor		$Xl,$Xl,$t2

	vsldoi		$t1,$Xl,$Xl,8		# 2nd phase
	vpmsumd		$Xl,$Xl,$xC2
	vxor		$t1,$t1,$Xh
	vxor		$Xl,$Xl,$t1

	le?vperm	$Xl,$Xl,$Xl,$lemask
	stvx_u		$Xl,0,$Xip		# write out Xi

	mtspr		256,$vrsave
	blr
	.long		0
	.byte		0,12,0x14,0,0,0,2,0
	.long		0
.size	.gcm_gmult_p10,.-.gcm_gmult_p10

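# gcm_ghash_p10(Xi, Htable, inp, len): absorb len bytes (expected to be a
# whole number of 16-byte blocks) into the digest, computing
# Xi = (Xi xor block)·H for each block; only the H entries of the table
# are used, and one block is processed per loop iteration.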
.globl	.gcm_ghash_p10
	lis		r0,0xfff8
	li		r8,0x10
	mfspr		$vrsave,256
	li		r9,0x20
	mtspr		256,r0
	li		r10,0x30
	lvx_u		$Xl,0,$Xip		# load Xi

	lvx_u		$Hl,r8,$Htbl		# load pre-computed table
	 le?lvsl	$lemask,r0,r0
	lvx_u		$H, r9,$Htbl
	 le?vspltisb	$t0,0x07
	lvx_u		$Hh,r10,$Htbl
	 le?vxor	$lemask,$lemask,$t0
	lvx_u		$xC2,0,$Htbl
	 le?vperm	$Xl,$Xl,$Xl,$lemask
	vxor		$zero,$zero,$zero

	lvx_u		$IN,0,$inp
	addi		$inp,$inp,16
	subi		$len,$len,16
	 le?vperm	$IN,$IN,$IN,$lemask
	vxor		$IN,$IN,$Xl
	b		Loop

.align	5
Loop:
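	# Scalar loop control, interleaved with the vector code to hide
	# latency: subic decrements the remaining length and subfe. records
	# the borrow as r0 = -1 (last block) or 0 (more to come).  On the
	# last block the and/add pair adds the now-negative length to inp,
	# pulling the pointer back so that the unconditional lvx_u below
	# still reads inside the input buffer; the beq at the bottom of the
	# loop re-tests the same CR0 result.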
	 subic		$len,$len,16
	vpmsumd		$Xl,$IN,$Hl		# H.lo·Xi.lo
	 subfe.		r0,r0,r0		# borrow?-1:0
	vpmsumd		$Xm,$IN,$H		# H.hi·Xi.lo+H.lo·Xi.hi
	 and		r0,r0,$len
	vpmsumd		$Xh,$IN,$Hh		# H.hi·Xi.hi
	 add		$inp,$inp,r0

	vpmsumd		$t2,$Xl,$xC2		# 1st phase

	vsldoi		$t0,$Xm,$zero,8
	vsldoi		$t1,$zero,$Xm,8
	vxor		$Xl,$Xl,$t0
	vxor		$Xh,$Xh,$t1

	vsldoi		$Xl,$Xl,$Xl,8
	vxor		$Xl,$Xl,$t2
	 lvx_u		$IN,0,$inp
	 addi		$inp,$inp,16

	vsldoi		$t1,$Xl,$Xl,8		# 2nd phase
	vpmsumd		$Xl,$Xl,$xC2
	 le?vperm	$IN,$IN,$IN,$lemask
	vxor		$t1,$t1,$Xh
	vxor		$IN,$IN,$t1
	vxor		$IN,$IN,$Xl
	beq		Loop			# did $len-=16 borrow?

	vxor		$Xl,$Xl,$t1
	le?vperm	$Xl,$Xl,$Xl,$lemask
	stvx_u		$Xl,0,$Xip		# write out Xi

	mtspr		256,$vrsave
	blr
	.long		0
	.byte		0,12,0x14,0,0,0,4,0
	.long		0
.size	.gcm_ghash_p10,.-.gcm_ghash_p10

.asciz  "GHASH for PowerISA 2.07, CRYPTOGAMS by <appro\@openssl.org>"
.align  2
___

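# Post-process the generated source: for little-endian flavours the "le?"
# prefix is stripped and "be?" lines are commented out, and vice versa for
# big-endian, before each line is piped through ppc-xlate.pl via the
# STDOUT filehandle opened above.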
foreach (split("\n",$code)) {
	if ($flavour =~ /le$/o) {	# little-endian
	    s/le\?//o		or
	    s/be\?/#be#/o;
	} else {
	    s/le\?/#le#/o	or
	    s/be\?//o;
	}
	print $_,"\n";
}

close STDOUT; # enforce flush