xref: /freebsd/crypto/openssl/crypto/modes/asm/ghash-alpha.pl (revision e0c4386e7e71d93b0edc0c8fa156263fc4a8b0b6)
1*e0c4386eSCy Schubert#! /usr/bin/env perl
2*e0c4386eSCy Schubert# Copyright 2010-2020 The OpenSSL Project Authors. All Rights Reserved.
3*e0c4386eSCy Schubert#
4*e0c4386eSCy Schubert# Licensed under the Apache License 2.0 (the "License").  You may not use
5*e0c4386eSCy Schubert# this file except in compliance with the License.  You can obtain a copy
6*e0c4386eSCy Schubert# in the file LICENSE in the source distribution or at
7*e0c4386eSCy Schubert# https://www.openssl.org/source/license.html
8*e0c4386eSCy Schubert
9*e0c4386eSCy Schubert#
10*e0c4386eSCy Schubert# ====================================================================
11*e0c4386eSCy Schubert# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
12*e0c4386eSCy Schubert# project. The module is, however, dual licensed under OpenSSL and
13*e0c4386eSCy Schubert# CRYPTOGAMS licenses depending on where you obtain it. For further
14*e0c4386eSCy Schubert# details see http://www.openssl.org/~appro/cryptogams/.
15*e0c4386eSCy Schubert# ====================================================================
16*e0c4386eSCy Schubert#
17*e0c4386eSCy Schubert# March 2010
18*e0c4386eSCy Schubert#
19*e0c4386eSCy Schubert# The module implements "4-bit" GCM GHASH function and underlying
20*e0c4386eSCy Schubert# single multiplication operation in GF(2^128). "4-bit" means that it
21*e0c4386eSCy Schubert# uses 256 bytes per-key table [+128 bytes shared table]. Even though
22*e0c4386eSCy Schubert# loops are aggressively modulo-scheduled in respect to references to
23*e0c4386eSCy Schubert# Htbl and Z.hi updates for 8 cycles per byte, measured performance is
24*e0c4386eSCy Schubert# ~12 cycles per processed byte on 21264 CPU. It seems to be a dynamic
25*e0c4386eSCy Schubert# scheduling "glitch," because uprofile(1) indicates uniform sample
26*e0c4386eSCy Schubert# distribution, as if all instruction bundles execute in 1.5 cycles.
27*e0c4386eSCy Schubert# Meaning that it could have been even faster, yet 12 cycles is ~60%
28*e0c4386eSCy Schubert# better than gcc-generated code and ~80% than code generated by vendor
29*e0c4386eSCy Schubert# compiler.
30*e0c4386eSCy Schubert
# Symbolic names of the Alpha registers referenced by the generated code.
# A trailing "# $n" gives the hardware register number of the first name
# in that group.
($cnt,$t0,$t1,$t2)		= qw(v0 t0 t1 t2);	# $0
($Thi0,$Tlo0,$Thi1,$Tlo1)	= qw(t3 t4 t5 t6);	# $4
($rem)				= qw(t7);		# $8
#################
($Xi,$Htbl,$inp,$len)		= qw(a0 a1 a2 a3);	# $16, input argument block
($nlo,$nhi,$Zhi,$Zlo)		= qw(a4 a5 t8 t9);	# $20
($Xhi,$Xlo,$remp)		= qw(t10 t11 t12);	# $24
$rem_4bit			= "AT";			# $28
53*e0c4386eSCy Schubert
# loop() appends the body of one GF(2^128) multiplication to $code.
#
# The emitted code consumes the 128-bit value held in $Xhi:$Xlo one byte at
# a time, starting with byte 7 of $Xlo and ending with byte 0 of $Xhi.  Each
# byte is split into two 4-bit indices, each scaled by 16 and added to $Htbl
# to fetch a 16-byte table entry (two quadwords, at offsets 0 and 8).
# Between look-ups the accumulator $Zhi:$Zlo is shifted right by 4 bits; the
# 4 bits shifted out of $Zlo select an 8-byte entry from the table at
# $rem_4bit, which is xor-ed into $Zhi.  The product is left in $Zhi:$Zlo.
#
# The instruction order inside the heredoc is hand-scheduled (see the note
# at the top of the file); do not reorder it casually.
#
# $N counts invocations so that the .Looplo/.Loophi labels are unique in
# every emitted copy.
{ my $N;
  sub loop() {

	$N++;
# Everything up to the terminating ___ is emitted verbatim after variable
# interpolation, so no comments can be added inside without changing the
# generated file.
$code.=<<___;
.align	4
	extbl	$Xlo,7,$nlo
	and	$nlo,0xf0,$nhi
	sll	$nlo,4,$nlo
	and	$nlo,0xf0,$nlo

	addq	$nlo,$Htbl,$nlo
	ldq	$Zlo,8($nlo)
	addq	$nhi,$Htbl,$nhi
	ldq	$Zhi,0($nlo)

	and	$Zlo,0x0f,$remp
	sll	$Zhi,60,$t0
	lda	$cnt,6(zero)
	extbl	$Xlo,6,$nlo

	ldq	$Tlo1,8($nhi)
	s8addq	$remp,$rem_4bit,$remp
	ldq	$Thi1,0($nhi)
	srl	$Zlo,4,$Zlo

	ldq	$rem,0($remp)
	srl	$Zhi,4,$Zhi
	xor	$t0,$Zlo,$Zlo
	and	$nlo,0xf0,$nhi

	xor	$Tlo1,$Zlo,$Zlo
	sll	$nlo,4,$nlo
	xor	$Thi1,$Zhi,$Zhi
	and	$nlo,0xf0,$nlo

	addq	$nlo,$Htbl,$nlo
	ldq	$Tlo0,8($nlo)
	addq	$nhi,$Htbl,$nhi
	ldq	$Thi0,0($nlo)

.Looplo$N:
	and	$Zlo,0x0f,$remp
	sll	$Zhi,60,$t0
	subq	$cnt,1,$cnt
	srl	$Zlo,4,$Zlo

	ldq	$Tlo1,8($nhi)
	xor	$rem,$Zhi,$Zhi
	ldq	$Thi1,0($nhi)
	s8addq	$remp,$rem_4bit,$remp

	ldq	$rem,0($remp)
	srl	$Zhi,4,$Zhi
	xor	$t0,$Zlo,$Zlo
	extbl	$Xlo,$cnt,$nlo

	and	$nlo,0xf0,$nhi
	xor	$Thi0,$Zhi,$Zhi
	xor	$Tlo0,$Zlo,$Zlo
	sll	$nlo,4,$nlo


	and	$Zlo,0x0f,$remp
	sll	$Zhi,60,$t0
	and	$nlo,0xf0,$nlo
	srl	$Zlo,4,$Zlo

	s8addq	$remp,$rem_4bit,$remp
	xor	$rem,$Zhi,$Zhi
	addq	$nlo,$Htbl,$nlo
	addq	$nhi,$Htbl,$nhi

	ldq	$rem,0($remp)
	srl	$Zhi,4,$Zhi
	ldq	$Tlo0,8($nlo)
	xor	$t0,$Zlo,$Zlo

	xor	$Tlo1,$Zlo,$Zlo
	xor	$Thi1,$Zhi,$Zhi
	ldq	$Thi0,0($nlo)
	bne	$cnt,.Looplo$N


	and	$Zlo,0x0f,$remp
	sll	$Zhi,60,$t0
	lda	$cnt,7(zero)
	srl	$Zlo,4,$Zlo

	ldq	$Tlo1,8($nhi)
	xor	$rem,$Zhi,$Zhi
	ldq	$Thi1,0($nhi)
	s8addq	$remp,$rem_4bit,$remp

	ldq	$rem,0($remp)
	srl	$Zhi,4,$Zhi
	xor	$t0,$Zlo,$Zlo
	extbl	$Xhi,$cnt,$nlo

	and	$nlo,0xf0,$nhi
	xor	$Thi0,$Zhi,$Zhi
	xor	$Tlo0,$Zlo,$Zlo
	sll	$nlo,4,$nlo

	and	$Zlo,0x0f,$remp
	sll	$Zhi,60,$t0
	and	$nlo,0xf0,$nlo
	srl	$Zlo,4,$Zlo

	s8addq	$remp,$rem_4bit,$remp
	xor	$rem,$Zhi,$Zhi
	addq	$nlo,$Htbl,$nlo
	addq	$nhi,$Htbl,$nhi

	ldq	$rem,0($remp)
	srl	$Zhi,4,$Zhi
	ldq	$Tlo0,8($nlo)
	xor	$t0,$Zlo,$Zlo

	xor	$Tlo1,$Zlo,$Zlo
	xor	$Thi1,$Zhi,$Zhi
	ldq	$Thi0,0($nlo)
	unop


.Loophi$N:
	and	$Zlo,0x0f,$remp
	sll	$Zhi,60,$t0
	subq	$cnt,1,$cnt
	srl	$Zlo,4,$Zlo

	ldq	$Tlo1,8($nhi)
	xor	$rem,$Zhi,$Zhi
	ldq	$Thi1,0($nhi)
	s8addq	$remp,$rem_4bit,$remp

	ldq	$rem,0($remp)
	srl	$Zhi,4,$Zhi
	xor	$t0,$Zlo,$Zlo
	extbl	$Xhi,$cnt,$nlo

	and	$nlo,0xf0,$nhi
	xor	$Thi0,$Zhi,$Zhi
	xor	$Tlo0,$Zlo,$Zlo
	sll	$nlo,4,$nlo


	and	$Zlo,0x0f,$remp
	sll	$Zhi,60,$t0
	and	$nlo,0xf0,$nlo
	srl	$Zlo,4,$Zlo

	s8addq	$remp,$rem_4bit,$remp
	xor	$rem,$Zhi,$Zhi
	addq	$nlo,$Htbl,$nlo
	addq	$nhi,$Htbl,$nhi

	ldq	$rem,0($remp)
	srl	$Zhi,4,$Zhi
	ldq	$Tlo0,8($nlo)
	xor	$t0,$Zlo,$Zlo

	xor	$Tlo1,$Zlo,$Zlo
	xor	$Thi1,$Zhi,$Zhi
	ldq	$Thi0,0($nlo)
	bne	$cnt,.Loophi$N


	and	$Zlo,0x0f,$remp
	sll	$Zhi,60,$t0
	srl	$Zlo,4,$Zlo

	ldq	$Tlo1,8($nhi)
	xor	$rem,$Zhi,$Zhi
	ldq	$Thi1,0($nhi)
	s8addq	$remp,$rem_4bit,$remp

	ldq	$rem,0($remp)
	srl	$Zhi,4,$Zhi
	xor	$t0,$Zlo,$Zlo

	xor	$Tlo0,$Zlo,$Zlo
	xor	$Thi0,$Zhi,$Zhi

	and	$Zlo,0x0f,$remp
	sll	$Zhi,60,$t0
	srl	$Zlo,4,$Zlo

	s8addq	$remp,$rem_4bit,$remp
	xor	$rem,$Zhi,$Zhi

	ldq	$rem,0($remp)
	srl	$Zhi,4,$Zhi
	xor	$Tlo1,$Zlo,$Zlo
	xor	$Thi1,$Zhi,$Zhi
	xor	$t0,$Zlo,$Zlo
	xor	$rem,$Zhi,$Zhi
___
}}
253*e0c4386eSCy Schubert
# Start of the generated file.  Note the plain assignment: $code is
# (re)initialised here, and everything after this point appends to it.
#
# Emits the register-definition includes, the assembler directives and the
# entry of gcm_gmult_4bit.  The routine loads the two quadwords stored at
# $Xi into $Xhi/$Xlo and calls picmeup (return address in $t0) to point
# $rem_4bit at the reduction table.
$code=<<___;
#ifdef __linux__
#include <asm/regdef.h>
#else
#include <asm.h>
#include <regdef.h>
#endif

.text

.set	noat
.set	noreorder
.globl	gcm_gmult_4bit
.align	4
.ent	gcm_gmult_4bit
gcm_gmult_4bit:
	.frame	sp,0,ra
	.prologue 0

	ldq	$Xlo,8($Xi)
	ldq	$Xhi,0($Xi)

	bsr	$t0,picmeup
	nop
___

	# Multiplication body; leaves the product in $Zhi:$Zlo.
	&loop();
281*e0c4386eSCy Schubert
# Tail of gcm_gmult_4bit: reverse the byte order of $Zlo and of $Zhi, store
# the results back to $Xi and return.  The srl/sll by 24 and 8 combined
# with the zapnot byte masks reverse the bytes within each 32-bit half; the
# closing srl/sll by 32 swaps the two halves.  The two swaps are
# interleaved with each other.
$code.=<<___;
	srl	$Zlo,24,$t0	# byte swap
	srl	$Zlo,8,$t1

	sll	$Zlo,8,$t2
	sll	$Zlo,24,$Zlo
	zapnot	$t0,0x11,$t0
	zapnot	$t1,0x22,$t1

	zapnot	$Zlo,0x88,$Zlo
	or	$t0,$t1,$t0
	zapnot	$t2,0x44,$t2

	or	$Zlo,$t0,$Zlo
	srl	$Zhi,24,$t0
	srl	$Zhi,8,$t1

	or	$Zlo,$t2,$Zlo
	sll	$Zhi,8,$t2
	sll	$Zhi,24,$Zhi

	srl	$Zlo,32,$Xlo
	sll	$Zlo,32,$Zlo

	zapnot	$t0,0x11,$t0
	zapnot	$t1,0x22,$t1
	or	$Zlo,$Xlo,$Xlo

	zapnot	$Zhi,0x88,$Zhi
	or	$t0,$t1,$t0
	zapnot	$t2,0x44,$t2

	or	$Zhi,$t0,$Zhi
	or	$Zhi,$t2,$Zhi

	srl	$Zhi,32,$Xhi
	sll	$Zhi,32,$Zhi

	or	$Zhi,$Xhi,$Xhi
	stq	$Xlo,8($Xi)
	stq	$Xhi,0($Xi)

	ret	(ra)
.end	gcm_gmult_4bit
___
327*e0c4386eSCy Schubert
# Registers holding the current 16-byte input block in gcm_ghash_4bit.
# s0 and s1 are saved to the stack frame by the prologue emitted below.
$inhi="s0";
$inlo="s1";

# Entry of gcm_ghash_4bit and the head of its per-block loop.
#
# Prologue: allocate a 32-byte frame, save ra, s0 and s1, issue the
# unaligned loads (ldq_u) covering the first 16 bytes at $inp, load the
# current hash value from $Xi and call picmeup to set up $rem_4bit.
#
# .Louter: extql/extqh merge each ldq_u pair into one quadword, $inp is
# advanced by 16, $len is decremented by 16, and the block is xor-ed into
# $Xhi/$Xlo before the multiplication.
$code.=<<___;
.globl	gcm_ghash_4bit
.align	4
.ent	gcm_ghash_4bit
gcm_ghash_4bit:
	lda	sp,-32(sp)
	stq	ra,0(sp)
	stq	s0,8(sp)
	stq	s1,16(sp)
	.mask	0x04000600,-32
	.frame	sp,32,ra
	.prologue 0

	ldq_u	$inhi,0($inp)
	ldq_u	$Thi0,7($inp)
	ldq_u	$inlo,8($inp)
	ldq_u	$Tlo0,15($inp)
	ldq	$Xhi,0($Xi)
	ldq	$Xlo,8($Xi)

	bsr	$t0,picmeup
	nop

.Louter:
	extql	$inhi,$inp,$inhi
	extqh	$Thi0,$inp,$Thi0
	or	$inhi,$Thi0,$inhi
	lda	$inp,16($inp)

	extql	$inlo,$inp,$inlo
	extqh	$Tlo0,$inp,$Tlo0
	or	$inlo,$Tlo0,$inlo
	subq	$len,16,$len

	xor	$Xlo,$inlo,$Xlo
	xor	$Xhi,$inhi,$Xhi
___

	# Multiplication body; leaves the product in $Zhi:$Zlo.
	&loop();
370*e0c4386eSCy Schubert
# Remainder of gcm_ghash_4bit, the picmeup helper and the rem_4bit table.
#
# After each multiplication $Zlo/$Zhi are byte-reversed into $Xlo/$Xhi (same
# shift/zapnot sequence as in gcm_gmult_4bit).  Part-way through, "beq
# $len,.Ldone" leaves the loop once $len has reached zero; otherwise the
# ldq_u loads for the next block are interleaved with the rest of the swap
# and control returns to .Louter.  .Ldone completes the swap, stores the
# result to $Xi, reloads s0/s1, releases the frame and returns (the reload
# of ra is commented out in the emitted code).
#
# picmeup is entered with its return address in $t0.  "br $rem_4bit,.Lpic"
# leaves the address of .Lpic in $rem_4bit; adding 12 steps over the three
# 4-byte instructions that follow (lda, ret, nop) so that $rem_4bit points
# at the rem_4bit table.
#
# rem_4bit holds 16 entries of two .long words each (8 bytes per entry),
# which is how the s8addq-scaled index in loop() addresses it.
$code.=<<___;
	srl	$Zlo,24,$t0	# byte swap
	srl	$Zlo,8,$t1

	sll	$Zlo,8,$t2
	sll	$Zlo,24,$Zlo
	zapnot	$t0,0x11,$t0
	zapnot	$t1,0x22,$t1

	zapnot	$Zlo,0x88,$Zlo
	or	$t0,$t1,$t0
	zapnot	$t2,0x44,$t2

	or	$Zlo,$t0,$Zlo
	srl	$Zhi,24,$t0
	srl	$Zhi,8,$t1

	or	$Zlo,$t2,$Zlo
	sll	$Zhi,8,$t2
	sll	$Zhi,24,$Zhi

	srl	$Zlo,32,$Xlo
	sll	$Zlo,32,$Zlo
	beq	$len,.Ldone

	zapnot	$t0,0x11,$t0
	zapnot	$t1,0x22,$t1
	or	$Zlo,$Xlo,$Xlo
	ldq_u	$inhi,0($inp)

	zapnot	$Zhi,0x88,$Zhi
	or	$t0,$t1,$t0
	zapnot	$t2,0x44,$t2
	ldq_u	$Thi0,7($inp)

	or	$Zhi,$t0,$Zhi
	or	$Zhi,$t2,$Zhi
	ldq_u	$inlo,8($inp)
	ldq_u	$Tlo0,15($inp)

	srl	$Zhi,32,$Xhi
	sll	$Zhi,32,$Zhi

	or	$Zhi,$Xhi,$Xhi
	br	zero,.Louter

.Ldone:
	zapnot	$t0,0x11,$t0
	zapnot	$t1,0x22,$t1
	or	$Zlo,$Xlo,$Xlo

	zapnot	$Zhi,0x88,$Zhi
	or	$t0,$t1,$t0
	zapnot	$t2,0x44,$t2

	or	$Zhi,$t0,$Zhi
	or	$Zhi,$t2,$Zhi

	srl	$Zhi,32,$Xhi
	sll	$Zhi,32,$Zhi

	or	$Zhi,$Xhi,$Xhi

	stq	$Xlo,8($Xi)
	stq	$Xhi,0($Xi)

	.set	noreorder
	/*ldq	ra,0(sp)*/
	ldq	s0,8(sp)
	ldq	s1,16(sp)
	lda	sp,32(sp)
	ret	(ra)
.end	gcm_ghash_4bit

.align	4
.ent	picmeup
picmeup:
	.frame	sp,0,$t0
	.prologue 0
	br	$rem_4bit,.Lpic
.Lpic:	lda	$rem_4bit,12($rem_4bit)
	ret	($t0)
.end	picmeup
	nop
rem_4bit:
	.long	0,0x0000<<16, 0,0x1C20<<16, 0,0x3840<<16, 0,0x2460<<16
	.long	0,0x7080<<16, 0,0x6CA0<<16, 0,0x48C0<<16, 0,0x54E0<<16
	.long	0,0xE100<<16, 0,0xFD20<<16, 0,0xD940<<16, 0,0xC560<<16
	.long	0,0x9180<<16, 0,0x8DA0<<16, 0,0xA9C0<<16, 0,0xB5E0<<16
.ascii	"GHASH for Alpha, CRYPTOGAMS by <appro\@openssl.org>"
.align	4

___
# Write the generated assembly.  The last command-line argument, if any,
# names the output file; with no argument the code goes to the inherited
# STDOUT.
$output=pop;
if ($output) {
	# Three-argument open keeps the file name from being parsed for mode
	# characters, and a failed open is fatal instead of silently sending
	# the output to the original STDOUT.
	open STDOUT,">",$output or die "can't open $output: $!";
}
print $code;
# Buffered write errors only surface at close time, so check it.
close STDOUT or die "error closing STDOUT: $!";
467*e0c4386eSCy Schubert
468