xref: /linux/lib/crypto/powerpc/ghashp8-ppc.pl (revision 370c3883195566ee3e7d79e0146c3d735a406573)
1*73f315c1SEric Biggers#!/usr/bin/env perl
2*73f315c1SEric Biggers# SPDX-License-Identifier: GPL-2.0
3*73f315c1SEric Biggers
4*73f315c1SEric Biggers# This code is taken from the OpenSSL project but the author (Andy Polyakov)
5*73f315c1SEric Biggers# has relicensed it under the GPLv2. Therefore this program is free software;
6*73f315c1SEric Biggers# you can redistribute it and/or modify it under the terms of the GNU General
7*73f315c1SEric Biggers# Public License version 2 as published by the Free Software Foundation.
8*73f315c1SEric Biggers#
9*73f315c1SEric Biggers# The original headers, including the original license headers, are
10*73f315c1SEric Biggers# included below for completeness.
11*73f315c1SEric Biggers
12*73f315c1SEric Biggers# ====================================================================
13*73f315c1SEric Biggers# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
14*73f315c1SEric Biggers# project. The module is, however, dual licensed under OpenSSL and
15*73f315c1SEric Biggers# CRYPTOGAMS licenses depending on where you obtain it. For further
16*73f315c1SEric Biggers# details see https://www.openssl.org/~appro/cryptogams/.
17*73f315c1SEric Biggers# ====================================================================
18*73f315c1SEric Biggers#
19*73f315c1SEric Biggers# GHASH for PowerISA v2.07.
20*73f315c1SEric Biggers#
21*73f315c1SEric Biggers# July 2014
22*73f315c1SEric Biggers#
# Accurate performance measurements are problematic, because it's
# always a virtualized setup with a possibly throttled processor.
25*73f315c1SEric Biggers# Relative comparison is therefore more informative. This initial
26*73f315c1SEric Biggers# version is ~2.1x slower than hardware-assisted AES-128-CTR, ~12x
27*73f315c1SEric Biggers# faster than "4-bit" integer-only compiler-generated 64-bit code.
# "Initial version" means that there is room for further improvement.
29*73f315c1SEric Biggers
# Command line: <flavour> [<output>].
# <flavour> must contain "64" or "32"; both arguments are forwarded verbatim
# to ppc-xlate.pl, which receives the generated text on its standard input.
$flavour=shift;
$output =shift;

# Word-size dependent parameters derived from the flavour.
if ($flavour =~ /64/) {
	$SIZE_T=8;
	$LRSAVE=2*$SIZE_T;
	$STU="stdu";
	$POP="ld";
	$PUSH="std";
} elsif ($flavour =~ /32/) {
	$SIZE_T=4;
	$LRSAVE=$SIZE_T;
	$STU="stwu";
	$POP="lwz";
	$PUSH="stw";
} else { die "nonsense $flavour"; }

# Directory part of $0 including the trailing separator, or "" when the
# script was invoked without any directory component (previously this relied
# on $1 being unset after a failed match).
$dir = ($0 =~ m/(.*[\/\\])[^\/\\]+$/) ? $1 : "";

# Look for the translator next to this script first, then in the two
# alternative locations relative to it.
( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
( $xlate="${dir}../../../arch/powerpc/crypto/ppc-xlate.pl" and -f $xlate) or
die "can't locate ppc-xlate.pl";

# Everything printed from here on is piped through the translator.
# The low-precedence "or" is essential: with "||" the die() was attached to
# the (always true) command string, so a failed open() was silently ignored.
open STDOUT,"| $^X $xlate $flavour $output" or die "can't call $xlate: $!";
54*73f315c1SEric Biggers
# Register names interpolated into the assembly template.
# General-purpose registers r3..r6 form the argument block.
my ($Xip, $Htbl, $inp, $len) = map { "r$_" } 3 .. 6;	# argument block

# Vector registers v0..v3: the three partial products and the input block.
my ($Xl, $Xm, $Xh, $IN) = map { "v$_" } 0 .. 3;
# Vector registers v4..v12: zero, temporaries, the 0xc2 constant, the key
# pieces loaded from the table and the little-endian permute mask.
my ($zero, $t0, $t1, $t2, $xC2, $H, $Hh, $Hl, $lemask) = map { "v$_" } 4 .. 12;
# General-purpose register that holds the saved SPR 256 value.
my $vrsave = "r12";
60*73f315c1SEric Biggers
# Assembly template for the three routines emitted by this script:
#   .gcm_init_p8  - loads 16 bytes from the address in r4, derives the
#                   "twisted H" value and stores four 16-byte vectors
#                   ($xC2, $Hl, $H, $Hh) at r3+0x00, +0x10, +0x20, +0x30.
#   .gcm_gmult_p8 - loads Xi from $Xip and the pre-computed table from
#                   $Htbl, performs one multiplication with a two-phase
#                   reduction, and writes the result back to $Xip.
#   .gcm_ghash_p8 - same arithmetic, but XORs 16-byte blocks read from $inp
#                   into Xi first, consuming $len in steps of 16.
#                   NOTE(review): the first block is loaded unconditionally,
#                   so $len is presumably a non-zero multiple of 16 -
#                   confirm against the callers.
# Each routine saves SPR 256 in $vrsave, installs its own mask there and
# restores the saved value before returning.
# Perl interpolates the register-name variables into the text; "\@" keeps
# the '@' of the .asciz string literal.  Lines tagged "le?" are resolved
# by the post-processing loop that prints $code.
$code=<<___;
.machine	"any"

.text

.globl	.gcm_init_p8
	lis		r0,0xfff0
	li		r8,0x10
	mfspr		$vrsave,256
	li		r9,0x20
	mtspr		256,r0
	li		r10,0x30
	lvx_u		$H,0,r4			# load H
	le?xor		r7,r7,r7
	le?addi		r7,r7,0x8		# need a vperm start with 08
	le?lvsr		5,0,r7
	le?vspltisb	6,0x0f
	le?vxor		5,5,6			# set a b-endian mask
	le?vperm	$H,$H,$H,5

	vspltisb	$xC2,-16		# 0xf0
	vspltisb	$t0,1			# one
	vaddubm		$xC2,$xC2,$xC2		# 0xe0
	vxor		$zero,$zero,$zero
	vor		$xC2,$xC2,$t0		# 0xe1
	vsldoi		$xC2,$xC2,$zero,15	# 0xe1...
	vsldoi		$t1,$zero,$t0,1		# ...1
	vaddubm		$xC2,$xC2,$xC2		# 0xc2...
	vspltisb	$t2,7
	vor		$xC2,$xC2,$t1		# 0xc2....01
	vspltb		$t1,$H,0		# most significant byte
	vsl		$H,$H,$t0		# H<<=1
	vsrab		$t1,$t1,$t2		# broadcast carry bit
	vand		$t1,$t1,$xC2
	vxor		$H,$H,$t1		# twisted H

	vsldoi		$H,$H,$H,8		# twist even more ...
	vsldoi		$xC2,$zero,$xC2,8	# 0xc2.0
	vsldoi		$Hl,$zero,$H,8		# ... and split
	vsldoi		$Hh,$H,$zero,8

	stvx_u		$xC2,0,r3		# save pre-computed table
	stvx_u		$Hl,r8,r3
	stvx_u		$H, r9,r3
	stvx_u		$Hh,r10,r3

	mtspr		256,$vrsave
	blr
	.long		0
	.byte		0,12,0x14,0,0,0,2,0
	.long		0
.size	.gcm_init_p8,.-.gcm_init_p8

.globl	.gcm_gmult_p8
	lis		r0,0xfff8
	li		r8,0x10
	mfspr		$vrsave,256
	li		r9,0x20
	mtspr		256,r0
	li		r10,0x30
	lvx_u		$IN,0,$Xip		# load Xi

	lvx_u		$Hl,r8,$Htbl		# load pre-computed table
	 le?lvsl	$lemask,r0,r0
	lvx_u		$H, r9,$Htbl
	 le?vspltisb	$t0,0x07
	lvx_u		$Hh,r10,$Htbl
	 le?vxor	$lemask,$lemask,$t0
	lvx_u		$xC2,0,$Htbl
	 le?vperm	$IN,$IN,$IN,$lemask
	vxor		$zero,$zero,$zero

	vpmsumd		$Xl,$IN,$Hl		# H.lo·Xi.lo
	vpmsumd		$Xm,$IN,$H		# H.hi·Xi.lo+H.lo·Xi.hi
	vpmsumd		$Xh,$IN,$Hh		# H.hi·Xi.hi

	vpmsumd		$t2,$Xl,$xC2		# 1st phase

	vsldoi		$t0,$Xm,$zero,8
	vsldoi		$t1,$zero,$Xm,8
	vxor		$Xl,$Xl,$t0
	vxor		$Xh,$Xh,$t1

	vsldoi		$Xl,$Xl,$Xl,8
	vxor		$Xl,$Xl,$t2

	vsldoi		$t1,$Xl,$Xl,8		# 2nd phase
	vpmsumd		$Xl,$Xl,$xC2
	vxor		$t1,$t1,$Xh
	vxor		$Xl,$Xl,$t1

	le?vperm	$Xl,$Xl,$Xl,$lemask
	stvx_u		$Xl,0,$Xip		# write out Xi

	mtspr		256,$vrsave
	blr
	.long		0
	.byte		0,12,0x14,0,0,0,2,0
	.long		0
.size	.gcm_gmult_p8,.-.gcm_gmult_p8

.globl	.gcm_ghash_p8
	lis		r0,0xfff8
	li		r8,0x10
	mfspr		$vrsave,256
	li		r9,0x20
	mtspr		256,r0
	li		r10,0x30
	lvx_u		$Xl,0,$Xip		# load Xi

	lvx_u		$Hl,r8,$Htbl		# load pre-computed table
	 le?lvsl	$lemask,r0,r0
	lvx_u		$H, r9,$Htbl
	 le?vspltisb	$t0,0x07
	lvx_u		$Hh,r10,$Htbl
	 le?vxor	$lemask,$lemask,$t0
	lvx_u		$xC2,0,$Htbl
	 le?vperm	$Xl,$Xl,$Xl,$lemask
	vxor		$zero,$zero,$zero

	lvx_u		$IN,0,$inp
	addi		$inp,$inp,16
	subi		$len,$len,16
	 le?vperm	$IN,$IN,$IN,$lemask
	vxor		$IN,$IN,$Xl
	b		Loop

.align	5
Loop:
	 subic		$len,$len,16
	vpmsumd		$Xl,$IN,$Hl		# H.lo·Xi.lo
	 subfe.		r0,r0,r0		# borrow?-1:0
	vpmsumd		$Xm,$IN,$H		# H.hi·Xi.lo+H.lo·Xi.hi
	 and		r0,r0,$len
	vpmsumd		$Xh,$IN,$Hh		# H.hi·Xi.hi
	 add		$inp,$inp,r0

	vpmsumd		$t2,$Xl,$xC2		# 1st phase

	vsldoi		$t0,$Xm,$zero,8
	vsldoi		$t1,$zero,$Xm,8
	vxor		$Xl,$Xl,$t0
	vxor		$Xh,$Xh,$t1

	vsldoi		$Xl,$Xl,$Xl,8
	vxor		$Xl,$Xl,$t2
	 lvx_u		$IN,0,$inp
	 addi		$inp,$inp,16

	vsldoi		$t1,$Xl,$Xl,8		# 2nd phase
	vpmsumd		$Xl,$Xl,$xC2
	 le?vperm	$IN,$IN,$IN,$lemask
	vxor		$t1,$t1,$Xh
	vxor		$IN,$IN,$t1
	vxor		$IN,$IN,$Xl
	beq		Loop			# did $len-=16 borrow?

	vxor		$Xl,$Xl,$t1
	le?vperm	$Xl,$Xl,$Xl,$lemask
	stvx_u		$Xl,0,$Xip		# write out Xi

	mtspr		256,$vrsave
	blr
	.long		0
	.byte		0,12,0x14,0,0,0,4,0
	.long		0
.size	.gcm_ghash_p8,.-.gcm_ghash_p8

.asciz  "GHASH for PowerISA 2.07, CRYPTOGAMS by <appro\@openssl.org>"
.align  2
___
232*73f315c1SEric Biggers
# Resolve the endian-specific "le?" / "be?" tags and emit the template one
# line at a time.  For the selected endianness the tag is simply removed;
# the other tag is rewritten to "#le#" / "#be#".  At most one tag is
# rewritten per line, and only its first occurrence.
my $little_endian = ($flavour =~ /le$/);
my ($le_tag, $be_tag) = $little_endian ? ("", "#be#") : ("#le#", "");

for my $line (split("\n", $code)) {
	$line =~ s/le\?/$le_tag/ or $line =~ s/be\?/$be_tag/;
	print $line, "\n";
}
243*73f315c1SEric Biggers
# Closing the pipe flushes the buffered output and waits for the translator.
# close() on a piped handle returns false when the flush fails or when the
# child exits with a non-zero status (in which case $! is 0), so report
# either condition instead of silently producing a truncated output file.
close STDOUT
	or die($! ? "error closing pipe to $xlate: $!"
		  : "$xlate exited with status " . ($? >> 8));
245