xref: /linux/arch/powerpc/crypto/ghashp10-ppc.pl (revision 8e07e0e3964ca4e23ce7b68e2096fe660a888942)
1#!/usr/bin/env perl
2# SPDX-License-Identifier: GPL-2.0
3
4# This code is taken from the OpenSSL project but the author (Andy Polyakov)
5# has relicensed it under the GPLv2. Therefore this program is free software;
6# you can redistribute it and/or modify it under the terms of the GNU General
7# Public License version 2 as published by the Free Software Foundation.
8#
9# The original headers, including the original license headers, are
10# included below for completeness.
11
12# ====================================================================
13# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
14# project. The module is, however, dual licensed under OpenSSL and
15# CRYPTOGAMS licenses depending on where you obtain it. For further
16# details see https://www.openssl.org/~appro/cryptogams/.
17# ====================================================================
18#
19# GHASH for PowerISA v2.07.
20#
21# July 2014
22#
23# Accurate performance measurements are problematic, because it's
24# always virtualized setup with possibly throttled processor.
25# Relative comparison is therefore more informative. This initial
26# version is ~2.1x slower than hardware-assisted AES-128-CTR, ~12x
27# faster than "4-bit" integer-only compiler-generated 64-bit code.
28# "Initial version" means that there is room for further improvement.
29
# Command line: <flavour> [<output>].  Both values are forwarded unchanged to
# ppc-xlate.pl, which receives the generated assembly on its standard input.
30$flavour=shift;
31$output =shift;
32
# Pick the word size and the matching load/store mnemonics from the flavour
# string; anything that is neither a 64- nor a 32-bit flavour is rejected.
33if ($flavour =~ /64/) {
34	$SIZE_T=8;
35	$LRSAVE=2*$SIZE_T;
36	$STU="stdu";
37	$POP="ld";
38	$PUSH="std";
39} elsif ($flavour =~ /32/) {
40	$SIZE_T=4;
41	$LRSAVE=$SIZE_T;
42	$STU="stwu";
43	$POP="lwz";
44	$PUSH="stw";
45} else { die "nonsense $flavour"; }
46
# Look for ppc-xlate.pl next to this script first, then in ../../perlasm/.
47$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
48( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
49( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
50die "can't locate ppc-xlate.pl";
51
# Redirect STDOUT into the translator.  The low-precedence "or" is required
# here: "||" would bind to the (always true) command string, so a failed
# open would never reach the die.
52open STDOUT,"| $^X $xlate $flavour $output" or die "can't call $xlate: $!";
53
# Symbolic names for the registers used by the assembly template below.
54my ($Xip,$Htbl,$inp,$len)=map("r$_",(3..6));	# argument block
55
56my ($Xl,$Xm,$Xh,$IN)=map("v$_",(0..3));
57my ($zero,$t0,$t1,$t2,$xC2,$H,$Hh,$Hl,$lemask)=map("v$_",(4..12));
58my ($Xl1,$Xm1,$Xh1,$IN1,$H2,$H2h,$H2l)=map("v$_",(13..19));
59my $vrsave="r12";
# Extra temporaries that alias the registers of $Hl, $H and $Hh.
60my ($t4,$t5,$t6) = ($Hl,$H,$Hh);
61
62$code=<<___;
63.machine	"any"
64
65.text
66
#
# gcm_init_p10: build the basic GHASH key table.
#   r3 - destination table; four 16-byte vectors are stored at offsets
#        0x00 (the 0xc2 constant), 0x10 (Hl), 0x20 (H) and 0x30 (Hh)
#   r4 - address of the 16-byte hash key H
# SPR 256 is read into a scratch register on entry, overwritten for the
# duration of the routine and written back before returning.
# On little-endian builds the loaded key is byte-permuted before use.
#
67.globl	.gcm_init_p10
68	lis		r0,0xfff0
69	li		r8,0x10
70	mfspr		$vrsave,256
71	li		r9,0x20
72	mtspr		256,r0
73	li		r10,0x30
74	lvx_u		$H,0,r4			# load H
75	le?xor		r7,r7,r7
76	le?addi		r7,r7,0x8		# need a vperm start with 08
77	le?lvsr		5,0,r7
78	le?vspltisb	6,0x0f
79	le?vxor		5,5,6			# set a b-endian mask
80	le?vperm	$H,$H,$H,5
81
# Build the 0xc2...01 constant, then derive the "twisted" key: H shifted
# left by one bit, xor-ed with the constant when the top bit was set.
82	vspltisb	$xC2,-16		# 0xf0
83	vspltisb	$t0,1			# one
84	vaddubm		$xC2,$xC2,$xC2		# 0xe0
85	vxor		$zero,$zero,$zero
86	vor		$xC2,$xC2,$t0		# 0xe1
87	vsldoi		$xC2,$xC2,$zero,15	# 0xe1...
88	vsldoi		$t1,$zero,$t0,1		# ...1
89	vaddubm		$xC2,$xC2,$xC2		# 0xc2...
90	vspltisb	$t2,7
91	vor		$xC2,$xC2,$t1		# 0xc2....01
92	vspltb		$t1,$H,0		# most significant byte
93	vsl		$H,$H,$t0		# H<<=1
94	vsrab		$t1,$t1,$t2		# broadcast carry bit
95	vand		$t1,$t1,$xC2
96	vxor		$H,$H,$t1		# twisted H
97
# Swap the two 64-bit halves of H and split it into zero-padded halves.
98	vsldoi		$H,$H,$H,8		# twist even more ...
99	vsldoi		$xC2,$zero,$xC2,8	# 0xc2.0
100	vsldoi		$Hl,$zero,$H,8		# ... and split
101	vsldoi		$Hh,$H,$zero,8
102
103	stvx_u		$xC2,0,r3		# save pre-computed table
104	stvx_u		$Hl,r8,r3
105	stvx_u		$H, r9,r3
106	stvx_u		$Hh,r10,r3
107
108	mtspr		256,$vrsave
109	blr
110	.long		0
111	.byte		0,12,0x14,0,0,0,2,0
112	.long		0
113.size	.gcm_init_p10,.-.gcm_init_p10
114
#
# gcm_init_htable: build the extended GHASH table with powers of H.
#   r3 - destination table; 16-byte vectors are stored at offsets
#        0x00       the 0xc2 constant
#        0x10-0x30  Hl, H, Hh       (twisted H)
#        0x40-0x60  H2l, H2, H2h    (H^2)
#        0x70-0x90  low/mid/high of H^3
#        0xa0-0xc0  low/mid/high of H^4
#   r4 - address of the 16-byte hash key H
# Unlike gcm_init_p10, the loaded key is not byte-permuted on
# little-endian builds.
# NOTE(review): the value written to SPR 256 is the same as in
# gcm_init_p10 although this routine additionally uses v13-v19; if the
# mask is meant to list every vector register in use - TODO confirm.
#
115.globl	.gcm_init_htable
116	lis		r0,0xfff0
117	li		r8,0x10
118	mfspr		$vrsave,256
119	li		r9,0x20
120	mtspr		256,r0
121	li		r10,0x30
122	lvx_u		$H,0,r4			# load H
123
124	vspltisb	$xC2,-16		# 0xf0
125	vspltisb	$t0,1			# one
126	vaddubm		$xC2,$xC2,$xC2		# 0xe0
127	vxor		$zero,$zero,$zero
128	vor		$xC2,$xC2,$t0		# 0xe1
129	vsldoi		$xC2,$xC2,$zero,15	# 0xe1...
130	vsldoi		$t1,$zero,$t0,1		# ...1
131	vaddubm		$xC2,$xC2,$xC2		# 0xc2...
132	vspltisb	$t2,7
133	vor		$xC2,$xC2,$t1		# 0xc2....01
134	vspltb		$t1,$H,0		# most significant byte
135	vsl		$H,$H,$t0		# H<<=1
136	vsrab		$t1,$t1,$t2		# broadcast carry bit
137	vand		$t1,$t1,$xC2
138	vxor		$IN,$H,$t1		# twisted H
139
# The twisted key stays in the IN register; it is the multiplicand for
# every product computed below.
140	vsldoi		$H,$IN,$IN,8		# twist even more ...
141	vsldoi		$xC2,$zero,$xC2,8	# 0xc2.0
142	vsldoi		$Hl,$zero,$H,8		# ... and split
143	vsldoi		$Hh,$H,$zero,8
144
# Offset registers r8-r10 are re-loaded right after each store so they
# already point at the next table entry.
145	stvx_u		$xC2,0,r3		# save pre-computed table
146	stvx_u		$Hl,r8,r3
147	li		r8,0x40
148	stvx_u		$H, r9,r3
149	li		r9,0x50
150	stvx_u		$Hh,r10,r3
151	li		r10,0x60
152
# H^2 = H * H: three carry-less partial products followed by a two-phase
# reduction that uses the 0xc2 constant.
153	vpmsumd		$Xl,$IN,$Hl		# H.lo·H.lo
154	vpmsumd		$Xm,$IN,$H		# H.hi·H.lo+H.lo·H.hi
155	vpmsumd		$Xh,$IN,$Hh		# H.hi·H.hi
156
157	vpmsumd		$t2,$Xl,$xC2		# 1st reduction phase
158
159	vsldoi		$t0,$Xm,$zero,8
160	vsldoi		$t1,$zero,$Xm,8
161	vxor		$Xl,$Xl,$t0
162	vxor		$Xh,$Xh,$t1
163
164	vsldoi		$Xl,$Xl,$Xl,8
165	vxor		$Xl,$Xl,$t2
166
167	vsldoi		$t1,$Xl,$Xl,8		# 2nd reduction phase
168	vpmsumd		$Xl,$Xl,$xC2
169	vxor		$t1,$t1,$Xh
170	vxor		$IN1,$Xl,$t1
171
172	vsldoi		$H2,$IN1,$IN1,8
173	vsldoi		$H2l,$zero,$H2,8
174	vsldoi		$H2h,$H2,$zero,8
175
176	stvx_u		$H2l,r8,r3		# save H^2
177	li		r8,0x70
178	stvx_u		$H2,r9,r3
179	li		r9,0x80
180	stvx_u		$H2h,r10,r3
181	li		r10,0x90
182
# H^3 = H * H^2 and H^4 = H^2 * H^2 are computed side by side; the lines
# indented by one extra space belong to the H^4 stream.  The registers
# that held Hl, H and Hh are reused as temporaries for that stream.
183	vpmsumd		$Xl,$IN,$H2l		# H.lo·H^2.lo
184	 vpmsumd	$Xl1,$IN1,$H2l		# H^2.lo·H^2.lo
185	vpmsumd		$Xm,$IN,$H2		# H.hi·H^2.lo+H.lo·H^2.hi
186	 vpmsumd	$Xm1,$IN1,$H2		# H^2.hi·H^2.lo+H^2.lo·H^2.hi
187	vpmsumd		$Xh,$IN,$H2h		# H.hi·H^2.hi
188	 vpmsumd	$Xh1,$IN1,$H2h		# H^2.hi·H^2.hi
189
190	vpmsumd		$t2,$Xl,$xC2		# 1st reduction phase
191	 vpmsumd	$t6,$Xl1,$xC2		# 1st reduction phase
192
193	vsldoi		$t0,$Xm,$zero,8
194	vsldoi		$t1,$zero,$Xm,8
195	 vsldoi		$t4,$Xm1,$zero,8
196	 vsldoi		$t5,$zero,$Xm1,8
197	vxor		$Xl,$Xl,$t0
198	vxor		$Xh,$Xh,$t1
199	 vxor		$Xl1,$Xl1,$t4
200	 vxor		$Xh1,$Xh1,$t5
201
202	vsldoi		$Xl,$Xl,$Xl,8
203	 vsldoi		$Xl1,$Xl1,$Xl1,8
204	vxor		$Xl,$Xl,$t2
205	 vxor		$Xl1,$Xl1,$t6
206
207	vsldoi		$t1,$Xl,$Xl,8		# 2nd reduction phase
208	 vsldoi		$t5,$Xl1,$Xl1,8		# 2nd reduction phase
209	vpmsumd		$Xl,$Xl,$xC2
210	 vpmsumd	$Xl1,$Xl1,$xC2
211	vxor		$t1,$t1,$Xh
212	 vxor		$t5,$t5,$Xh1
213	vxor		$Xl,$Xl,$t1
214	 vxor		$Xl1,$Xl1,$t5
215
# Swap halves and split both results, overwriting the H and H^2 registers.
216	vsldoi		$H,$Xl,$Xl,8
217	 vsldoi		$H2,$Xl1,$Xl1,8
218	vsldoi		$Hl,$zero,$H,8
219	vsldoi		$Hh,$H,$zero,8
220	 vsldoi		$H2l,$zero,$H2,8
221	 vsldoi		$H2h,$H2,$zero,8
222
223	stvx_u		$Hl,r8,r3		# save H^3
224	li		r8,0xa0
225	stvx_u		$H,r9,r3
226	li		r9,0xb0
227	stvx_u		$Hh,r10,r3
228	li		r10,0xc0
229	 stvx_u		$H2l,r8,r3		# save H^4
230	 stvx_u		$H2,r9,r3
231	 stvx_u		$H2h,r10,r3
232
233	mtspr		256,$vrsave
234	blr
235	.long		0
236	.byte		0,12,0x14,0,0,0,2,0
237	.long		0
238.size	.gcm_init_htable,.-.gcm_init_htable
239
#
# gcm_gmult_p10: multiply the hash state by H in place.
#   r3 - address of the 16-byte state Xi (read, then overwritten)
#   r4 - key table; the vectors at offsets 0x00-0x30 are read
# On little-endian builds a byte-permute mask is built and applied to Xi
# after loading and again before storing.
#
239.globl	.gcm_gmult_p10
240	lis		r0,0xfff8
241	li		r8,0x10
242	mfspr		$vrsave,256
243	li		r9,0x20
244	mtspr		256,r0
245	li		r10,0x30
246	lvx_u		$IN,0,$Xip		# load Xi
247
# Table loads are interleaved with the construction of the permute mask
# (lines indented by one extra space).
248	lvx_u		$Hl,r8,$Htbl		# load pre-computed table
249	 le?lvsl	$lemask,r0,r0
250	lvx_u		$H, r9,$Htbl
251	 le?vspltisb	$t0,0x07
252	lvx_u		$Hh,r10,$Htbl
253	 le?vxor	$lemask,$lemask,$t0
254	lvx_u		$xC2,0,$Htbl
255	 le?vperm	$IN,$IN,$IN,$lemask
256	vxor		$zero,$zero,$zero
257
# Three carry-less partial products, then a two-phase reduction that uses
# the 0xc2 constant loaded from table offset 0.
258	vpmsumd		$Xl,$IN,$Hl		# H.lo·Xi.lo
259	vpmsumd		$Xm,$IN,$H		# H.hi·Xi.lo+H.lo·Xi.hi
260	vpmsumd		$Xh,$IN,$Hh		# H.hi·Xi.hi
261
262	vpmsumd		$t2,$Xl,$xC2		# 1st phase
263
264	vsldoi		$t0,$Xm,$zero,8
265	vsldoi		$t1,$zero,$Xm,8
266	vxor		$Xl,$Xl,$t0
267	vxor		$Xh,$Xh,$t1
268
269	vsldoi		$Xl,$Xl,$Xl,8
270	vxor		$Xl,$Xl,$t2
271
272	vsldoi		$t1,$Xl,$Xl,8		# 2nd phase
273	vpmsumd		$Xl,$Xl,$xC2
274	vxor		$t1,$t1,$Xh
275	vxor		$Xl,$Xl,$t1
276
277	le?vperm	$Xl,$Xl,$Xl,$lemask
278	stvx_u		$Xl,0,$Xip		# write out Xi
279
280	mtspr		256,$vrsave
281	blr
282	.long		0
283	.byte		0,12,0x14,0,0,0,2,0
284	.long		0
285.size	.gcm_gmult_p10,.-.gcm_gmult_p10
287
#
# gcm_ghash_p10: fold a run of 16-byte blocks into the hash state.
#   r3 - address of the 16-byte state Xi (read, then overwritten)
#   r4 - key table; the vectors at offsets 0x00-0x30 are read
#   r5 - input pointer
#   r6 - input length in bytes; the first block is loaded unconditionally
#        and each pass consumes 16 bytes, so this presumably has to be a
#        non-zero multiple of 16 - TODO confirm against callers
# For every block: Xi = (Xi xor block) * H, using the same multiply and
# two-phase reduction as gcm_gmult_p10.
#
287.globl	.gcm_ghash_p10
288	lis		r0,0xfff8
289	li		r8,0x10
290	mfspr		$vrsave,256
291	li		r9,0x20
292	mtspr		256,r0
293	li		r10,0x30
294	lvx_u		$Xl,0,$Xip		# load Xi
295
296	lvx_u		$Hl,r8,$Htbl		# load pre-computed table
297	 le?lvsl	$lemask,r0,r0
298	lvx_u		$H, r9,$Htbl
299	 le?vspltisb	$t0,0x07
300	lvx_u		$Hh,r10,$Htbl
301	 le?vxor	$lemask,$lemask,$t0
302	lvx_u		$xC2,0,$Htbl
303	 le?vperm	$Xl,$Xl,$Xl,$lemask
304	vxor		$zero,$zero,$zero
305
# Load the first block, account for it in the length and xor it into Xi.
306	lvx_u		$IN,0,$inp
307	addi		$inp,$inp,16
308	subi		$len,$len,16
309	 le?vperm	$IN,$IN,$IN,$lemask
310	vxor		$IN,$IN,$Xl
311	b		Loop
312
313.align	5
# Each pass multiplies the current block and, interleaved with that (lines
# indented by one extra space), fetches the next one.  When the length
# subtraction borrows, r0 becomes -1, the (now negative) length is added to
# the input pointer and the loop ends; this appears to keep the final,
# unused load from reading beyond the input - TODO confirm.
314Loop:
315	 subic		$len,$len,16
316	vpmsumd		$Xl,$IN,$Hl		# H.lo·Xi.lo
317	 subfe.		r0,r0,r0		# borrow?-1:0
318	vpmsumd		$Xm,$IN,$H		# H.hi·Xi.lo+H.lo·Xi.hi
319	 and		r0,r0,$len
320	vpmsumd		$Xh,$IN,$Hh		# H.hi·Xi.hi
321	 add		$inp,$inp,r0
322
323	vpmsumd		$t2,$Xl,$xC2		# 1st phase
324
325	vsldoi		$t0,$Xm,$zero,8
326	vsldoi		$t1,$zero,$Xm,8
327	vxor		$Xl,$Xl,$t0
328	vxor		$Xh,$Xh,$t1
329
330	vsldoi		$Xl,$Xl,$Xl,8
331	vxor		$Xl,$Xl,$t2
332	 lvx_u		$IN,0,$inp
333	 addi		$inp,$inp,16
334
# The reduced product is xor-ed straight into the freshly loaded block so
# the next pass can start multiplying immediately.
335	vsldoi		$t1,$Xl,$Xl,8		# 2nd phase
336	vpmsumd		$Xl,$Xl,$xC2
337	 le?vperm	$IN,$IN,$IN,$lemask
338	vxor		$t1,$t1,$Xh
339	vxor		$IN,$IN,$t1
340	vxor		$IN,$IN,$Xl
341	beq		Loop			# did $len-=16 borrow?
342
# Finish the reduction for the last block and store the new state.
343	vxor		$Xl,$Xl,$t1
344	le?vperm	$Xl,$Xl,$Xl,$lemask
345	stvx_u		$Xl,0,$Xip		# write out Xi
346
347	mtspr		256,$vrsave
348	blr
349	.long		0
350	.byte		0,12,0x14,0,0,0,4,0
351	.long		0
352.size	.gcm_ghash_p10,.-.gcm_ghash_p10
354
355.asciz  "GHASH for PowerISA 2.07, CRYPTOGAMS by <appro\@openssl.org>"
356.align  2
357___
358
# Print the template line by line, resolving the endian tags on the way:
# for a flavour ending in "le" the first "le?" tag is removed (the line
# becomes active), otherwise a "be?" tag is rewritten to "#be#"; for all
# other flavours the roles are swapped.  At most one substitution is
# applied per line.
359foreach (split("\n",$code)) {
360	if ($flavour =~ /le$/o) {	# little-endian
361	    s/le\?//o		or
362	    s/be\?/#be#/o;
363	} else {
364	    s/le\?/#le#/o	or
365	    s/be\?//o;
366	}
367	print $_,"\n";
368}
369
# STDOUT is a pipe to ppc-xlate.pl: closing it flushes the output and waits
# for the child, so a failure here means the generated file cannot be
# trusted and must not be silently ignored.
370close STDOUT or die "error closing STDOUT: $!"; # enforce flush
371