/* SPDX-License-Identifier: GPL-2.0-or-later */
 #
 # Accelerated AES-GCM stitched implementation for ppc64le.
 #
 # Copyright 2022- IBM Inc. All rights reserved
 #
 #===================================================================================
 # Written by Danny Tsen <dtsen@linux.ibm.com>
 #
 # GHASH is based on the Karatsuba multiplication method.
 #
 #    Xi xor X1
 #
 #    X1 * H^4 + X2 * H^3 + X3 * H^2 + X4 * H =
 #      (X1.h * H4.h + X1.l * H4.l + X1 * H4) +
 #      (X2.h * H3.h + X2.l * H3.l + X2 * H3) +
 #      (X3.h * H2.h + X3.l * H2.l + X3 * H2) +
 #      (X4.h * H.h + X4.l * H.l + X4 * H)
 #
 # Xi = v0
 # H Poly = v2
 # Hash keys = v3 - v14
 #     ( H.l, H, H.h)
 #     ( H^2.l, H^2, H^2.h)
 #     ( H^3.l, H^3, H^3.h)
 #     ( H^4.l, H^4, H^4.h)
 #
 # v30 is IV
 # v31 - counter 1
 #
 # AES used,
 #     vs0 - vs14 for round keys
 #     v15, v16, v17, v18, v19, v20, v21, v22 for 8 blocks (encrypted)
 #
 # This implementation uses a stitched AES-GCM approach to improve overall
 # performance: AES is computed eight blocks at a time, and GHASH works on two
 # groups of four blocks.
 #
 # ===================================================================================
 #
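 # Reduction note (editor's sketch, not in the original): each vpmsumd below
 # produces carry-less partial products; the code gathers them into three
 # 128-bit sums - L (low), M (middle) and H (high) - folds M into L and H with
 # vsldoi, then multiplies twice by the pre-computed "H Poly" constant in v2 to
 # reduce the result modulo the GHASH field polynomial
 # (g(x) = x^128 + x^7 + x^2 + x + 1, in bit-reflected form).
 #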

#include <asm/ppc_asm.h>
#include <linux/linkage.h>

.machine        "any"
.text

 # 4x loops
 # v15 - v18 - input states
 # vs1 - vs9 - round keys
 #
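 # (Editor's note: vcipher only takes VR operands, so each xxlor below copies a
 # round key from its VSR slot into VSRs 51-54, which alias v19-v22; "19+32" is
 # simply the VSX register number of v19.)
 #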
.macro Loop_aes_middle4x
	xxlor	19+32, 1, 1
	xxlor	20+32, 2, 2
	xxlor	21+32, 3, 3
	xxlor	22+32, 4, 4

	vcipher	15, 15, 19
	vcipher	16, 16, 19
	vcipher	17, 17, 19
	vcipher	18, 18, 19

	vcipher	15, 15, 20
	vcipher	16, 16, 20
	vcipher	17, 17, 20
	vcipher	18, 18, 20

	vcipher	15, 15, 21
	vcipher	16, 16, 21
	vcipher	17, 17, 21
	vcipher	18, 18, 21

	vcipher	15, 15, 22
	vcipher	16, 16, 22
	vcipher	17, 17, 22
	vcipher	18, 18, 22

	xxlor	19+32, 5, 5
	xxlor	20+32, 6, 6
	xxlor	21+32, 7, 7
	xxlor	22+32, 8, 8

	vcipher	15, 15, 19
	vcipher	16, 16, 19
	vcipher	17, 17, 19
	vcipher	18, 18, 19

	vcipher	15, 15, 20
	vcipher	16, 16, 20
	vcipher	17, 17, 20
	vcipher	18, 18, 20

	vcipher	15, 15, 21
	vcipher	16, 16, 21
	vcipher	17, 17, 21
	vcipher	18, 18, 21

	vcipher	15, 15, 22
	vcipher	16, 16, 22
	vcipher	17, 17, 22
	vcipher	18, 18, 22

	xxlor	23+32, 9, 9
	vcipher	15, 15, 23
	vcipher	16, 16, 23
	vcipher	17, 17, 23
	vcipher	18, 18, 23
.endm

 # 8x loops
 # v15 - v22 - input states
 # vs1 - vs9 - round keys
 #
.macro Loop_aes_middle8x
	xxlor	23+32, 1, 1
	xxlor	24+32, 2, 2
	xxlor	25+32, 3, 3
	xxlor	26+32, 4, 4

	vcipher	15, 15, 23
	vcipher	16, 16, 23
	vcipher	17, 17, 23
	vcipher	18, 18, 23
	vcipher	19, 19, 23
	vcipher	20, 20, 23
	vcipher	21, 21, 23
	vcipher	22, 22, 23

	vcipher	15, 15, 24
	vcipher	16, 16, 24
	vcipher	17, 17, 24
	vcipher	18, 18, 24
	vcipher	19, 19, 24
	vcipher	20, 20, 24
	vcipher	21, 21, 24
	vcipher	22, 22, 24

	vcipher	15, 15, 25
	vcipher	16, 16, 25
	vcipher	17, 17, 25
	vcipher	18, 18, 25
	vcipher	19, 19, 25
	vcipher	20, 20, 25
	vcipher	21, 21, 25
	vcipher	22, 22, 25

	vcipher	15, 15, 26
	vcipher	16, 16, 26
	vcipher	17, 17, 26
	vcipher	18, 18, 26
	vcipher	19, 19, 26
	vcipher	20, 20, 26
	vcipher	21, 21, 26
	vcipher	22, 22, 26

	xxlor	23+32, 5, 5
	xxlor	24+32, 6, 6
	xxlor	25+32, 7, 7
	xxlor	26+32, 8, 8

	vcipher	15, 15, 23
	vcipher	16, 16, 23
	vcipher	17, 17, 23
	vcipher	18, 18, 23
	vcipher	19, 19, 23
	vcipher	20, 20, 23
	vcipher	21, 21, 23
	vcipher	22, 22, 23

	vcipher	15, 15, 24
	vcipher	16, 16, 24
	vcipher	17, 17, 24
	vcipher	18, 18, 24
	vcipher	19, 19, 24
	vcipher	20, 20, 24
	vcipher	21, 21, 24
	vcipher	22, 22, 24

	vcipher	15, 15, 25
	vcipher	16, 16, 25
	vcipher	17, 17, 25
	vcipher	18, 18, 25
	vcipher	19, 19, 25
	vcipher	20, 20, 25
	vcipher	21, 21, 25
	vcipher	22, 22, 25

	vcipher	15, 15, 26
	vcipher	16, 16, 26
	vcipher	17, 17, 26
	vcipher	18, 18, 26
	vcipher	19, 19, 26
	vcipher	20, 20, 26
	vcipher	21, 21, 26
	vcipher	22, 22, 26

	xxlor	23+32, 9, 9
	vcipher	15, 15, 23
	vcipher	16, 16, 23
	vcipher	17, 17, 23
	vcipher	18, 18, 23
	vcipher	19, 19, 23
	vcipher	20, 20, 23
	vcipher	21, 21, 23
	vcipher	22, 22, 23
.endm

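 # Single-block variant: same round pattern as the 4x/8x macros above, applied
 # to one state vector (v15).
 #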
.macro Loop_aes_middle_1x
	xxlor	19+32, 1, 1
	xxlor	20+32, 2, 2
	xxlor	21+32, 3, 3
	xxlor	22+32, 4, 4

	vcipher	15, 15, 19
	vcipher	15, 15, 20
	vcipher	15, 15, 21
	vcipher	15, 15, 22

	xxlor	19+32, 5, 5
	xxlor	20+32, 6, 6
	xxlor	21+32, 7, 7
	xxlor	22+32, 8, 8

	vcipher	15, 15, 19
	vcipher	15, 15, 20
	vcipher	15, 15, 21
	vcipher	15, 15, 22

	xxlor	19+32, 9, 9
	vcipher	15, 15, 19
.endm

 #
 # Compute 4x hash values based on the Karatsuba method.
 #
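 # (Editor's walk-through of the steps below: Xi is folded into the first block
 # with vxor; vpmsumd against the H^4..H hash-key triples builds the L, M and H
 # partial sums for all four blocks; vsldoi splits M into mL/mH, which are added
 # into L and H; two vpmsumd-by-v2 steps then reduce the 256-bit total back to
 # a 128-bit Xi.)
 #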
.macro ppc_aes_gcm_ghash
	vxor		15, 15, 0

	vpmsumd		23, 12, 15		# H4.L * X.L
	vpmsumd		24, 9, 16
	vpmsumd		25, 6, 17
	vpmsumd		26, 3, 18

	vxor		23, 23, 24
	vxor		23, 23, 25
	vxor		23, 23, 26		# L

	vpmsumd		24, 13, 15		# H4.L * X.H + H4.H * X.L
	vpmsumd		25, 10, 16		# H3.L * X1.H + H3.H * X1.L
	vpmsumd		26, 7, 17
	vpmsumd		27, 4, 18

	vxor		24, 24, 25
	vxor		24, 24, 26
	vxor		24, 24, 27		# M

	# sum hash and reduction with H Poly
	vpmsumd		28, 23, 2		# reduction

	vxor		29, 29, 29
	vsldoi		26, 24, 29, 8		# mL
	vsldoi		29, 29, 24, 8		# mH
	vxor		23, 23, 26		# mL + L

	vsldoi		23, 23, 23, 8		# swap
	vxor		23, 23, 28

	vpmsumd		24, 14, 15		# H4.H * X.H
	vpmsumd		25, 11, 16
	vpmsumd		26, 8, 17
	vpmsumd		27, 5, 18

	vxor		24, 24, 25
	vxor		24, 24, 26
	vxor		24, 24, 27

	vxor		24, 24, 29

	# sum hash and reduction with H Poly
	vsldoi		27, 23, 23, 8		# swap
	vpmsumd		23, 23, 2
	vxor		27, 27, 24
	vxor		23, 23, 27

	xxlor		32, 23+32, 23+32	# update hash

.endm

 #
 # Combine two 4x GHASH computations.
 # v15 - v22 - input blocks
 #
.macro ppc_aes_gcm_ghash2_4x
	# first 4x hash
	vxor		15, 15, 0		# Xi + X

	vpmsumd		23, 12, 15		# H4.L * X.L
	vpmsumd		24, 9, 16
	vpmsumd		25, 6, 17
	vpmsumd		26, 3, 18

	vxor		23, 23, 24
	vxor		23, 23, 25
	vxor		23, 23, 26		# L

	vpmsumd		24, 13, 15		# H4.L * X.H + H4.H * X.L
	vpmsumd		25, 10, 16		# H3.L * X1.H + H3.H * X1.L
	vpmsumd		26, 7, 17
	vpmsumd		27, 4, 18

	vxor		24, 24, 25
	vxor		24, 24, 26

	# sum hash and reduction with H Poly
	vpmsumd		28, 23, 2		# reduction

	vxor		29, 29, 29

	vxor		24, 24, 27		# M
	vsldoi		26, 24, 29, 8		# mL
	vsldoi		29, 29, 24, 8		# mH
	vxor		23, 23, 26		# mL + L

	vsldoi		23, 23, 23, 8		# swap
	vxor		23, 23, 28

	vpmsumd		24, 14, 15		# H4.H * X.H
	vpmsumd		25, 11, 16
	vpmsumd		26, 8, 17
	vpmsumd		27, 5, 18

	vxor		24, 24, 25
	vxor		24, 24, 26
	vxor		24, 24, 27		# H

	vxor		24, 24, 29		# H + mH

	# sum hash and reduction with H Poly
	vsldoi		27, 23, 23, 8		# swap
	vpmsumd		23, 23, 2
	vxor		27, 27, 24
	vxor		27, 23, 27		# 1st Xi

	# 2nd 4x hash
	vpmsumd		24, 9, 20
	vpmsumd		25, 6, 21
	vpmsumd		26, 3, 22
	vxor		19, 19, 27		# Xi + X
	vpmsumd		23, 12, 19		# H4.L * X.L

	vxor		23, 23, 24
	vxor		23, 23, 25
	vxor		23, 23, 26		# L

	vpmsumd		24, 13, 19		# H4.L * X.H + H4.H * X.L
	vpmsumd		25, 10, 20		# H3.L * X1.H + H3.H * X1.L
	vpmsumd		26, 7, 21
	vpmsumd		27, 4, 22

	vxor		24, 24, 25
	vxor		24, 24, 26

	# sum hash and reduction with H Poly
	vpmsumd		28, 23, 2		# reduction

	vxor		29, 29, 29

	vxor		24, 24, 27		# M
	vsldoi		26, 24, 29, 8		# mL
	vsldoi		29, 29, 24, 8		# mH
	vxor		23, 23, 26		# mL + L

	vsldoi		23, 23, 23, 8		# swap
	vxor		23, 23, 28

	vpmsumd		24, 14, 19		# H4.H * X.H
	vpmsumd		25, 11, 20
	vpmsumd		26, 8, 21
	vpmsumd		27, 5, 22

	vxor		24, 24, 25
	vxor		24, 24, 26
	vxor		24, 24, 27		# H

	vxor		24, 24, 29		# H + mH

	# sum hash and reduction with H Poly
	vsldoi		27, 23, 23, 8		# swap
	vpmsumd		23, 23, 2
	vxor		27, 27, 24
	vxor		23, 23, 27

	xxlor		32, 23+32, 23+32	# update hash

.endm

 #
 # Compute and update a single hash value.
 #
.macro ppc_update_hash_1x
	vxor		28, 28, 0

	vxor		19, 19, 19

	vpmsumd		22, 3, 28		# L
	vpmsumd		23, 4, 28		# M
	vpmsumd		24, 5, 28		# H

	vpmsumd		27, 22, 2		# reduction

	vsldoi		25, 23, 19, 8		# mL
	vsldoi		26, 19, 23, 8		# mH
	vxor		22, 22, 25		# L + mL
	vxor		24, 24, 26		# H + mH

	vsldoi		22, 22, 22, 8		# swap
	vxor		22, 22, 27

	vsldoi		20, 22, 22, 8		# swap
	vpmsumd		22, 22, 2		# reduction
	vxor		20, 20, 24
	vxor		22, 22, 20

	vmr		0, 22			# update hash

.endm
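 # (Editor's note: stdu creates a 640-byte frame.  Non-volatile GPRs r14-r21
 # are saved at 112(1)..168(1), v20-v31 at 256(1)..432(1) via stvx, and
 # vs14-vs22 at 464(1)..592(1) via stxv.  The LR copy goes to 656(1), which is
 # offset 16 in the caller's frame - the standard ELFv2 LR save slot.)
 #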
.macro SAVE_REGS
	stdu	1, -640(1)
	mflr	0

	std	14, 112(1)
	std	15, 120(1)
	std	16, 128(1)
	std	17, 136(1)
	std	18, 144(1)
	std	19, 152(1)
	std	20, 160(1)
	std	21, 168(1)
	li	9, 256
	stvx	20, 9, 1
	addi	9, 9, 16
	stvx	21, 9, 1
	addi	9, 9, 16
	stvx	22, 9, 1
	addi	9, 9, 16
	stvx	23, 9, 1
	addi	9, 9, 16
	stvx	24, 9, 1
	addi	9, 9, 16
	stvx	25, 9, 1
	addi	9, 9, 16
	stvx	26, 9, 1
	addi	9, 9, 16
	stvx	27, 9, 1
	addi	9, 9, 16
	stvx	28, 9, 1
	addi	9, 9, 16
	stvx	29, 9, 1
	addi	9, 9, 16
	stvx	30, 9, 1
	addi	9, 9, 16
	stvx	31, 9, 1
	stxv	14, 464(1)
	stxv	15, 480(1)
	stxv	16, 496(1)
	stxv	17, 512(1)
	stxv	18, 528(1)
	stxv	19, 544(1)
	stxv	20, 560(1)
	stxv	21, 576(1)
	stxv	22, 592(1)
	std	0, 656(1)
.endm

.macro RESTORE_REGS
	lxv	14, 464(1)
	lxv	15, 480(1)
	lxv	16, 496(1)
	lxv	17, 512(1)
	lxv	18, 528(1)
	lxv	19, 544(1)
	lxv	20, 560(1)
	lxv	21, 576(1)
	lxv	22, 592(1)
	li	9, 256
	lvx	20, 9, 1
	addi	9, 9, 16
	lvx	21, 9, 1
	addi	9, 9, 16
	lvx	22, 9, 1
	addi	9, 9, 16
	lvx	23, 9, 1
	addi	9, 9, 16
	lvx	24, 9, 1
	addi	9, 9, 16
	lvx	25, 9, 1
	addi	9, 9, 16
	lvx	26, 9, 1
	addi	9, 9, 16
	lvx	27, 9, 1
	addi	9, 9, 16
	lvx	28, 9, 1
	addi	9, 9, 16
	lvx	29, 9, 1
	addi	9, 9, 16
	lvx	30, 9, 1
	addi	9, 9, 16
	lvx	31, 9, 1

	ld	0, 656(1)
	ld	14, 112(1)
	ld	15, 120(1)
	ld	16, 128(1)
	ld	17, 136(1)
	ld	18, 144(1)
	ld	19, 152(1)
	ld	20, 160(1)
	ld	21, 168(1)

	mtlr	0
	addi	1, 1, 640
.endm

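 # (Editor's note: the gcm_table at r8 is laid out in 16-byte slots - Xi at
 # offset 0, the H Poly reduction constant at 32, then (l, mid, h) triples for
 # H at 48..80, H^2 at 96..128, H^3 at 144..176 and H^4 at 192..224 - matching
 # the offsets loaded below.)
 #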
.macro LOAD_HASH_TABLE
	# load Xi
	lxvb16x	32, 0, 8	# load Xi

	# load hash keys - h^4, h^3, h^2, h
	li	10, 32
	lxvd2x	2+32, 10, 8	# H Poly
	li	10, 48
	lxvd2x	3+32, 10, 8	# Hl
	li	10, 64
	lxvd2x	4+32, 10, 8	# H
	li	10, 80
	lxvd2x	5+32, 10, 8	# Hh

	li	10, 96
	lxvd2x	6+32, 10, 8	# H^2l
	li	10, 112
	lxvd2x	7+32, 10, 8	# H^2
	li	10, 128
	lxvd2x	8+32, 10, 8	# H^2h

	li	10, 144
	lxvd2x	9+32, 10, 8	# H^3l
	li	10, 160
	lxvd2x	10+32, 10, 8	# H^3
	li	10, 176
	lxvd2x	11+32, 10, 8	# H^3h

	li	10, 192
	lxvd2x	12+32, 10, 8	# H^4l
	li	10, 208
	lxvd2x	13+32, 10, 8	# H^4
	li	10, 224
	lxvd2x	14+32, 10, 8	# H^4h
.endm

 #
 # aes_p10_gcm_encrypt (const void *inp, void *out, size_t len,
 #               const char *rk, unsigned char iv[16], void *Xip);
 #
 #    r3 - inp
 #    r4 - out
 #    r5 - len
 #    r6 - AES round keys
 #    r7 - iv and other data
 #    r8 - Xi, H Poly, hash keys
 #
 #    rounds is at offset 240 in rk
 #    Xi is at 0 in gcm_table (Xip).
 #
_GLOBAL(aes_p10_gcm_encrypt)
.align 5

	SAVE_REGS

	LOAD_HASH_TABLE

	# initialize ICB: GHASH( IV ), IV - r7
	lxvb16x	30+32, 0, 7	# load IV - v30

	mr	12, 5		# length
	li	11, 0		# block index

	# counter 1
	vxor	31, 31, 31
	vspltisb 22, 1
	vsldoi	31, 31, 22, 1	# counter 1
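	# (Editor's note: the three instructions above build v31 = 0x00...01 -
	# clear v31, splat 0x01 into v22, then shift one byte of v22 into the
	# low end of v31; vaddudm with v31 then increments the low 64 bits of
	# the counter block by one.)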

	# load round keys into VSRs
	lxv	0, 0(6)
	lxv	1, 0x10(6)
	lxv	2, 0x20(6)
	lxv	3, 0x30(6)
	lxv	4, 0x40(6)
	lxv	5, 0x50(6)
	lxv	6, 0x60(6)
	lxv	7, 0x70(6)
	lxv	8, 0x80(6)
	lxv	9, 0x90(6)
	lxv	10, 0xa0(6)

	# load rounds - 10 (128), 12 (192), 14 (256)
	lwz	9, 240(6)

	#
	# vxor	state, state, w # addroundkey
	xxlor	32+29, 0, 0
	vxor	15, 30, 29	# IV + round key - add round key 0

	cmpdi	9, 10
	beq	Loop_aes_gcm_8x

	# load 2 more round keys (vs11, vs12)
	lxv	11, 0xb0(6)
	lxv	12, 0xc0(6)

	cmpdi	9, 12
	beq	Loop_aes_gcm_8x

	# load 2 more round keys (vs13, vs14)
	lxv	13, 0xd0(6)
	lxv	14, 0xe0(6)
	cmpdi	9, 14
	beq	Loop_aes_gcm_8x

	b	aes_gcm_out

.align 5
Loop_aes_gcm_8x:
	mr	14, 3
	mr	9, 4

	#
	# check partial block
	#
Continue_partial_check:
	ld	15, 56(7)
	cmpdi	15, 0
	beq	Continue
	bgt	Final_block
	cmpdi	15, 16
	blt	Final_block

Continue:
	# n blocks
	li	10, 128
	divdu	10, 12, 10	# n 128-byte blocks
	cmpdi	10, 0
	beq	Loop_last_block

	vaddudm	30, 30, 31	# IV + counter
	vxor	16, 30, 29
	vaddudm	30, 30, 31
	vxor	17, 30, 29
	vaddudm	30, 30, 31
	vxor	18, 30, 29
	vaddudm	30, 30, 31
	vxor	19, 30, 29
	vaddudm	30, 30, 31
	vxor	20, 30, 29
	vaddudm	30, 30, 31
	vxor	21, 30, 29
	vaddudm	30, 30, 31
	vxor	22, 30, 29

	mtctr	10

	li	15, 16
	li	16, 32
	li	17, 48
	li	18, 64
	li	19, 80
	li	20, 96
	li	21, 112
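	# (Editor's note: r15-r21 hold byte offsets 16..112, used as index
	# registers for the eight 16-byte block loads and stores in the loop
	# below.)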

	lwz	10, 240(6)

Loop_8x_block:

	lxvb16x		15, 0, 14	# load block
	lxvb16x		16, 15, 14	# load block
	lxvb16x		17, 16, 14	# load block
	lxvb16x		18, 17, 14	# load block
	lxvb16x		19, 18, 14	# load block
	lxvb16x		20, 19, 14	# load block
	lxvb16x		21, 20, 14	# load block
	lxvb16x		22, 21, 14	# load block
	addi		14, 14, 128

	Loop_aes_middle8x

	xxlor	23+32, 10, 10

	cmpdi	10, 10
	beq	Do_next_ghash

	# 192 bits
	xxlor	24+32, 11, 11

	vcipher	15, 15, 23
	vcipher	16, 16, 23
	vcipher	17, 17, 23
	vcipher	18, 18, 23
	vcipher	19, 19, 23
	vcipher	20, 20, 23
	vcipher	21, 21, 23
	vcipher	22, 22, 23

	vcipher	15, 15, 24
	vcipher	16, 16, 24
	vcipher	17, 17, 24
	vcipher	18, 18, 24
	vcipher	19, 19, 24
	vcipher	20, 20, 24
	vcipher	21, 21, 24
	vcipher	22, 22, 24

	xxlor	23+32, 12, 12

	cmpdi	10, 12
	beq	Do_next_ghash

	# 256 bits
	xxlor	24+32, 13, 13

	vcipher	15, 15, 23
	vcipher	16, 16, 23
	vcipher	17, 17, 23
	vcipher	18, 18, 23
	vcipher	19, 19, 23
	vcipher	20, 20, 23
	vcipher	21, 21, 23
	vcipher	22, 22, 23

	vcipher	15, 15, 24
	vcipher	16, 16, 24
	vcipher	17, 17, 24
	vcipher	18, 18, 24
	vcipher	19, 19, 24
	vcipher	20, 20, 24
	vcipher	21, 21, 24
	vcipher	22, 22, 24

	xxlor	23+32, 14, 14

	cmpdi	10, 14
	beq	Do_next_ghash
	b	aes_gcm_out

Do_next_ghash:

	#
	# last round
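	# (Editor's note: VSRs 47-54 alias v15-v22, so each xxlxor below XORs a
	# finished AES counter block with the corresponding input block still
	# held in VSRs 15-22, yielding the ciphertext that is then stored.)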
	vcipherlast	15, 15, 23
	vcipherlast	16, 16, 23

	xxlxor		47, 47, 15
	stxvb16x	47, 0, 9	# store output
	xxlxor		48, 48, 16
	stxvb16x	48, 15, 9	# store output

	vcipherlast	17, 17, 23
	vcipherlast	18, 18, 23

	xxlxor		49, 49, 17
	stxvb16x	49, 16, 9	# store output
	xxlxor		50, 50, 18
	stxvb16x	50, 17, 9	# store output

	vcipherlast	19, 19, 23
	vcipherlast	20, 20, 23

	xxlxor		51, 51, 19
	stxvb16x	51, 18, 9	# store output
	xxlxor		52, 52, 20
	stxvb16x	52, 19, 9	# store output

	vcipherlast	21, 21, 23
	vcipherlast	22, 22, 23

	xxlxor		53, 53, 21
	stxvb16x	53, 20, 9	# store output
	xxlxor		54, 54, 22
	stxvb16x	54, 21, 9	# store output

	addi		9, 9, 128

	# ghash here
	ppc_aes_gcm_ghash2_4x

	xxlor	27+32, 0, 0
	vaddudm	30, 30, 31		# IV + counter
	vmr	29, 30
	vxor	15, 30, 27		# add round key
	vaddudm	30, 30, 31
	vxor	16, 30, 27
	vaddudm	30, 30, 31
	vxor	17, 30, 27
	vaddudm	30, 30, 31
	vxor	18, 30, 27
	vaddudm	30, 30, 31
	vxor	19, 30, 27
	vaddudm	30, 30, 31
	vxor	20, 30, 27
	vaddudm	30, 30, 31
	vxor	21, 30, 27
	vaddudm	30, 30, 31
	vxor	22, 30, 27

	addi	12, 12, -128
	addi	11, 11, 128

	bdnz	Loop_8x_block

	vmr	30, 29
	stxvb16x 30+32, 0, 7		# update IV

Loop_last_block:
	cmpdi	12, 0
	beq	aes_gcm_out

	# loop last few blocks
	li	10, 16
	divdu	10, 12, 10

	mtctr	10

	lwz	10, 240(6)

	cmpdi	12, 16
	blt	Final_block

Next_rem_block:
	lxvb16x	15, 0, 14		# load block

	Loop_aes_middle_1x

	xxlor	23+32, 10, 10

	cmpdi	10, 10
	beq	Do_next_1x

	# 192 bits
	xxlor	24+32, 11, 11

	vcipher	15, 15, 23
	vcipher	15, 15, 24

	xxlor	23+32, 12, 12

	cmpdi	10, 12
	beq	Do_next_1x

	# 256 bits
	xxlor	24+32, 13, 13

	vcipher	15, 15, 23
	vcipher	15, 15, 24

	xxlor	23+32, 14, 14

	cmpdi	10, 14
	beq	Do_next_1x

Do_next_1x:
	vcipherlast	15, 15, 23

	xxlxor		47, 47, 15
	stxvb16x	47, 0, 9	# store output
	addi		14, 14, 16
	addi		9, 9, 16

	vmr		28, 15
	ppc_update_hash_1x

	addi		12, 12, -16
	addi		11, 11, 16
	xxlor		19+32, 0, 0
	vaddudm		30, 30, 31		# IV + counter
	vxor		15, 30, 19		# add round key

	bdnz	Next_rem_block

	li	15, 0
	std	15, 56(7)		# clear partial?
	stxvb16x 30+32, 0, 7		# update IV
	cmpdi	12, 0
	beq	aes_gcm_out

Final_block:
	lwz	10, 240(6)
	Loop_aes_middle_1x

	xxlor	23+32, 10, 10

	cmpdi	10, 10
	beq	Do_final_1x

	# 192 bits
	xxlor	24+32, 11, 11

	vcipher	15, 15, 23
	vcipher	15, 15, 24

	xxlor	23+32, 12, 12

	cmpdi	10, 12
	beq	Do_final_1x

	# 256 bits
	xxlor	24+32, 13, 13

	vcipher	15, 15, 23
	vcipher	15, 15, 24

	xxlor	23+32, 14, 14

	cmpdi	10, 14
	beq	Do_final_1x

Do_final_1x:
	vcipherlast	15, 15, 23

	# check partial block
	li	21, 0			# encrypt
	ld	15, 56(7)		# partial?
	cmpdi	15, 0
	beq	Normal_block
	bl	Do_partial_block

	cmpdi	12, 0
	ble	aes_gcm_out

	b	Continue_partial_check

Normal_block:
	lxvb16x	15, 0, 14		# load last block
	xxlxor	47, 47, 15

	# create partial block mask
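	# (Editor's note: the code below stores 16 bytes of 0xff followed by 16
	# bytes of 0x00 at 192(1), then loads 16 bytes from offset (16 - len)
	# into that buffer; the result is a mask whose first "len" bytes are
	# 0xff and the rest 0x00, used to clear the tail of the last block.)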
	li	15, 16
	sub	15, 15, 12		# index to the mask

	vspltisb	16, -1		# first 16 bytes - 0xffff...ff
	vspltisb	17, 0		# second 16 bytes - 0x0000...00
	li	10, 192
	stvx	16, 10, 1
	addi	10, 10, 16
	stvx	17, 10, 1

	addi	10, 1, 192
	lxvb16x	16, 15, 10		# load partial block mask
	xxland	47, 47, 16

	vmr	28, 15
	ppc_update_hash_1x

	# * should store only the remaining bytes.
	bl	Write_partial_block

	stxvb16x 30+32, 0, 7		# update IV
	std	12, 56(7)		# update partial?
	li	16, 16

	stxvb16x	32, 0, 8	# write out Xi
	stxvb16x	32, 16, 8	# write out Xi
	b	aes_gcm_out

 #
 # Compute data mask
 #
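 # (Editor's note: GEN_MASK uses the same 192(1) scratch area to build a mask
 # of _start zero bytes followed by _end 0xff bytes within the 16-byte window,
 # selecting the _end data bytes that begin at offset _start.)
 #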
.macro GEN_MASK _mask _start _end
	vspltisb	16, -1		# first 16 bytes - 0xffff...ff
	vspltisb	17, 0		# second 16 bytes - 0x0000...00
	li	10, 192
	stxvb16x	17+32, 10, 1
	add	10, 10, \_start
	stxvb16x	16+32, 10, 1
	add	10, 10, \_end
	stxvb16x	17+32, 10, 1

	addi	10, 1, 192
	lxvb16x	\_mask, 0, 10		# load partial block mask
.endm

 #
 # Handle multiple partial blocks for encrypt and decrypt
 #   operations.
 #
SYM_FUNC_START_LOCAL(Do_partial_block)
	add	17, 15, 5
	cmpdi	17, 16
	bgt	Big_block
	GEN_MASK 18, 15, 5
	b	_Partial
SYM_FUNC_END(Do_partial_block)
Big_block:
	li	16, 16
	GEN_MASK 18, 15, 16

_Partial:
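	# (Editor's note, best-effort reading: r15 holds the buffered partial
	# byte count; sldi/mtvsrdd put 8*r15 into a vector so vsro can shift the
	# freshly loaded block right by r15 bytes, lining the new data up with
	# the unused key-stream bytes in v15 before masking with v18.)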
	lxvb16x	17+32, 0, 14		# load last block
	sldi	16, 15, 3
	mtvsrdd	32+16, 0, 16
	vsro	17, 17, 16
	xxlxor	47, 47, 17+32
	xxland	47, 47, 18

	vxor	0, 0, 0			# clear Xi
	vmr	28, 15

	cmpdi	21, 0			# encrypt/decrypt ops?
	beq	Skip_decrypt
	xxland	32+28, 32+17, 18

Skip_decrypt:

	ppc_update_hash_1x

	li	16, 16
	lxvb16x 32+29, 16, 8
	vxor	0, 0, 29
	stxvb16x 32, 0, 8		# save Xi
	stxvb16x 32, 16, 8		# save Xi

	# store partial block
	# loop the rest of the stream if any
	sldi	16, 15, 3
	mtvsrdd	32+16, 0, 16
	vslo	15, 15, 16
	#stxvb16x 15+32, 0, 9		# last block

	li	16, 16
	sub	17, 16, 15		# 16 - partial

	add	16, 15, 5
	cmpdi	16, 16
	bgt	Larger_16
	mr	17, 5
Larger_16:

	# write partial
	li		10, 192
	stxvb16x	15+32, 10, 1	# save current block

	addi		10, 9, -1
	addi		16, 1, 191
	mtctr		17		# move partial byte count

Write_last_partial:
	lbzu		18, 1(16)
	stbu		18, 1(10)
	bdnz		Write_last_partial
	# Complete loop partial

	add	14, 14, 17
	add	9, 9, 17
	sub	12, 12, 17
	add	11, 11, 17

	add	15, 15, 5
	cmpdi	15, 16
	blt	Save_partial

	vaddudm	30, 30, 31
	stxvb16x 30+32, 0, 7		# update IV
	xxlor	32+29, 0, 0
	vxor	15, 30, 29		# IV + round key - add round key 0
	li	15, 0
	std	15, 56(7)		# partial done - clear
	b	Partial_done
Save_partial:
	std	15, 56(7)		# partial

Partial_done:
	blr

 #
 # Write partial block
 # r9 - output
 # r12 - remaining bytes
 # v15 - partial input data
 #
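 # (Editor's note: the vector is spilled to the 192(1) scratch area with
 # stxvb16x, then lbzu/stbu copy it out one byte at a time, since fewer than
 # 16 bytes must be written.)
 #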
SYM_FUNC_START_LOCAL(Write_partial_block)
	li		10, 192
	stxvb16x	15+32, 10, 1		# last block

	addi		10, 9, -1
	addi		16, 1, 191

	mtctr		12			# remaining bytes
	li		15, 0

Write_last_byte:
	lbzu		14, 1(16)
	stbu		14, 1(10)
	bdnz		Write_last_byte
	blr
SYM_FUNC_END(Write_partial_block)

aes_gcm_out:
	# out = state
	stxvb16x	32, 0, 8	# write out Xi
	add	3, 11, 12		# return count

	RESTORE_REGS
	blr

 #
 # 8x Decrypt
 #
_GLOBAL(aes_p10_gcm_decrypt)
.align 5

	SAVE_REGS

	LOAD_HASH_TABLE

	# initialize ICB: GHASH( IV ), IV - r7
	lxvb16x	30+32, 0, 7	# load IV - v30

	mr	12, 5		# length
	li	11, 0		# block index

	# counter 1
	vxor	31, 31, 31
	vspltisb 22, 1
	vsldoi	31, 31, 22, 1	# counter 1

	# load round keys into VSRs
	lxv	0, 0(6)
	lxv	1, 0x10(6)
	lxv	2, 0x20(6)
	lxv	3, 0x30(6)
	lxv	4, 0x40(6)
	lxv	5, 0x50(6)
	lxv	6, 0x60(6)
	lxv	7, 0x70(6)
	lxv	8, 0x80(6)
	lxv	9, 0x90(6)
	lxv	10, 0xa0(6)

	# load rounds - 10 (128), 12 (192), 14 (256)
	lwz	9, 240(6)

	#
	# vxor	state, state, w # addroundkey
	xxlor	32+29, 0, 0
	vxor	15, 30, 29	# IV + round key - add round key 0

	cmpdi	9, 10
	beq	Loop_aes_gcm_8x_dec

	# load 2 more round keys (vs11, vs12)
	lxv	11, 0xb0(6)
	lxv	12, 0xc0(6)

	cmpdi	9, 12
	beq	Loop_aes_gcm_8x_dec

	# load 2 more round keys (vs13, vs14)
	lxv	13, 0xd0(6)
	lxv	14, 0xe0(6)
	cmpdi	9, 14
	beq	Loop_aes_gcm_8x_dec

	b	aes_gcm_out

.align 5
Loop_aes_gcm_8x_dec:
	mr	14, 3
	mr	9, 4

	#
	# check partial block
	#
Continue_partial_check_dec:
	ld	15, 56(7)
	cmpdi	15, 0
	beq	Continue_dec
	bgt	Final_block_dec
	cmpdi	15, 16
	blt	Final_block_dec

Continue_dec:
	# n blocks
	li	10, 128
	divdu	10, 12, 10	# n 128-byte blocks
	cmpdi	10, 0
	beq	Loop_last_block_dec

	vaddudm	30, 30, 31	# IV + counter
	vxor	16, 30, 29
	vaddudm	30, 30, 31
	vxor	17, 30, 29
	vaddudm	30, 30, 31
	vxor	18, 30, 29
	vaddudm	30, 30, 31
	vxor	19, 30, 29
	vaddudm	30, 30, 31
	vxor	20, 30, 29
	vaddudm	30, 30, 31
	vxor	21, 30, 29
	vaddudm	30, 30, 31
	vxor	22, 30, 29

	mtctr	10

	li	15, 16
	li	16, 32
	li	17, 48
	li	18, 64
	li	19, 80
	li	20, 96
	li	21, 112

	lwz	10, 240(6)

Loop_8x_block_dec:

	lxvb16x		15, 0, 14	# load block
	lxvb16x		16, 15, 14	# load block
	lxvb16x		17, 16, 14	# load block
	lxvb16x		18, 17, 14	# load block
	lxvb16x		19, 18, 14	# load block
	lxvb16x		20, 19, 14	# load block
	lxvb16x		21, 20, 14	# load block
	lxvb16x		22, 21, 14	# load block
	addi		14, 14, 128

	Loop_aes_middle8x

	xxlor	23+32, 10, 10

	cmpdi	10, 10
	beq	Do_next_ghash_dec

	# 192 bits
	xxlor	24+32, 11, 11

	vcipher	15, 15, 23
	vcipher	16, 16, 23
	vcipher	17, 17, 23
	vcipher	18, 18, 23
	vcipher	19, 19, 23
	vcipher	20, 20, 23
	vcipher	21, 21, 23
	vcipher	22, 22, 23

	vcipher	15, 15, 24
	vcipher	16, 16, 24
	vcipher	17, 17, 24
	vcipher	18, 18, 24
	vcipher	19, 19, 24
	vcipher	20, 20, 24
	vcipher	21, 21, 24
	vcipher	22, 22, 24

	xxlor	23+32, 12, 12

	cmpdi	10, 12
	beq	Do_next_ghash_dec

	# 256 bits
	xxlor	24+32, 13, 13

	vcipher	15, 15, 23
	vcipher	16, 16, 23
	vcipher	17, 17, 23
	vcipher	18, 18, 23
	vcipher	19, 19, 23
	vcipher	20, 20, 23
	vcipher	21, 21, 23
	vcipher	22, 22, 23

	vcipher	15, 15, 24
	vcipher	16, 16, 24
	vcipher	17, 17, 24
	vcipher	18, 18, 24
	vcipher	19, 19, 24
	vcipher	20, 20, 24
	vcipher	21, 21, 24
	vcipher	22, 22, 24

	xxlor	23+32, 14, 14

	cmpdi	10, 14
	beq	Do_next_ghash_dec
	b	aes_gcm_out

Do_next_ghash_dec:

	#
	# last round
	vcipherlast	15, 15, 23
	vcipherlast	16, 16, 23

	xxlxor		47, 47, 15
	stxvb16x	47, 0, 9	# store output
	xxlxor		48, 48, 16
	stxvb16x	48, 15, 9	# store output

	vcipherlast	17, 17, 23
	vcipherlast	18, 18, 23

	xxlxor		49, 49, 17
	stxvb16x	49, 16, 9	# store output
	xxlxor		50, 50, 18
	stxvb16x	50, 17, 9	# store output

	vcipherlast	19, 19, 23
	vcipherlast	20, 20, 23

	xxlxor		51, 51, 19
	stxvb16x	51, 18, 9	# store output
	xxlxor		52, 52, 20
	stxvb16x	52, 19, 9	# store output

	vcipherlast	21, 21, 23
	vcipherlast	22, 22, 23

	xxlxor		53, 53, 21
	stxvb16x	53, 20, 9	# store output
	xxlxor		54, 54, 22
	stxvb16x	54, 21, 9	# store output

	addi		9, 9, 128

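	# (Editor's note: for decrypt, GHASH must run over the ciphertext, so
	# the xxlor copies below move the original input blocks, still held in
	# VSRs 15-22, back into v15-v22 before hashing.)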
	xxlor	15+32, 15, 15
	xxlor	16+32, 16, 16
	xxlor	17+32, 17, 17
	xxlor	18+32, 18, 18
	xxlor	19+32, 19, 19
	xxlor	20+32, 20, 20
	xxlor	21+32, 21, 21
	xxlor	22+32, 22, 22

	# ghash here
	ppc_aes_gcm_ghash2_4x

	xxlor	27+32, 0, 0
	vaddudm	30, 30, 31		# IV + counter
	vmr	29, 30
	vxor	15, 30, 27		# add round key
	vaddudm	30, 30, 31
	vxor	16, 30, 27
	vaddudm	30, 30, 31
	vxor	17, 30, 27
	vaddudm	30, 30, 31
	vxor	18, 30, 27
	vaddudm	30, 30, 31
	vxor	19, 30, 27
	vaddudm	30, 30, 31
	vxor	20, 30, 27
	vaddudm	30, 30, 31
	vxor	21, 30, 27
	vaddudm	30, 30, 31
	vxor	22, 30, 27

	addi	12, 12, -128
	addi	11, 11, 128

	bdnz	Loop_8x_block_dec

	vmr	30, 29
	stxvb16x 30+32, 0, 7		# update IV

Loop_last_block_dec:
	cmpdi	12, 0
	beq	aes_gcm_out

	# loop last few blocks
	li	10, 16
	divdu	10, 12, 10

	mtctr	10

	lwz	10, 240(6)

	cmpdi	12, 16
	blt	Final_block_dec

Next_rem_block_dec:
	lxvb16x	15, 0, 14		# load block

	Loop_aes_middle_1x

	xxlor	23+32, 10, 10

	cmpdi	10, 10
	beq	Do_next_1x_dec

	# 192 bits
	xxlor	24+32, 11, 11

	vcipher	15, 15, 23
	vcipher	15, 15, 24

	xxlor	23+32, 12, 12

	cmpdi	10, 12
	beq	Do_next_1x_dec

	# 256 bits
	xxlor	24+32, 13, 13

	vcipher	15, 15, 23
	vcipher	15, 15, 24

	xxlor	23+32, 14, 14

	cmpdi	10, 14
	beq	Do_next_1x_dec

Do_next_1x_dec:
	vcipherlast	15, 15, 23

	xxlxor		47, 47, 15
	stxvb16x	47, 0, 9	# store output
	addi		14, 14, 16
	addi		9, 9, 16

	xxlor		28+32, 15, 15
	#vmr		28, 15
	ppc_update_hash_1x

	addi		12, 12, -16
	addi		11, 11, 16
	xxlor		19+32, 0, 0
	vaddudm		30, 30, 31	# IV + counter
	vxor		15, 30, 19	# add round key

	bdnz	Next_rem_block_dec

	li	15, 0
	std	15, 56(7)		# clear partial?
	stxvb16x 30+32, 0, 7		# update IV
	cmpdi	12, 0
	beq	aes_gcm_out

Final_block_dec:
	lwz	10, 240(6)
	Loop_aes_middle_1x

	xxlor	23+32, 10, 10

	cmpdi	10, 10
	beq	Do_final_1x_dec

	# 192 bits
	xxlor	24+32, 11, 11

	vcipher	15, 15, 23
	vcipher	15, 15, 24

	xxlor	23+32, 12, 12

	cmpdi	10, 12
	beq	Do_final_1x_dec

	# 256 bits
	xxlor	24+32, 13, 13

	vcipher	15, 15, 23
	vcipher	15, 15, 24

	xxlor	23+32, 14, 14

	cmpdi	10, 14
	beq	Do_final_1x_dec

Do_final_1x_dec:
	vcipherlast	15, 15, 23

	# check partial block
	li	21, 1			# decrypt
	ld	15, 56(7)		# partial?
	cmpdi	15, 0
	beq	Normal_block_dec
	bl	Do_partial_block
	cmpdi	12, 0
	ble	aes_gcm_out

	b	Continue_partial_check_dec

Normal_block_dec:
	lxvb16x	15, 0, 14		# load last block
	xxlxor	47, 47, 15

	# create partial block mask
	li	15, 16
	sub	15, 15, 12		# index to the mask

	vspltisb	16, -1		# first 16 bytes - 0xffff...ff
	vspltisb	17, 0		# second 16 bytes - 0x0000...00
	li	10, 192
	stvx	16, 10, 1
	addi	10, 10, 16
	stvx	17, 10, 1

	addi	10, 1, 192
	lxvb16x	16, 15, 10		# load partial block mask
	xxland	47, 47, 16

	xxland	32+28, 15, 16
	#vmr	28, 15
	ppc_update_hash_1x

	# * should store only the remaining bytes.
	bl	Write_partial_block

	stxvb16x 30+32, 0, 7		# update IV
	std	12, 56(7)		# update partial?
	li	16, 16

	stxvb16x	32, 0, 8	# write out Xi
	stxvb16x	32, 16, 8	# write out Xi
	b	aes_gcm_out