/* SPDX-License-Identifier: GPL-2.0-or-later */
#
# Accelerated AES-GCM stitched implementation for ppc64le.
#
# Copyright 2024- IBM Inc.
#
#===================================================================================
# Written by Danny Tsen <dtsen@us.ibm.com>
#
# GHASH is based on the Karatsuba multiplication method.
#
#    Xi xor X1
#
#    X1 * H^4 + X2 * H^3 + X3 * H^2 + X4 * H =
#      (X1.h * H^4.h + X1.l * H^4.l + X1 * H^4) +
#      (X2.h * H^3.h + X2.l * H^3.l + X2 * H^3) +
#      (X3.h * H^2.h + X3.l * H^2.l + X3 * H^2) +
#      (X4.h * H.h + X4.l * H.l + X4 * H)
#
# Xi = v0
# H Poly = v2
# Hash keys = v3 - v14
#     ( H.l, H, H.h)
#     ( H^2.l, H^2, H^2.h)
#     ( H^3.l, H^3, H^3.h)
#     ( H^4.l, H^4, H^4.h)
#
# v30 is IV
# v31 - counter 1
#
# AES usage:
#     vs0 - round key 0
#     v15, v16, v17, v18, v19, v20, v21, v22 for 8 blocks (encrypted)
#
# This implementation uses a stitched AES-GCM approach to improve overall
# performance.  AES is implemented with 8x blocks and GHASH with two 4x blocks.
#
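# As a reference for the math above, here is a minimal C model of one
# carry-less 64x64-bit multiply (what a single vpmsumd doubleword lane
# computes).  This is an illustrative sketch only, not part of the kernel
# driver, and clmul64() is a hypothetical helper name:
#
#	typedef unsigned __int128 u128;
#
#	static u128 clmul64(unsigned long long a, unsigned long long b)
#	{
#		u128 r = 0;
#
#		/* xor-accumulate the shifted partial products of a */
#		for (int i = 0; i < 64; i++)
#			if ((b >> i) & 1)
#				r ^= (u128)a << i;
#		return r;
#	}
#
# A 128x128-bit product X * H is then built from a low term (X.l * H.l),
# a middle term (X.l * H.h + X.h * H.l) and a high term (X.h * H.h), and
# reduced modulo the GCM polynomial; PPC_GHASH1x/PPC_GHASH4x below do the
# same with vpmsumd and vpermxor.
#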
# ===================================================================================
#

#include <asm/ppc_asm.h>
#include <linux/linkage.h>

.machine        "any"
.text

.macro	SAVE_GPR GPR OFFSET FRAME
	std	\GPR,\OFFSET(\FRAME)
.endm

.macro	SAVE_VRS VRS OFFSET FRAME
	stxv	\VRS+32, \OFFSET(\FRAME)
.endm

.macro	RESTORE_GPR GPR OFFSET FRAME
	ld	\GPR,\OFFSET(\FRAME)
.endm

.macro	RESTORE_VRS VRS OFFSET FRAME
	lxv	\VRS+32, \OFFSET(\FRAME)
.endm

.macro SAVE_REGS
	mflr 0
	std 0, 16(1)
	stdu 1,-512(1)

	SAVE_GPR 14, 112, 1
	SAVE_GPR 15, 120, 1
	SAVE_GPR 16, 128, 1
	SAVE_GPR 17, 136, 1
	SAVE_GPR 18, 144, 1
	SAVE_GPR 19, 152, 1
	SAVE_GPR 20, 160, 1
	SAVE_GPR 21, 168, 1
	SAVE_GPR 22, 176, 1
	SAVE_GPR 23, 184, 1
	SAVE_GPR 24, 192, 1

	addi	9, 1, 256
	SAVE_VRS 20, 0, 9
	SAVE_VRS 21, 16, 9
	SAVE_VRS 22, 32, 9
	SAVE_VRS 23, 48, 9
	SAVE_VRS 24, 64, 9
	SAVE_VRS 25, 80, 9
	SAVE_VRS 26, 96, 9
	SAVE_VRS 27, 112, 9
	SAVE_VRS 28, 128, 9
	SAVE_VRS 29, 144, 9
	SAVE_VRS 30, 160, 9
	SAVE_VRS 31, 176, 9
.endm # SAVE_REGS

.macro RESTORE_REGS
	addi	9, 1, 256
	RESTORE_VRS 20, 0, 9
	RESTORE_VRS 21, 16, 9
	RESTORE_VRS 22, 32, 9
	RESTORE_VRS 23, 48, 9
	RESTORE_VRS 24, 64, 9
	RESTORE_VRS 25, 80, 9
	RESTORE_VRS 26, 96, 9
	RESTORE_VRS 27, 112, 9
	RESTORE_VRS 28, 128, 9
	RESTORE_VRS 29, 144, 9
	RESTORE_VRS 30, 160, 9
	RESTORE_VRS 31, 176, 9

	RESTORE_GPR 14, 112, 1
	RESTORE_GPR 15, 120, 1
	RESTORE_GPR 16, 128, 1
	RESTORE_GPR 17, 136, 1
	RESTORE_GPR 18, 144, 1
	RESTORE_GPR 19, 152, 1
	RESTORE_GPR 20, 160, 1
	RESTORE_GPR 21, 168, 1
	RESTORE_GPR 22, 176, 1
	RESTORE_GPR 23, 184, 1
	RESTORE_GPR 24, 192, 1

	addi    1, 1, 512
	ld 0, 16(1)
	mtlr 0
.endm # RESTORE_REGS
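
# SAVE_REGS carves a 512-byte frame: LR is saved at 16(1) in the caller's
# frame, the non-volatile GPRs r14 - r24 live at 112(1) - 192(1), and the
# non-volatile VRs v20 - v31 (as vs52 - vs63) at 256(1) - 432(1).
# RESTORE_REGS undoes the same layout.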

# 4x loops
.macro AES_CIPHER_4x _VCIPHER ST r
	\_VCIPHER	\ST, \ST, \r
	\_VCIPHER	\ST+1, \ST+1, \r
	\_VCIPHER	\ST+2, \ST+2, \r
	\_VCIPHER	\ST+3, \ST+3, \r
.endm

# 8x loops
.macro AES_CIPHER_8x _VCIPHER ST r
	\_VCIPHER	\ST, \ST, \r
	\_VCIPHER	\ST+1, \ST+1, \r
	\_VCIPHER	\ST+2, \ST+2, \r
	\_VCIPHER	\ST+3, \ST+3, \r
	\_VCIPHER	\ST+4, \ST+4, \r
	\_VCIPHER	\ST+5, \ST+5, \r
	\_VCIPHER	\ST+6, \ST+6, \r
	\_VCIPHER	\ST+7, \ST+7, \r
.endm

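# LOOP_8AES_STATE runs eight AES rounds over the eight states v15 - v22,
# staging round keys 1 - 8 from vs1 - vs8 in the scratch VRs v23 - v26.
# For example, "AES_CIPHER_8x vcipher, 15, 23" applies the round key in
# v23 to all of v15 - v22.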
.macro LOOP_8AES_STATE
	xxlor	32+23, 1, 1
	xxlor	32+24, 2, 2
	xxlor	32+25, 3, 3
	xxlor	32+26, 4, 4
	AES_CIPHER_8x vcipher, 15, 23
	AES_CIPHER_8x vcipher, 15, 24
	AES_CIPHER_8x vcipher, 15, 25
	AES_CIPHER_8x vcipher, 15, 26
	xxlor	32+23, 5, 5
	xxlor	32+24, 6, 6
	xxlor	32+25, 7, 7
	xxlor	32+26, 8, 8
	AES_CIPHER_8x vcipher, 15, 23
	AES_CIPHER_8x vcipher, 15, 24
	AES_CIPHER_8x vcipher, 15, 25
	AES_CIPHER_8x vcipher, 15, 26
.endm

#
# PPC_GHASH4x(H, S1, S2, S3, S4): Compute 4x hash values based on the Karatsuba method.
# H: returning digest
# S#: states
#
# S1 should be xored with the previous digest
#
# Xi = v0
# H Poly = v2
# Hash keys = v3 - v14
# Scratch: v23 - v29
#
.macro PPC_GHASH4x H S1 S2 S3 S4

	vpmsumd	23, 12, \S1		# H^4.L * X.L
	vpmsumd	24, 9, \S2
	vpmsumd	25, 6, \S3
	vpmsumd	26, 3, \S4

	vpmsumd	27, 13, \S1		# H^4.L * X.H + H^4.H * X.L
	vpmsumd	28, 10, \S2		# H^3.L * X.H + H^3.H * X.L

	vxor	23, 23, 24
	vxor	23, 23, 25
	vxor	23, 23, 26		# L

	vxor	24, 27, 28
	vpmsumd	25, 7, \S3
	vpmsumd	26, 4, \S4

	vxor	24, 24, 25
	vxor	24, 24, 26		# M

	# sum hash and reduction with H Poly
	vpmsumd	28, 23, 2		# reduction

	vxor	1, 1, 1
	vsldoi	25, 24, 1, 8		# mL
	vsldoi	1, 1, 24, 8		# mH
	vxor	23, 23, 25		# mL + L

	# This performs the swap and xor like:
	#   vsldoi	23, 23, 23, 8		# swap
	#   vxor	23, 23, 28
	xxlor	32+25, 10, 10
	vpermxor 23, 23, 28, 25

	vpmsumd	26, 14, \S1		# H^4.H * X.H
	vpmsumd	27, 11, \S2
	vpmsumd	28, 8, \S3
	vpmsumd	29, 5, \S4

	vxor	24, 26, 27
	vxor	24, 24, 28
	vxor	24, 24, 29

	vxor	24, 24, 1

	# sum hash and reduction with H Poly
	vsldoi	25, 23, 23, 8		# swap
	vpmsumd	23, 23, 2
	vxor	27, 25, 24
	vxor	\H, 23, 27
.endm

#
# Compute and update a single ghash value
# scratch: v1, v22..v27
#
.macro PPC_GHASH1x H S1

	vxor	1, 1, 1

	vpmsumd	22, 3, \S1		# L
	vpmsumd	23, 4, \S1		# M
	vpmsumd	24, 5, \S1		# H

	vpmsumd	27, 22, 2		# reduction

	vsldoi	25, 23, 1, 8		# mL
	vsldoi	26, 1, 23, 8		# mH
	vxor	22, 22, 25		# L + mL
	vxor	24, 24, 26		# H + mH

	xxlor	32+25, 10, 10
	vpermxor 22, 22, 27, 25

	vsldoi	23, 22, 22, 8		# swap
	vpmsumd	22, 22, 2		# reduction
	vxor	23, 23, 24
	vxor	\H, 22, 23
.endm
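
# Flow of PPC_GHASH1x: three vpmsumd's produce the low (L), middle (M) and
# high (H) partial products; vsldoi splits M into mL/mH, which fold into L
# and H; the result is then reduced twice against the poly in v2, with
# vpermxor doing the first reduction's doubleword swap and xor in a single
# instruction.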

#
# LOAD_HASH_TABLE
# Xi = v0
# H Poly = v2
# Hash keys = v3 - v14
#
.macro LOAD_HASH_TABLE
	# Load Xi
	lxvb16x	32, 0, 8	# load Xi

	# load Hash - h^4, h^3, h^2, h
	li	10, 32
	lxvd2x	2+32, 10, 8	# H Poly
	li	10, 48
	lxvd2x	3+32, 10, 8	# Hl
	li	10, 64
	lxvd2x	4+32, 10, 8	# H
	li	10, 80
	lxvd2x	5+32, 10, 8	# Hh

	li	10, 96
	lxvd2x	6+32, 10, 8	# H^2l
	li	10, 112
	lxvd2x	7+32, 10, 8	# H^2
	li	10, 128
	lxvd2x	8+32, 10, 8	# H^2h

	li	10, 144
	lxvd2x	9+32, 10, 8	# H^3l
	li	10, 160
	lxvd2x	10+32, 10, 8	# H^3
	li	10, 176
	lxvd2x	11+32, 10, 8	# H^3h

	li	10, 192
	lxvd2x	12+32, 10, 8	# H^4l
	li	10, 208
	lxvd2x	13+32, 10, 8	# H^4
	li	10, 224
	lxvd2x	14+32, 10, 8	# H^4h
.endm
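
# Layout of the table at Xip (r8), in 16-byte slots:
#	  0: Xi		 32: H Poly
#	 48: Hl		 64: H		 80: Hh
#	 96: H^2l	112: H^2	128: H^2h
#	144: H^3l	160: H^3	176: H^3h
#	192: H^4l	208: H^4	224: H^4h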

################################################################################
# Compute AES and ghash one block at a time.
# r23: AES rounds
# v30: current IV
# vs0: roundkey 0
#
################################################################################
SYM_FUNC_START_LOCAL(aes_gcm_crypt_1x)

	cmpdi	5, 16
	bge	__More_1x
	blr
__More_1x:
	li      10, 16
	divdu   12, 5, 10

	xxlxor	32+15, 32+30, 0

	# Pre-load 8 AES rounds to scratch vectors.
	xxlor	32+16, 1, 1
	xxlor	32+17, 2, 2
	xxlor	32+18, 3, 3
	xxlor	32+19, 4, 4
	xxlor	32+20, 5, 5
	xxlor	32+21, 6, 6
	xxlor	32+28, 7, 7
	xxlor	32+29, 8, 8
	lwz	23, 240(6)	# n rounds
	addi	22, 23, -9	# remaining AES rounds

	cmpdi	12, 0
	bgt	__Loop_1x
	blr

__Loop_1x:
	mtctr	22
	addi	10, 6, 144
	vcipher	15, 15, 16
	vcipher	15, 15, 17
	vcipher	15, 15, 18
	vcipher	15, 15, 19
	vcipher	15, 15, 20
	vcipher	15, 15, 21
	vcipher	15, 15, 28
	vcipher	15, 15, 29

__Loop_aes_1state:
	lxv	32+1, 0(10)
	vcipher	15, 15, 1
	addi	10, 10, 16
	bdnz	__Loop_aes_1state
	lxv	32+1, 0(10)		# last round key
	lxvb16x 11, 0, 14		# load input block
	vcipherlast 15, 15, 1

	xxlxor	32+15, 32+15, 11
	stxvb16x 32+15, 0, 9	# store output
	addi	14, 14, 16
	addi	9, 9, 16

	cmpdi	24, 0	# decrypt?
	bne	__Encrypt_1x
	xxlor	15+32, 11, 11
__Encrypt_1x:
	vxor	15, 15, 0
	PPC_GHASH1x 0, 15

	addi	5, 5, -16
	addi	11, 11, 16

	vadduwm 30, 30, 31		# IV + counter
	xxlxor	32+15, 32+30, 0
	addi	12, 12, -1
	cmpdi	12, 0
	bgt	__Loop_1x

	stxvb16x 32+30, 0, 7		# update IV
	stxvb16x 32+0, 0, 8		# update Xi
	blr
SYM_FUNC_END(aes_gcm_crypt_1x)

################################################################################
# We come here to process a normal partial block.
#  Compute the partial mask, load and store the partial block to the stack.
#  Update partial_len and pblock.
#  pblock is (encrypted ^ AES state) for encrypt
#        and (input ^ AES state) for decrypt.
#
################################################################################
SYM_FUNC_START_LOCAL(__Process_partial)

	# create partial mask
	vspltisb 16, -1
	li	12, 16
	sub	12, 12, 5
	sldi	12, 12, 3
	mtvsrdd	32+17, 0, 12
	vslo	16, 16, 17		# partial block mask
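	# Example: with 5 bytes left, r12 = (16 - 5) * 8 = 88, and vslo
	# shifts the all-ones vector left by 11 bytes, leaving 0xff in
	# the first 5 bytes only.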

	lxvb16x 11, 0, 14		# load partial block
	xxland	11, 11, 32+16

	# AES crypt partial
	xxlxor	32+15, 32+30, 0
	lwz	23, 240(6)		# n rounds
	addi	22, 23, -1		# loop - 1
	mtctr	22
	addi	10, 6, 16

__Loop_aes_pstate:
	lxv	32+1, 0(10)
	vcipher	15, 15, 1
	addi	10, 10, 16
	bdnz	__Loop_aes_pstate
	lxv	32+1, 0(10)		# last round key
	vcipherlast 15, 15, 1

	xxlxor	32+15, 32+15, 11
	vand	15, 15, 16

	# AES crypt output v15
	# Write partial
	li	10, 224
	stxvb16x 15+32, 10, 1		# write v15 to stack
	addi	10, 1, 223
	addi	12, 9, -1
	mtctr	5			# partial block len
__Write_partial:
	lbzu	22, 1(10)
	stbu	22, 1(12)
	bdnz	__Write_partial

	cmpdi	24, 0			# decrypt?
	bne	__Encrypt_partial
	xxlor	32+15, 11, 11		# decrypt using the input block
__Encrypt_partial:
	#vxor	15, 15, 0		# ^ previous hash
	#PPC_GHASH1x 0, 15

	add	14, 14, 5
	add	9, 9, 5
	std	5, 56(7)		# update partial
	sub	11, 11, 5
	li	5, 0			# done last byte

	#
	# Don't increase IV since this is the last partial.
	# It should get updated in gcm_update if there are no more data blocks.
	#vadduwm	30, 30, 31		# increase IV
	stxvb16x 32+30, 0, 7		# update IV
	li	10, 64
	stxvb16x 32+0, 0, 8		# update Xi
	stxvb16x 32+15, 10, 7		# update pblock
	blr
SYM_FUNC_END(__Process_partial)

################################################################################
# We come here to combine partial blocks and compute ghash.
#
# The partial block has to be shifted to the right location to encrypt/decrypt,
# and ghash is computed if combining with the previous partial block completes
# a full block.
# - Compute ghash for a full block. Clear Partial_len and pblock. Update IV.
#   Write Xi.
# - Don't compute ghash if not a full block.  gcm_update will take care of it
#   if it is the last block. Update Partial_len and pblock.
#
################################################################################
SYM_FUNC_START_LOCAL(__Combine_partial)

	ld	12, 56(7)
	mr	21, 5			# these bytes to be processed

	li	17, 0
	li	16, 16
	sub	22, 16, 12		# bytes to complete a block
	sub	17, 22, 5		# remaining bytes in a block
	cmpdi	5, 16
	ble	__Inp_msg_less16
	li	17, 0
	mr	21, 22
	b	__Combine_continue
__Inp_msg_less16:
	cmpd	22, 5
	bgt	__Combine_continue
	li	17, 0
	mr	21, 22			# these bytes to be processed

__Combine_continue:
	# load msg and shift to the proper location and mask
	vspltisb 16, -1
	sldi	15, 12, 3
	mtvsrdd	32+17, 0, 15
	vslo	16, 16, 17
	vsro	16, 16, 17
	sldi	15, 17, 3
	mtvsrdd	32+17, 0, 15
	vsro	16, 16, 17
	vslo	16, 16, 17		# mask
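	# The mask now has 0xff only in bytes [partial_len, partial_len + r21),
	# i.e. over the bytes this pass actually processes.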

	lxvb16x 32+19, 0, 14		# load partial block
	sldi	15, 12, 3
	mtvsrdd	32+17, 0, 15
	vsro	19, 19, 17		# 0x00..xxxx??..??
	sldi	15, 17, 3
	mtvsrdd	32+17, 0, 15
	vsro	19, 19, 17		# 0x00..xxxx
	vslo	19, 19, 17		# shift back to form 0x00..xxxx00..00

	# AES crypt partial
	xxlxor	32+15, 32+30, 0
	lwz	23, 240(6)	# n rounds
	addi	22, 23, -1	# loop - 1
	mtctr	22
	addi	10, 6, 16

__Loop_aes_cpstate:
	lxv	32+1, 0(10)
	vcipher	15, 15, 1
	addi	10, 10, 16
	bdnz	__Loop_aes_cpstate
	lxv	32+1, 0(10)		# last round key
	vcipherlast 15, 15, 1

	vxor	15, 15, 19
	vand	15, 15, 16

	# AES crypt output v15
	# Write partial
	li	10, 224
	stxvb16x 15+32, 10, 1		# write v15 to stack
	addi	10, 1, 223
	add	10, 10, 12		# add offset
	addi	15, 9, -1
	mtctr	21			# partial block len
__Write_combine_partial:
	lbzu	22, 1(10)
	stbu	22, 1(15)
	bdnz	__Write_combine_partial

	add	14, 14, 21
	add	11, 11, 21
	add	9, 9, 21
	sub	5, 5, 21

	# Encrypt/Decrypt?
	cmpdi	24, 0			# decrypt?
	bne	__Encrypt_combine_partial
	vmr	15, 19		# decrypt using the input block

__Encrypt_combine_partial:
	#
	# Update partial flag and combine ghash.
__Update_partial_ghash:
	li	10, 64
	lxvb16x 32+17, 10, 7		# load previous pblock
	add	12, 12, 21		# combined processed bytes
	vxor	15, 15, 17		# combined pblock

	cmpdi	12, 16
	beq	__Clear_partial_flag
	std	12, 56(7)		# update partial len
	stxvb16x 32+15, 10, 7		# update current pblock
	blr

__Clear_partial_flag:
	li	12, 0
	std	12, 56(7)
	# Update IV and ghash here
	vadduwm	30, 30, 31		# increase IV
	stxvb16x 32+30, 0, 7		# update IV

	# v15 is either (input block or encrypted block) ^ (AES state)
	vxor	15, 15, 0
	PPC_GHASH1x 0, 15
	stxvb16x 32+0, 10, 7		# update pblock for debug?
	stxvb16x 32+0, 0, 8		# update Xi
	blr
SYM_FUNC_END(__Combine_partial)

################################################################################
# gcm_update(iv, Xi) - compute last hash
#
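#    r3 - iv (state block: partial length at offset 56, pblock at offset 64)
#    r4 - Xi and hash keys
#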
################################################################################
SYM_FUNC_START(gcm_update)

	ld	10, 56(3)
	cmpdi	10, 0
	beq	__no_update

	lxvb16x	32, 0, 4	# load Xi
	# load Hash - h^4, h^3, h^2, h
	li	10, 32
	lxvd2x	2+32, 10, 4	# H Poly
	li	10, 48
	lxvd2x	3+32, 10, 4	# Hl
	li	10, 64
	lxvd2x	4+32, 10, 4	# H
	li	10, 80
	lxvd2x	5+32, 10, 4	# Hh

	addis	11, 2, permx@toc@ha
	addi	11, 11, permx@toc@l
	lxv	10, 0(11)	# vs10: vpermxor vector

	li	9, 64
	lxvb16x 32+6, 9, 3		# load pblock
	vxor	6, 6, 0

	vxor	1, 1, 1
	vpmsumd	12, 3, 6		# L
	vpmsumd	13, 4, 6		# M
	vpmsumd	14, 5, 6		# H
	vpmsumd	17, 12, 2		# reduction
	vsldoi	15, 13, 1, 8		# mL
	vsldoi	16, 1, 13, 8		# mH
	vxor	12, 12, 15		# L + mL
	vxor	14, 14, 16		# H + mH
	xxlor	32+15, 10, 10
	vpermxor 12, 12, 17, 15
	vsldoi	13, 12, 12, 8		# swap
	vpmsumd	12, 12, 2		# reduction
	vxor	13, 13, 14
	vxor	7, 12, 13

	#vxor	0, 0, 0
	#stxvb16x 32+0, 9, 3
	li	10, 0
	std	10, 56(3)
	stxvb16x 32+7, 0, 4

__no_update:
	blr
SYM_FUNC_END(gcm_update)

################################################################################
# aes_p10_gcm_encrypt (const void *inp, void *out, size_t len,
#               const char *rk, unsigned char iv[16], void *Xip);
#
#    r3 - inp
#    r4 - out
#    r5 - len
#    r6 - AES round keys
#    r7 - iv and other data
#    r8 - Xi, HPoly, hash keys
#
#    rounds is at offset 240 in rk
#    Xi is at 0 in gcm_table (Xip).
#
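# A hedged C-level view of a call (illustrative only; the variable names
# here are assumptions, not the kernel driver's):
#
#	/* rk:  expanded AES key, round count at byte offset 240 */
#	/* Xip: gcm_table with Xi at offset 0, then H Poly and hash keys */
#	nbytes = aes_p10_gcm_encrypt(src, dst, len, rk, iv, Xip);
#
# The return value (r3) is the processed byte count kept in r11, or 0 for
# an invalid (non-positive) message length.
#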
################################################################################
SYM_FUNC_START(aes_p10_gcm_encrypt)

	cmpdi	5, 0
	ble	__Invalid_msg_len

	SAVE_REGS
	LOAD_HASH_TABLE

	# initialize ICB: GHASH( IV ), IV - r7
	lxvb16x	30+32, 0, 7	# load IV  - v30

	mr	14, 3
	mr	9, 4

	# counter 1
	vxor	31, 31, 31
	vspltisb 22, 1
	vsldoi	31, 31, 22, 1	# counter 1
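	# v31 now holds 0x00...01: the 1 is shifted into the last byte, so
	# vadduwm below bumps only the rightmost 32-bit word (the counter)
	# of each state.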

	addis	11, 2, permx@toc@ha
	addi	11, 11, permx@toc@l
	lxv	10, 0(11)	# vs10: vpermxor vector
	li	11, 0

	# load 9 round keys to VSR
	lxv	0, 0(6)			# round key 0
	lxv	1, 16(6)		# round key 1
	lxv	2, 32(6)		# round key 2
	lxv	3, 48(6)		# round key 3
	lxv	4, 64(6)		# round key 4
	lxv	5, 80(6)		# round key 5
	lxv	6, 96(6)		# round key 6
	lxv	7, 112(6)		# round key 7
	lxv	8, 128(6)		# round key 8

	# load rounds - 10 (128), 12 (192), 14 (256)
	lwz	23, 240(6)		# n rounds
	li	24, 1			# encrypt

__Process_encrypt:
	#
	# Process different blocks
	#
	ld	12, 56(7)
	cmpdi	12, 0
	bgt	__Do_combine_enc
	cmpdi	5, 128
	blt	__Process_more_enc

#
# Process 8x AES/GCM blocks
#
__Process_8x_enc:
	# 8x blocks
	li	10, 128
	divdu	12, 5, 10	# n 128-byte blocks

	addi	12, 12, -1	# loop - 1

	vmr	15, 30		# first state: IV
	vadduwm	16, 15, 31	# state + counter
	vadduwm	17, 16, 31
	vadduwm	18, 17, 31
	vadduwm	19, 18, 31
	vadduwm	20, 19, 31
	vadduwm	21, 20, 31
	vadduwm	22, 21, 31
	xxlor	9, 32+22, 32+22	# save last state

	# vxor  state, state, w # addroundkey
	xxlor	32+29, 0, 0
	vxor	15, 15, 29		# IV + round key - add round key 0
	vxor	16, 16, 29
	vxor	17, 17, 29
	vxor	18, 18, 29
	vxor	19, 19, 29
	vxor	20, 20, 29
	vxor	21, 21, 29
	vxor	22, 22, 29

	li	15, 16
	li	16, 32
	li	17, 48
	li	18, 64
	li	19, 80
	li	20, 96
	li	21, 112

	#
	# Pre-compute the first 8 AES states and leave 1/3/5 more rounds
	# for the loop.
	#
	addi	22, 23, -9		# process 8 keys
	mtctr	22			# AES key loop
	addi	10, 6, 144

	LOOP_8AES_STATE			# process 8 AES keys

__PreLoop_aes_state:
	lxv	32+1, 0(10)		# round key
	AES_CIPHER_8x vcipher 15 1
	addi	10, 10, 16
	bdnz	__PreLoop_aes_state
	lxv	32+1, 0(10)		# last round key (v1)

	cmpdi	12, 0			# only one loop (8 blocks)
	beq	__Finish_ghash

#
# Loop 8x blocks and compute ghash
#
__Loop_8x_block_enc:
	vcipherlast     15, 15, 1
	vcipherlast     16, 16, 1
	vcipherlast     17, 17, 1
	vcipherlast     18, 18, 1
	vcipherlast     19, 19, 1
	vcipherlast     20, 20, 1
	vcipherlast     21, 21, 1
	vcipherlast     22, 22, 1

	lxvb16x	32+23, 0, 14	# load block
	lxvb16x	32+24, 15, 14	# load block
	lxvb16x	32+25, 16, 14	# load block
	lxvb16x	32+26, 17, 14	# load block
	lxvb16x	32+27, 18, 14	# load block
	lxvb16x	32+28, 19, 14	# load block
	lxvb16x	32+29, 20, 14	# load block
	lxvb16x	32+30, 21, 14	# load block
	addi	14, 14, 128

	vxor	15, 15, 23
	vxor	16, 16, 24
	vxor	17, 17, 25
	vxor	18, 18, 26
	vxor	19, 19, 27
	vxor	20, 20, 28
	vxor	21, 21, 29
	vxor	22, 22, 30

	stxvb16x 47, 0, 9	# store output
	stxvb16x 48, 15, 9	# store output
	stxvb16x 49, 16, 9	# store output
	stxvb16x 50, 17, 9	# store output
	stxvb16x 51, 18, 9	# store output
	stxvb16x 52, 19, 9	# store output
	stxvb16x 53, 20, 9	# store output
	stxvb16x 54, 21, 9	# store output
	addi	9, 9, 128

	# ghash here
	vxor	15, 15, 0
	PPC_GHASH4x 0, 15, 16, 17, 18

	vxor	19, 19, 0
	PPC_GHASH4x 0, 19, 20, 21, 22
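	# The digest in v0 chains through both 4x passes above, covering
	# all eight blocks of this iteration.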

	xxlor	32+15, 9, 9		# last state
	vadduwm 15, 15, 31		# state + counter
	vadduwm 16, 15, 31
	vadduwm 17, 16, 31
	vadduwm 18, 17, 31
	vadduwm 19, 18, 31
	vadduwm 20, 19, 31
	vadduwm 21, 20, 31
	vadduwm 22, 21, 31
	xxlor	9, 32+22, 32+22		# save last state

	xxlor	32+27, 0, 0		# restore roundkey 0
	vxor	15, 15, 27		# IV + round key - add round key 0
	vxor	16, 16, 27
	vxor	17, 17, 27
	vxor	18, 18, 27
	vxor	19, 19, 27
	vxor	20, 20, 27
	vxor	21, 21, 27
	vxor	22, 22, 27

	addi    5, 5, -128
	addi    11, 11, 128

	LOOP_8AES_STATE			# process 8 AES keys
	mtctr	22			# AES key loop
	addi	10, 6, 144
__LastLoop_aes_state:
	lxv	32+1, 0(10)		# round key
	AES_CIPHER_8x vcipher 15 1
	addi	10, 10, 16
	bdnz	__LastLoop_aes_state
	lxv	32+1, 0(10)		# last round key (v1)

	addi	12, 12, -1
	cmpdi	12, 0
	bne	__Loop_8x_block_enc

__Finish_ghash:
	vcipherlast     15, 15, 1
	vcipherlast     16, 16, 1
	vcipherlast     17, 17, 1
	vcipherlast     18, 18, 1
	vcipherlast     19, 19, 1
	vcipherlast     20, 20, 1
	vcipherlast     21, 21, 1
	vcipherlast     22, 22, 1

	lxvb16x	32+23, 0, 14	# load block
	lxvb16x	32+24, 15, 14	# load block
	lxvb16x	32+25, 16, 14	# load block
	lxvb16x	32+26, 17, 14	# load block
	lxvb16x	32+27, 18, 14	# load block
	lxvb16x	32+28, 19, 14	# load block
	lxvb16x	32+29, 20, 14	# load block
	lxvb16x	32+30, 21, 14	# load block
	addi	14, 14, 128

	vxor	15, 15, 23
	vxor	16, 16, 24
	vxor	17, 17, 25
	vxor	18, 18, 26
	vxor	19, 19, 27
	vxor	20, 20, 28
	vxor	21, 21, 29
	vxor	22, 22, 30

	stxvb16x 47, 0, 9	# store output
	stxvb16x 48, 15, 9	# store output
	stxvb16x 49, 16, 9	# store output
	stxvb16x 50, 17, 9	# store output
	stxvb16x 51, 18, 9	# store output
	stxvb16x 52, 19, 9	# store output
	stxvb16x 53, 20, 9	# store output
	stxvb16x 54, 21, 9	# store output
	addi	9, 9, 128

	vxor	15, 15, 0
	PPC_GHASH4x 0, 15, 16, 17, 18

	vxor	19, 19, 0
	PPC_GHASH4x 0, 19, 20, 21, 22

	xxlor	30+32, 9, 9		# last ctr
	vadduwm	30, 30, 31		# increase ctr
	stxvb16x 32+30, 0, 7		# update IV
	stxvb16x 32+0, 0, 8		# update Xi

	addi    5, 5, -128
	addi    11, 11, 128

	#
	# Done 8x blocks
	#

	cmpdi   5, 0
	beq     aes_gcm_out

__Process_more_enc:
	li	24, 1			# encrypt
	bl	aes_gcm_crypt_1x
	cmpdi   5, 0
	beq     aes_gcm_out

	bl	__Process_partial
	cmpdi   5, 0
	beq     aes_gcm_out
__Do_combine_enc:
	bl	__Combine_partial
	cmpdi	5, 0
	bgt	__Process_encrypt
	b	aes_gcm_out

SYM_FUNC_END(aes_p10_gcm_encrypt)

################################################################################
# aes_p10_gcm_decrypt (const void *inp, void *out, size_t len,
#               const char *rk, unsigned char iv[16], void *Xip);
# 8x Decrypt
#
################################################################################
SYM_FUNC_START(aes_p10_gcm_decrypt)

	cmpdi	5, 0
	ble	__Invalid_msg_len

	SAVE_REGS
	LOAD_HASH_TABLE

	# initialize ICB: GHASH( IV ), IV - r7
	lxvb16x	30+32, 0, 7	# load IV  - v30

	mr	14, 3
	mr	9, 4

	# counter 1
	vxor	31, 31, 31
	vspltisb 22, 1
	vsldoi	31, 31, 22, 1	# counter 1

	addis	11, 2, permx@toc@ha
	addi	11, 11, permx@toc@l
	lxv	10, 0(11)	# vs10: vpermxor vector
	li	11, 0

	# load 9 round keys to VSR
	lxv	0, 0(6)			# round key 0
	lxv	1, 16(6)		# round key 1
	lxv	2, 32(6)		# round key 2
	lxv	3, 48(6)		# round key 3
	lxv	4, 64(6)		# round key 4
	lxv	5, 80(6)		# round key 5
	lxv	6, 96(6)		# round key 6
	lxv	7, 112(6)		# round key 7
	lxv	8, 128(6)		# round key 8

	# load rounds - 10 (128), 12 (192), 14 (256)
	lwz	23, 240(6)		# n rounds
	li	24, 0			# decrypt

__Process_decrypt:
	#
	# Process different blocks
	#
	ld	12, 56(7)
	cmpdi	12, 0
	bgt	__Do_combine_dec
	cmpdi	5, 128
	blt	__Process_more_dec

#
# Process 8x AES/GCM blocks
#
__Process_8x_dec:
	# 8x blocks
	li	10, 128
	divdu	12, 5, 10	# n 128-byte blocks

	addi	12, 12, -1	# loop - 1

	vmr	15, 30		# first state: IV
	vadduwm	16, 15, 31	# state + counter
	vadduwm	17, 16, 31
	vadduwm	18, 17, 31
	vadduwm	19, 18, 31
	vadduwm	20, 19, 31
	vadduwm	21, 20, 31
	vadduwm	22, 21, 31
	xxlor	9, 32+22, 32+22	# save last state

	# vxor  state, state, w # addroundkey
	xxlor	32+29, 0, 0
	vxor	15, 15, 29		# IV + round key - add round key 0
	vxor	16, 16, 29
	vxor	17, 17, 29
	vxor	18, 18, 29
	vxor	19, 19, 29
	vxor	20, 20, 29
	vxor	21, 21, 29
	vxor	22, 22, 29

	li	15, 16
	li	16, 32
	li	17, 48
	li	18, 64
	li	19, 80
	li	20, 96
	li	21, 112

	#
	# Pre-compute the first 8 AES states and leave 1/3/5 more rounds
	# for the loop.
	#
	addi	22, 23, -9		# process 8 keys
	mtctr	22			# AES key loop
	addi	10, 6, 144

	LOOP_8AES_STATE			# process 8 AES keys

__PreLoop_aes_state_dec:
	lxv	32+1, 0(10)		# round key
	AES_CIPHER_8x vcipher 15 1
	addi	10, 10, 16
	bdnz	__PreLoop_aes_state_dec
	lxv	32+1, 0(10)		# last round key (v1)

	cmpdi	12, 0			# only one loop (8 blocks)
	beq	__Finish_ghash_dec

#
# Loop 8x blocks and compute ghash
#
__Loop_8x_block_dec:
	vcipherlast     15, 15, 1
	vcipherlast     16, 16, 1
	vcipherlast     17, 17, 1
	vcipherlast     18, 18, 1
	vcipherlast     19, 19, 1
	vcipherlast     20, 20, 1
	vcipherlast     21, 21, 1
	vcipherlast     22, 22, 1

	lxvb16x	32+23, 0, 14	# load block
	lxvb16x	32+24, 15, 14	# load block
	lxvb16x	32+25, 16, 14	# load block
	lxvb16x	32+26, 17, 14	# load block
	lxvb16x	32+27, 18, 14	# load block
	lxvb16x	32+28, 19, 14	# load block
	lxvb16x	32+29, 20, 14	# load block
	lxvb16x	32+30, 21, 14	# load block
	addi	14, 14, 128

	vxor	15, 15, 23
	vxor	16, 16, 24
	vxor	17, 17, 25
	vxor	18, 18, 26
	vxor	19, 19, 27
	vxor	20, 20, 28
	vxor	21, 21, 29
	vxor	22, 22, 30

	stxvb16x 47, 0, 9	# store output
	stxvb16x 48, 15, 9	# store output
	stxvb16x 49, 16, 9	# store output
	stxvb16x 50, 17, 9	# store output
	stxvb16x 51, 18, 9	# store output
	stxvb16x 52, 19, 9	# store output
	stxvb16x 53, 20, 9	# store output
	stxvb16x 54, 21, 9	# store output

	addi	9, 9, 128

	vmr	15, 23
	vmr	16, 24
	vmr	17, 25
	vmr	18, 26
	vmr	19, 27
	vmr	20, 28
	vmr	21, 29
	vmr	22, 30

	# ghash here
	vxor	15, 15, 0
	PPC_GHASH4x 0, 15, 16, 17, 18

	vxor	19, 19, 0
	PPC_GHASH4x 0, 19, 20, 21, 22

	xxlor	32+15, 9, 9		# last state
	vadduwm 15, 15, 31		# state + counter
	vadduwm 16, 15, 31
	vadduwm 17, 16, 31
	vadduwm 18, 17, 31
	vadduwm 19, 18, 31
	vadduwm 20, 19, 31
	vadduwm 21, 20, 31
	vadduwm 22, 21, 31
	xxlor	9, 32+22, 32+22		# save last state

	xxlor	32+27, 0, 0		# restore roundkey 0
	vxor	15, 15, 27		# IV + round key - add round key 0
	vxor	16, 16, 27
	vxor	17, 17, 27
	vxor	18, 18, 27
	vxor	19, 19, 27
	vxor	20, 20, 27
	vxor	21, 21, 27
	vxor	22, 22, 27

	addi    5, 5, -128
	addi    11, 11, 128

	LOOP_8AES_STATE			# process 8 AES keys
	mtctr	22			# AES key loop
	addi	10, 6, 144
__LastLoop_aes_state_dec:
	lxv	32+1, 0(10)		# round key
	AES_CIPHER_8x vcipher 15 1
	addi	10, 10, 16
	bdnz	__LastLoop_aes_state_dec
	lxv	32+1, 0(10)		# last round key (v1)

	addi	12, 12, -1
	cmpdi	12, 0
	bne	__Loop_8x_block_dec

__Finish_ghash_dec:
	vcipherlast     15, 15, 1
	vcipherlast     16, 16, 1
	vcipherlast     17, 17, 1
	vcipherlast     18, 18, 1
	vcipherlast     19, 19, 1
	vcipherlast     20, 20, 1
	vcipherlast     21, 21, 1
	vcipherlast     22, 22, 1

	lxvb16x	32+23, 0, 14	# load block
	lxvb16x	32+24, 15, 14	# load block
	lxvb16x	32+25, 16, 14	# load block
	lxvb16x	32+26, 17, 14	# load block
	lxvb16x	32+27, 18, 14	# load block
	lxvb16x	32+28, 19, 14	# load block
	lxvb16x	32+29, 20, 14	# load block
	lxvb16x	32+30, 21, 14	# load block
	addi	14, 14, 128

	vxor	15, 15, 23
	vxor	16, 16, 24
	vxor	17, 17, 25
	vxor	18, 18, 26
	vxor	19, 19, 27
	vxor	20, 20, 28
	vxor	21, 21, 29
	vxor	22, 22, 30

	stxvb16x 47, 0, 9	# store output
	stxvb16x 48, 15, 9	# store output
	stxvb16x 49, 16, 9	# store output
	stxvb16x 50, 17, 9	# store output
	stxvb16x 51, 18, 9	# store output
	stxvb16x 52, 19, 9	# store output
	stxvb16x 53, 20, 9	# store output
	stxvb16x 54, 21, 9	# store output
	addi	9, 9, 128

	#vmr	15, 23
	vxor	15, 23, 0
	vmr	16, 24
	vmr	17, 25
	vmr	18, 26
	vmr	19, 27
	vmr	20, 28
	vmr	21, 29
	vmr	22, 30

	#vxor	15, 15, 0
	PPC_GHASH4x 0, 15, 16, 17, 18

	vxor	19, 19, 0
	PPC_GHASH4x 0, 19, 20, 21, 22

	xxlor	30+32, 9, 9		# last ctr
	vadduwm	30, 30, 31		# increase ctr
	stxvb16x 32+30, 0, 7		# update IV
	stxvb16x 32+0, 0, 8		# update Xi

	addi    5, 5, -128
	addi    11, 11, 128

	#
	# Done 8x blocks
	#

	cmpdi   5, 0
	beq     aes_gcm_out

__Process_more_dec:
	li	24, 0			# decrypt
	bl	aes_gcm_crypt_1x
	cmpdi   5, 0
	beq     aes_gcm_out

	bl	__Process_partial
	cmpdi   5, 0
	beq     aes_gcm_out
__Do_combine_dec:
	bl	__Combine_partial
	cmpdi	5, 0
	bgt	__Process_decrypt
	b	aes_gcm_out
SYM_FUNC_END(aes_p10_gcm_decrypt)

SYM_FUNC_START_LOCAL(aes_gcm_out)

	mr	3, 11			# return count

	RESTORE_REGS
	blr

__Invalid_msg_len:
	li	3, 0
	blr
SYM_FUNC_END(aes_gcm_out)

SYM_DATA_START_LOCAL(PERMX)
.align 4
# for vector permute and xor
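# (vs10, loaded from permx, is the vpermxor control used in PPC_GHASH1x
# and PPC_GHASH4x to swap doublewords and xor in the reduction value in
# one instruction)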
permx:
.long 0x4c5d6e7f, 0x08192a3b, 0xc4d5e6f7, 0x8091a2b3
SYM_DATA_END(permx)