/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Accelerated AES-GCM implementation with ARMv8 Crypto Extensions.
 *
 * Copyright (C) 2015 - 2017 Linaro Ltd.
 * Copyright (C) 2023 Google LLC. <ardb@google.com>
 */

#include <linux/linkage.h>
#include <asm/assembler.h>

	.arch		armv8-a
	.fpu		crypto-neon-fp-armv8

	SHASH		.req	q0
	T1		.req	q1
	XL		.req	q2
	XM		.req	q3
	XH		.req	q4
	IN1		.req	q4

	SHASH_L		.req	d0
	SHASH_H		.req	d1
	T1_L		.req	d2
	T1_H		.req	d3
	XL_L		.req	d4
	XL_H		.req	d5
	XM_L		.req	d6
	XM_H		.req	d7
	XH_L		.req	d8

	XH2		.req	q9

	MASK		.req	d28

	SHASH2_p64	.req	d31

	HH		.req	q10
	HH3		.req	q11
	HH4		.req	q12
	HH34		.req	q13

	HH_L		.req	d20
	HH_H		.req	d21
	HH3_L		.req	d22
	HH3_H		.req	d23
	HH4_L		.req	d24
	HH4_H		.req	d25
	HH34_L		.req	d26
	HH34_H		.req	d27
	SHASH2_H	.req	d29

	XL2		.req	q5
	XM2		.req	q6
	T2		.req	q7
	T3		.req	q8

	XL2_L		.req	d10
	XL2_H		.req	d11
	XM2_L		.req	d12
	XM2_H		.req	d13
	T3_L		.req	d16
	T3_H		.req	d17

	.text

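	/*
	 * Fold the 256-bit product in XH:XL (with the middle Karatsuba
	 * term still held in XM) back down to 128 bits, modulo the GHASH
	 * polynomial x^128 + x^7 + x^2 + x + 1. In the bit-reflected
	 * representation used throughout this file, the reduction takes
	 * two 64-bit carryless multiplications by the constant
	 * 0xe1 << 57 that the callers keep in MASK; the caller completes
	 * the fold by xor-ing XH back into XL afterwards.
	 */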
	.macro		__pmull_reduce_p64
	vmull.p64	T1, XL_L, MASK

	veor		XH_L, XH_L, XM_H
	vext.8		T1, T1, T1, #8
	veor		XL_H, XL_H, XM_L
	veor		T1, T1, XL

	vmull.p64	XL, T1_H, MASK
	.endm

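	/*
	 * ghash_update - hash (and optionally CTR en/decrypt) a sequence
	 * of 16-byte blocks from [r2] into the GHASH accumulator at [r1]
	 *
	 *   \enc       - optional macro name prefix: when given,
	 *                \enc\()_4x and \enc\()_1x are invoked on the
	 *                loaded blocks, so that it is the ciphertext that
	 *                ends up being hashed
	 *   \aggregate - process blocks four at a time, using the
	 *                precomputed powers of the hash key to fold all
	 *                four into the accumulator in one pass
	 *   \head      - hash an optional head block passed at [sp]
	 *                before the main input
	 *
	 * For each block C_i, the update computed is
	 *
	 *   X = (X xor C_i) * H  in GF(2^128)
	 *
	 * For reference, a bit-at-a-time C model of that multiplication
	 * is sketched below (illustrative only - the names are ours, not
	 * kernel API). The code in this file computes the same product
	 * with vmull.p64, using a Karatsuba split on the 64-bit halves.
	 *
	 *	typedef struct { uint64_t hi, lo; } be128;
	 *
	 *	static be128 gf128_mul(be128 x, be128 y)
	 *	{
	 *		be128 z = { 0, 0 };
	 *
	 *		for (int i = 0; i < 128; i++) {
	 *			// bit i of y, most significant bit first
	 *			uint64_t bit = i < 64 ?
	 *				(y.hi >> (63 - i)) & 1 :
	 *				(y.lo >> (127 - i)) & 1;
	 *			uint64_t lsb = x.lo & 1;
	 *
	 *			if (bit) {
	 *				z.hi ^= x.hi;
	 *				z.lo ^= x.lo;
	 *			}
	 *			// multiply x by the field generator: shift
	 *			// right, reducing mod the GHASH polynomial
	 *			x.lo = (x.lo >> 1) | (x.hi << 63);
	 *			x.hi >>= 1;
	 *			if (lsb)
	 *				x.hi ^= 0xe100000000000000ULL;
	 *		}
	 *		return z;
	 *	}
	 */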
	.macro		ghash_update, enc, aggregate=1, head=1
	vld1.64		{XL}, [r1]

	.if		\head
	/* do the head block first, if supplied */
	ldr		ip, [sp]
	teq		ip, #0
	beq		0f
	vld1.64		{T1}, [ip]
	teq		r0, #0
	b		3f
	.endif

0:	.if		\aggregate
	tst		r0, #3			// skip until #blocks is a
	bne		2f			// round multiple of 4

	vld1.8		{XL2-XM2}, [r2]!
1:	vld1.8		{T2-T3}, [r2]!

	.ifnb		\enc
	\enc\()_4x	XL2, XM2, T2, T3

	add		ip, r3, #16
	vld1.64		{HH}, [ip, :128]!
	vld1.64		{HH3-HH4}, [ip, :128]

	veor		SHASH2_p64, SHASH_L, SHASH_H
	veor		SHASH2_H, HH_L, HH_H
	veor		HH34_L, HH3_L, HH3_H
	veor		HH34_H, HH4_L, HH4_H

	vmov.i8		MASK, #0xe1
	vshl.u64	MASK, MASK, #57
	.endif

	vrev64.8	XL2, XL2
	vrev64.8	XM2, XM2

	subs		r0, r0, #4

	vext.8		T1, XL2, XL2, #8
	veor		XL2_H, XL2_H, XL_L
	veor		XL, XL, T1

	vrev64.8	T1, T3
	vrev64.8	T3, T2

	vmull.p64	XH, HH4_H, XL_H			// a1 * b1
	veor		XL2_H, XL2_H, XL_H
	vmull.p64	XL, HH4_L, XL_L			// a0 * b0
	vmull.p64	XM, HH34_H, XL2_H		// (a1 + a0)(b1 + b0)

	vmull.p64	XH2, HH3_H, XM2_L		// a1 * b1
	veor		XM2_L, XM2_L, XM2_H
	vmull.p64	XL2, HH3_L, XM2_H		// a0 * b0
	vmull.p64	XM2, HH34_L, XM2_L		// (a1 + a0)(b1 + b0)

	veor		XH, XH, XH2
	veor		XL, XL, XL2
	veor		XM, XM, XM2

	vmull.p64	XH2, HH_H, T3_L			// a1 * b1
	veor		T3_L, T3_L, T3_H
	vmull.p64	XL2, HH_L, T3_H			// a0 * b0
	vmull.p64	XM2, SHASH2_H, T3_L		// (a1 + a0)(b1 + b0)

	veor		XH, XH, XH2
	veor		XL, XL, XL2
	veor		XM, XM, XM2

	vmull.p64	XH2, SHASH_H, T1_L		// a1 * b1
	veor		T1_L, T1_L, T1_H
	vmull.p64	XL2, SHASH_L, T1_H		// a0 * b0
	vmull.p64	XM2, SHASH2_p64, T1_L		// (a1 + a0)(b1 + b0)

	veor		XH, XH, XH2
	veor		XL, XL, XL2
	veor		XM, XM, XM2

	beq		4f

	vld1.8		{XL2-XM2}, [r2]!

	veor		T1, XL, XH
	veor		XM, XM, T1

	__pmull_reduce_p64

	veor		T1, T1, XH
	veor		XL, XL, T1

	b		1b
	.endif

2:	vld1.8		{T1}, [r2]!

	.ifnb		\enc
	\enc\()_1x	T1
	veor		SHASH2_p64, SHASH_L, SHASH_H
	vmov.i8		MASK, #0xe1
	vshl.u64	MASK, MASK, #57
	.endif

	subs		r0, r0, #1

3:	/* multiply XL by SHASH in GF(2^128) */
	vrev64.8	T1, T1

	vext.8		IN1, T1, T1, #8
	veor		T1_L, T1_L, XL_H
	veor		XL, XL, IN1

	vmull.p64	XH, XL_H, SHASH_H		@ a1 * b1
	veor		T1, T1, XL
	vmull.p64	XL, XL_L, SHASH_L		@ a0 * b0
	vmull.p64	XM, T1_L, SHASH2_p64		@ (a1+a0)(b1+b0)

4:	veor		T1, XL, XH
	veor		XM, XM, T1

	__pmull_reduce_p64

	veor		T1, T1, XH
	veor		XL, XL, T1

	bne		0b
	.endm

	/*
	 * void pmull_ghash_update_p64(int blocks, u64 dg[], const char *src,
	 *			       u64 const h[4][2], const char *head)
	 */
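	/*
	 * h[] holds successive powers {H, H^2, H^3, H^4} of the hash
	 * key: the single block path needs only H itself, while the
	 * aggregated path uses all four powers to fold four blocks into
	 * the accumulator per iteration.
	 */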
ENTRY(pmull_ghash_update_p64)
	vld1.64		{SHASH}, [r3]!
	vld1.64		{HH}, [r3]!
	vld1.64		{HH3-HH4}, [r3]

	veor		SHASH2_p64, SHASH_L, SHASH_H
	veor		SHASH2_H, HH_L, HH_H
	veor		HH34_L, HH3_L, HH3_H
	veor		HH34_H, HH4_L, HH4_H

	vmov.i8		MASK, #0xe1
	vshl.u64	MASK, MASK, #57

	ghash_update
	vst1.64		{XL}, [r1]

	bx		lr
ENDPROC(pmull_ghash_update_p64)

	e0		.req	q9
	e1		.req	q10
	e2		.req	q11
	e3		.req	q12
	e0l		.req	d18
	e0h		.req	d19
	e2l		.req	d22
	e2h		.req	d23
	e3l		.req	d24
	e3h		.req	d25
	ctr		.req	q13
	ctr0		.req	d26
	ctr1		.req	d27

	ek0		.req	q14
	ek1		.req	q15

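	/*
	 * round - perform a single AES round on each block register in
	 * \regs: aese.8 adds the round key \rk and applies SubBytes and
	 * ShiftRows, aesmc.8 applies MixColumns
	 */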
	.macro		round, rk:req, regs:vararg
	.irp		r, \regs
	aese.8		\r, \rk
	aesmc.8		\r, \r
	.endr
	.endm

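	/*
	 * aes_encrypt - encrypt the blocks in \regs using the round keys
	 * at \rkp, dispatching on \rounds (10/12/14 for AES-128/192/256).
	 * The last round omits MixColumns: a plain aese.8 with the
	 * second-to-last key, followed by a xor with the final round key.
	 */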
	.macro		aes_encrypt, rkp, rounds, regs:vararg
	vld1.8		{ek0-ek1}, [\rkp, :128]!
	cmp		\rounds, #12
	blt		.L\@			// AES-128

	round		ek0, \regs
	vld1.8		{ek0}, [\rkp, :128]!
	round		ek1, \regs
	vld1.8		{ek1}, [\rkp, :128]!

	beq		.L\@			// AES-192

	round		ek0, \regs
	vld1.8		{ek0}, [\rkp, :128]!
	round		ek1, \regs
	vld1.8		{ek1}, [\rkp, :128]!

.L\@:	.rept		4
	round		ek0, \regs
	vld1.8		{ek0}, [\rkp, :128]!
	round		ek1, \regs
	vld1.8		{ek1}, [\rkp, :128]!
	.endr

	round		ek0, \regs
	vld1.8		{ek0}, [\rkp, :128]

	.irp		r, \regs
	aese.8		\r, ek1
	.endr
	.irp		r, \regs
	veor		\r, \r, ek0
	.endr
	.endm

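	/*
	 * pmull_aes_encrypt - generate one block of keystream in e0:
	 * assemble the counter block from the 12-byte IV at [r5] and the
	 * 32-bit counter in r7 (converted to big endian), post-increment
	 * the counter, and AES encrypt the result. In C terms, the block
	 * layout is simply (illustrative only)
	 *
	 *	memcpy(block, iv, 12);
	 *	put_unaligned_be32(counter++, block + 12);
	 *
	 * The AES round keys live at offset #64 in struct gcm_key, right
	 * after the four hash key powers.
	 */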
pmull_aes_encrypt:
	add		ip, r5, #4
	vld1.8		{ctr0}, [r5]		// load 12 byte IV
	vld1.8		{ctr1}, [ip]
	rev		r8, r7
	vext.8		ctr1, ctr1, ctr1, #4
	add		r7, r7, #1
	vmov.32		ctr1[1], r8
	vmov		e0, ctr

	add		ip, r3, #64
	aes_encrypt	ip, r6, e0
	bx		lr
ENDPROC(pmull_aes_encrypt)

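	/*
	 * pmull_aes_encrypt_4x - as above, but assemble and encrypt four
	 * consecutive counter blocks into e0-e3, advancing the counter
	 * in r7 by four
	 */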
pmull_aes_encrypt_4x:
	add		ip, r5, #4
	vld1.8		{ctr0}, [r5]
	vld1.8		{ctr1}, [ip]
	rev		r8, r7
	vext.8		ctr1, ctr1, ctr1, #4
	add		r7, r7, #1
	vmov.32		ctr1[1], r8
	rev		ip, r7
	vmov		e0, ctr
	add		r7, r7, #1
	vmov.32		ctr1[1], ip
	rev		r8, r7
	vmov		e1, ctr
	add		r7, r7, #1
	vmov.32		ctr1[1], r8
	rev		ip, r7
	vmov		e2, ctr
	add		r7, r7, #1
	vmov.32		ctr1[1], ip
	vmov		e3, ctr

	add		ip, r3, #64
	aes_encrypt	ip, r6, e0, e1, e2, e3
	bx		lr
ENDPROC(pmull_aes_encrypt_4x)

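	/*
	 * pmull_aes_encrypt_final - generate the keystream for the final,
	 * possibly partial block into e0, and encrypt the counter block
	 * with counter value #1 (E(K, J0) for a 96-bit IV) into e1; the
	 * latter masks the GHASH result to produce the authentication tag
	 */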
pmull_aes_encrypt_final:
	add		ip, r5, #4
	vld1.8		{ctr0}, [r5]
	vld1.8		{ctr1}, [ip]
	rev		r8, r7
	vext.8		ctr1, ctr1, ctr1, #4
	mov		r7, #1 << 24		// BE #1 for the tag
	vmov.32		ctr1[1], r8
	vmov		e0, ctr
	vmov.32		ctr1[1], r7
	vmov		e1, ctr

	add		ip, r3, #64
	aes_encrypt	ip, r6, e0, e1
	bx		lr
ENDPROC(pmull_aes_encrypt_final)

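	/*
	 * CTR en/decryption helpers, plugged into ghash_update via its
	 * \enc argument. Encryption xors the keystream into the input
	 * blocks in place, so the resulting ciphertext is what gets
	 * hashed; decryption leaves the ciphertext inputs intact for
	 * hashing and writes the recovered plaintext to [r4] instead.
	 */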
	.macro		enc_1x, in0
	bl		pmull_aes_encrypt
	veor		\in0, \in0, e0
	vst1.8		{\in0}, [r4]!
	.endm

	.macro		dec_1x, in0
	bl		pmull_aes_encrypt
	veor		e0, e0, \in0
	vst1.8		{e0}, [r4]!
	.endm

	.macro		enc_4x, in0, in1, in2, in3
	bl		pmull_aes_encrypt_4x

	veor		\in0, \in0, e0
	veor		\in1, \in1, e1
	veor		\in2, \in2, e2
	veor		\in3, \in3, e3

	vst1.8		{\in0-\in1}, [r4]!
	vst1.8		{\in2-\in3}, [r4]!
	.endm

	.macro		dec_4x, in0, in1, in2, in3
	bl		pmull_aes_encrypt_4x

	veor		e0, e0, \in0
	veor		e1, e1, \in1
	veor		e2, e2, \in2
	veor		e3, e3, \in3

	vst1.8		{e0-e1}, [r4]!
	vst1.8		{e2-e3}, [r4]!
	.endm

	/*
	 * void pmull_gcm_encrypt(int blocks, u64 dg[], const char *src,
	 *			  struct gcm_key const *k, char *dst,
	 *			  char *iv, int rounds, u32 counter)
	 */
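	/*
	 * Arguments beyond r0-r3 are passed on the stack: after the push
	 * below, they are picked up from [sp, #24] onwards as dst (r4),
	 * iv (r5), rounds (r6) and counter (r7). The entry points below
	 * all follow the same convention.
	 */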
ENTRY(pmull_gcm_encrypt)
	push		{r4-r8, lr}
	ldrd		r4, r5, [sp, #24]
	ldrd		r6, r7, [sp, #32]

	vld1.64		{SHASH}, [r3]

	ghash_update	enc, head=0
	vst1.64		{XL}, [r1]

	pop		{r4-r8, pc}
ENDPROC(pmull_gcm_encrypt)

	/*
	 * void pmull_gcm_decrypt(int blocks, u64 dg[], const char *src,
	 *			  struct gcm_key const *k, char *dst,
	 *			  char *iv, int rounds, u32 counter)
	 */
ENTRY(pmull_gcm_decrypt)
	push		{r4-r8, lr}
	ldrd		r4, r5, [sp, #24]
	ldrd		r6, r7, [sp, #32]

	vld1.64		{SHASH}, [r3]

	ghash_update	dec, head=0
	vst1.64		{XL}, [r1]

	pop		{r4-r8, pc}
ENDPROC(pmull_gcm_decrypt)

	/*
	 * void pmull_gcm_enc_final(int bytes, u64 dg[], char *tag,
	 *			    struct gcm_key const *k, char *head,
	 *			    char *iv, int rounds, u32 counter)
	 */
ENTRY(pmull_gcm_enc_final)
	push		{r4-r8, lr}
	ldrd		r4, r5, [sp, #24]
	ldrd		r6, r7, [sp, #32]

	bl		pmull_aes_encrypt_final

	cmp		r0, #0
	beq		.Lenc_final

	mov_l		ip, .Lpermute
	sub		r4, r4, #16
	add		r8, ip, r0
	add		ip, ip, #32
	add		r4, r4, r0
	sub		ip, ip, r0

	vld1.8		{e3}, [r8]		// permute vector for key stream
	vld1.8		{e2}, [ip]		// permute vector for ghash input

	vtbl.8		e3l, {e0}, e3l
	vtbl.8		e3h, {e0}, e3h

	vld1.8		{e0}, [r4]		// encrypt tail block
	veor		e0, e0, e3
	vst1.8		{e0}, [r4]

	vtbl.8		T1_L, {e0}, e2l
	vtbl.8		T1_H, {e0}, e2h

	vld1.64		{XL}, [r1]
.Lenc_final:
	vld1.64		{SHASH}, [r3, :128]
	vmov.i8		MASK, #0xe1
	veor		SHASH2_p64, SHASH_L, SHASH_H
	vshl.u64	MASK, MASK, #57
	mov		r0, #1
	bne		3f			// process head block first
	ghash_update	aggregate=0, head=0

	vrev64.8	XL, XL
	vext.8		XL, XL, XL, #8
	veor		XL, XL, e1

	sub		r2, r2, #16		// rewind src pointer
	vst1.8		{XL}, [r2]		// store tag

	pop		{r4-r8, pc}
ENDPROC(pmull_gcm_enc_final)

	/*
	 * int pmull_gcm_dec_final(int bytes, u64 dg[], char *tag,
	 *			   struct gcm_key const *k, char *head,
	 *			   char *iv, int rounds, u32 counter,
	 *			   const char *otag, int authsize)
	 */
ENTRY(pmull_gcm_dec_final)
	push		{r4-r8, lr}
	ldrd		r4, r5, [sp, #24]
	ldrd		r6, r7, [sp, #32]

	bl		pmull_aes_encrypt_final

	cmp		r0, #0
	beq		.Ldec_final

	mov_l		ip, .Lpermute
	sub		r4, r4, #16
	add		r8, ip, r0
	add		ip, ip, #32
	add		r4, r4, r0
	sub		ip, ip, r0

	vld1.8		{e3}, [r8]		// permute vector for key stream
	vld1.8		{e2}, [ip]		// permute vector for ghash input

	vtbl.8		e3l, {e0}, e3l
	vtbl.8		e3h, {e0}, e3h

	vld1.8		{e0}, [r4]

	vtbl.8		T1_L, {e0}, e2l
	vtbl.8		T1_H, {e0}, e2h

	veor		e0, e0, e3
	vst1.8		{e0}, [r4]

	vld1.64		{XL}, [r1]
.Ldec_final:
	vld1.64		{SHASH}, [r3]
	vmov.i8		MASK, #0xe1
	veor		SHASH2_p64, SHASH_L, SHASH_H
	vshl.u64	MASK, MASK, #57
	mov		r0, #1
	bne		3f			// process head block first
	ghash_update	aggregate=0, head=0

	vrev64.8	XL, XL
	vext.8		XL, XL, XL, #8
	veor		XL, XL, e1

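	/*
	 * Compare the computed tag with the expected tag in constant
	 * time: vceq/vmvn leave 0x00 in matching byte lanes and 0xff in
	 * mismatching ones, the permute vector at .Lpermute + authsize
	 * keeps only the authsize bytes that are part of the tag, and
	 * the pairwise signed minimum folds the outcome into the low 32
	 * bits of XL, so that r0 == 0 if and only if the tags match.
	 */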
	mov_l		ip, .Lpermute
	ldrd		r2, r3, [sp, #40]	// otag and authsize
	vld1.8		{T1}, [r2]
	add		ip, ip, r3
	vceq.i8		T1, T1, XL		// compare tags
	vmvn		T1, T1			// 0 for eq, -1 for ne

	vld1.8		{e0}, [ip]
	vtbl.8		XL_L, {T1}, e0l		// keep authsize bytes only
	vtbl.8		XL_H, {T1}, e0h

	vpmin.s8	XL_L, XL_L, XL_H	// take the minimum s8 across the vector
	vpmin.s8	XL_L, XL_L, XL_L
	vmov.32		r0, XL_L[0]		// fail if != 0x0

	pop		{r4-r8, pc}
ENDPROC(pmull_gcm_dec_final)

	.section	".rodata", "a", %progbits
	.align		5
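	/*
	 * Sliding 16-byte windows into this table provide the permute
	 * vectors for the tail block handling above: out-of-range 0xff
	 * entries make vtbl return zero. Loading from .Lpermute + bytes
	 * shifts the first 'bytes' keystream bytes up to the end of the
	 * window, so the xor only touches the partial tail; loading from
	 * .Lpermute + 32 - bytes moves the tail ciphertext back down and
	 * zero-pads it to form the final GHASH input. The same table
	 * masks the tag comparison in pmull_gcm_dec_final.
	 */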
.Lpermute:
	.byte		0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
	.byte		0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
	.byte		0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07
	.byte		0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f
	.byte		0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
	.byte		0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff