xref: /linux/arch/arm/crypto/ghash-ce-core.S (revision eb01fe7abbe2d0b38824d2a93fdb4cc3eaf2ccc1)
1/* SPDX-License-Identifier: GPL-2.0-only */
2/*
3 * Accelerated GHASH implementation with NEON/ARMv8 vmull.p8/64 instructions.
4 *
5 * Copyright (C) 2015 - 2017 Linaro Ltd.
6 * Copyright (C) 2023 Google LLC. <ardb@google.com>
7 */
8
9#include <linux/linkage.h>
10#include <asm/assembler.h>
11
// Select ARMv8-A and an FPU setting that includes NEON plus the Crypto
// Extensions; the vmull.p64, aese and aesmc instructions used later in this
// file depend on it.
12	.arch		armv8-a
13	.fpu		crypto-neon-fp-armv8
14
// Register aliases. In AArch32 NEON each q register overlays two d
// registers (qN = d(2N) and d(2N+1)), so several names below refer to the
// same storage. Code must not rely on both names of an overlapping pair
// being live at once.
//
// Main 128-bit working set. XH and IN1 are two names for q4.
15	SHASH		.req	q0
16	T1		.req	q1
17	XL		.req	q2
18	XM		.req	q3
19	XH		.req	q4
20	IN1		.req	q4
21
// 64-bit halves of the registers above (only the low half of XH is named).
22	SHASH_L		.req	d0
23	SHASH_H		.req	d1
24	T1_L		.req	d2
25	T1_H		.req	d3
26	XL_L		.req	d4
27	XL_H		.req	d5
28	XM_L		.req	d6
29	XM_H		.req	d7
30	XH_L		.req	d8
31
// Scratch halves used by __pmull_p8 (d10-d19, i.e. q5-q9).
32	t0l		.req	d10
33	t0h		.req	d11
34	t1l		.req	d12
35	t1h		.req	d13
36	t2l		.req	d14
37	t2h		.req	d15
38	t3l		.req	d16
39	t3h		.req	d17
40	t4l		.req	d18
41	t4h		.req	d19
42
// q views of the scratch registers. XH2 shares q9 with t4q.
43	t0q		.req	q5
44	t1q		.req	q6
45	t2q		.req	q7
46	t3q		.req	q8
47	t4q		.req	q9
48	XH2		.req	q9
49
// s1-s4 hold copies of SHASH_L/SHASH_H byte-rotated by 1..4; they are set up
// in pmull_ghash_update_p8 and passed to __pmull_p8 as its b1-b4 arguments.
50	s1l		.req	d20
51	s1h		.req	d21
52	s2l		.req	d22
53	s2h		.req	d23
54	s3l		.req	d24
55	s3h		.req	d25
56	s4l		.req	d26
57	s4h		.req	d27
58
// d28 is MASK (reduction constant) in the p64 code and SHASH2_p8
// (SHASH_L ^ SHASH_H) in the p8 code.
59	MASK		.req	d28
60	SHASH2_p8	.req	d28
61
// k16/k32/k48 are the masks used by __pmull_p8. d31 doubles as SHASH2_p64
// in the p64 code.
62	k16		.req	d29
63	k32		.req	d30
64	k48		.req	d31
65	SHASH2_p64	.req	d31
66
// Additional key values for the 4-way aggregated p64 path, loaded from the
// key structure after SHASH (presumably H^2, H^3, H^4 -- confirm against the
// C glue code). HH34 holds the folded halves of HH3 and HH4. q10-q13 overlap
// s1-s4 above.
67	HH		.req	q10
68	HH3		.req	q11
69	HH4		.req	q12
70	HH34		.req	q13
71
72	HH_L		.req	d20
73	HH_H		.req	d21
74	HH3_L		.req	d22
75	HH3_H		.req	d23
76	HH4_L		.req	d24
77	HH4_H		.req	d25
78	HH34_L		.req	d26
79	HH34_H		.req	d27
// SHASH2_H (HH_L ^ HH_H) shares d29 with k16.
80	SHASH2_H	.req	d29
81
// Input blocks / partial products of the 4-way path. q5-q8 overlap t0q-t3q.
82	XL2		.req	q5
83	XM2		.req	q6
84	T2		.req	q7
85	T3		.req	q8
86
87	XL2_L		.req	d10
88	XL2_H		.req	d11
89	XM2_L		.req	d12
90	XM2_H		.req	d13
91	T3_L		.req	d16
92	T3_H		.req	d17
93
94	.text
95
// 64x64 -> 128 bit polynomial multiply in one instruction: \rd = \rn * \rm.
// b1-b4 are accepted but unused so that this macro can be invoked with the
// same argument list as __pmull_p8 (see the __pmull_\pn uses in ghash_update).
96	.macro		__pmull_p64, rd, rn, rm, b1, b2, b3, b4
97	vmull.p64	\rd, \rn, \rm
98	.endm
99
100	/*
101	 * This implementation of 64x64 -> 128 bit polynomial multiplication
102	 * using vmull.p8 instructions (8x8 -> 16) is taken from the paper
103	 * "Fast Software Polynomial Multiplication on ARM Processors Using
104	 * the NEON Engine" by Danilo Camara, Conrado Gouvea, Julio Lopez and
105	 * Ricardo Dahab (https://hal.inria.fr/hal-01506572)
106	 *
107	 * It has been slightly tweaked for in-order performance, and to allow
108	 * 'rq' to overlap with 'ad' or 'bd'.
109	 */
// \rq = \ad * \bd (64x64 -> 128 bit polynomial product) built from 8x8 -> 16
// vmull.p8 multiplications.
//   \rq       q register receiving the product
//   \ad, \bd  d registers holding the two operands
//   b1..b4    d registers holding \bd byte-rotated by 1..4 (vext.8 #1..#4).
//             If an argument is left at its default (t4l / t3l) the rotation
//             is computed here; otherwise the supplied register is used
//             as is, which lets callers precompute them for a fixed \bd.
// Clobbers t0q-t4q. Expects k16, k32 and k48 to have been initialised by the
// caller (pmull_ghash_update_p8 sets them to 0xffff, 0xffffffff and
// 0xffffffffffff).
110	.macro		__pmull_p8, rq, ad, bd, b1=t4l, b2=t3l, b3=t4l, b4=t3l
111	vext.8		t0l, \ad, \ad, #1	@ A1
112	.ifc		\b1, t4l
113	vext.8		t4l, \bd, \bd, #1	@ B1
114	.endif
115	vmull.p8	t0q, t0l, \bd		@ F = A1*B
116	vext.8		t1l, \ad, \ad, #2	@ A2
117	vmull.p8	t4q, \ad, \b1		@ E = A*B1
118	.ifc		\b2, t3l
119	vext.8		t3l, \bd, \bd, #2	@ B2
120	.endif
121	vmull.p8	t1q, t1l, \bd		@ H = A2*B
122	vext.8		t2l, \ad, \ad, #3	@ A3
123	vmull.p8	t3q, \ad, \b2		@ G = A*B2
124	veor		t0q, t0q, t4q		@ L = E + F
125	.ifc		\b3, t4l
126	vext.8		t4l, \bd, \bd, #3	@ B3
127	.endif
128	vmull.p8	t2q, t2l, \bd		@ J = A3*B
129	veor		t0l, t0l, t0h		@ t0 = (L) (P0 + P1) << 8
130	veor		t1q, t1q, t3q		@ M = G + H
131	.ifc		\b4, t3l
132	vext.8		t3l, \bd, \bd, #4	@ B4
133	.endif
134	vmull.p8	t4q, \ad, \b3		@ I = A*B3
135	veor		t1l, t1l, t1h		@ t1 = (M) (P2 + P3) << 16
136	vmull.p8	t3q, \ad, \b4		@ K = A*B4
137	vand		t0h, t0h, k48
138	vand		t1h, t1h, k32
139	veor		t2q, t2q, t4q		@ N = I + J
140	veor		t0l, t0l, t0h
141	veor		t1l, t1l, t1h
142	veor		t2l, t2l, t2h		@ t2 = (N) (P4 + P5) << 24
143	vand		t2h, t2h, k16
144	veor		t3l, t3l, t3h		@ t3 = (K) (P6 + P7) << 32
145	vmov.i64	t3h, #0
// Byte-rotate each partial sum into position (the "<< 8/16/24/32" noted
// above) before accumulating everything into \rq.
146	vext.8		t0q, t0q, t0q, #15
147	veor		t2l, t2l, t2h
148	vext.8		t1q, t1q, t1q, #14
// \rq is written only here, after the last read of \ad and \bd, which is
// what allows \rq to overlap either operand.
149	vmull.p8	\rq, \ad, \bd		@ D = A*B
150	vext.8		t2q, t2q, t2q, #13
151	vext.8		t3q, t3q, t3q, #12
152	veor		t0q, t0q, t1q
153	veor		t2q, t2q, t3q
154	veor		\rq, \rq, t0q
155	veor		\rq, \rq, t2q
156	.endm
157
158	//
159	// PMULL (64x64->128) based reduction for CPUs that can do
160	// it in a single instruction.
161	//
// In:  XL / XM / XH = low / middle / high partial products,
//      MASK = reduction constant (0xe1 in every byte, then each 64-bit lane
//      shifted left by 57; set up by the callers).
// Folds XM into XL_H and XH_L and performs two multiplications by MASK.
// Out: XL and T1 hold intermediate values; every user in this file completes
//      the reduction with "veor T1, T1, XH; veor XL, XL, T1".
162	.macro		__pmull_reduce_p64
163	vmull.p64	T1, XL_L, MASK
164
165	veor		XH_L, XH_L, XM_H
166	vext.8		T1, T1, T1, #8
167	veor		XL_H, XL_H, XM_L
168	veor		T1, T1, XL
169
170	vmull.p64	XL, T1_H, MASK
171	.endm
172
173	//
174	// Alternative reduction for CPUs that lack support for the
175	// 64x64->128 PMULL instruction
176	//
// Used in the same position as __pmull_reduce_p64 (via __pmull_reduce_\pn)
// and followed by the same two veor instructions in the caller, but built
// from 64-bit shifts and XORs only; MASK is not used.
// In:  XL / XM / XH partial products.
// Clobbers T1 and T2 (T2 is q7, which is also t2q).
177	.macro		__pmull_reduce_p8
178	veor		XL_H, XL_H, XM_L
179	veor		XH_L, XH_L, XM_H
180
181	vshl.i64	T1, XL, #57
182	vshl.i64	T2, XL, #62
183	veor		T1, T1, T2
184	vshl.i64	T2, XL, #63
185	veor		T1, T1, T2
186	veor		XL_H, XL_H, T1_L
187	veor		XH_L, XH_L, T1_H
188
189	vshr.u64	T1, XL, #1
190	veor		XH, XH, XL
191	veor		XL, XL, T1
192	vshr.u64	T1, T1, #6
193	vshr.u64	XL, XL, #1
194	.endm
195
// ghash_update: GHASH loop, expanded once into each entry point.
//   \pn         p64 or p8; selects __pmull_\pn / __pmull_reduce_\pn
//   \enc        blank for plain GHASH; "enc" or "dec" to run \enc\()_4x /
//               \enc\()_1x on the input blocks before they are hashed
//   \aggregate  non-zero enables the 4-blocks-per-iteration path (emitted
//               for \pn == p64 only)
//   \head       non-zero: a pointer to an optional head block is read from
//               [sp], so sp must still be as it was on function entry
// Registers:
//   r0  number of blocks left          r1  address of the digest (loaded here;
//   r2  input pointer, post-incremented    the caller stores XL back)
//   r3  key pointer (read here only when \enc is non-blank)
// Local labels: 0 loop top, 1 four-block loop, 2 single block, 3 multiply by
// SHASH (also a branch target for code that has already put a block in T1),
// 4 reduction.
// The Z flag produced by teq/subs on r0 is consumed by the beq/bne further
// down; only NEON instructions sit in between, so the flags survive.
// NOTE(review): the digits fused in front of every statement (and in front of
// the numeric labels, e.g. "2090:" is label "0:") look like listing line
// numbers rather than assembly -- confirm against the pristine file.
196	.macro		ghash_update, pn, enc, aggregate=1, head=1
197	vld1.64		{XL}, [r1]
198
199	.if		\head
200	/* do the head block first, if supplied */
201	ldr		ip, [sp]
202	teq		ip, #0
203	beq		0f
204	vld1.64		{T1}, [ip]
// Set Z from the block count now: once the head block has been hashed, the
// bne at the end of the macro decides whether any input blocks follow.
205	teq		r0, #0
206	b		3f
207	.endif
208
2090:	.ifc		\pn, p64
210	.if		\aggregate
211	tst		r0, #3			// skip until #blocks is a
212	bne		2f			// round multiple of 4
213
// Four-block path: XL2, XM2, T2, T3 receive four consecutive input blocks.
214	vld1.8		{XL2-XM2}, [r2]!
2151:	vld1.8		{T2-T3}, [r2]!
216
217	.ifnb		\enc
218	\enc\()_4x	XL2, XM2, T2, T3
219
// The AES helpers use q9-q15, which overlap HH..HH34, MASK, SHASH2_H and
// SHASH2_p64, so those values are (re)built from the key after every call.
220	add		ip, r3, #16
221	vld1.64		{HH}, [ip, :128]!
222	vld1.64		{HH3-HH4}, [ip, :128]
223
224	veor		SHASH2_p64, SHASH_L, SHASH_H
225	veor		SHASH2_H, HH_L, HH_H
226	veor		HH34_L, HH3_L, HH3_H
227	veor		HH34_H, HH4_L, HH4_H
228
229	vmov.i8		MASK, #0xe1
230	vshl.u64	MASK, MASK, #57
231	.endif
232
233	vrev64.8	XL2, XL2
234	vrev64.8	XM2, XM2
235
236	subs		r0, r0, #4
237
// Fold the running digest XL into the first block.
238	vext.8		T1, XL2, XL2, #8
239	veor		XL2_H, XL2_H, XL_L
240	veor		XL, XL, T1
241
// Fourth block goes to T1, third block to T3 (both byte-reversed per lane).
242	vrev64.8	T1, T3
243	vrev64.8	T3, T2
244
// (digest ^ block 1) * HH4
245	vmull.p64	XH, HH4_H, XL_H			// a1 * b1
246	veor		XL2_H, XL2_H, XL_H
247	vmull.p64	XL, HH4_L, XL_L			// a0 * b0
248	vmull.p64	XM, HH34_H, XL2_H		// (a1 + a0)(b1 + b0)
249
// block 2 * HH3
250	vmull.p64	XH2, HH3_H, XM2_L		// a1 * b1
251	veor		XM2_L, XM2_L, XM2_H
252	vmull.p64	XL2, HH3_L, XM2_H		// a0 * b0
253	vmull.p64	XM2, HH34_L, XM2_L		// (a1 + a0)(b1 + b0)
254
255	veor		XH, XH, XH2
256	veor		XL, XL, XL2
257	veor		XM, XM, XM2
258
// block 3 * HH
259	vmull.p64	XH2, HH_H, T3_L			// a1 * b1
260	veor		T3_L, T3_L, T3_H
261	vmull.p64	XL2, HH_L, T3_H			// a0 * b0
262	vmull.p64	XM2, SHASH2_H, T3_L		// (a1 + a0)(b1 + b0)
263
264	veor		XH, XH, XH2
265	veor		XL, XL, XL2
266	veor		XM, XM, XM2
267
// block 4 * SHASH
268	vmull.p64	XH2, SHASH_H, T1_L		// a1 * b1
269	veor		T1_L, T1_L, T1_H
270	vmull.p64	XL2, SHASH_L, T1_H		// a0 * b0
271	vmull.p64	XM2, SHASH2_p64, T1_L		// (a1 + a0)(b1 + b0)
272
273	veor		XH, XH, XH2
274	veor		XL, XL, XL2
275	veor		XM, XM, XM2
276
// Z comes from the "subs r0, r0, #4" above: no blocks left, so finish with
// the shared reduction at label 4.
277	beq		4f
278
// More blocks follow: start loading the next group before reducing.
279	vld1.8		{XL2-XM2}, [r2]!
280
281	veor		T1, XL, XH
282	veor		XM, XM, T1
283
284	__pmull_reduce_p64
285
286	veor		T1, T1, XH
287	veor		XL, XL, T1
288
289	b		1b
290	.endif
291	.endif
292
// Single-block path.
2932:	vld1.8		{T1}, [r2]!
294
295	.ifnb		\enc
296	\enc\()_1x	T1
// SHASH2_p64 (d31) and MASK (d28) live in q15/q14, which the AES helper uses
// for round keys, so rebuild them after the call.
297	veor		SHASH2_p64, SHASH_L, SHASH_H
298	vmov.i8		MASK, #0xe1
299	vshl.u64	MASK, MASK, #57
300	.endif
301
302	subs		r0, r0, #1
303
3043:	/* multiply XL by SHASH in GF(2^128) */
305	vrev64.8	T1, T1
306
307	vext.8		IN1, T1, T1, #8
308	veor		T1_L, T1_L, XL_H
309	veor		XL, XL, IN1
310
// The s1h..s4l arguments are only consumed by __pmull_p8; __pmull_p64
// ignores them.
311	__pmull_\pn	XH, XL_H, SHASH_H, s1h, s2h, s3h, s4h	@ a1 * b1
312	veor		T1, T1, XL
313	__pmull_\pn	XL, XL_L, SHASH_L, s1l, s2l, s3l, s4l	@ a0 * b0
314	__pmull_\pn	XM, T1_L, SHASH2_\pn			@ (a1+a0)(b1+b0)
315
3164:	veor		T1, XL, XH
317	veor		XM, XM, T1
318
319	__pmull_reduce_\pn
320
321	veor		T1, T1, XH
322	veor		XL, XL, T1
323
// Loop while the last teq/subs/cmp left Z clear (blocks remain).
324	bne		0b
325	.endm
326
327	/*
328	 * void pmull_ghash_update(int blocks, u64 dg[], const char *src,
329	 *			   struct ghash_key const *k, const char *head)
330	 */
// vmull.p64 based GHASH update with 4-way aggregation.
// In (per the prototype above and the AAPCS): r0 = blocks, r1 = dg,
// r2 = src, r3 = k, [sp] = head (may be NULL; tested inside ghash_update).
// Loads four consecutive 16-byte values from k into SHASH, HH, HH3 and HH4
// (r3 is advanced by the post-incrementing loads), pre-folds their halves for
// the "(a1 + a0)(b1 + b0)" middle products and builds MASK, then runs the
// shared loop and writes the digest back to dg.
// Leaf function: nothing is pushed, so [sp] is still the fifth argument.
331ENTRY(pmull_ghash_update_p64)
332	vld1.64		{SHASH}, [r3]!
333	vld1.64		{HH}, [r3]!
334	vld1.64		{HH3-HH4}, [r3]
335
336	veor		SHASH2_p64, SHASH_L, SHASH_H
337	veor		SHASH2_H, HH_L, HH_H
338	veor		HH34_L, HH3_L, HH3_H
339	veor		HH34_H, HH4_L, HH4_H
340
341	vmov.i8		MASK, #0xe1
342	vshl.u64	MASK, MASK, #57
343
344	ghash_update	p64
345	vst1.64		{XL}, [r1]
346
347	bx		lr
348ENDPROC(pmull_ghash_update_p64)
349
// vmull.p8 based GHASH update, built on __pmull_p8 / __pmull_reduce_p8.
// Takes the same register/stack arguments as pmull_ghash_update_p64 but only
// reads the first 16 bytes of the key (SHASH).
// Precomputes everything __pmull_p8 needs that depends only on the key:
// SHASH2_p8 = SHASH_L ^ SHASH_H, the byte-rotated copies s1..s4 of both
// halves of SHASH, and the k16/k32/k48 masks.
350ENTRY(pmull_ghash_update_p8)
351	vld1.64		{SHASH}, [r3]
352	veor		SHASH2_p8, SHASH_L, SHASH_H
353
354	vext.8		s1l, SHASH_L, SHASH_L, #1
355	vext.8		s2l, SHASH_L, SHASH_L, #2
356	vext.8		s3l, SHASH_L, SHASH_L, #3
357	vext.8		s4l, SHASH_L, SHASH_L, #4
358	vext.8		s1h, SHASH_H, SHASH_H, #1
359	vext.8		s2h, SHASH_H, SHASH_H, #2
360	vext.8		s3h, SHASH_H, SHASH_H, #3
361	vext.8		s4h, SHASH_H, SHASH_H, #4
362
363	vmov.i64	k16, #0xffff
364	vmov.i64	k32, #0xffffffff
365	vmov.i64	k48, #0xffffffffffff
366
367	ghash_update	p8
368	vst1.64		{XL}, [r1]
369
370	bx		lr
371ENDPROC(pmull_ghash_update_p8)
372
// Registers used by the AES-CTR helpers below.
//   e0-e3    counter blocks on input to aes_encrypt, key stream on output
//   ctr      counter block being assembled (ctr0/ctr1 are its halves)
//   ek0/ek1  round keys being streamed in from memory
// These overlap names defined earlier in the file: q9 = t4q/XH2, q10-q13 =
// HH/HH3/HH4/HH34, q14 = d28:d29 (MASK, SHASH2_H), q15 = d30:d31
// (SHASH2_p64). Values held under those names do not survive a call into
// the AES helpers and are rebuilt by ghash_update afterwards.
373	e0		.req	q9
374	e1		.req	q10
375	e2		.req	q11
376	e3		.req	q12
377	e0l		.req	d18
378	e0h		.req	d19
379	e2l		.req	d22
380	e2h		.req	d23
381	e3l		.req	d24
382	e3h		.req	d25
383	ctr		.req	q13
384	ctr0		.req	d26
385	ctr1		.req	d27
386
387	ek0		.req	q14
388	ek1		.req	q15
389
// Apply one full AES round (aese followed by aesmc) with round key \rk to
// every register listed in \regs. \rk is mandatory (":req").
390	.macro		round, rk:req, regs:vararg
391	.irp		r, \regs
392	aese.8		\r, \rk
393	aesmc.8		\r, \r
394	.endr
395	.endm
396
// AES-encrypt, in place, every q register listed in \regs.
//   \rkp     core register pointing at the round keys; the loads carry a
//            :128 alignment hint and post-increment \rkp
//   \rounds  core register holding the number of rounds; it is compared with
//            12 to choose between the AES-128, AES-192 and longest paths
// Clobbers ek0, ek1, \rkp and the condition flags.
// ".L\@" is a label made unique per expansion by the assembler's \@ counter.
397	.macro		aes_encrypt, rkp, rounds, regs:vararg
398	vld1.8		{ek0-ek1}, [\rkp, :128]!
399	cmp		\rounds, #12
400	blt		.L\@			// AES-128
401
402	round		ek0, \regs
403	vld1.8		{ek0}, [\rkp, :128]!
404	round		ek1, \regs
405	vld1.8		{ek1}, [\rkp, :128]!
406
// Still using the flags from the cmp above: only NEON instructions have
// executed since.
407	beq		.L\@			// AES-192
408
409	round		ek0, \regs
410	vld1.8		{ek0}, [\rkp, :128]!
411	round		ek1, \regs
412	vld1.8		{ek1}, [\rkp, :128]!
413
// Common tail: eight rounds in the .rept, one more full round, then the
// final round (aese without aesmc) and the XOR with the last round key.
414.L\@:	.rept		4
415	round		ek0, \regs
416	vld1.8		{ek0}, [\rkp, :128]!
417	round		ek1, \regs
418	vld1.8		{ek1}, [\rkp, :128]!
419	.endr
420
421	round		ek0, \regs
422	vld1.8		{ek0}, [\rkp, :128]
423
424	.irp		r, \regs
425	aese.8		\r, ek1
426	.endr
427	.irp		r, \regs
428	veor		\r, \r, ek0
429	.endr
430	.endm
431
// Produce one block of CTR key stream in e0.
// In:  r3 = key pointer (round keys are read starting at r3 + 64),
//      r5 = pointer to the 12-byte IV, r6 = number of rounds,
//      r7 = 32-bit block counter in native byte order.
// Out: e0 = AES(IV || byte-reversed r7); r7 is incremented by one.
// Clobbers r8, ip, ctr, ek0/ek1 and the flags. Reached via "bl".
432pmull_aes_encrypt:
433	add		ip, r5, #4
434	vld1.8		{ctr0}, [r5]		// load 12 byte IV
// Second load covers IV bytes 4..11; the vext below rotates bytes 8..11 into
// the low word, and the high word is then replaced by the counter.
435	vld1.8		{ctr1}, [ip]
436	rev		r8, r7
437	vext.8		ctr1, ctr1, ctr1, #4
438	add		r7, r7, #1
439	vmov.32		ctr1[1], r8
440	vmov		e0, ctr
441
442	add		ip, r3, #64
443	aes_encrypt	ip, r6, e0
444	bx		lr
445ENDPROC(pmull_aes_encrypt)
446
// Produce four consecutive blocks of CTR key stream in e0..e3.
// In:  r3 = key pointer (round keys at r3 + 64), r5 = IV pointer,
//      r6 = number of rounds, r7 = block counter in native byte order.
// Out: e0..e3 = AES(IV || byte-reversed counter) for r7, r7+1, r7+2, r7+3;
//      r7 is advanced by four.
// Clobbers r8, ip, ctr, ek0/ek1 and the flags. r8 and ip alternate as the
// holder of the byte-reversed counter so that consecutive blocks do not wait
// on a single scratch register.
447pmull_aes_encrypt_4x:
448	add		ip, r5, #4
449	vld1.8		{ctr0}, [r5]
450	vld1.8		{ctr1}, [ip]
451	rev		r8, r7
452	vext.8		ctr1, ctr1, ctr1, #4
453	add		r7, r7, #1
454	vmov.32		ctr1[1], r8
455	rev		ip, r7
456	vmov		e0, ctr
457	add		r7, r7, #1
458	vmov.32		ctr1[1], ip
459	rev		r8, r7
460	vmov		e1, ctr
461	add		r7, r7, #1
462	vmov.32		ctr1[1], r8
463	rev		ip, r7
464	vmov		e2, ctr
465	add		r7, r7, #1
466	vmov.32		ctr1[1], ip
467	vmov		e3, ctr
468
469	add		ip, r3, #64
470	aes_encrypt	ip, r6, e0, e1, e2, e3
471	bx		lr
472ENDPROC(pmull_aes_encrypt_4x)
473
// Produce the two key stream blocks needed when finalising.
// In:  r3 = key pointer (round keys at r3 + 64), r5 = IV pointer,
//      r6 = number of rounds, r7 = current block counter.
// Out: e0 = AES(IV || byte-reversed r7), used for the partial tail block;
//      e1 = AES(IV || 00 00 00 01), XORed into the tag by the callers.
// r7 is overwritten with 1 << 24 rather than incremented.
// Clobbers r8, ip, ctr, ek0/ek1 and the flags.
474pmull_aes_encrypt_final:
475	add		ip, r5, #4
476	vld1.8		{ctr0}, [r5]
477	vld1.8		{ctr1}, [ip]
478	rev		r8, r7
479	vext.8		ctr1, ctr1, ctr1, #4
480	mov		r7, #1 << 24		// BE #1 for the tag
481	vmov.32		ctr1[1], r8
482	vmov		e0, ctr
483	vmov.32		ctr1[1], r7
484	vmov		e1, ctr
485
486	add		ip, r3, #64
487	aes_encrypt	ip, r6, e0, e1
488	bx		lr
489ENDPROC(pmull_aes_encrypt_final)
490
// Encrypt one block: \in0 ^= key stream, store the result through r4
// (post-incremented). \in0 is left holding the ciphertext, which is what
// ghash_update hashes next. Uses "bl", so lr is clobbered.
491	.macro		enc_1x, in0
492	bl		pmull_aes_encrypt
493	veor		\in0, \in0, e0
494	vst1.8		{\in0}, [r4]!
495	.endm
496
// Decrypt one block: the plaintext is formed in e0 and stored through r4
// (post-incremented). \in0 is left untouched, so ghash_update hashes the
// ciphertext. Uses "bl", so lr is clobbered.
497	.macro		dec_1x, in0
498	bl		pmull_aes_encrypt
499	veor		e0, e0, \in0
500	vst1.8		{e0}, [r4]!
501	.endm
502
// Encrypt four blocks in place in \in0..\in3 and store them through r4
// (post-incremented by 64 in total). The inputs are left holding ciphertext
// for hashing. The register-range stores require \in0/\in1 and \in2/\in3 to
// be consecutive q registers. Uses "bl", so lr is clobbered.
503	.macro		enc_4x, in0, in1, in2, in3
504	bl		pmull_aes_encrypt_4x
505
506	veor		\in0, \in0, e0
507	veor		\in1, \in1, e1
508	veor		\in2, \in2, e2
509	veor		\in3, \in3, e3
510
511	vst1.8		{\in0-\in1}, [r4]!
512	vst1.8		{\in2-\in3}, [r4]!
513	.endm
514
// Decrypt four blocks: plaintext is formed in e0..e3 and stored through r4
// (post-incremented by 64 in total). \in0..\in3 keep the ciphertext for
// hashing. Uses "bl", so lr is clobbered.
515	.macro		dec_4x, in0, in1, in2, in3
516	bl		pmull_aes_encrypt_4x
517
518	veor		e0, e0, \in0
519	veor		e1, e1, \in1
520	veor		e2, e2, \in2
521	veor		e3, e3, \in3
522
523	vst1.8		{e0-e1}, [r4]!
524	vst1.8		{e2-e3}, [r4]!
525	.endm
526
527	/*
528	 * void pmull_gcm_encrypt(int blocks, u64 dg[], const char *src,
529	 *			  struct gcm_key const *k, char *dst,
530	 *			  char *iv, int rounds, u32 counter)
531	 */
// Encrypt `blocks` full blocks and fold the ciphertext into the digest.
// In:  r0 = blocks, r1 = dg, r2 = src, r3 = k; dst, iv, rounds and counter
//      are on the stack. After the 24-byte push they sit at sp+24..sp+36 and
//      are loaded into r4, r5, r6 and r7 respectively.
// Only SHASH is loaded here: the other key-derived values share registers
// with the AES helpers and are rebuilt inside ghash_update after each call.
// r4-r8 and lr are saved because the helpers and macros modify them.
532ENTRY(pmull_gcm_encrypt)
533	push		{r4-r8, lr}
534	ldrd		r4, r5, [sp, #24]
535	ldrd		r6, r7, [sp, #32]
536
537	vld1.64		{SHASH}, [r3]
538
539	ghash_update	p64, enc, head=0
540	vst1.64		{XL}, [r1]
541
542	pop		{r4-r8, pc}
543ENDPROC(pmull_gcm_encrypt)
544
545	/*
546	 * void pmull_gcm_decrypt(int blocks, u64 dg[], const char *src,
547	 *			  struct gcm_key const *k, char *dst,
548	 *			  char *iv, int rounds, u32 counter)
549	 */
// Decrypt `blocks` full blocks and fold the ciphertext (the input) into the
// digest.
// In:  r0 = blocks, r1 = dg, r2 = src, r3 = k; dst, iv, rounds and counter
//      are on the stack. After the 24-byte push they sit at sp+24..sp+36 and
//      are loaded into r4, r5, r6 and r7 respectively.
// Identical to pmull_gcm_encrypt except that the dec_4x / dec_1x macros are
// selected.
550ENTRY(pmull_gcm_decrypt)
551	push		{r4-r8, lr}
552	ldrd		r4, r5, [sp, #24]
553	ldrd		r6, r7, [sp, #32]
554
555	vld1.64		{SHASH}, [r3]
556
557	ghash_update	p64, dec, head=0
558	vst1.64		{XL}, [r1]
559
560	pop		{r4-r8, pc}
561ENDPROC(pmull_gcm_decrypt)
562
563	/*
564	 * void pmull_gcm_enc_final(int bytes, u64 dg[], char *tag,
565	 *			    struct gcm_key const *k, char *head,
566	 *			    char *iv, int rounds, u32 counter)
567	 */
// Encrypt the trailing partial block (if any), hash it, hash one final
// 16-byte block read from the tag buffer, and write the tag.
// In:  r0 = bytes in the partial block (0 = none), r1 = dg, r2 = tag,
//      r3 = k; head, iv, rounds and counter are on the stack and are loaded
//      into r4, r5, r6 and r7 after the 24-byte push.
// The tag buffer is read as the last GHASH input block before being
// overwritten with the tag (presumably the caller stores the GCM lengths
// block there -- confirm against the C glue code).
// NOTE(review): the partial block is accessed as the 16 bytes ending at
// head + bytes, so memory before `head` must be readable and writable;
// confirm the caller guarantees this.
568ENTRY(pmull_gcm_enc_final)
569	push		{r4-r8, lr}
570	ldrd		r4, r5, [sp, #24]
571	ldrd		r6, r7, [sp, #32]
572
573	bl		pmull_aes_encrypt_final
574
// Flags from this cmp are also consumed by the "bne 3f" below and by the bne
// at the end of ghash_update; nothing in between modifies them.
575	cmp		r0, #0
576	beq		.Lenc_final
577
// r8 = .Lpermute + bytes, ip = .Lpermute + 32 - bytes,
// r4 = head + bytes - 16.
578	mov_l		ip, .Lpermute
579	sub		r4, r4, #16
580	add		r8, ip, r0
581	add		ip, ip, #32
582	add		r4, r4, r0
583	sub		ip, ip, r0
584
585	vld1.8		{e3}, [r8]		// permute vector for key stream
586	vld1.8		{e2}, [ip]		// permute vector for ghash input
587
// Move the first `bytes` key stream bytes to the end of e3; the 0xff table
// indices make vtbl write zero everywhere else.
588	vtbl.8		e3l, {e0}, e3l
589	vtbl.8		e3h, {e0}, e3h
590
591	vld1.8		{e0}, [r4]		// encrypt tail block
592	veor		e0, e0, e3
593	vst1.8		{e0}, [r4]
594
// T1 = the `bytes` ciphertext bytes moved to the front, zero padded.
595	vtbl.8		T1_L, {e0}, e2l
596	vtbl.8		T1_H, {e0}, e2h
597
598	vld1.64		{XL}, [r1]
599.Lenc_final:
600	vld1.64		{SHASH}, [r3, :128]
601	vmov.i8		MASK, #0xe1
602	veor		SHASH2_p64, SHASH_L, SHASH_H
603	vshl.u64	MASK, MASK, #57
604	mov		r0, #1
// "3f" is label 3 inside the ghash_update expansion on the next line: the
// partial block already in T1 is hashed first, then the loop runs once more
// for the block at r2.
605	bne		3f			// process head block first
606	ghash_update	p64, aggregate=0, head=0
607
608	vrev64.8	XL, XL
609	vext.8		XL, XL, XL, #8
610	veor		XL, XL, e1
611
612	sub		r2, r2, #16		// rewind src pointer
613	vst1.8		{XL}, [r2]		// store tag
614
615	pop		{r4-r8, pc}
616ENDPROC(pmull_gcm_enc_final)
617
618	/*
619	 * int pmull_gcm_dec_final(int bytes, u64 dg[], char *tag,
620	 *			   struct gcm_key const *k, char *head,
621	 *			   char *iv, int rounds, u32 counter,
622	 *			   const char *otag, int authsize)
623	 */
// Decrypt the trailing partial block (if any), hash its ciphertext, hash one
// final 16-byte block read from the tag buffer, then compare the computed
// tag with the expected one.
// In:  r0 = bytes in the partial block (0 = none), r1 = dg, r2 = tag,
//      r3 = k; head, iv, rounds and counter are on the stack and are loaded
//      into r4, r5, r6 and r7 after the 24-byte push; otag and authsize
//      follow them and are read from sp+40 / sp+44 later on.
// Out: r0 = 0 if the first `authsize` tag bytes match, non-zero otherwise.
// NOTE(review): the partial block is accessed as the 16 bytes ending at
// head + bytes, so memory before `head` must be readable and writable;
// confirm the caller guarantees this.
624ENTRY(pmull_gcm_dec_final)
625	push		{r4-r8, lr}
626	ldrd		r4, r5, [sp, #24]
627	ldrd		r6, r7, [sp, #32]
628
629	bl		pmull_aes_encrypt_final
630
// Flags from this cmp are also consumed by the "bne 3f" below and by the bne
// at the end of ghash_update; nothing in between modifies them.
631	cmp		r0, #0
632	beq		.Ldec_final
633
// r8 = .Lpermute + bytes, ip = .Lpermute + 32 - bytes,
// r4 = head + bytes - 16.
634	mov_l		ip, .Lpermute
635	sub		r4, r4, #16
636	add		r8, ip, r0
637	add		ip, ip, #32
638	add		r4, r4, r0
639	sub		ip, ip, r0
640
641	vld1.8		{e3}, [r8]		// permute vector for key stream
642	vld1.8		{e2}, [ip]		// permute vector for ghash input
643
644	vtbl.8		e3l, {e0}, e3l
645	vtbl.8		e3h, {e0}, e3h
646
647	vld1.8		{e0}, [r4]
648
// Gather the GHASH input from the ciphertext before it is decrypted below.
649	vtbl.8		T1_L, {e0}, e2l
650	vtbl.8		T1_H, {e0}, e2h
651
652	veor		e0, e0, e3
653	vst1.8		{e0}, [r4]
654
655	vld1.64		{XL}, [r1]
656.Ldec_final:
657	vld1.64		{SHASH}, [r3]
658	vmov.i8		MASK, #0xe1
659	veor		SHASH2_p64, SHASH_L, SHASH_H
660	vshl.u64	MASK, MASK, #57
661	mov		r0, #1
// "3f" is label 3 inside the ghash_update expansion on the next line.
662	bne		3f			// process head block first
663	ghash_update	p64, aggregate=0, head=0
664
665	vrev64.8	XL, XL
666	vext.8		XL, XL, XL, #8
667	veor		XL, XL, e1
668
669	mov_l		ip, .Lpermute
670	ldrd		r2, r3, [sp, #40]	// otag and authsize
671	vld1.8		{T1}, [r2]
672	add		ip, ip, r3
673	vceq.i8		T1, T1, XL		// compare tags
674	vmvn		T1, T1			// 0 for eq, -1 for ne
675
// The vector loaded from .Lpermute + authsize selects the first `authsize`
// comparison bytes; its 0xff indices make vtbl write zero for the rest.
676	vld1.8		{e0}, [ip]
677	vtbl.8		XL_L, {T1}, e0l		// keep authsize bytes only
678	vtbl.8		XL_H, {T1}, e0h
679
// Every byte is 0 or -1, so the signed minimum is -1 if any byte differed.
680	vpmin.s8	XL_L, XL_L, XL_H	// take the minimum s8 across the vector
681	vpmin.s8	XL_L, XL_L, XL_L
682	vmov.32		r0, XL_L[0]		// fail if != 0x0
683
684	pop		{r4-r8, pc}
685ENDPROC(pmull_gcm_dec_final)
686
// 48-byte lookup table: 16 x 0xff, the identity indices 0..15, 16 x 0xff.
// The *_final routines load 16 bytes from .Lpermute + n or
// .Lpermute + 32 - n to obtain a vtbl index vector that shifts a block by n
// bytes; the out-of-range index 0xff makes vtbl produce a zero byte.
// ".align 5" requests 2^5 = 32-byte alignment.
687	.section	".rodata", "a", %progbits
688	.align		5
689.Lpermute:
690	.byte		0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
691	.byte		0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
692	.byte		0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07
693	.byte		0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f
694	.byte		0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
695	.byte		0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
696