/* $FreeBSD$ */
/* Do not modify. This file is auto-generated from ghash-armv4.pl. */
#include "arm_arch.h"

.text
#if defined(__thumb2__) || defined(__clang__)
.syntax	unified
#define ldrplb  ldrbpl
#define ldrneb  ldrbne
#endif
#if defined(__thumb2__)
.thumb
#else
.code	32
#endif

.type	rem_4bit,%object
.align	5
rem_4bit:
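@ rem_4bit[i]: 16-bit reduction constants for the 4-bit table-driven GHASH;
@ folded into the top of Xi whenever a low nibble is shifted out.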
.short	0x0000,0x1C20,0x3840,0x2460
.short	0x7080,0x6CA0,0x48C0,0x54E0
.short	0xE100,0xFD20,0xD940,0xC560
.short	0x9180,0x8DA0,0xA9C0,0xB5E0
.size	rem_4bit,.-rem_4bit

.type	rem_4bit_get,%function
rem_4bit_get:
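@ Helper for gcm_gmult_4bit: leaves &rem_4bit in r2 and branches to
@ .Lrem_4bit_got.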
#if defined(__thumb2__)
	adr	r2,rem_4bit
#else
	sub	r2,pc,#8+32	@ &rem_4bit
#endif
	b	.Lrem_4bit_got
	nop
	nop
.size	rem_4bit_get,.-rem_4bit_get

.globl	gcm_ghash_4bit
.type	gcm_ghash_4bit,%function
.align	4
gcm_ghash_4bit:
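@ gcm_ghash_4bit(Xi, Htable, inp, len)
@ r0 = Xi (128-bit hash value), r1 = Htable (precomputed 4-bit table),
@ r2 = inp (input blocks), r3 = len (byte count, multiple of 16)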
#if defined(__thumb2__)
	adr	r12,rem_4bit
#else
	sub	r12,pc,#8+48		@ &rem_4bit
#endif
	add	r3,r2,r3		@ r3 to point at the end
	stmdb	sp!,{r3,r4,r5,r6,r7,r8,r9,r10,r11,lr}		@ save r3/end too

	ldmia	r12,{r4,r5,r6,r7,r8,r9,r10,r11}		@ copy rem_4bit ...
	stmdb	sp!,{r4,r5,r6,r7,r8,r9,r10,r11}		@ ... to stack

	ldrb	r12,[r2,#15]
	ldrb	r14,[r0,#15]
.Louter:
	eor	r12,r12,r14
	and	r14,r12,#0xf0
	and	r12,r12,#0x0f
	mov	r3,#14

	add	r7,r1,r12,lsl#4
	ldmia	r7,{r4,r5,r6,r7}	@ load Htbl[nlo]
	add	r11,r1,r14
	ldrb	r12,[r2,#14]

	and	r14,r4,#0xf		@ rem
	ldmia	r11,{r8,r9,r10,r11}	@ load Htbl[nhi]
	add	r14,r14,r14
	eor	r4,r8,r4,lsr#4
	ldrh	r8,[sp,r14]		@ rem_4bit[rem]
	eor	r4,r4,r5,lsl#28
	ldrb	r14,[r0,#14]
	eor	r5,r9,r5,lsr#4
	eor	r5,r5,r6,lsl#28
	eor	r6,r10,r6,lsr#4
	eor	r6,r6,r7,lsl#28
	eor	r7,r11,r7,lsr#4
	eor	r12,r12,r14
	and	r14,r12,#0xf0
	and	r12,r12,#0x0f
	eor	r7,r7,r8,lsl#16

.Linner:
	add	r11,r1,r12,lsl#4
	and	r12,r4,#0xf		@ rem
	subs	r3,r3,#1
	add	r12,r12,r12
	ldmia	r11,{r8,r9,r10,r11}	@ load Htbl[nlo]
	eor	r4,r8,r4,lsr#4
	eor	r4,r4,r5,lsl#28
	eor	r5,r9,r5,lsr#4
	eor	r5,r5,r6,lsl#28
	ldrh	r8,[sp,r12]		@ rem_4bit[rem]
	eor	r6,r10,r6,lsr#4
#ifdef	__thumb2__
	it	pl
#endif
	ldrplb	r12,[r2,r3]
	eor	r6,r6,r7,lsl#28
	eor	r7,r11,r7,lsr#4

	add	r11,r1,r14
	and	r14,r4,#0xf		@ rem
	eor	r7,r7,r8,lsl#16	@ ^= rem_4bit[rem]
	add	r14,r14,r14
	ldmia	r11,{r8,r9,r10,r11}	@ load Htbl[nhi]
	eor	r4,r8,r4,lsr#4
#ifdef	__thumb2__
	it	pl
#endif
	ldrplb	r8,[r0,r3]
	eor	r4,r4,r5,lsl#28
	eor	r5,r9,r5,lsr#4
	ldrh	r9,[sp,r14]
	eor	r5,r5,r6,lsl#28
	eor	r6,r10,r6,lsr#4
	eor	r6,r6,r7,lsl#28
#ifdef	__thumb2__
	it	pl
#endif
	eorpl	r12,r12,r8
	eor	r7,r11,r7,lsr#4
#ifdef	__thumb2__
	itt	pl
#endif
	andpl	r14,r12,#0xf0
	andpl	r12,r12,#0x0f
	eor	r7,r7,r9,lsl#16	@ ^= rem_4bit[rem]
	bpl	.Linner

	ldr	r3,[sp,#32]		@ re-load r3/end
	add	r2,r2,#16
	mov	r14,r4
#if __ARM_ARCH__>=7 && defined(__ARMEL__)
	rev	r4,r4
	str	r4,[r0,#12]
#elif defined(__ARMEB__)
	str	r4,[r0,#12]
#else
	mov	r9,r4,lsr#8
	strb	r4,[r0,#12+3]
	mov	r10,r4,lsr#16
	strb	r9,[r0,#12+2]
	mov	r11,r4,lsr#24
	strb	r10,[r0,#12+1]
	strb	r11,[r0,#12]
#endif
	cmp	r2,r3
#if __ARM_ARCH__>=7 && defined(__ARMEL__)
	rev	r5,r5
	str	r5,[r0,#8]
#elif defined(__ARMEB__)
	str	r5,[r0,#8]
#else
	mov	r9,r5,lsr#8
	strb	r5,[r0,#8+3]
	mov	r10,r5,lsr#16
	strb	r9,[r0,#8+2]
	mov	r11,r5,lsr#24
	strb	r10,[r0,#8+1]
	strb	r11,[r0,#8]
#endif

#ifdef __thumb2__
	it	ne
#endif
	ldrneb	r12,[r2,#15]
#if __ARM_ARCH__>=7 && defined(__ARMEL__)
	rev	r6,r6
	str	r6,[r0,#4]
#elif defined(__ARMEB__)
	str	r6,[r0,#4]
#else
	mov	r9,r6,lsr#8
	strb	r6,[r0,#4+3]
	mov	r10,r6,lsr#16
	strb	r9,[r0,#4+2]
	mov	r11,r6,lsr#24
	strb	r10,[r0,#4+1]
	strb	r11,[r0,#4]
#endif

#if __ARM_ARCH__>=7 && defined(__ARMEL__)
	rev	r7,r7
	str	r7,[r0,#0]
#elif defined(__ARMEB__)
	str	r7,[r0,#0]
#else
	mov	r9,r7,lsr#8
	strb	r7,[r0,#0+3]
	mov	r10,r7,lsr#16
	strb	r9,[r0,#0+2]
	mov	r11,r7,lsr#24
	strb	r10,[r0,#0+1]
	strb	r11,[r0,#0]
#endif

	bne	.Louter

	add	sp,sp,#36
#if __ARM_ARCH__>=5
	ldmia	sp!,{r4,r5,r6,r7,r8,r9,r10,r11,pc}
#else
	ldmia	sp!,{r4,r5,r6,r7,r8,r9,r10,r11,lr}
	tst	lr,#1
	moveq	pc,lr			@ be binary compatible with V4, yet
.word	0xe12fff1e			@ interoperable with Thumb ISA:-)
#endif
.size	gcm_ghash_4bit,.-gcm_ghash_4bit

.globl	gcm_gmult_4bit
.type	gcm_gmult_4bit,%function
gcm_gmult_4bit:
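@ gcm_gmult_4bit(Xi, Htable)
@ r0 = Xi (multiplied by H in place), r1 = Htable; rem_4bit_get leaves
@ &rem_4bit in r2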
	stmdb	sp!,{r4,r5,r6,r7,r8,r9,r10,r11,lr}
	ldrb	r12,[r0,#15]
	b	rem_4bit_get
.Lrem_4bit_got:
	and	r14,r12,#0xf0
	and	r12,r12,#0x0f
	mov	r3,#14

	add	r7,r1,r12,lsl#4
	ldmia	r7,{r4,r5,r6,r7}	@ load Htbl[nlo]
	ldrb	r12,[r0,#14]

	add	r11,r1,r14
	and	r14,r4,#0xf		@ rem
	ldmia	r11,{r8,r9,r10,r11}	@ load Htbl[nhi]
	add	r14,r14,r14
	eor	r4,r8,r4,lsr#4
	ldrh	r8,[r2,r14]	@ rem_4bit[rem]
	eor	r4,r4,r5,lsl#28
	eor	r5,r9,r5,lsr#4
	eor	r5,r5,r6,lsl#28
	eor	r6,r10,r6,lsr#4
	eor	r6,r6,r7,lsl#28
	eor	r7,r11,r7,lsr#4
	and	r14,r12,#0xf0
	eor	r7,r7,r8,lsl#16
	and	r12,r12,#0x0f

.Loop:
	add	r11,r1,r12,lsl#4
	and	r12,r4,#0xf		@ rem
	subs	r3,r3,#1
	add	r12,r12,r12
	ldmia	r11,{r8,r9,r10,r11}	@ load Htbl[nlo]
	eor	r4,r8,r4,lsr#4
	eor	r4,r4,r5,lsl#28
	eor	r5,r9,r5,lsr#4
	eor	r5,r5,r6,lsl#28
	ldrh	r8,[r2,r12]	@ rem_4bit[rem]
	eor	r6,r10,r6,lsr#4
#ifdef	__thumb2__
	it	pl
#endif
	ldrplb	r12,[r0,r3]
	eor	r6,r6,r7,lsl#28
	eor	r7,r11,r7,lsr#4

	add	r11,r1,r14
	and	r14,r4,#0xf		@ rem
	eor	r7,r7,r8,lsl#16	@ ^= rem_4bit[rem]
	add	r14,r14,r14
	ldmia	r11,{r8,r9,r10,r11}	@ load Htbl[nhi]
	eor	r4,r8,r4,lsr#4
	eor	r4,r4,r5,lsl#28
	eor	r5,r9,r5,lsr#4
	ldrh	r8,[r2,r14]	@ rem_4bit[rem]
	eor	r5,r5,r6,lsl#28
	eor	r6,r10,r6,lsr#4
	eor	r6,r6,r7,lsl#28
	eor	r7,r11,r7,lsr#4
#ifdef	__thumb2__
	itt	pl
#endif
	andpl	r14,r12,#0xf0
	andpl	r12,r12,#0x0f
	eor	r7,r7,r8,lsl#16	@ ^= rem_4bit[rem]
	bpl	.Loop
#if __ARM_ARCH__>=7 && defined(__ARMEL__)
	rev	r4,r4
	str	r4,[r0,#12]
#elif defined(__ARMEB__)
	str	r4,[r0,#12]
#else
	mov	r9,r4,lsr#8
	strb	r4,[r0,#12+3]
	mov	r10,r4,lsr#16
	strb	r9,[r0,#12+2]
	mov	r11,r4,lsr#24
	strb	r10,[r0,#12+1]
	strb	r11,[r0,#12]
#endif

#if __ARM_ARCH__>=7 && defined(__ARMEL__)
	rev	r5,r5
	str	r5,[r0,#8]
#elif defined(__ARMEB__)
	str	r5,[r0,#8]
#else
	mov	r9,r5,lsr#8
	strb	r5,[r0,#8+3]
	mov	r10,r5,lsr#16
	strb	r9,[r0,#8+2]
	mov	r11,r5,lsr#24
	strb	r10,[r0,#8+1]
	strb	r11,[r0,#8]
#endif

#if __ARM_ARCH__>=7 && defined(__ARMEL__)
	rev	r6,r6
	str	r6,[r0,#4]
#elif defined(__ARMEB__)
	str	r6,[r0,#4]
#else
	mov	r9,r6,lsr#8
	strb	r6,[r0,#4+3]
	mov	r10,r6,lsr#16
	strb	r9,[r0,#4+2]
	mov	r11,r6,lsr#24
	strb	r10,[r0,#4+1]
	strb	r11,[r0,#4]
#endif

#if __ARM_ARCH__>=7 && defined(__ARMEL__)
	rev	r7,r7
	str	r7,[r0,#0]
#elif defined(__ARMEB__)
	str	r7,[r0,#0]
#else
	mov	r9,r7,lsr#8
	strb	r7,[r0,#0+3]
	mov	r10,r7,lsr#16
	strb	r9,[r0,#0+2]
	mov	r11,r7,lsr#24
	strb	r10,[r0,#0+1]
	strb	r11,[r0,#0]
#endif

#if __ARM_ARCH__>=5
	ldmia	sp!,{r4,r5,r6,r7,r8,r9,r10,r11,pc}
#else
	ldmia	sp!,{r4,r5,r6,r7,r8,r9,r10,r11,lr}
	tst	lr,#1
	moveq	pc,lr			@ be binary compatible with V4, yet
.word	0xe12fff1e			@ interoperable with Thumb ISA:-)
#endif
.size	gcm_gmult_4bit,.-gcm_gmult_4bit
#if __ARM_MAX_ARCH__>=7
.arch	armv7-a
.fpu	neon

.globl	gcm_init_neon
.type	gcm_init_neon,%function
.align	4
gcm_init_neon:
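@ gcm_init_neon(Htable, H)
@ r0 = Htable (receives the "twisted" H used by the NEON routines below),
@ r1 = H (128-bit hash key)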
	vld1.64	d7,[r1]!		@ load H
	vmov.i8	q8,#0xe1
	vld1.64	d6,[r1]
	vshl.i64	d17,#57
	vshr.u64	d16,#63		@ t0=0xc2....01
	vdup.8	q9,d7[7]
	vshr.u64	d26,d6,#63
	vshr.s8	q9,#7			@ broadcast carry bit
	vshl.i64	q3,q3,#1
	vand	q8,q8,q9
	vorr	d7,d26		@ H<<<=1
	veor	q3,q3,q8		@ twisted H
	vstmia	r0,{q3}

	bx	lr					@ bx lr
.size	gcm_init_neon,.-gcm_init_neon

.globl	gcm_gmult_neon
.type	gcm_gmult_neon,%function
.align	4
gcm_gmult_neon:
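@ gcm_gmult_neon(Xi, Htable)
@ r0 = Xi (multiplied by H in place), r1 = Htable (twisted H from
@ gcm_init_neon)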
	vld1.64	d7,[r0]!		@ load Xi
	vld1.64	d6,[r0]!
	vmov.i64	d29,#0x0000ffffffffffff
	vldmia	r1,{d26,d27}	@ load twisted H
	vmov.i64	d30,#0x00000000ffffffff
#ifdef __ARMEL__
	vrev64.8	q3,q3
#endif
	vmov.i64	d31,#0x000000000000ffff
	veor	d28,d26,d27		@ Karatsuba pre-processing
	mov	r3,#16
	b	.Lgmult_neon
.size	gcm_gmult_neon,.-gcm_gmult_neon

.globl	gcm_ghash_neon
.type	gcm_ghash_neon,%function
.align	4
gcm_ghash_neon:
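@ gcm_ghash_neon(Xi, Htable, inp, len)
@ r0 = Xi, r1 = Htable (twisted H), r2 = inp (input blocks),
@ r3 = len (byte count, multiple of 16)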
	vld1.64	d1,[r0]!		@ load Xi
	vld1.64	d0,[r0]!
	vmov.i64	d29,#0x0000ffffffffffff
	vldmia	r1,{d26,d27}	@ load twisted H
	vmov.i64	d30,#0x00000000ffffffff
#ifdef __ARMEL__
	vrev64.8	q0,q0
#endif
	vmov.i64	d31,#0x000000000000ffff
	veor	d28,d26,d27		@ Karatsuba pre-processing

.Loop_neon:
	vld1.64	d7,[r2]!		@ load inp
	vld1.64	d6,[r2]!
#ifdef __ARMEL__
	vrev64.8	q3,q3
#endif
	veor	q3,q0			@ inp^=Xi
.Lgmult_neon:
	vext.8	d16, d26, d26, #1	@ A1
	vmull.p8	q8, d16, d6		@ F = A1*B
	vext.8	d0, d6, d6, #1	@ B1
	vmull.p8	q0, d26, d0		@ E = A*B1
	vext.8	d18, d26, d26, #2	@ A2
	vmull.p8	q9, d18, d6		@ H = A2*B
	vext.8	d22, d6, d6, #2	@ B2
	vmull.p8	q11, d26, d22		@ G = A*B2
	vext.8	d20, d26, d26, #3	@ A3
	veor	q8, q8, q0		@ L = E + F
	vmull.p8	q10, d20, d6		@ J = A3*B
	vext.8	d0, d6, d6, #3	@ B3
	veor	q9, q9, q11		@ M = G + H
	vmull.p8	q0, d26, d0		@ I = A*B3
	veor	d16, d16, d17	@ t0 = (L) (P0 + P1) << 8
	vand	d17, d17, d29
	vext.8	d22, d6, d6, #4	@ B4
	veor	d18, d18, d19	@ t1 = (M) (P2 + P3) << 16
	vand	d19, d19, d30
	vmull.p8	q11, d26, d22		@ K = A*B4
	veor	q10, q10, q0		@ N = I + J
	veor	d16, d16, d17
	veor	d18, d18, d19
	veor	d20, d20, d21	@ t2 = (N) (P4 + P5) << 24
	vand	d21, d21, d31
	vext.8	q8, q8, q8, #15
	veor	d22, d22, d23	@ t3 = (K) (P6 + P7) << 32
	vmov.i64	d23, #0
	vext.8	q9, q9, q9, #14
	veor	d20, d20, d21
	vmull.p8	q0, d26, d6		@ D = A*B
	vext.8	q11, q11, q11, #12
	vext.8	q10, q10, q10, #13
	veor	q8, q8, q9
	veor	q10, q10, q11
	veor	q0, q0, q8
	veor	q0, q0, q10
	veor	d6,d6,d7	@ Karatsuba pre-processing
	vext.8	d16, d28, d28, #1	@ A1
	vmull.p8	q8, d16, d6		@ F = A1*B
	vext.8	d2, d6, d6, #1	@ B1
	vmull.p8	q1, d28, d2		@ E = A*B1
	vext.8	d18, d28, d28, #2	@ A2
	vmull.p8	q9, d18, d6		@ H = A2*B
	vext.8	d22, d6, d6, #2	@ B2
	vmull.p8	q11, d28, d22		@ G = A*B2
	vext.8	d20, d28, d28, #3	@ A3
	veor	q8, q8, q1		@ L = E + F
	vmull.p8	q10, d20, d6		@ J = A3*B
	vext.8	d2, d6, d6, #3	@ B3
	veor	q9, q9, q11		@ M = G + H
	vmull.p8	q1, d28, d2		@ I = A*B3
	veor	d16, d16, d17	@ t0 = (L) (P0 + P1) << 8
	vand	d17, d17, d29
	vext.8	d22, d6, d6, #4	@ B4
	veor	d18, d18, d19	@ t1 = (M) (P2 + P3) << 16
	vand	d19, d19, d30
	vmull.p8	q11, d28, d22		@ K = A*B4
	veor	q10, q10, q1		@ N = I + J
	veor	d16, d16, d17
	veor	d18, d18, d19
	veor	d20, d20, d21	@ t2 = (N) (P4 + P5) << 24
	vand	d21, d21, d31
	vext.8	q8, q8, q8, #15
	veor	d22, d22, d23	@ t3 = (K) (P6 + P7) << 32
	vmov.i64	d23, #0
	vext.8	q9, q9, q9, #14
	veor	d20, d20, d21
	vmull.p8	q1, d28, d6		@ D = A*B
	vext.8	q11, q11, q11, #12
	vext.8	q10, q10, q10, #13
	veor	q8, q8, q9
	veor	q10, q10, q11
	veor	q1, q1, q8
	veor	q1, q1, q10
	vext.8	d16, d27, d27, #1	@ A1
	vmull.p8	q8, d16, d7		@ F = A1*B
	vext.8	d4, d7, d7, #1	@ B1
	vmull.p8	q2, d27, d4		@ E = A*B1
	vext.8	d18, d27, d27, #2	@ A2
	vmull.p8	q9, d18, d7		@ H = A2*B
	vext.8	d22, d7, d7, #2	@ B2
	vmull.p8	q11, d27, d22		@ G = A*B2
	vext.8	d20, d27, d27, #3	@ A3
	veor	q8, q8, q2		@ L = E + F
	vmull.p8	q10, d20, d7		@ J = A3*B
	vext.8	d4, d7, d7, #3	@ B3
	veor	q9, q9, q11		@ M = G + H
	vmull.p8	q2, d27, d4		@ I = A*B3
	veor	d16, d16, d17	@ t0 = (L) (P0 + P1) << 8
	vand	d17, d17, d29
	vext.8	d22, d7, d7, #4	@ B4
	veor	d18, d18, d19	@ t1 = (M) (P2 + P3) << 16
	vand	d19, d19, d30
	vmull.p8	q11, d27, d22		@ K = A*B4
	veor	q10, q10, q2		@ N = I + J
	veor	d16, d16, d17
	veor	d18, d18, d19
	veor	d20, d20, d21	@ t2 = (N) (P4 + P5) << 24
	vand	d21, d21, d31
	vext.8	q8, q8, q8, #15
	veor	d22, d22, d23	@ t3 = (K) (P6 + P7) << 32
	vmov.i64	d23, #0
	vext.8	q9, q9, q9, #14
	veor	d20, d20, d21
	vmull.p8	q2, d27, d7		@ D = A*B
	vext.8	q11, q11, q11, #12
	vext.8	q10, q10, q10, #13
	veor	q8, q8, q9
	veor	q10, q10, q11
	veor	q2, q2, q8
	veor	q2, q2, q10
	veor	q1,q1,q0		@ Karatsuba post-processing
	veor	q1,q1,q2
	veor	d1,d1,d2
	veor	d4,d4,d3	@ Xh|Xl - 256-bit result

	@ equivalent of reduction_avx from ghash-x86_64.pl
	vshl.i64	q9,q0,#57		@ 1st phase
	vshl.i64	q10,q0,#62
	veor	q10,q10,q9		@
	vshl.i64	q9,q0,#63
	veor	q10, q10, q9		@
	veor	d1,d1,d20	@
	veor	d4,d4,d21

	vshr.u64	q10,q0,#1		@ 2nd phase
	veor	q2,q2,q0
	veor	q0,q0,q10		@
	vshr.u64	q10,q10,#6
	vshr.u64	q0,q0,#1		@
	veor	q0,q0,q2		@
	veor	q0,q0,q10		@

	subs	r3,#16
	bne	.Loop_neon

#ifdef __ARMEL__
	vrev64.8	q0,q0
#endif
	sub	r0,#16
	vst1.64	d1,[r0]!		@ write out Xi
	vst1.64	d0,[r0]

	bx	lr					@ bx lr
.size	gcm_ghash_neon,.-gcm_ghash_neon
#endif
.byte	71,72,65,83,72,32,102,111,114,32,65,82,77,118,52,47,78,69,79,78,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
.align	2
.align	2