/* Do not modify. This file is auto-generated from ghash-armv4.pl. */
#include "arm_arch.h"

#if defined(__thumb2__) || defined(__clang__)
.syntax	unified
#define ldrplb  ldrbpl
#define ldrneb  ldrbne
#endif
#if defined(__thumb2__)
.thumb
#else
.code	32
#endif

.text

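@ rem_4bit: per-nibble reduction constants for the 4-bit table-driven GHASH.
@ Each 16-bit entry is folded into the top word of Xi after a 4-bit shift
@ (see the "ldrh ... @ rem_4bit[rem]" / "eor r7,r7,...,lsl#16" pairs below).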
.type	rem_4bit,%object
.align	5
rem_4bit:
.short	0x0000,0x1C20,0x3840,0x2460
.short	0x7080,0x6CA0,0x48C0,0x54E0
.short	0xE100,0xFD20,0xD940,0xC560
.short	0x9180,0x8DA0,0xA9C0,0xB5E0
.size	rem_4bit,.-rem_4bit

.type	rem_4bit_get,%function
rem_4bit_get:
#if defined(__thumb2__)
	adr	r2,rem_4bit
#else
	sub	r2,pc,#8+32	@ &rem_4bit
#endif
	b	.Lrem_4bit_got
	nop
	nop
.size	rem_4bit_get,.-rem_4bit_get

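@ gcm_ghash_4bit(Xi, Htable, inp, len), arguments as used in the body below:
@   r0 = Xi (16-byte hash value, updated in place), r1 = Htable (4-bit table),
@   r2 = inp (input blocks), r3 = len in bytes (one 16-byte block per .Louter pass)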
.globl	gcm_ghash_4bit
.type	gcm_ghash_4bit,%function
.align	4
gcm_ghash_4bit:
#if defined(__thumb2__)
	adr	r12,rem_4bit
#else
	sub	r12,pc,#8+48		@ &rem_4bit
#endif
	add	r3,r2,r3		@ r3 to point at the end
	stmdb	sp!,{r3,r4,r5,r6,r7,r8,r9,r10,r11,lr}		@ save r3/end too

	ldmia	r12,{r4,r5,r6,r7,r8,r9,r10,r11}		@ copy rem_4bit ...
	stmdb	sp!,{r4,r5,r6,r7,r8,r9,r10,r11}		@ ... to stack

	ldrb	r12,[r2,#15]
	ldrb	r14,[r0,#15]
.Louter:
	eor	r12,r12,r14
	and	r14,r12,#0xf0
	and	r12,r12,#0x0f
	mov	r3,#14

	add	r7,r1,r12,lsl#4
	ldmia	r7,{r4,r5,r6,r7}	@ load Htbl[nlo]
	add	r11,r1,r14
	ldrb	r12,[r2,#14]

	and	r14,r4,#0xf		@ rem
	ldmia	r11,{r8,r9,r10,r11}	@ load Htbl[nhi]
	add	r14,r14,r14
	eor	r4,r8,r4,lsr#4
	ldrh	r8,[sp,r14]		@ rem_4bit[rem]
	eor	r4,r4,r5,lsl#28
	ldrb	r14,[r0,#14]
	eor	r5,r9,r5,lsr#4
	eor	r5,r5,r6,lsl#28
	eor	r6,r10,r6,lsr#4
	eor	r6,r6,r7,lsl#28
	eor	r7,r11,r7,lsr#4
	eor	r12,r12,r14
	and	r14,r12,#0xf0
	and	r12,r12,#0x0f
	eor	r7,r7,r8,lsl#16

.Linner:
	add	r11,r1,r12,lsl#4
	and	r12,r4,#0xf		@ rem
	subs	r3,r3,#1
	add	r12,r12,r12
	ldmia	r11,{r8,r9,r10,r11}	@ load Htbl[nlo]
	eor	r4,r8,r4,lsr#4
	eor	r4,r4,r5,lsl#28
	eor	r5,r9,r5,lsr#4
	eor	r5,r5,r6,lsl#28
	ldrh	r8,[sp,r12]		@ rem_4bit[rem]
	eor	r6,r10,r6,lsr#4
#ifdef	__thumb2__
	it	pl
#endif
	ldrplb	r12,[r2,r3]
	eor	r6,r6,r7,lsl#28
	eor	r7,r11,r7,lsr#4

	add	r11,r1,r14
	and	r14,r4,#0xf		@ rem
	eor	r7,r7,r8,lsl#16	@ ^= rem_4bit[rem]
	add	r14,r14,r14
	ldmia	r11,{r8,r9,r10,r11}	@ load Htbl[nhi]
	eor	r4,r8,r4,lsr#4
#ifdef	__thumb2__
	it	pl
#endif
	ldrplb	r8,[r0,r3]
	eor	r4,r4,r5,lsl#28
	eor	r5,r9,r5,lsr#4
	ldrh	r9,[sp,r14]
	eor	r5,r5,r6,lsl#28
	eor	r6,r10,r6,lsr#4
	eor	r6,r6,r7,lsl#28
#ifdef	__thumb2__
	it	pl
#endif
	eorpl	r12,r12,r8
	eor	r7,r11,r7,lsr#4
#ifdef	__thumb2__
	itt	pl
#endif
	andpl	r14,r12,#0xf0
	andpl	r12,r12,#0x0f
	eor	r7,r7,r9,lsl#16	@ ^= rem_4bit[rem]
	bpl	.Linner

	ldr	r3,[sp,#32]		@ re-load r3/end
	add	r2,r2,#16
	mov	r14,r4
#if __ARM_ARCH__>=7 && defined(__ARMEL__)
	rev	r4,r4
	str	r4,[r0,#12]
#elif defined(__ARMEB__)
	str	r4,[r0,#12]
#else
	mov	r9,r4,lsr#8
	strb	r4,[r0,#12+3]
	mov	r10,r4,lsr#16
	strb	r9,[r0,#12+2]
	mov	r11,r4,lsr#24
	strb	r10,[r0,#12+1]
	strb	r11,[r0,#12]
#endif
	cmp	r2,r3
#if __ARM_ARCH__>=7 && defined(__ARMEL__)
	rev	r5,r5
	str	r5,[r0,#8]
#elif defined(__ARMEB__)
	str	r5,[r0,#8]
#else
	mov	r9,r5,lsr#8
	strb	r5,[r0,#8+3]
	mov	r10,r5,lsr#16
	strb	r9,[r0,#8+2]
	mov	r11,r5,lsr#24
	strb	r10,[r0,#8+1]
	strb	r11,[r0,#8]
#endif

#ifdef __thumb2__
	it	ne
#endif
	ldrneb	r12,[r2,#15]
#if __ARM_ARCH__>=7 && defined(__ARMEL__)
	rev	r6,r6
	str	r6,[r0,#4]
#elif defined(__ARMEB__)
	str	r6,[r0,#4]
#else
	mov	r9,r6,lsr#8
	strb	r6,[r0,#4+3]
	mov	r10,r6,lsr#16
	strb	r9,[r0,#4+2]
	mov	r11,r6,lsr#24
	strb	r10,[r0,#4+1]
	strb	r11,[r0,#4]
#endif

#if __ARM_ARCH__>=7 && defined(__ARMEL__)
	rev	r7,r7
	str	r7,[r0,#0]
#elif defined(__ARMEB__)
	str	r7,[r0,#0]
#else
	mov	r9,r7,lsr#8
	strb	r7,[r0,#0+3]
	mov	r10,r7,lsr#16
	strb	r9,[r0,#0+2]
	mov	r11,r7,lsr#24
	strb	r10,[r0,#0+1]
	strb	r11,[r0,#0]
#endif

	bne	.Louter

	add	sp,sp,#36
#if __ARM_ARCH__>=5
	ldmia	sp!,{r4,r5,r6,r7,r8,r9,r10,r11,pc}
#else
	ldmia	sp!,{r4,r5,r6,r7,r8,r9,r10,r11,lr}
	tst	lr,#1
	moveq	pc,lr			@ be binary compatible with V4, yet
.word	0xe12fff1e			@ interoperable with Thumb ISA:-)
#endif
.size	gcm_ghash_4bit,.-gcm_ghash_4bit

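@ gcm_gmult_4bit(Xi, Htable), arguments as used in the body below:
@   r0 = Xi (16-byte hash value, multiplied by H in place), r1 = Htable (4-bit table)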
.globl	gcm_gmult_4bit
.type	gcm_gmult_4bit,%function
gcm_gmult_4bit:
	stmdb	sp!,{r4,r5,r6,r7,r8,r9,r10,r11,lr}
	ldrb	r12,[r0,#15]
	b	rem_4bit_get
.Lrem_4bit_got:
	and	r14,r12,#0xf0
	and	r12,r12,#0x0f
	mov	r3,#14

	add	r7,r1,r12,lsl#4
	ldmia	r7,{r4,r5,r6,r7}	@ load Htbl[nlo]
	ldrb	r12,[r0,#14]

	add	r11,r1,r14
	and	r14,r4,#0xf		@ rem
	ldmia	r11,{r8,r9,r10,r11}	@ load Htbl[nhi]
	add	r14,r14,r14
	eor	r4,r8,r4,lsr#4
	ldrh	r8,[r2,r14]	@ rem_4bit[rem]
	eor	r4,r4,r5,lsl#28
	eor	r5,r9,r5,lsr#4
	eor	r5,r5,r6,lsl#28
	eor	r6,r10,r6,lsr#4
	eor	r6,r6,r7,lsl#28
	eor	r7,r11,r7,lsr#4
	and	r14,r12,#0xf0
	eor	r7,r7,r8,lsl#16
	and	r12,r12,#0x0f

.Loop:
	add	r11,r1,r12,lsl#4
	and	r12,r4,#0xf		@ rem
	subs	r3,r3,#1
	add	r12,r12,r12
	ldmia	r11,{r8,r9,r10,r11}	@ load Htbl[nlo]
	eor	r4,r8,r4,lsr#4
	eor	r4,r4,r5,lsl#28
	eor	r5,r9,r5,lsr#4
	eor	r5,r5,r6,lsl#28
	ldrh	r8,[r2,r12]	@ rem_4bit[rem]
	eor	r6,r10,r6,lsr#4
#ifdef	__thumb2__
	it	pl
#endif
	ldrplb	r12,[r0,r3]
	eor	r6,r6,r7,lsl#28
	eor	r7,r11,r7,lsr#4

	add	r11,r1,r14
	and	r14,r4,#0xf		@ rem
	eor	r7,r7,r8,lsl#16	@ ^= rem_4bit[rem]
	add	r14,r14,r14
	ldmia	r11,{r8,r9,r10,r11}	@ load Htbl[nhi]
	eor	r4,r8,r4,lsr#4
	eor	r4,r4,r5,lsl#28
	eor	r5,r9,r5,lsr#4
	ldrh	r8,[r2,r14]	@ rem_4bit[rem]
	eor	r5,r5,r6,lsl#28
	eor	r6,r10,r6,lsr#4
	eor	r6,r6,r7,lsl#28
	eor	r7,r11,r7,lsr#4
#ifdef	__thumb2__
	itt	pl
#endif
	andpl	r14,r12,#0xf0
	andpl	r12,r12,#0x0f
	eor	r7,r7,r8,lsl#16	@ ^= rem_4bit[rem]
	bpl	.Loop
#if __ARM_ARCH__>=7 && defined(__ARMEL__)
	rev	r4,r4
	str	r4,[r0,#12]
#elif defined(__ARMEB__)
	str	r4,[r0,#12]
#else
	mov	r9,r4,lsr#8
	strb	r4,[r0,#12+3]
	mov	r10,r4,lsr#16
	strb	r9,[r0,#12+2]
	mov	r11,r4,lsr#24
	strb	r10,[r0,#12+1]
	strb	r11,[r0,#12]
#endif

#if __ARM_ARCH__>=7 && defined(__ARMEL__)
	rev	r5,r5
	str	r5,[r0,#8]
#elif defined(__ARMEB__)
	str	r5,[r0,#8]
#else
	mov	r9,r5,lsr#8
	strb	r5,[r0,#8+3]
	mov	r10,r5,lsr#16
	strb	r9,[r0,#8+2]
	mov	r11,r5,lsr#24
	strb	r10,[r0,#8+1]
	strb	r11,[r0,#8]
#endif

#if __ARM_ARCH__>=7 && defined(__ARMEL__)
	rev	r6,r6
	str	r6,[r0,#4]
#elif defined(__ARMEB__)
	str	r6,[r0,#4]
#else
	mov	r9,r6,lsr#8
	strb	r6,[r0,#4+3]
	mov	r10,r6,lsr#16
	strb	r9,[r0,#4+2]
	mov	r11,r6,lsr#24
	strb	r10,[r0,#4+1]
	strb	r11,[r0,#4]
#endif

#if __ARM_ARCH__>=7 && defined(__ARMEL__)
	rev	r7,r7
	str	r7,[r0,#0]
#elif defined(__ARMEB__)
	str	r7,[r0,#0]
#else
	mov	r9,r7,lsr#8
	strb	r7,[r0,#0+3]
	mov	r10,r7,lsr#16
	strb	r9,[r0,#0+2]
	mov	r11,r7,lsr#24
	strb	r10,[r0,#0+1]
	strb	r11,[r0,#0]
#endif

#if __ARM_ARCH__>=5
	ldmia	sp!,{r4,r5,r6,r7,r8,r9,r10,r11,pc}
#else
	ldmia	sp!,{r4,r5,r6,r7,r8,r9,r10,r11,lr}
	tst	lr,#1
	moveq	pc,lr			@ be binary compatible with V4, yet
.word	0xe12fff1e			@ interoperable with Thumb ISA:-)
#endif
.size	gcm_gmult_4bit,.-gcm_gmult_4bit
#if __ARM_MAX_ARCH__>=7
.arch	armv7-a
.fpu	neon

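@ gcm_init_neon(Htable, H), arguments as used in the body below:
@   r0 = Htable (receives the "twisted" H used by the NEON routines),
@   r1 = H (the 16-byte GHASH subkey)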
.globl	gcm_init_neon
.type	gcm_init_neon,%function
.align	4
gcm_init_neon:
	vld1.64	d7,[r1]!		@ load H
	vmov.i8	q8,#0xe1
	vld1.64	d6,[r1]
	vshl.i64	d17,#57
	vshr.u64	d16,#63		@ t0=0xc2....01
	vdup.8	q9,d7[7]
	vshr.u64	d26,d6,#63
	vshr.s8	q9,#7			@ broadcast carry bit
	vshl.i64	q3,q3,#1
	vand	q8,q8,q9
	vorr	d7,d26		@ H<<<=1
	veor	q3,q3,q8		@ twisted H
	vstmia	r0,{q3}

	bx	lr					@ bx lr
.size	gcm_init_neon,.-gcm_init_neon

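@ gcm_gmult_neon(Xi, Htable), arguments as used in the body below:
@   r0 = Xi (multiplied by H in place), r1 = Htable (twisted H from gcm_init_neon)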
.globl	gcm_gmult_neon
.type	gcm_gmult_neon,%function
.align	4
gcm_gmult_neon:
	vld1.64	d7,[r0]!		@ load Xi
	vld1.64	d6,[r0]!
	vmov.i64	d29,#0x0000ffffffffffff
	vldmia	r1,{d26,d27}	@ load twisted H
	vmov.i64	d30,#0x00000000ffffffff
#ifdef __ARMEL__
	vrev64.8	q3,q3
#endif
	vmov.i64	d31,#0x000000000000ffff
	veor	d28,d26,d27		@ Karatsuba pre-processing
	mov	r3,#16
	b	.Lgmult_neon
.size	gcm_gmult_neon,.-gcm_gmult_neon

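@ gcm_ghash_neon(Xi, Htable, inp, len), arguments as used in the body below:
@   r0 = Xi, r1 = Htable (twisted H), r2 = inp (input blocks),
@   r3 = len in bytes (16 per .Loop_neon iteration)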
.globl	gcm_ghash_neon
.type	gcm_ghash_neon,%function
.align	4
gcm_ghash_neon:
	vld1.64	d1,[r0]!		@ load Xi
	vld1.64	d0,[r0]!
	vmov.i64	d29,#0x0000ffffffffffff
	vldmia	r1,{d26,d27}	@ load twisted H
	vmov.i64	d30,#0x00000000ffffffff
#ifdef __ARMEL__
	vrev64.8	q0,q0
#endif
	vmov.i64	d31,#0x000000000000ffff
	veor	d28,d26,d27		@ Karatsuba pre-processing

.Loop_neon:
	vld1.64	d7,[r2]!		@ load inp
	vld1.64	d6,[r2]!
#ifdef __ARMEL__
	vrev64.8	q3,q3
#endif
	veor	q3,q0			@ inp^=Xi
.Lgmult_neon:
	vext.8	d16, d26, d26, #1	@ A1
	vmull.p8	q8, d16, d6		@ F = A1*B
	vext.8	d0, d6, d6, #1	@ B1
	vmull.p8	q0, d26, d0		@ E = A*B1
	vext.8	d18, d26, d26, #2	@ A2
	vmull.p8	q9, d18, d6		@ H = A2*B
	vext.8	d22, d6, d6, #2	@ B2
	vmull.p8	q11, d26, d22		@ G = A*B2
	vext.8	d20, d26, d26, #3	@ A3
	veor	q8, q8, q0		@ L = E + F
	vmull.p8	q10, d20, d6		@ J = A3*B
	vext.8	d0, d6, d6, #3	@ B3
	veor	q9, q9, q11		@ M = G + H
	vmull.p8	q0, d26, d0		@ I = A*B3
	veor	d16, d16, d17	@ t0 = (L) (P0 + P1) << 8
	vand	d17, d17, d29
	vext.8	d22, d6, d6, #4	@ B4
	veor	d18, d18, d19	@ t1 = (M) (P2 + P3) << 16
	vand	d19, d19, d30
	vmull.p8	q11, d26, d22		@ K = A*B4
	veor	q10, q10, q0		@ N = I + J
	veor	d16, d16, d17
	veor	d18, d18, d19
	veor	d20, d20, d21	@ t2 = (N) (P4 + P5) << 24
	vand	d21, d21, d31
	vext.8	q8, q8, q8, #15
	veor	d22, d22, d23	@ t3 = (K) (P6 + P7) << 32
	vmov.i64	d23, #0
	vext.8	q9, q9, q9, #14
	veor	d20, d20, d21
	vmull.p8	q0, d26, d6		@ D = A*B
	vext.8	q11, q11, q11, #12
	vext.8	q10, q10, q10, #13
	veor	q8, q8, q9
	veor	q10, q10, q11
	veor	q0, q0, q8
	veor	q0, q0, q10
	veor	d6,d6,d7	@ Karatsuba pre-processing
	vext.8	d16, d28, d28, #1	@ A1
	vmull.p8	q8, d16, d6		@ F = A1*B
	vext.8	d2, d6, d6, #1	@ B1
	vmull.p8	q1, d28, d2		@ E = A*B1
	vext.8	d18, d28, d28, #2	@ A2
	vmull.p8	q9, d18, d6		@ H = A2*B
	vext.8	d22, d6, d6, #2	@ B2
	vmull.p8	q11, d28, d22		@ G = A*B2
	vext.8	d20, d28, d28, #3	@ A3
	veor	q8, q8, q1		@ L = E + F
	vmull.p8	q10, d20, d6		@ J = A3*B
	vext.8	d2, d6, d6, #3	@ B3
	veor	q9, q9, q11		@ M = G + H
	vmull.p8	q1, d28, d2		@ I = A*B3
	veor	d16, d16, d17	@ t0 = (L) (P0 + P1) << 8
	vand	d17, d17, d29
	vext.8	d22, d6, d6, #4	@ B4
	veor	d18, d18, d19	@ t1 = (M) (P2 + P3) << 16
	vand	d19, d19, d30
	vmull.p8	q11, d28, d22		@ K = A*B4
	veor	q10, q10, q1		@ N = I + J
	veor	d16, d16, d17
	veor	d18, d18, d19
	veor	d20, d20, d21	@ t2 = (N) (P4 + P5) << 24
	vand	d21, d21, d31
	vext.8	q8, q8, q8, #15
	veor	d22, d22, d23	@ t3 = (K) (P6 + P7) << 32
	vmov.i64	d23, #0
	vext.8	q9, q9, q9, #14
	veor	d20, d20, d21
	vmull.p8	q1, d28, d6		@ D = A*B
	vext.8	q11, q11, q11, #12
	vext.8	q10, q10, q10, #13
	veor	q8, q8, q9
	veor	q10, q10, q11
	veor	q1, q1, q8
	veor	q1, q1, q10
	vext.8	d16, d27, d27, #1	@ A1
	vmull.p8	q8, d16, d7		@ F = A1*B
	vext.8	d4, d7, d7, #1	@ B1
	vmull.p8	q2, d27, d4		@ E = A*B1
	vext.8	d18, d27, d27, #2	@ A2
	vmull.p8	q9, d18, d7		@ H = A2*B
	vext.8	d22, d7, d7, #2	@ B2
	vmull.p8	q11, d27, d22		@ G = A*B2
	vext.8	d20, d27, d27, #3	@ A3
	veor	q8, q8, q2		@ L = E + F
	vmull.p8	q10, d20, d7		@ J = A3*B
	vext.8	d4, d7, d7, #3	@ B3
	veor	q9, q9, q11		@ M = G + H
	vmull.p8	q2, d27, d4		@ I = A*B3
	veor	d16, d16, d17	@ t0 = (L) (P0 + P1) << 8
	vand	d17, d17, d29
	vext.8	d22, d7, d7, #4	@ B4
	veor	d18, d18, d19	@ t1 = (M) (P2 + P3) << 16
	vand	d19, d19, d30
	vmull.p8	q11, d27, d22		@ K = A*B4
	veor	q10, q10, q2		@ N = I + J
	veor	d16, d16, d17
	veor	d18, d18, d19
	veor	d20, d20, d21	@ t2 = (N) (P4 + P5) << 24
	vand	d21, d21, d31
	vext.8	q8, q8, q8, #15
	veor	d22, d22, d23	@ t3 = (K) (P6 + P7) << 32
	vmov.i64	d23, #0
	vext.8	q9, q9, q9, #14
	veor	d20, d20, d21
	vmull.p8	q2, d27, d7		@ D = A*B
	vext.8	q11, q11, q11, #12
	vext.8	q10, q10, q10, #13
	veor	q8, q8, q9
	veor	q10, q10, q11
	veor	q2, q2, q8
	veor	q2, q2, q10
	veor	q1,q1,q0		@ Karatsuba post-processing
	veor	q1,q1,q2
	veor	d1,d1,d2
	veor	d4,d4,d3	@ Xh|Xl - 256-bit result

	@ equivalent of reduction_avx from ghash-x86_64.pl
	vshl.i64	q9,q0,#57		@ 1st phase
	vshl.i64	q10,q0,#62
	veor	q10,q10,q9		@
	vshl.i64	q9,q0,#63
	veor	q10, q10, q9		@
	veor	d1,d1,d20	@
	veor	d4,d4,d21

	vshr.u64	q10,q0,#1		@ 2nd phase
	veor	q2,q2,q0
	veor	q0,q0,q10		@
	vshr.u64	q10,q10,#6
	vshr.u64	q0,q0,#1		@
	veor	q0,q0,q2		@
	veor	q0,q0,q10		@

	subs	r3,#16
	bne	.Loop_neon

#ifdef __ARMEL__
	vrev64.8	q0,q0
#endif
	sub	r0,#16
	vst1.64	d1,[r0]!		@ write out Xi
	vst1.64	d0,[r0]

	bx	lr					@ bx lr
.size	gcm_ghash_neon,.-gcm_ghash_neon
#endif
.byte	71,72,65,83,72,32,102,111,114,32,65,82,77,118,52,47,78,69,79,78,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
.align	2
.align	2