/* Do not modify. This file is auto-generated from poly1305-armv4.pl. */
#include "arm_arch.h"

#if defined(__thumb2__)
.syntax	unified
.thumb
#else
.code	32
#endif

.text

.globl	poly1305_emit
.globl	poly1305_blocks
.globl	poly1305_init
.type	poly1305_init,%function
.align	5
poly1305_init:
.Lpoly1305_init:
	stmdb	sp!,{r4,r5,r6,r7,r8,r9,r10,r11}

	eor	r3,r3,r3
	cmp	r1,#0
	str	r3,[r0,#0]		@ zero hash value
	str	r3,[r0,#4]
	str	r3,[r0,#8]
	str	r3,[r0,#12]
	str	r3,[r0,#16]
	str	r3,[r0,#36]		@ is_base2_26
	add	r0,r0,#20

#ifdef	__thumb2__
	it	eq
#endif
	moveq	r0,#0
	beq	.Lno_key

#if	__ARM_MAX_ARCH__>=7
	adr	r11,.Lpoly1305_init
	ldr	r12,.LOPENSSL_armcap
#endif
	ldrb	r4,[r1,#0]
	mov	r10,#0x0fffffff
	ldrb	r5,[r1,#1]
	and	r3,r10,#-4		@ 0x0ffffffc
	ldrb	r6,[r1,#2]
	ldrb	r7,[r1,#3]
	orr	r4,r4,r5,lsl#8
	ldrb	r5,[r1,#4]
	orr	r4,r4,r6,lsl#16
	ldrb	r6,[r1,#5]
	orr	r4,r4,r7,lsl#24
	ldrb	r7,[r1,#6]
	and	r4,r4,r10

#if	__ARM_MAX_ARCH__>=7
# if !defined(_WIN32)
	ldr	r12,[r11,r12]		@ OPENSSL_armcap_P
# endif
# if defined(__APPLE__) || defined(_WIN32)
	ldr	r12,[r12]
# endif
#endif
	ldrb	r8,[r1,#7]
	orr	r5,r5,r6,lsl#8
	ldrb	r6,[r1,#8]
	orr	r5,r5,r7,lsl#16
	ldrb	r7,[r1,#9]
	orr	r5,r5,r8,lsl#24
	ldrb	r8,[r1,#10]
	and	r5,r5,r3

#if	__ARM_MAX_ARCH__>=7
	tst	r12,#ARMV7_NEON		@ check for NEON
# ifdef	__thumb2__
	adr	r9,.Lpoly1305_blocks_neon
	adr	r11,.Lpoly1305_blocks
	adr	r12,.Lpoly1305_emit
	adr	r10,.Lpoly1305_emit_neon
	itt	ne
	movne	r11,r9
	movne	r12,r10
	orr	r11,r11,#1	@ thumb-ify address
	orr	r12,r12,#1
# else
	addeq	r12,r11,#(.Lpoly1305_emit-.Lpoly1305_init)
	addne	r12,r11,#(.Lpoly1305_emit_neon-.Lpoly1305_init)
	addeq	r11,r11,#(.Lpoly1305_blocks-.Lpoly1305_init)
	addne	r11,r11,#(.Lpoly1305_blocks_neon-.Lpoly1305_init)
# endif
#endif
	ldrb	r9,[r1,#11]
	orr	r6,r6,r7,lsl#8
	ldrb	r7,[r1,#12]
	orr	r6,r6,r8,lsl#16
	ldrb	r8,[r1,#13]
	orr	r6,r6,r9,lsl#24
	ldrb	r9,[r1,#14]
	and	r6,r6,r3

	ldrb	r10,[r1,#15]
	orr	r7,r7,r8,lsl#8
	str	r4,[r0,#0]
	orr	r7,r7,r9,lsl#16
	str	r5,[r0,#4]
	orr	r7,r7,r10,lsl#24
	str	r6,[r0,#8]
	and	r7,r7,r3
	str	r7,[r0,#12]
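
	@ The byte loads above also applied the Poly1305 "clamp": the
	@ first key word is masked with 0x0fffffff and the other three
	@ with 0x0ffffffc, i.e. r is ANDed with
	@ 0x0ffffffc0ffffffc0ffffffc0fffffff as RFC 7539 requires.
	@ Clearing the low two bits of r1-r3 is what makes the r+(r>>2)
	@ trick in poly1305_blocks an exact 5*r/4.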
#if	__ARM_MAX_ARCH__>=7
	stmia	r2,{r11,r12}		@ fill functions table
	mov	r0,#1
#else
	mov	r0,#0
#endif
.Lno_key:
	ldmia	sp!,{r4,r5,r6,r7,r8,r9,r10,r11}
#if	__ARM_ARCH__>=5
	bx	lr				@ bx	lr
#else
	tst	lr,#1
	moveq	pc,lr			@ be binary compatible with V4, yet
.word	0xe12fff1e			@ interoperable with Thumb ISA:-)
#endif
.size	poly1305_init,.-poly1305_init
.type	poly1305_blocks,%function
.align	5
poly1305_blocks:
.Lpoly1305_blocks:
	stmdb	sp!,{r3,r4,r5,r6,r7,r8,r9,r10,r11,lr}

	ands	r2,r2,#-16
	beq	.Lno_data

	cmp	r3,#0
	add	r2,r2,r1		@ end pointer
	sub	sp,sp,#32

	ldmia	r0,{r4,r5,r6,r7,r8,r9,r10,r11,r12}		@ load context

	str	r0,[sp,#12]		@ offload stuff
	mov	lr,r1
	str	r2,[sp,#16]
	str	r10,[sp,#20]
	str	r11,[sp,#24]
	str	r12,[sp,#28]
	b	.Loop

.Loop:
#if __ARM_ARCH__<7
	ldrb	r0,[lr],#16		@ load input
# ifdef	__thumb2__
	it	hi
# endif
	addhi	r8,r8,#1		@ 1<<128
	ldrb	r1,[lr,#-15]
	ldrb	r2,[lr,#-14]
	ldrb	r3,[lr,#-13]
	orr	r1,r0,r1,lsl#8
	ldrb	r0,[lr,#-12]
	orr	r2,r1,r2,lsl#16
	ldrb	r1,[lr,#-11]
	orr	r3,r2,r3,lsl#24
	ldrb	r2,[lr,#-10]
	adds	r4,r4,r3		@ accumulate input

	ldrb	r3,[lr,#-9]
	orr	r1,r0,r1,lsl#8
	ldrb	r0,[lr,#-8]
	orr	r2,r1,r2,lsl#16
	ldrb	r1,[lr,#-7]
	orr	r3,r2,r3,lsl#24
	ldrb	r2,[lr,#-6]
	adcs	r5,r5,r3

	ldrb	r3,[lr,#-5]
	orr	r1,r0,r1,lsl#8
	ldrb	r0,[lr,#-4]
	orr	r2,r1,r2,lsl#16
	ldrb	r1,[lr,#-3]
	orr	r3,r2,r3,lsl#24
	ldrb	r2,[lr,#-2]
	adcs	r6,r6,r3

	ldrb	r3,[lr,#-1]
	orr	r1,r0,r1,lsl#8
	str	lr,[sp,#8]		@ offload input pointer
	orr	r2,r1,r2,lsl#16
	add	r10,r10,r10,lsr#2
	orr	r3,r2,r3,lsl#24
#else
	ldr	r0,[lr],#16		@ load input
# ifdef	__thumb2__
	it	hi
# endif
	addhi	r8,r8,#1		@ padbit
	ldr	r1,[lr,#-12]
	ldr	r2,[lr,#-8]
	ldr	r3,[lr,#-4]
# ifdef	__ARMEB__
	rev	r0,r0
	rev	r1,r1
	rev	r2,r2
	rev	r3,r3
# endif
	adds	r4,r4,r0		@ accumulate input
	str	lr,[sp,#8]		@ offload input pointer
	adcs	r5,r5,r1
	add	r10,r10,r10,lsr#2
	adcs	r6,r6,r2
#endif
	add	r11,r11,r11,lsr#2
	adcs	r7,r7,r3
	add	r12,r12,r12,lsr#2

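	@ r10-r12 were just replaced with ri+(ri>>2). Since the clamp
	@ zeroed the low two bits of r1-r3, this is exactly 5*ri/4, and
	@ 2^128 == 5/4 (mod 2^130-5), so high-limb products that would
	@ spill past 2^130 land on the correct lower limb. The
	@ umull/umlal cascade below is 4x4 schoolbook multiplication of
	@ h by r, reduced on the fly with this trick.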
	umull	r2,r3,r5,r9
	adc	r8,r8,#0
	umull	r0,r1,r4,r9
	umlal	r2,r3,r8,r10
	umlal	r0,r1,r7,r10
	ldr	r10,[sp,#20]		@ reload r10
	umlal	r2,r3,r6,r12
	umlal	r0,r1,r5,r12
	umlal	r2,r3,r7,r11
	umlal	r0,r1,r6,r11
	umlal	r2,r3,r4,r10
	str	r0,[sp,#0]		@ future r4
	mul	r0,r11,r8
	ldr	r11,[sp,#24]		@ reload r11
	adds	r2,r2,r1		@ d1+=d0>>32
	eor	r1,r1,r1
	adc	lr,r3,#0		@ future r6
	str	r2,[sp,#4]		@ future r5

	mul	r2,r12,r8
	eor	r3,r3,r3
	umlal	r0,r1,r7,r12
	ldr	r12,[sp,#28]		@ reload r12
	umlal	r2,r3,r7,r9
	umlal	r0,r1,r6,r9
	umlal	r2,r3,r6,r10
	umlal	r0,r1,r5,r10
	umlal	r2,r3,r5,r11
	umlal	r0,r1,r4,r11
	umlal	r2,r3,r4,r12
	ldr	r4,[sp,#0]
	mul	r8,r9,r8
	ldr	r5,[sp,#4]

	adds	r6,lr,r0		@ d2+=d1>>32
	ldr	lr,[sp,#8]		@ reload input pointer
	adc	r1,r1,#0
	adds	r7,r2,r1		@ d3+=d2>>32
	ldr	r0,[sp,#16]		@ reload end pointer
	adc	r3,r3,#0
	add	r8,r8,r3		@ h4+=d3>>32

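	@ h4 (r8) holds everything at bit 128 and above, so the value
	@ can exceed 2^130. Keep h4&3 and fold the rest back into h0 as
	@ (h4&~3)+((h4&~3)>>2) = 5*(h4>>2), using 2^130 == 5 (mod p).
	@ The result stays only partially reduced, which the next
	@ iteration tolerates.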
	and	r1,r8,#-4
	and	r8,r8,#3
	add	r1,r1,r1,lsr#2		@ *=5
	adds	r4,r4,r1
	adcs	r5,r5,#0
	adcs	r6,r6,#0
	adcs	r7,r7,#0
	adc	r8,r8,#0

	cmp	r0,lr			@ done yet?
	bhi	.Loop

	ldr	r0,[sp,#12]
	add	sp,sp,#32
	stmia	r0,{r4,r5,r6,r7,r8}		@ store the result

.Lno_data:
#if	__ARM_ARCH__>=5
	ldmia	sp!,{r3,r4,r5,r6,r7,r8,r9,r10,r11,pc}
#else
	ldmia	sp!,{r3,r4,r5,r6,r7,r8,r9,r10,r11,lr}
	tst	lr,#1
	moveq	pc,lr			@ be binary compatible with V4, yet
.word	0xe12fff1e			@ interoperable with Thumb ISA:-)
#endif
.size	poly1305_blocks,.-poly1305_blocks
.type	poly1305_emit,%function
.align	5
poly1305_emit:
.Lpoly1305_emit:
	stmdb	sp!,{r4,r5,r6,r7,r8,r9,r10,r11}
.Lpoly1305_emit_enter:

	ldmia	r0,{r3,r4,r5,r6,r7}
	adds	r8,r3,#5		@ compare to modulus
	adcs	r9,r4,#0
	adcs	r10,r5,#0
	adcs	r11,r6,#0
	adc	r7,r7,#0
	tst	r7,#4			@ did it carry/borrow?

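	@ h is compared to p = 2^130-5 by computing h+5 and testing bit
	@ 130 of the top limb: it is set exactly when h+5 >= 2^130, i.e.
	@ when h >= p. In that case r8-r11 already hold h+5-2^130 = h-p
	@ modulo 2^128, and the conditional moves below select them.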
#ifdef	__thumb2__
	it	ne
#endif
	movne	r3,r8
	ldr	r8,[r2,#0]
#ifdef	__thumb2__
	it	ne
#endif
	movne	r4,r9
	ldr	r9,[r2,#4]
#ifdef	__thumb2__
	it	ne
#endif
	movne	r5,r10
	ldr	r10,[r2,#8]
#ifdef	__thumb2__
	it	ne
#endif
	movne	r6,r11
	ldr	r11,[r2,#12]

	adds	r3,r3,r8
	adcs	r4,r4,r9
	adcs	r5,r5,r10
	adc	r6,r6,r11

#if __ARM_ARCH__>=7
# ifdef __ARMEB__
	rev	r3,r3
	rev	r4,r4
	rev	r5,r5
	rev	r6,r6
# endif
	str	r3,[r1,#0]
	str	r4,[r1,#4]
	str	r5,[r1,#8]
	str	r6,[r1,#12]
#else
	strb	r3,[r1,#0]
	mov	r3,r3,lsr#8
	strb	r4,[r1,#4]
	mov	r4,r4,lsr#8
	strb	r5,[r1,#8]
	mov	r5,r5,lsr#8
	strb	r6,[r1,#12]
	mov	r6,r6,lsr#8

	strb	r3,[r1,#1]
	mov	r3,r3,lsr#8
	strb	r4,[r1,#5]
	mov	r4,r4,lsr#8
	strb	r5,[r1,#9]
	mov	r5,r5,lsr#8
	strb	r6,[r1,#13]
	mov	r6,r6,lsr#8

	strb	r3,[r1,#2]
	mov	r3,r3,lsr#8
	strb	r4,[r1,#6]
	mov	r4,r4,lsr#8
	strb	r5,[r1,#10]
	mov	r5,r5,lsr#8
	strb	r6,[r1,#14]
	mov	r6,r6,lsr#8

	strb	r3,[r1,#3]
	strb	r4,[r1,#7]
	strb	r5,[r1,#11]
	strb	r6,[r1,#15]
#endif
	ldmia	sp!,{r4,r5,r6,r7,r8,r9,r10,r11}
#if	__ARM_ARCH__>=5
	bx	lr				@ bx	lr
#else
	tst	lr,#1
	moveq	pc,lr			@ be binary compatible with V4, yet
.word	0xe12fff1e			@ interoperable with Thumb ISA:-)
#endif
.size	poly1305_emit,.-poly1305_emit
#if	__ARM_MAX_ARCH__>=7
.fpu	neon

.type	poly1305_init_neon,%function
.align	5
poly1305_init_neon:
	ldr	r4,[r0,#20]		@ load key base 2^32
	ldr	r5,[r0,#24]
	ldr	r6,[r0,#28]
	ldr	r7,[r0,#32]

	and	r2,r4,#0x03ffffff	@ base 2^32 -> base 2^26
	mov	r3,r4,lsr#26
	mov	r4,r5,lsr#20
	orr	r3,r3,r5,lsl#6
	mov	r5,r6,lsr#14
	orr	r4,r4,r6,lsl#12
	mov	r6,r7,lsr#8
	orr	r5,r5,r7,lsl#18
	and	r3,r3,#0x03ffffff
	and	r4,r4,#0x03ffffff
	and	r5,r5,#0x03ffffff

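	@ r2-r6 now hold r as five 26-bit limbs: r2 = r[25:0],
	@ r3 = r[51:26], r4 = r[77:52], r5 = r[103:78], r6 = r[129:104]
	@ (only 24 bits wide, since the clamped key is 128 bits).
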
	vdup.32	d0,r2			@ r^1 in both lanes
	add	r2,r3,r3,lsl#2		@ *5
	vdup.32	d1,r3
	add	r3,r4,r4,lsl#2
	vdup.32	d2,r2
	vdup.32	d3,r4
	add	r4,r5,r5,lsl#2
	vdup.32	d4,r3
	vdup.32	d5,r5
	add	r5,r6,r6,lsl#2
	vdup.32	d6,r4
	vdup.32	d7,r6
	vdup.32	d8,r5

	mov	r5,#2		@ counter

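	@ Two passes of the loop below: the first squares r and stores
	@ the r^2:r^1 lane pair (plus the 5*x copies), the second
	@ multiplies that pair by r^2 and stores r^4:r^3. The 2-way NEON
	@ loop consumes these interleaved powers, four blocks at a time.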
.Lsquare_neon:
	@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
	@ d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4
	@ d1 = h1*r0 + h0*r1   + h4*5*r2 + h3*5*r3 + h2*5*r4
	@ d2 = h2*r0 + h1*r1   + h0*r2   + h4*5*r3 + h3*5*r4
	@ d3 = h3*r0 + h2*r1   + h1*r2   + h0*r3   + h4*5*r4
	@ d4 = h4*r0 + h3*r1   + h2*r2   + h1*r3   + h0*r4

	vmull.u32	q5,d0,d0[1]
	vmull.u32	q6,d1,d0[1]
	vmull.u32	q7,d3,d0[1]
	vmull.u32	q8,d5,d0[1]
	vmull.u32	q9,d7,d0[1]

	vmlal.u32	q5,d7,d2[1]
	vmlal.u32	q6,d0,d1[1]
	vmlal.u32	q7,d1,d1[1]
	vmlal.u32	q8,d3,d1[1]
	vmlal.u32	q9,d5,d1[1]

	vmlal.u32	q5,d5,d4[1]
	vmlal.u32	q6,d7,d4[1]
	vmlal.u32	q8,d1,d3[1]
	vmlal.u32	q7,d0,d3[1]
	vmlal.u32	q9,d3,d3[1]

	vmlal.u32	q5,d3,d6[1]
	vmlal.u32	q8,d0,d5[1]
	vmlal.u32	q6,d5,d6[1]
	vmlal.u32	q7,d7,d6[1]
	vmlal.u32	q9,d1,d5[1]

	vmlal.u32	q8,d7,d8[1]
	vmlal.u32	q5,d1,d8[1]
	vmlal.u32	q6,d3,d8[1]
	vmlal.u32	q7,d5,d8[1]
	vmlal.u32	q9,d0,d7[1]

	@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
	@ lazy reduction as discussed in "NEON crypto" by D.J. Bernstein
	@ and P. Schwabe
	@
	@ H0>>+H1>>+H2>>+H3>>+H4
	@ H3>>+H4>>*5+H0>>+H1
	@
	@ Trivia.
	@
	@ The result of multiplying an n-bit number by an m-bit number
	@ is n+m bits wide. However! Even though 2^n is an (n+1)-bit
	@ number, an m-bit number multiplied by 2^n is still n+m bits
	@ wide.
	@
	@ The sum of two n-bit numbers is n+1 bits wide, the sum of
	@ three is n+2, and so is the sum of four. The sum of 2^m
	@ (n-m)-bit numbers and one n-bit number is n+1 bits wide.
	@
	@ >>+ denotes Hnext += Hn>>26, Hn &= 0x3ffffff. This means that
	@ H0, H2, H3 are guaranteed to be 26 bits wide, while H1 and H4
	@ can be 27. However! In cases when their width exceeds 26 bits
	@ they are limited by 2^26+2^6. This in turn means that the *sum*
	@ of the products with these values can still be viewed as a sum
	@ of 52-bit numbers as long as the number of addends is not a
	@ power of 2. For example,
	@
	@ H4 = H4*R0 + H3*R1 + H2*R2 + H1*R3 + H0 * R4,
	@
	@ which can't be larger than 5 * (2^26 + 2^6) * (2^26 + 2^6), or
	@ 5 * (2^52 + 2*2^32 + 2^12), which in turn is smaller than
	@ 8 * (2^52) or 2^55. However, the value is then multiplied
	@ by 5, so we should be looking at 5 * 5 * (2^52 + 2^33 + 2^12),
	@ which is less than 32 * (2^52) or 2^57. And when processing
	@ data we are looking at three times as many addends...
	@
	@ In the key setup procedure the pre-reduced H0 is limited by
	@ 5*4+1 and 5*H4 by 5*5 52-bit addends, or 57 bits. But when
	@ hashing the input H0 is limited by (5*4+1)*3 addends, or 58
	@ bits, while 5*H4 by 5*5*3, or 59[!] bits. How is this
	@ relevant? The vmlal.u32 instruction accepts 2x32-bit inputs
	@ and writes a 2x64-bit result. This means that the result of
	@ reduction has to be compressed upon loop wrap-around. This can
	@ be done in the process of reduction to minimize the number of
	@ instructions [as well as the number of 128-bit instructions,
	@ which benefits low-end processors], but one has to watch for
	@ H2 (which is narrower than H0) and 5*H4 not being wider than
	@ 58 bits, so that the result of the right shift by 26 bits fits
	@ in 32 bits. This is also useful on x86, because it allows one
	@ to use paddd in place of paddq, which benefits Atom, where
	@ paddq is ridiculously slow.

	vshr.u64	q15,q8,#26
	vmovn.i64	d16,q8
	vshr.u64	q4,q5,#26
	vmovn.i64	d10,q5
	vadd.i64	q9,q9,q15		@ h3 -> h4
	vbic.i32	d16,#0xfc000000	@ &=0x03ffffff
	vadd.i64	q6,q6,q4		@ h0 -> h1
	vbic.i32	d10,#0xfc000000

	vshrn.u64	d30,q9,#26
	vmovn.i64	d18,q9
	vshr.u64	q4,q6,#26
	vmovn.i64	d12,q6
	vadd.i64	q7,q7,q4		@ h1 -> h2
	vbic.i32	d18,#0xfc000000
	vbic.i32	d12,#0xfc000000

	vadd.i32	d10,d10,d30
	vshl.u32	d30,d30,#2
	vshrn.u64	d8,q7,#26
	vmovn.i64	d14,q7
	vadd.i32	d10,d10,d30	@ h4 -> h0
	vadd.i32	d16,d16,d8	@ h2 -> h3
	vbic.i32	d14,#0xfc000000

	vshr.u32	d30,d10,#26
	vbic.i32	d10,#0xfc000000
	vshr.u32	d8,d16,#26
	vbic.i32	d16,#0xfc000000
	vadd.i32	d12,d12,d30	@ h0 -> h1
	vadd.i32	d18,d18,d8	@ h3 -> h4

	subs	r5,r5,#1
	beq	.Lsquare_break_neon

	add	r6,r0,#(48+0*9*4)
	add	r7,r0,#(48+1*9*4)

	vtrn.32	d0,d10		@ r^2:r^1
	vtrn.32	d3,d14
	vtrn.32	d5,d16
	vtrn.32	d1,d12
	vtrn.32	d7,d18

	vshl.u32	d4,d3,#2		@ *5
	vshl.u32	d6,d5,#2
	vshl.u32	d2,d1,#2
	vshl.u32	d8,d7,#2
	vadd.i32	d4,d4,d3
	vadd.i32	d2,d2,d1
	vadd.i32	d6,d6,d5
	vadd.i32	d8,d8,d7

	vst4.32	{d0[0],d1[0],d2[0],d3[0]},[r6]!
	vst4.32	{d0[1],d1[1],d2[1],d3[1]},[r7]!
	vst4.32	{d4[0],d5[0],d6[0],d7[0]},[r6]!
	vst4.32	{d4[1],d5[1],d6[1],d7[1]},[r7]!
	vst1.32	{d8[0]},[r6,:32]
	vst1.32	{d8[1]},[r7,:32]

	b	.Lsquare_neon

.align	4
.Lsquare_break_neon:
	add	r6,r0,#(48+2*4*9)
	add	r7,r0,#(48+3*4*9)

	vmov	d0,d10		@ r^4:r^3
	vshl.u32	d2,d12,#2		@ *5
	vmov	d1,d12
	vshl.u32	d4,d14,#2
	vmov	d3,d14
	vshl.u32	d6,d16,#2
	vmov	d5,d16
	vshl.u32	d8,d18,#2
	vmov	d7,d18
	vadd.i32	d2,d2,d12
	vadd.i32	d4,d4,d14
	vadd.i32	d6,d6,d16
	vadd.i32	d8,d8,d18

	vst4.32	{d0[0],d1[0],d2[0],d3[0]},[r6]!
	vst4.32	{d0[1],d1[1],d2[1],d3[1]},[r7]!
	vst4.32	{d4[0],d5[0],d6[0],d7[0]},[r6]!
	vst4.32	{d4[1],d5[1],d6[1],d7[1]},[r7]!
	vst1.32	{d8[0]},[r6]
	vst1.32	{d8[1]},[r7]

	bx	lr				@ bx	lr
.size	poly1305_init_neon,.-poly1305_init_neon

.type	poly1305_blocks_neon,%function
.align	5
poly1305_blocks_neon:
.Lpoly1305_blocks_neon:
	ldr	ip,[r0,#36]		@ is_base2_26
	ands	r2,r2,#-16
	beq	.Lno_data_neon

	cmp	r2,#64
	bhs	.Lenter_neon
	tst	ip,ip			@ is_base2_26?
	beq	.Lpoly1305_blocks

.Lenter_neon:
	stmdb	sp!,{r4,r5,r6,r7}
	vstmdb	sp!,{d8,d9,d10,d11,d12,d13,d14,d15}		@ ABI specification says so

	tst	ip,ip			@ is_base2_26?
	bne	.Lbase2_26_neon

	stmdb	sp!,{r1,r2,r3,lr}
	bl	poly1305_init_neon

	ldr	r4,[r0,#0]		@ load hash value base 2^32
	ldr	r5,[r0,#4]
	ldr	r6,[r0,#8]
	ldr	r7,[r0,#12]
	ldr	ip,[r0,#16]

	and	r2,r4,#0x03ffffff	@ base 2^32 -> base 2^26
	mov	r3,r4,lsr#26
	veor	d10,d10,d10
	mov	r4,r5,lsr#20
	orr	r3,r3,r5,lsl#6
	veor	d12,d12,d12
	mov	r5,r6,lsr#14
	orr	r4,r4,r6,lsl#12
	veor	d14,d14,d14
	mov	r6,r7,lsr#8
	orr	r5,r5,r7,lsl#18
	veor	d16,d16,d16
	and	r3,r3,#0x03ffffff
	orr	r6,r6,ip,lsl#24
	veor	d18,d18,d18
	and	r4,r4,#0x03ffffff
	mov	r1,#1
	and	r5,r5,#0x03ffffff
	str	r1,[r0,#36]		@ is_base2_26

	vmov.32	d10[0],r2
	vmov.32	d12[0],r3
	vmov.32	d14[0],r4
	vmov.32	d16[0],r5
	vmov.32	d18[0],r6
	adr	r5,.Lzeros

	ldmia	sp!,{r1,r2,r3,lr}
	b	.Lbase2_32_neon

.align	4
.Lbase2_26_neon:
	@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
	@ load hash value

	veor	d10,d10,d10
	veor	d12,d12,d12
	veor	d14,d14,d14
	veor	d16,d16,d16
	veor	d18,d18,d18
	vld4.32	{d10[0],d12[0],d14[0],d16[0]},[r0]!
	adr	r5,.Lzeros
	vld1.32	{d18[0]},[r0]
	sub	r0,r0,#16		@ rewind

.Lbase2_32_neon:
	add	r4,r1,#32
	mov	r3,r3,lsl#24
	tst	r2,#31
	beq	.Leven

	vld4.32	{d20[0],d22[0],d24[0],d26[0]},[r1]!
	vmov.32	d28[0],r3
	sub	r2,r2,#16
	add	r4,r1,#32

# ifdef	__ARMEB__
	vrev32.8	q10,q10
	vrev32.8	q13,q13
	vrev32.8	q11,q11
	vrev32.8	q12,q12
# endif
	vsri.u32	d28,d26,#8	@ base 2^32 -> base 2^26
	vshl.u32	d26,d26,#18

	vsri.u32	d26,d24,#14
	vshl.u32	d24,d24,#12
	vadd.i32	d29,d28,d18	@ add hash value and move to #hi

	vbic.i32	d26,#0xfc000000
	vsri.u32	d24,d22,#20
	vshl.u32	d22,d22,#6

	vbic.i32	d24,#0xfc000000
	vsri.u32	d22,d20,#26
	vadd.i32	d27,d26,d16

	vbic.i32	d20,#0xfc000000
	vbic.i32	d22,#0xfc000000
	vadd.i32	d25,d24,d14

	vadd.i32	d21,d20,d10
	vadd.i32	d23,d22,d12

	mov	r7,r5
	add	r6,r0,#48

	cmp	r2,r2
	b	.Long_tail

.align	4
.Leven:
	subs	r2,r2,#64
	it	lo
	movlo	r4,r5

	vmov.i32	q14,#1<<24		@ padbit, yes, always
	vld4.32	{d20,d22,d24,d26},[r1]	@ inp[0:1]
	add	r1,r1,#64
	vld4.32	{d21,d23,d25,d27},[r4]	@ inp[2:3] (or 0)
	add	r4,r4,#64
	itt	hi
	addhi	r7,r0,#(48+1*9*4)
	addhi	r6,r0,#(48+3*9*4)

# ifdef	__ARMEB__
	vrev32.8	q10,q10
	vrev32.8	q13,q13
	vrev32.8	q11,q11
	vrev32.8	q12,q12
# endif
	vsri.u32	q14,q13,#8		@ base 2^32 -> base 2^26
	vshl.u32	q13,q13,#18

	vsri.u32	q13,q12,#14
	vshl.u32	q12,q12,#12

	vbic.i32	q13,#0xfc000000
	vsri.u32	q12,q11,#20
	vshl.u32	q11,q11,#6

	vbic.i32	q12,#0xfc000000
	vsri.u32	q11,q10,#26

	vbic.i32	q10,#0xfc000000
	vbic.i32	q11,#0xfc000000

	bls	.Lskip_loop

	vld4.32	{d0[1],d1[1],d2[1],d3[1]},[r7]!	@ load r^2
	vld4.32	{d0[0],d1[0],d2[0],d3[0]},[r6]!	@ load r^4
	vld4.32	{d4[1],d5[1],d6[1],d7[1]},[r7]!
	vld4.32	{d4[0],d5[0],d6[0],d7[0]},[r6]!
	b	.Loop_neon

.align	5
.Loop_neon:
	@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
	@ ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2
	@ ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^3+inp[7]*r
	@   ___________________/
	@ ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2+inp[8])*r^2
	@ ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^4+inp[7]*r^2+inp[9])*r
	@   ___________________/ ____________________/
	@
	@ Note that we start with inp[2:3]*r^2. This is because it
	@ doesn't depend on reduction in previous iteration.
	@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
	@ d4 = h4*r0 + h3*r1   + h2*r2   + h1*r3   + h0*r4
	@ d3 = h3*r0 + h2*r1   + h1*r2   + h0*r3   + h4*5*r4
	@ d2 = h2*r0 + h1*r1   + h0*r2   + h4*5*r3 + h3*5*r4
	@ d1 = h1*r0 + h0*r1   + h4*5*r2 + h3*5*r3 + h2*5*r4
	@ d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4

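	@ Inside this loop lane [1] of d0-d8 holds the r^2 limbs and
	@ lane [0] the r^4 limbs (d8 = 5*r4 is loaded separately via
	@ vld1.32), so each vmlal.u32 against a lane scalar advances
	@ both interleaved series at once.
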
	@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
	@ inp[2:3]*r^2

	vadd.i32	d24,d24,d14	@ accumulate inp[0:1]
	vmull.u32	q7,d25,d0[1]
	vadd.i32	d20,d20,d10
	vmull.u32	q5,d21,d0[1]
	vadd.i32	d26,d26,d16
	vmull.u32	q8,d27,d0[1]
	vmlal.u32	q7,d23,d1[1]
	vadd.i32	d22,d22,d12
	vmull.u32	q6,d23,d0[1]

	vadd.i32	d28,d28,d18
	vmull.u32	q9,d29,d0[1]
	subs	r2,r2,#64
	vmlal.u32	q5,d29,d2[1]
	it	lo
	movlo	r4,r5
	vmlal.u32	q8,d25,d1[1]
	vld1.32	d8[1],[r7,:32]
	vmlal.u32	q6,d21,d1[1]
	vmlal.u32	q9,d27,d1[1]

	vmlal.u32	q5,d27,d4[1]
	vmlal.u32	q8,d23,d3[1]
	vmlal.u32	q9,d25,d3[1]
	vmlal.u32	q6,d29,d4[1]
	vmlal.u32	q7,d21,d3[1]

	vmlal.u32	q8,d21,d5[1]
	vmlal.u32	q5,d25,d6[1]
	vmlal.u32	q9,d23,d5[1]
	vmlal.u32	q6,d27,d6[1]
	vmlal.u32	q7,d29,d6[1]

	vmlal.u32	q8,d29,d8[1]
	vmlal.u32	q5,d23,d8[1]
	vmlal.u32	q9,d21,d7[1]
	vmlal.u32	q6,d25,d8[1]
	vmlal.u32	q7,d27,d8[1]

	vld4.32	{d21,d23,d25,d27},[r4]	@ inp[2:3] (or 0)
	add	r4,r4,#64

	@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
	@ (hash+inp[0:1])*r^4 and accumulate

	vmlal.u32	q8,d26,d0[0]
	vmlal.u32	q5,d20,d0[0]
	vmlal.u32	q9,d28,d0[0]
	vmlal.u32	q6,d22,d0[0]
	vmlal.u32	q7,d24,d0[0]
	vld1.32	d8[0],[r6,:32]

	vmlal.u32	q8,d24,d1[0]
	vmlal.u32	q5,d28,d2[0]
	vmlal.u32	q9,d26,d1[0]
	vmlal.u32	q6,d20,d1[0]
	vmlal.u32	q7,d22,d1[0]

	vmlal.u32	q8,d22,d3[0]
	vmlal.u32	q5,d26,d4[0]
	vmlal.u32	q9,d24,d3[0]
	vmlal.u32	q6,d28,d4[0]
	vmlal.u32	q7,d20,d3[0]

	vmlal.u32	q8,d20,d5[0]
	vmlal.u32	q5,d24,d6[0]
	vmlal.u32	q9,d22,d5[0]
	vmlal.u32	q6,d26,d6[0]
	vmlal.u32	q8,d28,d8[0]

	vmlal.u32	q7,d28,d6[0]
	vmlal.u32	q5,d22,d8[0]
	vmlal.u32	q9,d20,d7[0]
	vmov.i32	q14,#1<<24		@ padbit, yes, always
	vmlal.u32	q6,d24,d8[0]
	vmlal.u32	q7,d26,d8[0]

	vld4.32	{d20,d22,d24,d26},[r1]	@ inp[0:1]
	add	r1,r1,#64
# ifdef	__ARMEB__
	vrev32.8	q10,q10
	vrev32.8	q11,q11
	vrev32.8	q12,q12
	vrev32.8	q13,q13
# endif

	@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
	@ lazy reduction interleaved with base 2^32 -> base 2^26 of
	@ inp[0:3] previously loaded to q10-q13 and smashed to q10-q14.

	vshr.u64	q15,q8,#26
	vmovn.i64	d16,q8
	vshr.u64	q4,q5,#26
	vmovn.i64	d10,q5
	vadd.i64	q9,q9,q15		@ h3 -> h4
	vbic.i32	d16,#0xfc000000
	vsri.u32	q14,q13,#8		@ base 2^32 -> base 2^26
	vadd.i64	q6,q6,q4		@ h0 -> h1
	vshl.u32	q13,q13,#18
	vbic.i32	d10,#0xfc000000

	vshrn.u64	d30,q9,#26
	vmovn.i64	d18,q9
	vshr.u64	q4,q6,#26
	vmovn.i64	d12,q6
	vadd.i64	q7,q7,q4		@ h1 -> h2
	vsri.u32	q13,q12,#14
	vbic.i32	d18,#0xfc000000
	vshl.u32	q12,q12,#12
	vbic.i32	d12,#0xfc000000

	vadd.i32	d10,d10,d30
	vshl.u32	d30,d30,#2
	vbic.i32	q13,#0xfc000000
	vshrn.u64	d8,q7,#26
	vmovn.i64	d14,q7
	vaddl.u32	q5,d10,d30	@ h4 -> h0 [widen for a sec]
	vsri.u32	q12,q11,#20
	vadd.i32	d16,d16,d8	@ h2 -> h3
	vshl.u32	q11,q11,#6
	vbic.i32	d14,#0xfc000000
	vbic.i32	q12,#0xfc000000

	vshrn.u64	d30,q5,#26		@ re-narrow
	vmovn.i64	d10,q5
	vsri.u32	q11,q10,#26
	vbic.i32	q10,#0xfc000000
	vshr.u32	d8,d16,#26
	vbic.i32	d16,#0xfc000000
	vbic.i32	d10,#0xfc000000
	vadd.i32	d12,d12,d30	@ h0 -> h1
	vadd.i32	d18,d18,d8	@ h3 -> h4
	vbic.i32	q11,#0xfc000000

	bhi	.Loop_neon

.Lskip_loop:
	@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
	@ multiply (inp[0:1]+hash) or inp[2:3] by r^2:r^1

	add	r7,r0,#(48+0*9*4)
	add	r6,r0,#(48+1*9*4)
	adds	r2,r2,#32
	it	ne
	movne	r2,#0
	bne	.Long_tail

	vadd.i32	d25,d24,d14	@ add hash value and move to #hi
	vadd.i32	d21,d20,d10
	vadd.i32	d27,d26,d16
	vadd.i32	d23,d22,d12
	vadd.i32	d29,d28,d18

.Long_tail:
	vld4.32	{d0[1],d1[1],d2[1],d3[1]},[r7]!	@ load r^1
	vld4.32	{d0[0],d1[0],d2[0],d3[0]},[r6]!	@ load r^2

	vadd.i32	d24,d24,d14	@ can be redundant
	vmull.u32	q7,d25,d0
	vadd.i32	d20,d20,d10
	vmull.u32	q5,d21,d0
	vadd.i32	d26,d26,d16
	vmull.u32	q8,d27,d0
	vadd.i32	d22,d22,d12
	vmull.u32	q6,d23,d0
	vadd.i32	d28,d28,d18
	vmull.u32	q9,d29,d0

	vmlal.u32	q5,d29,d2
	vld4.32	{d4[1],d5[1],d6[1],d7[1]},[r7]!
	vmlal.u32	q8,d25,d1
	vld4.32	{d4[0],d5[0],d6[0],d7[0]},[r6]!
	vmlal.u32	q6,d21,d1
	vmlal.u32	q9,d27,d1
	vmlal.u32	q7,d23,d1

	vmlal.u32	q8,d23,d3
	vld1.32	d8[1],[r7,:32]
	vmlal.u32	q5,d27,d4
	vld1.32	d8[0],[r6,:32]
	vmlal.u32	q9,d25,d3
	vmlal.u32	q6,d29,d4
	vmlal.u32	q7,d21,d3

	vmlal.u32	q8,d21,d5
	it	ne
	addne	r7,r0,#(48+2*9*4)
	vmlal.u32	q5,d25,d6
	it	ne
	addne	r6,r0,#(48+3*9*4)
	vmlal.u32	q9,d23,d5
	vmlal.u32	q6,d27,d6
	vmlal.u32	q7,d29,d6

	vmlal.u32	q8,d29,d8
	vorn	q0,q0,q0	@ all-ones, can be redundant
	vmlal.u32	q5,d23,d8
	vshr.u64	q0,q0,#38
	vmlal.u32	q9,d21,d7
	vmlal.u32	q6,d25,d8
	vmlal.u32	q7,d27,d8

	beq	.Lshort_tail

	@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
	@ (hash+inp[0:1])*r^4:r^3 and accumulate

	vld4.32	{d0[1],d1[1],d2[1],d3[1]},[r7]!	@ load r^3
	vld4.32	{d0[0],d1[0],d2[0],d3[0]},[r6]!	@ load r^4

	vmlal.u32	q7,d24,d0
	vmlal.u32	q5,d20,d0
	vmlal.u32	q8,d26,d0
	vmlal.u32	q6,d22,d0
	vmlal.u32	q9,d28,d0

	vmlal.u32	q5,d28,d2
	vld4.32	{d4[1],d5[1],d6[1],d7[1]},[r7]!
	vmlal.u32	q8,d24,d1
	vld4.32	{d4[0],d5[0],d6[0],d7[0]},[r6]!
	vmlal.u32	q6,d20,d1
	vmlal.u32	q9,d26,d1
	vmlal.u32	q7,d22,d1

	vmlal.u32	q8,d22,d3
	vld1.32	d8[1],[r7,:32]
	vmlal.u32	q5,d26,d4
	vld1.32	d8[0],[r6,:32]
	vmlal.u32	q9,d24,d3
	vmlal.u32	q6,d28,d4
	vmlal.u32	q7,d20,d3

	vmlal.u32	q8,d20,d5
	vmlal.u32	q5,d24,d6
	vmlal.u32	q9,d22,d5
	vmlal.u32	q6,d26,d6
	vmlal.u32	q7,d28,d6

	vmlal.u32	q8,d28,d8
	vorn	q0,q0,q0	@ all-ones
	vmlal.u32	q5,d22,d8
	vshr.u64	q0,q0,#38
	vmlal.u32	q9,d20,d7
	vmlal.u32	q6,d24,d8
	vmlal.u32	q7,d26,d8

.Lshort_tail:
	@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
	@ horizontal addition

	vadd.i64	d16,d16,d17
	vadd.i64	d10,d10,d11
	vadd.i64	d18,d18,d19
	vadd.i64	d12,d12,d13
	vadd.i64	d14,d14,d15

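	@ Each vadd.i64 above folds the two 64-bit lanes of an
	@ accumulator pair together, merging the even- and odd-block
	@ series into a single set of limb accumulators before the
	@ final reduction.
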
	@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
	@ lazy reduction, but without narrowing

	vshr.u64	q15,q8,#26
	vand.i64	q8,q8,q0
	vshr.u64	q4,q5,#26
	vand.i64	q5,q5,q0
	vadd.i64	q9,q9,q15		@ h3 -> h4
	vadd.i64	q6,q6,q4		@ h0 -> h1

	vshr.u64	q15,q9,#26
	vand.i64	q9,q9,q0
	vshr.u64	q4,q6,#26
	vand.i64	q6,q6,q0
	vadd.i64	q7,q7,q4		@ h1 -> h2

	vadd.i64	q5,q5,q15
	vshl.u64	q15,q15,#2
	vshr.u64	q4,q7,#26
	vand.i64	q7,q7,q0
	vadd.i64	q5,q5,q15		@ h4 -> h0
	vadd.i64	q8,q8,q4		@ h2 -> h3

	vshr.u64	q15,q5,#26
	vand.i64	q5,q5,q0
	vshr.u64	q4,q8,#26
	vand.i64	q8,q8,q0
	vadd.i64	q6,q6,q15		@ h0 -> h1
	vadd.i64	q9,q9,q4		@ h3 -> h4

	cmp	r2,#0
	bne	.Leven

	@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
	@ store hash value

	vst4.32	{d10[0],d12[0],d14[0],d16[0]},[r0]!
	vst1.32	{d18[0]},[r0]

	vldmia	sp!,{d8,d9,d10,d11,d12,d13,d14,d15}			@ epilogue
	ldmia	sp!,{r4,r5,r6,r7}
.Lno_data_neon:
	bx	lr					@ bx	lr
.size	poly1305_blocks_neon,.-poly1305_blocks_neon

.type	poly1305_emit_neon,%function
.align	5
poly1305_emit_neon:
.Lpoly1305_emit_neon:
	ldr	ip,[r0,#36]		@ is_base2_26

	stmdb	sp!,{r4,r5,r6,r7,r8,r9,r10,r11}

	tst	ip,ip
	beq	.Lpoly1305_emit_enter

	ldmia	r0,{r3,r4,r5,r6,r7}
	eor	r8,r8,r8

	adds	r3,r3,r4,lsl#26	@ base 2^26 -> base 2^32
	mov	r4,r4,lsr#6
	adcs	r4,r4,r5,lsl#20
	mov	r5,r5,lsr#12
	adcs	r5,r5,r6,lsl#14
	mov	r6,r6,lsr#18
	adcs	r6,r6,r7,lsl#8
	adc	r7,r8,r7,lsr#24	@ can be partially reduced ...

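	@ The five 26-bit limbs were repacked into four 32-bit words,
	@ leaving a small top word in r7 (bits 128 and up) that is now
	@ folded back into the low limbs via 2^130 == 5 (mod p).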
	and	r8,r7,#-4		@ ... so reduce
	and	r7,r7,#3
	add	r8,r8,r8,lsr#2	@ *= 5
	adds	r3,r3,r8
	adcs	r4,r4,#0
	adcs	r5,r5,#0
	adcs	r6,r6,#0
	adc	r7,r7,#0

	adds	r8,r3,#5		@ compare to modulus
	adcs	r9,r4,#0
	adcs	r10,r5,#0
	adcs	r11,r6,#0
	adc	r7,r7,#0
	tst	r7,#4			@ did it carry/borrow?

	it	ne
	movne	r3,r8
	ldr	r8,[r2,#0]
	it	ne
	movne	r4,r9
	ldr	r9,[r2,#4]
	it	ne
	movne	r5,r10
	ldr	r10,[r2,#8]
	it	ne
	movne	r6,r11
	ldr	r11,[r2,#12]

	adds	r3,r3,r8		@ accumulate nonce
	adcs	r4,r4,r9
	adcs	r5,r5,r10
	adc	r6,r6,r11

# ifdef __ARMEB__
	rev	r3,r3
	rev	r4,r4
	rev	r5,r5
	rev	r6,r6
# endif
	str	r3,[r1,#0]		@ store the result
	str	r4,[r1,#4]
	str	r5,[r1,#8]
	str	r6,[r1,#12]

	ldmia	sp!,{r4,r5,r6,r7,r8,r9,r10,r11}
	bx	lr				@ bx	lr
.size	poly1305_emit_neon,.-poly1305_emit_neon

.align	5
.Lzeros:
.long	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
.LOPENSSL_armcap:
# ifdef	_WIN32
.word	OPENSSL_armcap_P
# else
.word	OPENSSL_armcap_P-.Lpoly1305_init
# endif
#endif
.byte	80,111,108,121,49,51,48,53,32,102,111,114,32,65,82,77,118,52,47,78,69,79,78,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
.align	2
.align	2
#if	__ARM_MAX_ARCH__>=7
.comm	OPENSSL_armcap_P,4,4
#endif