xref: /freebsd/sys/crypto/openssl/arm/poly1305-armv4.S (revision 1e4896b176ff664dc9c2fce5426bf2fdf8017a7d)
1/* $FreeBSD$ */
2/* Do not modify. This file is auto-generated from poly1305-armv4.pl. */
3#include "arm_arch.h"
4
5.text
6#if defined(__thumb2__)
7.syntax	unified
8.thumb
9#else
10.code	32
11#endif
12
13.globl	poly1305_emit
14.globl	poly1305_blocks
15.globl	poly1305_init
@ poly1305_init -- reset the hash state and load the clamped key.
@
@ In:   r0 = context, r1 = key pointer (NULL allowed), r2 = function table
@ Out:  r0 = 0 if r1 is NULL or if built with __ARM_MAX_ARCH__ < 7;
@       r0 = 1 after the two-word table at [r2] has been filled with the
@       blocks/emit entry points (NEON or scalar, picked at run time).
@ Context fields written here:
@       [#0..#16]  hash words, zeroed
@       [#36]      is_base2_26 flag, zeroed
@       [#20..#32] key words masked with 0x0fffffff / 0x0ffffffc
@ r4-r11 are saved on entry and restored before returning.
16.type	poly1305_init,%function
17.align	5
18poly1305_init:
19.Lpoly1305_init:
20	stmdb	sp!,{r4,r5,r6,r7,r8,r9,r10,r11}
21
22	eor	r3,r3,r3
23	cmp	r1,#0
24	str	r3,[r0,#0]		@ zero hash value
25	str	r3,[r0,#4]
26	str	r3,[r0,#8]
27	str	r3,[r0,#12]
28	str	r3,[r0,#16]
29	str	r3,[r0,#36]		@ is_base2_26
30	add	r0,r0,#20
31
@ Flags still come from "cmp r1,#0" above (str/add do not alter them):
@ a NULL key returns 0 without touching the key area.
32#ifdef	__thumb2__
33	it	eq
34#endif
35	moveq	r0,#0
36	beq	.Lno_key
37
@ r11 = run-time address of .Lpoly1305_init, r12 = link-time offset from
@ that label to OPENSSL_armcap_P (the word stored at .LOPENSSL_armcap).
38#if	__ARM_MAX_ARCH__>=7
39	adr	r11,.Lpoly1305_init
40	ldr	r12,.LOPENSSL_armcap
41#endif
@ Key bytes are gathered one at a time and combined little-endian.
@ r10 = 0x0fffffff masks key word 0, r3 = 0x0ffffffc masks words 1..3.
42	ldrb	r4,[r1,#0]
43	mov	r10,#0x0fffffff
44	ldrb	r5,[r1,#1]
45	and	r3,r10,#-4		@ 0x0ffffffc
46	ldrb	r6,[r1,#2]
47	ldrb	r7,[r1,#3]
48	orr	r4,r4,r5,lsl#8
49	ldrb	r5,[r1,#4]
50	orr	r4,r4,r6,lsl#16
51	ldrb	r6,[r1,#5]
52	orr	r4,r4,r7,lsl#24
53	ldrb	r7,[r1,#6]
54	and	r4,r4,r10
55
56#if	__ARM_MAX_ARCH__>=7
57	ldr	r12,[r11,r12]		@ OPENSSL_armcap_P
58# ifdef	__APPLE__
59	ldr	r12,[r12]
60# endif
61#endif
62	ldrb	r8,[r1,#7]
63	orr	r5,r5,r6,lsl#8
64	ldrb	r6,[r1,#8]
65	orr	r5,r5,r7,lsl#16
66	ldrb	r7,[r1,#9]
67	orr	r5,r5,r8,lsl#24
68	ldrb	r8,[r1,#10]
69	and	r5,r5,r3
70
@ EQ after the tst means the NEON bit is clear: use the scalar routines.
@ Result: r11 = blocks entry point, r12 = emit entry point.
71#if	__ARM_MAX_ARCH__>=7
72	tst	r12,#ARMV7_NEON		@ check for NEON
73# ifdef	__APPLE__
74	adr	r9,poly1305_blocks_neon
75	adr	r11,poly1305_blocks
76#  ifdef __thumb2__
77	it	ne
78#  endif
79	movne	r11,r9
80	adr	r12,poly1305_emit
81	adr	r10,poly1305_emit_neon
82#  ifdef __thumb2__
83	it	ne
84#  endif
85	movne	r12,r10
86# else
87#  ifdef __thumb2__
88	itete	eq
89#  endif
90	addeq	r12,r11,#(poly1305_emit-.Lpoly1305_init)
91	addne	r12,r11,#(poly1305_emit_neon-.Lpoly1305_init)
92	addeq	r11,r11,#(poly1305_blocks-.Lpoly1305_init)
93	addne	r11,r11,#(poly1305_blocks_neon-.Lpoly1305_init)
94# endif
95# ifdef	__thumb2__
96	orr	r12,r12,#1	@ thumb-ify address
97	orr	r11,r11,#1
98# endif
99#endif
100	ldrb	r9,[r1,#11]
101	orr	r6,r6,r7,lsl#8
102	ldrb	r7,[r1,#12]
103	orr	r6,r6,r8,lsl#16
104	ldrb	r8,[r1,#13]
105	orr	r6,r6,r9,lsl#24
106	ldrb	r9,[r1,#14]
107	and	r6,r6,r3
108
@ r0 was advanced by 20 earlier, so these stores land at context
@ offsets #20..#32.
109	ldrb	r10,[r1,#15]
110	orr	r7,r7,r8,lsl#8
111	str	r4,[r0,#0]
112	orr	r7,r7,r9,lsl#16
113	str	r5,[r0,#4]
114	orr	r7,r7,r10,lsl#24
115	str	r6,[r0,#8]
116	and	r7,r7,r3
117	str	r7,[r0,#12]
118#if	__ARM_MAX_ARCH__>=7
119	stmia	r2,{r11,r12}		@ fill functions table
120	mov	r0,#1
121#else
122	mov	r0,#0
123#endif
124.Lno_key:
125	ldmia	sp!,{r4,r5,r6,r7,r8,r9,r10,r11}
126#if	__ARM_ARCH__>=5
127	bx	lr				@ bx	lr
128#else
@ Pre-ARMv5 return: plain "mov pc,lr" when bit 0 of lr is clear,
@ otherwise fall into the hand-encoded word below.
129	tst	lr,#1
130	moveq	pc,lr			@ be binary compatible with V4, yet
131.word	0xe12fff1e			@ interoperable with Thumb ISA:-)
132#endif
133.size	poly1305_init,.-poly1305_init
@ poly1305_blocks -- scalar (base 2^32) block function.
@
@ In:   r0 = context, r1 = input pointer, r2 = length in bytes (rounded
@       down to a multiple of 16 here), r3 = pad bit (non-zero adds 1 to
@       the top hash word, i.e. 1<<128, for the block)
@ Out:  five hash words at [r0,#0..#16] updated; nothing is written when
@       the rounded length is zero.
@ Registers inside the loop: r4-r8 = hash words h0..h4, r9-r12 = key
@ words loaded from [r0,#20..#32], lr = input pointer.
@ 32-byte stack frame:
@       [sp,#0]  future h0          [sp,#4]  future h1
@       [sp,#8]  input pointer      [sp,#12] context pointer
@       [sp,#16] end pointer        [sp,#20..#28] copies of r10..r12
@ NOTE(review): the HI condition used for the pad bit comes from
@ "cmp r3,#0" only on the first iteration. On every later iteration the
@ flags come from "cmp r0,lr" / "bhi .Loop", so HI is always true there
@ and the pad bit is added regardless of r3 -- confirm that callers
@ never pass r3 == 0 together with more than one block.
134.type	poly1305_blocks,%function
135.align	5
136poly1305_blocks:
137.Lpoly1305_blocks:
138	stmdb	sp!,{r3,r4,r5,r6,r7,r8,r9,r10,r11,lr}
139
140	ands	r2,r2,#-16
141	beq	.Lno_data
142
143	cmp	r3,#0
144	add	r2,r2,r1		@ end pointer
145	sub	sp,sp,#32
146
147	ldmia	r0,{r4,r5,r6,r7,r8,r9,r10,r11,r12}		@ load context
148
149	str	r0,[sp,#12]		@ offload stuff
150	mov	lr,r1
151	str	r2,[sp,#16]
152	str	r10,[sp,#20]
153	str	r11,[sp,#24]
154	str	r12,[sp,#28]
155	b	.Loop
156
157.Loop:
@ Pre-ARMv7 builds gather the 16 input bytes one at a time; ARMv7+
@ builds use word loads (byte-reversed on big-endian targets).
158#if __ARM_ARCH__<7
159	ldrb	r0,[lr],#16		@ load input
160# ifdef	__thumb2__
161	it	hi
162# endif
163	addhi	r8,r8,#1		@ 1<<128
164	ldrb	r1,[lr,#-15]
165	ldrb	r2,[lr,#-14]
166	ldrb	r3,[lr,#-13]
167	orr	r1,r0,r1,lsl#8
168	ldrb	r0,[lr,#-12]
169	orr	r2,r1,r2,lsl#16
170	ldrb	r1,[lr,#-11]
171	orr	r3,r2,r3,lsl#24
172	ldrb	r2,[lr,#-10]
173	adds	r4,r4,r3		@ accumulate input
174
175	ldrb	r3,[lr,#-9]
176	orr	r1,r0,r1,lsl#8
177	ldrb	r0,[lr,#-8]
178	orr	r2,r1,r2,lsl#16
179	ldrb	r1,[lr,#-7]
180	orr	r3,r2,r3,lsl#24
181	ldrb	r2,[lr,#-6]
182	adcs	r5,r5,r3
183
184	ldrb	r3,[lr,#-5]
185	orr	r1,r0,r1,lsl#8
186	ldrb	r0,[lr,#-4]
187	orr	r2,r1,r2,lsl#16
188	ldrb	r1,[lr,#-3]
189	orr	r3,r2,r3,lsl#24
190	ldrb	r2,[lr,#-2]
191	adcs	r6,r6,r3
192
193	ldrb	r3,[lr,#-1]
194	orr	r1,r0,r1,lsl#8
195	str	lr,[sp,#8]		@ offload input pointer
196	orr	r2,r1,r2,lsl#16
197	add	r10,r10,r10,lsr#2
198	orr	r3,r2,r3,lsl#24
199#else
200	ldr	r0,[lr],#16		@ load input
201# ifdef	__thumb2__
202	it	hi
203# endif
204	addhi	r8,r8,#1		@ padbit
205	ldr	r1,[lr,#-12]
206	ldr	r2,[lr,#-8]
207	ldr	r3,[lr,#-4]
208# ifdef	__ARMEB__
209	rev	r0,r0
210	rev	r1,r1
211	rev	r2,r2
212	rev	r3,r3
213# endif
214	adds	r4,r4,r0		@ accumulate input
215	str	lr,[sp,#8]		@ offload input pointer
216	adcs	r5,r5,r1
217	add	r10,r10,r10,lsr#2
218	adcs	r6,r6,r2
219#endif
@ r10..r12 temporarily become key_word + (key_word >> 2); the original
@ values are reloaded from the stack frame further down.
220	add	r11,r11,r11,lsr#2
221	adcs	r7,r7,r3
222	add	r12,r12,r12,lsr#2
223
@ 64-bit partial products are accumulated in r0:r1 and r2:r3 while the
@ finished low words are parked at [sp,#0] and [sp,#4].
224	umull	r2,r3,r5,r9
225	adc	r8,r8,#0
226	umull	r0,r1,r4,r9
227	umlal	r2,r3,r8,r10
228	umlal	r0,r1,r7,r10
229	ldr	r10,[sp,#20]		@ reload r10
230	umlal	r2,r3,r6,r12
231	umlal	r0,r1,r5,r12
232	umlal	r2,r3,r7,r11
233	umlal	r0,r1,r6,r11
234	umlal	r2,r3,r4,r10
235	str	r0,[sp,#0]		@ future r4
236	mul	r0,r11,r8
237	ldr	r11,[sp,#24]		@ reload r11
238	adds	r2,r2,r1		@ d1+=d0>>32
239	eor	r1,r1,r1
240	adc	lr,r3,#0		@ future r6
241	str	r2,[sp,#4]		@ future r5
242
243	mul	r2,r12,r8
244	eor	r3,r3,r3
245	umlal	r0,r1,r7,r12
246	ldr	r12,[sp,#28]		@ reload r12
247	umlal	r2,r3,r7,r9
248	umlal	r0,r1,r6,r9
249	umlal	r2,r3,r6,r10
250	umlal	r0,r1,r5,r10
251	umlal	r2,r3,r5,r11
252	umlal	r0,r1,r4,r11
253	umlal	r2,r3,r4,r12
254	ldr	r4,[sp,#0]
255	mul	r8,r9,r8
256	ldr	r5,[sp,#4]
257
258	adds	r6,lr,r0		@ d2+=d1>>32
259	ldr	lr,[sp,#8]		@ reload input pointer
260	adc	r1,r1,#0
261	adds	r7,r2,r1		@ d3+=d2>>32
262	ldr	r0,[sp,#16]		@ reload end pointer
263	adc	r3,r3,#0
264	add	r8,r8,r3		@ h4+=d3>>32
265
@ Partial reduction: everything above bit 1 of h4 is folded back into
@ h0 multiplied by 5, and h4 keeps only its low two bits.
266	and	r1,r8,#-4
267	and	r8,r8,#3
268	add	r1,r1,r1,lsr#2		@ *=5
269	adds	r4,r4,r1
270	adcs	r5,r5,#0
271	adcs	r6,r6,#0
272	adcs	r7,r7,#0
273	adc	r8,r8,#0
274
275	cmp	r0,lr			@ done yet?
276	bhi	.Loop
277
278	ldr	r0,[sp,#12]
279	add	sp,sp,#32
280	stmia	r0,{r4,r5,r6,r7,r8}		@ store the result
281
282.Lno_data:
283#if	__ARM_ARCH__>=5
284	ldmia	sp!,{r3,r4,r5,r6,r7,r8,r9,r10,r11,pc}
285#else
286	ldmia	sp!,{r3,r4,r5,r6,r7,r8,r9,r10,r11,lr}
287	tst	lr,#1
288	moveq	pc,lr			@ be binary compatible with V4, yet
289.word	0xe12fff1e			@ interoperable with Thumb ISA:-)
290#endif
291.size	poly1305_blocks,.-poly1305_blocks
@ poly1305_emit -- final reduction, add the nonce, write the 16-byte tag.
@
@ In:   r0 = context (five base 2^32 hash words at [r0,#0..#16]),
@       r1 = output pointer (16 bytes written),
@       r2 = pointer to four nonce words
@ r4-r11 are saved and restored. .Lpoly1305_emit_enter is a second entry
@ point placed after the register save; poly1305_emit_neon branches to
@ it after doing its own identical stmdb.
292.type	poly1305_emit,%function
293.align	5
294poly1305_emit:
295	stmdb	sp!,{r4,r5,r6,r7,r8,r9,r10,r11}
296.Lpoly1305_emit_enter:
297
@ r8..r11 = low four words of h + 5; bit 2 of the top word is set
@ exactly when h + 5 reaches 2^130, in which case h + 5 is selected.
298	ldmia	r0,{r3,r4,r5,r6,r7}
299	adds	r8,r3,#5		@ compare to modulus
300	adcs	r9,r4,#0
301	adcs	r10,r5,#0
302	adcs	r11,r6,#0
303	adc	r7,r7,#0
304	tst	r7,#4			@ did it carry/borrow?
305
@ The conditional moves are interleaved with the nonce loads; ldr does
@ not alter the flags set by the tst above.
306#ifdef	__thumb2__
307	it	ne
308#endif
309	movne	r3,r8
310	ldr	r8,[r2,#0]
311#ifdef	__thumb2__
312	it	ne
313#endif
314	movne	r4,r9
315	ldr	r9,[r2,#4]
316#ifdef	__thumb2__
317	it	ne
318#endif
319	movne	r5,r10
320	ldr	r10,[r2,#8]
321#ifdef	__thumb2__
322	it	ne
323#endif
324	movne	r6,r11
325	ldr	r11,[r2,#12]
326
@ Add the nonce modulo 2^128 (the carry out of the top word is dropped).
327	adds	r3,r3,r8
328	adcs	r4,r4,r9
329	adcs	r5,r5,r10
330	adc	r6,r6,r11
331
@ ARMv7+ stores whole words (byte-reversed first on big-endian builds);
@ older targets write the tag one byte at a time, least significant
@ byte of each word first.
332#if __ARM_ARCH__>=7
333# ifdef __ARMEB__
334	rev	r3,r3
335	rev	r4,r4
336	rev	r5,r5
337	rev	r6,r6
338# endif
339	str	r3,[r1,#0]
340	str	r4,[r1,#4]
341	str	r5,[r1,#8]
342	str	r6,[r1,#12]
343#else
344	strb	r3,[r1,#0]
345	mov	r3,r3,lsr#8
346	strb	r4,[r1,#4]
347	mov	r4,r4,lsr#8
348	strb	r5,[r1,#8]
349	mov	r5,r5,lsr#8
350	strb	r6,[r1,#12]
351	mov	r6,r6,lsr#8
352
353	strb	r3,[r1,#1]
354	mov	r3,r3,lsr#8
355	strb	r4,[r1,#5]
356	mov	r4,r4,lsr#8
357	strb	r5,[r1,#9]
358	mov	r5,r5,lsr#8
359	strb	r6,[r1,#13]
360	mov	r6,r6,lsr#8
361
362	strb	r3,[r1,#2]
363	mov	r3,r3,lsr#8
364	strb	r4,[r1,#6]
365	mov	r4,r4,lsr#8
366	strb	r5,[r1,#10]
367	mov	r5,r5,lsr#8
368	strb	r6,[r1,#14]
369	mov	r6,r6,lsr#8
370
371	strb	r3,[r1,#3]
372	strb	r4,[r1,#7]
373	strb	r5,[r1,#11]
374	strb	r6,[r1,#15]
375#endif
376	ldmia	sp!,{r4,r5,r6,r7,r8,r9,r10,r11}
377#if	__ARM_ARCH__>=5
378	bx	lr				@ bx	lr
379#else
380	tst	lr,#1
381	moveq	pc,lr			@ be binary compatible with V4, yet
382.word	0xe12fff1e			@ interoperable with Thumb ISA:-)
383#endif
384.size	poly1305_emit,.-poly1305_emit
385#if	__ARM_MAX_ARCH__>=7
386.fpu	neon
387
@ poly1305_init_neon -- build the table of key powers used by the NEON
@ block function (called with "bl" from poly1305_blocks_neon).
@
@ In:   r0 = context; key words are read from [r0,#20..#32]
@ Out:  four 9-word entries at [r0,#48 + k*36], k = 0..3, holding
@       r^1, r^2, r^3 and r^4 in base 2^26. Each entry is stored as
@       limb0, limb1, 5*limb1, limb2, 5*limb2, limb3, 5*limb3, limb4,
@       5*limb4 (the d0..d8 register order used below).
@ Clobbers r2-r7, flags, d0-d9, q5-q9 and q15; nothing is saved here,
@ the caller preserves what it needs. Returns with "bx lr".
388.type	poly1305_init_neon,%function
389.align	5
390poly1305_init_neon:
391	ldr	r4,[r0,#20]		@ load key base 2^32
392	ldr	r5,[r0,#24]
393	ldr	r6,[r0,#28]
394	ldr	r7,[r0,#32]
395
396	and	r2,r4,#0x03ffffff	@ base 2^32 -> base 2^26
397	mov	r3,r4,lsr#26
398	mov	r4,r5,lsr#20
399	orr	r3,r3,r5,lsl#6
400	mov	r5,r6,lsr#14
401	orr	r4,r4,r6,lsl#12
402	mov	r6,r7,lsr#8
403	orr	r5,r5,r7,lsl#18
404	and	r3,r3,#0x03ffffff
405	and	r4,r4,#0x03ffffff
406	and	r5,r5,#0x03ffffff
407
@ d0,d1,d3,d5,d7 = the five limbs; d2,d4,d6,d8 = 5*limb1..5*limb4.
408	vdup.32	d0,r2			@ r^1 in both lanes
409	add	r2,r3,r3,lsl#2		@ *5
410	vdup.32	d1,r3
411	add	r3,r4,r4,lsl#2
412	vdup.32	d2,r2
413	vdup.32	d3,r4
414	add	r4,r5,r5,lsl#2
415	vdup.32	d4,r3
416	vdup.32	d5,r5
417	add	r5,r6,r6,lsl#2
418	vdup.32	d6,r4
419	vdup.32	d7,r6
420	vdup.32	d8,r5
421
@ Two passes: the first produces r^2 (stored together with r^1), the
@ second multiplies the r^1:r^2 lanes by r^2 to produce r^3 and r^4.
422	mov	r5,#2		@ counter
423
424.Lsquare_neon:
425	@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
426	@ d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4
427	@ d1 = h1*r0 + h0*r1   + h4*5*r2 + h3*5*r3 + h2*5*r4
428	@ d2 = h2*r0 + h1*r1   + h0*r2   + h4*5*r3 + h3*5*r4
429	@ d3 = h3*r0 + h2*r1   + h1*r2   + h0*r3   + h4*5*r4
430	@ d4 = h4*r0 + h3*r1   + h2*r2   + h1*r3   + h0*r4
431
432	vmull.u32	q5,d0,d0[1]
433	vmull.u32	q6,d1,d0[1]
434	vmull.u32	q7,d3,d0[1]
435	vmull.u32	q8,d5,d0[1]
436	vmull.u32	q9,d7,d0[1]
437
438	vmlal.u32	q5,d7,d2[1]
439	vmlal.u32	q6,d0,d1[1]
440	vmlal.u32	q7,d1,d1[1]
441	vmlal.u32	q8,d3,d1[1]
442	vmlal.u32	q9,d5,d1[1]
443
444	vmlal.u32	q5,d5,d4[1]
445	vmlal.u32	q6,d7,d4[1]
446	vmlal.u32	q8,d1,d3[1]
447	vmlal.u32	q7,d0,d3[1]
448	vmlal.u32	q9,d3,d3[1]
449
450	vmlal.u32	q5,d3,d6[1]
451	vmlal.u32	q8,d0,d5[1]
452	vmlal.u32	q6,d5,d6[1]
453	vmlal.u32	q7,d7,d6[1]
454	vmlal.u32	q9,d1,d5[1]
455
456	vmlal.u32	q8,d7,d8[1]
457	vmlal.u32	q5,d1,d8[1]
458	vmlal.u32	q6,d3,d8[1]
459	vmlal.u32	q7,d5,d8[1]
460	vmlal.u32	q9,d0,d7[1]
461
462	@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
463	@ lazy reduction as discussed in "NEON crypto" by D.J. Bernstein
464	@ and P. Schwabe
465	@
466	@ H0>>+H1>>+H2>>+H3>>+H4
467	@ H3>>+H4>>*5+H0>>+H1
468	@
469	@ Trivia.
470	@
471	@ Result of multiplication of n-bit number by m-bit number is
472	@ n+m bits wide. However! Even though 2^n is a n+1-bit number,
473	@ m-bit number multiplied by 2^n is still n+m bits wide.
474	@
475	@ Sum of two n-bit numbers is n+1 bits wide, sum of three - n+2,
476	@ and so is sum of four. Sum of 2^m n-m-bit numbers and n-bit
477	@ one is n+1 bits wide.
478	@
479	@ >>+ denotes Hnext += Hn>>26, Hn &= 0x3ffffff. This means that
480	@ H0, H2, H3 are guaranteed to be 26 bits wide, while H1 and H4
481	@ can be 27. However! In cases when their width exceeds 26 bits
482	@ they are limited by 2^26+2^6. This in turn means that *sum*
483	@ of the products with these values can still be viewed as sum
484	@ of 52-bit numbers as long as the amount of addends is not a
485	@ power of 2. For example,
486	@
487	@ H4 = H4*R0 + H3*R1 + H2*R2 + H1*R3 + H0 * R4,
488	@
489	@ which can't be larger than 5 * (2^26 + 2^6) * (2^26 + 2^6), or
490	@ 5 * (2^52 + 2*2^32 + 2^12), which in turn is smaller than
491	@ 8 * (2^52) or 2^55. However, the value is then multiplied by
492	@ 5, so we should be looking at 5 * 5 * (2^52 + 2^33 + 2^12),
493	@ which is less than 32 * (2^52) or 2^57. And when processing
494	@ data we are looking at triple as many addends...
495	@
496	@ In key setup procedure pre-reduced H0 is limited by 5*4+1 and
497	@ 5*H4 - by 5*5 52-bit addends, or 57 bits. But when hashing the
498	@ input H0 is limited by (5*4+1)*3 addends, or 58 bits, while
499	@ 5*H4 by 5*5*3, or 59[!] bits. How is this relevant? vmlal.u32
500	@ instruction accepts 2x32-bit input and writes 2x64-bit result.
501	@ This means that result of reduction have to be compressed upon
502	@ loop wrap-around. This can be done in the process of reduction
503	@ to minimize amount of instructions [as well as amount of
504	@ 128-bit instructions, which benefits low-end processors], but
505	@ one has to watch for H2 (which is narrower than H0) and 5*H4
506	@ not being wider than 58 bits, so that result of right shift
507	@ by 26 bits fits in 32 bits. This is also useful on x86,
508	@ because it allows to use paddd in place for paddq, which
509	@ benefits Atom, where paddq is ridiculously slow.
510
511	vshr.u64	q15,q8,#26
512	vmovn.i64	d16,q8
513	vshr.u64	q4,q5,#26
514	vmovn.i64	d10,q5
515	vadd.i64	q9,q9,q15		@ h3 -> h4
516	vbic.i32	d16,#0xfc000000	@ &=0x03ffffff
517	vadd.i64	q6,q6,q4		@ h0 -> h1
518	vbic.i32	d10,#0xfc000000
519
520	vshrn.u64	d30,q9,#26
521	vmovn.i64	d18,q9
522	vshr.u64	q4,q6,#26
523	vmovn.i64	d12,q6
524	vadd.i64	q7,q7,q4		@ h1 -> h2
525	vbic.i32	d18,#0xfc000000
526	vbic.i32	d12,#0xfc000000
527
@ The carry out of h4 is added to h0 once and then once more shifted
@ left by two, i.e. multiplied by 5 in total.
528	vadd.i32	d10,d10,d30
529	vshl.u32	d30,d30,#2
530	vshrn.u64	d8,q7,#26
531	vmovn.i64	d14,q7
532	vadd.i32	d10,d10,d30	@ h4 -> h0
533	vadd.i32	d16,d16,d8	@ h2 -> h3
534	vbic.i32	d14,#0xfc000000
535
536	vshr.u32	d30,d10,#26
537	vbic.i32	d10,#0xfc000000
538	vshr.u32	d8,d16,#26
539	vbic.i32	d16,#0xfc000000
540	vadd.i32	d12,d12,d30	@ h0 -> h1
541	vadd.i32	d18,d18,d8	@ h3 -> h4
542
543	subs	r5,r5,#1
544	beq	.Lsquare_break_neon
545
@ First pass only: r6 -> table entry 0, r7 -> table entry 1.
546	add	r6,r0,#(48+0*9*4)
547	add	r7,r0,#(48+1*9*4)
548
@ After the transposes lane 0 of d0/d1/d3/d5/d7 still holds the old
@ limbs and lane 1 holds the freshly reduced product.
549	vtrn.32	d0,d10		@ r^2:r^1
550	vtrn.32	d3,d14
551	vtrn.32	d5,d16
552	vtrn.32	d1,d12
553	vtrn.32	d7,d18
554
555	vshl.u32	d4,d3,#2		@ *5
556	vshl.u32	d6,d5,#2
557	vshl.u32	d2,d1,#2
558	vshl.u32	d8,d7,#2
559	vadd.i32	d4,d4,d3
560	vadd.i32	d2,d2,d1
561	vadd.i32	d6,d6,d5
562	vadd.i32	d8,d8,d7
563
@ Lane 0 goes to the entry at r6, lane 1 to the entry at r7.
564	vst4.32	{d0[0],d1[0],d2[0],d3[0]},[r6]!
565	vst4.32	{d0[1],d1[1],d2[1],d3[1]},[r7]!
566	vst4.32	{d4[0],d5[0],d6[0],d7[0]},[r6]!
567	vst4.32	{d4[1],d5[1],d6[1],d7[1]},[r7]!
568	vst1.32	{d8[0]},[r6,:32]
569	vst1.32	{d8[1]},[r7,:32]
570
571	b	.Lsquare_neon
572
573.align	4
574.Lsquare_break_neon:
@ Second pass result: r6 -> table entry 2, r7 -> table entry 3.
575	add	r6,r0,#(48+2*4*9)
576	add	r7,r0,#(48+3*4*9)
577
578	vmov	d0,d10		@ r^4:r^3
579	vshl.u32	d2,d12,#2		@ *5
580	vmov	d1,d12
581	vshl.u32	d4,d14,#2
582	vmov	d3,d14
583	vshl.u32	d6,d16,#2
584	vmov	d5,d16
585	vshl.u32	d8,d18,#2
586	vmov	d7,d18
587	vadd.i32	d2,d2,d12
588	vadd.i32	d4,d4,d14
589	vadd.i32	d6,d6,d16
590	vadd.i32	d8,d8,d18
591
592	vst4.32	{d0[0],d1[0],d2[0],d3[0]},[r6]!
593	vst4.32	{d0[1],d1[1],d2[1],d3[1]},[r7]!
594	vst4.32	{d4[0],d5[0],d6[0],d7[0]},[r6]!
595	vst4.32	{d4[1],d5[1],d6[1],d7[1]},[r7]!
596	vst1.32	{d8[0]},[r6]
597	vst1.32	{d8[1]},[r7]
598
599	bx	lr				@ bx	lr
600.size	poly1305_init_neon,.-poly1305_init_neon
601
@ poly1305_blocks_neon -- NEON (base 2^26) block function.
@
@ In:   r0 = context, r1 = input pointer, r2 = length in bytes (rounded
@       down to a multiple of 16 here), r3 = pad bit (shifted to bit 24
@       so that it lands in the top 26-bit limb)
@ Out:  hash stored as five 32-bit limbs at [r0,#0..#16] with
@       is_base2_26 ([r0,#36]) set to 1; returns at once when the rounded
@       length is zero.
@ Dispatch: inputs shorter than 64 bytes whose hash is still in base
@ 2^32 are handed to the scalar code at .Lpoly1305_blocks.
@ Register roles after .Lbase2_32_neon:
@       r1 = inp[0:1] pointer, r4 = inp[2:3] pointer (or .Lzeros),
@       r5 = .Lzeros, r6/r7 = pointers into the key-power table,
@       q5-q9 = accumulators / hash limbs, q10-q14 = input limbs.
@ r4-r7 and d8-d15 are saved on the stack and restored on the NEON path.
602.type	poly1305_blocks_neon,%function
603.align	5
604poly1305_blocks_neon:
605	ldr	ip,[r0,#36]		@ is_base2_26
606	ands	r2,r2,#-16
607	beq	.Lno_data_neon
608
609	cmp	r2,#64
610	bhs	.Lenter_neon
611	tst	ip,ip			@ is_base2_26?
612	beq	.Lpoly1305_blocks
613
614.Lenter_neon:
615	stmdb	sp!,{r4,r5,r6,r7}
616	vstmdb	sp!,{d8,d9,d10,d11,d12,d13,d14,d15}		@ ABI specification says so
617
618	tst	ip,ip			@ is_base2_26?
619	bne	.Lbase2_26_neon
620
@ First NEON call for this context: build the key-power table (r1-r3
@ and lr are preserved around the call), then convert the base 2^32
@ hash to five 26-bit limbs in lane 0 of d10/d12/d14/d16/d18.
621	stmdb	sp!,{r1,r2,r3,lr}
622	bl	poly1305_init_neon
623
624	ldr	r4,[r0,#0]		@ load hash value base 2^32
625	ldr	r5,[r0,#4]
626	ldr	r6,[r0,#8]
627	ldr	r7,[r0,#12]
628	ldr	ip,[r0,#16]
629
630	and	r2,r4,#0x03ffffff	@ base 2^32 -> base 2^26
631	mov	r3,r4,lsr#26
632	veor	d10,d10,d10
633	mov	r4,r5,lsr#20
634	orr	r3,r3,r5,lsl#6
635	veor	d12,d12,d12
636	mov	r5,r6,lsr#14
637	orr	r4,r4,r6,lsl#12
638	veor	d14,d14,d14
639	mov	r6,r7,lsr#8
640	orr	r5,r5,r7,lsl#18
641	veor	d16,d16,d16
642	and	r3,r3,#0x03ffffff
643	orr	r6,r6,ip,lsl#24
644	veor	d18,d18,d18
645	and	r4,r4,#0x03ffffff
646	mov	r1,#1
647	and	r5,r5,#0x03ffffff
648	str	r1,[r0,#36]		@ is_base2_26
649
650	vmov.32	d10[0],r2
651	vmov.32	d12[0],r3
652	vmov.32	d14[0],r4
653	vmov.32	d16[0],r5
654	vmov.32	d18[0],r6
655	adr	r5,.Lzeros
656
657	ldmia	sp!,{r1,r2,r3,lr}
658	b	.Lbase2_32_neon
659
660.align	4
661.Lbase2_26_neon:
662	@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
663	@ load hash value
664
665	veor	d10,d10,d10
666	veor	d12,d12,d12
667	veor	d14,d14,d14
668	veor	d16,d16,d16
669	veor	d18,d18,d18
670	vld4.32	{d10[0],d12[0],d14[0],d16[0]},[r0]!
671	adr	r5,.Lzeros
672	vld1.32	{d18[0]},[r0]
673	sub	r0,r0,#16		@ rewind
674
675.Lbase2_32_neon:
@ An odd number of 32-byte pairs (length & 31 != 0) is handled by taking
@ one single block first; otherwise go straight to .Leven.
676	add	r4,r1,#32
677	mov	r3,r3,lsl#24
678	tst	r2,#31
679	beq	.Leven
680
681	vld4.32	{d20[0],d22[0],d24[0],d26[0]},[r1]!
682	vmov.32	d28[0],r3
683	sub	r2,r2,#16
684	add	r4,r1,#32
685
686# ifdef	__ARMEB__
687	vrev32.8	q10,q10
688	vrev32.8	q13,q13
689	vrev32.8	q11,q11
690	vrev32.8	q12,q12
691# endif
692	vsri.u32	d28,d26,#8	@ base 2^32 -> base 2^26
693	vshl.u32	d26,d26,#18
694
695	vsri.u32	d26,d24,#14
696	vshl.u32	d24,d24,#12
697	vadd.i32	d29,d28,d18	@ add hash value and move to #hi
698
699	vbic.i32	d26,#0xfc000000
700	vsri.u32	d24,d22,#20
701	vshl.u32	d22,d22,#6
702
703	vbic.i32	d24,#0xfc000000
704	vsri.u32	d22,d20,#26
705	vadd.i32	d27,d26,d16
706
707	vbic.i32	d20,#0xfc000000
708	vbic.i32	d22,#0xfc000000
709	vadd.i32	d25,d24,d14
710
711	vadd.i32	d21,d20,d10
712	vadd.i32	d23,d22,d12
713
@ r7 -> .Lzeros and r6 -> table entry 0. "cmp r2,r2" forces Z=1 so that
@ .Long_tail skips its addne instructions and takes the
@ "beq .Lshort_tail" exit.
714	mov	r7,r5
715	add	r6,r0,#48
716
717	cmp	r2,r2
718	b	.Long_tail
719
720.align	4
721.Leven:
@ r2 becomes "bytes remaining after this group of four blocks"; when it
@ goes negative the inp[2:3] pointer is redirected to .Lzeros.
722	subs	r2,r2,#64
723	it	lo
724	movlo	r4,r5
725
726	vmov.i32	q14,#1<<24		@ padbit, yes, always
727	vld4.32	{d20,d22,d24,d26},[r1]	@ inp[0:1]
728	add	r1,r1,#64
729	vld4.32	{d21,d23,d25,d27},[r4]	@ inp[2:3] (or 0)
730	add	r4,r4,#64
731	itt	hi
732	addhi	r7,r0,#(48+1*9*4)
733	addhi	r6,r0,#(48+3*9*4)
734
735# ifdef	__ARMEB__
736	vrev32.8	q10,q10
737	vrev32.8	q13,q13
738	vrev32.8	q11,q11
739	vrev32.8	q12,q12
740# endif
741	vsri.u32	q14,q13,#8		@ base 2^32 -> base 2^26
742	vshl.u32	q13,q13,#18
743
744	vsri.u32	q13,q12,#14
745	vshl.u32	q12,q12,#12
746
747	vbic.i32	q13,#0xfc000000
748	vsri.u32	q12,q11,#20
749	vshl.u32	q11,q11,#6
750
751	vbic.i32	q12,#0xfc000000
752	vsri.u32	q11,q10,#26
753
754	vbic.i32	q10,#0xfc000000
755	vbic.i32	q11,#0xfc000000
756
@ Flags are still those of "subs r2,r2,#64" at the top of .Leven.
757	bls	.Lskip_loop
758
759	vld4.32	{d0[1],d1[1],d2[1],d3[1]},[r7]!	@ load r^2
760	vld4.32	{d0[0],d1[0],d2[0],d3[0]},[r6]!	@ load r^4
761	vld4.32	{d4[1],d5[1],d6[1],d7[1]},[r7]!
762	vld4.32	{d4[0],d5[0],d6[0],d7[0]},[r6]!
763	b	.Loop_neon
764
765.align	5
766.Loop_neon:
767	@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
768	@ ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2
769	@ ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^3+inp[7]*r
770	@   ___________________/
771	@ ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2+inp[8])*r^2
772	@ ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^4+inp[7]*r^2+inp[9])*r
773	@   ___________________/ ____________________/
774	@
775	@ Note that we start with inp[2:3]*r^2. This is because it
776	@ doesn't depend on reduction in previous iteration.
777	@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
778	@ d4 = h4*r0 + h3*r1   + h2*r2   + h1*r3   + h0*r4
779	@ d3 = h3*r0 + h2*r1   + h1*r2   + h0*r3   + h4*5*r4
780	@ d2 = h2*r0 + h1*r1   + h0*r2   + h4*5*r3 + h3*5*r4
781	@ d1 = h1*r0 + h0*r1   + h4*5*r2 + h3*5*r3 + h2*5*r4
782	@ d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4
783
784	@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
785	@ inp[2:3]*r^2
786
787	vadd.i32	d24,d24,d14	@ accumulate inp[0:1]
788	vmull.u32	q7,d25,d0[1]
789	vadd.i32	d20,d20,d10
790	vmull.u32	q5,d21,d0[1]
791	vadd.i32	d26,d26,d16
792	vmull.u32	q8,d27,d0[1]
793	vmlal.u32	q7,d23,d1[1]
794	vadd.i32	d22,d22,d12
795	vmull.u32	q6,d23,d0[1]
796
@ The subs below sets the flags consumed by "bhi .Loop_neon" at the
@ bottom of the loop; no instruction in between changes them.
797	vadd.i32	d28,d28,d18
798	vmull.u32	q9,d29,d0[1]
799	subs	r2,r2,#64
800	vmlal.u32	q5,d29,d2[1]
801	it	lo
802	movlo	r4,r5
803	vmlal.u32	q8,d25,d1[1]
804	vld1.32	d8[1],[r7,:32]
805	vmlal.u32	q6,d21,d1[1]
806	vmlal.u32	q9,d27,d1[1]
807
808	vmlal.u32	q5,d27,d4[1]
809	vmlal.u32	q8,d23,d3[1]
810	vmlal.u32	q9,d25,d3[1]
811	vmlal.u32	q6,d29,d4[1]
812	vmlal.u32	q7,d21,d3[1]
813
814	vmlal.u32	q8,d21,d5[1]
815	vmlal.u32	q5,d25,d6[1]
816	vmlal.u32	q9,d23,d5[1]
817	vmlal.u32	q6,d27,d6[1]
818	vmlal.u32	q7,d29,d6[1]
819
820	vmlal.u32	q8,d29,d8[1]
821	vmlal.u32	q5,d23,d8[1]
822	vmlal.u32	q9,d21,d7[1]
823	vmlal.u32	q6,d25,d8[1]
824	vmlal.u32	q7,d27,d8[1]
825
826	vld4.32	{d21,d23,d25,d27},[r4]	@ inp[2:3] (or 0)
827	add	r4,r4,#64
828
829	@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
830	@ (hash+inp[0:1])*r^4 and accumulate
831
832	vmlal.u32	q8,d26,d0[0]
833	vmlal.u32	q5,d20,d0[0]
834	vmlal.u32	q9,d28,d0[0]
835	vmlal.u32	q6,d22,d0[0]
836	vmlal.u32	q7,d24,d0[0]
837	vld1.32	d8[0],[r6,:32]
838
839	vmlal.u32	q8,d24,d1[0]
840	vmlal.u32	q5,d28,d2[0]
841	vmlal.u32	q9,d26,d1[0]
842	vmlal.u32	q6,d20,d1[0]
843	vmlal.u32	q7,d22,d1[0]
844
845	vmlal.u32	q8,d22,d3[0]
846	vmlal.u32	q5,d26,d4[0]
847	vmlal.u32	q9,d24,d3[0]
848	vmlal.u32	q6,d28,d4[0]
849	vmlal.u32	q7,d20,d3[0]
850
851	vmlal.u32	q8,d20,d5[0]
852	vmlal.u32	q5,d24,d6[0]
853	vmlal.u32	q9,d22,d5[0]
854	vmlal.u32	q6,d26,d6[0]
855	vmlal.u32	q8,d28,d8[0]
856
857	vmlal.u32	q7,d28,d6[0]
858	vmlal.u32	q5,d22,d8[0]
859	vmlal.u32	q9,d20,d7[0]
860	vmov.i32	q14,#1<<24		@ padbit, yes, always
861	vmlal.u32	q6,d24,d8[0]
862	vmlal.u32	q7,d26,d8[0]
863
864	vld4.32	{d20,d22,d24,d26},[r1]	@ inp[0:1]
865	add	r1,r1,#64
866# ifdef	__ARMEB__
867	vrev32.8	q10,q10
868	vrev32.8	q11,q11
869	vrev32.8	q12,q12
870	vrev32.8	q13,q13
871# endif
872
873	@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
874	@ lazy reduction interleaved with base 2^32 -> base 2^26 of
875	@ inp[0:3] previously loaded to q10-q13 and smashed to q10-q14.
876
877	vshr.u64	q15,q8,#26
878	vmovn.i64	d16,q8
879	vshr.u64	q4,q5,#26
880	vmovn.i64	d10,q5
881	vadd.i64	q9,q9,q15		@ h3 -> h4
882	vbic.i32	d16,#0xfc000000
883	vsri.u32	q14,q13,#8		@ base 2^32 -> base 2^26
884	vadd.i64	q6,q6,q4		@ h0 -> h1
885	vshl.u32	q13,q13,#18
886	vbic.i32	d10,#0xfc000000
887
888	vshrn.u64	d30,q9,#26
889	vmovn.i64	d18,q9
890	vshr.u64	q4,q6,#26
891	vmovn.i64	d12,q6
892	vadd.i64	q7,q7,q4		@ h1 -> h2
893	vsri.u32	q13,q12,#14
894	vbic.i32	d18,#0xfc000000
895	vshl.u32	q12,q12,#12
896	vbic.i32	d12,#0xfc000000
897
898	vadd.i32	d10,d10,d30
899	vshl.u32	d30,d30,#2
900	vbic.i32	q13,#0xfc000000
901	vshrn.u64	d8,q7,#26
902	vmovn.i64	d14,q7
903	vaddl.u32	q5,d10,d30	@ h4 -> h0 [widen for a sec]
904	vsri.u32	q12,q11,#20
905	vadd.i32	d16,d16,d8	@ h2 -> h3
906	vshl.u32	q11,q11,#6
907	vbic.i32	d14,#0xfc000000
908	vbic.i32	q12,#0xfc000000
909
910	vshrn.u64	d30,q5,#26		@ re-narrow
911	vmovn.i64	d10,q5
912	vsri.u32	q11,q10,#26
913	vbic.i32	q10,#0xfc000000
914	vshr.u32	d8,d16,#26
915	vbic.i32	d16,#0xfc000000
916	vbic.i32	d10,#0xfc000000
917	vadd.i32	d12,d12,d30	@ h0 -> h1
918	vadd.i32	d18,d18,d8	@ h3 -> h4
919	vbic.i32	q11,#0xfc000000
920
921	bhi	.Loop_neon
922
923.Lskip_loop:
924	@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
925	@ multiply (inp[0:1]+hash) or inp[2:3] by r^2:r^1
926
@ r7 -> table entry 0 (r^1), r6 -> table entry 1 (r^2).
@ r2 + 32 is zero only when r2 was -32. In that case the code falls
@ through with Z=1 and only the r^2:r^1 multiplication is performed.
@ Otherwise r2 is cleared and .Long_tail is entered with NE, which
@ also runs the r^4:r^3 part.
927	add	r7,r0,#(48+0*9*4)
928	add	r6,r0,#(48+1*9*4)
929	adds	r2,r2,#32
930	it	ne
931	movne	r2,#0
932	bne	.Long_tail
933
934	vadd.i32	d25,d24,d14	@ add hash value and move to #hi
935	vadd.i32	d21,d20,d10
936	vadd.i32	d27,d26,d16
937	vadd.i32	d23,d22,d12
938	vadd.i32	d29,d28,d18
939
940.Long_tail:
941	vld4.32	{d0[1],d1[1],d2[1],d3[1]},[r7]!	@ load r^1
942	vld4.32	{d0[0],d1[0],d2[0],d3[0]},[r6]!	@ load r^2
943
944	vadd.i32	d24,d24,d14	@ can be redundant
945	vmull.u32	q7,d25,d0
946	vadd.i32	d20,d20,d10
947	vmull.u32	q5,d21,d0
948	vadd.i32	d26,d26,d16
949	vmull.u32	q8,d27,d0
950	vadd.i32	d22,d22,d12
951	vmull.u32	q6,d23,d0
952	vadd.i32	d28,d28,d18
953	vmull.u32	q9,d29,d0
954
955	vmlal.u32	q5,d29,d2
956	vld4.32	{d4[1],d5[1],d6[1],d7[1]},[r7]!
957	vmlal.u32	q8,d25,d1
958	vld4.32	{d4[0],d5[0],d6[0],d7[0]},[r6]!
959	vmlal.u32	q6,d21,d1
960	vmlal.u32	q9,d27,d1
961	vmlal.u32	q7,d23,d1
962
963	vmlal.u32	q8,d23,d3
964	vld1.32	d8[1],[r7,:32]
965	vmlal.u32	q5,d27,d4
966	vld1.32	d8[0],[r6,:32]
967	vmlal.u32	q9,d25,d3
968	vmlal.u32	q6,d29,d4
969	vmlal.u32	q7,d21,d3
970
@ The NE/EQ state tested here and at "beq .Lshort_tail" was set before
@ .Long_tail was entered; NEON instructions leave the flags alone.
971	vmlal.u32	q8,d21,d5
972	it	ne
973	addne	r7,r0,#(48+2*9*4)
974	vmlal.u32	q5,d25,d6
975	it	ne
976	addne	r6,r0,#(48+3*9*4)
977	vmlal.u32	q9,d23,d5
978	vmlal.u32	q6,d27,d6
979	vmlal.u32	q7,d29,d6
980
@ q0 is turned into the 64-bit lane mask 0x03ffffff (all-ones >> 38)
@ used by the non-narrowing reduction in .Lshort_tail.
981	vmlal.u32	q8,d29,d8
982	vorn	q0,q0,q0	@ all-ones, can be redundant
983	vmlal.u32	q5,d23,d8
984	vshr.u64	q0,q0,#38
985	vmlal.u32	q9,d21,d7
986	vmlal.u32	q6,d25,d8
987	vmlal.u32	q7,d27,d8
988
989	beq	.Lshort_tail
990
991	@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
992	@ (hash+inp[0:1])*r^4:r^3 and accumulate
993
994	vld4.32	{d0[1],d1[1],d2[1],d3[1]},[r7]!	@ load r^3
995	vld4.32	{d0[0],d1[0],d2[0],d3[0]},[r6]!	@ load r^4
996
997	vmlal.u32	q7,d24,d0
998	vmlal.u32	q5,d20,d0
999	vmlal.u32	q8,d26,d0
1000	vmlal.u32	q6,d22,d0
1001	vmlal.u32	q9,d28,d0
1002
1003	vmlal.u32	q5,d28,d2
1004	vld4.32	{d4[1],d5[1],d6[1],d7[1]},[r7]!
1005	vmlal.u32	q8,d24,d1
1006	vld4.32	{d4[0],d5[0],d6[0],d7[0]},[r6]!
1007	vmlal.u32	q6,d20,d1
1008	vmlal.u32	q9,d26,d1
1009	vmlal.u32	q7,d22,d1
1010
1011	vmlal.u32	q8,d22,d3
1012	vld1.32	d8[1],[r7,:32]
1013	vmlal.u32	q5,d26,d4
1014	vld1.32	d8[0],[r6,:32]
1015	vmlal.u32	q9,d24,d3
1016	vmlal.u32	q6,d28,d4
1017	vmlal.u32	q7,d20,d3
1018
1019	vmlal.u32	q8,d20,d5
1020	vmlal.u32	q5,d24,d6
1021	vmlal.u32	q9,d22,d5
1022	vmlal.u32	q6,d26,d6
1023	vmlal.u32	q7,d28,d6
1024
1025	vmlal.u32	q8,d28,d8
1026	vorn	q0,q0,q0	@ all-ones
1027	vmlal.u32	q5,d22,d8
1028	vshr.u64	q0,q0,#38
1029	vmlal.u32	q9,d20,d7
1030	vmlal.u32	q6,d24,d8
1031	vmlal.u32	q7,d26,d8
1032
1033.Lshort_tail:
1034	@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
1035	@ horizontal addition
1036
@ The two 64-bit lanes of each accumulator are summed into its low
@ half (d10, d12, d14, d16, d18).
1037	vadd.i64	d16,d16,d17
1038	vadd.i64	d10,d10,d11
1039	vadd.i64	d18,d18,d19
1040	vadd.i64	d12,d12,d13
1041	vadd.i64	d14,d14,d15
1042
1043	@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
1044	@ lazy reduction, but without narrowing
1045
1046	vshr.u64	q15,q8,#26
1047	vand.i64	q8,q8,q0
1048	vshr.u64	q4,q5,#26
1049	vand.i64	q5,q5,q0
1050	vadd.i64	q9,q9,q15		@ h3 -> h4
1051	vadd.i64	q6,q6,q4		@ h0 -> h1
1052
1053	vshr.u64	q15,q9,#26
1054	vand.i64	q9,q9,q0
1055	vshr.u64	q4,q6,#26
1056	vand.i64	q6,q6,q0
1057	vadd.i64	q7,q7,q4		@ h1 -> h2
1058
1059	vadd.i64	q5,q5,q15
1060	vshl.u64	q15,q15,#2
1061	vshr.u64	q4,q7,#26
1062	vand.i64	q7,q7,q0
1063	vadd.i64	q5,q5,q15		@ h4 -> h0
1064	vadd.i64	q8,q8,q4		@ h2 -> h3
1065
1066	vshr.u64	q15,q5,#26
1067	vand.i64	q5,q5,q0
1068	vshr.u64	q4,q8,#26
1069	vand.i64	q8,q8,q0
1070	vadd.i64	q6,q6,q15		@ h0 -> h1
1071	vadd.i64	q9,q9,q4		@ h3 -> h4
1072
@ Non-zero r2 here means data is still pending after the single-block
@ path taken from .Lbase2_32_neon; continue with the even-length code.
1073	cmp	r2,#0
1074	bne	.Leven
1075
1076	@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
1077	@ store hash value
1078
1079	vst4.32	{d10[0],d12[0],d14[0],d16[0]},[r0]!
1080	vst1.32	{d18[0]},[r0]
1081
1082	vldmia	sp!,{d8,d9,d10,d11,d12,d13,d14,d15}			@ epilogue
1083	ldmia	sp!,{r4,r5,r6,r7}
1084.Lno_data_neon:
1085	bx	lr					@ bx	lr
1086.size	poly1305_blocks_neon,.-poly1305_blocks_neon
1087
@ poly1305_emit_neon -- tag generation for a hash kept in base 2^26.
@
@ In:   r0 = context, r1 = output pointer (16 bytes written),
@       r2 = pointer to four nonce words
@ If is_base2_26 ([r0,#36]) is zero the hash is still in base 2^32 and
@ control passes to .Lpoly1305_emit_enter inside poly1305_emit, which
@ expects r4-r11 to be already pushed (done below before the branch).
@ Otherwise the five 26-bit limbs are repacked into 32-bit words
@ r3..r7 (h0..h4), partially reduced, compared against 2^130-5, the
@ nonce is added and the low 128 bits are stored.
1088.type	poly1305_emit_neon,%function
1089.align	5
1090poly1305_emit_neon:
1091	ldr	ip,[r0,#36]		@ is_base2_26
1092
1093	stmdb	sp!,{r4,r5,r6,r7,r8,r9,r10,r11}
1094
1095	tst	ip,ip
1096	beq	.Lpoly1305_emit_enter
1097
1098	ldmia	r0,{r3,r4,r5,r6,r7}
1099	eor	r8,r8,r8
1100
1101	adds	r3,r3,r4,lsl#26	@ base 2^26 -> base 2^32
1102	mov	r4,r4,lsr#6
1103	adcs	r4,r4,r5,lsl#20
1104	mov	r5,r5,lsr#12
1105	adcs	r5,r5,r6,lsl#14
1106	mov	r6,r6,lsr#18
1107	adcs	r6,r6,r7,lsl#8
1108	adc	r7,r8,r7,lsr#24	@ can be partially reduced ...
1109
@ r7 (h4) holds bits 128 and up. Bits 130 and up are folded back into
@ h0 multiplied by 5, and h4 must keep its OWN low two bits (bits
@ 128..129). The mask has to be applied to r7, not to r6 (h3):
@ otherwise the carry test after "compare to modulus" below looks at
@ unrelated bits of h3. This matches the scalar reduction in
@ poly1305_blocks ("and r8,r8,#3").
1110	and	r8,r7,#-4		@ ... so reduce
1111	and	r7,r7,#3
1112	add	r8,r8,r8,lsr#2	@ *= 5
1113	adds	r3,r3,r8
1114	adcs	r4,r4,#0
1115	adcs	r5,r5,#0
1116	adcs	r6,r6,#0
1117	adc	r7,r7,#0
1118
@ r8..r11 = low four words of h + 5; bit 2 of r7 after the carry chain
@ is set exactly when h + 5 reaches 2^130, i.e. h >= 2^130-5.
1119	adds	r8,r3,#5		@ compare to modulus
1120	adcs	r9,r4,#0
1121	adcs	r10,r5,#0
1122	adcs	r11,r6,#0
1123	adc	r7,r7,#0
1124	tst	r7,#4			@ did it carry/borrow?
1125
@ Conditional selects interleaved with the nonce loads (ldr leaves the
@ flags from the tst untouched).
1126	it	ne
1127	movne	r3,r8
1128	ldr	r8,[r2,#0]
1129	it	ne
1130	movne	r4,r9
1131	ldr	r9,[r2,#4]
1132	it	ne
1133	movne	r5,r10
1134	ldr	r10,[r2,#8]
1135	it	ne
1136	movne	r6,r11
1137	ldr	r11,[r2,#12]
1138
1139	adds	r3,r3,r8		@ accumulate nonce
1140	adcs	r4,r4,r9
1141	adcs	r5,r5,r10
1142	adc	r6,r6,r11
1143
1144# ifdef __ARMEB__
1145	rev	r3,r3
1146	rev	r4,r4
1147	rev	r5,r5
1148	rev	r6,r6
1149# endif
1150	str	r3,[r1,#0]		@ store the result
1151	str	r4,[r1,#4]
1152	str	r5,[r1,#8]
1153	str	r6,[r1,#12]
1154
1155	ldmia	sp!,{r4,r5,r6,r7,r8,r9,r10,r11}
1156	bx	lr				@ bx	lr
1157.size	poly1305_emit_neon,.-poly1305_emit_neon
1158
@ Constant data used by the NEON code (kept inside the
@ __ARM_MAX_ARCH__>=7 section opened before poly1305_init_neon).
@ .Lzeros: 16 zero words (64 bytes). poly1305_blocks_neon points r4/r7
@ here when there is no real inp[2:3] data or key power to load.
1159.align	5
1160.Lzeros:
1161.long	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
@ Offset from .Lpoly1305_init to OPENSSL_armcap_P; poly1305_init adds
@ it to the run-time address of that label to find the capability word.
1162.LOPENSSL_armcap:
1163.word	OPENSSL_armcap_P-.Lpoly1305_init
1164#endif
@ NUL-terminated ASCII identification string:
@ "Poly1305 for ARMv4/NEON, CRYPTOGAMS by <appro@openssl.org>"
1165.byte	80,111,108,121,49,51,48,53,32,102,111,114,32,65,82,77,118,52,47,78,69,79,78,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
1166.align	2
1167.align	2
@ Common-symbol declaration of the 4-byte, 4-byte-aligned capability
@ word tested against ARMV7_NEON in poly1305_init.
1168#if	__ARM_MAX_ARCH__>=7
1169.comm	OPENSSL_armcap_P,4,4
1170#endif
1171