xref: /freebsd/sys/crypto/openssl/arm/poly1305-armv4.S (revision 59144db3fca192c4637637dfe6b5a5d98632cd47)
1/* Do not modify. This file is auto-generated from poly1305-armv4.pl. */
2#include "arm_arch.h"
3
4#if defined(__thumb2__)
5.syntax	unified
6.thumb
7#else
8.code	32
9#endif
10
11.text
12
13.globl	poly1305_emit
14.globl	poly1305_blocks
15.globl	poly1305_init
16.type	poly1305_init,%function
17.align	5
@ poly1305_init: reset the Poly1305 context and install the clamped key.
@
@ In:   r0 = context, r1 = 16-byte key (may be NULL), r2 = function table
@ Out:  r0 = 1 when two routine addresses were stored at [r2]
@            (only in __ARM_MAX_ARCH__>=7 builds with a non-NULL key),
@            otherwise 0
@
@ Context words written here:
@   offsets 0-16   hash value, zeroed
@   offset  36     is_base2_26, zeroed
@   offsets 20-32  key words, assembled from bytes with the least
@                  significant byte first; word 0 is masked with
@                  0x0fffffff, words 1-3 with 0x0ffffffc
@ With a NULL key only the zeroing is done.
@
@ r4-r11 are pushed on entry and popped again at .Lno_key.
18poly1305_init:
19.Lpoly1305_init:
20	stmdb	sp!,{r4,r5,r6,r7,r8,r9,r10,r11}
21
22	eor	r3,r3,r3
23	cmp	r1,#0
24	str	r3,[r0,#0]		@ zero hash value
25	str	r3,[r0,#4]
26	str	r3,[r0,#8]
27	str	r3,[r0,#12]
28	str	r3,[r0,#16]
29	str	r3,[r0,#36]		@ is_base2_26
30	add	r0,r0,#20
31
@ The flags are still those of cmp r1,#0 above: a NULL key returns 0 here.
32#ifdef	__thumb2__
33	it	eq
34#endif
35	moveq	r0,#0
36	beq	.Lno_key
37
38#if	__ARM_MAX_ARCH__>=7
39	adr	r11,.Lpoly1305_init
40	ldr	r12,.LOPENSSL_armcap
41#endif
@ r10 = 0x0fffffff (mask for key word 0), r3 = 0x0ffffffc (words 1-3).
42	ldrb	r4,[r1,#0]
43	mov	r10,#0x0fffffff
44	ldrb	r5,[r1,#1]
45	and	r3,r10,#-4		@ 0x0ffffffc
46	ldrb	r6,[r1,#2]
47	ldrb	r7,[r1,#3]
48	orr	r4,r4,r5,lsl#8
49	ldrb	r5,[r1,#4]
50	orr	r4,r4,r6,lsl#16
51	ldrb	r6,[r1,#5]
52	orr	r4,r4,r7,lsl#24
53	ldrb	r7,[r1,#6]
54	and	r4,r4,r10
55
56#if	__ARM_MAX_ARCH__>=7
57# if !defined(_WIN32)
58	ldr	r12,[r11,r12]		@ OPENSSL_armcap_P
59# endif
60# if defined(__APPLE__) || defined(_WIN32)
61	ldr	r12,[r12]
62# endif
63#endif
64	ldrb	r8,[r1,#7]
65	orr	r5,r5,r6,lsl#8
66	ldrb	r6,[r1,#8]
67	orr	r5,r5,r7,lsl#16
68	ldrb	r7,[r1,#9]
69	orr	r5,r5,r8,lsl#24
70	ldrb	r8,[r1,#10]
71	and	r5,r5,r3
72
@ Select the routine pair: NEON versions when the ARMV7_NEON bit is set,
@ scalar versions otherwise.  r11 ends up as the blocks routine and r12
@ as the emit routine.
73#if	__ARM_MAX_ARCH__>=7
74	tst	r12,#ARMV7_NEON		@ check for NEON
75# ifdef	__thumb2__
76	adr	r9,.Lpoly1305_blocks_neon
77	adr	r11,.Lpoly1305_blocks
78	adr	r12,.Lpoly1305_emit
79	adr	r10,.Lpoly1305_emit_neon
80	itt	ne
81	movne	r11,r9
82	movne	r12,r10
83	orr	r11,r11,#1	@ thumb-ify address
84	orr	r12,r12,#1
85# else
86	addeq	r12,r11,#(.Lpoly1305_emit-.Lpoly1305_init)
87	addne	r12,r11,#(.Lpoly1305_emit_neon-.Lpoly1305_init)
88	addeq	r11,r11,#(.Lpoly1305_blocks-.Lpoly1305_init)
89	addne	r11,r11,#(.Lpoly1305_blocks_neon-.Lpoly1305_init)
90# endif
91#endif
92	ldrb	r9,[r1,#11]
93	orr	r6,r6,r7,lsl#8
94	ldrb	r7,[r1,#12]
95	orr	r6,r6,r8,lsl#16
96	ldrb	r8,[r1,#13]
97	orr	r6,r6,r9,lsl#24
98	ldrb	r9,[r1,#14]
99	and	r6,r6,r3
100
@ r0 was advanced by 20 above, so these stores land at context
@ offsets 20-32.
101	ldrb	r10,[r1,#15]
102	orr	r7,r7,r8,lsl#8
103	str	r4,[r0,#0]
104	orr	r7,r7,r9,lsl#16
105	str	r5,[r0,#4]
106	orr	r7,r7,r10,lsl#24
107	str	r6,[r0,#8]
108	and	r7,r7,r3
109	str	r7,[r0,#12]
@ [r2,#0] = blocks routine (r11), [r2,#4] = emit routine (r12).
110#if	__ARM_MAX_ARCH__>=7
111	stmia	r2,{r11,r12}		@ fill functions table
112	mov	r0,#1
113#else
114	mov	r0,#0
115#endif
116.Lno_key:
117	ldmia	sp!,{r4,r5,r6,r7,r8,r9,r10,r11}
118#if	__ARM_ARCH__>=5
119	bx	lr				@ bx	lr
120#else
121	tst	lr,#1
122	moveq	pc,lr			@ be binary compatible with V4, yet
123.word	0xe12fff1e			@ interoperable with Thumb ISA:-)
124#endif
125.size	poly1305_init,.-poly1305_init
126.type	poly1305_blocks,%function
127.align	5
@ poly1305_blocks: absorb whole 16-byte blocks into the hash (scalar code).
@
@ In:   r0 = context, r1 = input, r2 = length in bytes, r3 = padbit
@ Out:  hash words at context offsets 0-16 rewritten
@
@ The length is rounded down to a multiple of 16; nothing is done when the
@ rounded length is zero.
@
@ Register use inside the loop:
@   r4-r8   hash value h0-h4 in base 2^32 (r8 holds the bits above 2^128)
@   r9-r12  key words from context offsets 20-32; r10-r12 are replaced
@           by r+(r>>2) for part of the iteration and then reloaded
@   lr      input pointer
@
@ Stack frame (32 bytes below the saved registers):
@   [sp,#0]  future r4        [sp,#4]  future r5
@   [sp,#8]  input pointer    [sp,#12] context pointer
@   [sp,#16] end pointer      [sp,#20] [sp,#24] [sp,#28] key words 1-3
@
@ NOTE(review): the hi condition on the padbit add comes from cmp r3,#0
@ for the first block only.  On every later iteration the flags are the
@ ones left by the loop-closing cmp r0,lr, for which hi always holds, so
@ the bit is added regardless of r3.  Confirm that callers never pass
@ padbit=0 together with more than one block.
128poly1305_blocks:
129.Lpoly1305_blocks:
130	stmdb	sp!,{r3,r4,r5,r6,r7,r8,r9,r10,r11,lr}
131
132	ands	r2,r2,#-16
133	beq	.Lno_data
134
135	cmp	r3,#0
136	add	r2,r2,r1		@ end pointer
137	sub	sp,sp,#32
138
139	ldmia	r0,{r4,r5,r6,r7,r8,r9,r10,r11,r12}		@ load context
140
141	str	r0,[sp,#12]		@ offload stuff
142	mov	lr,r1
143	str	r2,[sp,#16]
144	str	r10,[sp,#20]
145	str	r11,[sp,#24]
146	str	r12,[sp,#28]
147	b	.Loop
148
149.Loop:
@ Below __ARM_ARCH__ 7 the block is read byte by byte and assembled with
@ the least significant byte first; otherwise it is read as four words
@ (byte-reversed on __ARMEB__).
150#if __ARM_ARCH__<7
151	ldrb	r0,[lr],#16		@ load input
152# ifdef	__thumb2__
153	it	hi
154# endif
155	addhi	r8,r8,#1		@ 1<<128
156	ldrb	r1,[lr,#-15]
157	ldrb	r2,[lr,#-14]
158	ldrb	r3,[lr,#-13]
159	orr	r1,r0,r1,lsl#8
160	ldrb	r0,[lr,#-12]
161	orr	r2,r1,r2,lsl#16
162	ldrb	r1,[lr,#-11]
163	orr	r3,r2,r3,lsl#24
164	ldrb	r2,[lr,#-10]
165	adds	r4,r4,r3		@ accumulate input
166
167	ldrb	r3,[lr,#-9]
168	orr	r1,r0,r1,lsl#8
169	ldrb	r0,[lr,#-8]
170	orr	r2,r1,r2,lsl#16
171	ldrb	r1,[lr,#-7]
172	orr	r3,r2,r3,lsl#24
173	ldrb	r2,[lr,#-6]
174	adcs	r5,r5,r3
175
176	ldrb	r3,[lr,#-5]
177	orr	r1,r0,r1,lsl#8
178	ldrb	r0,[lr,#-4]
179	orr	r2,r1,r2,lsl#16
180	ldrb	r1,[lr,#-3]
181	orr	r3,r2,r3,lsl#24
182	ldrb	r2,[lr,#-2]
183	adcs	r6,r6,r3
184
185	ldrb	r3,[lr,#-1]
186	orr	r1,r0,r1,lsl#8
187	str	lr,[sp,#8]		@ offload input pointer
188	orr	r2,r1,r2,lsl#16
189	add	r10,r10,r10,lsr#2
190	orr	r3,r2,r3,lsl#24
191#else
192	ldr	r0,[lr],#16		@ load input
193# ifdef	__thumb2__
194	it	hi
195# endif
196	addhi	r8,r8,#1		@ padbit
197	ldr	r1,[lr,#-12]
198	ldr	r2,[lr,#-8]
199	ldr	r3,[lr,#-4]
200# ifdef	__ARMEB__
201	rev	r0,r0
202	rev	r1,r1
203	rev	r2,r2
204	rev	r3,r3
205# endif
206	adds	r4,r4,r0		@ accumulate input
207	str	lr,[sp,#8]		@ offload input pointer
208	adcs	r5,r5,r1
209	add	r10,r10,r10,lsr#2
210	adcs	r6,r6,r2
211#endif
@ From here r10-r12 hold key word + (key word >> 2) until each is reloaded.
212	add	r11,r11,r11,lsr#2
213	adcs	r7,r7,r3
214	add	r12,r12,r12,lsr#2
215
@ Two 64-bit accumulators at a time: r1:r0 and r3:r2.  First pass gives
@ the two low result words, second pass the next two.
216	umull	r2,r3,r5,r9
217	adc	r8,r8,#0
218	umull	r0,r1,r4,r9
219	umlal	r2,r3,r8,r10
220	umlal	r0,r1,r7,r10
221	ldr	r10,[sp,#20]		@ reload r10
222	umlal	r2,r3,r6,r12
223	umlal	r0,r1,r5,r12
224	umlal	r2,r3,r7,r11
225	umlal	r0,r1,r6,r11
226	umlal	r2,r3,r4,r10
227	str	r0,[sp,#0]		@ future r4
228	mul	r0,r11,r8
229	ldr	r11,[sp,#24]		@ reload r11
230	adds	r2,r2,r1		@ d1+=d0>>32
231	eor	r1,r1,r1
232	adc	lr,r3,#0		@ future r6
233	str	r2,[sp,#4]		@ future r5
234
235	mul	r2,r12,r8
236	eor	r3,r3,r3
237	umlal	r0,r1,r7,r12
238	ldr	r12,[sp,#28]		@ reload r12
239	umlal	r2,r3,r7,r9
240	umlal	r0,r1,r6,r9
241	umlal	r2,r3,r6,r10
242	umlal	r0,r1,r5,r10
243	umlal	r2,r3,r5,r11
244	umlal	r0,r1,r4,r11
245	umlal	r2,r3,r4,r12
246	ldr	r4,[sp,#0]
247	mul	r8,r9,r8
248	ldr	r5,[sp,#4]
249
250	adds	r6,lr,r0		@ d2+=d1>>32
251	ldr	lr,[sp,#8]		@ reload input pointer
252	adc	r1,r1,#0
253	adds	r7,r2,r1		@ d3+=d2>>32
254	ldr	r0,[sp,#16]		@ reload end pointer
255	adc	r3,r3,#0
256	add	r8,r8,r3		@ h4+=d3>>32
257
@ Partial reduction: the part of r8 above its two low bits is multiplied
@ by 5/4 and added back at the bottom; r8 keeps only its two low bits.
258	and	r1,r8,#-4
259	and	r8,r8,#3
260	add	r1,r1,r1,lsr#2		@ *=5
261	adds	r4,r4,r1
262	adcs	r5,r5,#0
263	adcs	r6,r6,#0
264	adcs	r7,r7,#0
265	adc	r8,r8,#0
266
267	cmp	r0,lr			@ done yet?
268	bhi	.Loop
269
270	ldr	r0,[sp,#12]
271	add	sp,sp,#32
272	stmia	r0,{r4,r5,r6,r7,r8}		@ store the result
273
274.Lno_data:
275#if	__ARM_ARCH__>=5
276	ldmia	sp!,{r3,r4,r5,r6,r7,r8,r9,r10,r11,pc}
277#else
278	ldmia	sp!,{r3,r4,r5,r6,r7,r8,r9,r10,r11,lr}
279	tst	lr,#1
280	moveq	pc,lr			@ be binary compatible with V4, yet
281.word	0xe12fff1e			@ interoperable with Thumb ISA:-)
282#endif
283.size	poly1305_blocks,.-poly1305_blocks
284.type	poly1305_emit,%function
285.align	5
@ poly1305_emit: final reduction and tag output (scalar code).
@
@ In:   r0 = context (hash words h0-h4 at offsets 0-16, base 2^32)
@       r1 = 16-byte output buffer
@       r2 = four 32-bit nonce words
@
@ Computes h+5 with the carry propagated into the top word.  When bit 2
@ of that top word is set, the low four words of h+5 replace those of h.
@ The nonce is then added with carries across the four words (the carry
@ out of the top word is dropped) and the 16 result bytes are stored with
@ the least significant byte first.
@
@ .Lpoly1305_emit_enter is also branched to from poly1305_emit_neon, after
@ that routine has pushed the same r4-r11 set.
@
@ r4-r11 are saved on entry and restored before returning.
286poly1305_emit:
287.Lpoly1305_emit:
288	stmdb	sp!,{r4,r5,r6,r7,r8,r9,r10,r11}
289.Lpoly1305_emit_enter:
290
291	ldmia	r0,{r3,r4,r5,r6,r7}
292	adds	r8,r3,#5		@ compare to modulus
293	adcs	r9,r4,#0
294	adcs	r10,r5,#0
295	adcs	r11,r6,#0
296	adc	r7,r7,#0
297	tst	r7,#4			@ did it carry/borrow?
298
@ The conditional moves below all use the flags of the tst above; the
@ interleaved ldr instructions fetch the nonce and leave flags alone.
299#ifdef	__thumb2__
300	it	ne
301#endif
302	movne	r3,r8
303	ldr	r8,[r2,#0]
304#ifdef	__thumb2__
305	it	ne
306#endif
307	movne	r4,r9
308	ldr	r9,[r2,#4]
309#ifdef	__thumb2__
310	it	ne
311#endif
312	movne	r5,r10
313	ldr	r10,[r2,#8]
314#ifdef	__thumb2__
315	it	ne
316#endif
317	movne	r6,r11
318	ldr	r11,[r2,#12]
319
@ accumulate nonce
320	adds	r3,r3,r8
321	adcs	r4,r4,r9
322	adcs	r5,r5,r10
323	adc	r6,r6,r11
324
@ From __ARM_ARCH__ 7 on the result is written as four words
@ (byte-reversed first on __ARMEB__); older targets write single bytes.
325#if __ARM_ARCH__>=7
326# ifdef __ARMEB__
327	rev	r3,r3
328	rev	r4,r4
329	rev	r5,r5
330	rev	r6,r6
331# endif
332	str	r3,[r1,#0]
333	str	r4,[r1,#4]
334	str	r5,[r1,#8]
335	str	r6,[r1,#12]
336#else
337	strb	r3,[r1,#0]
338	mov	r3,r3,lsr#8
339	strb	r4,[r1,#4]
340	mov	r4,r4,lsr#8
341	strb	r5,[r1,#8]
342	mov	r5,r5,lsr#8
343	strb	r6,[r1,#12]
344	mov	r6,r6,lsr#8
345
346	strb	r3,[r1,#1]
347	mov	r3,r3,lsr#8
348	strb	r4,[r1,#5]
349	mov	r4,r4,lsr#8
350	strb	r5,[r1,#9]
351	mov	r5,r5,lsr#8
352	strb	r6,[r1,#13]
353	mov	r6,r6,lsr#8
354
355	strb	r3,[r1,#2]
356	mov	r3,r3,lsr#8
357	strb	r4,[r1,#6]
358	mov	r4,r4,lsr#8
359	strb	r5,[r1,#10]
360	mov	r5,r5,lsr#8
361	strb	r6,[r1,#14]
362	mov	r6,r6,lsr#8
363
364	strb	r3,[r1,#3]
365	strb	r4,[r1,#7]
366	strb	r5,[r1,#11]
367	strb	r6,[r1,#15]
368#endif
369	ldmia	sp!,{r4,r5,r6,r7,r8,r9,r10,r11}
370#if	__ARM_ARCH__>=5
371	bx	lr				@ bx	lr
372#else
373	tst	lr,#1
374	moveq	pc,lr			@ be binary compatible with V4, yet
375.word	0xe12fff1e			@ interoperable with Thumb ISA:-)
376#endif
377.size	poly1305_emit,.-poly1305_emit
378#if	__ARM_MAX_ARCH__>=7
379.fpu	neon
380
381.type	poly1305_init_neon,%function
382.align	5
@ poly1305_init_neon: build the table of key powers used by the NEON code.
@
@ In:   r0 = context (key words at offsets 20-32, base 2^32)
@ Writes r2-r7, d0-d19 and q15; no stack is used and the routine returns
@ with bx lr.
@
@ Four 9-word entries are written starting at context offset 48, in the
@ order r^1, r^2, r^3, r^4.  Each entry holds base 2^26 limbs as
@   r0, r1, 5*r1, r2, 5*r2, r3, 5*r3, r4, 5*r4
@
@ The powers come from two passes through .Lsquare_neon (counter in r5).
@ The first pass multiplies r^1 by itself, giving r^2, and stores the
@ r^1 and r^2 entries.  The second pass multiplies the lane pair
@ (lane 0 = r^1, lane 1 = r^2) by r^2, giving r^3 and r^4, which are
@ stored at .Lsquare_break_neon.
383poly1305_init_neon:
384	ldr	r4,[r0,#20]		@ load key base 2^32
385	ldr	r5,[r0,#24]
386	ldr	r6,[r0,#28]
387	ldr	r7,[r0,#32]
388
389	and	r2,r4,#0x03ffffff	@ base 2^32 -> base 2^26
390	mov	r3,r4,lsr#26
391	mov	r4,r5,lsr#20
392	orr	r3,r3,r5,lsl#6
393	mov	r5,r6,lsr#14
394	orr	r4,r4,r6,lsl#12
395	mov	r6,r7,lsr#8
396	orr	r5,r5,r7,lsl#18
397	and	r3,r3,#0x03ffffff
398	and	r4,r4,#0x03ffffff
399	and	r5,r5,#0x03ffffff
400
@ d0,d1,d3,d5,d7 = limbs 0-4; d2,d4,d6,d8 = 5 times limbs 1-4.
401	vdup.32	d0,r2			@ r^1 in both lanes
402	add	r2,r3,r3,lsl#2		@ *5
403	vdup.32	d1,r3
404	add	r3,r4,r4,lsl#2
405	vdup.32	d2,r2
406	vdup.32	d3,r4
407	add	r4,r5,r5,lsl#2
408	vdup.32	d4,r3
409	vdup.32	d5,r5
410	add	r5,r6,r6,lsl#2
411	vdup.32	d6,r4
412	vdup.32	d7,r6
413	vdup.32	d8,r5
414
415	mov	r5,#2		@ counter
416
417.Lsquare_neon:
418	@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
419	@ d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4
420	@ d1 = h1*r0 + h0*r1   + h4*5*r2 + h3*5*r3 + h2*5*r4
421	@ d2 = h2*r0 + h1*r1   + h0*r2   + h4*5*r3 + h3*5*r4
422	@ d3 = h3*r0 + h2*r1   + h1*r2   + h0*r3   + h4*5*r4
423	@ d4 = h4*r0 + h3*r1   + h2*r2   + h1*r3   + h0*r4
424
@ The scalar operand is always lane 1; q5-q9 accumulate the five sums.
425	vmull.u32	q5,d0,d0[1]
426	vmull.u32	q6,d1,d0[1]
427	vmull.u32	q7,d3,d0[1]
428	vmull.u32	q8,d5,d0[1]
429	vmull.u32	q9,d7,d0[1]
430
431	vmlal.u32	q5,d7,d2[1]
432	vmlal.u32	q6,d0,d1[1]
433	vmlal.u32	q7,d1,d1[1]
434	vmlal.u32	q8,d3,d1[1]
435	vmlal.u32	q9,d5,d1[1]
436
437	vmlal.u32	q5,d5,d4[1]
438	vmlal.u32	q6,d7,d4[1]
439	vmlal.u32	q8,d1,d3[1]
440	vmlal.u32	q7,d0,d3[1]
441	vmlal.u32	q9,d3,d3[1]
442
443	vmlal.u32	q5,d3,d6[1]
444	vmlal.u32	q8,d0,d5[1]
445	vmlal.u32	q6,d5,d6[1]
446	vmlal.u32	q7,d7,d6[1]
447	vmlal.u32	q9,d1,d5[1]
448
449	vmlal.u32	q8,d7,d8[1]
450	vmlal.u32	q5,d1,d8[1]
451	vmlal.u32	q6,d3,d8[1]
452	vmlal.u32	q7,d5,d8[1]
453	vmlal.u32	q9,d0,d7[1]
454
455	@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
456	@ lazy reduction as discussed in "NEON crypto" by D.J. Bernstein
457	@ and P. Schwabe
458	@
459	@ H0>>+H1>>+H2>>+H3>>+H4
460	@ H3>>+H4>>*5+H0>>+H1
461	@
462	@ Trivia.
463	@
464	@ Result of multiplication of n-bit number by m-bit number is
465	@ n+m bits wide. However! Even though 2^n is a n+1-bit number,
466	@ m-bit number multiplied by 2^n is still n+m bits wide.
467	@
468	@ Sum of two n-bit numbers is n+1 bits wide, sum of three - n+2,
469	@ and so is sum of four. Sum of 2^m n-m-bit numbers and n-bit
470	@ one is n+1 bits wide.
471	@
472	@ >>+ denotes Hnext += Hn>>26, Hn &= 0x3ffffff. This means that
473	@ H0, H2, H3 are guaranteed to be 26 bits wide, while H1 and H4
474	@ can be 27. However! In cases when their width exceeds 26 bits
475	@ they are limited by 2^26+2^6. This in turn means that *sum*
476	@ of the products with these values can still be viewed as sum
477	@ of 52-bit numbers as long as the amount of addends is not a
478	@ power of 2. For example,
479	@
480	@ H4 = H4*R0 + H3*R1 + H2*R2 + H1*R3 + H0 * R4,
481	@
482	@ which can't be larger than 5 * (2^26 + 2^6) * (2^26 + 2^6), or
483	@ 5 * (2^52 + 2*2^32 + 2^12), which in turn is smaller than
484	@ 8 * (2^52) or 2^55. However, the value is then multiplied by
485	@ by 5, so we should be looking at 5 * 5 * (2^52 + 2^33 + 2^12),
486	@ which is less than 32 * (2^52) or 2^57. And when processing
487	@ data we are looking at triple as many addends...
488	@
489	@ In key setup procedure pre-reduced H0 is limited by 5*4+1 and
490	@ 5*H4 - by 5*5 52-bit addends, or 57 bits. But when hashing the
491	@ input H0 is limited by (5*4+1)*3 addends, or 58 bits, while
492	@ 5*H4 by 5*5*3, or 59[!] bits. How is this relevant? vmlal.u32
493	@ instruction accepts 2x32-bit input and writes 2x64-bit result.
494	@ This means that result of reduction have to be compressed upon
495	@ loop wrap-around. This can be done in the process of reduction
496	@ to minimize amount of instructions [as well as amount of
497	@ 128-bit instructions, which benefits low-end processors], but
498	@ one has to watch for H2 (which is narrower than H0) and 5*H4
499	@ not being wider than 58 bits, so that result of right shift
500	@ by 26 bits fits in 32 bits. This is also useful on x86,
501	@ because it allows to use paddd in place for paddq, which
502	@ benefits Atom, where paddq is ridiculously slow.
503
@ After narrowing, the reduced limbs live in d10,d12,d14,d16,d18.
504	vshr.u64	q15,q8,#26
505	vmovn.i64	d16,q8
506	vshr.u64	q4,q5,#26
507	vmovn.i64	d10,q5
508	vadd.i64	q9,q9,q15		@ h3 -> h4
509	vbic.i32	d16,#0xfc000000	@ &=0x03ffffff
510	vadd.i64	q6,q6,q4		@ h0 -> h1
511	vbic.i32	d10,#0xfc000000
512
513	vshrn.u64	d30,q9,#26
514	vmovn.i64	d18,q9
515	vshr.u64	q4,q6,#26
516	vmovn.i64	d12,q6
517	vadd.i64	q7,q7,q4		@ h1 -> h2
518	vbic.i32	d18,#0xfc000000
519	vbic.i32	d12,#0xfc000000
520
521	vadd.i32	d10,d10,d30
522	vshl.u32	d30,d30,#2
523	vshrn.u64	d8,q7,#26
524	vmovn.i64	d14,q7
525	vadd.i32	d10,d10,d30	@ h4 -> h0
526	vadd.i32	d16,d16,d8	@ h2 -> h3
527	vbic.i32	d14,#0xfc000000
528
529	vshr.u32	d30,d10,#26
530	vbic.i32	d10,#0xfc000000
531	vshr.u32	d8,d16,#26
532	vbic.i32	d16,#0xfc000000
533	vadd.i32	d12,d12,d30	@ h0 -> h1
534	vadd.i32	d18,d18,d8	@ h3 -> h4
535
536	subs	r5,r5,#1
537	beq	.Lsquare_break_neon
538
@ First pass only: r6 -> entry 0, r7 -> entry 1 (9 words each).
539	add	r6,r0,#(48+0*9*4)
540	add	r7,r0,#(48+1*9*4)
541
542	vtrn.32	d0,d10		@ r^2:r^1
543	vtrn.32	d3,d14
544	vtrn.32	d5,d16
545	vtrn.32	d1,d12
546	vtrn.32	d7,d18
547
548	vshl.u32	d4,d3,#2		@ *5
549	vshl.u32	d6,d5,#2
550	vshl.u32	d2,d1,#2
551	vshl.u32	d8,d7,#2
552	vadd.i32	d4,d4,d3
553	vadd.i32	d2,d2,d1
554	vadd.i32	d6,d6,d5
555	vadd.i32	d8,d8,d7
556
@ Lane 0 goes to the entry at r6, lane 1 to the entry at r7.
557	vst4.32	{d0[0],d1[0],d2[0],d3[0]},[r6]!
558	vst4.32	{d0[1],d1[1],d2[1],d3[1]},[r7]!
559	vst4.32	{d4[0],d5[0],d6[0],d7[0]},[r6]!
560	vst4.32	{d4[1],d5[1],d6[1],d7[1]},[r7]!
561	vst1.32	{d8[0]},[r6,:32]
562	vst1.32	{d8[1]},[r7,:32]
563
564	b	.Lsquare_neon
565
566.align	4
567.Lsquare_break_neon:
@ Second pass: r6 -> entry 2, r7 -> entry 3.
568	add	r6,r0,#(48+2*4*9)
569	add	r7,r0,#(48+3*4*9)
570
571	vmov	d0,d10		@ r^4:r^3
572	vshl.u32	d2,d12,#2		@ *5
573	vmov	d1,d12
574	vshl.u32	d4,d14,#2
575	vmov	d3,d14
576	vshl.u32	d6,d16,#2
577	vmov	d5,d16
578	vshl.u32	d8,d18,#2
579	vmov	d7,d18
580	vadd.i32	d2,d2,d12
581	vadd.i32	d4,d4,d14
582	vadd.i32	d6,d6,d16
583	vadd.i32	d8,d8,d18
584
585	vst4.32	{d0[0],d1[0],d2[0],d3[0]},[r6]!
586	vst4.32	{d0[1],d1[1],d2[1],d3[1]},[r7]!
587	vst4.32	{d4[0],d5[0],d6[0],d7[0]},[r6]!
588	vst4.32	{d4[1],d5[1],d6[1],d7[1]},[r7]!
589	vst1.32	{d8[0]},[r6]
590	vst1.32	{d8[1]},[r7]
591
592	bx	lr				@ bx	lr
593.size	poly1305_init_neon,.-poly1305_init_neon
594
595.type	poly1305_blocks_neon,%function
596.align	5
@ poly1305_blocks_neon: absorb whole 16-byte blocks using NEON.
@
@ In:   r0 = context, r1 = input, r2 = length in bytes, r3 = padbit
@ Out:  hash limbs at context offsets 0-16 rewritten in base 2^26
@       (unless the call is handed to the scalar routine, see below)
@
@ The length is rounded down to a multiple of 16; a rounded length of
@ zero returns at once.  While the hash is still in base 2^32 (context
@ word 36, is_base2_26, is zero) an input shorter than 64 bytes is
@ handed to the scalar code at .Lpoly1305_blocks.  Otherwise the hash is
@ processed as five 26-bit limbs; the first time this happens
@ poly1305_init_neon is called to build the key power table, the stored
@ hash is converted, and is_base2_26 is set to 1.
@
@ Register use after .Lenter_neon:
@   r1      input pointer for inp[0:1]
@   r4      input pointer for inp[2:3], or into .Lzeros
@   r5      address of .Lzeros
@   r6,r7   pointers into the key power table at context offset 48
@   q5-q9   hash / product accumulators, q10-q14 input limbs
@
@ r4-r7 and d8-d15 are pushed at .Lenter_neon and popped in the epilogue.
597poly1305_blocks_neon:
598.Lpoly1305_blocks_neon:
599	ldr	ip,[r0,#36]		@ is_base2_26
600	ands	r2,r2,#-16
601	beq	.Lno_data_neon
602
603	cmp	r2,#64
604	bhs	.Lenter_neon
605	tst	ip,ip			@ is_base2_26?
606	beq	.Lpoly1305_blocks
607
608.Lenter_neon:
609	stmdb	sp!,{r4,r5,r6,r7}
610	vstmdb	sp!,{d8,d9,d10,d11,d12,d13,d14,d15}		@ ABI specification says so
611
612	tst	ip,ip			@ is_base2_26?
613	bne	.Lbase2_26_neon
614
@ r1-r3 and lr are parked on the stack around the call and the
@ conversion below, both of which overwrite them.
615	stmdb	sp!,{r1,r2,r3,lr}
616	bl	poly1305_init_neon
617
618	ldr	r4,[r0,#0]		@ load hash value base 2^32
619	ldr	r5,[r0,#4]
620	ldr	r6,[r0,#8]
621	ldr	r7,[r0,#12]
622	ldr	ip,[r0,#16]
623
624	and	r2,r4,#0x03ffffff	@ base 2^32 -> base 2^26
625	mov	r3,r4,lsr#26
626	veor	d10,d10,d10
627	mov	r4,r5,lsr#20
628	orr	r3,r3,r5,lsl#6
629	veor	d12,d12,d12
630	mov	r5,r6,lsr#14
631	orr	r4,r4,r6,lsl#12
632	veor	d14,d14,d14
633	mov	r6,r7,lsr#8
634	orr	r5,r5,r7,lsl#18
635	veor	d16,d16,d16
636	and	r3,r3,#0x03ffffff
637	orr	r6,r6,ip,lsl#24
638	veor	d18,d18,d18
639	and	r4,r4,#0x03ffffff
640	mov	r1,#1
641	and	r5,r5,#0x03ffffff
642	str	r1,[r0,#36]		@ is_base2_26
643
644	vmov.32	d10[0],r2
645	vmov.32	d12[0],r3
646	vmov.32	d14[0],r4
647	vmov.32	d16[0],r5
648	vmov.32	d18[0],r6
649	adr	r5,.Lzeros
650
651	ldmia	sp!,{r1,r2,r3,lr}
652	b	.Lbase2_32_neon
653
654.align	4
655.Lbase2_26_neon:
656	@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
657	@ load hash value
658
659	veor	d10,d10,d10
660	veor	d12,d12,d12
661	veor	d14,d14,d14
662	veor	d16,d16,d16
663	veor	d18,d18,d18
664	vld4.32	{d10[0],d12[0],d14[0],d16[0]},[r0]!
665	adr	r5,.Lzeros
666	vld1.32	{d18[0]},[r0]
667	sub	r0,r0,#16		@ rewind
668
669.Lbase2_32_neon:
@ r4 = second input pointer (inp+32).  The padbit is moved to bit 24,
@ where it is merged into the top 26-bit limb of the block.
670	add	r4,r1,#32
671	mov	r3,r3,lsl#24
672	tst	r2,#31
673	beq	.Leven
674
@ The length is an odd multiple of 16: one block is combined with the
@ hash here and sent through the tail code on its own.
675	vld4.32	{d20[0],d22[0],d24[0],d26[0]},[r1]!
676	vmov.32	d28[0],r3
677	sub	r2,r2,#16
678	add	r4,r1,#32
679
680# ifdef	__ARMEB__
681	vrev32.8	q10,q10
682	vrev32.8	q13,q13
683	vrev32.8	q11,q11
684	vrev32.8	q12,q12
685# endif
686	vsri.u32	d28,d26,#8	@ base 2^32 -> base 2^26
687	vshl.u32	d26,d26,#18
688
689	vsri.u32	d26,d24,#14
690	vshl.u32	d24,d24,#12
691	vadd.i32	d29,d28,d18	@ add hash value and move to #hi
692
693	vbic.i32	d26,#0xfc000000
694	vsri.u32	d24,d22,#20
695	vshl.u32	d22,d22,#6
696
697	vbic.i32	d24,#0xfc000000
698	vsri.u32	d22,d20,#26
699	vadd.i32	d27,d26,d16
700
701	vbic.i32	d20,#0xfc000000
702	vbic.i32	d22,#0xfc000000
703	vadd.i32	d25,d24,d14
704
705	vadd.i32	d21,d20,d10
706	vadd.i32	d23,d22,d12
707
708	mov	r7,r5
709	add	r6,r0,#48
710
@ cmp r2,r2 sets Z, so the tail leaves through beq .Lshort_tail and
@ the two addne instructions in .Long_tail are skipped.
711	cmp	r2,r2
712	b	.Long_tail
713
714.align	4
715.Leven:
@ When fewer than 64 bytes remain, inp[2:3] is read from .Lzeros.
716	subs	r2,r2,#64
717	it	lo
718	movlo	r4,r5
719
720	vmov.i32	q14,#1<<24		@ padbit, yes, always
721	vld4.32	{d20,d22,d24,d26},[r1]	@ inp[0:1]
722	add	r1,r1,#64
723	vld4.32	{d21,d23,d25,d27},[r4]	@ inp[2:3] (or 0)
724	add	r4,r4,#64
725	itt	hi
726	addhi	r7,r0,#(48+1*9*4)
727	addhi	r6,r0,#(48+3*9*4)
728
729# ifdef	__ARMEB__
730	vrev32.8	q10,q10
731	vrev32.8	q13,q13
732	vrev32.8	q11,q11
733	vrev32.8	q12,q12
734# endif
735	vsri.u32	q14,q13,#8		@ base 2^32 -> base 2^26
736	vshl.u32	q13,q13,#18
737
738	vsri.u32	q13,q12,#14
739	vshl.u32	q12,q12,#12
740
741	vbic.i32	q13,#0xfc000000
742	vsri.u32	q12,q11,#20
743	vshl.u32	q11,q11,#6
744
745	vbic.i32	q12,#0xfc000000
746	vsri.u32	q11,q10,#26
747
748	vbic.i32	q10,#0xfc000000
749	vbic.i32	q11,#0xfc000000
750
@ The flags are still those of subs r2,r2,#64 above.
751	bls	.Lskip_loop
752
753	vld4.32	{d0[1],d1[1],d2[1],d3[1]},[r7]!	@ load r^2
754	vld4.32	{d0[0],d1[0],d2[0],d3[0]},[r6]!	@ load r^4
755	vld4.32	{d4[1],d5[1],d6[1],d7[1]},[r7]!
756	vld4.32	{d4[0],d5[0],d6[0],d7[0]},[r6]!
757	b	.Loop_neon
758
759.align	5
760.Loop_neon:
761	@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
762	@ ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2
763	@ ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^3+inp[7]*r
764	@   ___________________/
765	@ ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2+inp[8])*r^2
766	@ ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^4+inp[7]*r^2+inp[9])*r
767	@   ___________________/ ____________________/
768	@
769	@ Note that we start with inp[2:3]*r^2. This is because it
770	@ doesn't depend on reduction in previous iteration.
771	@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
772	@ d4 = h4*r0 + h3*r1   + h2*r2   + h1*r3   + h0*r4
773	@ d3 = h3*r0 + h2*r1   + h1*r2   + h0*r3   + h4*5*r4
774	@ d2 = h2*r0 + h1*r1   + h0*r2   + h4*5*r3 + h3*5*r4
775	@ d1 = h1*r0 + h0*r1   + h4*5*r2 + h3*5*r3 + h2*5*r4
776	@ d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4
777
778	@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
779	@ inp[2:3]*r^2
780
781	vadd.i32	d24,d24,d14	@ accumulate inp[0:1]
782	vmull.u32	q7,d25,d0[1]
783	vadd.i32	d20,d20,d10
784	vmull.u32	q5,d21,d0[1]
785	vadd.i32	d26,d26,d16
786	vmull.u32	q8,d27,d0[1]
787	vmlal.u32	q7,d23,d1[1]
788	vadd.i32	d22,d22,d12
789	vmull.u32	q6,d23,d0[1]
790
791	vadd.i32	d28,d28,d18
792	vmull.u32	q9,d29,d0[1]
793	subs	r2,r2,#64
794	vmlal.u32	q5,d29,d2[1]
795	it	lo
796	movlo	r4,r5
797	vmlal.u32	q8,d25,d1[1]
798	vld1.32	d8[1],[r7,:32]
799	vmlal.u32	q6,d21,d1[1]
800	vmlal.u32	q9,d27,d1[1]
801
802	vmlal.u32	q5,d27,d4[1]
803	vmlal.u32	q8,d23,d3[1]
804	vmlal.u32	q9,d25,d3[1]
805	vmlal.u32	q6,d29,d4[1]
806	vmlal.u32	q7,d21,d3[1]
807
808	vmlal.u32	q8,d21,d5[1]
809	vmlal.u32	q5,d25,d6[1]
810	vmlal.u32	q9,d23,d5[1]
811	vmlal.u32	q6,d27,d6[1]
812	vmlal.u32	q7,d29,d6[1]
813
814	vmlal.u32	q8,d29,d8[1]
815	vmlal.u32	q5,d23,d8[1]
816	vmlal.u32	q9,d21,d7[1]
817	vmlal.u32	q6,d25,d8[1]
818	vmlal.u32	q7,d27,d8[1]
819
820	vld4.32	{d21,d23,d25,d27},[r4]	@ inp[2:3] (or 0)
821	add	r4,r4,#64
822
823	@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
824	@ (hash+inp[0:1])*r^4 and accumulate
825
826	vmlal.u32	q8,d26,d0[0]
827	vmlal.u32	q5,d20,d0[0]
828	vmlal.u32	q9,d28,d0[0]
829	vmlal.u32	q6,d22,d0[0]
830	vmlal.u32	q7,d24,d0[0]
831	vld1.32	d8[0],[r6,:32]
832
833	vmlal.u32	q8,d24,d1[0]
834	vmlal.u32	q5,d28,d2[0]
835	vmlal.u32	q9,d26,d1[0]
836	vmlal.u32	q6,d20,d1[0]
837	vmlal.u32	q7,d22,d1[0]
838
839	vmlal.u32	q8,d22,d3[0]
840	vmlal.u32	q5,d26,d4[0]
841	vmlal.u32	q9,d24,d3[0]
842	vmlal.u32	q6,d28,d4[0]
843	vmlal.u32	q7,d20,d3[0]
844
845	vmlal.u32	q8,d20,d5[0]
846	vmlal.u32	q5,d24,d6[0]
847	vmlal.u32	q9,d22,d5[0]
848	vmlal.u32	q6,d26,d6[0]
849	vmlal.u32	q8,d28,d8[0]
850
851	vmlal.u32	q7,d28,d6[0]
852	vmlal.u32	q5,d22,d8[0]
853	vmlal.u32	q9,d20,d7[0]
854	vmov.i32	q14,#1<<24		@ padbit, yes, always
855	vmlal.u32	q6,d24,d8[0]
856	vmlal.u32	q7,d26,d8[0]
857
858	vld4.32	{d20,d22,d24,d26},[r1]	@ inp[0:1]
859	add	r1,r1,#64
860# ifdef	__ARMEB__
861	vrev32.8	q10,q10
862	vrev32.8	q11,q11
863	vrev32.8	q12,q12
864	vrev32.8	q13,q13
865# endif
866
867	@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
868	@ lazy reduction interleaved with base 2^32 -> base 2^26 of
869	@ inp[0:3] previously loaded to q10-q13 and smashed to q10-q14.
870
871	vshr.u64	q15,q8,#26
872	vmovn.i64	d16,q8
873	vshr.u64	q4,q5,#26
874	vmovn.i64	d10,q5
875	vadd.i64	q9,q9,q15		@ h3 -> h4
876	vbic.i32	d16,#0xfc000000
877	vsri.u32	q14,q13,#8		@ base 2^32 -> base 2^26
878	vadd.i64	q6,q6,q4		@ h0 -> h1
879	vshl.u32	q13,q13,#18
880	vbic.i32	d10,#0xfc000000
881
882	vshrn.u64	d30,q9,#26
883	vmovn.i64	d18,q9
884	vshr.u64	q4,q6,#26
885	vmovn.i64	d12,q6
886	vadd.i64	q7,q7,q4		@ h1 -> h2
887	vsri.u32	q13,q12,#14
888	vbic.i32	d18,#0xfc000000
889	vshl.u32	q12,q12,#12
890	vbic.i32	d12,#0xfc000000
891
892	vadd.i32	d10,d10,d30
893	vshl.u32	d30,d30,#2
894	vbic.i32	q13,#0xfc000000
895	vshrn.u64	d8,q7,#26
896	vmovn.i64	d14,q7
897	vaddl.u32	q5,d10,d30	@ h4 -> h0 [widen for a sec]
898	vsri.u32	q12,q11,#20
899	vadd.i32	d16,d16,d8	@ h2 -> h3
900	vshl.u32	q11,q11,#6
901	vbic.i32	d14,#0xfc000000
902	vbic.i32	q12,#0xfc000000
903
904	vshrn.u64	d30,q5,#26		@ re-narrow
905	vmovn.i64	d10,q5
906	vsri.u32	q11,q10,#26
907	vbic.i32	q10,#0xfc000000
908	vshr.u32	d8,d16,#26
909	vbic.i32	d16,#0xfc000000
910	vbic.i32	d10,#0xfc000000
911	vadd.i32	d12,d12,d30	@ h0 -> h1
912	vadd.i32	d18,d18,d8	@ h3 -> h4
913	vbic.i32	q11,#0xfc000000
914
@ The flags are those of subs r2,r2,#64 near the top of the loop; the
@ NEON instructions in between do not change them.
915	bhi	.Loop_neon
916
917.Lskip_loop:
918	@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
919	@ multiply (inp[0:1]+hash) or inp[2:3] by r^2:r^1
920
921	add	r7,r0,#(48+0*9*4)
922	add	r6,r0,#(48+1*9*4)
923	adds	r2,r2,#32
924	it	ne
925	movne	r2,#0
926	bne	.Long_tail
927
928	vadd.i32	d25,d24,d14	@ add hash value and move to #hi
929	vadd.i32	d21,d20,d10
930	vadd.i32	d27,d26,d16
931	vadd.i32	d23,d22,d12
932	vadd.i32	d29,d28,d18
933
934.Long_tail:
@ On every path into this label the Z flag selects the exit: Z set
@ means stop after this first multiplication (.Lshort_tail).
935	vld4.32	{d0[1],d1[1],d2[1],d3[1]},[r7]!	@ load r^1
936	vld4.32	{d0[0],d1[0],d2[0],d3[0]},[r6]!	@ load r^2
937
938	vadd.i32	d24,d24,d14	@ can be redundant
939	vmull.u32	q7,d25,d0
940	vadd.i32	d20,d20,d10
941	vmull.u32	q5,d21,d0
942	vadd.i32	d26,d26,d16
943	vmull.u32	q8,d27,d0
944	vadd.i32	d22,d22,d12
945	vmull.u32	q6,d23,d0
946	vadd.i32	d28,d28,d18
947	vmull.u32	q9,d29,d0
948
949	vmlal.u32	q5,d29,d2
950	vld4.32	{d4[1],d5[1],d6[1],d7[1]},[r7]!
951	vmlal.u32	q8,d25,d1
952	vld4.32	{d4[0],d5[0],d6[0],d7[0]},[r6]!
953	vmlal.u32	q6,d21,d1
954	vmlal.u32	q9,d27,d1
955	vmlal.u32	q7,d23,d1
956
957	vmlal.u32	q8,d23,d3
958	vld1.32	d8[1],[r7,:32]
959	vmlal.u32	q5,d27,d4
960	vld1.32	d8[0],[r6,:32]
961	vmlal.u32	q9,d25,d3
962	vmlal.u32	q6,d29,d4
963	vmlal.u32	q7,d21,d3
964
965	vmlal.u32	q8,d21,d5
966	it	ne
967	addne	r7,r0,#(48+2*9*4)
968	vmlal.u32	q5,d25,d6
969	it	ne
970	addne	r6,r0,#(48+3*9*4)
971	vmlal.u32	q9,d23,d5
972	vmlal.u32	q6,d27,d6
973	vmlal.u32	q7,d29,d6
974
@ q0 becomes the 26-bit mask used in .Lshort_tail; d0 and d1 are not
@ read again before they are reloaded or the routine ends.
975	vmlal.u32	q8,d29,d8
976	vorn	q0,q0,q0	@ all-ones, can be redundant
977	vmlal.u32	q5,d23,d8
978	vshr.u64	q0,q0,#38
979	vmlal.u32	q9,d21,d7
980	vmlal.u32	q6,d25,d8
981	vmlal.u32	q7,d27,d8
982
983	beq	.Lshort_tail
984
985	@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
986	@ (hash+inp[0:1])*r^4:r^3 and accumulate
987
988	vld4.32	{d0[1],d1[1],d2[1],d3[1]},[r7]!	@ load r^3
989	vld4.32	{d0[0],d1[0],d2[0],d3[0]},[r6]!	@ load r^4
990
991	vmlal.u32	q7,d24,d0
992	vmlal.u32	q5,d20,d0
993	vmlal.u32	q8,d26,d0
994	vmlal.u32	q6,d22,d0
995	vmlal.u32	q9,d28,d0
996
997	vmlal.u32	q5,d28,d2
998	vld4.32	{d4[1],d5[1],d6[1],d7[1]},[r7]!
999	vmlal.u32	q8,d24,d1
1000	vld4.32	{d4[0],d5[0],d6[0],d7[0]},[r6]!
1001	vmlal.u32	q6,d20,d1
1002	vmlal.u32	q9,d26,d1
1003	vmlal.u32	q7,d22,d1
1004
1005	vmlal.u32	q8,d22,d3
1006	vld1.32	d8[1],[r7,:32]
1007	vmlal.u32	q5,d26,d4
1008	vld1.32	d8[0],[r6,:32]
1009	vmlal.u32	q9,d24,d3
1010	vmlal.u32	q6,d28,d4
1011	vmlal.u32	q7,d20,d3
1012
1013	vmlal.u32	q8,d20,d5
1014	vmlal.u32	q5,d24,d6
1015	vmlal.u32	q9,d22,d5
1016	vmlal.u32	q6,d26,d6
1017	vmlal.u32	q7,d28,d6
1018
1019	vmlal.u32	q8,d28,d8
1020	vorn	q0,q0,q0	@ all-ones
1021	vmlal.u32	q5,d22,d8
1022	vshr.u64	q0,q0,#38
1023	vmlal.u32	q9,d20,d7
1024	vmlal.u32	q6,d24,d8
1025	vmlal.u32	q7,d26,d8
1026
1027.Lshort_tail:
1028	@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
1029	@ horizontal addition
1030
1031	vadd.i64	d16,d16,d17
1032	vadd.i64	d10,d10,d11
1033	vadd.i64	d18,d18,d19
1034	vadd.i64	d12,d12,d13
1035	vadd.i64	d14,d14,d15
1036
1037	@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
1038	@ lazy reduction, but without narrowing
1039
1040	vshr.u64	q15,q8,#26
1041	vand.i64	q8,q8,q0
1042	vshr.u64	q4,q5,#26
1043	vand.i64	q5,q5,q0
1044	vadd.i64	q9,q9,q15		@ h3 -> h4
1045	vadd.i64	q6,q6,q4		@ h0 -> h1
1046
1047	vshr.u64	q15,q9,#26
1048	vand.i64	q9,q9,q0
1049	vshr.u64	q4,q6,#26
1050	vand.i64	q6,q6,q0
1051	vadd.i64	q7,q7,q4		@ h1 -> h2
1052
1053	vadd.i64	q5,q5,q15
1054	vshl.u64	q15,q15,#2
1055	vshr.u64	q4,q7,#26
1056	vand.i64	q7,q7,q0
1057	vadd.i64	q5,q5,q15		@ h4 -> h0
1058	vadd.i64	q8,q8,q4		@ h2 -> h3
1059
1060	vshr.u64	q15,q5,#26
1061	vand.i64	q5,q5,q0
1062	vshr.u64	q4,q8,#26
1063	vand.i64	q8,q8,q0
1064	vadd.i64	q6,q6,q15		@ h0 -> h1
1065	vadd.i64	q9,q9,q4		@ h3 -> h4
1066
@ r2 is non-zero here only after the single odd block was handled and
@ more input remains; the other paths reach this point with r2 = 0.
1067	cmp	r2,#0
1068	bne	.Leven
1069
1070	@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
1071	@ store hash value
1072
1073	vst4.32	{d10[0],d12[0],d14[0],d16[0]},[r0]!
1074	vst1.32	{d18[0]},[r0]
1075
1076	vldmia	sp!,{d8,d9,d10,d11,d12,d13,d14,d15}			@ epilogue
1077	ldmia	sp!,{r4,r5,r6,r7}
1078.Lno_data_neon:
1079	bx	lr					@ bx	lr
1080.size	poly1305_blocks_neon,.-poly1305_blocks_neon
1081
1082.type	poly1305_emit_neon,%function
1083.align	5
@ poly1305_emit_neon: final reduction and tag output for a hash that may
@ be stored as 26-bit limbs.
@
@ In:   r0 = context (hash at offsets 0-16, is_base2_26 at offset 36)
@       r1 = 16-byte output buffer
@       r2 = four 32-bit nonce words
@
@ When is_base2_26 is zero the hash is still in base 2^32 and control
@ passes to the scalar code at .Lpoly1305_emit_enter, which expects
@ r4-r11 to be pushed already (done here before the test).
@
@ Otherwise the five 26-bit limbs are packed into r3-r6 plus a top word
@ in r7, the part of r7 above its two low bits is folded back (times 5),
@ and the value is compared to the modulus exactly as in poly1305_emit.
@ The nonce is then added and four words are stored (byte-reversed
@ first on __ARMEB__).
@
@ r4-r11 are saved on entry and restored before returning.
1084poly1305_emit_neon:
1085.Lpoly1305_emit_neon:
1086	ldr	ip,[r0,#36]		@ is_base2_26
1087
1088	stmdb	sp!,{r4,r5,r6,r7,r8,r9,r10,r11}
1089
1090	tst	ip,ip
1091	beq	.Lpoly1305_emit_enter
1092
1093	ldmia	r0,{r3,r4,r5,r6,r7}
1094	eor	r8,r8,r8
1095
1096	adds	r3,r3,r4,lsl#26	@ base 2^26 -> base 2^32
1097	mov	r4,r4,lsr#6
1098	adcs	r4,r4,r5,lsl#20
1099	mov	r5,r5,lsr#12
1100	adcs	r5,r5,r6,lsl#14
1101	mov	r6,r6,lsr#18
1102	adcs	r6,r6,r7,lsl#8
1103	adc	r7,r8,r7,lsr#24	@ can be partially reduced ...
1104
@ r7 is the top word of the hash.  Its bits above the low two are folded
@ back into the bottom as a multiple of 5, and r7 must keep its OWN two
@ low bits.  Masking r6 here instead would replace the top word with
@ unrelated bits of the word below and corrupt the comparison against
@ the modulus that follows.
1105	and	r8,r7,#-4		@ ... so reduce
1106	and	r7,r7,#3
1107	add	r8,r8,r8,lsr#2	@ *= 5
1108	adds	r3,r3,r8
1109	adcs	r4,r4,#0
1110	adcs	r5,r5,#0
1111	adcs	r6,r6,#0
1112	adc	r7,r7,#0
1113
1114	adds	r8,r3,#5		@ compare to modulus
1115	adcs	r9,r4,#0
1116	adcs	r10,r5,#0
1117	adcs	r11,r6,#0
1118	adc	r7,r7,#0
1119	tst	r7,#4			@ did it carry/borrow?
1120
@ The conditional moves all use the flags of the tst above; the
@ interleaved ldr instructions fetch the nonce and leave flags alone.
1121	it	ne
1122	movne	r3,r8
1123	ldr	r8,[r2,#0]
1124	it	ne
1125	movne	r4,r9
1126	ldr	r9,[r2,#4]
1127	it	ne
1128	movne	r5,r10
1129	ldr	r10,[r2,#8]
1130	it	ne
1131	movne	r6,r11
1132	ldr	r11,[r2,#12]
1133
1134	adds	r3,r3,r8		@ accumulate nonce
1135	adcs	r4,r4,r9
1136	adcs	r5,r5,r10
1137	adc	r6,r6,r11
1138
1139# ifdef __ARMEB__
1140	rev	r3,r3
1141	rev	r4,r4
1142	rev	r5,r5
1143	rev	r6,r6
1144# endif
1145	str	r3,[r1,#0]		@ store the result
1146	str	r4,[r1,#4]
1147	str	r5,[r1,#8]
1148	str	r6,[r1,#12]
1149
1150	ldmia	sp!,{r4,r5,r6,r7,r8,r9,r10,r11}
1151	bx	lr				@ bx	lr
1152.size	poly1305_emit_neon,.-poly1305_emit_neon
1153
1154.align	5
@ .Lzeros: 16 zero words (64 bytes).  poly1305_blocks_neon points its
@ second input pointer here when fewer than 64 bytes of input remain,
@ and uses it as a table pointer on the single odd block path.
1155.Lzeros:
1156.long	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
@ .LOPENSSL_armcap: how poly1305_init finds OPENSSL_armcap_P.  On _WIN32
@ this word is the address of the variable; elsewhere it is the distance
@ from .Lpoly1305_init, which poly1305_init adds to the address it gets
@ from adr.
1157.LOPENSSL_armcap:
1158# ifdef	_WIN32
1159.word	OPENSSL_armcap_P
1160# else
1161.word	OPENSSL_armcap_P-.Lpoly1305_init
1162# endif
1163#endif
@ NUL-terminated ASCII identification string:
@ Poly1305 for ARMv4/NEON, CRYPTOGAMS by <appro@openssl.org>
1164.byte	80,111,108,121,49,51,48,53,32,102,111,114,32,65,82,77,118,52,47,78,69,79,78,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
1165.align	2
1166.align	2
@ Common symbol for the capability word (4 bytes, 4-byte aligned).
1167#if	__ARM_MAX_ARCH__>=7
1168.comm	OPENSSL_armcap_P,4,4
1169#endif
1169#endif
1170