/* Do not modify. This file is auto-generated from poly1305-armv4.pl. */
#include "arm_arch.h"

.text
#if defined(__thumb2__)
.syntax	unified
.thumb
#else
.code	32
#endif

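@ Context layout, as inferred from the loads and stores in this file
@ (for orientation only, not an authoritative structure definition):
@	ctx+0..16	h0-h4, the 130-bit accumulator (base 2^32 or base 2^26)
@	ctx+20..32	r0-r3, the clamped key
@	ctx+36		is_base2_26 flag
@	ctx+48..	NEON only: cached key powers r^1..r^4 in base 2^26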
.globl	poly1305_emit
.globl	poly1305_blocks
.globl	poly1305_init
.type	poly1305_init,%function
.align	5
poly1305_init:
.Lpoly1305_init:
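	@ poly1305_init(ctx=r0, key=r1, func=r2); register roles inferred
	@ from the code below.  A NULL key only zeroes the hash and returns 0;
	@ otherwise the clamped key is stored and, when __ARM_MAX_ARCH__>=7,
	@ the two-entry table at r2 is filled with the blocks/emit entry
	@ points (NEON variants if OPENSSL_armcap_P reports NEON).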
	stmdb	sp!,{r4,r5,r6,r7,r8,r9,r10,r11}

	eor	r3,r3,r3
	cmp	r1,#0
	str	r3,[r0,#0]		@ zero hash value
	str	r3,[r0,#4]
	str	r3,[r0,#8]
	str	r3,[r0,#12]
	str	r3,[r0,#16]
	str	r3,[r0,#36]		@ is_base2_26
	add	r0,r0,#20

#ifdef	__thumb2__
	it	eq
#endif
	moveq	r0,#0
	beq	.Lno_key

#if	__ARM_MAX_ARCH__>=7
	adr	r11,.Lpoly1305_init
	ldr	r12,.LOPENSSL_armcap
#endif
	ldrb	r4,[r1,#0]
	mov	r10,#0x0fffffff
	ldrb	r5,[r1,#1]
	and	r3,r10,#-4		@ 0x0ffffffc
	ldrb	r6,[r1,#2]
	ldrb	r7,[r1,#3]
	orr	r4,r4,r5,lsl#8
	ldrb	r5,[r1,#4]
	orr	r4,r4,r6,lsl#16
	ldrb	r6,[r1,#5]
	orr	r4,r4,r7,lsl#24
	ldrb	r7,[r1,#6]
	and	r4,r4,r10

#if	__ARM_MAX_ARCH__>=7
	ldr	r12,[r11,r12]		@ OPENSSL_armcap_P
# ifdef	__APPLE__
	ldr	r12,[r12]
# endif
#endif
	ldrb	r8,[r1,#7]
	orr	r5,r5,r6,lsl#8
	ldrb	r6,[r1,#8]
	orr	r5,r5,r7,lsl#16
	ldrb	r7,[r1,#9]
	orr	r5,r5,r8,lsl#24
	ldrb	r8,[r1,#10]
	and	r5,r5,r3

#if	__ARM_MAX_ARCH__>=7
	tst	r12,#ARMV7_NEON		@ check for NEON
# ifdef	__APPLE__
	adr	r9,poly1305_blocks_neon
	adr	r11,poly1305_blocks
#  ifdef __thumb2__
	it	ne
#  endif
	movne	r11,r9
	adr	r12,poly1305_emit
	adr	r10,poly1305_emit_neon
#  ifdef __thumb2__
	it	ne
#  endif
	movne	r12,r10
# else
#  ifdef __thumb2__
	itete	eq
#  endif
	addeq	r12,r11,#(.Lpoly1305_emit-.Lpoly1305_init)
	addne	r12,r11,#(.Lpoly1305_emit_neon-.Lpoly1305_init)
	addeq	r11,r11,#(.Lpoly1305_blocks-.Lpoly1305_init)
	addne	r11,r11,#(.Lpoly1305_blocks_neon-.Lpoly1305_init)
# endif
# ifdef	__thumb2__
	orr	r12,r12,#1	@ thumb-ify address
	orr	r11,r11,#1
# endif
#endif
	ldrb	r9,[r1,#11]
	orr	r6,r6,r7,lsl#8
	ldrb	r7,[r1,#12]
	orr	r6,r6,r8,lsl#16
	ldrb	r8,[r1,#13]
	orr	r6,r6,r9,lsl#24
	ldrb	r9,[r1,#14]
	and	r6,r6,r3

	ldrb	r10,[r1,#15]
	orr	r7,r7,r8,lsl#8
	str	r4,[r0,#0]
	orr	r7,r7,r9,lsl#16
	str	r5,[r0,#4]
	orr	r7,r7,r10,lsl#24
	str	r6,[r0,#8]
	and	r7,r7,r3
	str	r7,[r0,#12]
#if	__ARM_MAX_ARCH__>=7
	stmia	r2,{r11,r12}		@ fill functions table
	mov	r0,#1
#else
	mov	r0,#0
#endif
.Lno_key:
	ldmia	sp!,{r4,r5,r6,r7,r8,r9,r10,r11}
#if	__ARM_ARCH__>=5
	bx	lr				@ bx	lr
#else
	tst	lr,#1
	moveq	pc,lr			@ be binary compatible with V4, yet
.word	0xe12fff1e			@ interoperable with Thumb ISA:-)
#endif
.size	poly1305_init,.-poly1305_init
.type	poly1305_blocks,%function
.align	5
poly1305_blocks:
.Lpoly1305_blocks:
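	@ poly1305_blocks(ctx=r0, inp=r1, len=r2, padbit=r3); register roles
	@ inferred from the code below.  len is rounded down to a multiple
	@ of 16 and the hash is processed one 16-byte block per iteration.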
	stmdb	sp!,{r3,r4,r5,r6,r7,r8,r9,r10,r11,lr}

	ands	r2,r2,#-16
	beq	.Lno_data

	cmp	r3,#0
	add	r2,r2,r1		@ end pointer
	sub	sp,sp,#32

	ldmia	r0,{r4,r5,r6,r7,r8,r9,r10,r11,r12}		@ load context

	str	r0,[sp,#12]		@ offload stuff
	mov	lr,r1
	str	r2,[sp,#16]
	str	r10,[sp,#20]
	str	r11,[sp,#24]
	str	r12,[sp,#28]
	b	.Loop

.Loop:
#if __ARM_ARCH__<7
	ldrb	r0,[lr],#16		@ load input
# ifdef	__thumb2__
	it	hi
# endif
	addhi	r8,r8,#1		@ 1<<128
	ldrb	r1,[lr,#-15]
	ldrb	r2,[lr,#-14]
	ldrb	r3,[lr,#-13]
	orr	r1,r0,r1,lsl#8
	ldrb	r0,[lr,#-12]
	orr	r2,r1,r2,lsl#16
	ldrb	r1,[lr,#-11]
	orr	r3,r2,r3,lsl#24
	ldrb	r2,[lr,#-10]
	adds	r4,r4,r3		@ accumulate input

	ldrb	r3,[lr,#-9]
	orr	r1,r0,r1,lsl#8
	ldrb	r0,[lr,#-8]
	orr	r2,r1,r2,lsl#16
	ldrb	r1,[lr,#-7]
	orr	r3,r2,r3,lsl#24
	ldrb	r2,[lr,#-6]
	adcs	r5,r5,r3

	ldrb	r3,[lr,#-5]
	orr	r1,r0,r1,lsl#8
	ldrb	r0,[lr,#-4]
	orr	r2,r1,r2,lsl#16
	ldrb	r1,[lr,#-3]
	orr	r3,r2,r3,lsl#24
	ldrb	r2,[lr,#-2]
	adcs	r6,r6,r3

	ldrb	r3,[lr,#-1]
	orr	r1,r0,r1,lsl#8
	str	lr,[sp,#8]		@ offload input pointer
	orr	r2,r1,r2,lsl#16
	add	r10,r10,r10,lsr#2
	orr	r3,r2,r3,lsl#24
#else
	ldr	r0,[lr],#16		@ load input
# ifdef	__thumb2__
	it	hi
# endif
	addhi	r8,r8,#1		@ padbit
	ldr	r1,[lr,#-12]
	ldr	r2,[lr,#-8]
	ldr	r3,[lr,#-4]
# ifdef	__ARMEB__
	rev	r0,r0
	rev	r1,r1
	rev	r2,r2
	rev	r3,r3
# endif
	adds	r4,r4,r0		@ accumulate input
	str	lr,[sp,#8]		@ offload input pointer
	adcs	r5,r5,r1
	add	r10,r10,r10,lsr#2
	adcs	r6,r6,r2
#endif
	add	r11,r11,r11,lsr#2
	adcs	r7,r7,r3
	add	r12,r12,r12,lsr#2

	umull	r2,r3,r5,r9
	adc	r8,r8,#0
	umull	r0,r1,r4,r9
	umlal	r2,r3,r8,r10
	umlal	r0,r1,r7,r10
	ldr	r10,[sp,#20]		@ reload r10
	umlal	r2,r3,r6,r12
	umlal	r0,r1,r5,r12
	umlal	r2,r3,r7,r11
	umlal	r0,r1,r6,r11
	umlal	r2,r3,r4,r10
	str	r0,[sp,#0]		@ future r4
	mul	r0,r11,r8
	ldr	r11,[sp,#24]		@ reload r11
	adds	r2,r2,r1		@ d1+=d0>>32
	eor	r1,r1,r1
	adc	lr,r3,#0		@ future r6
	str	r2,[sp,#4]		@ future r5

	mul	r2,r12,r8
	eor	r3,r3,r3
	umlal	r0,r1,r7,r12
	ldr	r12,[sp,#28]		@ reload r12
	umlal	r2,r3,r7,r9
	umlal	r0,r1,r6,r9
	umlal	r2,r3,r6,r10
	umlal	r0,r1,r5,r10
	umlal	r2,r3,r5,r11
	umlal	r0,r1,r4,r11
	umlal	r2,r3,r4,r12
	ldr	r4,[sp,#0]
	mul	r8,r9,r8
	ldr	r5,[sp,#4]

	adds	r6,lr,r0		@ d2+=d1>>32
	ldr	lr,[sp,#8]		@ reload input pointer
	adc	r1,r1,#0
	adds	r7,r2,r1		@ d3+=d2>>32
	ldr	r0,[sp,#16]		@ reload end pointer
	adc	r3,r3,#0
	add	r8,r8,r3		@ h4+=d3>>32

	and	r1,r8,#-4
	and	r8,r8,#3
	add	r1,r1,r1,lsr#2		@ *=5
	adds	r4,r4,r1
	adcs	r5,r5,#0
	adcs	r6,r6,#0
	adcs	r7,r7,#0
	adc	r8,r8,#0

	cmp	r0,lr			@ done yet?
	bhi	.Loop

	ldr	r0,[sp,#12]
	add	sp,sp,#32
	stmia	r0,{r4,r5,r6,r7,r8}		@ store the result

.Lno_data:
#if	__ARM_ARCH__>=5
	ldmia	sp!,{r3,r4,r5,r6,r7,r8,r9,r10,r11,pc}
#else
	ldmia	sp!,{r3,r4,r5,r6,r7,r8,r9,r10,r11,lr}
	tst	lr,#1
	moveq	pc,lr			@ be binary compatible with V4, yet
.word	0xe12fff1e			@ interoperable with Thumb ISA:-)
#endif
.size	poly1305_blocks,.-poly1305_blocks
.type	poly1305_emit,%function
.align	5
poly1305_emit:
.Lpoly1305_emit:
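	@ poly1305_emit(ctx=r0, mac=r1, nonce=r2); register roles inferred
	@ from the code below.  Performs the final reduction mod 2^130-5,
	@ adds the 16-byte nonce from r2 and stores the 16-byte tag at r1.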
	stmdb	sp!,{r4,r5,r6,r7,r8,r9,r10,r11}
.Lpoly1305_emit_enter:

	ldmia	r0,{r3,r4,r5,r6,r7}
	adds	r8,r3,#5		@ compare to modulus
	adcs	r9,r4,#0
	adcs	r10,r5,#0
	adcs	r11,r6,#0
	adc	r7,r7,#0
	tst	r7,#4			@ did it carry/borrow?

#ifdef	__thumb2__
	it	ne
#endif
	movne	r3,r8
	ldr	r8,[r2,#0]
#ifdef	__thumb2__
	it	ne
#endif
	movne	r4,r9
	ldr	r9,[r2,#4]
#ifdef	__thumb2__
	it	ne
#endif
	movne	r5,r10
	ldr	r10,[r2,#8]
#ifdef	__thumb2__
	it	ne
#endif
	movne	r6,r11
	ldr	r11,[r2,#12]

	adds	r3,r3,r8
	adcs	r4,r4,r9
	adcs	r5,r5,r10
	adc	r6,r6,r11

#if __ARM_ARCH__>=7
# ifdef __ARMEB__
	rev	r3,r3
	rev	r4,r4
	rev	r5,r5
	rev	r6,r6
# endif
	str	r3,[r1,#0]
	str	r4,[r1,#4]
	str	r5,[r1,#8]
	str	r6,[r1,#12]
#else
	strb	r3,[r1,#0]
	mov	r3,r3,lsr#8
	strb	r4,[r1,#4]
	mov	r4,r4,lsr#8
	strb	r5,[r1,#8]
	mov	r5,r5,lsr#8
	strb	r6,[r1,#12]
	mov	r6,r6,lsr#8

	strb	r3,[r1,#1]
	mov	r3,r3,lsr#8
	strb	r4,[r1,#5]
	mov	r4,r4,lsr#8
	strb	r5,[r1,#9]
	mov	r5,r5,lsr#8
	strb	r6,[r1,#13]
	mov	r6,r6,lsr#8

	strb	r3,[r1,#2]
	mov	r3,r3,lsr#8
	strb	r4,[r1,#6]
	mov	r4,r4,lsr#8
	strb	r5,[r1,#10]
	mov	r5,r5,lsr#8
	strb	r6,[r1,#14]
	mov	r6,r6,lsr#8

	strb	r3,[r1,#3]
	strb	r4,[r1,#7]
	strb	r5,[r1,#11]
	strb	r6,[r1,#15]
#endif
	ldmia	sp!,{r4,r5,r6,r7,r8,r9,r10,r11}
#if	__ARM_ARCH__>=5
	bx	lr				@ bx	lr
#else
	tst	lr,#1
	moveq	pc,lr			@ be binary compatible with V4, yet
.word	0xe12fff1e			@ interoperable with Thumb ISA:-)
#endif
.size	poly1305_emit,.-poly1305_emit
#if	__ARM_MAX_ARCH__>=7
.fpu	neon

.type	poly1305_init_neon,%function
.align	5
poly1305_init_neon:
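	@ Converts the key at ctx+20 to base 2^26 and caches the powers
	@ r^1..r^4 (interleaved with their 5*r multiples) starting at ctx+48
	@ for the vectorized code below (description inferred from the code).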
	ldr	r4,[r0,#20]		@ load key base 2^32
	ldr	r5,[r0,#24]
	ldr	r6,[r0,#28]
	ldr	r7,[r0,#32]

	and	r2,r4,#0x03ffffff	@ base 2^32 -> base 2^26
	mov	r3,r4,lsr#26
	mov	r4,r5,lsr#20
	orr	r3,r3,r5,lsl#6
	mov	r5,r6,lsr#14
	orr	r4,r4,r6,lsl#12
	mov	r6,r7,lsr#8
	orr	r5,r5,r7,lsl#18
	and	r3,r3,#0x03ffffff
	and	r4,r4,#0x03ffffff
	and	r5,r5,#0x03ffffff

	vdup.32	d0,r2			@ r^1 in both lanes
	add	r2,r3,r3,lsl#2		@ *5
	vdup.32	d1,r3
	add	r3,r4,r4,lsl#2
	vdup.32	d2,r2
	vdup.32	d3,r4
	add	r4,r5,r5,lsl#2
	vdup.32	d4,r3
	vdup.32	d5,r5
	add	r5,r6,r6,lsl#2
	vdup.32	d6,r4
	vdup.32	d7,r6
	vdup.32	d8,r5

	mov	r5,#2		@ counter

.Lsquare_neon:
	@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
	@ d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4
	@ d1 = h1*r0 + h0*r1   + h4*5*r2 + h3*5*r3 + h2*5*r4
	@ d2 = h2*r0 + h1*r1   + h0*r2   + h4*5*r3 + h3*5*r4
	@ d3 = h3*r0 + h2*r1   + h1*r2   + h0*r3   + h4*5*r4
	@ d4 = h4*r0 + h3*r1   + h2*r2   + h1*r3   + h0*r4

	vmull.u32	q5,d0,d0[1]
	vmull.u32	q6,d1,d0[1]
	vmull.u32	q7,d3,d0[1]
	vmull.u32	q8,d5,d0[1]
	vmull.u32	q9,d7,d0[1]

	vmlal.u32	q5,d7,d2[1]
	vmlal.u32	q6,d0,d1[1]
	vmlal.u32	q7,d1,d1[1]
	vmlal.u32	q8,d3,d1[1]
	vmlal.u32	q9,d5,d1[1]

	vmlal.u32	q5,d5,d4[1]
	vmlal.u32	q6,d7,d4[1]
	vmlal.u32	q8,d1,d3[1]
	vmlal.u32	q7,d0,d3[1]
	vmlal.u32	q9,d3,d3[1]

	vmlal.u32	q5,d3,d6[1]
	vmlal.u32	q8,d0,d5[1]
	vmlal.u32	q6,d5,d6[1]
	vmlal.u32	q7,d7,d6[1]
	vmlal.u32	q9,d1,d5[1]

	vmlal.u32	q8,d7,d8[1]
	vmlal.u32	q5,d1,d8[1]
	vmlal.u32	q6,d3,d8[1]
	vmlal.u32	q7,d5,d8[1]
	vmlal.u32	q9,d0,d7[1]

	@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
	@ lazy reduction as discussed in "NEON crypto" by D.J. Bernstein
	@ and P. Schwabe
	@
	@ H0>>+H1>>+H2>>+H3>>+H4
	@ H3>>+H4>>*5+H0>>+H1
	@
	@ Trivia.
	@
	@ The product of an n-bit number and an m-bit number is n+m bits
	@ wide. However! Even though 2^n is an (n+1)-bit number, an m-bit
	@ number multiplied by 2^n is still only n+m bits wide.
	@
	@ The sum of two n-bit numbers is n+1 bits wide; the sum of three
	@ or four is n+2. The sum of 2^m (n-m)-bit numbers and one n-bit
	@ number is n+1 bits wide.
	@
	@ >>+ denotes Hnext += Hn>>26, Hn &= 0x3ffffff. This means that
	@ H0, H2, H3 are guaranteed to be 26 bits wide, while H1 and H4
	@ can be 27. However! In cases when their width exceeds 26 bits
	@ they are bounded by 2^26+2^6. This in turn means that the *sum*
	@ of products with these values can still be viewed as a sum of
	@ 52-bit numbers as long as the number of addends is not a
	@ power of 2. For example,
	@
	@ H4 = H4*R0 + H3*R1 + H2*R2 + H1*R3 + H0 * R4,
	@
	@ which can't be larger than 5 * (2^26 + 2^6) * (2^26 + 2^6), or
	@ 5 * (2^52 + 2*2^32 + 2^12), which in turn is smaller than
	@ 8 * (2^52) or 2^55. However, the value is then multiplied by 5,
	@ so we should be looking at 5 * 5 * (2^52 + 2^33 + 2^12), which
	@ is less than 32 * (2^52) or 2^57. And when processing data we
	@ are looking at three times as many addends...
	@
	@ In the key setup procedure the pre-reduced H0 is bounded by
	@ 5*4+1 and 5*H4 by 5*5 52-bit addends, i.e. 57 bits. But when
	@ hashing input, H0 is bounded by (5*4+1)*3 addends, or 58 bits,
	@ while 5*H4 by 5*5*3, or 59[!] bits. How is this relevant? The
	@ vmlal.u32 instruction accepts 2x32-bit input and writes a
	@ 2x64-bit result. This means that the result of the reduction
	@ has to be compressed back to 32 bits upon loop wrap-around.
	@ This can be done as part of the reduction to minimize the
	@ number of instructions [as well as the number of 128-bit
	@ instructions, which benefits low-end processors], but one has
	@ to make sure that H2 (which is narrower than H0) and 5*H4 are
	@ not wider than 58 bits, so that the result of the right shift
	@ by 26 bits fits in 32 bits. This is also useful on x86, because
	@ it allows paddd to be used in place of paddq, which benefits
	@ Atom, where paddq is ridiculously slow.

	vshr.u64	q15,q8,#26
	vmovn.i64	d16,q8
	vshr.u64	q4,q5,#26
	vmovn.i64	d10,q5
	vadd.i64	q9,q9,q15		@ h3 -> h4
	vbic.i32	d16,#0xfc000000	@ &=0x03ffffff
	vadd.i64	q6,q6,q4		@ h0 -> h1
	vbic.i32	d10,#0xfc000000

	vshrn.u64	d30,q9,#26
	vmovn.i64	d18,q9
	vshr.u64	q4,q6,#26
	vmovn.i64	d12,q6
	vadd.i64	q7,q7,q4		@ h1 -> h2
	vbic.i32	d18,#0xfc000000
	vbic.i32	d12,#0xfc000000

	vadd.i32	d10,d10,d30
	vshl.u32	d30,d30,#2
	vshrn.u64	d8,q7,#26
	vmovn.i64	d14,q7
	vadd.i32	d10,d10,d30	@ h4 -> h0
	vadd.i32	d16,d16,d8	@ h2 -> h3
	vbic.i32	d14,#0xfc000000

	vshr.u32	d30,d10,#26
	vbic.i32	d10,#0xfc000000
	vshr.u32	d8,d16,#26
	vbic.i32	d16,#0xfc000000
	vadd.i32	d12,d12,d30	@ h0 -> h1
	vadd.i32	d18,d18,d8	@ h3 -> h4

	subs	r5,r5,#1
	beq	.Lsquare_break_neon

	add	r6,r0,#(48+0*9*4)
	add	r7,r0,#(48+1*9*4)

	vtrn.32	d0,d10		@ r^2:r^1
	vtrn.32	d3,d14
	vtrn.32	d5,d16
	vtrn.32	d1,d12
	vtrn.32	d7,d18

	vshl.u32	d4,d3,#2		@ *5
	vshl.u32	d6,d5,#2
	vshl.u32	d2,d1,#2
	vshl.u32	d8,d7,#2
	vadd.i32	d4,d4,d3
	vadd.i32	d2,d2,d1
	vadd.i32	d6,d6,d5
	vadd.i32	d8,d8,d7

	vst4.32	{d0[0],d1[0],d2[0],d3[0]},[r6]!
	vst4.32	{d0[1],d1[1],d2[1],d3[1]},[r7]!
	vst4.32	{d4[0],d5[0],d6[0],d7[0]},[r6]!
	vst4.32	{d4[1],d5[1],d6[1],d7[1]},[r7]!
	vst1.32	{d8[0]},[r6,:32]
	vst1.32	{d8[1]},[r7,:32]

	b	.Lsquare_neon

.align	4
.Lsquare_break_neon:
	add	r6,r0,#(48+2*4*9)
	add	r7,r0,#(48+3*4*9)

	vmov	d0,d10		@ r^4:r^3
	vshl.u32	d2,d12,#2		@ *5
	vmov	d1,d12
	vshl.u32	d4,d14,#2
	vmov	d3,d14
	vshl.u32	d6,d16,#2
	vmov	d5,d16
	vshl.u32	d8,d18,#2
	vmov	d7,d18
	vadd.i32	d2,d2,d12
	vadd.i32	d4,d4,d14
	vadd.i32	d6,d6,d16
	vadd.i32	d8,d8,d18

	vst4.32	{d0[0],d1[0],d2[0],d3[0]},[r6]!
	vst4.32	{d0[1],d1[1],d2[1],d3[1]},[r7]!
	vst4.32	{d4[0],d5[0],d6[0],d7[0]},[r6]!
	vst4.32	{d4[1],d5[1],d6[1],d7[1]},[r7]!
	vst1.32	{d8[0]},[r6]
	vst1.32	{d8[1]},[r7]

	bx	lr				@ bx	lr
.size	poly1305_init_neon,.-poly1305_init_neon

.type	poly1305_blocks_neon,%function
.align	5
poly1305_blocks_neon:
.Lpoly1305_blocks_neon:
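	@ Same arguments as poly1305_blocks.  Inputs shorter than 64 bytes
	@ are handed to the scalar code unless the hash is already in base
	@ 2^26 (behaviour inferred from the branches below).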
	ldr	ip,[r0,#36]		@ is_base2_26
	ands	r2,r2,#-16
	beq	.Lno_data_neon

	cmp	r2,#64
	bhs	.Lenter_neon
	tst	ip,ip			@ is_base2_26?
	beq	.Lpoly1305_blocks

.Lenter_neon:
	stmdb	sp!,{r4,r5,r6,r7}
	vstmdb	sp!,{d8,d9,d10,d11,d12,d13,d14,d15}		@ ABI specification says so

	tst	ip,ip			@ is_base2_26?
	bne	.Lbase2_26_neon

	stmdb	sp!,{r1,r2,r3,lr}
	bl	poly1305_init_neon

	ldr	r4,[r0,#0]		@ load hash value base 2^32
	ldr	r5,[r0,#4]
	ldr	r6,[r0,#8]
	ldr	r7,[r0,#12]
	ldr	ip,[r0,#16]

	and	r2,r4,#0x03ffffff	@ base 2^32 -> base 2^26
	mov	r3,r4,lsr#26
	veor	d10,d10,d10
	mov	r4,r5,lsr#20
	orr	r3,r3,r5,lsl#6
	veor	d12,d12,d12
	mov	r5,r6,lsr#14
	orr	r4,r4,r6,lsl#12
	veor	d14,d14,d14
	mov	r6,r7,lsr#8
	orr	r5,r5,r7,lsl#18
	veor	d16,d16,d16
	and	r3,r3,#0x03ffffff
	orr	r6,r6,ip,lsl#24
	veor	d18,d18,d18
	and	r4,r4,#0x03ffffff
	mov	r1,#1
	and	r5,r5,#0x03ffffff
	str	r1,[r0,#36]		@ is_base2_26

	vmov.32	d10[0],r2
	vmov.32	d12[0],r3
	vmov.32	d14[0],r4
	vmov.32	d16[0],r5
	vmov.32	d18[0],r6
	adr	r5,.Lzeros

	ldmia	sp!,{r1,r2,r3,lr}
	b	.Lbase2_32_neon

.align	4
.Lbase2_26_neon:
	@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
	@ load hash value

	veor	d10,d10,d10
	veor	d12,d12,d12
	veor	d14,d14,d14
	veor	d16,d16,d16
	veor	d18,d18,d18
	vld4.32	{d10[0],d12[0],d14[0],d16[0]},[r0]!
	adr	r5,.Lzeros
	vld1.32	{d18[0]},[r0]
	sub	r0,r0,#16		@ rewind

.Lbase2_32_neon:
	add	r4,r1,#32
	mov	r3,r3,lsl#24
	tst	r2,#31
	beq	.Leven

	vld4.32	{d20[0],d22[0],d24[0],d26[0]},[r1]!
	vmov.32	d28[0],r3
	sub	r2,r2,#16
	add	r4,r1,#32

# ifdef	__ARMEB__
	vrev32.8	q10,q10
	vrev32.8	q13,q13
	vrev32.8	q11,q11
	vrev32.8	q12,q12
# endif
	vsri.u32	d28,d26,#8	@ base 2^32 -> base 2^26
	vshl.u32	d26,d26,#18

	vsri.u32	d26,d24,#14
	vshl.u32	d24,d24,#12
	vadd.i32	d29,d28,d18	@ add hash value and move to #hi

	vbic.i32	d26,#0xfc000000
	vsri.u32	d24,d22,#20
	vshl.u32	d22,d22,#6

	vbic.i32	d24,#0xfc000000
	vsri.u32	d22,d20,#26
	vadd.i32	d27,d26,d16

	vbic.i32	d20,#0xfc000000
	vbic.i32	d22,#0xfc000000
	vadd.i32	d25,d24,d14

	vadd.i32	d21,d20,d10
	vadd.i32	d23,d22,d12

	mov	r7,r5
	add	r6,r0,#48

	cmp	r2,r2
	b	.Long_tail

.align	4
.Leven:
	subs	r2,r2,#64
	it	lo
	movlo	r4,r5

	vmov.i32	q14,#1<<24		@ padbit, yes, always
	vld4.32	{d20,d22,d24,d26},[r1]	@ inp[0:1]
	add	r1,r1,#64
	vld4.32	{d21,d23,d25,d27},[r4]	@ inp[2:3] (or 0)
	add	r4,r4,#64
	itt	hi
	addhi	r7,r0,#(48+1*9*4)
	addhi	r6,r0,#(48+3*9*4)

# ifdef	__ARMEB__
	vrev32.8	q10,q10
	vrev32.8	q13,q13
	vrev32.8	q11,q11
	vrev32.8	q12,q12
# endif
	vsri.u32	q14,q13,#8		@ base 2^32 -> base 2^26
	vshl.u32	q13,q13,#18

	vsri.u32	q13,q12,#14
	vshl.u32	q12,q12,#12

	vbic.i32	q13,#0xfc000000
	vsri.u32	q12,q11,#20
	vshl.u32	q11,q11,#6

	vbic.i32	q12,#0xfc000000
	vsri.u32	q11,q10,#26

	vbic.i32	q10,#0xfc000000
	vbic.i32	q11,#0xfc000000

	bls	.Lskip_loop

	vld4.32	{d0[1],d1[1],d2[1],d3[1]},[r7]!	@ load r^2
	vld4.32	{d0[0],d1[0],d2[0],d3[0]},[r6]!	@ load r^4
	vld4.32	{d4[1],d5[1],d6[1],d7[1]},[r7]!
	vld4.32	{d4[0],d5[0],d6[0],d7[0]},[r6]!
	b	.Loop_neon

.align	5
.Loop_neon:
	@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
	@ ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2
	@ ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^3+inp[7]*r
	@   ___________________/
	@ ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2+inp[8])*r^2
	@ ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^4+inp[7]*r^2+inp[9])*r
	@   ___________________/ ____________________/
	@
	@ Note that we start with inp[2:3]*r^2. This is because it
	@ does not depend on the reduction from the previous iteration.
	@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
	@ d4 = h4*r0 + h3*r1   + h2*r2   + h1*r3   + h0*r4
	@ d3 = h3*r0 + h2*r1   + h1*r2   + h0*r3   + h4*5*r4
	@ d2 = h2*r0 + h1*r1   + h0*r2   + h4*5*r3 + h3*5*r4
	@ d1 = h1*r0 + h0*r1   + h4*5*r2 + h3*5*r3 + h2*5*r4
	@ d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4

	@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
	@ inp[2:3]*r^2

	vadd.i32	d24,d24,d14	@ accumulate inp[0:1]
	vmull.u32	q7,d25,d0[1]
	vadd.i32	d20,d20,d10
	vmull.u32	q5,d21,d0[1]
	vadd.i32	d26,d26,d16
	vmull.u32	q8,d27,d0[1]
	vmlal.u32	q7,d23,d1[1]
	vadd.i32	d22,d22,d12
	vmull.u32	q6,d23,d0[1]

	vadd.i32	d28,d28,d18
	vmull.u32	q9,d29,d0[1]
	subs	r2,r2,#64
	vmlal.u32	q5,d29,d2[1]
	it	lo
	movlo	r4,r5
	vmlal.u32	q8,d25,d1[1]
	vld1.32	d8[1],[r7,:32]
	vmlal.u32	q6,d21,d1[1]
	vmlal.u32	q9,d27,d1[1]

	vmlal.u32	q5,d27,d4[1]
	vmlal.u32	q8,d23,d3[1]
	vmlal.u32	q9,d25,d3[1]
	vmlal.u32	q6,d29,d4[1]
	vmlal.u32	q7,d21,d3[1]

	vmlal.u32	q8,d21,d5[1]
	vmlal.u32	q5,d25,d6[1]
	vmlal.u32	q9,d23,d5[1]
	vmlal.u32	q6,d27,d6[1]
	vmlal.u32	q7,d29,d6[1]

	vmlal.u32	q8,d29,d8[1]
	vmlal.u32	q5,d23,d8[1]
	vmlal.u32	q9,d21,d7[1]
	vmlal.u32	q6,d25,d8[1]
	vmlal.u32	q7,d27,d8[1]

	vld4.32	{d21,d23,d25,d27},[r4]	@ inp[2:3] (or 0)
	add	r4,r4,#64

	@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
	@ (hash+inp[0:1])*r^4 and accumulate

	vmlal.u32	q8,d26,d0[0]
	vmlal.u32	q5,d20,d0[0]
	vmlal.u32	q9,d28,d0[0]
	vmlal.u32	q6,d22,d0[0]
	vmlal.u32	q7,d24,d0[0]
	vld1.32	d8[0],[r6,:32]

	vmlal.u32	q8,d24,d1[0]
	vmlal.u32	q5,d28,d2[0]
	vmlal.u32	q9,d26,d1[0]
	vmlal.u32	q6,d20,d1[0]
	vmlal.u32	q7,d22,d1[0]

	vmlal.u32	q8,d22,d3[0]
	vmlal.u32	q5,d26,d4[0]
	vmlal.u32	q9,d24,d3[0]
	vmlal.u32	q6,d28,d4[0]
	vmlal.u32	q7,d20,d3[0]

	vmlal.u32	q8,d20,d5[0]
	vmlal.u32	q5,d24,d6[0]
	vmlal.u32	q9,d22,d5[0]
	vmlal.u32	q6,d26,d6[0]
	vmlal.u32	q8,d28,d8[0]

	vmlal.u32	q7,d28,d6[0]
	vmlal.u32	q5,d22,d8[0]
	vmlal.u32	q9,d20,d7[0]
	vmov.i32	q14,#1<<24		@ padbit, yes, always
	vmlal.u32	q6,d24,d8[0]
	vmlal.u32	q7,d26,d8[0]

	vld4.32	{d20,d22,d24,d26},[r1]	@ inp[0:1]
	add	r1,r1,#64
# ifdef	__ARMEB__
	vrev32.8	q10,q10
	vrev32.8	q11,q11
	vrev32.8	q12,q12
	vrev32.8	q13,q13
# endif

	@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
	@ lazy reduction interleaved with base 2^32 -> base 2^26 of
	@ inp[0:3] previously loaded to q10-q13 and smashed to q10-q14.

	vshr.u64	q15,q8,#26
	vmovn.i64	d16,q8
	vshr.u64	q4,q5,#26
	vmovn.i64	d10,q5
	vadd.i64	q9,q9,q15		@ h3 -> h4
	vbic.i32	d16,#0xfc000000
	vsri.u32	q14,q13,#8		@ base 2^32 -> base 2^26
	vadd.i64	q6,q6,q4		@ h0 -> h1
	vshl.u32	q13,q13,#18
	vbic.i32	d10,#0xfc000000

	vshrn.u64	d30,q9,#26
	vmovn.i64	d18,q9
	vshr.u64	q4,q6,#26
	vmovn.i64	d12,q6
	vadd.i64	q7,q7,q4		@ h1 -> h2
	vsri.u32	q13,q12,#14
	vbic.i32	d18,#0xfc000000
	vshl.u32	q12,q12,#12
	vbic.i32	d12,#0xfc000000

	vadd.i32	d10,d10,d30
	vshl.u32	d30,d30,#2
	vbic.i32	q13,#0xfc000000
	vshrn.u64	d8,q7,#26
	vmovn.i64	d14,q7
	vaddl.u32	q5,d10,d30	@ h4 -> h0 [widen for a sec]
	vsri.u32	q12,q11,#20
	vadd.i32	d16,d16,d8	@ h2 -> h3
	vshl.u32	q11,q11,#6
	vbic.i32	d14,#0xfc000000
	vbic.i32	q12,#0xfc000000

	vshrn.u64	d30,q5,#26		@ re-narrow
	vmovn.i64	d10,q5
	vsri.u32	q11,q10,#26
	vbic.i32	q10,#0xfc000000
	vshr.u32	d8,d16,#26
	vbic.i32	d16,#0xfc000000
	vbic.i32	d10,#0xfc000000
	vadd.i32	d12,d12,d30	@ h0 -> h1
	vadd.i32	d18,d18,d8	@ h3 -> h4
	vbic.i32	q11,#0xfc000000

	bhi	.Loop_neon

.Lskip_loop:
	@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
	@ multiply (inp[0:1]+hash) or inp[2:3] by r^2:r^1

	add	r7,r0,#(48+0*9*4)
	add	r6,r0,#(48+1*9*4)
	adds	r2,r2,#32
	it	ne
	movne	r2,#0
	bne	.Long_tail

	vadd.i32	d25,d24,d14	@ add hash value and move to #hi
	vadd.i32	d21,d20,d10
	vadd.i32	d27,d26,d16
	vadd.i32	d23,d22,d12
	vadd.i32	d29,d28,d18

.Long_tail:
	vld4.32	{d0[1],d1[1],d2[1],d3[1]},[r7]!	@ load r^1
	vld4.32	{d0[0],d1[0],d2[0],d3[0]},[r6]!	@ load r^2

	vadd.i32	d24,d24,d14	@ can be redundant
	vmull.u32	q7,d25,d0
	vadd.i32	d20,d20,d10
	vmull.u32	q5,d21,d0
	vadd.i32	d26,d26,d16
	vmull.u32	q8,d27,d0
	vadd.i32	d22,d22,d12
	vmull.u32	q6,d23,d0
	vadd.i32	d28,d28,d18
	vmull.u32	q9,d29,d0

	vmlal.u32	q5,d29,d2
	vld4.32	{d4[1],d5[1],d6[1],d7[1]},[r7]!
	vmlal.u32	q8,d25,d1
	vld4.32	{d4[0],d5[0],d6[0],d7[0]},[r6]!
	vmlal.u32	q6,d21,d1
	vmlal.u32	q9,d27,d1
	vmlal.u32	q7,d23,d1

	vmlal.u32	q8,d23,d3
	vld1.32	d8[1],[r7,:32]
	vmlal.u32	q5,d27,d4
	vld1.32	d8[0],[r6,:32]
	vmlal.u32	q9,d25,d3
	vmlal.u32	q6,d29,d4
	vmlal.u32	q7,d21,d3

	vmlal.u32	q8,d21,d5
	it	ne
	addne	r7,r0,#(48+2*9*4)
	vmlal.u32	q5,d25,d6
	it	ne
	addne	r6,r0,#(48+3*9*4)
	vmlal.u32	q9,d23,d5
	vmlal.u32	q6,d27,d6
	vmlal.u32	q7,d29,d6

	vmlal.u32	q8,d29,d8
	vorn	q0,q0,q0	@ all-ones, can be redundant
	vmlal.u32	q5,d23,d8
	vshr.u64	q0,q0,#38
	vmlal.u32	q9,d21,d7
	vmlal.u32	q6,d25,d8
	vmlal.u32	q7,d27,d8

	beq	.Lshort_tail

	@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
	@ (hash+inp[0:1])*r^4:r^3 and accumulate

	vld4.32	{d0[1],d1[1],d2[1],d3[1]},[r7]!	@ load r^3
	vld4.32	{d0[0],d1[0],d2[0],d3[0]},[r6]!	@ load r^4

	vmlal.u32	q7,d24,d0
	vmlal.u32	q5,d20,d0
	vmlal.u32	q8,d26,d0
	vmlal.u32	q6,d22,d0
	vmlal.u32	q9,d28,d0

	vmlal.u32	q5,d28,d2
	vld4.32	{d4[1],d5[1],d6[1],d7[1]},[r7]!
	vmlal.u32	q8,d24,d1
	vld4.32	{d4[0],d5[0],d6[0],d7[0]},[r6]!
	vmlal.u32	q6,d20,d1
	vmlal.u32	q9,d26,d1
	vmlal.u32	q7,d22,d1

	vmlal.u32	q8,d22,d3
	vld1.32	d8[1],[r7,:32]
	vmlal.u32	q5,d26,d4
	vld1.32	d8[0],[r6,:32]
	vmlal.u32	q9,d24,d3
	vmlal.u32	q6,d28,d4
	vmlal.u32	q7,d20,d3

	vmlal.u32	q8,d20,d5
	vmlal.u32	q5,d24,d6
	vmlal.u32	q9,d22,d5
	vmlal.u32	q6,d26,d6
	vmlal.u32	q7,d28,d6

	vmlal.u32	q8,d28,d8
	vorn	q0,q0,q0	@ all-ones
	vmlal.u32	q5,d22,d8
	vshr.u64	q0,q0,#38
	vmlal.u32	q9,d20,d7
	vmlal.u32	q6,d24,d8
	vmlal.u32	q7,d26,d8

.Lshort_tail:
	@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
	@ horizontal addition

	vadd.i64	d16,d16,d17
	vadd.i64	d10,d10,d11
	vadd.i64	d18,d18,d19
	vadd.i64	d12,d12,d13
	vadd.i64	d14,d14,d15

	@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
	@ lazy reduction, but without narrowing

	vshr.u64	q15,q8,#26
	vand.i64	q8,q8,q0
	vshr.u64	q4,q5,#26
	vand.i64	q5,q5,q0
	vadd.i64	q9,q9,q15		@ h3 -> h4
	vadd.i64	q6,q6,q4		@ h0 -> h1

	vshr.u64	q15,q9,#26
	vand.i64	q9,q9,q0
	vshr.u64	q4,q6,#26
	vand.i64	q6,q6,q0
	vadd.i64	q7,q7,q4		@ h1 -> h2

	vadd.i64	q5,q5,q15
	vshl.u64	q15,q15,#2
	vshr.u64	q4,q7,#26
	vand.i64	q7,q7,q0
	vadd.i64	q5,q5,q15		@ h4 -> h0
	vadd.i64	q8,q8,q4		@ h2 -> h3

	vshr.u64	q15,q5,#26
	vand.i64	q5,q5,q0
	vshr.u64	q4,q8,#26
	vand.i64	q8,q8,q0
	vadd.i64	q6,q6,q15		@ h0 -> h1
	vadd.i64	q9,q9,q4		@ h3 -> h4

	cmp	r2,#0
	bne	.Leven

	@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
	@ store hash value

	vst4.32	{d10[0],d12[0],d14[0],d16[0]},[r0]!
	vst1.32	{d18[0]},[r0]

	vldmia	sp!,{d8,d9,d10,d11,d12,d13,d14,d15}			@ epilogue
	ldmia	sp!,{r4,r5,r6,r7}
.Lno_data_neon:
	bx	lr					@ bx	lr
.size	poly1305_blocks_neon,.-poly1305_blocks_neon

.type	poly1305_emit_neon,%function
.align	5
poly1305_emit_neon:
.Lpoly1305_emit_neon:
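	@ Same arguments as poly1305_emit.  If the hash is still in base 2^32
	@ this branches to the scalar path; otherwise it first converts the
	@ base 2^26 hash back to base 2^32 (inferred from the code below).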
	ldr	ip,[r0,#36]		@ is_base2_26

	stmdb	sp!,{r4,r5,r6,r7,r8,r9,r10,r11}

	tst	ip,ip
	beq	.Lpoly1305_emit_enter

	ldmia	r0,{r3,r4,r5,r6,r7}
	eor	r8,r8,r8

	adds	r3,r3,r4,lsl#26	@ base 2^26 -> base 2^32
	mov	r4,r4,lsr#6
	adcs	r4,r4,r5,lsl#20
	mov	r5,r5,lsr#12
	adcs	r5,r5,r6,lsl#14
	mov	r6,r6,lsr#18
	adcs	r6,r6,r7,lsl#8
	adc	r7,r8,r7,lsr#24	@ can be partially reduced ...

	and	r8,r7,#-4		@ ... so reduce
	and	r7,r6,#3
	add	r8,r8,r8,lsr#2	@ *= 5
	adds	r3,r3,r8
	adcs	r4,r4,#0
	adcs	r5,r5,#0
	adcs	r6,r6,#0
	adc	r7,r7,#0

	adds	r8,r3,#5		@ compare to modulus
	adcs	r9,r4,#0
	adcs	r10,r5,#0
	adcs	r11,r6,#0
	adc	r7,r7,#0
	tst	r7,#4			@ did it carry/borrow?

	it	ne
	movne	r3,r8
	ldr	r8,[r2,#0]
	it	ne
	movne	r4,r9
	ldr	r9,[r2,#4]
	it	ne
	movne	r5,r10
	ldr	r10,[r2,#8]
	it	ne
	movne	r6,r11
	ldr	r11,[r2,#12]

	adds	r3,r3,r8		@ accumulate nonce
	adcs	r4,r4,r9
	adcs	r5,r5,r10
	adc	r6,r6,r11

# ifdef __ARMEB__
	rev	r3,r3
	rev	r4,r4
	rev	r5,r5
	rev	r6,r6
# endif
	str	r3,[r1,#0]		@ store the result
	str	r4,[r1,#4]
	str	r5,[r1,#8]
	str	r6,[r1,#12]

	ldmia	sp!,{r4,r5,r6,r7,r8,r9,r10,r11}
	bx	lr				@ bx	lr
.size	poly1305_emit_neon,.-poly1305_emit_neon

.align	5
.Lzeros:
.long	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
.LOPENSSL_armcap:
.word	OPENSSL_armcap_P-.Lpoly1305_init
#endif
.byte	80,111,108,121,49,51,48,53,32,102,111,114,32,65,82,77,118,52,47,78,69,79,78,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
.align	2
.align	2
#if	__ARM_MAX_ARCH__>=7
.comm	OPENSSL_armcap_P,4,4
#endif