xref: /freebsd/sys/crypto/openssl/aarch64/poly1305-armv8.S (revision e2eeea75eb8b6dd50c1298067a0655880d186734)
1/* $FreeBSD$ */
2/* Do not modify. This file is auto-generated from poly1305-armv8.pl. */
3#include "arm_arch.h"
4
.text

// poly1305_blocks and poly1305_emit are referenced before they are
// defined; the original generator notes that Apple toolchains require
// such symbols to be declared up front.

.globl	poly1305_blocks
.globl	poly1305_emit
11
.globl	poly1305_init
.type	poly1305_init, %function
.align	5
// poly1305_init: clear the accumulator and, when a key is supplied,
// store the clamped key and publish the block/emit entry points.
//
//   x0  state block; bytes 0..31 are zeroed, bytes 32..47 receive the key
//   x1  pointer to 16 key bytes, or NULL
//   x2  receives two code addresses (blocks routine first, then emit)
//
// Returns x0 = 0 when x1 is NULL (only the zeroing is performed) and
// x0 = 1 otherwise.  Scratch registers: x7-x13, x17, condition flags.
poly1305_init:
	cmp	x1, xzr
	stp	xzr, xzr, [x0]			// hash words at +0 and +8
	stp	xzr, xzr, [x0, #16]		// third hash word and the is_base2_26 flag

	csel	x0, xzr, x0, eq			// NULL key: the return value becomes 0
	b.eq	.Linit_ret

	// .LOPENSSL_armcap_P stores the distance from itself to
	// OPENSSL_armcap_P; fetch that distance and the slot address.
#ifdef	__ILP32__
	ldrsw	x11, .LOPENSSL_armcap_P
#else
	ldr	x11, .LOPENSSL_armcap_P
#endif
	adr	x10, .LOPENSSL_armcap_P

	ldp	x7, x8, [x1]			// the two 64-bit key words
	mov	x9, #0xfffffffc0fffffff
	movk	x9, #0x0fff, lsl #48		// x9 = 0x0ffffffc0fffffff
	ldr	w17, [x10, x11]			// capability word
#ifdef	__ARMEB__
	rev	x7, x7				// byte-swap the key words
	rev	x8, x8
#endif
	and	x7, x7, x9			// r0 &= 0x0ffffffc0fffffff
	and	x9, x9, #-4
	and	x8, x8, x9			// r1 &= 0x0ffffffc0ffffffc
	stp	x7, x8, [x0, #32]		// clamped key lives at +32

	tst	w17, #ARMV7_NEON		// eq <=> NEON bit is clear

	adr	x12, poly1305_blocks
	adr	x7, poly1305_blocks_neon
	adr	x13, poly1305_emit
	adr	x8, poly1305_emit_neon

	csel	x12, x12, x7, eq		// scalar pair when eq, NEON pair otherwise
	csel	x13, x13, x8, eq

#ifdef	__ILP32__
	stp	w12, w13, [x2]			// 32-bit code addresses
#else
	stp	x12, x13, [x2]
#endif

	mov	x0, #1
.Linit_ret:
	ret
.size	poly1305_init, .-poly1305_init
63
.type	poly1305_blocks, %function
.align	5
// poly1305_blocks: scalar block loop with the hash held as three
// 64-bit words.
//
//   x0  state block (hash at +0/+8/+16, key words r0/r1 at +32/+40)
//   x1  input bytes, consumed 16 at a time
//   x2  byte count; rounded down to a multiple of 16, zero is a no-op
//   x3  value added into the top hash word together with every block
//
// Every round computes h = (h + block + x3*2^128) * r followed by a
// partial reduction.  Scratch registers: x4-x14, condition flags.
poly1305_blocks:
	ands	x2, x2, #-16
	b.eq	.Lblocks_ret

	ldp	x4, x5, [x0]			// h0, h1
	ldp	x7, x8, [x0, #32]		// r0, r1
	ldr	x6, [x0, #16]			// h2
	add	x9, x8, x8, lsr #2		// s1 = r1 + (r1 >> 2)
	b	.Lblocks_loop

.align	5
.Lblocks_loop:
	ldp	x10, x11, [x1], #16		// next 16 input bytes
	sub	x2, x2, #16
#ifdef	__ARMEB__
	rev	x10, x10
	rev	x11, x11
#endif
	adds	x4, x4, x10			// h += block
	adcs	x5, x5, x11

	mul	x12, x4, x7			// low half of h0*r0
	adc	x6, x6, x3			// carry plus the x3 term into h2
	umulh	x13, x4, x7			// high half of h0*r0

	mul	x10, x5, x9			// h1*s1
	umulh	x11, x5, x9

	adds	x12, x12, x10
	mul	x10, x4, x8			// h0*r1
	adc	x13, x13, x11
	umulh	x14, x4, x8

	adds	x13, x13, x10
	mul	x10, x5, x7			// h1*r0
	adc	x14, x14, xzr
	umulh	x11, x5, x7

	adds	x13, x13, x10
	mul	x10, x6, x9			// h2*s1
	adc	x14, x14, x11
	mul	x11, x6, x7			// h2*r0

	adds	x13, x13, x10
	adc	x14, x14, x11

	// Fold everything above bit 130 back in: the part of x14 above
	// its two low bits is added once as-is and once shifted right by 2.
	and	x10, x14, #-4
	and	x6, x14, #3
	add	x10, x10, x14, lsr #2
	adds	x4, x12, x10
	adcs	x5, x13, xzr
	adc	x6, x6, xzr

	cbnz	x2, .Lblocks_loop

	stp	x4, x5, [x0]			// write the hash back
	str	x6, [x0, #16]

.Lblocks_ret:
	ret
.size	poly1305_blocks, .-poly1305_blocks
127
.type	poly1305_emit, %function
.align	5
// poly1305_emit: final reduction of a base 2^64 hash, then add the
// nonce and store the 16-byte result.
//
//   x0  state block (hash words at +0/+8/+16)
//   x1  destination for the 16 result bytes
//   x2  16 bytes of nonce
//
// Scratch registers: x4-x6, x10-x14, condition flags.
poly1305_emit:
	ldp	x4, x5, [x0]			// h0, h1
	ldr	x6, [x0, #16]			// h2
	ldp	x10, x11, [x2]			// nonce words

	adds	x12, x4, #5			// candidate = h + 5
	adcs	x13, x5, xzr
	adc	x14, x6, xzr

	tst	x14, #-4			// did h + 5 reach bit 130?

	csel	x4, x4, x12, eq			// no: keep h; yes: take h + 5
	csel	x5, x5, x13, eq

#ifdef	__ARMEB__
	ror	x10, x10, #32			// swap the 32-bit halves of each nonce word
	ror	x11, x11, #32
#endif
	adds	x4, x4, x10			// add nonce, carry out of bit 127 is dropped
	adc	x5, x5, x11
#ifdef	__ARMEB__
	rev	x4, x4				// byte-swap the result words
	rev	x5, x5
#endif
	stp	x4, x5, [x1]

	ret
.size	poly1305_emit, .-poly1305_emit
.type	poly1305_mult, %function
.align	5
// poly1305_mult: internal helper, register-only interface.
//
//   in   x4, x5, x6  hash words h0, h1, h2
//        x7, x8      key words r0, r1
//        x9          s1 = r1 + (r1 >> 2), prepared by the caller
//   out  x4, x5, x6  h * r after a partial reduction
//
// Scratch registers: x10-x14, condition flags.  No memory is touched.
poly1305_mult:
	mul	x12, x4, x7			// low half of h0*r0
	umulh	x13, x4, x7			// high half of h0*r0

	mul	x10, x5, x9			// h1*s1
	umulh	x11, x5, x9

	adds	x12, x12, x10
	mul	x10, x4, x8			// h0*r1
	adc	x13, x13, x11
	umulh	x14, x4, x8

	adds	x13, x13, x10
	mul	x10, x5, x7			// h1*r0
	adc	x14, x14, xzr
	umulh	x11, x5, x7

	adds	x13, x13, x10
	mul	x10, x6, x9			// h2*s1
	adc	x14, x14, x11
	mul	x11, x6, x7			// h2*r0

	adds	x13, x13, x10
	adc	x14, x14, x11

	// Fold the bits of x14 above its two lowest back into the low word:
	// once unshifted, once shifted right by two.
	and	x10, x14, #-4
	and	x6, x14, #3
	add	x10, x10, x14, lsr #2
	adds	x4, x12, x10
	adcs	x5, x13, xzr
	adc	x6, x6, xzr

	ret
.size	poly1305_mult, .-poly1305_mult
194
.type	poly1305_splat, %function
.align	5
// poly1305_splat: internal helper, register-only interface.
//
//   in   x4, x5, x6  a value held as three 64-bit words
//        x0          destination base address
//
// Splits the value into five 26-bit limbs and writes nine 32-bit words
// at a 16-byte stride from x0: limb0, limb1, 5*limb1, limb2, 5*limb2,
// limb3, 5*limb3, limb4, 5*limb4.  Scratch registers: x12-x16.
poly1305_splat:
	and	x12, x4, #0x03ffffff		// limb0 = bits 0..25
	ubfx	x13, x4, #26, #26		// limb1 = bits 26..51
	extr	x14, x5, x4, #52
	and	x14, x14, #0x03ffffff		// limb2 = bits 52..77
	ubfx	x15, x5, #14, #26		// limb3 = bits 78..103
	extr	x16, x6, x5, #40		// limb4 = bits 104 and up

	str	w12, [x0, #16*0]		// r0
	add	w12, w13, w13, lsl #2		// 5*limb1
	str	w13, [x0, #16*1]		// r1
	add	w13, w14, w14, lsl #2		// 5*limb2
	str	w12, [x0, #16*2]		// s1
	str	w14, [x0, #16*3]		// r2
	add	w14, w15, w15, lsl #2		// 5*limb3
	str	w13, [x0, #16*4]		// s2
	str	w15, [x0, #16*5]		// r3
	add	w15, w16, w16, lsl #2		// 5*limb4
	str	w14, [x0, #16*6]		// s3
	str	w16, [x0, #16*7]		// r4
	str	w15, [x0, #16*8]		// s4

	ret
.size	poly1305_splat, .-poly1305_splat
221
// poly1305_blocks_neon: NEON block routine.
//
//   x0  state block:
//         +0   hash, either three 64-bit words (base 2^64) or five
//              32-bit limbs (base 2^26)
//         +24  is_base2_26 flag, non-zero when the hash is in base 2^26
//         +32  key words r0, r1
//         +48  table of r^4:r^3:r^2:r^1 limbs, nine 16-byte rows
//              (r0, r1, s1, r2, s2, r3, s3, r4, s4) written by poly1305_splat
//   x1  input bytes
//   x2  byte count, rounded down to a multiple of 16
//   x3  value added at bit 128 of every block
//
// When x2 is below 128 and the hash is still in base 2^64 the work is
// handed to poly1305_blocks.  Otherwise an 80-byte frame is built that
// holds x29/x30 and d8-d15, the r^n table is initialised on first use,
// and input is consumed 64 bytes per loop iteration.
//
// Vector register roles inside the loop:
//   v0-v8    table rows, v9-v13 limbs of inp[0:1], v14-v18 limbs of inp[2:3]
//   v19-v23  accumulators d0..d4, v24-v28 hash limbs, v31 26-bit mask
//
// The return address is signed on entry (paciasp) against the entry
// value of sp, so it must be authenticated only after the frame has
// been released; see the epilogue.
.type	poly1305_blocks_neon,%function
.align	5
poly1305_blocks_neon:
	ldr	x17,[x0,#24]
	cmp	x2,#128
	b.hs	.Lblocks_neon
	cbz	x17,poly1305_blocks

.Lblocks_neon:
.inst	0xd503233f		// paciasp
	stp	x29,x30,[sp,#-80]!
	add	x29,sp,#0

	ands	x2,x2,#-16
	b.eq	.Lno_data_neon

	cbz	x17,.Lbase2_64_neon

	ldp	w10,w11,[x0]		// load hash value base 2^26
	ldp	w12,w13,[x0,#8]
	ldr	w14,[x0,#16]

	tst	x2,#31
	b.eq	.Leven_neon

	ldp	x7,x8,[x0,#32]	// load key value

	add	x4,x10,x11,lsl#26	// base 2^26 -> base 2^64
	lsr	x5,x12,#12
	adds	x4,x4,x12,lsl#52
	add	x5,x5,x13,lsl#14
	adc	x5,x5,xzr
	lsr	x6,x14,#24
	adds	x5,x5,x14,lsl#40
	adc	x14,x6,xzr		// can be partially reduced...

	ldp	x12,x13,[x1],#16	// load input
	sub	x2,x2,#16
	add	x9,x8,x8,lsr#2	// s1 = r1 + (r1 >> 2)

	and	x10,x14,#-4		// ... so reduce
	and	x6,x14,#3
	add	x10,x10,x14,lsr#2
	adds	x4,x4,x10
	adcs	x5,x5,xzr
	adc	x6,x6,xzr

#ifdef	__ARMEB__
	rev	x12,x12
	rev	x13,x13
#endif
	adds	x4,x4,x12		// accumulate input
	adcs	x5,x5,x13
	adc	x6,x6,x3

	bl	poly1305_mult
	ldr	x30,[sp,#8]		// bl overwrote x30, reload the signed copy

	cbz	x3,.Lstore_base2_64_neon

	and	x10,x4,#0x03ffffff	// base 2^64 -> base 2^26
	ubfx	x11,x4,#26,#26
	extr	x12,x5,x4,#52
	and	x12,x12,#0x03ffffff
	ubfx	x13,x5,#14,#26
	extr	x14,x6,x5,#40

	cbnz	x2,.Leven_neon

	stp	w10,w11,[x0]		// store hash value base 2^26
	stp	w12,w13,[x0,#8]
	str	w14,[x0,#16]
	b	.Lno_data_neon

.align	4
.Lstore_base2_64_neon:
	stp	x4,x5,[x0]		// store hash value base 2^64
	stp	x6,xzr,[x0,#16]	// note that is_base2_26 is zeroed
	b	.Lno_data_neon

.align	4
.Lbase2_64_neon:
	ldp	x7,x8,[x0,#32]	// load key value

	ldp	x4,x5,[x0]		// load hash value base 2^64
	ldr	x6,[x0,#16]

	tst	x2,#31
	b.eq	.Linit_neon

	ldp	x12,x13,[x1],#16	// load input
	sub	x2,x2,#16
	add	x9,x8,x8,lsr#2	// s1 = r1 + (r1 >> 2)
#ifdef	__ARMEB__
	rev	x12,x12
	rev	x13,x13
#endif
	adds	x4,x4,x12		// accumulate input
	adcs	x5,x5,x13
	adc	x6,x6,x3

	bl	poly1305_mult

.Linit_neon:
	and	x10,x4,#0x03ffffff	// base 2^64 -> base 2^26
	ubfx	x11,x4,#26,#26
	extr	x12,x5,x4,#52
	and	x12,x12,#0x03ffffff
	ubfx	x13,x5,#14,#26
	extr	x14,x6,x5,#40

	stp	d8,d9,[sp,#16]		// meet ABI requirements
	stp	d10,d11,[sp,#32]
	stp	d12,d13,[sp,#48]
	stp	d14,d15,[sp,#64]

	fmov	d24,x10
	fmov	d25,x11
	fmov	d26,x12
	fmov	d27,x13
	fmov	d28,x14

	////////////////////////////////// initialize r^n table
	// x0 is advanced to the last 32-bit lane of the first table row
	// and stepped back one lane per power, so lanes end up holding
	// r^4, r^3, r^2, r^1 in ascending address order.
	mov	x4,x7			// r^1
	add	x9,x8,x8,lsr#2	// s1 = r1 + (r1 >> 2)
	mov	x5,x8
	mov	x6,xzr
	add	x0,x0,#48+12
	bl	poly1305_splat

	bl	poly1305_mult		// r^2
	sub	x0,x0,#4
	bl	poly1305_splat

	bl	poly1305_mult		// r^3
	sub	x0,x0,#4
	bl	poly1305_splat

	bl	poly1305_mult		// r^4
	sub	x0,x0,#4
	bl	poly1305_splat
	ldr	x30,[sp,#8]		// reload the signed return address

	add	x16,x1,#32
	adr	x17,.Lzeros
	subs	x2,x2,#64
	csel	x16,x17,x16,lo

	mov	x4,#1
	str	x4,[x0,#-24]		// set is_base2_26
	sub	x0,x0,#48		// restore original x0
	b	.Ldo_neon

.align	4
.Leven_neon:
	add	x16,x1,#32
	adr	x17,.Lzeros
	subs	x2,x2,#64
	csel	x16,x17,x16,lo

	stp	d8,d9,[sp,#16]		// meet ABI requirements
	stp	d10,d11,[sp,#32]
	stp	d12,d13,[sp,#48]
	stp	d14,d15,[sp,#64]

	fmov	d24,x10
	fmov	d25,x11
	fmov	d26,x12
	fmov	d27,x13
	fmov	d28,x14

.Ldo_neon:
	ldp	x8,x12,[x16],#16	// inp[2:3] (or zero)
	ldp	x9,x13,[x16],#48

	lsl	x3,x3,#24		// x3 is added at bit 24 of the top limb from here on
	add	x15,x0,#48

#ifdef	__ARMEB__
	rev	x8,x8
	rev	x12,x12
	rev	x9,x9
	rev	x13,x13
#endif
	and	x4,x8,#0x03ffffff	// base 2^64 -> base 2^26
	and	x5,x9,#0x03ffffff
	ubfx	x6,x8,#26,#26
	ubfx	x7,x9,#26,#26
	add	x4,x4,x5,lsl#32		// bfi	x4,x5,#32,#32
	extr	x8,x12,x8,#52
	extr	x9,x13,x9,#52
	add	x6,x6,x7,lsl#32		// bfi	x6,x7,#32,#32
	fmov	d14,x4
	and	x8,x8,#0x03ffffff
	and	x9,x9,#0x03ffffff
	ubfx	x10,x12,#14,#26
	ubfx	x11,x13,#14,#26
	add	x12,x3,x12,lsr#40
	add	x13,x3,x13,lsr#40
	add	x8,x8,x9,lsl#32		// bfi	x8,x9,#32,#32
	fmov	d15,x6
	add	x10,x10,x11,lsl#32	// bfi	x10,x11,#32,#32
	add	x12,x12,x13,lsl#32	// bfi	x12,x13,#32,#32
	fmov	d16,x8
	fmov	d17,x10
	fmov	d18,x12

	ldp	x8,x12,[x1],#16	// inp[0:1]
	ldp	x9,x13,[x1],#48

	ld1	{v0.4s,v1.4s,v2.4s,v3.4s},[x15],#64
	ld1	{v4.4s,v5.4s,v6.4s,v7.4s},[x15],#64
	ld1	{v8.4s},[x15]

#ifdef	__ARMEB__
	rev	x8,x8
	rev	x12,x12
	rev	x9,x9
	rev	x13,x13
#endif
	and	x4,x8,#0x03ffffff	// base 2^64 -> base 2^26
	and	x5,x9,#0x03ffffff
	ubfx	x6,x8,#26,#26
	ubfx	x7,x9,#26,#26
	add	x4,x4,x5,lsl#32		// bfi	x4,x5,#32,#32
	extr	x8,x12,x8,#52
	extr	x9,x13,x9,#52
	add	x6,x6,x7,lsl#32		// bfi	x6,x7,#32,#32
	fmov	d9,x4
	and	x8,x8,#0x03ffffff
	and	x9,x9,#0x03ffffff
	ubfx	x10,x12,#14,#26
	ubfx	x11,x13,#14,#26
	add	x12,x3,x12,lsr#40
	add	x13,x3,x13,lsr#40
	add	x8,x8,x9,lsl#32		// bfi	x8,x9,#32,#32
	fmov	d10,x6
	add	x10,x10,x11,lsl#32	// bfi	x10,x11,#32,#32
	add	x12,x12,x13,lsl#32	// bfi	x12,x13,#32,#32
	movi	v31.2d,#-1
	fmov	d11,x8
	fmov	d12,x10
	fmov	d13,x12
	ushr	v31.2d,v31.2d,#38	// v31 = 0x03ffffff in each 64-bit lane

	b.ls	.Lskip_loop		// flags still come from the subs x2,x2,#64 above

.align	4
.Loop_neon:
	////////////////////////////////////////////////////////////////
	// ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2
	// ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^3+inp[7]*r
	//   ___________________/
	// ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2+inp[8])*r^2
	// ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^4+inp[7]*r^2+inp[9])*r
	//   ___________________/ ____________________/
	//
	// Note that we start with inp[2:3]*r^2. This is because it
	// does not depend on reduction in previous iteration.
	////////////////////////////////////////////////////////////////
	// d4 = h0*r4 + h1*r3   + h2*r2   + h3*r1   + h4*r0
	// d3 = h0*r3 + h1*r2   + h2*r1   + h3*r0   + h4*5*r4
	// d2 = h0*r2 + h1*r1   + h2*r0   + h3*5*r4 + h4*5*r3
	// d1 = h0*r1 + h1*r0   + h2*5*r4 + h3*5*r3 + h4*5*r2
	// d0 = h0*r0 + h1*5*r4 + h2*5*r3 + h3*5*r2 + h4*5*r1

	subs	x2,x2,#64
	umull	v23.2d,v14.2s,v7.s[2]
	csel	x16,x17,x16,lo
	umull	v22.2d,v14.2s,v5.s[2]
	umull	v21.2d,v14.2s,v3.s[2]
	ldp	x8,x12,[x16],#16	// inp[2:3] (or zero)
	umull	v20.2d,v14.2s,v1.s[2]
	ldp	x9,x13,[x16],#48
	umull	v19.2d,v14.2s,v0.s[2]
#ifdef	__ARMEB__
	rev	x8,x8
	rev	x12,x12
	rev	x9,x9
	rev	x13,x13
#endif

	umlal	v23.2d,v15.2s,v5.s[2]
	and	x4,x8,#0x03ffffff	// base 2^64 -> base 2^26
	umlal	v22.2d,v15.2s,v3.s[2]
	and	x5,x9,#0x03ffffff
	umlal	v21.2d,v15.2s,v1.s[2]
	ubfx	x6,x8,#26,#26
	umlal	v20.2d,v15.2s,v0.s[2]
	ubfx	x7,x9,#26,#26
	umlal	v19.2d,v15.2s,v8.s[2]
	add	x4,x4,x5,lsl#32		// bfi	x4,x5,#32,#32

	umlal	v23.2d,v16.2s,v3.s[2]
	extr	x8,x12,x8,#52
	umlal	v22.2d,v16.2s,v1.s[2]
	extr	x9,x13,x9,#52
	umlal	v21.2d,v16.2s,v0.s[2]
	add	x6,x6,x7,lsl#32		// bfi	x6,x7,#32,#32
	umlal	v20.2d,v16.2s,v8.s[2]
	fmov	d14,x4
	umlal	v19.2d,v16.2s,v6.s[2]
	and	x8,x8,#0x03ffffff

	umlal	v23.2d,v17.2s,v1.s[2]
	and	x9,x9,#0x03ffffff
	umlal	v22.2d,v17.2s,v0.s[2]
	ubfx	x10,x12,#14,#26
	umlal	v21.2d,v17.2s,v8.s[2]
	ubfx	x11,x13,#14,#26
	umlal	v20.2d,v17.2s,v6.s[2]
	add	x8,x8,x9,lsl#32		// bfi	x8,x9,#32,#32
	umlal	v19.2d,v17.2s,v4.s[2]
	fmov	d15,x6

	add	v11.2s,v11.2s,v26.2s
	add	x12,x3,x12,lsr#40
	umlal	v23.2d,v18.2s,v0.s[2]
	add	x13,x3,x13,lsr#40
	umlal	v22.2d,v18.2s,v8.s[2]
	add	x10,x10,x11,lsl#32	// bfi	x10,x11,#32,#32
	umlal	v21.2d,v18.2s,v6.s[2]
	add	x12,x12,x13,lsl#32	// bfi	x12,x13,#32,#32
	umlal	v20.2d,v18.2s,v4.s[2]
	fmov	d16,x8
	umlal	v19.2d,v18.2s,v2.s[2]
	fmov	d17,x10

	////////////////////////////////////////////////////////////////
	// (hash+inp[0:1])*r^4 and accumulate

	add	v9.2s,v9.2s,v24.2s
	fmov	d18,x12
	umlal	v22.2d,v11.2s,v1.s[0]
	ldp	x8,x12,[x1],#16	// inp[0:1]
	umlal	v19.2d,v11.2s,v6.s[0]
	ldp	x9,x13,[x1],#48
	umlal	v23.2d,v11.2s,v3.s[0]
	umlal	v20.2d,v11.2s,v8.s[0]
	umlal	v21.2d,v11.2s,v0.s[0]
#ifdef	__ARMEB__
	rev	x8,x8
	rev	x12,x12
	rev	x9,x9
	rev	x13,x13
#endif

	add	v10.2s,v10.2s,v25.2s
	umlal	v22.2d,v9.2s,v5.s[0]
	umlal	v23.2d,v9.2s,v7.s[0]
	and	x4,x8,#0x03ffffff	// base 2^64 -> base 2^26
	umlal	v21.2d,v9.2s,v3.s[0]
	and	x5,x9,#0x03ffffff
	umlal	v19.2d,v9.2s,v0.s[0]
	ubfx	x6,x8,#26,#26
	umlal	v20.2d,v9.2s,v1.s[0]
	ubfx	x7,x9,#26,#26

	add	v12.2s,v12.2s,v27.2s
	add	x4,x4,x5,lsl#32		// bfi	x4,x5,#32,#32
	umlal	v22.2d,v10.2s,v3.s[0]
	extr	x8,x12,x8,#52
	umlal	v23.2d,v10.2s,v5.s[0]
	extr	x9,x13,x9,#52
	umlal	v19.2d,v10.2s,v8.s[0]
	add	x6,x6,x7,lsl#32		// bfi	x6,x7,#32,#32
	umlal	v21.2d,v10.2s,v1.s[0]
	fmov	d9,x4
	umlal	v20.2d,v10.2s,v0.s[0]
	and	x8,x8,#0x03ffffff

	add	v13.2s,v13.2s,v28.2s
	and	x9,x9,#0x03ffffff
	umlal	v22.2d,v12.2s,v0.s[0]
	ubfx	x10,x12,#14,#26
	umlal	v19.2d,v12.2s,v4.s[0]
	ubfx	x11,x13,#14,#26
	umlal	v23.2d,v12.2s,v1.s[0]
	add	x8,x8,x9,lsl#32		// bfi	x8,x9,#32,#32
	umlal	v20.2d,v12.2s,v6.s[0]
	fmov	d10,x6
	umlal	v21.2d,v12.2s,v8.s[0]
	add	x12,x3,x12,lsr#40

	umlal	v22.2d,v13.2s,v8.s[0]
	add	x13,x3,x13,lsr#40
	umlal	v19.2d,v13.2s,v2.s[0]
	add	x10,x10,x11,lsl#32	// bfi	x10,x11,#32,#32
	umlal	v23.2d,v13.2s,v0.s[0]
	add	x12,x12,x13,lsl#32	// bfi	x12,x13,#32,#32
	umlal	v20.2d,v13.2s,v4.s[0]
	fmov	d11,x8
	umlal	v21.2d,v13.2s,v6.s[0]
	fmov	d12,x10
	fmov	d13,x12

	/////////////////////////////////////////////////////////////////
	// lazy reduction as discussed in "NEON crypto" by D.J. Bernstein
	// and P. Schwabe
	//
	// [see discussion in poly1305-armv4 module]

	ushr	v29.2d,v22.2d,#26
	xtn	v27.2s,v22.2d
	ushr	v30.2d,v19.2d,#26
	and	v19.16b,v19.16b,v31.16b
	add	v23.2d,v23.2d,v29.2d	// h3 -> h4
	bic	v27.2s,#0xfc,lsl#24	// &=0x03ffffff
	add	v20.2d,v20.2d,v30.2d	// h0 -> h1

	ushr	v29.2d,v23.2d,#26
	xtn	v28.2s,v23.2d
	ushr	v30.2d,v20.2d,#26
	xtn	v25.2s,v20.2d
	bic	v28.2s,#0xfc,lsl#24
	add	v21.2d,v21.2d,v30.2d	// h1 -> h2

	add	v19.2d,v19.2d,v29.2d
	shl	v29.2d,v29.2d,#2
	shrn	v30.2s,v21.2d,#26
	xtn	v26.2s,v21.2d
	add	v19.2d,v19.2d,v29.2d	// h4 -> h0
	bic	v25.2s,#0xfc,lsl#24
	add	v27.2s,v27.2s,v30.2s		// h2 -> h3
	bic	v26.2s,#0xfc,lsl#24

	shrn	v29.2s,v19.2d,#26
	xtn	v24.2s,v19.2d
	ushr	v30.2s,v27.2s,#26
	bic	v27.2s,#0xfc,lsl#24
	bic	v24.2s,#0xfc,lsl#24
	add	v25.2s,v25.2s,v29.2s		// h0 -> h1
	add	v28.2s,v28.2s,v30.2s		// h3 -> h4

	b.hi	.Loop_neon		// flags from the subs at the top of the loop

.Lskip_loop:
	dup	v16.2d,v16.d[0]
	add	v11.2s,v11.2s,v26.2s

	////////////////////////////////////////////////////////////////
	// multiply (inp[0:1]+hash) or inp[2:3] by r^2:r^1

	adds	x2,x2,#32
	b.ne	.Long_tail

	dup	v16.2d,v11.d[0]
	add	v14.2s,v9.2s,v24.2s
	add	v17.2s,v12.2s,v27.2s
	add	v15.2s,v10.2s,v25.2s
	add	v18.2s,v13.2s,v28.2s

.Long_tail:
	dup	v14.2d,v14.d[0]
	umull2	v19.2d,v16.4s,v6.4s
	umull2	v22.2d,v16.4s,v1.4s
	umull2	v23.2d,v16.4s,v3.4s
	umull2	v21.2d,v16.4s,v0.4s
	umull2	v20.2d,v16.4s,v8.4s

	dup	v15.2d,v15.d[0]
	umlal2	v19.2d,v14.4s,v0.4s
	umlal2	v21.2d,v14.4s,v3.4s
	umlal2	v22.2d,v14.4s,v5.4s
	umlal2	v23.2d,v14.4s,v7.4s
	umlal2	v20.2d,v14.4s,v1.4s

	dup	v17.2d,v17.d[0]
	umlal2	v19.2d,v15.4s,v8.4s
	umlal2	v22.2d,v15.4s,v3.4s
	umlal2	v21.2d,v15.4s,v1.4s
	umlal2	v23.2d,v15.4s,v5.4s
	umlal2	v20.2d,v15.4s,v0.4s

	dup	v18.2d,v18.d[0]
	umlal2	v22.2d,v17.4s,v0.4s
	umlal2	v23.2d,v17.4s,v1.4s
	umlal2	v19.2d,v17.4s,v4.4s
	umlal2	v20.2d,v17.4s,v6.4s
	umlal2	v21.2d,v17.4s,v8.4s

	umlal2	v22.2d,v18.4s,v8.4s
	umlal2	v19.2d,v18.4s,v2.4s
	umlal2	v23.2d,v18.4s,v0.4s
	umlal2	v20.2d,v18.4s,v4.4s
	umlal2	v21.2d,v18.4s,v6.4s

	b.eq	.Lshort_tail		// flags from the adds x2,x2,#32 above

	////////////////////////////////////////////////////////////////
	// (hash+inp[0:1])*r^4:r^3 and accumulate

	add	v9.2s,v9.2s,v24.2s
	umlal	v22.2d,v11.2s,v1.2s
	umlal	v19.2d,v11.2s,v6.2s
	umlal	v23.2d,v11.2s,v3.2s
	umlal	v20.2d,v11.2s,v8.2s
	umlal	v21.2d,v11.2s,v0.2s

	add	v10.2s,v10.2s,v25.2s
	umlal	v22.2d,v9.2s,v5.2s
	umlal	v19.2d,v9.2s,v0.2s
	umlal	v23.2d,v9.2s,v7.2s
	umlal	v20.2d,v9.2s,v1.2s
	umlal	v21.2d,v9.2s,v3.2s

	add	v12.2s,v12.2s,v27.2s
	umlal	v22.2d,v10.2s,v3.2s
	umlal	v19.2d,v10.2s,v8.2s
	umlal	v23.2d,v10.2s,v5.2s
	umlal	v20.2d,v10.2s,v0.2s
	umlal	v21.2d,v10.2s,v1.2s

	add	v13.2s,v13.2s,v28.2s
	umlal	v22.2d,v12.2s,v0.2s
	umlal	v19.2d,v12.2s,v4.2s
	umlal	v23.2d,v12.2s,v1.2s
	umlal	v20.2d,v12.2s,v6.2s
	umlal	v21.2d,v12.2s,v8.2s

	umlal	v22.2d,v13.2s,v8.2s
	umlal	v19.2d,v13.2s,v2.2s
	umlal	v23.2d,v13.2s,v0.2s
	umlal	v20.2d,v13.2s,v4.2s
	umlal	v21.2d,v13.2s,v6.2s

.Lshort_tail:
	////////////////////////////////////////////////////////////////
	// horizontal add

	addp	v22.2d,v22.2d,v22.2d
	ldp	d8,d9,[sp,#16]		// meet ABI requirements
	addp	v19.2d,v19.2d,v19.2d
	ldp	d10,d11,[sp,#32]
	addp	v23.2d,v23.2d,v23.2d
	ldp	d12,d13,[sp,#48]
	addp	v20.2d,v20.2d,v20.2d
	ldp	d14,d15,[sp,#64]
	addp	v21.2d,v21.2d,v21.2d

	////////////////////////////////////////////////////////////////
	// lazy reduction, but without narrowing

	ushr	v29.2d,v22.2d,#26
	and	v22.16b,v22.16b,v31.16b
	ushr	v30.2d,v19.2d,#26
	and	v19.16b,v19.16b,v31.16b

	add	v23.2d,v23.2d,v29.2d	// h3 -> h4
	add	v20.2d,v20.2d,v30.2d	// h0 -> h1

	ushr	v29.2d,v23.2d,#26
	and	v23.16b,v23.16b,v31.16b
	ushr	v30.2d,v20.2d,#26
	and	v20.16b,v20.16b,v31.16b
	add	v21.2d,v21.2d,v30.2d	// h1 -> h2

	add	v19.2d,v19.2d,v29.2d
	shl	v29.2d,v29.2d,#2
	ushr	v30.2d,v21.2d,#26
	and	v21.16b,v21.16b,v31.16b
	add	v19.2d,v19.2d,v29.2d	// h4 -> h0
	add	v22.2d,v22.2d,v30.2d	// h2 -> h3

	ushr	v29.2d,v19.2d,#26
	and	v19.16b,v19.16b,v31.16b
	ushr	v30.2d,v22.2d,#26
	and	v22.16b,v22.16b,v31.16b
	add	v20.2d,v20.2d,v29.2d	// h0 -> h1
	add	v23.2d,v23.2d,v30.2d	// h3 -> h4

	////////////////////////////////////////////////////////////////
	// write the result, can be partially reduced

	st4	{v19.s,v20.s,v21.s,v22.s}[0],[x0],#16
	st1	{v23.s}[0],[x0]

.Lno_data_neon:
	// Release the frame first: paciasp signed x30 while sp held its
	// entry value, so autiasp must run with sp restored to that value.
	// x30 is the signed return address on every path that gets here.
	ldr	x29,[sp],#80
.inst	0xd50323bf		// autiasp
	ret
.size	poly1305_blocks_neon,.-poly1305_blocks_neon
804
.type	poly1305_emit_neon, %function
.align	5
// poly1305_emit_neon: emit routine paired with the NEON block routine.
//
//   x0  state block (hash at +0, is_base2_26 flag at +24)
//   x1  destination for the 16 result bytes
//   x2  16 bytes of nonce
//
// A zero flag means the hash is still in base 2^64, in which case
// poly1305_emit does the work.  Otherwise the five 32-bit limbs are
// merged into three 64-bit words before the final reduction.
// Scratch registers: x4-x6, x10-x14, x17, condition flags.
poly1305_emit_neon:
	ldr	x17, [x0, #24]
	cbz	x17, poly1305_emit

	ldp	w10, w11, [x0]			// limbs 0, 1
	ldp	w12, w13, [x0, #8]		// limbs 2, 3
	ldr	w14, [x0, #16]			// limb 4

	add	x4, x10, x11, lsl #26		// merge limbs at bit offsets 0/26/52/78/104
	lsr	x5, x12, #12
	adds	x4, x4, x12, lsl #52
	add	x5, x5, x13, lsl #14
	adc	x5, x5, xzr
	lsr	x6, x14, #24
	adds	x5, x5, x14, lsl #40
	adc	x6, x6, xzr			// top word may still exceed two bits

	ldp	x10, x11, [x2]			// nonce words

	and	x12, x6, #-4			// fold the excess of the top word back in
	add	x12, x12, x6, lsr #2
	and	x6, x6, #3
	adds	x4, x4, x12
	adcs	x5, x5, xzr
	adc	x6, x6, xzr

	adds	x12, x4, #5			// candidate = h + 5
	adcs	x13, x5, xzr
	adc	x14, x6, xzr

	tst	x14, #-4			// did h + 5 reach bit 130?

	csel	x4, x4, x12, eq			// no: keep h; yes: take h + 5
	csel	x5, x5, x13, eq

#ifdef	__ARMEB__
	ror	x10, x10, #32			// swap the 32-bit halves of each nonce word
	ror	x11, x11, #32
#endif
	adds	x4, x4, x10			// add nonce, carry out of bit 127 is dropped
	adc	x5, x5, x11
#ifdef	__ARMEB__
	rev	x4, x4				// byte-swap the result words
	rev	x5, x5
#endif
	stp	x4, x5, [x1]

	ret
.size	poly1305_emit_neon, .-poly1305_emit_neon
856
.align	5
// Thirty-two zero bytes.  poly1305_blocks_neon selects this address in
// place of the inp[2:3] pointer (csel on the lo condition).
.Lzeros:
.long	0,0,0,0
.long	0,0,0,0
// Distance from this slot to OPENSSL_armcap_P, read by poly1305_init.
.LOPENSSL_armcap_P:
#ifdef	__ILP32__
.long	OPENSSL_armcap_P-.
#else
.quad	OPENSSL_armcap_P-.
#endif
// Identification string, NUL-terminated.
.asciz	"Poly1305 for ARMv8, CRYPTOGAMS by <appro@openssl.org>"
.align	2
869